1099 lines
49 KiB
C++
1099 lines
49 KiB
C++
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
==============================================================================*/
|
|
|
|
// Enable the use of M_* math constants.
|
|
// NOTE: this must be first in the file to ensure that if cmath is transitively
|
|
// included by any other header it has the define set on first processing.
|
|
// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants
|
|
#define _USE_MATH_DEFINES
|
|
#include <cmath>
|
|
#include <numeric>
|
|
#include <vector>
|
|
|
|
#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h"
|
|
#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
|
|
#include "mlir-hlo/Dialect/mhlo/transforms/map_chlo_to_hlo_op.h"
|
|
#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h"
|
|
#include "mlir-hlo/utils/broadcast_utils.h"
|
|
#include "mlir/Dialect/SCF/SCF.h"
|
|
#include "mlir/Dialect/Shape/IR/Shape.h"
|
|
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
|
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
|
#include "mlir/IR/Attributes.h"
|
|
#include "mlir/IR/BuiltinTypes.h"
|
|
#include "mlir/IR/ImplicitLocOpBuilder.h"
|
|
#include "mlir/IR/MLIRContext.h"
|
|
#include "mlir/IR/OperationSupport.h"
|
|
#include "mlir/IR/PatternMatch.h"
|
|
#include "mlir/Transforms/DialectConversion.h"
|
|
|
|
namespace mlir {
|
|
namespace chlo {
|
|
namespace {
|
|
|
|
struct ConvertConstantLikeOp : public OpConversionPattern<ConstantLikeOp> {
|
|
using OpConversionPattern<ConstantLikeOp>::OpConversionPattern;
|
|
LogicalResult matchAndRewrite(
|
|
ConstantLikeOp op, ArrayRef<Value> operands,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
auto result_ty = op.getType().cast<ShapedType>();
|
|
|
|
// Unranked uses are not supported. Consider `transform-unranked-hlo`.
|
|
if (!result_ty.hasRank()) return failure();
|
|
|
|
// Lower to MHLO constant if statically shaped.
|
|
if (result_ty.hasStaticShape()) {
|
|
rewriter.replaceOpWithNewOp<mhlo::ConstOp>(
|
|
op, DenseElementsAttr::get(result_ty, op.value()));
|
|
return success();
|
|
}
|
|
|
|
// Lower to broadcasted constant.
|
|
ConstantLikeOp::Adaptor transformed(operands);
|
|
auto loc = op.getLoc();
|
|
Type extent_tensor_type = shape::getExtentTensorType(op.getContext());
|
|
Value constant = rewriter.create<mhlo::ConstOp>(loc, op.value());
|
|
Value uncasted_shape = rewriter.create<shape::ShapeOfOp>(
|
|
loc, extent_tensor_type, transformed.operand());
|
|
Type shape_ty =
|
|
RankedTensorType::get({result_ty.getRank()}, rewriter.getIndexType());
|
|
Value shape =
|
|
rewriter.create<tensor::CastOp>(loc, shape_ty, uncasted_shape);
|
|
rewriter.replaceOpWithNewOp<mhlo::DynamicBroadcastInDimOp>(
|
|
op, result_ty, constant, shape, rewriter.getI64TensorAttr({}));
|
|
return success();
|
|
}
|
|
};
|
|
|
|
template <typename FTy>
|
|
Value MaterializePolynomialApproximation(ConversionPatternRewriter &rewriter,
|
|
Location loc, Value x,
|
|
const std::vector<FTy> &coefficients) {
|
|
Value poly = chlo::getConstantLike(rewriter, loc, 0.0, x);
|
|
for (FTy c : coefficients) {
|
|
poly = rewriter.create<mhlo::MulOp>(loc, x.getType(), poly, x);
|
|
poly = rewriter.create<mhlo::AddOp>(
|
|
loc, x.getType(), poly, chlo::getConstantLike(rewriter, loc, c, x));
|
|
}
|
|
return poly;
|
|
}
|
|
|
|
// Precondition is |x| >= 1. Use erf approximation, otherwise.
|
|
//
|
|
// We rely on multiple polynomial approximations for x >= 1. We pass |x| as an
|
|
// argument and derive the final approximation for all |x| >= 1.
|
|
// This implementation is based on Cephes.
|
|
Value MaterializeErfcApproximationF64ForMagnituteGEOne(
|
|
ConversionPatternRewriter &rewriter, Location loc, Value x) {
|
|
assert(x.getType().cast<ShapedType>().getElementType().isF64() &&
|
|
"expect f64 element type");
|
|
const double kMaxlog = 7.09782712893383996843E2;
|
|
const std::vector<double> kErfcPCoefficients{
|
|
2.46196981473530512524E-10, 5.64189564831068821977E-1,
|
|
7.46321056442269912687E0, 4.86371970985681366614E1,
|
|
1.96520832956077098242E2, 5.26445194995477358631E2,
|
|
9.34528527171957607540E2, 1.02755188689515710272E3,
|
|
5.57535335369399327526E2};
|
|
const std::vector<double> kErfcQCoefficients{
|
|
1.00000000000000000000E0, 1.32281951154744992508E1,
|
|
8.67072140885989742329E1, 3.54937778887819891062E2,
|
|
9.75708501743205489753E2, 1.82390916687909736289E3,
|
|
2.24633760818710981792E3, 1.65666309194161350182E3,
|
|
5.57535340817727675546E2};
|
|
const std::vector<double> kErfcRCoefficients{
|
|
5.64189583547755073984E-1, 1.27536670759978104416E0,
|
|
5.01905042251180477414E0, 6.16021097993053585195E0,
|
|
7.40974269950448939160E0, 2.97886665372100240670E0};
|
|
const std::vector<double> kErfcSCoefficients{
|
|
1.00000000000000000000E0, 2.26052863220117276590E0,
|
|
9.39603524938001434673E0, 1.20489539808096656605E1,
|
|
1.70814450747565897222E1, 9.60896809063285878198E0,
|
|
3.36907645100081516050E0};
|
|
|
|
// Let z = -x^2.
|
|
Value x_sq = rewriter.create<mhlo::MulOp>(loc, x, x);
|
|
Value z = rewriter.create<mhlo::NegOp>(loc, x_sq);
|
|
|
|
// Materialize polynomial approximation for x in [1, 8) as
|
|
// erfc(x) = exp(z) P(|x|) / Q(|x|).
|
|
Value exp_z = rewriter.create<mhlo::ExpOp>(loc, z);
|
|
Value abs_x = rewriter.create<mhlo::AbsOp>(loc, x);
|
|
Value poly_p = MaterializePolynomialApproximation(rewriter, loc, abs_x,
|
|
kErfcPCoefficients);
|
|
Value exp_z_mul_poly_p = rewriter.create<mhlo::MulOp>(loc, exp_z, poly_p);
|
|
Value poly_q = MaterializePolynomialApproximation(rewriter, loc, abs_x,
|
|
kErfcQCoefficients);
|
|
Value erfc_approx_1_8 =
|
|
rewriter.create<mhlo::DivOp>(loc, exp_z_mul_poly_p, poly_q);
|
|
|
|
// Materialize polynomial approximation for x in >= 8 as
|
|
// erfc(x) exp(z) R(|x|) / S(|x|).
|
|
Value poly_r = MaterializePolynomialApproximation(rewriter, loc, abs_x,
|
|
kErfcRCoefficients);
|
|
Value exp_z_mul_poly_r = rewriter.create<mhlo::MulOp>(loc, exp_z, poly_r);
|
|
Value poly_s = MaterializePolynomialApproximation(rewriter, loc, abs_x,
|
|
kErfcSCoefficients);
|
|
Value erfc_approx_8_inf =
|
|
rewriter.create<mhlo::DivOp>(loc, exp_z_mul_poly_r, poly_s);
|
|
|
|
// Combine polynomial approximations for x >= 1.
|
|
const StringAttr kLT = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::LT));
|
|
Value eight = chlo::getConstantLike(rewriter, loc, 8.0, x);
|
|
Value abs_x_lt_8 = rewriter.create<mhlo::CompareOp>(loc, abs_x, eight, kLT);
|
|
Value erfc_approx = rewriter.create<mhlo::SelectOp>(
|
|
loc, abs_x_lt_8, erfc_approx_1_8, erfc_approx_8_inf);
|
|
|
|
// Clamp to prevent overflow and materialize approximation for large x as
|
|
// erfc(x) = 0.
|
|
Value z_lt_neg_maxlog = rewriter.create<mhlo::CompareOp>(
|
|
loc, z, chlo::getConstantLike(rewriter, loc, -kMaxlog, x), kLT);
|
|
Value zero = chlo::getConstantLike(rewriter, loc, 0.0, x);
|
|
Value erfc_approx_clamped =
|
|
rewriter.create<mhlo::SelectOp>(loc, z_lt_neg_maxlog, zero, erfc_approx);
|
|
|
|
// Derive approximation for x <= -1 as
|
|
// erfc(x) = 2 - erfc(-x).
|
|
// Reuse previously materialized approximations all of which take |x| as their
|
|
// argument.
|
|
Value x_lt_zero = rewriter.create<mhlo::CompareOp>(loc, x, zero, kLT);
|
|
Value two = chlo::getConstantLike(rewriter, loc, 2.0, x);
|
|
Value two_sub_erfc_approx_clamped =
|
|
rewriter.create<mhlo::SubOp>(loc, two, erfc_approx_clamped);
|
|
return rewriter.create<mhlo::SelectOp>(
|
|
loc, x_lt_zero, two_sub_erfc_approx_clamped, erfc_approx_clamped);
|
|
}
|
|
|
|
// Precondition is |x| <= 1. Use erfc approximation, otherwise.
|
|
// This implementation is based on Cephes.
|
|
Value MaterializeErfApproximationF64ForMagnituteLEOne(
|
|
ConversionPatternRewriter &rewriter, Location loc, Value x) {
|
|
assert(x.getType().cast<ShapedType>().getElementType().isF64() &&
|
|
"expect f64 element type");
|
|
const std::vector<double> kErfTCoefficients{
|
|
9.60497373987051638749E0, 9.00260197203842689217E1,
|
|
2.23200534594684319226E3, 7.00332514112805075473E3,
|
|
5.55923013010394962768E4};
|
|
const std::vector<double> kErfUCoefficients{
|
|
1.00000000000000000000E0, 3.35617141647503099647E1,
|
|
5.21357949780152679795E2, 4.59432382970980127987E3,
|
|
2.26290000613890934246E4, 4.92673942608635921086E4};
|
|
|
|
// Materialize polynomial approximation for |x| <= 1 as
|
|
// erf(x) = x T(x^2) / U(x^2).
|
|
Value x_sq = rewriter.create<mhlo::MulOp>(loc, x, x);
|
|
Value poly_t = MaterializePolynomialApproximation(rewriter, loc, x_sq,
|
|
kErfTCoefficients);
|
|
Value x_mul_poly_t = rewriter.create<mhlo::MulOp>(loc, x, poly_t);
|
|
Value poly_u = MaterializePolynomialApproximation(rewriter, loc, x_sq,
|
|
kErfUCoefficients);
|
|
return rewriter.create<mhlo::DivOp>(loc, x_mul_poly_t, poly_u);
|
|
}
|
|
|
|
// This implementation is based on Cephes.
|
|
Value MaterializeErfApproximationF64(ConversionPatternRewriter &rewriter,
|
|
Location loc, Value x) {
|
|
assert(x.getType().cast<ShapedType>().getElementType().isF64() &&
|
|
"expect f64 element type");
|
|
|
|
// Rely on erf approximation for |x| < 1
|
|
// erf(x) = erf_approx(x)
|
|
Value erf_approx =
|
|
MaterializeErfApproximationF64ForMagnituteLEOne(rewriter, loc, x);
|
|
|
|
// Rely on erfc approximation for |x| >= 1 and materialize erf as
|
|
// erf(x) = 1 - erfc_approx(x)
|
|
Value one = chlo::getConstantLike(rewriter, loc, 1.0, x);
|
|
Value erfc_approx =
|
|
MaterializeErfcApproximationF64ForMagnituteGEOne(rewriter, loc, x);
|
|
Value erfc_based_approx = rewriter.create<mhlo::SubOp>(loc, one, erfc_approx);
|
|
|
|
// Materialize approximation selection based on argument.
|
|
Value abs_x = rewriter.create<mhlo::AbsOp>(loc, x);
|
|
const StringAttr kLT = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::LT));
|
|
Value abs_x_lt_one = rewriter.create<mhlo::CompareOp>(loc, abs_x, one, kLT);
|
|
return rewriter.create<mhlo::SelectOp>(loc, abs_x_lt_one, erf_approx,
|
|
erfc_based_approx);
|
|
}
|
|
|
|
Value MaterializeErfcApproximationF64(ConversionPatternRewriter &rewriter,
|
|
Location loc, Value x) {
|
|
assert(x.getType().cast<ShapedType>().getElementType().isF64() &&
|
|
"expect f64 element type");
|
|
|
|
// Rely on erfc approximation for |x| >= 1
|
|
// erfc(x) = erfc_approx(x)
|
|
Value erfc_approx =
|
|
MaterializeErfcApproximationF64ForMagnituteGEOne(rewriter, loc, x);
|
|
|
|
// Rely on erf approximation for |x| < 1 and materialize erfc as
|
|
// erfc(x) = 1 - erf_approx(x)
|
|
Value one = chlo::getConstantLike(rewriter, loc, 1.0, x);
|
|
Value erf_approx =
|
|
MaterializeErfApproximationF64ForMagnituteLEOne(rewriter, loc, x);
|
|
Value erf_based_approx = rewriter.create<mhlo::SubOp>(loc, one, erf_approx);
|
|
|
|
// Materialize approximation selection based on argument.
|
|
Value abs_x = rewriter.create<mhlo::AbsOp>(loc, x);
|
|
const StringAttr kLT = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::LT));
|
|
Value abs_x_lt_one = rewriter.create<mhlo::CompareOp>(loc, abs_x, one, kLT);
|
|
return rewriter.create<mhlo::SelectOp>(loc, abs_x_lt_one, erf_based_approx,
|
|
erfc_approx);
|
|
}
|
|
|
|
// Precondition is |x| >= 1. Use erf approximation, otherwise.
|
|
//
|
|
// We rely on multiple polynomial approximations for x >= 1. We pass |x| as an
|
|
// argument and derive the final approximation for all |x| >= 1.
|
|
// This implementation is based on Cephes.
|
|
Value MaterializeErfcApproximationF32ForMagnitudeGEOne(
|
|
ConversionPatternRewriter &rewriter, Location loc, Value x) {
|
|
assert(x.getType().cast<ShapedType>().getElementType().isF32() &&
|
|
"expect f32 element type");
|
|
const double kMaxlog = 88.72283905206835;
|
|
const std::vector<float> kErfcPCoefficients{
|
|
+2.326819970068386E-2, -1.387039388740657E-1, +3.687424674597105E-1,
|
|
-5.824733027278666E-1, +6.210004621745983E-1, -4.944515323274145E-1,
|
|
+3.404879937665872E-1, -2.741127028184656E-1, +5.638259427386472E-1,
|
|
};
|
|
const std::vector<float> kErfcRCoefficients{
|
|
-1.047766399936249E+1, +1.297719955372516E+1, -7.495518717768503E+0,
|
|
+2.921019019210786E+0, -1.015265279202700E+0, +4.218463358204948E-1,
|
|
-2.820767439740514E-1, +5.641895067754075E-1,
|
|
};
|
|
|
|
// Let z = -x^2.
|
|
Value x_sq = rewriter.create<mhlo::MulOp>(loc, x, x);
|
|
Value z = rewriter.create<mhlo::NegOp>(loc, x_sq);
|
|
|
|
// Materialize polynomial approximation for x >= 1 as
|
|
// erfc(x) = exp(z) 1/x P(1/x^2) if x in [1, 2)
|
|
// erfc(x) = exp(z) 1/x R(1/x^2) if x >= 2
|
|
const StringAttr kLT = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::LT));
|
|
Value abs_x = rewriter.create<mhlo::AbsOp>(loc, x);
|
|
Value one = chlo::getConstantLike(rewriter, loc, 1.0, x);
|
|
Value reciprocal_x_sq = rewriter.create<mhlo::DivOp>(loc, one, x_sq);
|
|
Value exp_z = rewriter.create<mhlo::ExpOp>(loc, z);
|
|
Value one_div_abs_x = rewriter.create<mhlo::DivOp>(loc, one, abs_x);
|
|
Value exp_z_mul_one_div_abs_x =
|
|
rewriter.create<mhlo::MulOp>(loc, exp_z, one_div_abs_x);
|
|
Value two = chlo::getConstantLike(rewriter, loc, 2.0, x);
|
|
Value abs_x_lt_two = rewriter.create<mhlo::CompareOp>(loc, abs_x, two, kLT);
|
|
Value poly_p = MaterializePolynomialApproximation(
|
|
rewriter, loc, reciprocal_x_sq, kErfcPCoefficients);
|
|
Value poly_r = MaterializePolynomialApproximation(
|
|
rewriter, loc, reciprocal_x_sq, kErfcRCoefficients);
|
|
Value poly =
|
|
rewriter.create<mhlo::SelectOp>(loc, abs_x_lt_two, poly_p, poly_r);
|
|
Value erfc_approx =
|
|
rewriter.create<mhlo::MulOp>(loc, exp_z_mul_one_div_abs_x, poly);
|
|
|
|
// Clamp to prevent overflow and materialize approximation for large x as
|
|
// erfc(x) = 0.
|
|
Value z_lt_neq_maxlog = rewriter.create<mhlo::CompareOp>(
|
|
loc, z, chlo::getConstantLike(rewriter, loc, -kMaxlog, x), kLT);
|
|
Value zero = chlo::getConstantLike(rewriter, loc, 0.0, x);
|
|
Value erfc_approx_clamped =
|
|
rewriter.create<mhlo::SelectOp>(loc, z_lt_neq_maxlog, zero, erfc_approx);
|
|
|
|
// Derive approximation for x <= -1 as
|
|
// erfc(x) = 2 - erfc(-x).
|
|
// Reuse previously materialized approximations all of which take |x| as their
|
|
// argument.
|
|
Value x_lt_zero = rewriter.create<mhlo::CompareOp>(loc, x, zero, kLT);
|
|
Value two_sub_erfc_approx =
|
|
rewriter.create<mhlo::SubOp>(loc, two, erfc_approx_clamped);
|
|
return rewriter.create<mhlo::SelectOp>(loc, x_lt_zero, two_sub_erfc_approx,
|
|
erfc_approx_clamped);
|
|
}
|
|
|
|
// Precondition is |x| <= 1. Use erfc approximation, otherwise.
|
|
// This implementation is based on Cephes.
|
|
Value MaterializeErfApproximationF32ForMagnitudeLEOne(
|
|
ConversionPatternRewriter &rewriter, Location loc, Value x) {
|
|
assert(x.getType().cast<ShapedType>().getElementType().isF32() &&
|
|
"expect f32 element type");
|
|
const std::vector<float> kErfTCoefficients{
|
|
+7.853861353153693E-5, -8.010193625184903E-4, +5.188327685732524E-3,
|
|
-2.685381193529856E-2, +1.128358514861418E-1, -3.761262582423300E-1,
|
|
+1.128379165726710E+0,
|
|
};
|
|
|
|
// Materialize polynomial approximation for |x| <= 1 as
|
|
// erf(x) = x T(x^2).
|
|
Value x_sq = rewriter.create<mhlo::MulOp>(loc, x, x);
|
|
Value poly_t = MaterializePolynomialApproximation(rewriter, loc, x_sq,
|
|
kErfTCoefficients);
|
|
return rewriter.create<mhlo::MulOp>(loc, x, poly_t);
|
|
}
|
|
|
|
// This is the same approximation as used in Eigen.
|
|
Value MaterializeErfApproximationF32(ConversionPatternRewriter &rewriter,
|
|
Location loc, Value operand) {
|
|
assert(operand.getType().cast<ShapedType>().getElementType().isF32() &&
|
|
"expect f32 element type");
|
|
const std::vector<float> kAlpha{
|
|
-2.72614225801306e-10f, 2.77068142495902e-08f, -2.10102402082508e-06f,
|
|
-5.69250639462346e-05f, -7.34990630326855e-04f, -2.95459980854025e-03f,
|
|
-1.60960333262415e-02f,
|
|
};
|
|
const std::vector<float> kBeta{
|
|
-1.45660718464996e-05f, -2.13374055278905e-04f, -1.68282697438203e-03f,
|
|
-7.37332916720468e-03f, -1.42647390514189e-02f,
|
|
};
|
|
|
|
// Clamp argument between -4 and 4.
|
|
Value lb = chlo::getConstantLike(rewriter, loc, -4.0, operand);
|
|
Value ub = chlo::getConstantLike(rewriter, loc, 4.0, operand);
|
|
Value x =
|
|
rewriter.create<mhlo::ClampOp>(loc, operand.getType(), lb, operand, ub);
|
|
Value x_sq = rewriter.create<mhlo::MulOp>(loc, x, x);
|
|
|
|
// Materialize polynomial approximation for x in [-4, 4] as
|
|
// erf(x) = x * Alpha(x^2) / Beta(x^2).
|
|
Value alpha_poly =
|
|
MaterializePolynomialApproximation(rewriter, loc, x_sq, kAlpha);
|
|
Value beta_poly =
|
|
MaterializePolynomialApproximation(rewriter, loc, x_sq, kBeta);
|
|
Value x_mul_alpha_poly = rewriter.create<mhlo::MulOp>(loc, x, alpha_poly);
|
|
return rewriter.create<mhlo::DivOp>(loc, x_mul_alpha_poly, beta_poly);
|
|
}
|
|
|
|
Value MaterializeErfcApproximationF32(ConversionPatternRewriter &rewriter,
|
|
Location loc, Value x) {
|
|
assert(x.getType().cast<ShapedType>().getElementType().isF32() &&
|
|
"expect f32 element type");
|
|
|
|
// Rely on erfc approximation for |x| >= 1
|
|
// erfc(x) = erfc_approx(x)
|
|
Value erfc_approx =
|
|
MaterializeErfcApproximationF32ForMagnitudeGEOne(rewriter, loc, x);
|
|
|
|
// Rely on erf approximation for |x| < 1 and materialize erfc as
|
|
// erfc(x) = 1 - erf_approx(x)
|
|
Value one = chlo::getConstantLike(rewriter, loc, 1.0, x);
|
|
Value erf_approx =
|
|
MaterializeErfApproximationF32ForMagnitudeLEOne(rewriter, loc, x);
|
|
Value erf_based_approx = rewriter.create<mhlo::SubOp>(loc, one, erf_approx);
|
|
|
|
// Materialize approximation selection based on argument.
|
|
const StringAttr kLT = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::LT));
|
|
Value abs_x = rewriter.create<mhlo::AbsOp>(loc, x);
|
|
Value abs_x_lt_one = rewriter.create<mhlo::CompareOp>(loc, abs_x, one, kLT);
|
|
return rewriter.create<mhlo::SelectOp>(loc, abs_x_lt_one, erf_based_approx,
|
|
erfc_approx);
|
|
}
|
|
|
|
Value MaterializeWithUpcast(ConversionPatternRewriter &rewriter, Location loc,
|
|
Value arg, FloatType min_precision_ty,
|
|
Value callback(ConversionPatternRewriter &,
|
|
Location, Value)) {
|
|
auto original_ty = getElementTypeOrSelf(arg.getType()).cast<FloatType>();
|
|
bool needs_upcast = original_ty.getWidth() < min_precision_ty.getWidth();
|
|
if (needs_upcast) {
|
|
arg = rewriter.create<mhlo::ConvertOp>(loc, arg, min_precision_ty);
|
|
}
|
|
Value result = callback(rewriter, loc, arg);
|
|
if (needs_upcast) {
|
|
result = rewriter.create<mhlo::ConvertOp>(loc, result, original_ty);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
struct ConvertErfOp : public OpConversionPattern<ErfOp> {
|
|
using OpConversionPattern<ErfOp>::OpConversionPattern;
|
|
LogicalResult matchAndRewrite(
|
|
ErfOp op, ArrayRef<Value> operands,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
Location loc = op.getLoc();
|
|
ErfOp::Adaptor transformed(operands);
|
|
Value x = transformed.operand();
|
|
Type ty = x.getType().cast<ShapedType>().getElementType();
|
|
|
|
// For now, we support only f64, f32, and f16.
|
|
if (!ty.isF64() && !ty.isF32() && !ty.isF16()) return failure();
|
|
|
|
if (ty.isF64()) {
|
|
rewriter.replaceOp(op, MaterializeErfApproximationF64(rewriter, loc, x));
|
|
return success();
|
|
}
|
|
|
|
rewriter.replaceOp(
|
|
op, MaterializeWithUpcast(rewriter, loc, x, rewriter.getF32Type(),
|
|
&MaterializeErfApproximationF32));
|
|
return success();
|
|
}
|
|
};
|
|
|
|
struct ConvertErfcOp : public OpConversionPattern<ErfcOp> {
|
|
using OpConversionPattern<ErfcOp>::OpConversionPattern;
|
|
LogicalResult matchAndRewrite(
|
|
ErfcOp op, ArrayRef<Value> operands,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
Location loc = op.getLoc();
|
|
ErfcOp::Adaptor transformed(operands);
|
|
Value x = transformed.operand();
|
|
Type ty = x.getType().cast<ShapedType>().getElementType();
|
|
|
|
// For now, we support only f64, f32, and f16.
|
|
if (!ty.isF64() && !ty.isF32() && !ty.isF16()) return failure();
|
|
|
|
if (ty.isF64()) {
|
|
rewriter.replaceOp(op, MaterializeErfcApproximationF64(rewriter, loc, x));
|
|
return success();
|
|
}
|
|
|
|
rewriter.replaceOp(
|
|
op, MaterializeWithUpcast(rewriter, loc, x, rewriter.getF32Type(),
|
|
&MaterializeErfcApproximationF32));
|
|
return success();
|
|
}
|
|
};
|
|
|
|
// Coefficients for the Lanczos approximation of the gamma function. The
|
|
// coefficients are uniquely determined by the choice of g and n (kLanczosGamma
|
|
// and kLanczosCoefficients.size() + 1). The coefficients below correspond to
|
|
// [7, 9]. [5, 7], [7, 9], [9, 10], and [607/128.0, 15] were evaluated and
|
|
// [7, 9] seemed to be the least sensitive to the quality of the log function.
|
|
// In particular, [5, 7] is the only choice where -1.5e-5 <= lgamma(2) <= 1.5e-5
|
|
// for a particularly inaccurate log function.
|
|
constexpr double kLanczosGamma = 7; // aka g
|
|
constexpr double kBaseLanczosCoeff = 0.99999999999980993227684700473478;
|
|
constexpr std::array<double, 8> kLanczosCoefficients = {
|
|
676.520368121885098567009190444019, -1259.13921672240287047156078755283,
|
|
771.3234287776530788486528258894, -176.61502916214059906584551354,
|
|
12.507343278686904814458936853, -0.13857109526572011689554707,
|
|
9.984369578019570859563e-6, 1.50563273514931155834e-7};
|
|
|
|
// Compute the Lgamma function using Lanczos' approximation from "A Precision
|
|
// Approximation of the Gamma Function". SIAM Journal on Numerical Analysis
|
|
// series B. Vol. 1:
|
|
// lgamma(z + 1) = (log(2) + log(pi)) / 2
|
|
// + (z + 1/2) * log(t(z))
|
|
// - t(z) + log(a(z))
|
|
// with t(z) = z + kLanczosGamma + 1/2
|
|
// a(z) = kBaseLanczosCoeff
|
|
// + sum(k = 1, n, kLanczosCoefficients[i] / (z + k))
|
|
Value MaterializeLgamma(ConversionPatternRewriter &rewriter, Location loc,
|
|
Value x) {
|
|
// If the input is less than 0.5 use Euler's reflection formula.
|
|
// gamma(x) = pi / (sin(pi * x) * gamma(1 - x))
|
|
// Let z be
|
|
// z = -x if x < 1/2
|
|
// z = x - 1 otheriwse
|
|
const StringAttr kLT = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::LT));
|
|
Value half = getConstantLike(rewriter, loc, 0.5, x);
|
|
Value need_to_reflect = rewriter.create<mhlo::CompareOp>(loc, x, half, kLT);
|
|
Value neg_x = rewriter.create<mhlo::NegOp>(loc, x);
|
|
Value one = getConstantLike(rewriter, loc, 1, x);
|
|
Value x_sub_one = rewriter.create<mhlo::SubOp>(loc, x, one);
|
|
Value z =
|
|
rewriter.create<mhlo::SelectOp>(loc, need_to_reflect, neg_x, x_sub_one);
|
|
|
|
// Materialize
|
|
// a(z) = kBaseLanczosCoeff
|
|
// + sum(k = 1, n, kLanczosCoefficients[i] / (z + k))
|
|
Value a = getConstantLike(rewriter, loc, kBaseLanczosCoeff, x);
|
|
for (int i = 0, end = kLanczosCoefficients.size(); i < end; ++i) {
|
|
Value coeff = getConstantLike(rewriter, loc, kLanczosCoefficients[i], x);
|
|
Value one_based_index = getConstantLike(rewriter, loc, i + 1, x);
|
|
Value quotient = rewriter.create<mhlo::DivOp>(
|
|
loc, coeff, rewriter.create<mhlo::AddOp>(loc, z, one_based_index));
|
|
a = rewriter.create<mhlo::AddOp>(loc, a, quotient);
|
|
}
|
|
|
|
// To improve accuracy on platforms with less-precise log implementations,
|
|
// compute log(kLanczosGamma + 1/2) at compile time and use log1p on the
|
|
// device.
|
|
// Materialize as
|
|
// log(t) = log(kLanczosGamma + 1/2 + z)
|
|
// = log(kLanczosGamma + 1/2) + log1p(z / (kLanczosGamma + 1/2)).
|
|
Value lanczos_plus_half =
|
|
getConstantLike(rewriter, loc, kLanczosGamma + 0.5, x);
|
|
Value t = rewriter.create<mhlo::AddOp>(loc, lanczos_plus_half, z);
|
|
Value log_term =
|
|
getConstantLike(rewriter, loc, std::log(kLanczosGamma + 0.5), x);
|
|
Value log1p_term = rewriter.create<mhlo::Log1pOp>(
|
|
loc, rewriter.create<mhlo::DivOp>(loc, z, lanczos_plus_half));
|
|
Value log_t = rewriter.create<mhlo::AddOp>(loc, log_term, log1p_term);
|
|
|
|
// Note that t(z) may be large and we need to be careful not to overflow to
|
|
// infinity in the relevant term
|
|
// r = (z + 1/2) * log(t(z)) - t(z).
|
|
// Therefore, we compute this as
|
|
// r = (z + 1/2 - t(z) / log(t(z))) * log(t(z)).
|
|
Value t_div_log_t = rewriter.create<mhlo::DivOp>(loc, t, log_t);
|
|
Value sum = rewriter.create<mhlo::SubOp>(
|
|
loc, rewriter.create<mhlo::AddOp>(loc, z, half), t_div_log_t);
|
|
Value r = rewriter.create<mhlo::MulOp>(loc, sum, log_t);
|
|
|
|
// Compute the final result (modulo reflection) as
|
|
// lgamma(z + 1) = (log(2) + log(pi)) / 2 + r + log(a(z)).
|
|
Value log_a = rewriter.create<mhlo::LogOp>(loc, a);
|
|
Value lgamma = rewriter.create<mhlo::AddOp>(
|
|
loc,
|
|
rewriter.create<mhlo::AddOp>(
|
|
loc,
|
|
getConstantLike(rewriter, loc, (std::log(2) + std::log(M_PI)) / 2, x),
|
|
r),
|
|
log_a);
|
|
|
|
// Compute the reflected value for x < 0.5 as
|
|
// lgamma(x) = log(pi) - lgamma(1-x) - log(abs(sin(pi * x))).
|
|
//
|
|
// The abs is needed because lgamma is the log of the absolute value of the
|
|
// gamma function.
|
|
//
|
|
// We have to be careful when computing the final term above. gamma(x) goes
|
|
// to +/-inf at every integer x < 0, and this is controlled by the sin(pi * x)
|
|
// term. The slope is large, so precision is particularly important.
|
|
//
|
|
// Because abs(sin(pi * x)) has period of 1 we can equivalently use
|
|
// abs(sin(pi * frac(x))) where frac(x) is the fractional part of x. This is
|
|
// more numerically accurate: It doesn't overflow to inf like pi * x would and
|
|
// if x is an integer it evaluates to exactly 0 which is important because we
|
|
// then take the log of this value, and log(0) is inf.
|
|
//
|
|
// We don't have a frac(x) primitive in HLO and computing it is tricky, but
|
|
// because abs(sin(pi * x)) = abs(sin(pi * abs(x))), it's good enough for our
|
|
// purposes to use abs(frac(x)) = abs(x) - floor(abs(x)).
|
|
//
|
|
// Furthermore, pi * abs(frac(x)) loses precision when abs(frac(x)) is close
|
|
// to 1. To remedy this, we can use the fact that sin(pi * x) in the domain
|
|
// [0, 1] is symmetric across the line Y=0.5.
|
|
//
|
|
|
|
// Convert values of abs_frac > 0.5 to (1 - abs_frac) to improve precision of
|
|
// pi * abs_frac for values of abs_frac close to 1.
|
|
Value abs = rewriter.create<mhlo::AbsOp>(loc, x);
|
|
Value abs_frac = rewriter.create<mhlo::SubOp>(
|
|
loc, abs, rewriter.create<mhlo::FloorOp>(loc, abs));
|
|
Value reduce_abs_frac =
|
|
rewriter.create<mhlo::CompareOp>(loc, half, abs_frac, kLT);
|
|
abs_frac = rewriter.create<mhlo::SelectOp>(
|
|
loc, reduce_abs_frac, rewriter.create<mhlo::SubOp>(loc, one, abs_frac),
|
|
abs_frac);
|
|
|
|
// Materialize reflection.
|
|
Value reflection_denom = rewriter.create<mhlo::LogOp>(
|
|
loc,
|
|
rewriter.create<mhlo::SinOp>(
|
|
loc, rewriter.create<mhlo::MulOp>(
|
|
loc, getConstantLike(rewriter, loc, M_PI, x), abs_frac)));
|
|
Value lgamma_reflection = rewriter.create<mhlo::SubOp>(
|
|
loc,
|
|
rewriter.create<mhlo::SubOp>(
|
|
loc, getConstantLike(rewriter, loc, std::log(M_PI), x),
|
|
reflection_denom),
|
|
lgamma);
|
|
|
|
// Avoid computing -inf - inf, which is nan. If reflection_denom is +/-inf,
|
|
// then it "wins" and the result is +/-inf.
|
|
Value finite_reflection_denom =
|
|
rewriter.create<mhlo::IsFiniteOp>(loc, reflection_denom);
|
|
Value neg_reflection_denom =
|
|
rewriter.create<mhlo::NegOp>(loc, reflection_denom);
|
|
lgamma_reflection = rewriter.create<mhlo::SelectOp>(
|
|
loc, finite_reflection_denom, lgamma_reflection, neg_reflection_denom);
|
|
|
|
// Select whether or not to rely on the reflection.
|
|
lgamma = rewriter.create<mhlo::SelectOp>(loc, need_to_reflect,
|
|
lgamma_reflection, lgamma);
|
|
|
|
// Materialize +/-inf behavior as
|
|
// lgamma(+/-inf) = +inf.
|
|
Value x_is_inf = rewriter.create<chlo::IsInfOp>(loc, x);
|
|
return rewriter.create<mhlo::SelectOp>(
|
|
loc, x_is_inf,
|
|
chlo::getConstantLikeInfValue(rewriter, loc, x, /*negative=*/false),
|
|
lgamma);
|
|
}
|
|
|
|
// Compute the Digamma function using Lanczos' approximation from "A Precision
|
|
// Approximation of the Gamma Function". SIAM Journal on Numerical Analysis
|
|
// series B. Vol. 1:
|
|
// digamma(z + 1) = log(t(z)) + a'(z) / a(z) - kLanczosGamma / t(z)
|
|
// with t(z) = z + kLanczosGamma + 1/2
|
|
// a(z) = kBaseLanczosCoeff
|
|
// + sum(k = 1, n, kLanczosCoefficients[i] / (z + k))
|
|
// a'(z) = - sum(k = 1, n, kLanczosCoefficients[i] / (z + k) / (z + k))
|
|
Value MaterializeDigamma(ConversionPatternRewriter &rewriter, Location loc,
|
|
Value x) {
|
|
// If the input is less than 0.5 use Euler's reflection formula.
|
|
// digamma(x) = digamma(1 - x) - pi * cot(pi * x)
|
|
// Let z be
|
|
// z = -x if x < 1/2
|
|
// z = x - 1 otheriwse
|
|
const StringAttr kLT = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::LT));
|
|
Value half = getConstantLike(rewriter, loc, 0.5, x);
|
|
Value need_to_reflect = rewriter.create<mhlo::CompareOp>(loc, x, half, kLT);
|
|
Value neg_x = rewriter.create<mhlo::NegOp>(loc, x);
|
|
Value one = getConstantLike(rewriter, loc, 1, x);
|
|
Value x_sub_one = rewriter.create<mhlo::SubOp>(loc, x, one);
|
|
Value z =
|
|
rewriter.create<mhlo::SelectOp>(loc, need_to_reflect, neg_x, x_sub_one);
|
|
|
|
// Materialize
|
|
// a(z) = kBaseLanczosCoeff
|
|
// + sum(k = 1, n, kLanczosCoefficients[i] / (z + k))
|
|
// a'(z) = - sum(k = 1, n, kLanczosCoefficients[i] / (z + k) / (z + k))
|
|
Value zero = getConstantLike(rewriter, loc, 0.0, x);
|
|
Value a = getConstantLike(rewriter, loc, kBaseLanczosCoeff, x);
|
|
Value a_prime = zero;
|
|
for (int i = 0, end = kLanczosCoefficients.size(); i < end; ++i) {
|
|
Value coeff = getConstantLike(rewriter, loc, kLanczosCoefficients[i], x);
|
|
Value one_based_index = getConstantLike(rewriter, loc, i + 1, x);
|
|
Value z_term = rewriter.create<mhlo::AddOp>(loc, z, one_based_index);
|
|
a_prime = rewriter.create<mhlo::SubOp>(
|
|
loc, a_prime,
|
|
rewriter.create<mhlo::DivOp>(
|
|
loc, coeff, rewriter.create<mhlo::MulOp>(loc, z_term, z_term)));
|
|
a = rewriter.create<mhlo::AddOp>(
|
|
loc, a, rewriter.create<mhlo::DivOp>(loc, coeff, z_term));
|
|
}
|
|
|
|
// To improve accuracy on platforms with less-precise log implementations,
|
|
// compute log(kLanczosGamma + 1/2) at compile time and use log1p on the
|
|
// device.
|
|
// Materialize as
|
|
// log(t) = log(kLanczosGamma + 1/2 + z)
|
|
// = log(kLanczosGamma + 1/2) + log1p(z / (kLanczosGamma + 1/2)).
|
|
Value lanczos_plus_half =
|
|
getConstantLike(rewriter, loc, kLanczosGamma + 0.5, x);
|
|
Value t = rewriter.create<mhlo::AddOp>(loc, lanczos_plus_half, z);
|
|
Value log_term =
|
|
getConstantLike(rewriter, loc, std::log(kLanczosGamma + 0.5), x);
|
|
Value log1p_term = rewriter.create<mhlo::Log1pOp>(
|
|
loc, rewriter.create<mhlo::DivOp>(loc, z, lanczos_plus_half));
|
|
Value log_t = rewriter.create<mhlo::AddOp>(loc, log_term, log1p_term);
|
|
|
|
// Materialize the final result (modulo reflection) as
|
|
// digamma(z + 1) = log(t(z)) + a'(z) / a(z) - kLanczosGamma / t(z).
|
|
Value a_prime_div_a = rewriter.create<mhlo::DivOp>(loc, a_prime, a);
|
|
Value lanczos_gamma_div_t = rewriter.create<mhlo::DivOp>(
|
|
loc, getConstantLike(rewriter, loc, kLanczosGamma, x), t);
|
|
Value digamma = rewriter.create<mhlo::SubOp>(
|
|
loc, rewriter.create<mhlo::AddOp>(loc, log_t, a_prime_div_a),
|
|
lanczos_gamma_div_t);
|
|
|
|
// We need to be careful how we compute cot(pi * input) below: For
|
|
// near-integral arguments, pi * input can lose precision.
|
|
//
|
|
// Input is already known to be less than 0.5 (otherwise we don't have to
|
|
// reflect). We shift values smaller than -0.5 into the range [-0.5, 0.5] to
|
|
// increase precision of pi * x and the resulting cotangent.
|
|
Value reduced_x = rewriter.create<mhlo::AddOp>(
|
|
loc, x,
|
|
rewriter.create<mhlo::AbsOp>(
|
|
loc, rewriter.create<mhlo::FloorOp>(
|
|
loc, rewriter.create<mhlo::AddOp>(
|
|
loc, x, getConstantLike(rewriter, loc, 0.5, x)))));
|
|
|
|
// Materialize reflection for inputs less than 0.5 as
|
|
// digamma(x) = digamma(1 - x) - pi * cot(pi * x)
|
|
// = digamma(1 - x) - pi * cos(pi * x) / sin(pi * x)
|
|
Value pi = getConstantLike(rewriter, loc, M_PI, x);
|
|
Value pi_mul_reduced_x = rewriter.create<mhlo::MulOp>(loc, pi, reduced_x);
|
|
Value cos = rewriter.create<mhlo::CosOp>(loc, pi_mul_reduced_x);
|
|
Value sin = rewriter.create<mhlo::SinOp>(loc, pi_mul_reduced_x);
|
|
Value reflection = rewriter.create<mhlo::SubOp>(
|
|
loc, digamma,
|
|
rewriter.create<mhlo::DivOp>(
|
|
loc, rewriter.create<mhlo::MulOp>(loc, pi, cos), sin));
|
|
|
|
// Select whether or not to rely on the reflection.
|
|
digamma = rewriter.create<mhlo::SelectOp>(loc, need_to_reflect, reflection,
|
|
digamma);
|
|
|
|
// Digamma has poles at negative integers and zero; return nan for those.
|
|
const StringAttr kLE = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::LE));
|
|
Value is_le_zero = rewriter.create<mhlo::CompareOp>(loc, x, zero, kLE);
|
|
const StringAttr kEQ = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::EQ));
|
|
Value is_int = rewriter.create<mhlo::CompareOp>(
|
|
loc, x, rewriter.create<mhlo::FloorOp>(loc, x), kEQ);
|
|
Value is_pole = rewriter.create<mhlo::AndOp>(loc, is_le_zero, is_int);
|
|
return rewriter.create<mhlo::SelectOp>(
|
|
loc, is_pole,
|
|
getConstantLike(rewriter, loc, std::numeric_limits<double>::quiet_NaN(),
|
|
x),
|
|
digamma);
|
|
}
|
|
|
|
struct ConvertLgammaOp : public OpConversionPattern<LgammaOp> {
|
|
using OpConversionPattern<LgammaOp>::OpConversionPattern;
|
|
LogicalResult matchAndRewrite(
|
|
LgammaOp op, ArrayRef<Value> operands,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
LgammaOp::Adaptor transformed(operands);
|
|
FloatType min_precision_ty = rewriter.getF32Type();
|
|
rewriter.replaceOp(
|
|
op, MaterializeWithUpcast(rewriter, op.getLoc(), transformed.operand(),
|
|
min_precision_ty, &MaterializeLgamma));
|
|
return success();
|
|
}
|
|
};
|
|
|
|
struct ConvertDigammaOp : public OpConversionPattern<DigammaOp> {
|
|
using OpConversionPattern<DigammaOp>::OpConversionPattern;
|
|
LogicalResult matchAndRewrite(
|
|
DigammaOp op, ArrayRef<Value> operands,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
DigammaOp::Adaptor transformed(operands);
|
|
FloatType min_precision_ty = rewriter.getF32Type();
|
|
rewriter.replaceOp(
|
|
op, MaterializeWithUpcast(rewriter, op.getLoc(), transformed.operand(),
|
|
min_precision_ty, &MaterializeDigamma));
|
|
return success();
|
|
}
|
|
};
|
|
|
|
Value MaterializeZetaComputation(ConversionPatternRewriter &rewriter,
|
|
Location loc, Value x, Value q) {
|
|
static const std::array<double, 12> kZetaCoeffs{
|
|
-7.1661652561756670113e18,
|
|
1.8152105401943546773e17,
|
|
-4.5979787224074726105e15,
|
|
1.1646782814350067249e14,
|
|
-2.950130727918164224e12,
|
|
7.47242496e10,
|
|
-1.8924375803183791606e9,
|
|
47900160.0,
|
|
-1209600.0,
|
|
30240.0,
|
|
-720.0,
|
|
12.0,
|
|
};
|
|
|
|
// For speed we'll always use 9 iterations for the initial series estimate,
|
|
// and a 12 term expansion for the Euler-Maclaurin formula.
|
|
Value a = q;
|
|
Value zero_like_a = chlo::getConstantLike(rewriter, loc, 0.0, a);
|
|
Value neg_power = zero_like_a;
|
|
Value neg_x = rewriter.create<mhlo::NegOp>(loc, x);
|
|
Value initial_sum = rewriter.create<mhlo::PowOp>(loc, q, neg_x);
|
|
Value one_like_a = chlo::getConstantLike(rewriter, loc, 1.0, a);
|
|
for (int i = 0; i < 9; ++i) {
|
|
a = rewriter.create<mhlo::AddOp>(loc, a, one_like_a);
|
|
neg_power = rewriter.create<mhlo::PowOp>(loc, a, neg_x);
|
|
initial_sum = rewriter.create<mhlo::AddOp>(loc, initial_sum, neg_power);
|
|
}
|
|
a = rewriter.create<mhlo::AddOp>(loc, a, one_like_a);
|
|
neg_power = rewriter.create<mhlo::PowOp>(loc, a, neg_x);
|
|
Value one_like_x = chlo::getConstantLike(rewriter, loc, 1.0, x);
|
|
Value x_minus_one = rewriter.create<mhlo::SubOp>(loc, x, one_like_x);
|
|
Value neg_power_mul_a = rewriter.create<mhlo::MulOp>(loc, neg_power, a);
|
|
Value neg_power_mul_a_div_x_minus_one =
|
|
rewriter.create<mhlo::DivOp>(loc, neg_power_mul_a, x_minus_one);
|
|
Value s = rewriter.create<mhlo::AddOp>(loc, initial_sum,
|
|
neg_power_mul_a_div_x_minus_one);
|
|
Value a_inverse_square = rewriter.create<mhlo::DivOp>(
|
|
loc, one_like_a, rewriter.create<mhlo::MulOp>(loc, a, a));
|
|
|
|
Value horner_sum = zero_like_a;
|
|
Value factor = one_like_a;
|
|
// Use Horner's rule for this.
|
|
// Note this differs from Cephes which does a 'naive' polynomial evaluation.
|
|
// Using Horner's rule allows to avoid some NaN's and Infs from happening,
|
|
// resulting in more numerically stable code.
|
|
for (int i = 0; i < 11; ++i) {
|
|
Value factor_lhs = rewriter.create<mhlo::SubOp>(
|
|
loc, x, chlo::getConstantLike(rewriter, loc, 22 - 2 * i, x));
|
|
Value factor_rhs = rewriter.create<mhlo::SubOp>(
|
|
loc, x, chlo::getConstantLike(rewriter, loc, 21 - 2 * i, x));
|
|
factor = rewriter.create<mhlo::MulOp>(loc, factor_lhs, factor_rhs);
|
|
horner_sum = rewriter.create<mhlo::MulOp>(
|
|
loc, factor,
|
|
rewriter.create<mhlo::MulOp>(
|
|
loc, a_inverse_square,
|
|
rewriter.create<mhlo::AddOp>(
|
|
loc, horner_sum,
|
|
chlo::getConstantLike(rewriter, loc, 1. / kZetaCoeffs[i], a))));
|
|
}
|
|
Value zero_point_five_like_neg_power =
|
|
chlo::getConstantLike(rewriter, loc, .5, neg_power);
|
|
Value x_div_a = rewriter.create<mhlo::DivOp>(loc, x, a);
|
|
s = rewriter.create<mhlo::AddOp>(
|
|
loc, s,
|
|
rewriter.create<mhlo::MulOp>(
|
|
loc, neg_power,
|
|
rewriter.create<mhlo::AddOp>(
|
|
loc, zero_point_five_like_neg_power,
|
|
rewriter.create<mhlo::MulOp>(
|
|
loc, x_div_a,
|
|
rewriter.create<mhlo::AddOp>(
|
|
loc,
|
|
chlo::getConstantLike(rewriter, loc, 1. / kZetaCoeffs[11],
|
|
a),
|
|
horner_sum)))));
|
|
const double nan = std::numeric_limits<double>::quiet_NaN();
|
|
const double inf = std::numeric_limits<double>::infinity();
|
|
// Use the initial zeta sum without the correction term coming
|
|
// from Euler-Maclaurin if it is accurate enough.
|
|
const StringAttr kLT = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::LT));
|
|
Value abs_neg_power = rewriter.create<mhlo::AbsOp>(loc, neg_power);
|
|
Value abs_initial_sum = rewriter.create<mhlo::AbsOp>(loc, initial_sum);
|
|
Value output = rewriter.create<mhlo::SelectOp>(
|
|
loc,
|
|
rewriter.create<mhlo::CompareOp>(
|
|
loc, abs_neg_power,
|
|
rewriter.create<mhlo::MulOp>(
|
|
loc, abs_initial_sum,
|
|
chlo::getConstantLikeSmallestFiniteValue(rewriter, loc, a)),
|
|
kLT),
|
|
initial_sum, s);
|
|
// This is the harmonic series.
|
|
const StringAttr kEQ = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::EQ));
|
|
Value inf_like_x = chlo::getConstantLike(rewriter, loc, inf, x);
|
|
output = rewriter.create<mhlo::SelectOp>(
|
|
loc, rewriter.create<mhlo::CompareOp>(loc, x, one_like_x, kEQ),
|
|
inf_like_x, output);
|
|
// Function is not defined for x < 1.
|
|
Value nan_like_x = chlo::getConstantLike(rewriter, loc, nan, x);
|
|
output = rewriter.create<mhlo::SelectOp>(
|
|
loc, rewriter.create<mhlo::CompareOp>(loc, x, one_like_x, kLT),
|
|
nan_like_x, output);
|
|
// If q <= 0, then when q is an integer or x is not an integer, this is
|
|
// NaN.
|
|
const StringAttr kLE = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::LE));
|
|
const StringAttr kNE = rewriter.getStringAttr(
|
|
mhlo::stringifyComparisonDirection(mhlo::ComparisonDirection::NE));
|
|
Value zero_like_q = chlo::getConstantLike(rewriter, loc, 0.0, q);
|
|
Value q_le_zero = rewriter.create<mhlo::CompareOp>(loc, q, zero_like_q, kLE);
|
|
Value domain_error = rewriter.create<mhlo::AndOp>(
|
|
loc, q_le_zero,
|
|
rewriter.create<mhlo::CompareOp>(
|
|
loc, x, rewriter.create<mhlo::FloorOp>(loc, x), kNE));
|
|
Value negative_integer_q = rewriter.create<mhlo::AndOp>(
|
|
loc, q_le_zero,
|
|
rewriter.create<mhlo::CompareOp>(
|
|
loc, q, rewriter.create<mhlo::FloorOp>(loc, q), kEQ));
|
|
output = rewriter.create<mhlo::SelectOp>(loc, negative_integer_q, inf_like_x,
|
|
output);
|
|
output =
|
|
rewriter.create<mhlo::SelectOp>(loc, domain_error, nan_like_x, output);
|
|
return output;
|
|
}
|
|
|
|
struct ConvertZetaOp : public OpConversionPattern<ZetaOp> {
|
|
using OpConversionPattern<ZetaOp>::OpConversionPattern;
|
|
LogicalResult matchAndRewrite(
|
|
ZetaOp op, ArrayRef<Value> operands,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
ZetaOpAdaptor adaptor(operands);
|
|
Location loc = op.getLoc();
|
|
|
|
// Zeta is only defined on tensors of float elements and statically
|
|
// verified that both have the same type. So it suffices to look at one
|
|
// here.
|
|
auto elm_type = adaptor.x().getType().cast<ShapedType>().getElementType();
|
|
|
|
bool needs_upcast = elm_type.isF16() || elm_type.isBF16();
|
|
|
|
Value x = adaptor.x();
|
|
Value q = adaptor.q();
|
|
|
|
if (needs_upcast) {
|
|
x = rewriter.create<mhlo::ConvertOp>(loc, x, rewriter.getF32Type());
|
|
q = rewriter.create<mhlo::ConvertOp>(loc, q, rewriter.getF32Type());
|
|
}
|
|
Value result = MaterializeZetaComputation(rewriter, loc, x, q);
|
|
if (needs_upcast) {
|
|
result = rewriter.create<mhlo::ConvertOp>(loc, result, elm_type);
|
|
}
|
|
rewriter.replaceOp(op, {result});
|
|
|
|
return success();
|
|
}
|
|
};
|
|
|
|
// Converts binary ops that statically are determined to not broadcast directly
|
|
// to the corresponding mhlo non-broadcasting op.
|
|
template <typename ChloOpTy, typename HloOpTy, typename Adaptor>
|
|
struct ConvertTrivialNonBroadcastBinaryOp
|
|
: public OpConversionPattern<ChloOpTy> {
|
|
using OpConversionPattern<ChloOpTy>::OpConversionPattern;
|
|
LogicalResult matchAndRewrite(
|
|
ChloOpTy op, ArrayRef<Value> operands,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
// Only rewrite for statically determinable non-broadcasting cases.
|
|
typename ChloOpTy::Adaptor transformed(operands);
|
|
auto lhs_type =
|
|
transformed.lhs().getType().template dyn_cast<RankedTensorType>();
|
|
auto rhs_type =
|
|
transformed.rhs().getType().template dyn_cast<RankedTensorType>();
|
|
if (!lhs_type || !rhs_type) return failure();
|
|
|
|
// Requires rank broadcast.
|
|
if (lhs_type.getRank() != rhs_type.getRank()) return failure();
|
|
// Any dynamic dimension may require broadcasting and requires more
|
|
// analysis.
|
|
if (!lhs_type.hasStaticShape() || !rhs_type.hasStaticShape())
|
|
return failure();
|
|
|
|
for (auto extents : llvm::zip(lhs_type.getShape(), rhs_type.getShape())) {
|
|
auto lhs_extent = std::get<0>(extents);
|
|
auto rhs_extent = std::get<1>(extents);
|
|
if (lhs_extent != rhs_extent) {
|
|
return failure();
|
|
}
|
|
}
|
|
|
|
rewriter.replaceOp(
|
|
op, {Adaptor::CreateOp(op, op.getResult().getType(), operands[0],
|
|
operands[1], rewriter)});
|
|
return success();
|
|
}
|
|
};
|
|
|
|
// Converts a binary op with ranked broadcasting operands to explicitly
|
|
// broadcast and invoke the corresponding mhlo non-broadcasting op.
|
|
// Note that dynamic broadcasting supported by this pattern is only valid for
|
|
// "numpy" broadcasting semantics as defined here:
|
|
// https://docs.scipy.org/doc/numpy/reference/ufuncs.html
|
|
// Specifically, this includes the following cases:
|
|
// - Same rank broadcast (operands have the same static rank).
|
|
// - Different-rank broadcast, either without a broadcast_dims attribte or
|
|
// with the broadcast_dims attribute set to map to a prefix padding.
|
|
// - Legal combinations of degenerate (1-dim) implicit broadcasting.
|
|
// The restriction on broadcast_dims derives from the definition of the
|
|
// `shape.broadcast` op, which only supports prefix-padding.
|
|
template <typename ChloOpTy, typename HloOpTy, typename Adaptor>
|
|
struct ConvertRankedDynamicBroadcastBinaryOp
|
|
: public OpConversionPattern<ChloOpTy> {
|
|
using OpConversionPattern<ChloOpTy>::OpConversionPattern;
|
|
LogicalResult matchAndRewrite(
|
|
ChloOpTy op, ArrayRef<Value> operands,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
// Only support ranked operands.
|
|
typename ChloOpTy::Adaptor transformed(operands);
|
|
Value lhs = transformed.lhs();
|
|
Value rhs = transformed.rhs();
|
|
auto lhs_type = lhs.getType().dyn_cast<RankedTensorType>();
|
|
auto rhs_type = rhs.getType().dyn_cast<RankedTensorType>();
|
|
auto result_type =
|
|
op.getResult().getType().template dyn_cast<RankedTensorType>();
|
|
if (!lhs_type || !rhs_type || !result_type) return failure();
|
|
|
|
// Check for "numpy"-style rank broadcast.
|
|
auto broadcast_dimensions = op.broadcast_dimensions();
|
|
if (broadcast_dimensions &&
|
|
!hlo::IsLegalNumpyRankedBroadcast(lhs, rhs, *broadcast_dimensions)) {
|
|
// Note: It is unclear whether the general specification of explicit
|
|
// broadcast_dimensions on binary ops is a feature we want to carry
|
|
// forward. While it can technically be implemented for ranked-dynamic,
|
|
// it is incompatible with unranked inputs. If this warning is emitted
|
|
// in real programs, it is an indication that the feature should be
|
|
// implemented versus just falling back on the more standard definition
|
|
// of numpy-like prefix-padding.
|
|
op.emitWarning() << "unsupported non prefix-padded dynamic rank "
|
|
<< "broadcast_dimensions = " << *broadcast_dimensions;
|
|
return failure();
|
|
}
|
|
|
|
// Compute result shape.
|
|
auto loc = op.getLoc();
|
|
|
|
// Insert a constraint on the shapes being broadcastable and insert all
|
|
// future code into an assuming block reliant on the constraint.
|
|
Value lhs_shape = rewriter.create<shape::ShapeOfOp>(loc, lhs);
|
|
Value rhs_shape = rewriter.create<shape::ShapeOfOp>(loc, rhs);
|
|
auto broadcastable_cstr =
|
|
rewriter.create<shape::CstrBroadcastableOp>(loc, lhs_shape, rhs_shape);
|
|
auto assuming_op = rewriter.create<shape::AssumingOp>(
|
|
loc, ArrayRef<Type>{result_type}, broadcastable_cstr.result());
|
|
|
|
OpBuilder::InsertionGuard guard(rewriter);
|
|
rewriter.createBlock(&assuming_op.doRegion());
|
|
|
|
int64_t result_rank = std::max(lhs_type.getRank(), rhs_type.getRank());
|
|
Value result_extents =
|
|
hlo::ComputeBinaryElementwiseBroadcastingResultExtents(
|
|
loc, lhs, rhs, rewriter, /*unsafe_as_extent_tensor=*/true);
|
|
|
|
// Note that we unconditionally emit DynamicBroadcastInDim ops and let
|
|
// downstream canonicalizations fold them away if possible. This is
|
|
// because, in the dynamic case, there are many corner cases regarding
|
|
// when it is safe to omit, and some of them require analysis to prove
|
|
// properly.
|
|
auto lhs_broadcast_dimensions = llvm::to_vector<4>(
|
|
llvm::seq<int64_t>(result_rank - lhs_type.getRank(), result_rank));
|
|
Value broadcasted_lhs = rewriter.create<mhlo::DynamicBroadcastInDimOp>(
|
|
loc,
|
|
RankedTensorType::get(result_type.getShape(),
|
|
lhs_type.getElementType()),
|
|
lhs, result_extents,
|
|
rewriter.getI64TensorAttr(lhs_broadcast_dimensions));
|
|
auto rhs_broadcast_dimensions = llvm::to_vector<4>(
|
|
llvm::seq<int64_t>(result_rank - rhs_type.getRank(), result_rank));
|
|
Value broadcasted_rhs = rewriter.create<mhlo::DynamicBroadcastInDimOp>(
|
|
loc,
|
|
RankedTensorType::get(result_type.getShape(),
|
|
rhs_type.getElementType()),
|
|
rhs, result_extents,
|
|
rewriter.getI64TensorAttr(rhs_broadcast_dimensions));
|
|
|
|
// And generate the final non-broadcasted binary op.
|
|
Value final_result = Adaptor::CreateOp(op, result_type, broadcasted_lhs,
|
|
broadcasted_rhs, rewriter);
|
|
rewriter.create<shape::AssumingYieldOp>(loc, final_result);
|
|
rewriter.replaceOp(op, {assuming_op.getResult(0)});
|
|
return success();
|
|
}
|
|
};
|
|
|
|
#include "generated_chlo_legalize_to_hlo.inc"
|
|
} // namespace
|
|
|
|
void PopulateChloBroadcastingPatterns(MLIRContext *context,
|
|
OwningRewritePatternList *patterns) {
|
|
// Instantiate conversion templates for conforming binary elementwise ops
|
|
// that do not have different dtypes between operands and results and do
|
|
// not have special attributes that need to be preserved.
|
|
PopulateForBroadcastingBinaryOp<ConvertTrivialNonBroadcastBinaryOp>(
|
|
context, patterns, 10);
|
|
PopulateForBroadcastingBinaryOp<ConvertRankedDynamicBroadcastBinaryOp>(
|
|
context, patterns, 5);
|
|
}
|
|
|
|
void PopulateLegalizeChloToHloPatterns(MLIRContext *context,
|
|
OwningRewritePatternList *patterns) {
|
|
populateWithGenerated(context, *patterns);
|
|
PopulateChloBroadcastingPatterns(context, patterns);
|
|
|
|
// Other patterns.
|
|
// clang-format off
|
|
patterns->insert<ConvertConstantLikeOp,
|
|
ConvertDigammaOp,
|
|
ConvertErfOp,
|
|
ConvertErfcOp,
|
|
ConvertLgammaOp,
|
|
ConvertZetaOp>(context);
|
|
// clang-format on
|
|
}
|
|
|
|
} // namespace chlo
|
|
} // namespace mlir
|