Replace std.load/std.store by affine.load/affine.store (#180)
* Move to more recent LLVM ID (May 15)
* clang-format
* Bump cache version up
* Update readme
* Fix doc check
* Move to a newer commit id
* Update LoopToStandard -> SCFToStandard
* Change MLIRSideEffects to MLIRSideEffectInterfaces
* Add AffineScope trait to KrnlIterateOp
* [ElementWise] Load/Store op to AffineLoad/AffineStore op
* [Gemm, MatMul, Reduction, Softmax] Load/Store op to AffineLoad/AffineStore op
* [Concat] Load/Store op to AffineLoad/AffineStore op
* [Pad, PadConstantValuePad, Reshape, Transpose] Load/Store op to AffineLoad/AffineStore op
* [LSTM] Load/Store op to AffineLoad/AffineStore op
* [Conv, Norm, Pooling] Load/Store op to AffineLoad/AffineStore op
* Add affine-loop-fusion pass
* Use Load/Store for scalar
* Use Load/Store for scalar
* Fix lit tests
* Unknown dimensions for broadcasting ops
* Affine Load/Store for scalar memref
* clang-format

Co-authored-by: Gheorghe-Teodor Bercea <gt.bercea@gmail.com>
Co-authored-by: Tian Jin <tjingrant@gmail.com>
This commit is contained in:
parent
2c8f5701bd
commit
7e05f371de
|
@ -619,20 +619,35 @@ struct ONNXElementwiseVariadicOpLowering : public ConversionPattern {
|
|||
for (auto arg : iterationBlock.getArguments())
|
||||
loopIVs.push_back(arg);
|
||||
}
|
||||
// Fold over operands for each of their scalar values
|
||||
// Fold over operands for each of their scalar values.
|
||||
Value accumulated, next;
|
||||
auto accumulatedLoopIVs = getLoopIVsForBroadcasting(
|
||||
// Obtain the first operand.
|
||||
std::vector<Value> accumulatedLoopIVs = getLoopIVsForBroadcasting(
|
||||
loc, rewriter, loopIVs, operands[0], broadcastedDimInfo[0]);
|
||||
accumulated = rewriter.create<LoadOp>(loc, operands[0], accumulatedLoopIVs);
|
||||
if (!hasAllConstantDimensions(memRefType))
|
||||
// In case of unknown dimensions, use std.load since
|
||||
// 'getLoopIVsForBroadcasting' has not supported affine map so far.
|
||||
accumulated =
|
||||
rewriter.create<LoadOp>(loc, operands[0], accumulatedLoopIVs);
|
||||
else
|
||||
accumulated =
|
||||
rewriter.create<AffineLoadOp>(loc, operands[0], accumulatedLoopIVs);
|
||||
// Iterate over the remaining operands.
|
||||
for (unsigned i = 1; i < numArgs; i++) {
|
||||
auto nextLoopIVs = getLoopIVsForBroadcasting(
|
||||
std::vector<Value> nextLoopIVs = getLoopIVsForBroadcasting(
|
||||
loc, rewriter, loopIVs, operands[i], broadcastedDimInfo[i]);
|
||||
next = rewriter.create<LoadOp>(loc, operands[i], nextLoopIVs);
|
||||
if (!hasAllConstantDimensions(memRefType))
|
||||
// In case of unknown dimensions, use std.load since
|
||||
// 'getLoopIVsForBroadcasting' has not supported affine map so far.
|
||||
next = rewriter.create<LoadOp>(loc, operands[i], nextLoopIVs);
|
||||
else
|
||||
next = rewriter.create<AffineLoadOp>(loc, operands[i], nextLoopIVs);
|
||||
accumulated = emitScalarOpFor<ElementwiseVariadicOp>(
|
||||
rewriter, loc, op, memRefType.getElementType(), {accumulated, next});
|
||||
}
|
||||
|
||||
// Store result in the resulting array.
|
||||
rewriter.create<StoreOp>(loc, accumulated, alloc, loopIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopIVs);
|
||||
|
||||
rewriter.replaceOp(op, alloc);
|
||||
|
||||
|
|
|
@ -156,23 +156,23 @@ struct ONNXGemmOpLowering : public ConversionPattern {
|
|||
|
||||
// Initialize the output of A*B
|
||||
auto zero = emitConstantOp(rewriter, loc, memRefType.getElementType(), 0);
|
||||
rewriter.create<StoreOp>(loc, zero, alloc, loopMNIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, zero, alloc, loopMNIVs);
|
||||
|
||||
// Compute A*B
|
||||
auto matmulIterateOp = rewriter.create<KrnlIterateOp>(loc, reductionPack);
|
||||
|
||||
// Compute beta*C, and add up to alpha*A*B (unidirectional broadcasting)
|
||||
auto loadedAB = rewriter.create<LoadOp>(loc, alloc, loopMNIVs);
|
||||
auto loadedAB = rewriter.create<AffineLoadOp>(loc, alloc, loopMNIVs);
|
||||
auto alphaAB = rewriter.create<MulFOp>(loc, alpha, loadedAB);
|
||||
if (hasBias) {
|
||||
auto loopCIVs = getLoopIVsForBroadcasting(
|
||||
loc, rewriter, loopMNIVs, C, broadcastedDimInfo);
|
||||
auto loadedC = rewriter.create<LoadOp>(loc, C, loopCIVs);
|
||||
auto loadedC = rewriter.create<AffineLoadOp>(loc, C, loopCIVs);
|
||||
auto betaC = rewriter.create<MulFOp>(loc, beta, loadedC);
|
||||
auto Y = rewriter.create<AddFOp>(loc, alphaAB, betaC);
|
||||
rewriter.create<StoreOp>(loc, Y, alloc, loopMNIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, Y, alloc, loopMNIVs);
|
||||
} else {
|
||||
rewriter.create<StoreOp>(loc, alphaAB, alloc, loopMNIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, alphaAB, alloc, loopMNIVs);
|
||||
}
|
||||
|
||||
// Insert instructions to do matrix multiplication: A*B
|
||||
|
@ -199,12 +199,12 @@ struct ONNXGemmOpLowering : public ConversionPattern {
|
|||
}
|
||||
|
||||
// Matmul computation
|
||||
auto loadedA = rewriter.create<LoadOp>(loc, A, loopAIVs);
|
||||
auto loadedB = rewriter.create<LoadOp>(loc, B, loopBIVs);
|
||||
auto loadedY = rewriter.create<LoadOp>(loc, alloc, loopMNIVs);
|
||||
auto loadedA = rewriter.create<AffineLoadOp>(loc, A, loopAIVs);
|
||||
auto loadedB = rewriter.create<AffineLoadOp>(loc, B, loopBIVs);
|
||||
auto loadedY = rewriter.create<AffineLoadOp>(loc, alloc, loopMNIVs);
|
||||
auto AB = rewriter.create<MulFOp>(loc, loadedA, loadedB);
|
||||
auto accumulated = rewriter.create<AddFOp>(loc, loadedY, AB);
|
||||
rewriter.create<StoreOp>(loc, accumulated, alloc, loopMNIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopMNIVs);
|
||||
|
||||
rewriter.replaceOp(op, alloc);
|
||||
|
||||
|
|
|
@ -221,7 +221,7 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
|
|||
}
|
||||
|
||||
// Fill the output with value 0.
|
||||
rewriter.create<StoreOp>(loc, zero, alloc, loopBatchMNIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, zero, alloc, loopBatchMNIVs);
|
||||
|
||||
// Iterate along the reduction dimension.
|
||||
// Use a value from A.
|
||||
|
@ -265,17 +265,17 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
|
|||
loopBatchKNIVs.emplace_back(loopMNIVs[0]);
|
||||
}
|
||||
// Matmul computation
|
||||
auto loadedA = rewriter.create<LoadOp>(loc, A, loopBatchMKIVs);
|
||||
auto loadedB = rewriter.create<LoadOp>(loc, B, loopBatchKNIVs);
|
||||
auto loadedY = rewriter.create<LoadOp>(loc, alloc, loopBatchMNIVs);
|
||||
auto loadedA = rewriter.create<AffineLoadOp>(loc, A, loopBatchMKIVs);
|
||||
auto loadedB = rewriter.create<AffineLoadOp>(loc, B, loopBatchKNIVs);
|
||||
auto loadedY = rewriter.create<AffineLoadOp>(loc, alloc, loopBatchMNIVs);
|
||||
if (elementType.isa<IntegerType>()) {
|
||||
auto AB = rewriter.create<MulIOp>(loc, loadedA, loadedB);
|
||||
auto accumulated = rewriter.create<AddIOp>(loc, loadedY, AB);
|
||||
rewriter.create<StoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
|
||||
} else if (elementType.isa<FloatType>()) {
|
||||
auto AB = rewriter.create<MulFOp>(loc, loadedA, loadedB);
|
||||
auto accumulated = rewriter.create<AddFOp>(loc, loadedY, AB);
|
||||
rewriter.create<StoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
|
||||
}
|
||||
} else if ((AShape.size() == 1) && (BShape.size() == 1)) {
|
||||
// Case 3:
|
||||
|
@ -283,7 +283,7 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
|
|||
|
||||
// Fill the output with value 0.
|
||||
Value zeroIndex = rewriter.create<ConstantIndexOp>(loc, 0);
|
||||
rewriter.create<StoreOp>(loc, zero, alloc, zeroIndex);
|
||||
rewriter.create<AffineStoreOp>(loc, zero, alloc, zeroIndex);
|
||||
|
||||
// Iterate along the reduction dimension.
|
||||
// Use a value from A.
|
||||
|
@ -310,17 +310,17 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
|
|||
loopKIVs.emplace_back(reduceIterationBlock.getArgument(0));
|
||||
|
||||
// Matmul computation
|
||||
auto loadedA = rewriter.create<LoadOp>(loc, A, loopKIVs);
|
||||
auto loadedB = rewriter.create<LoadOp>(loc, B, loopKIVs);
|
||||
auto loadedY = rewriter.create<LoadOp>(loc, alloc, zeroIndex);
|
||||
auto loadedA = rewriter.create<AffineLoadOp>(loc, A, loopKIVs);
|
||||
auto loadedB = rewriter.create<AffineLoadOp>(loc, B, loopKIVs);
|
||||
auto loadedY = rewriter.create<AffineLoadOp>(loc, alloc, zeroIndex);
|
||||
if (elementType.isa<IntegerType>()) {
|
||||
auto AB = rewriter.create<MulIOp>(loc, loadedA, loadedB);
|
||||
auto accumulated = rewriter.create<AddIOp>(loc, loadedY, AB);
|
||||
rewriter.create<StoreOp>(loc, accumulated, alloc, zeroIndex);
|
||||
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, zeroIndex);
|
||||
} else if (elementType.isa<FloatType>()) {
|
||||
auto AB = rewriter.create<MulFOp>(loc, loadedA, loadedB);
|
||||
auto accumulated = rewriter.create<AddFOp>(loc, loadedY, AB);
|
||||
rewriter.create<StoreOp>(loc, accumulated, alloc, zeroIndex);
|
||||
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, zeroIndex);
|
||||
}
|
||||
} else {
|
||||
// No scalar matrix multiplication.
|
||||
|
|
|
@ -212,7 +212,7 @@ struct ONNXReductionOpLowering : public ConversionPattern {
|
|||
|
||||
Value identity =
|
||||
getIdentityValue<ONNXReductionOp>(rewriter, loc, elementOutType);
|
||||
rewriter.create<StoreOp>(loc, identity, alloc, loopIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, identity, alloc, loopIVs);
|
||||
|
||||
// Define an Krnl loop to do reduction.
|
||||
rewriter.setInsertionPointAfter(iterateOpInit);
|
||||
|
@ -256,11 +256,11 @@ struct ONNXReductionOpLowering : public ConversionPattern {
|
|||
}
|
||||
|
||||
Value next, accumulated;
|
||||
next = rewriter.create<LoadOp>(loc, operands[0], inLoopIVs);
|
||||
accumulated = rewriter.create<LoadOp>(loc, alloc, outLoopIVs);
|
||||
next = rewriter.create<AffineLoadOp>(loc, operands[0], inLoopIVs);
|
||||
accumulated = rewriter.create<AffineLoadOp>(loc, alloc, outLoopIVs);
|
||||
accumulated = emitScalarOpFor<ONNXReductionOp>(
|
||||
rewriter, loc, op, memRefOutType.getElementType(), {accumulated, next});
|
||||
rewriter.create<StoreOp>(loc, accumulated, alloc, outLoopIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, outLoopIVs);
|
||||
|
||||
rewriter.replaceOp(op, alloc);
|
||||
return success();
|
||||
|
|
|
@ -104,8 +104,9 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
|
|||
outerLoopIVs.push_back(arg);
|
||||
|
||||
// Reset accumulators.
|
||||
rewriter.create<StoreOp>(loc, zero, sumOp);
|
||||
rewriter.create<StoreOp>(loc, negInfinity, maxOp);
|
||||
rewriter.create<AffineStoreOp>(loc, zero, sumOp, ArrayRef<Value>{});
|
||||
rewriter.create<AffineStoreOp>(
|
||||
loc, negInfinity, maxOp, ArrayRef<Value>{});
|
||||
|
||||
// Create an inner loop to compute max.
|
||||
maxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
|
||||
|
@ -115,8 +116,9 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
|
|||
softmaxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
|
||||
} else {
|
||||
// Reset accumulators.
|
||||
rewriter.create<StoreOp>(loc, zero, sumOp);
|
||||
rewriter.create<StoreOp>(loc, negInfinity, maxOp);
|
||||
rewriter.create<AffineStoreOp>(loc, zero, sumOp, ArrayRef<Value>{});
|
||||
rewriter.create<AffineStoreOp>(
|
||||
loc, negInfinity, maxOp, ArrayRef<Value>{});
|
||||
|
||||
// Create an inner loop to compute max.
|
||||
maxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
|
||||
|
@ -142,16 +144,16 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
|
|||
maxLoopIVs.push_back(arg);
|
||||
|
||||
// Compute the max value.
|
||||
Value max = rewriter.create<LoadOp>(loc, maxOp);
|
||||
Value nextMax = rewriter.create<LoadOp>(loc, input, maxLoopIVs);
|
||||
Value max = rewriter.create<AffineLoadOp>(loc, maxOp);
|
||||
Value nextMax = rewriter.create<AffineLoadOp>(loc, input, maxLoopIVs);
|
||||
auto maxCond =
|
||||
rewriter.create<CmpFOp>(loc, CmpFPredicate::OGT, max, nextMax);
|
||||
max = rewriter.create<SelectOp>(loc, maxCond, max, nextMax);
|
||||
rewriter.create<StoreOp>(loc, max, maxOp);
|
||||
rewriter.create<AffineStoreOp>(loc, max, maxOp, ArrayRef<Value>{});
|
||||
|
||||
// Get the max.
|
||||
rewriter.setInsertionPoint(sumIterateOp);
|
||||
max = rewriter.create<LoadOp>(loc, maxOp);
|
||||
max = rewriter.create<AffineLoadOp>(loc, maxOp);
|
||||
|
||||
// Insert instructions inside the sum loop.
|
||||
Block &sumIterationBlock = sumIterateOp.bodyRegion().front();
|
||||
|
@ -165,18 +167,18 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
|
|||
sumLoopIVs.push_back(arg);
|
||||
|
||||
// Sum up values.
|
||||
Value sum = rewriter.create<LoadOp>(loc, sumOp);
|
||||
Value next = rewriter.create<LoadOp>(loc, input, sumLoopIVs);
|
||||
Value sum = rewriter.create<AffineLoadOp>(loc, sumOp);
|
||||
Value next = rewriter.create<AffineLoadOp>(loc, input, sumLoopIVs);
|
||||
Value sub = rewriter.create<SubFOp>(loc, next, max);
|
||||
Value exp = rewriter.create<ExpOp>(loc, sub);
|
||||
sum = rewriter.create<AddFOp>(loc, sum, exp);
|
||||
rewriter.create<StoreOp>(loc, sum, sumOp);
|
||||
rewriter.create<AffineStoreOp>(loc, sum, sumOp, ArrayRef<Value>{});
|
||||
// Store intermediate values in the result to avoid recomputation.
|
||||
rewriter.create<StoreOp>(loc, exp, alloc, sumLoopIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, exp, alloc, sumLoopIVs);
|
||||
|
||||
// Get the sum.
|
||||
rewriter.setInsertionPoint(softmaxIterateOp);
|
||||
sum = rewriter.create<LoadOp>(loc, sumOp);
|
||||
sum = rewriter.create<AffineLoadOp>(loc, sumOp);
|
||||
|
||||
// Insert instructions inside the softmax loop.
|
||||
Block &softmaxIterationBlock = softmaxIterateOp.bodyRegion().front();
|
||||
|
@ -190,9 +192,10 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
|
|||
softmaxLoopIVs.push_back(arg);
|
||||
|
||||
// Compute softmax.
|
||||
Value expLoadedVal = rewriter.create<LoadOp>(loc, alloc, softmaxLoopIVs);
|
||||
Value expLoadedVal =
|
||||
rewriter.create<AffineLoadOp>(loc, alloc, softmaxLoopIVs);
|
||||
Value result = rewriter.create<DivFOp>(loc, expLoadedVal, sum);
|
||||
rewriter.create<StoreOp>(loc, result, alloc, softmaxLoopIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, result, alloc, softmaxLoopIVs);
|
||||
|
||||
rewriter.replaceOp(op, alloc);
|
||||
|
||||
|
|
|
@ -129,10 +129,14 @@ struct ONNXConvOpLowering : public ConversionPattern {
|
|||
if (group > 1) {
|
||||
// Middle loop is over groups and third loop is over the
|
||||
// kernel identifiers in the current group.
|
||||
auto kernelsOffset = rewriter.create<MulIOp>(
|
||||
loc, outerLoops.getInductionVar(gIndex), kernelsPerGroupValue);
|
||||
kernel = rewriter.create<AddIOp>(
|
||||
loc, kernelsOffset, outerLoops.getInductionVar(mIndex));
|
||||
AffineMap kernelMap = AffineMap::get(2, 1,
|
||||
/*gIndex=*/rewriter.getAffineDimExpr(0) *
|
||||
/*kernelsPerGroup=*/rewriter.getAffineSymbolExpr(0) +
|
||||
/*mIndex=*/rewriter.getAffineDimExpr(1));
|
||||
kernel = rewriter.create<AffineApplyOp>(loc, kernelMap,
|
||||
ArrayRef<Value>{/*gIndex=*/outerLoops.getInductionVar(gIndex),
|
||||
/*kernelsPerGroupValue=*/kernelsPerGroupValue,
|
||||
/*mIndex=*/outerLoops.getInductionVar(mIndex)});
|
||||
}
|
||||
|
||||
// 2.2 Define spatial loops
|
||||
|
@ -209,9 +213,8 @@ struct ONNXConvOpLowering : public ConversionPattern {
|
|||
/*subchannel=*/rewriter.getAffineSymbolExpr(0) +
|
||||
/*c=*/rewriter.getAffineDimExpr(1));
|
||||
channelDepth = rewriter.create<AffineApplyOp>(loc, indexMap,
|
||||
ValueRange(
|
||||
ArrayRef<Value>{/*g=*/outerLoops.getInductionVar(gIndex),
|
||||
/*c=*/channelDepth, /*subchannel=*/subchannels}));
|
||||
ArrayRef<Value>{/*g=*/outerLoops.getInductionVar(gIndex),
|
||||
/*c=*/channelDepth, /*subchannel=*/subchannels});
|
||||
}
|
||||
dataIndices.emplace_back(channelDepth);
|
||||
// sX * rX + kX
|
||||
|
@ -231,8 +234,8 @@ struct ONNXConvOpLowering : public ConversionPattern {
|
|||
/*sX=*/rewriter.getAffineDimExpr(0) * /*rX=*/stride +
|
||||
/*kX=*/rewriter.getAffineDimExpr(1));
|
||||
Value outIV = rewriter.create<AffineApplyOp>(loc, indexMap,
|
||||
ValueRange(ArrayRef<Value>{spatialLoops.getInductionVar(i),
|
||||
innerLoops.getInductionVar(i + 1)}));
|
||||
ArrayRef<Value>{spatialLoops.getInductionVar(i),
|
||||
innerLoops.getInductionVar(i + 1)});
|
||||
dataIndices.emplace_back(outIV);
|
||||
}
|
||||
|
||||
|
|
|
@ -79,10 +79,10 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern {
|
|||
loopCIVs.emplace_back(rewriter.create<ConstantIndexOp>(loc, 0));
|
||||
}
|
||||
|
||||
auto scaleVal = rewriter.create<LoadOp>(loc, scale, loopCIVs);
|
||||
auto biasVal = rewriter.create<LoadOp>(loc, bias, loopCIVs);
|
||||
auto meanVal = rewriter.create<LoadOp>(loc, mean, loopCIVs);
|
||||
auto varianceVal = rewriter.create<LoadOp>(loc, variance, loopCIVs);
|
||||
auto scaleVal = rewriter.create<AffineLoadOp>(loc, scale, loopCIVs);
|
||||
auto biasVal = rewriter.create<AffineLoadOp>(loc, bias, loopCIVs);
|
||||
auto meanVal = rewriter.create<AffineLoadOp>(loc, mean, loopCIVs);
|
||||
auto varianceVal = rewriter.create<AffineLoadOp>(loc, variance, loopCIVs);
|
||||
|
||||
// Create a KrnlIterateOp along the other dimensions.
|
||||
SmallVector<int64_t, 4> axes;
|
||||
|
@ -118,7 +118,7 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern {
|
|||
loopIVs.emplace_back(args[0]);
|
||||
}
|
||||
|
||||
auto xVal = rewriter.create<LoadOp>(loc, operand, loopIVs);
|
||||
auto xVal = rewriter.create<AffineLoadOp>(loc, operand, loopIVs);
|
||||
// normalize
|
||||
auto dividend = rewriter.create<SubFOp>(loc, xVal, meanVal);
|
||||
auto adjustedVarianceVal =
|
||||
|
@ -129,7 +129,7 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern {
|
|||
auto scaleNormVal = rewriter.create<MulFOp>(loc, scaleVal, normVal);
|
||||
auto shiftScaleNormVal =
|
||||
rewriter.create<AddFOp>(loc, scaleNormVal, biasVal);
|
||||
rewriter.create<StoreOp>(loc, shiftScaleNormVal, alloc, loopIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, shiftScaleNormVal, alloc, loopIVs);
|
||||
|
||||
rewriter.replaceOp(op, alloc);
|
||||
|
||||
|
|
|
@ -100,7 +100,7 @@ void postProcessPoolingWindow<ONNXAveragePoolOp>(
|
|||
ArrayRef<Value> poolDimValues) {
|
||||
// AveragePool's result type is FloatType, so it's safe to use DivFOp, SubFOp.
|
||||
bool countIncludePad = getCountIncludePad<ONNXAveragePoolOp>(poolOp);
|
||||
Value numerator = rewriter.create<LoadOp>(loc, alloc, resultIndices);
|
||||
Value numerator = rewriter.create<AffineLoadOp>(loc, alloc, resultIndices);
|
||||
Value denominator;
|
||||
if (countIncludePad) {
|
||||
int64_t kernelSize = 1;
|
||||
|
@ -120,7 +120,7 @@ void postProcessPoolingWindow<ONNXAveragePoolOp>(
|
|||
|
||||
Value average = rewriter.create<DivFOp>(loc, numerator, denominator);
|
||||
|
||||
rewriter.create<StoreOp>(loc, average, alloc, resultIndices);
|
||||
rewriter.create<AffineStoreOp>(loc, average, alloc, resultIndices);
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -167,9 +167,7 @@ Value insertAllocAndDeallocForPooling(ConversionPatternRewriter &rewriter,
|
|||
dilations.empty() ? 1 : dilations[spatialIndex]));
|
||||
|
||||
// Apply the affine map.
|
||||
Value dimVal =
|
||||
rewriter.create<AffineApplyOp>(loc, dimMap, ValueRange(dimArgs));
|
||||
|
||||
Value dimVal = rewriter.create<AffineApplyOp>(loc, dimMap, dimArgs);
|
||||
allocOperands.emplace_back(dimVal);
|
||||
}
|
||||
}
|
||||
|
@ -346,7 +344,7 @@ struct ONNXPoolOpLowering : public ConversionPattern {
|
|||
outputIndices.emplace_back(outputLoops.getInductionVar(i));
|
||||
|
||||
// 2.1 Emit: output[n][c][ho][wo] = identity
|
||||
rewriter.create<StoreOp>(loc, identity, alloc, outputIndices);
|
||||
rewriter.create<AffineStoreOp>(loc, identity, alloc, outputIndices);
|
||||
|
||||
// 2.2 Emit affine maps which express the lower and upper bounds for the
|
||||
// pooling window's dimensions.
|
||||
|
@ -441,11 +439,11 @@ struct ONNXPoolOpLowering : public ConversionPattern {
|
|||
{ // Construct poolStartValues and poolDimValues.
|
||||
for (int i = 0; i < kernelShape.size(); ++i) {
|
||||
Value startIndex = rewriter.create<AffineMaxOp>(
|
||||
loc, poolStartMap, ValueRange(IVsAndConstants[i]));
|
||||
loc, poolStartMap, IVsAndConstants[i]);
|
||||
poolStartValues.emplace_back(startIndex);
|
||||
|
||||
Value endIndex = rewriter.create<AffineMinOp>(
|
||||
loc, poolEndMap, ValueRange(IVsAndConstants[i]));
|
||||
Value endIndex =
|
||||
rewriter.create<AffineMinOp>(loc, poolEndMap, IVsAndConstants[i]);
|
||||
|
||||
Value dim = rewriter.create<SubIOp>(loc, endIndex, startIndex);
|
||||
if (isDilated) {
|
||||
|
@ -514,10 +512,10 @@ struct ONNXPoolOpLowering : public ConversionPattern {
|
|||
Value loadInput =
|
||||
rewriter.create<LoadOp>(loc, inputOperand, inputIndices);
|
||||
Value loadPartialOutput =
|
||||
rewriter.create<LoadOp>(loc, alloc, outputIndices);
|
||||
rewriter.create<AffineLoadOp>(loc, alloc, outputIndices);
|
||||
Value output = emitScalarOpFor<PoolOp>(rewriter, loc, op,
|
||||
outputElementType, {loadPartialOutput, loadInput});
|
||||
rewriter.create<StoreOp>(loc, output, alloc, outputIndices);
|
||||
rewriter.create<AffineStoreOp>(loc, output, alloc, outputIndices);
|
||||
}
|
||||
|
||||
// 2.5 Post-processing for the pooling window, e.g. taking average.
|
||||
|
|
|
@ -222,13 +222,15 @@ LstmState allocAndInitializeStates<ONNXLSTMOp, LstmState>(
|
|||
|
||||
Value hiddenVal = zero;
|
||||
if (!isNoneType(operandAdaptor.initial_h()))
|
||||
hiddenVal = rewriter.create<LoadOp>(loc, operandAdaptor.initial_h(), IVs);
|
||||
rewriter.create<StoreOp>(loc, hiddenVal, state.ht, IVs);
|
||||
hiddenVal =
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.initial_h(), IVs);
|
||||
rewriter.create<AffineStoreOp>(loc, hiddenVal, state.ht, IVs);
|
||||
|
||||
Value cellVal = zero;
|
||||
if (!isNoneType(operandAdaptor.initial_c()))
|
||||
cellVal = rewriter.create<LoadOp>(loc, operandAdaptor.initial_c(), IVs);
|
||||
rewriter.create<StoreOp>(loc, cellVal, state.ct, IVs);
|
||||
cellVal =
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.initial_c(), IVs);
|
||||
rewriter.create<AffineStoreOp>(loc, cellVal, state.ct, IVs);
|
||||
}
|
||||
rewriter.restoreInsertionPoint(ipInitializationLoops);
|
||||
return state;
|
||||
|
@ -320,8 +322,8 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
|
|||
for (unsigned i = 0; i < 4; ++i) {
|
||||
Value wHiddenIV =
|
||||
rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
|
||||
ValueRange(std::vector<Value>{/*iv=*/hiddenIV,
|
||||
/*index=*/constantIndices[i], /*size=*/hiddenDimVal}));
|
||||
std::vector<Value>{/*iv=*/hiddenIV,
|
||||
/*index=*/constantIndices[i], /*size=*/hiddenDimVal});
|
||||
wbIOFCIVs.emplace_back(SmallVector<Value, 2>{directionIV, wHiddenIV});
|
||||
}
|
||||
// Rb[iofc]
|
||||
|
@ -329,8 +331,8 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
|
|||
SmallVector<Value, 4> rbIVs;
|
||||
Value rHiddenIV =
|
||||
rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
|
||||
ValueRange(std::vector<Value>{/*iv=*/hiddenIV,
|
||||
/*index=*/constantIndices[i], /*size=*/hiddenDimVal}));
|
||||
std::vector<Value>{/*iv=*/hiddenIV,
|
||||
/*index=*/constantIndices[i], /*size=*/hiddenDimVal});
|
||||
rbIOFCIVs.emplace_back(SmallVector<Value, 2>{directionIV, rHiddenIV});
|
||||
}
|
||||
}
|
||||
|
@ -339,17 +341,16 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
|
|||
if (hasPeepholes) {
|
||||
for (unsigned i = 0; i < 3; ++i) {
|
||||
SmallVector<Value, 4> pIVs;
|
||||
Value pHiddenIV =
|
||||
rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
|
||||
ValueRange(std::vector<Value>{
|
||||
hiddenIV, constantIndices[i], hiddenDimVal}));
|
||||
Value pHiddenIV = rewriter.create<AffineApplyOp>(loc,
|
||||
accessByOffsetMap,
|
||||
std::vector<Value>{hiddenIV, constantIndices[i], hiddenDimVal});
|
||||
pIOFIVs.emplace_back(SmallVector<Value, 2>{directionIV, pHiddenIV});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Value loadH = rewriter.create<LoadOp>(loc, state.ht, hIVs);
|
||||
Value loadC = rewriter.create<LoadOp>(loc, state.ct, cIVs);
|
||||
Value loadH = rewriter.create<AffineLoadOp>(loc, state.ht, hIVs);
|
||||
Value loadC = rewriter.create<AffineLoadOp>(loc, state.ct, cIVs);
|
||||
|
||||
// Emit instructions for matrix multiplications:
|
||||
// Xt*(Wi^T), Xt*(Wo^T), Xt*(Wf^t), Xt*(Wc^T)
|
||||
|
@ -361,9 +362,9 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
|
|||
MemRefType scalarMemRefType = MemRefType::get({}, elementType, {}, 0);
|
||||
for (unsigned i = 0; i < 4; ++i) {
|
||||
Value xwAlloc = rewriter.create<AllocOp>(loc, scalarMemRefType);
|
||||
rewriter.create<StoreOp>(loc, zero, xwAlloc);
|
||||
rewriter.create<AffineStoreOp>(loc, zero, xwAlloc, ArrayRef<Value>{});
|
||||
Value hrAlloc = rewriter.create<AllocOp>(loc, scalarMemRefType);
|
||||
rewriter.create<StoreOp>(loc, zero, hrAlloc);
|
||||
rewriter.create<AffineStoreOp>(loc, zero, hrAlloc, ArrayRef<Value>{});
|
||||
xwIOFC.emplace_back(xwAlloc);
|
||||
hrIOFC.emplace_back(hrAlloc);
|
||||
}
|
||||
|
@ -390,10 +391,9 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
|
|||
// R[iofc] :: [num_directions, 4*hidden_size, input_size]
|
||||
for (unsigned i = 0; i < 4; ++i) {
|
||||
SmallVector<Value, 4> wIVs, rIVs;
|
||||
Value wHiddenIV =
|
||||
rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
|
||||
ValueRange(std::vector<Value>{
|
||||
hiddenIV, constantIndices[i], hiddenDimVal}));
|
||||
Value wHiddenIV = rewriter.create<AffineApplyOp>(loc,
|
||||
accessByOffsetMap,
|
||||
std::vector<Value>{hiddenIV, constantIndices[i], hiddenDimVal});
|
||||
|
||||
wIVs = {directionIV, wHiddenIV, reductionIV};
|
||||
wIOFCIVs.emplace_back(wIVs);
|
||||
|
@ -402,77 +402,80 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
|
|||
rIOFCIVs.emplace_back(rIVs);
|
||||
}
|
||||
|
||||
Value loadX = rewriter.create<LoadOp>(loc, operandAdaptor.X(), xIVs);
|
||||
Value loadX =
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.X(), xIVs);
|
||||
for (unsigned i = 0; i < 4; ++i) {
|
||||
// Xt * Wiofc
|
||||
Value loadW =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.W(), wIOFCIVs[i]);
|
||||
Value loadW = rewriter.create<AffineLoadOp>(
|
||||
loc, operandAdaptor.W(), wIOFCIVs[i]);
|
||||
Value xwVal = rewriter.create<MulFOp>(loc, loadX, loadW);
|
||||
Value loadXW = rewriter.create<LoadOp>(loc, xwIOFC[i]);
|
||||
Value loadXW = rewriter.create<AffineLoadOp>(loc, xwIOFC[i]);
|
||||
Value nextXW = rewriter.create<AddFOp>(loc, loadXW, xwVal);
|
||||
rewriter.create<StoreOp>(loc, nextXW, xwIOFC[i]);
|
||||
rewriter.create<AffineStoreOp>(
|
||||
loc, nextXW, xwIOFC[i], ArrayRef<Value>{});
|
||||
// Ht-1 * Riofc
|
||||
Value loadR =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.R(), rIOFCIVs[i]);
|
||||
Value loadR = rewriter.create<AffineLoadOp>(
|
||||
loc, operandAdaptor.R(), rIOFCIVs[i]);
|
||||
Value hrVal = rewriter.create<MulFOp>(loc, loadH, loadR);
|
||||
Value loadHR = rewriter.create<LoadOp>(loc, hrIOFC[i]);
|
||||
Value loadHR = rewriter.create<AffineLoadOp>(loc, hrIOFC[i]);
|
||||
Value nextHR = rewriter.create<AddFOp>(loc, loadHR, hrVal);
|
||||
rewriter.create<StoreOp>(loc, nextHR, hrIOFC[i]);
|
||||
rewriter.create<AffineStoreOp>(
|
||||
loc, nextHR, hrIOFC[i], ArrayRef<Value>{});
|
||||
}
|
||||
}
|
||||
rewriter.restoreInsertionPoint(ipReductionLoops);
|
||||
}
|
||||
|
||||
// it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi)
|
||||
Value loadXWI = rewriter.create<LoadOp>(loc, xwIOFC[0]);
|
||||
Value loadHRI = rewriter.create<LoadOp>(loc, hrIOFC[0]);
|
||||
Value loadXWI = rewriter.create<AffineLoadOp>(loc, xwIOFC[0]);
|
||||
Value loadHRI = rewriter.create<AffineLoadOp>(loc, hrIOFC[0]);
|
||||
Value it = rewriter.create<AddFOp>(loc, loadXWI, loadHRI);
|
||||
if (hasPeepholes) {
|
||||
Value loadP =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.P(), pIOFIVs[0]);
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.P(), pIOFIVs[0]);
|
||||
Value PC = rewriter.create<MulFOp>(loc, loadP, loadC);
|
||||
it = rewriter.create<AddFOp>(loc, it, PC);
|
||||
}
|
||||
if (hasBiasForInput) {
|
||||
Value loadWB =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[0]);
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[0]);
|
||||
it = rewriter.create<AddFOp>(loc, it, loadWB);
|
||||
Value loadRB =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[0]);
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[0]);
|
||||
it = rewriter.create<AddFOp>(loc, it, loadRB);
|
||||
}
|
||||
it = applyActivation(rewriter, loc, activationPack.f, it);
|
||||
|
||||
// ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf)
|
||||
Value loadXWF = rewriter.create<LoadOp>(loc, xwIOFC[2]);
|
||||
Value loadHRF = rewriter.create<LoadOp>(loc, hrIOFC[2]);
|
||||
Value loadXWF = rewriter.create<AffineLoadOp>(loc, xwIOFC[2]);
|
||||
Value loadHRF = rewriter.create<AffineLoadOp>(loc, hrIOFC[2]);
|
||||
Value ft = rewriter.create<AddFOp>(loc, loadXWF, loadHRF);
|
||||
if (hasPeepholes) {
|
||||
Value loadP =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.P(), pIOFIVs[2]);
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.P(), pIOFIVs[2]);
|
||||
Value PC = rewriter.create<MulFOp>(loc, loadP, loadC);
|
||||
ft = rewriter.create<AddFOp>(loc, ft, PC);
|
||||
}
|
||||
if (hasBiasForInput) {
|
||||
Value loadWB =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[2]);
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[2]);
|
||||
ft = rewriter.create<AddFOp>(loc, ft, loadWB);
|
||||
Value loadRB =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[2]);
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[2]);
|
||||
ft = rewriter.create<AddFOp>(loc, ft, loadRB);
|
||||
}
|
||||
ft = applyActivation(rewriter, loc, activationPack.f, ft);
|
||||
|
||||
// ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc)
|
||||
Value loadXWC = rewriter.create<LoadOp>(loc, xwIOFC[3]);
|
||||
Value loadHRC = rewriter.create<LoadOp>(loc, hrIOFC[3]);
|
||||
Value loadXWC = rewriter.create<AffineLoadOp>(loc, xwIOFC[3]);
|
||||
Value loadHRC = rewriter.create<AffineLoadOp>(loc, hrIOFC[3]);
|
||||
Value ct = rewriter.create<AddFOp>(loc, loadXWC, loadHRC);
|
||||
if (hasBiasForInput) {
|
||||
Value loadWB =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[3]);
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[3]);
|
||||
ct = rewriter.create<AddFOp>(loc, ct, loadWB);
|
||||
Value loadRB =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[3]);
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[3]);
|
||||
ct = rewriter.create<AddFOp>(loc, ct, loadRB);
|
||||
}
|
||||
ct = applyActivation(rewriter, loc, activationPack.g, ct);
|
||||
|
@ -481,24 +484,24 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
|
|||
Value FtCt1 = rewriter.create<MulFOp>(loc, ft, loadC);
|
||||
Value itct = rewriter.create<MulFOp>(loc, it, ct);
|
||||
Value Ct = rewriter.create<AddFOp>(loc, FtCt1, itct);
|
||||
rewriter.create<StoreOp>(loc, Ct, state.ct, cIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, Ct, state.ct, cIVs);
|
||||
|
||||
// ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo)
|
||||
Value loadXWO = rewriter.create<LoadOp>(loc, xwIOFC[1]);
|
||||
Value loadHRO = rewriter.create<LoadOp>(loc, hrIOFC[1]);
|
||||
Value loadXWO = rewriter.create<AffineLoadOp>(loc, xwIOFC[1]);
|
||||
Value loadHRO = rewriter.create<AffineLoadOp>(loc, hrIOFC[1]);
|
||||
Value ot = rewriter.create<AddFOp>(loc, loadXWO, loadHRO);
|
||||
if (hasPeepholes) {
|
||||
Value loadP =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.P(), pIOFIVs[1]);
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.P(), pIOFIVs[1]);
|
||||
Value PC = rewriter.create<MulFOp>(loc, loadP, Ct);
|
||||
ot = rewriter.create<AddFOp>(loc, ot, PC);
|
||||
}
|
||||
if (hasBiasForInput) {
|
||||
Value loadWB =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[1]);
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[1]);
|
||||
ot = rewriter.create<AddFOp>(loc, ot, loadWB);
|
||||
Value loadRB =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[1]);
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[1]);
|
||||
ot = rewriter.create<AddFOp>(loc, ot, loadRB);
|
||||
}
|
||||
ot = applyActivation(rewriter, loc, activationPack.f, ot);
|
||||
|
@ -506,12 +509,12 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
|
|||
// Ht = ot (.) h(Ct)
|
||||
Value hCt = applyActivation(rewriter, loc, activationPack.h, Ct);
|
||||
Value Ht = rewriter.create<MulFOp>(loc, ot, hCt);
|
||||
rewriter.create<StoreOp>(loc, Ht, state.ht, hIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, Ht, state.ht, hIVs);
|
||||
|
||||
// Store the current Ht if required.
|
||||
if (!isNoneType(state.allH)) {
|
||||
SmallVector<Value, 4> allHIVs{sequenceIV, directionIV, batchIV, hiddenIV};
|
||||
rewriter.create<StoreOp>(loc, Ht, state.allH, allHIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, Ht, state.allH, allHIVs);
|
||||
}
|
||||
|
||||
// Deallocate the temporary results of matrix multiplications.
|
||||
|
|
|
@ -28,7 +28,7 @@ Value applyActivation(ConversionPatternRewriter &rewriter, Location loc,
|
|||
MemRefType scalarMemRefType =
|
||||
MemRefType::get({}, scalarOperand.getType(), {}, 0);
|
||||
Value alloc = rewriter.create<AllocOp>(loc, scalarMemRefType);
|
||||
rewriter.create<StoreOp>(loc, scalarOperand, alloc);
|
||||
rewriter.create<AffineStoreOp>(loc, scalarOperand, alloc, ArrayRef<Value>{});
|
||||
|
||||
std::vector<mlir::NamedAttribute> attributes;
|
||||
if (activation.alpha) {
|
||||
|
@ -68,6 +68,6 @@ Value applyActivation(ConversionPatternRewriter &rewriter, Location loc,
|
|||
else
|
||||
llvm_unreachable("Unsupported activation");
|
||||
|
||||
Value result = rewriter.create<LoadOp>(loc, res);
|
||||
Value result = rewriter.create<AffineLoadOp>(loc, res);
|
||||
return result;
|
||||
}
|
||||
|
|
|
@ -126,9 +126,9 @@ struct ONNXRNNOpLowering : public ConversionPattern {
|
|||
rewriter.getIndexType(), (direction == REVERSE) ? 0 : 1);
|
||||
Value reverseSequenceIV =
|
||||
rewriter.create<AffineApplyOp>(loc, reverseIVMap,
|
||||
ValueRange(std::vector<Value>{sequenceLoops.getInductionVar(0),
|
||||
std::vector<Value>{sequenceLoops.getInductionVar(0),
|
||||
emitConstantOp(rewriter, loc, rewriter.getIndexType(),
|
||||
sequenceDimSize)}));
|
||||
sequenceDimSize)});
|
||||
// Emit calculation for one RNN step.
|
||||
calculateState<RNNOp, S, A>(rewriter, loc, operandAdaptor, state,
|
||||
activationReverse, directionIV, reverseSequenceIV);
|
||||
|
|
|
@ -59,15 +59,18 @@ struct ONNXConcatOpLowering : public ConversionPattern {
|
|||
if (r != axis || writeOffset == 0) {
|
||||
writeIndices.emplace_back(inputLoops.getInductionVar(r));
|
||||
} else {
|
||||
auto indexWithOffset = rewriter.create<AddIOp>(loc,
|
||||
rewriter.create<ConstantIndexOp>(loc, writeOffset),
|
||||
inputLoops.getInductionVar(r));
|
||||
AffineMap indexWithOffsetMap =
|
||||
AffineMap::get(1, 0, rewriter.getAffineDimExpr(0) + writeOffset);
|
||||
Value indexWithOffset =
|
||||
rewriter.create<AffineApplyOp>(loc, indexWithOffsetMap,
|
||||
ArrayRef<Value>{inputLoops.getInductionVar(r)});
|
||||
writeIndices.emplace_back(indexWithOffset);
|
||||
}
|
||||
}
|
||||
// Insert copy.
|
||||
auto loadData = rewriter.create<LoadOp>(loc, operands[i], readIndices);
|
||||
rewriter.create<StoreOp>(loc, loadData, alloc, writeIndices);
|
||||
auto loadData =
|
||||
rewriter.create<AffineLoadOp>(loc, operands[i], readIndices);
|
||||
rewriter.create<AffineStoreOp>(loc, loadData, alloc, writeIndices);
|
||||
// Increment offset
|
||||
writeOffset += currShape[axis];
|
||||
}
|
||||
|
|
|
@ -88,16 +88,17 @@ struct ONNXPadOpLowering : public ConversionPattern {
|
|||
if (pads[i] == 0) {
|
||||
outLoopIVs.emplace_back(valueLoops.getInductionVar(i));
|
||||
} else {
|
||||
auto outIV = rewriter.create<AddIOp>(loc,
|
||||
rewriter.create<ConstantIndexOp>(loc, pads[i]),
|
||||
valueLoops.getInductionVar(i));
|
||||
AffineMap indexWithOffsetMap =
|
||||
AffineMap::get(1, 0, rewriter.getAffineDimExpr(0) + pads[i]);
|
||||
Value outIV = rewriter.create<AffineApplyOp>(loc, indexWithOffsetMap,
|
||||
ArrayRef<Value>{valueLoops.getInductionVar(i)});
|
||||
outLoopIVs.emplace_back(outIV);
|
||||
}
|
||||
}
|
||||
|
||||
auto originValue =
|
||||
rewriter.create<LoadOp>(loc, operandAdaptor.data(), inLoopIVs);
|
||||
rewriter.create<StoreOp>(loc, originValue, alloc, outLoopIVs);
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.data(), inLoopIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, originValue, alloc, outLoopIVs);
|
||||
rewriter.setInsertionPointToStart(padLoops.getIterateBlock());
|
||||
|
||||
SmallVector<Value, 4> outLoopIVs1;
|
||||
|
@ -105,7 +106,7 @@ struct ONNXPadOpLowering : public ConversionPattern {
|
|||
outLoopIVs1.emplace_back(padLoops.getInductionVar(i));
|
||||
|
||||
auto paddingValue = rewriter.create<ConstantOp>(loc, valueAttr);
|
||||
rewriter.create<StoreOp>(loc, paddingValue, alloc, outLoopIVs1);
|
||||
rewriter.create<AffineStoreOp>(loc, paddingValue, alloc, outLoopIVs1);
|
||||
|
||||
// Replace the original op with the generated code.
|
||||
rewriter.replaceOp(op, alloc);
|
||||
|
|
|
@ -77,15 +77,17 @@ struct ONNXPadConstantValuePadOpLowering : public ConversionPattern {
|
|||
if (pad_begin[i] == 0) {
|
||||
outLoopIVs.emplace_back(valueLoops.getInductionVar(i));
|
||||
} else {
|
||||
auto outIV = rewriter.create<AddIOp>(loc,
|
||||
rewriter.create<ConstantIndexOp>(loc, pad_begin[i]),
|
||||
valueLoops.getInductionVar(i));
|
||||
AffineMap indexWithOffsetMap =
|
||||
AffineMap::get(1, 0, rewriter.getAffineDimExpr(0) + pad_begin[i]);
|
||||
Value outIV = rewriter.create<AffineApplyOp>(loc, indexWithOffsetMap,
|
||||
ArrayRef<Value>{valueLoops.getInductionVar(i)});
|
||||
outLoopIVs.emplace_back(outIV);
|
||||
}
|
||||
}
|
||||
|
||||
auto inVal = rewriter.create<LoadOp>(loc, operandAdaptor.data(), inLoopIVs);
|
||||
rewriter.create<StoreOp>(loc, inVal, alloc, outLoopIVs);
|
||||
auto inVal =
|
||||
rewriter.create<AffineLoadOp>(loc, operandAdaptor.data(), inLoopIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, inVal, alloc, outLoopIVs);
|
||||
rewriter.setInsertionPointToStart(padLoops.getIterateBlock());
|
||||
|
||||
SmallVector<Value, 4> outLoopIVs1;
|
||||
|
@ -93,7 +95,7 @@ struct ONNXPadConstantValuePadOpLowering : public ConversionPattern {
|
|||
outLoopIVs1.emplace_back(padLoops.getInductionVar(i));
|
||||
|
||||
auto inVal1 = rewriter.create<ConstantOp>(loc, constantValAttr);
|
||||
rewriter.create<StoreOp>(loc, inVal1, alloc, outLoopIVs1);
|
||||
rewriter.create<AffineStoreOp>(loc, inVal1, alloc, outLoopIVs1);
|
||||
|
||||
// Replace the original op with the generated code.
|
||||
rewriter.replaceOp(op, alloc);
|
||||
|
|
|
@ -64,7 +64,8 @@ struct ONNXReshapeOpLowering : public ConversionPattern {
|
|||
for (int i = 0; i < memRefShape.size(); ++i) {
|
||||
Value index = emitConstantOp(rewriter, loc, rewriter.getIndexType(), i);
|
||||
// Load index from array of indices.
|
||||
Value loadedVal = rewriter.create<LoadOp>(loc, operands[1], index);
|
||||
Value loadedVal =
|
||||
rewriter.create<AffineLoadOp>(loc, operands[1], index);
|
||||
// If a dimension is zero, the actual dimension value is taken from the
|
||||
// input tensor.
|
||||
//
|
||||
|
|
|
@ -92,8 +92,9 @@ struct ONNXSplitOpLowering : public ConversionPattern {
|
|||
writeIndices.emplace_back(outputLoops.getInductionVar(r));
|
||||
}
|
||||
// Insert copy.
|
||||
auto loadData = rewriter.create<LoadOp>(loc, operands[0], readIndices);
|
||||
rewriter.create<StoreOp>(loc, loadData, allocs[i], writeIndices);
|
||||
auto loadData =
|
||||
rewriter.create<AffineLoadOp>(loc, operands[0], readIndices);
|
||||
rewriter.create<AffineStoreOp>(loc, loadData, allocs[i], writeIndices);
|
||||
}
|
||||
rewriter.replaceOp(op, allocs);
|
||||
return success();
|
||||
|
|
|
@ -80,8 +80,8 @@ struct ONNXTransposeOpLowering : public ConversionPattern {
|
|||
for (int i = 0; i < iterationBlock.getArguments().size(); ++i)
|
||||
outLoopIVs.emplace_back(iterationBlock.getArguments()[perm[i]]);
|
||||
|
||||
auto inVal = rewriter.create<LoadOp>(loc, data, inLoopIVs);
|
||||
rewriter.create<StoreOp>(loc, inVal, alloc, outLoopIVs);
|
||||
auto inVal = rewriter.create<AffineLoadOp>(loc, data, inLoopIVs);
|
||||
rewriter.create<AffineStoreOp>(loc, inVal, alloc, outLoopIVs);
|
||||
|
||||
rewriter.replaceOp(op, alloc);
|
||||
|
||||
|
|
|
@ -14,10 +14,10 @@ func @test_enable_memory_pool(%arg0: tensor<10x10xf32>) -> tensor<10x10xf32> {
|
|||
// CHECK: krnl.define_loops
|
||||
// CHECK: krnl.optimize_loops
|
||||
// CHECK: krnl.iterate
|
||||
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg1, %arg2] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg0[%arg1, %arg2] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg0[%arg1, %arg2] : memref<10x10xf32>
|
||||
// CHECK: [[ADDF1:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[ADDF1]], [[GETREF]][%arg1, %arg2] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[ADDF1]], [[GETREF]][%arg1, %arg2] : memref<10x10xf32>
|
||||
// CHECK: krnl.define_loops
|
||||
// CHECK: krnl.optimize_loops
|
||||
// CHECK: krnl.iterate
|
||||
|
@ -43,26 +43,26 @@ func @test_enable_memory_pool_2(%arg0: tensor<10x10xf32>, %arg1: tensor<10x20xf3
|
|||
// CHECK: krnl.define_loops
|
||||
// CHECK: krnl.optimize_loops
|
||||
// CHECK: krnl.iterate
|
||||
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[ADDF1:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[ADDF1]], [[GETREF1]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[ADDF1]], [[GETREF1]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: krnl.define_loops
|
||||
// CHECK: krnl.optimize_loops
|
||||
// CHECK: krnl.iterate
|
||||
// CHECK: [[LOAD3:%.+]] = load [[GETREF1]][%arg2, %arg4] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD4:%.+]] = load %arg1[%arg4, %arg3] : memref<10x20xf32>
|
||||
// CHECK: [[LOAD5:%.+]] = load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
|
||||
// CHECK: [[LOAD3:%.+]] = affine.load [[GETREF1]][%arg2, %arg4] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD4:%.+]] = affine.load %arg1[%arg4, %arg3] : memref<10x20xf32>
|
||||
// CHECK: [[LOAD5:%.+]] = affine.load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
|
||||
// CHECK: [[MULF1:%.+]] = mulf [[LOAD3]], [[LOAD4]] : f32
|
||||
// CHECK: [[ADDF2:%.+]] = addf [[LOAD5]], [[MULF1]] : f32
|
||||
// CHECK: store [[ADDF2]], [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
|
||||
// CHECK: affine.store [[ADDF2]], [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
|
||||
// CHECK: krnl.define_loops
|
||||
// CHECK: krnl.optimize_loops
|
||||
// CHECK: krnl.iterate
|
||||
// CHECK: [[LOAD6:%.+]] = load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
|
||||
// CHECK: [[LOAD7:%.+]] = load %arg1[%arg2, %arg3] : memref<10x20xf32>
|
||||
// CHECK: [[LOAD6:%.+]] = affine.load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
|
||||
// CHECK: [[LOAD7:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x20xf32>
|
||||
// CHECK: [[ADDF3:%.+]] = addf [[LOAD6]], [[LOAD7]] : f32
|
||||
// CHECK: store [[ADDF3]], [[RES]][%arg2, %arg3] : memref<10x20xf32>
|
||||
// CHECK: affine.store [[ADDF3]], [[RES]][%arg2, %arg3] : memref<10x20xf32>
|
||||
// CHECK: dealloc [[MEMPOOL1]] : memref<400xi8>
|
||||
// CHECK: dealloc [[MEMPOOL0]] : memref<800xi8>
|
||||
// CHECK: return [[RES]] : memref<10x20xf32>
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -16,10 +16,10 @@ func @test_add_add(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[ADDF:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[ADDF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[ADDF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Second Add
|
||||
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
|
||||
|
@ -27,10 +27,10 @@ func @test_add_add(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[ADDF:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[ADDF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[ADDF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Dealloc of first result.
|
||||
// CHECK: dealloc [[RES]] : memref<10x10xf32>
|
||||
|
@ -55,10 +55,10 @@ func @test_mul_mul(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[MULF:%.+]] = mulf [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[MULF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[MULF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Second Mul
|
||||
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
|
||||
|
@ -66,10 +66,10 @@ func @test_mul_mul(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[MULF:%.+]] = mulf [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[MULF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[MULF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Dealloc of first result.
|
||||
// CHECK: dealloc [[RES]] : memref<10x10xf32>
|
||||
|
@ -94,10 +94,10 @@ func @test_div_div(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[DIVF:%.+]] = divf [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[DIVF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[DIVF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Second Div
|
||||
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
|
||||
|
@ -105,10 +105,10 @@ func @test_div_div(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[DIVF:%.+]] = divf [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[DIVF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[DIVF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Dealloc of first result.
|
||||
// CHECK: dealloc [[RES]] : memref<10x10xf32>
|
||||
|
@ -133,10 +133,10 @@ func @test_sub_sub(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[SUBF:%.+]] = subf [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[SUBF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[SUBF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Second Sub
|
||||
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
|
||||
|
@ -144,10 +144,10 @@ func @test_sub_sub(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[SUBF:%.+]] = subf [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[SUBF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[SUBF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Dealloc of first result.
|
||||
// CHECK: dealloc [[RES]] : memref<10x10xf32>
|
||||
|
@ -172,10 +172,10 @@ func @test_and_and(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[AND:%.+]] = and [[LOAD1]], [[LOAD2]] : i1
|
||||
// CHECK: store [[AND]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: affine.store [[AND]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
|
||||
/// Second And
|
||||
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
|
||||
|
@ -183,10 +183,10 @@ func @test_and_and(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[AND:%.+]] = and [[LOAD1]], [[LOAD2]] : i1
|
||||
// CHECK: store [[AND]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: affine.store [[AND]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
|
||||
/// Dealloc of first result.
|
||||
// CHECK: dealloc [[RES]] : memref<10x10xi1>
|
||||
|
@ -211,10 +211,10 @@ func @test_or_or(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor<*
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[OR:%.+]] = or [[LOAD1]], [[LOAD2]] : i1
|
||||
// CHECK: store [[OR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: affine.store [[OR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
|
||||
/// Second Or
|
||||
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
|
||||
|
@ -222,10 +222,10 @@ func @test_or_or(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor<*
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[OR:%.+]] = or [[LOAD1]], [[LOAD2]] : i1
|
||||
// CHECK: store [[OR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: affine.store [[OR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
|
||||
/// Dealloc of first result.
|
||||
// CHECK: dealloc [[RES]] : memref<10x10xi1>
|
||||
|
@ -250,10 +250,10 @@ func @test_xor_xor(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[XOR:%.+]] = xor [[LOAD1]], [[LOAD2]] : i1
|
||||
// CHECK: store [[XOR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: affine.store [[XOR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
|
||||
/// Second Xor
|
||||
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
|
||||
|
@ -261,10 +261,10 @@ func @test_xor_xor(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: [[XOR:%.+]] = xor [[LOAD1]], [[LOAD2]] : i1
|
||||
// CHECK: store [[XOR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
// CHECK: affine.store [[XOR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
|
||||
|
||||
/// Dealloc of first result.
|
||||
// CHECK: dealloc [[RES]] : memref<10x10xi1>
|
||||
|
@ -585,10 +585,10 @@ func @test_sum_sum(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[ADD:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[ADD]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[ADD]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Second Sum
|
||||
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
|
||||
|
@ -596,10 +596,10 @@ func @test_sum_sum(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[ADD:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[ADD]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[ADD]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Dealloc of first result.
|
||||
// CHECK: dealloc [[RES]] : memref<10x10xf32>
|
||||
|
@ -624,11 +624,11 @@ func @test_max_max(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[MAX:%.+]] = cmpf "ogt", [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: [[RELU_RES:%.+]] = select [[MAX]], [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Second Max
|
||||
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
|
||||
|
@ -636,11 +636,11 @@ func @test_max_max(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[MAX:%.+]] = cmpf "ogt", [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: [[RELU_RES:%.+]] = select [[MAX]], [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Dealloc of first result.
|
||||
// CHECK: dealloc [[RES]] : memref<10x10xf32>
|
||||
|
@ -665,11 +665,11 @@ func @test_min_min(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[MIN:%.+]] = cmpf "olt", [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: [[RELU_RES:%.+]] = select [[MIN]], [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Second Min
|
||||
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
|
||||
|
@ -677,11 +677,11 @@ func @test_min_min(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
|
|||
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
|
||||
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
|
||||
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
|
||||
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: [[MIN:%.+]] = cmpf "olt", [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: [[RELU_RES:%.+]] = select [[MIN]], [[LOAD1]], [[LOAD2]] : f32
|
||||
// CHECK: store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
// CHECK: affine.store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
|
||||
|
||||
/// Dealloc of first result.
|
||||
// CHECK: dealloc [[RES]] : memref<10x10xf32>
|
||||
|
|
Loading…
Reference in New Issue