Replace std.load/std.store by affine.load/affine.store (#180)

* Move to a more recent LLVM commit ID (May 15)

* clang-format

* Bump cache version up

* Update readme

* Fix doc check

* Move to a newer commit id

* Update LoopToStandard -> SCFToStandard

* Change MLIRSideEffects to MLIRSideEffectInterfaces

* Add AffineScope trait to KrnlIterateOp

* [ElementWise] Load/Store op to AffineLoad/AffineStore op

* [Gemm, MatMul, Reduction, Softmax] Load/Store op to AffineLoad/AffineStore op

* [Concat] Load/Store op to AffineLoad/AffineStore op

* [Pad, PadConstantValuePad, Reshape, Transpose] Load/Store op to AffineLoad/AffineStore op

* [LSTM] Load/Store op to AffineLoad/AffineStore op

* [Conv, Norm, Pooling] Load/Store op to AffineLoad/AffineStore op

* Add affine-loop-fusion pass

* Use Load/Store for scalar

* Use Load/Store for scalar

* Fix lit tests

* Unknown dimensions for broadcasting ops

* Affine Load/Store for scalar memref

* clang-format

Co-authored-by: Gheorghe-Teodor Bercea <gt.bercea@gmail.com>
Co-authored-by: Tian Jin <tjingrant@gmail.com>
This commit is contained in:
Tung D. Le 2020-07-05 17:20:21 +09:00 committed by GitHub
parent 2c8f5701bd
commit 7e05f371de
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 494 additions and 469 deletions

View File

@ -619,20 +619,35 @@ struct ONNXElementwiseVariadicOpLowering : public ConversionPattern {
for (auto arg : iterationBlock.getArguments())
loopIVs.push_back(arg);
}
// Fold over operands for each of their scalar values
// Fold over operands for each of their scalar values.
Value accumulated, next;
auto accumulatedLoopIVs = getLoopIVsForBroadcasting(
// Obtain the first operand.
std::vector<Value> accumulatedLoopIVs = getLoopIVsForBroadcasting(
loc, rewriter, loopIVs, operands[0], broadcastedDimInfo[0]);
accumulated = rewriter.create<LoadOp>(loc, operands[0], accumulatedLoopIVs);
if (!hasAllConstantDimensions(memRefType))
// In case of unknown dimensions, use std.load since
// 'getLoopIVsForBroadcasting' does not support affine maps yet.
accumulated =
rewriter.create<LoadOp>(loc, operands[0], accumulatedLoopIVs);
else
accumulated =
rewriter.create<AffineLoadOp>(loc, operands[0], accumulatedLoopIVs);
// Iterate over the remaining operands.
for (unsigned i = 1; i < numArgs; i++) {
auto nextLoopIVs = getLoopIVsForBroadcasting(
std::vector<Value> nextLoopIVs = getLoopIVsForBroadcasting(
loc, rewriter, loopIVs, operands[i], broadcastedDimInfo[i]);
next = rewriter.create<LoadOp>(loc, operands[i], nextLoopIVs);
if (!hasAllConstantDimensions(memRefType))
// In case of unknown dimensions, use std.load since
// 'getLoopIVsForBroadcasting' does not support affine maps yet.
next = rewriter.create<LoadOp>(loc, operands[i], nextLoopIVs);
else
next = rewriter.create<AffineLoadOp>(loc, operands[i], nextLoopIVs);
accumulated = emitScalarOpFor<ElementwiseVariadicOp>(
rewriter, loc, op, memRefType.getElementType(), {accumulated, next});
}
// Store result in the resulting array.
rewriter.create<StoreOp>(loc, accumulated, alloc, loopIVs);
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopIVs);
rewriter.replaceOp(op, alloc);

View File

@ -156,23 +156,23 @@ struct ONNXGemmOpLowering : public ConversionPattern {
// Initialize the output of A*B
auto zero = emitConstantOp(rewriter, loc, memRefType.getElementType(), 0);
rewriter.create<StoreOp>(loc, zero, alloc, loopMNIVs);
rewriter.create<AffineStoreOp>(loc, zero, alloc, loopMNIVs);
// Compute A*B
auto matmulIterateOp = rewriter.create<KrnlIterateOp>(loc, reductionPack);
// Compute beta*C, and add up to alpha*A*B (unidirectional broadcasting)
auto loadedAB = rewriter.create<LoadOp>(loc, alloc, loopMNIVs);
auto loadedAB = rewriter.create<AffineLoadOp>(loc, alloc, loopMNIVs);
auto alphaAB = rewriter.create<MulFOp>(loc, alpha, loadedAB);
if (hasBias) {
auto loopCIVs = getLoopIVsForBroadcasting(
loc, rewriter, loopMNIVs, C, broadcastedDimInfo);
auto loadedC = rewriter.create<LoadOp>(loc, C, loopCIVs);
auto loadedC = rewriter.create<AffineLoadOp>(loc, C, loopCIVs);
auto betaC = rewriter.create<MulFOp>(loc, beta, loadedC);
auto Y = rewriter.create<AddFOp>(loc, alphaAB, betaC);
rewriter.create<StoreOp>(loc, Y, alloc, loopMNIVs);
rewriter.create<AffineStoreOp>(loc, Y, alloc, loopMNIVs);
} else {
rewriter.create<StoreOp>(loc, alphaAB, alloc, loopMNIVs);
rewriter.create<AffineStoreOp>(loc, alphaAB, alloc, loopMNIVs);
}
// Insert instructions to do matrix multiplication: A*B
@ -199,12 +199,12 @@ struct ONNXGemmOpLowering : public ConversionPattern {
}
// Matmul computation
auto loadedA = rewriter.create<LoadOp>(loc, A, loopAIVs);
auto loadedB = rewriter.create<LoadOp>(loc, B, loopBIVs);
auto loadedY = rewriter.create<LoadOp>(loc, alloc, loopMNIVs);
auto loadedA = rewriter.create<AffineLoadOp>(loc, A, loopAIVs);
auto loadedB = rewriter.create<AffineLoadOp>(loc, B, loopBIVs);
auto loadedY = rewriter.create<AffineLoadOp>(loc, alloc, loopMNIVs);
auto AB = rewriter.create<MulFOp>(loc, loadedA, loadedB);
auto accumulated = rewriter.create<AddFOp>(loc, loadedY, AB);
rewriter.create<StoreOp>(loc, accumulated, alloc, loopMNIVs);
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopMNIVs);
rewriter.replaceOp(op, alloc);

View File

@ -221,7 +221,7 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
}
// Fill the output with value 0.
rewriter.create<StoreOp>(loc, zero, alloc, loopBatchMNIVs);
rewriter.create<AffineStoreOp>(loc, zero, alloc, loopBatchMNIVs);
// Iterate along the reduction dimension.
// Use a value from A.
@ -265,17 +265,17 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
loopBatchKNIVs.emplace_back(loopMNIVs[0]);
}
// Matmul computation
auto loadedA = rewriter.create<LoadOp>(loc, A, loopBatchMKIVs);
auto loadedB = rewriter.create<LoadOp>(loc, B, loopBatchKNIVs);
auto loadedY = rewriter.create<LoadOp>(loc, alloc, loopBatchMNIVs);
auto loadedA = rewriter.create<AffineLoadOp>(loc, A, loopBatchMKIVs);
auto loadedB = rewriter.create<AffineLoadOp>(loc, B, loopBatchKNIVs);
auto loadedY = rewriter.create<AffineLoadOp>(loc, alloc, loopBatchMNIVs);
if (elementType.isa<IntegerType>()) {
auto AB = rewriter.create<MulIOp>(loc, loadedA, loadedB);
auto accumulated = rewriter.create<AddIOp>(loc, loadedY, AB);
rewriter.create<StoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
} else if (elementType.isa<FloatType>()) {
auto AB = rewriter.create<MulFOp>(loc, loadedA, loadedB);
auto accumulated = rewriter.create<AddFOp>(loc, loadedY, AB);
rewriter.create<StoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, loopBatchMNIVs);
}
} else if ((AShape.size() == 1) && (BShape.size() == 1)) {
// Case 3:
@ -283,7 +283,7 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
// Fill the output with value 0.
Value zeroIndex = rewriter.create<ConstantIndexOp>(loc, 0);
rewriter.create<StoreOp>(loc, zero, alloc, zeroIndex);
rewriter.create<AffineStoreOp>(loc, zero, alloc, zeroIndex);
// Iterate along the reduction dimension.
// Use a value from A.
@ -310,17 +310,17 @@ struct ONNXMatMulOpLowering : public ConversionPattern {
loopKIVs.emplace_back(reduceIterationBlock.getArgument(0));
// Matmul computation
auto loadedA = rewriter.create<LoadOp>(loc, A, loopKIVs);
auto loadedB = rewriter.create<LoadOp>(loc, B, loopKIVs);
auto loadedY = rewriter.create<LoadOp>(loc, alloc, zeroIndex);
auto loadedA = rewriter.create<AffineLoadOp>(loc, A, loopKIVs);
auto loadedB = rewriter.create<AffineLoadOp>(loc, B, loopKIVs);
auto loadedY = rewriter.create<AffineLoadOp>(loc, alloc, zeroIndex);
if (elementType.isa<IntegerType>()) {
auto AB = rewriter.create<MulIOp>(loc, loadedA, loadedB);
auto accumulated = rewriter.create<AddIOp>(loc, loadedY, AB);
rewriter.create<StoreOp>(loc, accumulated, alloc, zeroIndex);
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, zeroIndex);
} else if (elementType.isa<FloatType>()) {
auto AB = rewriter.create<MulFOp>(loc, loadedA, loadedB);
auto accumulated = rewriter.create<AddFOp>(loc, loadedY, AB);
rewriter.create<StoreOp>(loc, accumulated, alloc, zeroIndex);
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, zeroIndex);
}
} else {
// No scalar matrix multiplication.

View File

@ -212,7 +212,7 @@ struct ONNXReductionOpLowering : public ConversionPattern {
Value identity =
getIdentityValue<ONNXReductionOp>(rewriter, loc, elementOutType);
rewriter.create<StoreOp>(loc, identity, alloc, loopIVs);
rewriter.create<AffineStoreOp>(loc, identity, alloc, loopIVs);
// Define a Krnl loop to do reduction.
rewriter.setInsertionPointAfter(iterateOpInit);
@ -256,11 +256,11 @@ struct ONNXReductionOpLowering : public ConversionPattern {
}
Value next, accumulated;
next = rewriter.create<LoadOp>(loc, operands[0], inLoopIVs);
accumulated = rewriter.create<LoadOp>(loc, alloc, outLoopIVs);
next = rewriter.create<AffineLoadOp>(loc, operands[0], inLoopIVs);
accumulated = rewriter.create<AffineLoadOp>(loc, alloc, outLoopIVs);
accumulated = emitScalarOpFor<ONNXReductionOp>(
rewriter, loc, op, memRefOutType.getElementType(), {accumulated, next});
rewriter.create<StoreOp>(loc, accumulated, alloc, outLoopIVs);
rewriter.create<AffineStoreOp>(loc, accumulated, alloc, outLoopIVs);
rewriter.replaceOp(op, alloc);
return success();

View File

@ -104,8 +104,9 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
outerLoopIVs.push_back(arg);
// Reset accumulators.
rewriter.create<StoreOp>(loc, zero, sumOp);
rewriter.create<StoreOp>(loc, negInfinity, maxOp);
rewriter.create<AffineStoreOp>(loc, zero, sumOp, ArrayRef<Value>{});
rewriter.create<AffineStoreOp>(
loc, negInfinity, maxOp, ArrayRef<Value>{});
// Create an inner loop to compute max.
maxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
@ -115,8 +116,9 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
softmaxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
} else {
// Reset accumulators.
rewriter.create<StoreOp>(loc, zero, sumOp);
rewriter.create<StoreOp>(loc, negInfinity, maxOp);
rewriter.create<AffineStoreOp>(loc, zero, sumOp, ArrayRef<Value>{});
rewriter.create<AffineStoreOp>(
loc, negInfinity, maxOp, ArrayRef<Value>{});
// Create an inner loop to compute max.
maxIterateOp = rewriter.create<KrnlIterateOp>(loc, innerPack);
@ -142,16 +144,16 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
maxLoopIVs.push_back(arg);
// Compute the max value.
Value max = rewriter.create<LoadOp>(loc, maxOp);
Value nextMax = rewriter.create<LoadOp>(loc, input, maxLoopIVs);
Value max = rewriter.create<AffineLoadOp>(loc, maxOp);
Value nextMax = rewriter.create<AffineLoadOp>(loc, input, maxLoopIVs);
auto maxCond =
rewriter.create<CmpFOp>(loc, CmpFPredicate::OGT, max, nextMax);
max = rewriter.create<SelectOp>(loc, maxCond, max, nextMax);
rewriter.create<StoreOp>(loc, max, maxOp);
rewriter.create<AffineStoreOp>(loc, max, maxOp, ArrayRef<Value>{});
// Get the max.
rewriter.setInsertionPoint(sumIterateOp);
max = rewriter.create<LoadOp>(loc, maxOp);
max = rewriter.create<AffineLoadOp>(loc, maxOp);
// Insert instructions inside the sum loop.
Block &sumIterationBlock = sumIterateOp.bodyRegion().front();
@ -165,18 +167,18 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
sumLoopIVs.push_back(arg);
// Sum up values.
Value sum = rewriter.create<LoadOp>(loc, sumOp);
Value next = rewriter.create<LoadOp>(loc, input, sumLoopIVs);
Value sum = rewriter.create<AffineLoadOp>(loc, sumOp);
Value next = rewriter.create<AffineLoadOp>(loc, input, sumLoopIVs);
Value sub = rewriter.create<SubFOp>(loc, next, max);
Value exp = rewriter.create<ExpOp>(loc, sub);
sum = rewriter.create<AddFOp>(loc, sum, exp);
rewriter.create<StoreOp>(loc, sum, sumOp);
rewriter.create<AffineStoreOp>(loc, sum, sumOp, ArrayRef<Value>{});
// Store intermediate values in the result to avoid recomputation.
rewriter.create<StoreOp>(loc, exp, alloc, sumLoopIVs);
rewriter.create<AffineStoreOp>(loc, exp, alloc, sumLoopIVs);
// Get the sum.
rewriter.setInsertionPoint(softmaxIterateOp);
sum = rewriter.create<LoadOp>(loc, sumOp);
sum = rewriter.create<AffineLoadOp>(loc, sumOp);
// Insert instructions inside the softmax loop.
Block &softmaxIterationBlock = softmaxIterateOp.bodyRegion().front();
@ -190,9 +192,10 @@ struct ONNXSoftmaxOpLowering : public ConversionPattern {
softmaxLoopIVs.push_back(arg);
// Compute softmax.
Value expLoadedVal = rewriter.create<LoadOp>(loc, alloc, softmaxLoopIVs);
Value expLoadedVal =
rewriter.create<AffineLoadOp>(loc, alloc, softmaxLoopIVs);
Value result = rewriter.create<DivFOp>(loc, expLoadedVal, sum);
rewriter.create<StoreOp>(loc, result, alloc, softmaxLoopIVs);
rewriter.create<AffineStoreOp>(loc, result, alloc, softmaxLoopIVs);
rewriter.replaceOp(op, alloc);

View File

@ -129,10 +129,14 @@ struct ONNXConvOpLowering : public ConversionPattern {
if (group > 1) {
// Middle loop is over groups and third loop is over the
// kernel identifiers in the current group.
auto kernelsOffset = rewriter.create<MulIOp>(
loc, outerLoops.getInductionVar(gIndex), kernelsPerGroupValue);
kernel = rewriter.create<AddIOp>(
loc, kernelsOffset, outerLoops.getInductionVar(mIndex));
AffineMap kernelMap = AffineMap::get(2, 1,
/*gIndex=*/rewriter.getAffineDimExpr(0) *
/*kernelsPerGroup=*/rewriter.getAffineSymbolExpr(0) +
/*mIndex=*/rewriter.getAffineDimExpr(1));
kernel = rewriter.create<AffineApplyOp>(loc, kernelMap,
ArrayRef<Value>{/*gIndex=*/outerLoops.getInductionVar(gIndex),
/*kernelsPerGroupValue=*/kernelsPerGroupValue,
/*mIndex=*/outerLoops.getInductionVar(mIndex)});
}
// 2.2 Define spatial loops
@ -209,9 +213,8 @@ struct ONNXConvOpLowering : public ConversionPattern {
/*subchannel=*/rewriter.getAffineSymbolExpr(0) +
/*c=*/rewriter.getAffineDimExpr(1));
channelDepth = rewriter.create<AffineApplyOp>(loc, indexMap,
ValueRange(
ArrayRef<Value>{/*g=*/outerLoops.getInductionVar(gIndex),
/*c=*/channelDepth, /*subchannel=*/subchannels}));
ArrayRef<Value>{/*g=*/outerLoops.getInductionVar(gIndex),
/*c=*/channelDepth, /*subchannel=*/subchannels});
}
dataIndices.emplace_back(channelDepth);
// sX * rX + kX
@ -231,8 +234,8 @@ struct ONNXConvOpLowering : public ConversionPattern {
/*sX=*/rewriter.getAffineDimExpr(0) * /*rX=*/stride +
/*kX=*/rewriter.getAffineDimExpr(1));
Value outIV = rewriter.create<AffineApplyOp>(loc, indexMap,
ValueRange(ArrayRef<Value>{spatialLoops.getInductionVar(i),
innerLoops.getInductionVar(i + 1)}));
ArrayRef<Value>{spatialLoops.getInductionVar(i),
innerLoops.getInductionVar(i + 1)});
dataIndices.emplace_back(outIV);
}

View File

@ -79,10 +79,10 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern {
loopCIVs.emplace_back(rewriter.create<ConstantIndexOp>(loc, 0));
}
auto scaleVal = rewriter.create<LoadOp>(loc, scale, loopCIVs);
auto biasVal = rewriter.create<LoadOp>(loc, bias, loopCIVs);
auto meanVal = rewriter.create<LoadOp>(loc, mean, loopCIVs);
auto varianceVal = rewriter.create<LoadOp>(loc, variance, loopCIVs);
auto scaleVal = rewriter.create<AffineLoadOp>(loc, scale, loopCIVs);
auto biasVal = rewriter.create<AffineLoadOp>(loc, bias, loopCIVs);
auto meanVal = rewriter.create<AffineLoadOp>(loc, mean, loopCIVs);
auto varianceVal = rewriter.create<AffineLoadOp>(loc, variance, loopCIVs);
// Create a KrnlIterateOp along the other dimensions.
SmallVector<int64_t, 4> axes;
@ -118,7 +118,7 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern {
loopIVs.emplace_back(args[0]);
}
auto xVal = rewriter.create<LoadOp>(loc, operand, loopIVs);
auto xVal = rewriter.create<AffineLoadOp>(loc, operand, loopIVs);
// normalize
auto dividend = rewriter.create<SubFOp>(loc, xVal, meanVal);
auto adjustedVarianceVal =
@ -129,7 +129,7 @@ struct ONNXBatchNormalizationTestModeOpLowering : public ConversionPattern {
auto scaleNormVal = rewriter.create<MulFOp>(loc, scaleVal, normVal);
auto shiftScaleNormVal =
rewriter.create<AddFOp>(loc, scaleNormVal, biasVal);
rewriter.create<StoreOp>(loc, shiftScaleNormVal, alloc, loopIVs);
rewriter.create<AffineStoreOp>(loc, shiftScaleNormVal, alloc, loopIVs);
rewriter.replaceOp(op, alloc);

View File

@ -100,7 +100,7 @@ void postProcessPoolingWindow<ONNXAveragePoolOp>(
ArrayRef<Value> poolDimValues) {
// AveragePool's result type is FloatType, so it's safe to use DivFOp, SubFOp.
bool countIncludePad = getCountIncludePad<ONNXAveragePoolOp>(poolOp);
Value numerator = rewriter.create<LoadOp>(loc, alloc, resultIndices);
Value numerator = rewriter.create<AffineLoadOp>(loc, alloc, resultIndices);
Value denominator;
if (countIncludePad) {
int64_t kernelSize = 1;
@ -120,7 +120,7 @@ void postProcessPoolingWindow<ONNXAveragePoolOp>(
Value average = rewriter.create<DivFOp>(loc, numerator, denominator);
rewriter.create<StoreOp>(loc, average, alloc, resultIndices);
rewriter.create<AffineStoreOp>(loc, average, alloc, resultIndices);
}
//===----------------------------------------------------------------------===//
@ -167,9 +167,7 @@ Value insertAllocAndDeallocForPooling(ConversionPatternRewriter &rewriter,
dilations.empty() ? 1 : dilations[spatialIndex]));
// Apply the affine map.
Value dimVal =
rewriter.create<AffineApplyOp>(loc, dimMap, ValueRange(dimArgs));
Value dimVal = rewriter.create<AffineApplyOp>(loc, dimMap, dimArgs);
allocOperands.emplace_back(dimVal);
}
}
@ -346,7 +344,7 @@ struct ONNXPoolOpLowering : public ConversionPattern {
outputIndices.emplace_back(outputLoops.getInductionVar(i));
// 2.1 Emit: output[n][c][ho][wo] = identity
rewriter.create<StoreOp>(loc, identity, alloc, outputIndices);
rewriter.create<AffineStoreOp>(loc, identity, alloc, outputIndices);
// 2.2 Emit affine maps which express the lower and upper bounds for the
// pooling window's dimensions.
@ -441,11 +439,11 @@ struct ONNXPoolOpLowering : public ConversionPattern {
{ // Construct poolStartValues and poolDimValues.
for (int i = 0; i < kernelShape.size(); ++i) {
Value startIndex = rewriter.create<AffineMaxOp>(
loc, poolStartMap, ValueRange(IVsAndConstants[i]));
loc, poolStartMap, IVsAndConstants[i]);
poolStartValues.emplace_back(startIndex);
Value endIndex = rewriter.create<AffineMinOp>(
loc, poolEndMap, ValueRange(IVsAndConstants[i]));
Value endIndex =
rewriter.create<AffineMinOp>(loc, poolEndMap, IVsAndConstants[i]);
Value dim = rewriter.create<SubIOp>(loc, endIndex, startIndex);
if (isDilated) {
@ -514,10 +512,10 @@ struct ONNXPoolOpLowering : public ConversionPattern {
Value loadInput =
rewriter.create<LoadOp>(loc, inputOperand, inputIndices);
Value loadPartialOutput =
rewriter.create<LoadOp>(loc, alloc, outputIndices);
rewriter.create<AffineLoadOp>(loc, alloc, outputIndices);
Value output = emitScalarOpFor<PoolOp>(rewriter, loc, op,
outputElementType, {loadPartialOutput, loadInput});
rewriter.create<StoreOp>(loc, output, alloc, outputIndices);
rewriter.create<AffineStoreOp>(loc, output, alloc, outputIndices);
}
// 2.5 Post-processing for the pooling window, e.g. taking average.

View File

@ -222,13 +222,15 @@ LstmState allocAndInitializeStates<ONNXLSTMOp, LstmState>(
Value hiddenVal = zero;
if (!isNoneType(operandAdaptor.initial_h()))
hiddenVal = rewriter.create<LoadOp>(loc, operandAdaptor.initial_h(), IVs);
rewriter.create<StoreOp>(loc, hiddenVal, state.ht, IVs);
hiddenVal =
rewriter.create<AffineLoadOp>(loc, operandAdaptor.initial_h(), IVs);
rewriter.create<AffineStoreOp>(loc, hiddenVal, state.ht, IVs);
Value cellVal = zero;
if (!isNoneType(operandAdaptor.initial_c()))
cellVal = rewriter.create<LoadOp>(loc, operandAdaptor.initial_c(), IVs);
rewriter.create<StoreOp>(loc, cellVal, state.ct, IVs);
cellVal =
rewriter.create<AffineLoadOp>(loc, operandAdaptor.initial_c(), IVs);
rewriter.create<AffineStoreOp>(loc, cellVal, state.ct, IVs);
}
rewriter.restoreInsertionPoint(ipInitializationLoops);
return state;
@ -320,8 +322,8 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
for (unsigned i = 0; i < 4; ++i) {
Value wHiddenIV =
rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
ValueRange(std::vector<Value>{/*iv=*/hiddenIV,
/*index=*/constantIndices[i], /*size=*/hiddenDimVal}));
std::vector<Value>{/*iv=*/hiddenIV,
/*index=*/constantIndices[i], /*size=*/hiddenDimVal});
wbIOFCIVs.emplace_back(SmallVector<Value, 2>{directionIV, wHiddenIV});
}
// Rb[iofc]
@ -329,8 +331,8 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
SmallVector<Value, 4> rbIVs;
Value rHiddenIV =
rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
ValueRange(std::vector<Value>{/*iv=*/hiddenIV,
/*index=*/constantIndices[i], /*size=*/hiddenDimVal}));
std::vector<Value>{/*iv=*/hiddenIV,
/*index=*/constantIndices[i], /*size=*/hiddenDimVal});
rbIOFCIVs.emplace_back(SmallVector<Value, 2>{directionIV, rHiddenIV});
}
}
@ -339,17 +341,16 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
if (hasPeepholes) {
for (unsigned i = 0; i < 3; ++i) {
SmallVector<Value, 4> pIVs;
Value pHiddenIV =
rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
ValueRange(std::vector<Value>{
hiddenIV, constantIndices[i], hiddenDimVal}));
Value pHiddenIV = rewriter.create<AffineApplyOp>(loc,
accessByOffsetMap,
std::vector<Value>{hiddenIV, constantIndices[i], hiddenDimVal});
pIOFIVs.emplace_back(SmallVector<Value, 2>{directionIV, pHiddenIV});
}
}
}
Value loadH = rewriter.create<LoadOp>(loc, state.ht, hIVs);
Value loadC = rewriter.create<LoadOp>(loc, state.ct, cIVs);
Value loadH = rewriter.create<AffineLoadOp>(loc, state.ht, hIVs);
Value loadC = rewriter.create<AffineLoadOp>(loc, state.ct, cIVs);
// Emit instructions for matrix multiplications:
// Xt*(Wi^T), Xt*(Wo^T), Xt*(Wf^t), Xt*(Wc^T)
@ -361,9 +362,9 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
MemRefType scalarMemRefType = MemRefType::get({}, elementType, {}, 0);
for (unsigned i = 0; i < 4; ++i) {
Value xwAlloc = rewriter.create<AllocOp>(loc, scalarMemRefType);
rewriter.create<StoreOp>(loc, zero, xwAlloc);
rewriter.create<AffineStoreOp>(loc, zero, xwAlloc, ArrayRef<Value>{});
Value hrAlloc = rewriter.create<AllocOp>(loc, scalarMemRefType);
rewriter.create<StoreOp>(loc, zero, hrAlloc);
rewriter.create<AffineStoreOp>(loc, zero, hrAlloc, ArrayRef<Value>{});
xwIOFC.emplace_back(xwAlloc);
hrIOFC.emplace_back(hrAlloc);
}
@ -390,10 +391,9 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
// R[iofc] :: [num_directions, 4*hidden_size, input_size]
for (unsigned i = 0; i < 4; ++i) {
SmallVector<Value, 4> wIVs, rIVs;
Value wHiddenIV =
rewriter.create<AffineApplyOp>(loc, accessByOffsetMap,
ValueRange(std::vector<Value>{
hiddenIV, constantIndices[i], hiddenDimVal}));
Value wHiddenIV = rewriter.create<AffineApplyOp>(loc,
accessByOffsetMap,
std::vector<Value>{hiddenIV, constantIndices[i], hiddenDimVal});
wIVs = {directionIV, wHiddenIV, reductionIV};
wIOFCIVs.emplace_back(wIVs);
@ -402,77 +402,80 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
rIOFCIVs.emplace_back(rIVs);
}
Value loadX = rewriter.create<LoadOp>(loc, operandAdaptor.X(), xIVs);
Value loadX =
rewriter.create<AffineLoadOp>(loc, operandAdaptor.X(), xIVs);
for (unsigned i = 0; i < 4; ++i) {
// Xt * Wiofc
Value loadW =
rewriter.create<LoadOp>(loc, operandAdaptor.W(), wIOFCIVs[i]);
Value loadW = rewriter.create<AffineLoadOp>(
loc, operandAdaptor.W(), wIOFCIVs[i]);
Value xwVal = rewriter.create<MulFOp>(loc, loadX, loadW);
Value loadXW = rewriter.create<LoadOp>(loc, xwIOFC[i]);
Value loadXW = rewriter.create<AffineLoadOp>(loc, xwIOFC[i]);
Value nextXW = rewriter.create<AddFOp>(loc, loadXW, xwVal);
rewriter.create<StoreOp>(loc, nextXW, xwIOFC[i]);
rewriter.create<AffineStoreOp>(
loc, nextXW, xwIOFC[i], ArrayRef<Value>{});
// Ht-1 * Riofc
Value loadR =
rewriter.create<LoadOp>(loc, operandAdaptor.R(), rIOFCIVs[i]);
Value loadR = rewriter.create<AffineLoadOp>(
loc, operandAdaptor.R(), rIOFCIVs[i]);
Value hrVal = rewriter.create<MulFOp>(loc, loadH, loadR);
Value loadHR = rewriter.create<LoadOp>(loc, hrIOFC[i]);
Value loadHR = rewriter.create<AffineLoadOp>(loc, hrIOFC[i]);
Value nextHR = rewriter.create<AddFOp>(loc, loadHR, hrVal);
rewriter.create<StoreOp>(loc, nextHR, hrIOFC[i]);
rewriter.create<AffineStoreOp>(
loc, nextHR, hrIOFC[i], ArrayRef<Value>{});
}
}
rewriter.restoreInsertionPoint(ipReductionLoops);
}
// it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi)
Value loadXWI = rewriter.create<LoadOp>(loc, xwIOFC[0]);
Value loadHRI = rewriter.create<LoadOp>(loc, hrIOFC[0]);
Value loadXWI = rewriter.create<AffineLoadOp>(loc, xwIOFC[0]);
Value loadHRI = rewriter.create<AffineLoadOp>(loc, hrIOFC[0]);
Value it = rewriter.create<AddFOp>(loc, loadXWI, loadHRI);
if (hasPeepholes) {
Value loadP =
rewriter.create<LoadOp>(loc, operandAdaptor.P(), pIOFIVs[0]);
rewriter.create<AffineLoadOp>(loc, operandAdaptor.P(), pIOFIVs[0]);
Value PC = rewriter.create<MulFOp>(loc, loadP, loadC);
it = rewriter.create<AddFOp>(loc, it, PC);
}
if (hasBiasForInput) {
Value loadWB =
rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[0]);
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[0]);
it = rewriter.create<AddFOp>(loc, it, loadWB);
Value loadRB =
rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[0]);
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[0]);
it = rewriter.create<AddFOp>(loc, it, loadRB);
}
it = applyActivation(rewriter, loc, activationPack.f, it);
// ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf)
Value loadXWF = rewriter.create<LoadOp>(loc, xwIOFC[2]);
Value loadHRF = rewriter.create<LoadOp>(loc, hrIOFC[2]);
Value loadXWF = rewriter.create<AffineLoadOp>(loc, xwIOFC[2]);
Value loadHRF = rewriter.create<AffineLoadOp>(loc, hrIOFC[2]);
Value ft = rewriter.create<AddFOp>(loc, loadXWF, loadHRF);
if (hasPeepholes) {
Value loadP =
rewriter.create<LoadOp>(loc, operandAdaptor.P(), pIOFIVs[2]);
rewriter.create<AffineLoadOp>(loc, operandAdaptor.P(), pIOFIVs[2]);
Value PC = rewriter.create<MulFOp>(loc, loadP, loadC);
ft = rewriter.create<AddFOp>(loc, ft, PC);
}
if (hasBiasForInput) {
Value loadWB =
rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[2]);
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[2]);
ft = rewriter.create<AddFOp>(loc, ft, loadWB);
Value loadRB =
rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[2]);
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[2]);
ft = rewriter.create<AddFOp>(loc, ft, loadRB);
}
ft = applyActivation(rewriter, loc, activationPack.f, ft);
// ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc)
Value loadXWC = rewriter.create<LoadOp>(loc, xwIOFC[3]);
Value loadHRC = rewriter.create<LoadOp>(loc, hrIOFC[3]);
Value loadXWC = rewriter.create<AffineLoadOp>(loc, xwIOFC[3]);
Value loadHRC = rewriter.create<AffineLoadOp>(loc, hrIOFC[3]);
Value ct = rewriter.create<AddFOp>(loc, loadXWC, loadHRC);
if (hasBiasForInput) {
Value loadWB =
rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[3]);
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[3]);
ct = rewriter.create<AddFOp>(loc, ct, loadWB);
Value loadRB =
rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[3]);
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[3]);
ct = rewriter.create<AddFOp>(loc, ct, loadRB);
}
ct = applyActivation(rewriter, loc, activationPack.g, ct);
@ -481,24 +484,24 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
Value FtCt1 = rewriter.create<MulFOp>(loc, ft, loadC);
Value itct = rewriter.create<MulFOp>(loc, it, ct);
Value Ct = rewriter.create<AddFOp>(loc, FtCt1, itct);
rewriter.create<StoreOp>(loc, Ct, state.ct, cIVs);
rewriter.create<AffineStoreOp>(loc, Ct, state.ct, cIVs);
// ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo)
Value loadXWO = rewriter.create<LoadOp>(loc, xwIOFC[1]);
Value loadHRO = rewriter.create<LoadOp>(loc, hrIOFC[1]);
Value loadXWO = rewriter.create<AffineLoadOp>(loc, xwIOFC[1]);
Value loadHRO = rewriter.create<AffineLoadOp>(loc, hrIOFC[1]);
Value ot = rewriter.create<AddFOp>(loc, loadXWO, loadHRO);
if (hasPeepholes) {
Value loadP =
rewriter.create<LoadOp>(loc, operandAdaptor.P(), pIOFIVs[1]);
rewriter.create<AffineLoadOp>(loc, operandAdaptor.P(), pIOFIVs[1]);
Value PC = rewriter.create<MulFOp>(loc, loadP, Ct);
ot = rewriter.create<AddFOp>(loc, ot, PC);
}
if (hasBiasForInput) {
Value loadWB =
rewriter.create<LoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[1]);
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), wbIOFCIVs[1]);
ot = rewriter.create<AddFOp>(loc, ot, loadWB);
Value loadRB =
rewriter.create<LoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[1]);
rewriter.create<AffineLoadOp>(loc, operandAdaptor.B(), rbIOFCIVs[1]);
ot = rewriter.create<AddFOp>(loc, ot, loadRB);
}
ot = applyActivation(rewriter, loc, activationPack.f, ot);
@ -506,12 +509,12 @@ void calculateState<ONNXLSTMOp, LstmState, LstmActivationPack>(
// Ht = ot (.) h(Ct)
Value hCt = applyActivation(rewriter, loc, activationPack.h, Ct);
Value Ht = rewriter.create<MulFOp>(loc, ot, hCt);
rewriter.create<StoreOp>(loc, Ht, state.ht, hIVs);
rewriter.create<AffineStoreOp>(loc, Ht, state.ht, hIVs);
// Store the current Ht if required.
if (!isNoneType(state.allH)) {
SmallVector<Value, 4> allHIVs{sequenceIV, directionIV, batchIV, hiddenIV};
rewriter.create<StoreOp>(loc, Ht, state.allH, allHIVs);
rewriter.create<AffineStoreOp>(loc, Ht, state.allH, allHIVs);
}
// Deallocate the temporary results of matrix multiplications.

View File

@ -28,7 +28,7 @@ Value applyActivation(ConversionPatternRewriter &rewriter, Location loc,
MemRefType scalarMemRefType =
MemRefType::get({}, scalarOperand.getType(), {}, 0);
Value alloc = rewriter.create<AllocOp>(loc, scalarMemRefType);
rewriter.create<StoreOp>(loc, scalarOperand, alloc);
rewriter.create<AffineStoreOp>(loc, scalarOperand, alloc, ArrayRef<Value>{});
std::vector<mlir::NamedAttribute> attributes;
if (activation.alpha) {
@ -68,6 +68,6 @@ Value applyActivation(ConversionPatternRewriter &rewriter, Location loc,
else
llvm_unreachable("Unsupported activation");
Value result = rewriter.create<LoadOp>(loc, res);
Value result = rewriter.create<AffineLoadOp>(loc, res);
return result;
}

View File

@ -126,9 +126,9 @@ struct ONNXRNNOpLowering : public ConversionPattern {
rewriter.getIndexType(), (direction == REVERSE) ? 0 : 1);
Value reverseSequenceIV =
rewriter.create<AffineApplyOp>(loc, reverseIVMap,
ValueRange(std::vector<Value>{sequenceLoops.getInductionVar(0),
std::vector<Value>{sequenceLoops.getInductionVar(0),
emitConstantOp(rewriter, loc, rewriter.getIndexType(),
sequenceDimSize)}));
sequenceDimSize)});
// Emit calculation for one RNN step.
calculateState<RNNOp, S, A>(rewriter, loc, operandAdaptor, state,
activationReverse, directionIV, reverseSequenceIV);

View File

@ -59,15 +59,18 @@ struct ONNXConcatOpLowering : public ConversionPattern {
if (r != axis || writeOffset == 0) {
writeIndices.emplace_back(inputLoops.getInductionVar(r));
} else {
auto indexWithOffset = rewriter.create<AddIOp>(loc,
rewriter.create<ConstantIndexOp>(loc, writeOffset),
inputLoops.getInductionVar(r));
AffineMap indexWithOffsetMap =
AffineMap::get(1, 0, rewriter.getAffineDimExpr(0) + writeOffset);
Value indexWithOffset =
rewriter.create<AffineApplyOp>(loc, indexWithOffsetMap,
ArrayRef<Value>{inputLoops.getInductionVar(r)});
writeIndices.emplace_back(indexWithOffset);
}
}
// Insert copy.
auto loadData = rewriter.create<LoadOp>(loc, operands[i], readIndices);
rewriter.create<StoreOp>(loc, loadData, alloc, writeIndices);
auto loadData =
rewriter.create<AffineLoadOp>(loc, operands[i], readIndices);
rewriter.create<AffineStoreOp>(loc, loadData, alloc, writeIndices);
// Increment offset
writeOffset += currShape[axis];
}

View File

@ -88,16 +88,17 @@ struct ONNXPadOpLowering : public ConversionPattern {
if (pads[i] == 0) {
outLoopIVs.emplace_back(valueLoops.getInductionVar(i));
} else {
auto outIV = rewriter.create<AddIOp>(loc,
rewriter.create<ConstantIndexOp>(loc, pads[i]),
valueLoops.getInductionVar(i));
AffineMap indexWithOffsetMap =
AffineMap::get(1, 0, rewriter.getAffineDimExpr(0) + pads[i]);
Value outIV = rewriter.create<AffineApplyOp>(loc, indexWithOffsetMap,
ArrayRef<Value>{valueLoops.getInductionVar(i)});
outLoopIVs.emplace_back(outIV);
}
}
auto originValue =
rewriter.create<LoadOp>(loc, operandAdaptor.data(), inLoopIVs);
rewriter.create<StoreOp>(loc, originValue, alloc, outLoopIVs);
rewriter.create<AffineLoadOp>(loc, operandAdaptor.data(), inLoopIVs);
rewriter.create<AffineStoreOp>(loc, originValue, alloc, outLoopIVs);
rewriter.setInsertionPointToStart(padLoops.getIterateBlock());
SmallVector<Value, 4> outLoopIVs1;
@ -105,7 +106,7 @@ struct ONNXPadOpLowering : public ConversionPattern {
outLoopIVs1.emplace_back(padLoops.getInductionVar(i));
auto paddingValue = rewriter.create<ConstantOp>(loc, valueAttr);
rewriter.create<StoreOp>(loc, paddingValue, alloc, outLoopIVs1);
rewriter.create<AffineStoreOp>(loc, paddingValue, alloc, outLoopIVs1);
// Replace the original op with the generated code.
rewriter.replaceOp(op, alloc);

View File

@ -77,15 +77,17 @@ struct ONNXPadConstantValuePadOpLowering : public ConversionPattern {
if (pad_begin[i] == 0) {
outLoopIVs.emplace_back(valueLoops.getInductionVar(i));
} else {
auto outIV = rewriter.create<AddIOp>(loc,
rewriter.create<ConstantIndexOp>(loc, pad_begin[i]),
valueLoops.getInductionVar(i));
AffineMap indexWithOffsetMap =
AffineMap::get(1, 0, rewriter.getAffineDimExpr(0) + pad_begin[i]);
Value outIV = rewriter.create<AffineApplyOp>(loc, indexWithOffsetMap,
ArrayRef<Value>{valueLoops.getInductionVar(i)});
outLoopIVs.emplace_back(outIV);
}
}
auto inVal = rewriter.create<LoadOp>(loc, operandAdaptor.data(), inLoopIVs);
rewriter.create<StoreOp>(loc, inVal, alloc, outLoopIVs);
auto inVal =
rewriter.create<AffineLoadOp>(loc, operandAdaptor.data(), inLoopIVs);
rewriter.create<AffineStoreOp>(loc, inVal, alloc, outLoopIVs);
rewriter.setInsertionPointToStart(padLoops.getIterateBlock());
SmallVector<Value, 4> outLoopIVs1;
@ -93,7 +95,7 @@ struct ONNXPadConstantValuePadOpLowering : public ConversionPattern {
outLoopIVs1.emplace_back(padLoops.getInductionVar(i));
auto inVal1 = rewriter.create<ConstantOp>(loc, constantValAttr);
rewriter.create<StoreOp>(loc, inVal1, alloc, outLoopIVs1);
rewriter.create<AffineStoreOp>(loc, inVal1, alloc, outLoopIVs1);
// Replace the original op with the generated code.
rewriter.replaceOp(op, alloc);

View File

@ -64,7 +64,8 @@ struct ONNXReshapeOpLowering : public ConversionPattern {
for (int i = 0; i < memRefShape.size(); ++i) {
Value index = emitConstantOp(rewriter, loc, rewriter.getIndexType(), i);
// Load index from array of indices.
Value loadedVal = rewriter.create<LoadOp>(loc, operands[1], index);
Value loadedVal =
rewriter.create<AffineLoadOp>(loc, operands[1], index);
// If a dimension is zero, the actual dimension value is taken from the
// input tensor.
//

View File

@ -92,8 +92,9 @@ struct ONNXSplitOpLowering : public ConversionPattern {
writeIndices.emplace_back(outputLoops.getInductionVar(r));
}
// Insert copy.
auto loadData = rewriter.create<LoadOp>(loc, operands[0], readIndices);
rewriter.create<StoreOp>(loc, loadData, allocs[i], writeIndices);
auto loadData =
rewriter.create<AffineLoadOp>(loc, operands[0], readIndices);
rewriter.create<AffineStoreOp>(loc, loadData, allocs[i], writeIndices);
}
rewriter.replaceOp(op, allocs);
return success();

View File

@ -80,8 +80,8 @@ struct ONNXTransposeOpLowering : public ConversionPattern {
for (int i = 0; i < iterationBlock.getArguments().size(); ++i)
outLoopIVs.emplace_back(iterationBlock.getArguments()[perm[i]]);
auto inVal = rewriter.create<LoadOp>(loc, data, inLoopIVs);
rewriter.create<StoreOp>(loc, inVal, alloc, outLoopIVs);
auto inVal = rewriter.create<AffineLoadOp>(loc, data, inLoopIVs);
rewriter.create<AffineStoreOp>(loc, inVal, alloc, outLoopIVs);
rewriter.replaceOp(op, alloc);

View File

@ -14,10 +14,10 @@ func @test_enable_memory_pool(%arg0: tensor<10x10xf32>) -> tensor<10x10xf32> {
// CHECK: krnl.define_loops
// CHECK: krnl.optimize_loops
// CHECK: krnl.iterate
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg1, %arg2] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg0[%arg1, %arg2] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg1, %arg2] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg0[%arg1, %arg2] : memref<10x10xf32>
// CHECK: [[ADDF1:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[ADDF1]], [[GETREF]][%arg1, %arg2] : memref<10x10xf32>
// CHECK: affine.store [[ADDF1]], [[GETREF]][%arg1, %arg2] : memref<10x10xf32>
// CHECK: krnl.define_loops
// CHECK: krnl.optimize_loops
// CHECK: krnl.iterate
@ -43,26 +43,26 @@ func @test_enable_memory_pool_2(%arg0: tensor<10x10xf32>, %arg1: tensor<10x20xf3
// CHECK: krnl.define_loops
// CHECK: krnl.optimize_loops
// CHECK: krnl.iterate
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[ADDF1:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[ADDF1]], [[GETREF1]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[ADDF1]], [[GETREF1]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: krnl.define_loops
// CHECK: krnl.optimize_loops
// CHECK: krnl.iterate
// CHECK: [[LOAD3:%.+]] = load [[GETREF1]][%arg2, %arg4] : memref<10x10xf32>
// CHECK: [[LOAD4:%.+]] = load %arg1[%arg4, %arg3] : memref<10x20xf32>
// CHECK: [[LOAD5:%.+]] = load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
// CHECK: [[LOAD3:%.+]] = affine.load [[GETREF1]][%arg2, %arg4] : memref<10x10xf32>
// CHECK: [[LOAD4:%.+]] = affine.load %arg1[%arg4, %arg3] : memref<10x20xf32>
// CHECK: [[LOAD5:%.+]] = affine.load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
// CHECK: [[MULF1:%.+]] = mulf [[LOAD3]], [[LOAD4]] : f32
// CHECK: [[ADDF2:%.+]] = addf [[LOAD5]], [[MULF1]] : f32
// CHECK: store [[ADDF2]], [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
// CHECK: affine.store [[ADDF2]], [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
// CHECK: krnl.define_loops
// CHECK: krnl.optimize_loops
// CHECK: krnl.iterate
// CHECK: [[LOAD6:%.+]] = load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
// CHECK: [[LOAD7:%.+]] = load %arg1[%arg2, %arg3] : memref<10x20xf32>
// CHECK: [[LOAD6:%.+]] = affine.load [[GETREF0]][%arg2, %arg3] : memref<10x20xf32>
// CHECK: [[LOAD7:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x20xf32>
// CHECK: [[ADDF3:%.+]] = addf [[LOAD6]], [[LOAD7]] : f32
// CHECK: store [[ADDF3]], [[RES]][%arg2, %arg3] : memref<10x20xf32>
// CHECK: affine.store [[ADDF3]], [[RES]][%arg2, %arg3] : memref<10x20xf32>
// CHECK: dealloc [[MEMPOOL1]] : memref<400xi8>
// CHECK: dealloc [[MEMPOOL0]] : memref<800xi8>
// CHECK: return [[RES]] : memref<10x20xf32>

File diff suppressed because it is too large Load Diff

View File

@ -16,10 +16,10 @@ func @test_add_add(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[ADDF:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[ADDF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[ADDF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
/// Second Add
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -27,10 +27,10 @@ func @test_add_add(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[ADDF:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[ADDF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[ADDF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
/// Dealloc of first result.
// CHECK: dealloc [[RES]] : memref<10x10xf32>
@ -55,10 +55,10 @@ func @test_mul_mul(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[MULF:%.+]] = mulf [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[MULF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[MULF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
/// Second Mul
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -66,10 +66,10 @@ func @test_mul_mul(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[MULF:%.+]] = mulf [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[MULF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[MULF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
/// Dealloc of first result.
// CHECK: dealloc [[RES]] : memref<10x10xf32>
@ -94,10 +94,10 @@ func @test_div_div(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[DIVF:%.+]] = divf [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[DIVF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[DIVF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
/// Second Div
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -105,10 +105,10 @@ func @test_div_div(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[DIVF:%.+]] = divf [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[DIVF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[DIVF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
/// Dealloc of first result.
// CHECK: dealloc [[RES]] : memref<10x10xf32>
@ -133,10 +133,10 @@ func @test_sub_sub(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[SUBF:%.+]] = subf [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[SUBF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[SUBF]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
/// Second Sub
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -144,10 +144,10 @@ func @test_sub_sub(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[SUBF:%.+]] = subf [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[SUBF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[SUBF]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
/// Dealloc of first result.
// CHECK: dealloc [[RES]] : memref<10x10xf32>
@ -172,10 +172,10 @@ func @test_and_and(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[AND:%.+]] = and [[LOAD1]], [[LOAD2]] : i1
// CHECK: store [[AND]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
// CHECK: affine.store [[AND]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
/// Second And
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -183,10 +183,10 @@ func @test_and_and(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[AND:%.+]] = and [[LOAD1]], [[LOAD2]] : i1
// CHECK: store [[AND]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
// CHECK: affine.store [[AND]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
/// Dealloc of first result.
// CHECK: dealloc [[RES]] : memref<10x10xi1>
@ -211,10 +211,10 @@ func @test_or_or(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor<*
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[OR:%.+]] = or [[LOAD1]], [[LOAD2]] : i1
// CHECK: store [[OR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
// CHECK: affine.store [[OR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
/// Second Or
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -222,10 +222,10 @@ func @test_or_or(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor<*
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[OR:%.+]] = or [[LOAD1]], [[LOAD2]] : i1
// CHECK: store [[OR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
// CHECK: affine.store [[OR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
/// Dealloc of first result.
// CHECK: dealloc [[RES]] : memref<10x10xi1>
@ -250,10 +250,10 @@ func @test_xor_xor(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[XOR:%.+]] = xor [[LOAD1]], [[LOAD2]] : i1
// CHECK: store [[XOR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
// CHECK: affine.store [[XOR]], [[RES]][%arg2, %arg3] : memref<10x10xi1>
/// Second Xor
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -261,10 +261,10 @@ func @test_xor_xor(%arg0 : tensor<10x10xi1>, %arg1 : tensor<10x10xi1>) -> tensor
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xi1>
// CHECK: [[XOR:%.+]] = xor [[LOAD1]], [[LOAD2]] : i1
// CHECK: store [[XOR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
// CHECK: affine.store [[XOR]], [[RET_RES]][%arg2, %arg3] : memref<10x10xi1>
/// Dealloc of first result.
// CHECK: dealloc [[RES]] : memref<10x10xi1>
@ -585,10 +585,10 @@ func @test_sum_sum(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[ADD:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[ADD]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[ADD]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
/// Second Sum
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -596,10 +596,10 @@ func @test_sum_sum(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[ADD:%.+]] = addf [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[ADD]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[ADD]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
/// Dealloc of first result.
// CHECK: dealloc [[RES]] : memref<10x10xf32>
@ -624,11 +624,11 @@ func @test_max_max(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[MAX:%.+]] = cmpf "ogt", [[LOAD1]], [[LOAD2]] : f32
// CHECK: [[RELU_RES:%.+]] = select [[MAX]], [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
/// Second Max
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -636,11 +636,11 @@ func @test_max_max(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[MAX:%.+]] = cmpf "ogt", [[LOAD1]], [[LOAD2]] : f32
// CHECK: [[RELU_RES:%.+]] = select [[MAX]], [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
/// Dealloc of first result.
// CHECK: dealloc [[RES]] : memref<10x10xf32>
@ -665,11 +665,11 @@ func @test_min_min(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load %arg0[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[MIN:%.+]] = cmpf "olt", [[LOAD1]], [[LOAD2]] : f32
// CHECK: [[RELU_RES:%.+]] = select [[MIN]], [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[RELU_RES]], [[RES]][%arg2, %arg3] : memref<10x10xf32>
/// Second Min
// CHECK: [[DEF_LOOPS:%.+]]:2 = krnl.define_loops 2
@ -677,11 +677,11 @@ func @test_min_min(%arg0 : tensor<10x10xf32>, %arg1 : tensor<10x10xf32>) -> tens
// CHECK: krnl.return_loops [[DEF_LOOPS]]#0, [[DEF_LOOPS]]#1
// CHECK: } : () -> (!krnl.loop, !krnl.loop)
// CHECK: krnl.iterate([[OPT_LOOPS]]#0, [[OPT_LOOPS]]#1) with ([[DEF_LOOPS]]#0 -> %arg2 = 0 to 10, [[DEF_LOOPS]]#1 -> %arg3 = 0 to 10) {
// CHECK: [[LOAD1:%.+]] = load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD1:%.+]] = affine.load [[RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[LOAD2:%.+]] = affine.load %arg1[%arg2, %arg3] : memref<10x10xf32>
// CHECK: [[MIN:%.+]] = cmpf "olt", [[LOAD1]], [[LOAD2]] : f32
// CHECK: [[RELU_RES:%.+]] = select [[MIN]], [[LOAD1]], [[LOAD2]] : f32
// CHECK: store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
// CHECK: affine.store [[RELU_RES]], [[RET_RES]][%arg2, %arg3] : memref<10x10xf32>
/// Dealloc of first result.
// CHECK: dealloc [[RES]] : memref<10x10xf32>