Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 23 additions & 23 deletions src/gpu/intel/jit/gemm/generator/pieces/gemm_setup.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,11 @@ void BLASKernelGenerator<hw>::gemmCheck32(const GEMMProblem &problem, GEMMStrate
auto temp3 = temp2GRF.ud(4);
auto flag = state.raVFlag.alloc();

auto mulHigh = [&](Subregister dst, Subregister src0, Subregister src1) {
auto mulHigh = [&](Subregister dst, Subregister src0, Subregister src1, SourceLocation loc = {}) {
if (emulate)
emul32High(1, dst.ud(), src0, src1);
emul32High(1, dst.ud(), src0, src1, loc);
else
mul(1, dst, src0, src1);
mul(1, dst, src0, src1, loc);
Copy link
Contributor

@petercad petercad Apr 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we should be thinking about a more general mechanism for "scoping" location information in this way, instead of passing around lots of SourceLocation objects. Something like this:

auto mulHigh = [&](..., SourceLocationScoper scoper = {this}) {
// rest of code as before
};

And then SourceLocationScoper is defined something like:

template <HW hw>
class SourceLocationScoper {
    SourceLocation loc;
    BinaryCodeGenerator *generator = nullptr;
public:
    explicit SourceLocationScoper(ngen::BinaryCodeGenerator<hw> *g)
                : loc{std::source_location::current()}, generator(g) {
        g->enterLocationScope(loc);
    }
    ~SourceLocationScoper() {
        generator->exitLocationScope();
    }
}

The new generator methods {enter,exit}LocationScope would set/clear a location override (new variable in the generator class). You'd have a counter as well to properly support nested scopes (only the outermost is honored).

You could also use this method internally inside nGEN (e.g. pseudo-instructions, etc.) to avoid lots of passing around of loc objects.

Copy link
Contributor Author

@rjoursler rjoursler Apr 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't quite see what the benefit is here. Is this just supposed to be a performance optimization? For release builds, Source Location will be an empty object, so I expect it would mostly be optimized away anyway. If the goal is to just avoid forwarding loc in the source code, I don't see much benefit as forwarding the same location for a "large" operation seems misguided anyway.

In general, I agree we could use some improvement here, I just haven't been able to come up with a good mechanism. The core problem is that we have multiple source locations that we could reasonably map to each instruction, so I don't see a general mechanism we could use to pick the "right" location as it depends on the use and what is normally being debugged.

Copy link
Contributor

@petercad petercad Apr 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's for better readability/easier programmability, rather than performance. The idea is that you only need to change one line to combine location information:

auto mulHigh = [&](..., SourceLocationScoper scoper = {this}) {
      emul32High(...);    // don't have to pass loc here
      mul(...);           // don't have to pass loc here
};

instead of modifying every single instruction:

auto mulHigh = [&](..., SourceLocation loc = {}) {
      emul32High(..., loc);    // have to pass loc here
      mul(..., loc);           // have to pass loc here
};

When there are a lot of nested instructions, it's easy to miss a loc and just a lot of code changes to make. Since this pattern is appearing in lots of places, it'd be nice to simplify.

};

if (checkA) {
Expand Down Expand Up @@ -405,21 +405,21 @@ void BLASKernelGenerator<hw>::gemmOffsetABC(bool initial, Subregister i0, Subreg
// B += j0 * ldb (N, Pr) j0 (T)
// C += i0 + j0 * ldc (N, Pr) j0 + i0 * ldc (T, Pc)
// CO += i0 (row offsets) j0 (col offsets)
auto doAOffset = [&](Subregister offsetAx, Subregister i0x) {
auto doAOffset = [&](Subregister offsetAx, Subregister i0x, SourceLocation loc = {}) {
if (problem.A.layout == MatrixLayout::Nontranspose)
eaddScaled(1, offsetAx, offsetAx, i0x, Ta_ext, strategy, state);
eaddScaled(1, offsetAx, offsetAx, i0x, Ta_ext, strategy, state, loc);
else {
emul(1, tempQ1, i0x, state.inputs.lda, strategy, state);
eadd(1, offsetAx, offsetAx, tempQ1.reinterpret(0, offsetAx.getType()), strategy, state);
emul(1, tempQ1, i0x, state.inputs.lda, strategy, state, loc);
eadd(1, offsetAx, offsetAx, tempQ1.reinterpret(0, offsetAx.getType()), strategy, state, loc);
}
};

auto doBOffset = [&](Subregister offsetBx, Subregister j0x) {
auto doBOffset = [&](Subregister offsetBx, Subregister j0x, SourceLocation loc = {}) {
if (problem.B.layout == MatrixLayout::Transpose)
eaddScaled(1, offsetBx, offsetBx, j0x, Tb_ext, strategy, state);
eaddScaled(1, offsetBx, offsetBx, j0x, Tb_ext, strategy, state, loc);
else {
emul(1, tempQ0, j0x, state.inputs.ldb, strategy, state);
eadd(1, offsetBx, offsetBx, tempQ0.reinterpret(0, offsetBx.getType()), strategy, state);
emul(1, tempQ0, j0x, state.inputs.ldb, strategy, state, loc);
eadd(1, offsetBx, offsetBx, tempQ0.reinterpret(0, offsetBx.getType()), strategy, state, loc);
}
};

Expand Down Expand Up @@ -589,20 +589,20 @@ void BLASKernelGenerator<hw>::gemmOffsetBatchABC(const GEMMProblem &problem, con
template <HW hw>
void BLASKernelGenerator<hw>::gemmFoldOffsets(const GEMMProblem &problem, const GEMMStrategy &strategy, GEMMState &state)
{
auto foldOrSave = [&](const MatrixAddressingStrategy &sX, Subregister &inputX, Subregister &offsetX, const Subregister &inputOffsetX, Subregister &saveOffsetX, bool newInput = false) {
auto foldOrSave = [&](const MatrixAddressingStrategy &sX, Subregister &inputX, Subregister &offsetX, const Subregister &inputOffsetX, Subregister &saveOffsetX, bool newInput = false, SourceLocation loc = {}) {
if (sX.base.isStateless()) {
auto oldInputX = inputX;
if (newInput)
inputX = state.ra.alloc_sub(DataType::uq, getHint(HintType::LongTerm, strategy));
eadd(1, inputX, oldInputX, offsetX, strategy, state);
eadd(1, inputX, oldInputX, offsetX, strategy, state, loc);
if (getBytes(offsetX.getType()) < 8) {
state.ra.safeRelease(offsetX);
offsetX = state.ra.alloc_sub(DataType::uq, getHint(HintType::LongTerm, strategy));
}
emov(1, offsetX, 0, strategy, state);
emov(1, offsetX, 0, strategy, state, loc);
} else {
offsetX = state.ra.alloc_sub(offsetX.getType(), getHint(HintType::LongTerm, strategy));
mov(1, offsetX, inputOffsetX);
mov(1, offsetX, inputOffsetX, loc);
}
saveOffsetX = offsetX;
};
Expand All @@ -624,11 +624,11 @@ void BLASKernelGenerator<hw>::gemmFoldOffsets(const GEMMProblem &problem, const
template <HW hw>
void BLASKernelGenerator<hw>::gemmRestoreOffsets(const GEMMProblem &problem, const GEMMStrategy &strategy, GEMMState &state)
{
auto zeroOrRestore = [&](const MatrixAddressingStrategy &sX, const Subregister &offsetX, const Subregister &inputOffsetX) {
auto zeroOrRestore = [&](const MatrixAddressingStrategy &sX, const Subregister &offsetX, const Subregister &inputOffsetX, SourceLocation loc = {}) {
if (sX.base.isStateless())
emov(1, offsetX, 0, strategy, state);
emov(1, offsetX, 0, strategy, state, loc);
else
mov(1, offsetX, inputOffsetX);
mov(1, offsetX, inputOffsetX, loc);
};

zeroOrRestore(strategy.A, state.saveOffsetA, state.inputs.offsetA);
Expand Down Expand Up @@ -774,12 +774,12 @@ void BLASKernelGenerator<hw>::gemmScaleInputs(const GEMMProblem &problem, const
auto Ta_ext = problem.Ta_ext, Tb_ext = problem.Tb_ext, Tc_ext = problem.Tc_ext, Tco = problem.Tco;
auto &inputs = state.inputs;

auto scale = [&](Type T, Subregister &s, Subregister defaultSrc = Subregister()) {
auto scale = [&](Type T, Subregister &s, Subregister defaultSrc = Subregister(), SourceLocation loc = {}) {
if (s.isValid())
emulConstant(1, s, s, T, strategy, state);
emulConstant(1, s, s, T, strategy, state, loc);
else if (defaultSrc.isValid()) {
s = state.ra.alloc_sub(defaultSrc.getType(), getHint(HintType::LongTerm, strategy));
emulConstant(1, s, defaultSrc, T, strategy, state);
emulConstant(1, s, defaultSrc, T, strategy, state, loc);
}
};

Expand Down Expand Up @@ -1052,7 +1052,7 @@ bool BLASKernelGenerator<hw>::gemmAccumulateCSetup(GEMMProblem &problem, GEMMStr
for (LoopType loop: {LoopM, LoopN, LoopK})
state.remaindersCoop[loop] = state.remainders[loop];

auto calcMNRemCoop = [&](CoopSplit split, bool isM) {
auto calcMNRemCoop = [&](CoopSplit split, bool isM, SourceLocation loc = {}) {
auto loopX = isM ? LoopM : LoopN;
auto loopY = isM ? LoopN : LoopM;
switch (split) {
Expand All @@ -1064,7 +1064,7 @@ bool BLASKernelGenerator<hw>::gemmAccumulateCSetup(GEMMProblem &problem, GEMMStr
auto rem = state.ra.alloc_sub<uint16_t>();
int32_t chunk = strategy.unroll[loopX] / strategy.wg[loopY];
auto lid = isM ? state.lidN : state.lidM;
emad(1 | sat, rem, state.remainders[loopX], -lid.w(), chunk, strategy, state);
emad(1 | sat, rem, state.remainders[loopX], -lid.w(), chunk, strategy, state, loc);
return rem;
}
}
Expand Down
Loading