1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm::AMDGPU {
44#define GET_D16ImageDimIntrinsics_IMPL
45#define GET_ImageDimIntrinsicTable_IMPL
46#define GET_RsrcIntrinsics_IMPL
47#include "AMDGPUGenSearchableTables.inc"
48} // namespace llvm::AMDGPU
49
50// Must be at least 4 to be able to branch over minimum unconditional branch
51// code. This is only for making it possible to write reasonably small tests for
52// long branches.
53static cl::opt<unsigned>
54BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
55 cl::desc("Restrict range of branch instructions (DEBUG)"));
56
57static cl::opt<bool> Fix16BitCopies(
58 "amdgpu-fix-16-bit-physreg-copies",
59 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
60 cl::init(true),
61 cl::ReallyHidden);
62
63SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
64 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
65 RI(ST), ST(ST) {
66 SchedModel.init(&ST);
67}
68
69//===----------------------------------------------------------------------===//
70// TargetInstrInfo callbacks
71//===----------------------------------------------------------------------===//
72
73static unsigned getNumOperandsNoGlue(SDNode *Node) {
74 unsigned N = Node->getNumOperands();
75 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
76 --N;
77 return N;
78}
79
80/// Returns true if both nodes have the same value for the given
81/// operand \p Op, or if both nodes do not have this operand.
82static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
83 unsigned Opc0 = N0->getMachineOpcode();
84 unsigned Opc1 = N1->getMachineOpcode();
85
86 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
87 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
88
89 if (Op0Idx == -1 && Op1Idx == -1)
90 return true;
91
92
93 if ((Op0Idx == -1 && Op1Idx != -1) ||
94 (Op1Idx == -1 && Op0Idx != -1))
95 return false;
96
97 // getNamedOperandIdx returns the index for the MachineInstr's operands,
98 // which includes the result as the first operand. We are indexing into the
99 // MachineSDNode's operands, so we need to skip the result operand to get
100 // the real index.
101 --Op0Idx;
102 --Op1Idx;
103
104 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
105}
106
107static bool canRemat(const MachineInstr &MI) {
108
109 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
110 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
111 SIInstrInfo::isSALU(MI))
112 return true;
113
114 if (SIInstrInfo::isSMRD(MI)) {
115 return !MI.memoperands_empty() &&
116 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
117 return MMO->isLoad() && MMO->isInvariant();
118 });
119 }
120
121 return false;
122}
123
124bool SIInstrInfo::isReallyTriviallyReMaterializable(
125 const MachineInstr &MI) const {
126
127 if (canRemat(MI)) {
128 // Normally a VALU use of exec would block rematerialization, but an
129 // implicit exec read is OK in this case since all VALU instructions have one.
130 // We really want all of the generic logic for this except for that check.
131
132 // Another potential implicit use is mode register. The core logic of
133 // the RA will not attempt rematerialization if mode is set anywhere
134 // in the function, otherwise it is safe since mode is not changed.
135
136 // There is a difference from the generic method, which does not allow
137 // rematerialization if there are virtual register uses. We allow this,
138 // and therefore this method covers SOP instructions as well.
139 if (!MI.hasImplicitDef() &&
140 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
141 !MI.mayRaiseFPException())
142 return true;
143 }
144
145 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
146}
147
148// Returns true if the scalar result of a VALU instruction depends on exec.
149static bool resultDependsOnExec(const MachineInstr &MI) {
150 // Ignore comparisons which are only used masked with exec.
151 // This allows some hoisting/sinking of VALU comparisons.
152 if (MI.isCompare()) {
153 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
154 Register DstReg = MI.getOperand(0).getReg();
155 if (!DstReg.isVirtual())
156 return true;
157 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
158 switch (Use.getOpcode()) {
159 case AMDGPU::S_AND_SAVEEXEC_B32:
160 case AMDGPU::S_AND_SAVEEXEC_B64:
161 break;
162 case AMDGPU::S_AND_B32:
163 case AMDGPU::S_AND_B64:
164 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
165 return true;
166 break;
167 default:
168 return true;
169 }
170 }
171 return false;
172 }
173
174 switch (MI.getOpcode()) {
175 default:
176 break;
177 case AMDGPU::V_READFIRSTLANE_B32:
178 return true;
179 }
180
181 return false;
182}
183
184bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
185 // Any implicit use of exec by VALU is not a real register read.
186 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
187 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
188}
189
190bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
191 MachineBasicBlock *SuccToSinkTo,
192 MachineCycleInfo *CI) const {
193 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
194 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
195 return true;
196
197 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
198 // Check if sinking of MI would create temporal divergent use.
199 for (auto Op : MI.uses()) {
200 if (Op.isReg() && Op.getReg().isVirtual() &&
201 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
202 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
203
204 // SgprDef defined inside cycle
205 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
206 if (FromCycle == nullptr)
207 continue;
208
209 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
210 // Check if there is a FromCycle that contains SgprDef's basic block but
211 // does not contain SuccToSinkTo and also has divergent exit condition.
212 while (FromCycle && !FromCycle->contains(ToCycle)) {
213 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
214 FromCycle->getExitingBlocks(ExitingBlocks);
215
216 // FromCycle has divergent exit condition.
217 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
218 if (hasDivergentBranch(ExitingBlock))
219 return false;
220 }
221
222 FromCycle = FromCycle->getParentCycle();
223 }
224 }
225 }
226
227 return true;
228}
229
230bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
231 int64_t &Offset0,
232 int64_t &Offset1) const {
233 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
234 return false;
235
236 unsigned Opc0 = Load0->getMachineOpcode();
237 unsigned Opc1 = Load1->getMachineOpcode();
238
239 // Make sure both are actually loads.
240 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
241 return false;
242
243 // A mayLoad instruction without a def is not a load. Likely a prefetch.
244 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
245 return false;
246
247 if (isDS(Opc0) && isDS(Opc1)) {
248
249 // FIXME: Handle this case:
250 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
251 return false;
252
253 // Check base reg.
254 if (Load0->getOperand(0) != Load1->getOperand(0))
255 return false;
256
257 // Skip read2 / write2 variants for simplicity.
258 // TODO: We should report true if the used offsets are adjacent (excluding
259 // the st64 versions).
260 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
261 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
262 if (Offset0Idx == -1 || Offset1Idx == -1)
263 return false;
264
265 // XXX - be careful of dataless loads
266 // getNamedOperandIdx returns the index for MachineInstrs. Since they
267 // include the output in the operand list, but SDNodes don't, we need to
268 // subtract the index by one.
269 Offset0Idx -= get(Opc0).NumDefs;
270 Offset1Idx -= get(Opc1).NumDefs;
271 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
272 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
273 return true;
274 }
275
276 if (isSMRD(Opc0) && isSMRD(Opc1)) {
277 // Skip time and cache invalidation instructions.
278 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
279 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
280 return false;
281
282 unsigned NumOps = getNumOperandsNoGlue(Load0);
283 if (NumOps != getNumOperandsNoGlue(Load1))
284 return false;
285
286 // Check base reg.
287 if (Load0->getOperand(0) != Load1->getOperand(0))
288 return false;
289
290 // Match register offsets, if both register and immediate offsets present.
291 assert(NumOps == 4 || NumOps == 5);
292 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
293 return false;
294
295 const ConstantSDNode *Load0Offset =
296 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
297 const ConstantSDNode *Load1Offset =
298 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
299
300 if (!Load0Offset || !Load1Offset)
301 return false;
302
303 Offset0 = Load0Offset->getZExtValue();
304 Offset1 = Load1Offset->getZExtValue();
305 return true;
306 }
307
308 // MUBUF and MTBUF can access the same addresses.
309 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
310
311 // MUBUF and MTBUF have vaddr at different indices.
312 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
313 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
314 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
315 return false;
316
317 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
318 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
319
320 if (OffIdx0 == -1 || OffIdx1 == -1)
321 return false;
322
323 // getNamedOperandIdx returns the index for MachineInstrs. Since they
324 // include the output in the operand list, but SDNodes don't, we need to
325 // subtract the index by one.
326 OffIdx0 -= get(Opc0).NumDefs;
327 OffIdx1 -= get(Opc1).NumDefs;
328
329 SDValue Off0 = Load0->getOperand(OffIdx0);
330 SDValue Off1 = Load1->getOperand(OffIdx1);
331
332 // The offset might be a FrameIndexSDNode.
333 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
334 return false;
335
336 Offset0 = Off0->getAsZExtVal();
337 Offset1 = Off1->getAsZExtVal();
338 return true;
339 }
340
341 return false;
342}
343
344static bool isStride64(unsigned Opc) {
345 switch (Opc) {
346 case AMDGPU::DS_READ2ST64_B32:
347 case AMDGPU::DS_READ2ST64_B64:
348 case AMDGPU::DS_WRITE2ST64_B32:
349 case AMDGPU::DS_WRITE2ST64_B64:
350 return true;
351 default:
352 return false;
353 }
354}
355
356bool SIInstrInfo::getMemOperandsWithOffsetWidth(
357 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
358 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
359 const TargetRegisterInfo *TRI) const {
360 if (!LdSt.mayLoadOrStore())
361 return false;
362
363 unsigned Opc = LdSt.getOpcode();
364 OffsetIsScalable = false;
365 const MachineOperand *BaseOp, *OffsetOp;
366 int DataOpIdx;
367
368 if (isDS(LdSt)) {
369 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
370 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
371 if (OffsetOp) {
372 // Normal, single offset LDS instruction.
373 if (!BaseOp) {
374 // DS_CONSUME/DS_APPEND use M0 for the base address.
375 // TODO: find the implicit use operand for M0 and use that as BaseOp?
376 return false;
377 }
378 BaseOps.push_back(BaseOp);
379 Offset = OffsetOp->getImm();
380 // Get appropriate operand, and compute width accordingly.
381 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
382 if (DataOpIdx == -1)
383 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
384 Width = getOpSize(LdSt, DataOpIdx);
385 } else {
386 // The 2 offset instructions use offset0 and offset1 instead. We can treat
387 // these as a load with a single offset if the 2 offsets are consecutive.
388 // We will use this for some partially aligned loads.
389 const MachineOperand *Offset0Op =
390 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
391 const MachineOperand *Offset1Op =
392 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
393
394 unsigned Offset0 = Offset0Op->getImm() & 0xff;
395 unsigned Offset1 = Offset1Op->getImm() & 0xff;
396 if (Offset0 + 1 != Offset1)
397 return false;
398
399 // Each of these offsets is in element sized units, so we need to convert
400 // to bytes of the individual reads.
401
402 unsigned EltSize;
403 if (LdSt.mayLoad())
404 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
405 else {
406 assert(LdSt.mayStore());
407 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
408 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
409 }
410
411 if (isStride64(Opc))
412 EltSize *= 64;
413
414 BaseOps.push_back(BaseOp);
415 Offset = EltSize * Offset0;
416 // Get appropriate operand(s), and compute width accordingly.
417 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
418 if (DataOpIdx == -1) {
419 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 Width = getOpSize(LdSt, DataOpIdx);
421 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
422 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
423 } else {
424 Width = getOpSize(LdSt, DataOpIdx);
425 }
426 }
427 return true;
428 }
429
430 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
431 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
432 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
433 return false;
434 BaseOps.push_back(RSrc);
435 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
436 if (BaseOp && !BaseOp->isFI())
437 BaseOps.push_back(BaseOp);
438 const MachineOperand *OffsetImm =
439 getNamedOperand(LdSt, AMDGPU::OpName::offset);
440 Offset = OffsetImm->getImm();
441 const MachineOperand *SOffset =
442 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
443 if (SOffset) {
444 if (SOffset->isReg())
445 BaseOps.push_back(SOffset);
446 else
447 Offset += SOffset->getImm();
448 }
449 // Get appropriate operand, and compute width accordingly.
450 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
451 if (DataOpIdx == -1)
452 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
453 if (DataOpIdx == -1) // LDS DMA
454 return false;
455 Width = getOpSize(LdSt, DataOpIdx);
456 return true;
457 }
458
459 if (isImage(LdSt)) {
460 auto RsrcOpName =
461 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
462 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
463 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
464 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
465 if (VAddr0Idx >= 0) {
466 // GFX10 possible NSA encoding.
467 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
468 BaseOps.push_back(&LdSt.getOperand(I));
469 } else {
470 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
471 }
472 Offset = 0;
473 // Get appropriate operand, and compute width accordingly.
474 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
475 if (DataOpIdx == -1)
476 return false; // no return sampler
477 Width = getOpSize(LdSt, DataOpIdx);
478 return true;
479 }
480
481 if (isSMRD(LdSt)) {
482 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
483 if (!BaseOp) // e.g. S_MEMTIME
484 return false;
485 BaseOps.push_back(BaseOp);
486 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
487 Offset = OffsetOp ? OffsetOp->getImm() : 0;
488 // Get appropriate operand, and compute width accordingly.
489 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
490 if (DataOpIdx == -1)
491 return false;
492 Width = getOpSize(LdSt, DataOpIdx);
493 return true;
494 }
495
496 if (isFLAT(LdSt)) {
497 // Instructions have either vaddr or saddr or both or none.
498 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
499 if (BaseOp)
500 BaseOps.push_back(BaseOp);
501 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
502 if (BaseOp)
503 BaseOps.push_back(BaseOp);
504 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
505 // Get appropriate operand, and compute width accordingly.
506 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
507 if (DataOpIdx == -1)
508 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
509 if (DataOpIdx == -1) // LDS DMA
510 return false;
511 Width = getOpSize(LdSt, DataOpIdx);
512 return true;
513 }
514
515 return false;
516}
517
518static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
519 ArrayRef<const MachineOperand *> BaseOps1,
520 const MachineInstr &MI2,
521 ArrayRef<const MachineOperand *> BaseOps2) {
522 // Only examine the first "base" operand of each instruction, on the
523 // assumption that it represents the real base address of the memory access.
524 // Other operands are typically offsets or indices from this base address.
525 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
526 return true;
527
528 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
529 return false;
530
531 auto *MO1 = *MI1.memoperands_begin();
532 auto *MO2 = *MI2.memoperands_begin();
533 if (MO1->getAddrSpace() != MO2->getAddrSpace())
534 return false;
535
536 const auto *Base1 = MO1->getValue();
537 const auto *Base2 = MO2->getValue();
538 if (!Base1 || !Base2)
539 return false;
540 Base1 = getUnderlyingObject(Base1);
541 Base2 = getUnderlyingObject(Base2);
542
543 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
544 return false;
545
546 return Base1 == Base2;
547}
548
549bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
550 int64_t Offset1, bool OffsetIsScalable1,
551 ArrayRef<const MachineOperand *> BaseOps2,
552 int64_t Offset2, bool OffsetIsScalable2,
553 unsigned ClusterSize,
554 unsigned NumBytes) const {
555 // If the mem ops (to be clustered) do not have the same base ptr, then they
556 // should not be clustered
557 unsigned MaxMemoryClusterDWords = DefaultMemoryClusterDWordsLimit;
558 if (!BaseOps1.empty() && !BaseOps2.empty()) {
559 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
560 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
561 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
562 return false;
563
564 const SIMachineFunctionInfo *MFI =
565 FirstLdSt.getMF()->getInfo<SIMachineFunctionInfo>();
566 MaxMemoryClusterDWords = MFI->getMaxMemoryClusterDWords();
567 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
568 // If only one base op is empty, they do not have the same base ptr
569 return false;
570 }
571
572 // In order to avoid register pressure, on average, the number of DWORDS
573 // loaded together by all clustered mem ops should not exceed
574 // MaxMemoryClusterDWords. This is an empirical value based on certain
575 // observations and performance related experiments.
576 // The good thing about this heuristic is - it avoids clustering of too many
577 // sub-word loads, and also avoids clustering of wide loads. Below is the
578 // brief summary of how the heuristic behaves for various `LoadSize` when
579 // MaxMemoryClusterDWords is 8.
580 //
581 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
582 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
583 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
584 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
585 // (5) LoadSize >= 17: do not cluster
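  // For example, with ClusterSize = 4 and NumBytes = 32, LoadSize is 8 bytes,
  // so NumDWords = ((8 + 3) / 4) * 4 = 8, which is still clustered when
  // MaxMemoryClusterDWords is 8 (case (2) above).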
586 const unsigned LoadSize = NumBytes / ClusterSize;
587 const unsigned NumDWords = ((LoadSize + 3) / 4) * ClusterSize;
588 return NumDWords <= MaxMemoryClusterDWords;
589}
590
591// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
592// the first 16 loads will be interleaved with the stores, and the next 16 will
593// be clustered as expected. It should really split into two batches of 16 stores.
594//
595// Loads are clustered until this returns false, rather than trying to schedule
596// groups of stores. This also means we have to deal with saying different
597// address space loads should be clustered, and ones which might cause bank
598// conflicts.
599//
600// This might be deprecated so it might not be worth that much effort to fix.
601bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
602 int64_t Offset0, int64_t Offset1,
603 unsigned NumLoads) const {
604 assert(Offset1 > Offset0 &&
605 "Second offset should be larger than first offset!");
606 // If we have less than 16 loads in a row, and the offsets are within 64
607 // bytes, then schedule together.
608
609 // A cacheline is 64 bytes (for global memory).
610 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
611}
612
613static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
614 MachineBasicBlock::iterator MI,
615 const DebugLoc &DL, MCRegister DestReg,
616 MCRegister SrcReg, bool KillSrc,
617 const char *Msg = "illegal VGPR to SGPR copy") {
618 MachineFunction *MF = MBB.getParent();
619 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
620 LLVMContext &C = MF->getFunction().getContext();
621 C.diagnose(IllegalCopy);
622
623 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
624 .addReg(SrcReg, getKillRegState(KillSrc));
625}
626
627/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
628/// possible to have a direct copy in these cases on GFX908, so an intermediate
629/// VGPR copy is required.
630static void indirectCopyToAGPR(const SIInstrInfo &TII,
631 MachineBasicBlock &MBB,
632 MachineBasicBlock::iterator MI,
633 const DebugLoc &DL, MCRegister DestReg,
634 MCRegister SrcReg, bool KillSrc,
635 RegScavenger &RS, bool RegsOverlap,
636 Register ImpDefSuperReg = Register(),
637 Register ImpUseSuperReg = Register()) {
638 assert((TII.getSubtarget().hasMAIInsts() &&
639 !TII.getSubtarget().hasGFX90AInsts()) &&
640 "Expected GFX908 subtarget.");
641
642 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
643 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
644 "Source register of the copy should be either an SGPR or an AGPR.");
645
646 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
647 "Destination register of the copy should be an AGPR.");
648
649 const SIRegisterInfo &RI = TII.getRegisterInfo();
650
651 // First try to find defining accvgpr_write to avoid temporary registers.
652 // In the case of copies of overlapping AGPRs, we conservatively do not
653 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
654 // an accvgpr_write used for this same copy due to implicit-defs
655 if (!RegsOverlap) {
656 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
657 --Def;
658
659 if (!Def->modifiesRegister(SrcReg, &RI))
660 continue;
661
662 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
663 Def->getOperand(0).getReg() != SrcReg)
664 break;
665
666 MachineOperand &DefOp = Def->getOperand(1);
667 assert(DefOp.isReg() || DefOp.isImm());
668
669 if (DefOp.isReg()) {
670 bool SafeToPropagate = true;
671 // Check that register source operand is not clobbered before MI.
672 // Immediate operands are always safe to propagate.
673 for (auto I = Def; I != MI && SafeToPropagate; ++I)
674 if (I->modifiesRegister(DefOp.getReg(), &RI))
675 SafeToPropagate = false;
676
677 if (!SafeToPropagate)
678 break;
679
680 DefOp.setIsKill(false);
681 }
682
683 MachineInstrBuilder Builder =
684 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
685 .add(DefOp);
686 if (ImpDefSuperReg)
687 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
688
689 if (ImpUseSuperReg) {
690 Builder.addReg(ImpUseSuperReg,
691 getKillRegState(KillSrc) | RegState::Implicit);
692 }
693
694 return;
695 }
696 }
697
698 RS.enterBasicBlockEnd(MBB);
699 RS.backward(std::next(MI));
700
701 // Ideally we want to have three registers for a long reg_sequence copy
702 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
703 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
704 *MBB.getParent());
705
706 // Registers in the sequence are allocated contiguously so we can just
707 // use register number to pick one of three round-robin temps.
708 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
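  // RegNo here is 0, 1, or 2 depending on the destination AGPR; the scavenging
  // loop below tries up to RegNo additional VGPRs and keeps the last one found,
  // so consecutive subregister copies rotate through different temporaries.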
709 Register Tmp =
710 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
711 assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
712 "VGPR used for an intermediate copy should have been reserved.");
713
714 // Only loop through if there are any free registers left. We don't want to
715 // spill.
716 while (RegNo--) {
717 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
718 /* RestoreAfter */ false, 0,
719 /* AllowSpill */ false);
720 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
721 break;
722 Tmp = Tmp2;
723 RS.setRegUsed(Tmp);
724 }
725
726 // Insert copy to temporary VGPR.
727 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
728 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
729 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
730 } else {
731 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
732 }
733
734 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
735 .addReg(SrcReg, getKillRegState(KillSrc));
736 if (ImpUseSuperReg) {
737 UseBuilder.addReg(ImpUseSuperReg,
738 getKillRegState(KillSrc) | RegState::Implicit);
739 }
740
741 MachineInstrBuilder DefBuilder
742 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
743 .addReg(Tmp, RegState::Kill);
744
745 if (ImpDefSuperReg)
746 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
747}
748
749static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
750 MachineBasicBlock::iterator MI, const DebugLoc &DL,
751 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
752 const TargetRegisterClass *RC, bool Forward) {
753 const SIRegisterInfo &RI = TII.getRegisterInfo();
754 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
755 MachineBasicBlock::iterator I = MI;
756 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
757
758 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
759 int16_t SubIdx = BaseIndices[Idx];
760 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
761 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
762 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
763 unsigned Opcode = AMDGPU::S_MOV_B32;
764
765 // Is SGPR aligned? If so try to combine with next.
766 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
767 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
768 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
769 // Can use SGPR64 copy
770 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
771 SubIdx = RI.getSubRegFromChannel(Channel, 2);
772 DestSubReg = RI.getSubReg(DestReg, SubIdx);
773 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
774 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
775 Opcode = AMDGPU::S_MOV_B64;
776 Idx++;
777 }
778
779 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
780 .addReg(SrcSubReg)
781 .addReg(SrcReg, RegState::Implicit);
782
783 if (!FirstMI)
784 FirstMI = LastMI;
785
786 if (!Forward)
787 I--;
788 }
789
790 assert(FirstMI && LastMI);
791 if (!Forward)
792 std::swap(FirstMI, LastMI);
793
794 FirstMI->addOperand(
795 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
796
797 if (KillSrc)
798 LastMI->addRegisterKilled(SrcReg, &RI);
799}
800
801void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
802 MachineBasicBlock::iterator MI,
803 const DebugLoc &DL, MCRegister DestReg,
804 MCRegister SrcReg, bool KillSrc,
805 bool RenamableDest, bool RenamableSrc) const {
806 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
807 unsigned Size = RI.getRegSizeInBits(*RC);
808 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
809 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
810
811 // The rest of copyPhysReg assumes Src and Dst size are the same size.
812 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
813 // we remove Fix16BitCopies and this code block?
814 if (Fix16BitCopies) {
815 if (((Size == 16) != (SrcSize == 16))) {
816 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
818 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
819 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
820 RegToFix = SubReg;
821
822 if (DestReg == SrcReg) {
823 // Identity copy. Insert empty bundle since ExpandPostRA expects an
824 // instruction here.
825 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
826 return;
827 }
828 RC = RI.getPhysRegBaseClass(DestReg);
829 Size = RI.getRegSizeInBits(*RC);
830 SrcRC = RI.getPhysRegBaseClass(SrcReg);
831 SrcSize = RI.getRegSizeInBits(*SrcRC);
832 }
833 }
834
835 if (RC == &AMDGPU::VGPR_32RegClass) {
836 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
837 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
838 AMDGPU::AGPR_32RegClass.contains(SrcReg));
839 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
840 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
841 BuildMI(MBB, MI, DL, get(Opc), DestReg)
842 .addReg(SrcReg, getKillRegState(KillSrc));
843 return;
844 }
845
846 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
847 RC == &AMDGPU::SReg_32RegClass) {
848 if (SrcReg == AMDGPU::SCC) {
849 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
850 .addImm(1)
851 .addImm(0);
852 return;
853 }
854
855 if (DestReg == AMDGPU::VCC_LO) {
856 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
857 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
858 .addReg(SrcReg, getKillRegState(KillSrc));
859 } else {
860 // FIXME: Hack until VReg_1 removed.
861 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
862 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
863 .addImm(0)
864 .addReg(SrcReg, getKillRegState(KillSrc));
865 }
866
867 return;
868 }
869
870 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
871 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
872 return;
873 }
874
875 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
876 .addReg(SrcReg, getKillRegState(KillSrc));
877 return;
878 }
879
880 if (RC == &AMDGPU::SReg_64RegClass) {
881 if (SrcReg == AMDGPU::SCC) {
882 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
883 .addImm(1)
884 .addImm(0);
885 return;
886 }
887
888 if (DestReg == AMDGPU::VCC) {
889 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
890 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
891 .addReg(SrcReg, getKillRegState(KillSrc));
892 } else {
893 // FIXME: Hack until VReg_1 removed.
894 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
895 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
896 .addImm(0)
897 .addReg(SrcReg, getKillRegState(KillSrc));
898 }
899
900 return;
901 }
902
903 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
904 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
905 return;
906 }
907
908 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
909 .addReg(SrcReg, getKillRegState(KillSrc));
910 return;
911 }
912
913 if (DestReg == AMDGPU::SCC) {
914 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
915 // but SelectionDAG emits such copies for i1 sources.
916 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
917 // This copy can only be produced by patterns
918 // with explicit SCC, which are known to be enabled
919 // only for subtargets with S_CMP_LG_U64 present.
920 assert(ST.hasScalarCompareEq64());
921 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
922 .addReg(SrcReg, getKillRegState(KillSrc))
923 .addImm(0);
924 } else {
925 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
926 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
927 .addReg(SrcReg, getKillRegState(KillSrc))
928 .addImm(0);
929 }
930
931 return;
932 }
933
934 if (RC == &AMDGPU::AGPR_32RegClass) {
935 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
936 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
937 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
938 .addReg(SrcReg, getKillRegState(KillSrc));
939 return;
940 }
941
942 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
943 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
944 .addReg(SrcReg, getKillRegState(KillSrc));
945 return;
946 }
947
948 // FIXME: Pass should maintain scavenger to avoid scan through the block on
949 // every AGPR spill.
950 RegScavenger RS;
951 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
952 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
953 return;
954 }
955
956 if (Size == 16) {
957 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
958 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
959 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
960
961 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
962 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
963 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
964 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
965 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
966 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
967 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
968 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
969
970 if (IsSGPRDst) {
971 if (!IsSGPRSrc) {
972 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
973 return;
974 }
975
976 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
977 .addReg(NewSrcReg, getKillRegState(KillSrc));
978 return;
979 }
980
981 if (IsAGPRDst || IsAGPRSrc) {
982 if (!DstLow || !SrcLow) {
983 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
984 "Cannot use hi16 subreg with an AGPR!");
985 }
986
987 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
988 return;
989 }
990
991 if (ST.hasTrue16BitInsts()) {
992 if (IsSGPRSrc) {
993 assert(SrcLow);
994 SrcReg = NewSrcReg;
995 }
996 // Use the smaller instruction encoding if possible.
997 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
998 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
999 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
1000 .addReg(SrcReg);
1001 } else {
1002 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
1003 .addImm(0) // src0_modifiers
1004 .addReg(SrcReg)
1005 .addImm(0); // op_sel
1006 }
1007 return;
1008 }
1009
1010 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1011 if (!DstLow || !SrcLow) {
1012 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1013 "Cannot use hi16 subreg on VI!");
1014 }
1015
1016 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1017 .addReg(NewSrcReg, getKillRegState(KillSrc));
1018 return;
1019 }
1020
1021 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1022 .addImm(0) // src0_modifiers
1023 .addReg(NewSrcReg)
1024 .addImm(0) // clamp
1025 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1026 : AMDGPU::SDWA::SdwaSel::WORD_1)
1027 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1028 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1029 : AMDGPU::SDWA::SdwaSel::WORD_1)
1030 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1031 // First implicit operand is $exec.
1032 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1033 return;
1034 }
1035
1036 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1037 if (ST.hasMovB64()) {
1038 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1039 .addReg(SrcReg, getKillRegState(KillSrc));
1040 return;
1041 }
1042 if (ST.hasPkMovB32()) {
1043 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1044 .addImm(SISrcMods::OP_SEL_1)
1045 .addReg(SrcReg)
1046 .addImm(SISrcMods::OP_SEL_1 | SISrcMods::OP_SEL_0)
1047 .addReg(SrcReg)
1048 .addImm(0) // op_sel_lo
1049 .addImm(0) // op_sel_hi
1050 .addImm(0) // neg_lo
1051 .addImm(0) // neg_hi
1052 .addImm(0) // clamp
1053 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1054 return;
1055 }
1056 }
1057
1058 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1059 if (RI.isSGPRClass(RC)) {
1060 if (!RI.isSGPRClass(SrcRC)) {
1061 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1062 return;
1063 }
1064 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1065 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1066 Forward);
1067 return;
1068 }
1069
1070 unsigned EltSize = 4;
1071 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1072 if (RI.isAGPRClass(RC)) {
1073 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1074 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1075 else if (RI.hasVGPRs(SrcRC) ||
1076 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1077 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1078 else
1079 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1080 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1081 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1082 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1083 (RI.isProperlyAlignedRC(*RC) &&
1084 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1085 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1086 if (ST.hasMovB64()) {
1087 Opcode = AMDGPU::V_MOV_B64_e32;
1088 EltSize = 8;
1089 } else if (ST.hasPkMovB32()) {
1090 Opcode = AMDGPU::V_PK_MOV_B32;
1091 EltSize = 8;
1092 }
1093 }
1094
1095 // For the cases where we need an intermediate instruction/temporary register
1096 // (destination is an AGPR), we need a scavenger.
1097 //
1098 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1099 // whole block for every handled copy.
1100 std::unique_ptr<RegScavenger> RS;
1101 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1102 RS = std::make_unique<RegScavenger>();
1103
1104 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1105
1106 // If there is an overlap, we can't kill the super-register on the last
1107 // instruction, since it will also kill the components made live by this def.
1108 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1109 const bool CanKillSuperReg = KillSrc && !Overlap;
1110
1111 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1112 unsigned SubIdx;
1113 if (Forward)
1114 SubIdx = SubIndices[Idx];
1115 else
1116 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1117 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1118 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1119 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1120
1121 bool IsFirstSubreg = Idx == 0;
1122 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1123
1124 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1125 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1126 Register ImpUseSuper = SrcReg;
1127 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1128 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1129 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1130 MachineInstrBuilder MIB =
1131 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1132 .addImm(SISrcMods::OP_SEL_1)
1133 .addReg(SrcSubReg)
1134 .addImm(SISrcMods::OP_SEL_1 | SISrcMods::OP_SEL_0)
1135 .addReg(SrcSubReg)
1136 .addImm(0) // op_sel_lo
1137 .addImm(0) // op_sel_hi
1138 .addImm(0) // neg_lo
1139 .addImm(0) // neg_hi
1140 .addImm(0) // clamp
1141 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1142 if (IsFirstSubreg)
1143 MIB.addReg(DestReg, RegState::Implicit | RegState::Define);
1144 } else {
1145 MachineInstrBuilder Builder =
1146 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1147 if (IsFirstSubreg)
1148 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1149
1150 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1151 }
1152 }
1153}
1154
1155int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1156 int NewOpc;
1157
1158 // Try to map original to commuted opcode
1159 NewOpc = AMDGPU::getCommuteRev(Opcode);
1160 if (NewOpc != -1)
1161 // Check if the commuted (REV) opcode exists on the target.
1162 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1163
1164 // Try to map commuted to original opcode
1165 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1166 if (NewOpc != -1)
1167 // Check if the original (non-REV) opcode exists on the target.
1168 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1169
1170 return Opcode;
1171}
1172
1173void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1174 MachineBasicBlock::iterator MI,
1175 const DebugLoc &DL, Register DestReg,
1176 int64_t Value) const {
1177 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1178 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1179 if (RegClass == &AMDGPU::SReg_32RegClass ||
1180 RegClass == &AMDGPU::SGPR_32RegClass ||
1181 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1182 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1183 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1184 .addImm(Value);
1185 return;
1186 }
1187
1188 if (RegClass == &AMDGPU::SReg_64RegClass ||
1189 RegClass == &AMDGPU::SGPR_64RegClass ||
1190 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1191 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1192 .addImm(Value);
1193 return;
1194 }
1195
1196 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1197 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1198 .addImm(Value);
1199 return;
1200 }
1201 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1202 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1203 .addImm(Value);
1204 return;
1205 }
1206
1207 unsigned EltSize = 4;
1208 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1209 if (RI.isSGPRClass(RegClass)) {
1210 if (RI.getRegSizeInBits(*RegClass) > 32) {
1211 Opcode = AMDGPU::S_MOV_B64;
1212 EltSize = 8;
1213 } else {
1214 Opcode = AMDGPU::S_MOV_B32;
1215 EltSize = 4;
1216 }
1217 }
1218
1219 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1220 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1221 int64_t IdxValue = Idx == 0 ? Value : 0;
1222
1223 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1224 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1225 Builder.addImm(IdxValue);
1226 }
1227}
1228
1229const TargetRegisterClass *
1230SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1231 return &AMDGPU::VGPR_32RegClass;
1232}
1233
1234void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1235 MachineBasicBlock::iterator I,
1236 const DebugLoc &DL, Register DstReg,
1237 ArrayRef<MachineOperand> Cond,
1238 Register TrueReg,
1239 Register FalseReg) const {
1240 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1241 const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass();
1242 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1243 "Not a VGPR32 reg");
1244
1245 if (Cond.size() == 1) {
1246 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1247 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1248 .add(Cond[0]);
1249 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1250 .addImm(0)
1251 .addReg(FalseReg)
1252 .addImm(0)
1253 .addReg(TrueReg)
1254 .addReg(SReg);
1255 } else if (Cond.size() == 2) {
1256 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1257 switch (Cond[0].getImm()) {
1258 case SIInstrInfo::SCC_TRUE: {
1259 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1260 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1261 : AMDGPU::S_CSELECT_B64), SReg)
1262 .addImm(1)
1263 .addImm(0);
1264 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1265 .addImm(0)
1266 .addReg(FalseReg)
1267 .addImm(0)
1268 .addReg(TrueReg)
1269 .addReg(SReg);
1270 break;
1271 }
1272 case SIInstrInfo::SCC_FALSE: {
1273 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1274 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1275 : AMDGPU::S_CSELECT_B64), SReg)
1276 .addImm(0)
1277 .addImm(1);
1278 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1279 .addImm(0)
1280 .addReg(FalseReg)
1281 .addImm(0)
1282 .addReg(TrueReg)
1283 .addReg(SReg);
1284 break;
1285 }
1286 case SIInstrInfo::VCCNZ: {
1287 MachineOperand RegOp = Cond[1];
1288 RegOp.setImplicit(false);
1289 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1290 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1291 .add(RegOp);
1292 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1293 .addImm(0)
1294 .addReg(FalseReg)
1295 .addImm(0)
1296 .addReg(TrueReg)
1297 .addReg(SReg);
1298 break;
1299 }
1300 case SIInstrInfo::VCCZ: {
1301 MachineOperand RegOp = Cond[1];
1302 RegOp.setImplicit(false);
1303 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1304 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1305 .add(RegOp);
1306 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1307 .addImm(0)
1308 .addReg(TrueReg)
1309 .addImm(0)
1310 .addReg(FalseReg)
1311 .addReg(SReg);
1312 break;
1313 }
1314 case SIInstrInfo::EXECNZ: {
1315 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1316 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1317 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1318 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1319 .addImm(0);
1320 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1321 : AMDGPU::S_CSELECT_B64), SReg)
1322 .addImm(1)
1323 .addImm(0);
1324 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1325 .addImm(0)
1326 .addReg(FalseReg)
1327 .addImm(0)
1328 .addReg(TrueReg)
1329 .addReg(SReg);
1330 break;
1331 }
1332 case SIInstrInfo::EXECZ: {
1333 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1334 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1335 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1336 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1337 .addImm(0);
1338 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1339 : AMDGPU::S_CSELECT_B64), SReg)
1340 .addImm(0)
1341 .addImm(1);
1342 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1343 .addImm(0)
1344 .addReg(FalseReg)
1345 .addImm(0)
1346 .addReg(TrueReg)
1347 .addReg(SReg);
1348 llvm_unreachable("Unhandled branch predicate EXECZ");
1349 break;
1350 }
1351 default:
1352 llvm_unreachable("invalid branch predicate");
1353 }
1354 } else {
1355 llvm_unreachable("Can only handle Cond size 1 or 2");
1356 }
1357}
1358
1359Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1360 MachineBasicBlock::iterator I,
1361 const DebugLoc &DL,
1362 Register SrcReg, int Value) const {
1363 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1364 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1365 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1366 .addImm(Value)
1367 .addReg(SrcReg);
1368
1369 return Reg;
1370}
1371
1372Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1373 MachineBasicBlock::iterator I,
1374 const DebugLoc &DL,
1375 Register SrcReg, int Value) const {
1376 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1377 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1378 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1379 .addImm(Value)
1380 .addReg(SrcReg);
1381
1382 return Reg;
1383}
1384
1385unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1386
1387 if (RI.isAGPRClass(DstRC))
1388 return AMDGPU::COPY;
1389 if (RI.getRegSizeInBits(*DstRC) == 16) {
1390 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1391 // before RA.
1392 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1393 }
1394 if (RI.getRegSizeInBits(*DstRC) == 32)
1395 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1396 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1397 return AMDGPU::S_MOV_B64;
1398 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1399 return AMDGPU::V_MOV_B64_PSEUDO;
1400 return AMDGPU::COPY;
1401}
1402
1403const MCInstrDesc &
1404SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1405 bool IsIndirectSrc) const {
1406 if (IsIndirectSrc) {
1407 if (VecSize <= 32) // 4 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1409 if (VecSize <= 64) // 8 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1411 if (VecSize <= 96) // 12 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1413 if (VecSize <= 128) // 16 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1415 if (VecSize <= 160) // 20 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1417 if (VecSize <= 256) // 32 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1419 if (VecSize <= 288) // 36 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1421 if (VecSize <= 320) // 40 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1423 if (VecSize <= 352) // 44 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1425 if (VecSize <= 384) // 48 bytes
1426 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1427 if (VecSize <= 512) // 64 bytes
1428 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1429 if (VecSize <= 1024) // 128 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1431
1432 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1433 }
1434
1435 if (VecSize <= 32) // 4 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1437 if (VecSize <= 64) // 8 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1439 if (VecSize <= 96) // 12 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1441 if (VecSize <= 128) // 16 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1443 if (VecSize <= 160) // 20 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1445 if (VecSize <= 256) // 32 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1447 if (VecSize <= 288) // 36 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1449 if (VecSize <= 320) // 40 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1451 if (VecSize <= 352) // 44 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1453 if (VecSize <= 384) // 48 bytes
1454 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1455 if (VecSize <= 512) // 64 bytes
1456 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1457 if (VecSize <= 1024) // 128 bytes
1458 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1459
1460 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1461}
1462
1463static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1464 if (VecSize <= 32) // 4 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1466 if (VecSize <= 64) // 8 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1468 if (VecSize <= 96) // 12 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1470 if (VecSize <= 128) // 16 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1472 if (VecSize <= 160) // 20 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1474 if (VecSize <= 256) // 32 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1476 if (VecSize <= 288) // 36 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1478 if (VecSize <= 320) // 40 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1480 if (VecSize <= 352) // 44 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1482 if (VecSize <= 384) // 48 bytes
1483 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1484 if (VecSize <= 512) // 64 bytes
1485 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1486 if (VecSize <= 1024) // 128 bytes
1487 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1488
1489 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1490}
1491
1492static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1493 if (VecSize <= 32) // 4 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1495 if (VecSize <= 64) // 8 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1497 if (VecSize <= 96) // 12 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1499 if (VecSize <= 128) // 16 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1501 if (VecSize <= 160) // 20 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1503 if (VecSize <= 256) // 32 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1505 if (VecSize <= 288) // 36 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1507 if (VecSize <= 320) // 40 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1509 if (VecSize <= 352) // 44 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1511 if (VecSize <= 384) // 48 bytes
1512 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1513 if (VecSize <= 512) // 64 bytes
1514 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1515 if (VecSize <= 1024) // 128 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1517
1518 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1519}
1520
1521static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1522 if (VecSize <= 64) // 8 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1524 if (VecSize <= 128) // 16 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1526 if (VecSize <= 256) // 32 bytes
1527 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1528 if (VecSize <= 512) // 64 bytes
1529 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1530 if (VecSize <= 1024) // 128 bytes
1531 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1532
1533 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1534}
1535
1536const MCInstrDesc &
1537SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1538 bool IsSGPR) const {
1539 if (IsSGPR) {
1540 switch (EltSize) {
1541 case 32:
1542 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1543 case 64:
1544 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1545 default:
1546 llvm_unreachable("invalid reg indexing elt size");
1547 }
1548 }
1549
1550 assert(EltSize == 32 && "invalid reg indexing elt size");
1551 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1552}
1553
1554static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1555 switch (Size) {
1556 case 4:
1557 return AMDGPU::SI_SPILL_S32_SAVE;
1558 case 8:
1559 return AMDGPU::SI_SPILL_S64_SAVE;
1560 case 12:
1561 return AMDGPU::SI_SPILL_S96_SAVE;
1562 case 16:
1563 return AMDGPU::SI_SPILL_S128_SAVE;
1564 case 20:
1565 return AMDGPU::SI_SPILL_S160_SAVE;
1566 case 24:
1567 return AMDGPU::SI_SPILL_S192_SAVE;
1568 case 28:
1569 return AMDGPU::SI_SPILL_S224_SAVE;
1570 case 32:
1571 return AMDGPU::SI_SPILL_S256_SAVE;
1572 case 36:
1573 return AMDGPU::SI_SPILL_S288_SAVE;
1574 case 40:
1575 return AMDGPU::SI_SPILL_S320_SAVE;
1576 case 44:
1577 return AMDGPU::SI_SPILL_S352_SAVE;
1578 case 48:
1579 return AMDGPU::SI_SPILL_S384_SAVE;
1580 case 64:
1581 return AMDGPU::SI_SPILL_S512_SAVE;
1582 case 128:
1583 return AMDGPU::SI_SPILL_S1024_SAVE;
1584 default:
1585 llvm_unreachable("unknown register size");
1586 }
1587}
1588
1589static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1590 switch (Size) {
1591 case 4:
1592 return AMDGPU::SI_SPILL_V32_SAVE;
1593 case 8:
1594 return AMDGPU::SI_SPILL_V64_SAVE;
1595 case 12:
1596 return AMDGPU::SI_SPILL_V96_SAVE;
1597 case 16:
1598 return AMDGPU::SI_SPILL_V128_SAVE;
1599 case 20:
1600 return AMDGPU::SI_SPILL_V160_SAVE;
1601 case 24:
1602 return AMDGPU::SI_SPILL_V192_SAVE;
1603 case 28:
1604 return AMDGPU::SI_SPILL_V224_SAVE;
1605 case 32:
1606 return AMDGPU::SI_SPILL_V256_SAVE;
1607 case 36:
1608 return AMDGPU::SI_SPILL_V288_SAVE;
1609 case 40:
1610 return AMDGPU::SI_SPILL_V320_SAVE;
1611 case 44:
1612 return AMDGPU::SI_SPILL_V352_SAVE;
1613 case 48:
1614 return AMDGPU::SI_SPILL_V384_SAVE;
1615 case 64:
1616 return AMDGPU::SI_SPILL_V512_SAVE;
1617 case 128:
1618 return AMDGPU::SI_SPILL_V1024_SAVE;
1619 default:
1620 llvm_unreachable("unknown register size");
1621 }
1622}
1623
1624static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1625 switch (Size) {
1626 case 4:
1627 return AMDGPU::SI_SPILL_A32_SAVE;
1628 case 8:
1629 return AMDGPU::SI_SPILL_A64_SAVE;
1630 case 12:
1631 return AMDGPU::SI_SPILL_A96_SAVE;
1632 case 16:
1633 return AMDGPU::SI_SPILL_A128_SAVE;
1634 case 20:
1635 return AMDGPU::SI_SPILL_A160_SAVE;
1636 case 24:
1637 return AMDGPU::SI_SPILL_A192_SAVE;
1638 case 28:
1639 return AMDGPU::SI_SPILL_A224_SAVE;
1640 case 32:
1641 return AMDGPU::SI_SPILL_A256_SAVE;
1642 case 36:
1643 return AMDGPU::SI_SPILL_A288_SAVE;
1644 case 40:
1645 return AMDGPU::SI_SPILL_A320_SAVE;
1646 case 44:
1647 return AMDGPU::SI_SPILL_A352_SAVE;
1648 case 48:
1649 return AMDGPU::SI_SPILL_A384_SAVE;
1650 case 64:
1651 return AMDGPU::SI_SPILL_A512_SAVE;
1652 case 128:
1653 return AMDGPU::SI_SPILL_A1024_SAVE;
1654 default:
1655 llvm_unreachable("unknown register size");
1656 }
1657}
1658
1659static unsigned getAVSpillSaveOpcode(unsigned Size) {
1660 switch (Size) {
1661 case 4:
1662 return AMDGPU::SI_SPILL_AV32_SAVE;
1663 case 8:
1664 return AMDGPU::SI_SPILL_AV64_SAVE;
1665 case 12:
1666 return AMDGPU::SI_SPILL_AV96_SAVE;
1667 case 16:
1668 return AMDGPU::SI_SPILL_AV128_SAVE;
1669 case 20:
1670 return AMDGPU::SI_SPILL_AV160_SAVE;
1671 case 24:
1672 return AMDGPU::SI_SPILL_AV192_SAVE;
1673 case 28:
1674 return AMDGPU::SI_SPILL_AV224_SAVE;
1675 case 32:
1676 return AMDGPU::SI_SPILL_AV256_SAVE;
1677 case 36:
1678 return AMDGPU::SI_SPILL_AV288_SAVE;
1679 case 40:
1680 return AMDGPU::SI_SPILL_AV320_SAVE;
1681 case 44:
1682 return AMDGPU::SI_SPILL_AV352_SAVE;
1683 case 48:
1684 return AMDGPU::SI_SPILL_AV384_SAVE;
1685 case 64:
1686 return AMDGPU::SI_SPILL_AV512_SAVE;
1687 case 128:
1688 return AMDGPU::SI_SPILL_AV1024_SAVE;
1689 default:
1690 llvm_unreachable("unknown register size");
1691 }
1692}
1693
1694static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1695 bool IsVectorSuperClass) {
1696 // Currently, only 32-bit WWM register spills are needed.
1697 if (Size != 4)
1698 llvm_unreachable("unknown wwm register spill size");
1699
1700 if (IsVectorSuperClass)
1701 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1702
1703 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1704}
1705
1706static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1707 const TargetRegisterClass *RC,
1708 unsigned Size,
1709 const SIRegisterInfo &TRI,
1710 const SIMachineFunctionInfo &MFI) {
1711 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1712
1713 // Choose the right opcode if spilling a WWM register.
1714 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1715 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1716
1717 if (IsVectorSuperClass)
1718 return getAVSpillSaveOpcode(Size);
1719
1720 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1721 : getVGPRSpillSaveOpcode(Size);
1722}
1723
1724void SIInstrInfo::storeRegToStackSlot(
1725 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1726 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1727 const TargetRegisterInfo *TRI, Register VReg,
1728 MachineInstr::MIFlag Flags) const {
1729 MachineFunction *MF = MBB.getParent();
1730 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1731 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1732 const DebugLoc &DL = MBB.findDebugLoc(MI);
1733
1734 MachinePointerInfo PtrInfo
1735 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1736 MachineMemOperand *MMO = MF->getMachineMemOperand(
1737 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1738 FrameInfo.getObjectAlign(FrameIndex));
1739 unsigned SpillSize = TRI->getSpillSize(*RC);
1740
1741 MachineRegisterInfo &MRI = MF->getRegInfo();
1742 if (RI.isSGPRClass(RC)) {
1743 MFI->setHasSpilledSGPRs();
1744 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1745 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1746 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1747
1748 // We are only allowed to create one new instruction when spilling
1749 // registers, so we need to use a pseudo instruction for spilling SGPRs.
1750 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1751
1752 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1753 // to make sure we are using the correct register class.
1754 if (SrcReg.isVirtual() && SpillSize == 4) {
1755 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1756 }
1757
1758 BuildMI(MBB, MI, DL, OpDesc)
1759 .addReg(SrcReg, getKillRegState(isKill)) // data
1760 .addFrameIndex(FrameIndex) // addr
1761 .addMemOperand(MMO)
1762 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1763
1764 if (RI.spillSGPRToVGPR())
1765 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1766 return;
1767 }
1768
1769 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1770 SpillSize, RI, *MFI);
1771 MFI->setHasSpilledVGPRs();
1772
1773 BuildMI(MBB, MI, DL, get(Opcode))
1774 .addReg(SrcReg, getKillRegState(isKill)) // data
1775 .addFrameIndex(FrameIndex) // addr
1776 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1777 .addImm(0) // offset
1778 .addMemOperand(MMO);
1779}
1780
1781static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1782 switch (Size) {
1783 case 4:
1784 return AMDGPU::SI_SPILL_S32_RESTORE;
1785 case 8:
1786 return AMDGPU::SI_SPILL_S64_RESTORE;
1787 case 12:
1788 return AMDGPU::SI_SPILL_S96_RESTORE;
1789 case 16:
1790 return AMDGPU::SI_SPILL_S128_RESTORE;
1791 case 20:
1792 return AMDGPU::SI_SPILL_S160_RESTORE;
1793 case 24:
1794 return AMDGPU::SI_SPILL_S192_RESTORE;
1795 case 28:
1796 return AMDGPU::SI_SPILL_S224_RESTORE;
1797 case 32:
1798 return AMDGPU::SI_SPILL_S256_RESTORE;
1799 case 36:
1800 return AMDGPU::SI_SPILL_S288_RESTORE;
1801 case 40:
1802 return AMDGPU::SI_SPILL_S320_RESTORE;
1803 case 44:
1804 return AMDGPU::SI_SPILL_S352_RESTORE;
1805 case 48:
1806 return AMDGPU::SI_SPILL_S384_RESTORE;
1807 case 64:
1808 return AMDGPU::SI_SPILL_S512_RESTORE;
1809 case 128:
1810 return AMDGPU::SI_SPILL_S1024_RESTORE;
1811 default:
1812 llvm_unreachable("unknown register size");
1813 }
1814}
1815
1816static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1817 switch (Size) {
1818 case 4:
1819 return AMDGPU::SI_SPILL_V32_RESTORE;
1820 case 8:
1821 return AMDGPU::SI_SPILL_V64_RESTORE;
1822 case 12:
1823 return AMDGPU::SI_SPILL_V96_RESTORE;
1824 case 16:
1825 return AMDGPU::SI_SPILL_V128_RESTORE;
1826 case 20:
1827 return AMDGPU::SI_SPILL_V160_RESTORE;
1828 case 24:
1829 return AMDGPU::SI_SPILL_V192_RESTORE;
1830 case 28:
1831 return AMDGPU::SI_SPILL_V224_RESTORE;
1832 case 32:
1833 return AMDGPU::SI_SPILL_V256_RESTORE;
1834 case 36:
1835 return AMDGPU::SI_SPILL_V288_RESTORE;
1836 case 40:
1837 return AMDGPU::SI_SPILL_V320_RESTORE;
1838 case 44:
1839 return AMDGPU::SI_SPILL_V352_RESTORE;
1840 case 48:
1841 return AMDGPU::SI_SPILL_V384_RESTORE;
1842 case 64:
1843 return AMDGPU::SI_SPILL_V512_RESTORE;
1844 case 128:
1845 return AMDGPU::SI_SPILL_V1024_RESTORE;
1846 default:
1847 llvm_unreachable("unknown register size");
1848 }
1849}
1850
1851static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1852 switch (Size) {
1853 case 4:
1854 return AMDGPU::SI_SPILL_A32_RESTORE;
1855 case 8:
1856 return AMDGPU::SI_SPILL_A64_RESTORE;
1857 case 12:
1858 return AMDGPU::SI_SPILL_A96_RESTORE;
1859 case 16:
1860 return AMDGPU::SI_SPILL_A128_RESTORE;
1861 case 20:
1862 return AMDGPU::SI_SPILL_A160_RESTORE;
1863 case 24:
1864 return AMDGPU::SI_SPILL_A192_RESTORE;
1865 case 28:
1866 return AMDGPU::SI_SPILL_A224_RESTORE;
1867 case 32:
1868 return AMDGPU::SI_SPILL_A256_RESTORE;
1869 case 36:
1870 return AMDGPU::SI_SPILL_A288_RESTORE;
1871 case 40:
1872 return AMDGPU::SI_SPILL_A320_RESTORE;
1873 case 44:
1874 return AMDGPU::SI_SPILL_A352_RESTORE;
1875 case 48:
1876 return AMDGPU::SI_SPILL_A384_RESTORE;
1877 case 64:
1878 return AMDGPU::SI_SPILL_A512_RESTORE;
1879 case 128:
1880 return AMDGPU::SI_SPILL_A1024_RESTORE;
1881 default:
1882 llvm_unreachable("unknown register size");
1883 }
1884}
1885
1886static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1887 switch (Size) {
1888 case 4:
1889 return AMDGPU::SI_SPILL_AV32_RESTORE;
1890 case 8:
1891 return AMDGPU::SI_SPILL_AV64_RESTORE;
1892 case 12:
1893 return AMDGPU::SI_SPILL_AV96_RESTORE;
1894 case 16:
1895 return AMDGPU::SI_SPILL_AV128_RESTORE;
1896 case 20:
1897 return AMDGPU::SI_SPILL_AV160_RESTORE;
1898 case 24:
1899 return AMDGPU::SI_SPILL_AV192_RESTORE;
1900 case 28:
1901 return AMDGPU::SI_SPILL_AV224_RESTORE;
1902 case 32:
1903 return AMDGPU::SI_SPILL_AV256_RESTORE;
1904 case 36:
1905 return AMDGPU::SI_SPILL_AV288_RESTORE;
1906 case 40:
1907 return AMDGPU::SI_SPILL_AV320_RESTORE;
1908 case 44:
1909 return AMDGPU::SI_SPILL_AV352_RESTORE;
1910 case 48:
1911 return AMDGPU::SI_SPILL_AV384_RESTORE;
1912 case 64:
1913 return AMDGPU::SI_SPILL_AV512_RESTORE;
1914 case 128:
1915 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1916 default:
1917 llvm_unreachable("unknown register size");
1918 }
1919}
1920
1921static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1922 bool IsVectorSuperClass) {
1923 // Currently, only 32-bit WWM register spills are needed.
1924 if (Size != 4)
1925 llvm_unreachable("unknown wwm register spill size");
1926
1927 if (IsVectorSuperClass)
1928 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1929
1930 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1931}
1932
1933static unsigned
1934getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1935 unsigned Size, const SIRegisterInfo &TRI,
1936 const SIMachineFunctionInfo &MFI) {
1937 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1938
1939 // Choose the right opcode if restoring a WWM register.
1940 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1941 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1942
1943 if (IsVectorSuperClass)
1944 return getAVSpillRestoreOpcode(Size);
1945
1946 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1947 : getVGPRSpillRestoreOpcode(Size);
1948}
1949
1950void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1951 MachineBasicBlock::iterator MI,
1952 Register DestReg, int FrameIndex,
1953 const TargetRegisterClass *RC,
1954 const TargetRegisterInfo *TRI,
1955 Register VReg,
1956 MachineInstr::MIFlag Flags) const {
1957 MachineFunction *MF = MBB.getParent();
1958 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1959 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1960 const DebugLoc &DL = MBB.findDebugLoc(MI);
1961 unsigned SpillSize = TRI->getSpillSize(*RC);
1962
1963 MachinePointerInfo PtrInfo
1964 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1965
1966 MachineMemOperand *MMO = MF->getMachineMemOperand(
1967 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1968 FrameInfo.getObjectAlign(FrameIndex));
1969
1970 if (RI.isSGPRClass(RC)) {
1971 MFI->setHasSpilledSGPRs();
1972 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1973 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1974 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1975
1976 // FIXME: Maybe this should not include a memoperand because it will be
1977 // lowered to non-memory instructions.
1978 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1979 if (DestReg.isVirtual() && SpillSize == 4) {
1980 MachineRegisterInfo &MRI = MF->getRegInfo();
1981 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1982 }
1983
1984 if (RI.spillSGPRToVGPR())
1985 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1986 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1987 .addFrameIndex(FrameIndex) // addr
1988 .addMemOperand(MMO)
1989 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1990
1991 return;
1992 }
1993
1994 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1995 SpillSize, RI, *MFI);
1996 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1997 .addFrameIndex(FrameIndex) // vaddr
1998 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1999 .addImm(0) // offset
2000 .addMemOperand(MMO);
2001}
2002
2003void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
2004 MachineBasicBlock::iterator MI) const {
2005 insertNoops(MBB, MI, 1);
2006}
2007
2008void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2009 MachineBasicBlock::iterator MI,
2010 unsigned Quantity) const {
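 // A single S_NOP encodes 1-8 wait states via imm 0-7, so e.g. (illustrative)
 // Quantity == 10 is emitted below as "s_nop 7" followed by "s_nop 1".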
2011 DebugLoc DL;
2012 while (Quantity > 0) {
2013 unsigned Arg = std::min(Quantity, 8u);
2014 Quantity -= Arg;
2015 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2016 }
2017}
2018
2018
2019void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2020 auto *MF = MBB.getParent();
2021 auto *Info = MF->getInfo<SIMachineFunctionInfo>();
2022
2023 assert(Info->isEntryFunction());
2024
2025 if (MBB.succ_empty()) {
2026 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2027 if (HasNoTerminator) {
2028 if (Info->returnsVoid()) {
2029 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2030 } else {
2031 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2032 }
2033 }
2034 }
2035}
2036
2037MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2038 MachineBasicBlock &MBB,
2039 MachineInstr &MI,
2040 const DebugLoc &DL) const {
2041 MachineFunction *MF = MBB.getParent();
2042 constexpr unsigned DoorbellIDMask = 0x3ff;
2043 constexpr unsigned ECQueueWaveAbort = 0x400;
2044
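 // Summary of the expansion built below: optionally branch to a dedicated
 // trap block, issue s_trap, read the doorbell ID with s_sendmsg_rtn, save m0
 // to ttmp2, write the masked doorbell ID with the queue-wave-abort bit into
 // m0, send the message, restore m0, and finally spin in a halt loop.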
2045 MachineBasicBlock *TrapBB = &MBB;
2046 MachineBasicBlock *ContBB = &MBB;
2047 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2048
2049 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2050 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2051 TrapBB = MF->CreateMachineBasicBlock();
2052 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2053 MF->push_back(TrapBB);
2054 MBB.addSuccessor(TrapBB);
2055 }
2056
2057 // Start with a `s_trap 2`; if we're in PRIV=1 and we need the workaround, this
2058 // will be a nop.
2059 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2060 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2061 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2062 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2063 DoorbellReg)
2064 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2065 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2066 .addUse(AMDGPU::M0);
2067 Register DoorbellRegMasked =
2068 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2069 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2070 .addUse(DoorbellReg)
2071 .addImm(DoorbellIDMask);
2072 Register SetWaveAbortBit =
2073 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2074 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2075 .addUse(DoorbellRegMasked)
2076 .addImm(ECQueueWaveAbort);
2077 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2078 .addUse(SetWaveAbortBit);
2079 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2080 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2081 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2082 .addUse(AMDGPU::TTMP2);
2083 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2084 TrapBB->addSuccessor(HaltLoopBB);
2085
2086 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2087 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2088 .addMBB(HaltLoopBB);
2089 MF->push_back(HaltLoopBB);
2090 HaltLoopBB->addSuccessor(HaltLoopBB);
2091
2092 return ContBB;
2093}
2094
2096 switch (MI.getOpcode()) {
2097 default:
2098 if (MI.isMetaInstruction())
2099 return 0;
2100 return 1; // FIXME: Do wait states equal cycles?
2101
2102 case AMDGPU::S_NOP:
2103 return MI.getOperand(0).getImm() + 1;
2104 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2105 // hazard, even if one exists, won't really be visible. Should we handle it?
2106 }
2107}
2108
2109bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2110 MachineBasicBlock &MBB = *MI.getParent();
2111 DebugLoc DL = MBB.findDebugLoc(MI);
2112 switch (MI.getOpcode()) {
2113 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2114 case AMDGPU::S_MOV_B64_term:
2115 // This is only a terminator to get the correct spill code placement during
2116 // register allocation.
2117 MI.setDesc(get(AMDGPU::S_MOV_B64));
2118 break;
2119
2120 case AMDGPU::S_MOV_B32_term:
2121 // This is only a terminator to get the correct spill code placement during
2122 // register allocation.
2123 MI.setDesc(get(AMDGPU::S_MOV_B32));
2124 break;
2125
2126 case AMDGPU::S_XOR_B64_term:
2127 // This is only a terminator to get the correct spill code placement during
2128 // register allocation.
2129 MI.setDesc(get(AMDGPU::S_XOR_B64));
2130 break;
2131
2132 case AMDGPU::S_XOR_B32_term:
2133 // This is only a terminator to get the correct spill code placement during
2134 // register allocation.
2135 MI.setDesc(get(AMDGPU::S_XOR_B32));
2136 break;
2137 case AMDGPU::S_OR_B64_term:
2138 // This is only a terminator to get the correct spill code placement during
2139 // register allocation.
2140 MI.setDesc(get(AMDGPU::S_OR_B64));
2141 break;
2142 case AMDGPU::S_OR_B32_term:
2143 // This is only a terminator to get the correct spill code placement during
2144 // register allocation.
2145 MI.setDesc(get(AMDGPU::S_OR_B32));
2146 break;
2147
2148 case AMDGPU::S_ANDN2_B64_term:
2149 // This is only a terminator to get the correct spill code placement during
2150 // register allocation.
2151 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2152 break;
2153
2154 case AMDGPU::S_ANDN2_B32_term:
2155 // This is only a terminator to get the correct spill code placement during
2156 // register allocation.
2157 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2158 break;
2159
2160 case AMDGPU::S_AND_B64_term:
2161 // This is only a terminator to get the correct spill code placement during
2162 // register allocation.
2163 MI.setDesc(get(AMDGPU::S_AND_B64));
2164 break;
2165
2166 case AMDGPU::S_AND_B32_term:
2167 // This is only a terminator to get the correct spill code placement during
2168 // register allocation.
2169 MI.setDesc(get(AMDGPU::S_AND_B32));
2170 break;
2171
2172 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2173 // This is only a terminator to get the correct spill code placement during
2174 // register allocation.
2175 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2176 break;
2177
2178 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2179 // This is only a terminator to get the correct spill code placement during
2180 // register allocation.
2181 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2182 break;
2183
2184 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2185 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2186 break;
2187
2188 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2189 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2190 break;
2191
2192 case AMDGPU::V_MOV_B64_PSEUDO: {
2193 Register Dst = MI.getOperand(0).getReg();
2194 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2195 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2196
2197 const MachineOperand &SrcOp = MI.getOperand(1);
2198 // FIXME: Will this work for 64-bit floating point immediates?
2199 assert(!SrcOp.isFPImm());
2200 if (ST.hasMovB64()) {
2201 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2202 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2203 isUInt<32>(SrcOp.getImm()))
2204 break;
2205 }
2206 if (SrcOp.isImm()) {
2207 APInt Imm(64, SrcOp.getImm());
2208 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2209 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2210 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2212 .addImm(SISrcMods::OP_SEL_1)
2213 .addImm(Lo.getSExtValue())
2214 .addImm(SISrcMods::OP_SEL_1)
2215 .addImm(Lo.getSExtValue())
2216 .addImm(0) // op_sel_lo
2217 .addImm(0) // op_sel_hi
2218 .addImm(0) // neg_lo
2219 .addImm(0) // neg_hi
2220 .addImm(0); // clamp
2221 } else {
2222 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2223 .addImm(Lo.getSExtValue())
2224 .addReg(Dst, RegState::Implicit | RegState::Define);
2225 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2226 .addImm(Hi.getSExtValue())
2227 .addReg(Dst, RegState::Implicit | RegState::Define);
2228 }
2229 } else {
2230 assert(SrcOp.isReg());
2231 if (ST.hasPkMovB32() &&
2232 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2233 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2234 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2235 .addReg(SrcOp.getReg())
2236 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2237 .addReg(SrcOp.getReg())
2238 .addImm(0) // op_sel_lo
2239 .addImm(0) // op_sel_hi
2240 .addImm(0) // neg_lo
2241 .addImm(0) // neg_hi
2242 .addImm(0); // clamp
2243 } else {
2244 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2245 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2246 .addReg(Dst, RegState::Implicit | RegState::Define);
2247 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2248 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2249 .addReg(Dst, RegState::Implicit | RegState::Define);
2250 }
2251 }
2252 MI.eraseFromParent();
2253 break;
2254 }
2255 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2256 expandMovDPP64(MI);
2257 break;
2258 }
2259 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2260 const MachineOperand &SrcOp = MI.getOperand(1);
2261 assert(!SrcOp.isFPImm());
2262 APInt Imm(64, SrcOp.getImm());
2263 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2264 MI.setDesc(get(AMDGPU::S_MOV_B64));
2265 break;
2266 }
2267
2268 Register Dst = MI.getOperand(0).getReg();
2269 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2270 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2271
2272 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2273 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2274 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2275 .addImm(Lo.getSExtValue())
2276 .addReg(Dst, RegState::Implicit | RegState::Define);
2277 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2278 .addImm(Hi.getSExtValue())
2279 .addReg(Dst, RegState::Implicit | RegState::Define);
2280 MI.eraseFromParent();
2281 break;
2282 }
2283 case AMDGPU::V_SET_INACTIVE_B32: {
2284 // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
2285 Register DstReg = MI.getOperand(0).getReg();
2286 BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2287 .add(MI.getOperand(3))
2288 .add(MI.getOperand(4))
2289 .add(MI.getOperand(1))
2290 .add(MI.getOperand(2))
2291 .add(MI.getOperand(5));
2292 MI.eraseFromParent();
2293 break;
2294 }
2295 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2296 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2297 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2298 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2299 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2300 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2301 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2302 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2303 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2306 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2307 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2308 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2309 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2310 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2311 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2312 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2313 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2314 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2324 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2325
2326 unsigned Opc;
2327 if (RI.hasVGPRs(EltRC)) {
2328 Opc = AMDGPU::V_MOVRELD_B32_e32;
2329 } else {
2330 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2331 : AMDGPU::S_MOVRELD_B32;
2332 }
2333
2334 const MCInstrDesc &OpDesc = get(Opc);
2335 Register VecReg = MI.getOperand(0).getReg();
2336 bool IsUndef = MI.getOperand(1).isUndef();
2337 unsigned SubReg = MI.getOperand(3).getImm();
2338 assert(VecReg == MI.getOperand(1).getReg());
2339
2340 MachineInstrBuilder MIB =
2341 BuildMI(MBB, MI, DL, OpDesc)
2342 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2343 .add(MI.getOperand(2))
2344 .addReg(VecReg, RegState::ImplicitDefine)
2345 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2346
2347 const int ImpDefIdx =
2348 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2349 const int ImpUseIdx = ImpDefIdx + 1;
2350 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2351 MI.eraseFromParent();
2352 break;
2353 }
2354 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2355 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2356 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2357 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2358 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2359 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2360 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2361 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2364 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2365 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2366 assert(ST.useVGPRIndexMode());
2367 Register VecReg = MI.getOperand(0).getReg();
2368 bool IsUndef = MI.getOperand(1).isUndef();
2369 Register Idx = MI.getOperand(3).getReg();
2370 Register SubReg = MI.getOperand(4).getImm();
2371
2372 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2373 .addReg(Idx)
2374 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2375 SetOn->getOperand(3).setIsUndef();
2376
2377 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2378 MachineInstrBuilder MIB =
2379 BuildMI(MBB, MI, DL, OpDesc)
2380 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2381 .add(MI.getOperand(2))
2382 .addReg(VecReg, RegState::ImplicitDefine)
2383 .addReg(VecReg,
2384 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2385
2386 const int ImpDefIdx =
2387 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2388 const int ImpUseIdx = ImpDefIdx + 1;
2389 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2390
2391 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2392
2393 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2394
2395 MI.eraseFromParent();
2396 break;
2397 }
2398 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2399 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2400 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2401 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2402 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2403 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2404 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2405 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2408 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2409 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2410 assert(ST.useVGPRIndexMode());
2411 Register Dst = MI.getOperand(0).getReg();
2412 Register VecReg = MI.getOperand(1).getReg();
2413 bool IsUndef = MI.getOperand(1).isUndef();
2414 Register Idx = MI.getOperand(2).getReg();
2415 Register SubReg = MI.getOperand(3).getImm();
2416
2417 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2418 .addReg(Idx)
2419 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2420 SetOn->getOperand(3).setIsUndef();
2421
2422 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2423 .addDef(Dst)
2424 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2425 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2426
2427 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2428
2429 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2430
2431 MI.eraseFromParent();
2432 break;
2433 }
2434 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2435 MachineFunction &MF = *MBB.getParent();
2436 Register Reg = MI.getOperand(0).getReg();
2437 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2438 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2439 MachineOperand OpLo = MI.getOperand(1);
2440 MachineOperand OpHi = MI.getOperand(2);
2441
2442 // Create a bundle so these instructions won't be re-ordered by the
2443 // post-RA scheduler.
2444 MIBundleBuilder Bundler(MBB, MI);
2445 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2446
2447 // What we want here is an offset from the value returned by s_getpc (which
2448 // is the address of the s_add_u32 instruction) to the global variable, but
2449 // since the encoding of $symbol starts 4 bytes after the start of the
2450 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2451 // small. This requires us to add 4 to the global variable offset in order
2452 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2453 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2454 // instruction.
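 // Worked example (illustrative): if s_getpc_b64 returns P, the literal of
 // the following s_add_u32 sits at P + 4 and the literal of the s_addc_u32
 // at P + 12, so fixups taken at the literals compute GV - (P + 4) and
 // GV - (P + 12); adding 4 and 12 back yields the desired GV - P.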
2455
2456 int64_t Adjust = 0;
2457 if (ST.hasGetPCZeroExtension()) {
2458 // Fix up hardware that does not sign-extend the 48-bit PC value by
2459 // inserting: s_sext_i32_i16 reghi, reghi
2460 Bundler.append(
2461 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2462 Adjust += 4;
2463 }
2464
2465 if (OpLo.isGlobal())
2466 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2467 Bundler.append(
2468 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2469
2470 if (OpHi.isGlobal())
2471 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2472 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2473 .addReg(RegHi)
2474 .add(OpHi));
2475
2476 finalizeBundle(MBB, Bundler.begin());
2477
2478 MI.eraseFromParent();
2479 break;
2480 }
2481 case AMDGPU::ENTER_STRICT_WWM: {
2482 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2483 // Whole Wave Mode is entered.
2484 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2485 : AMDGPU::S_OR_SAVEEXEC_B64));
2486 break;
2487 }
2488 case AMDGPU::ENTER_STRICT_WQM: {
2489 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2490 // STRICT_WQM is entered.
2491 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2492 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2493 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2494 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2495 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2496
2497 MI.eraseFromParent();
2498 break;
2499 }
2500 case AMDGPU::EXIT_STRICT_WWM:
2501 case AMDGPU::EXIT_STRICT_WQM: {
2502 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2503 // WWM/STRICT_WQM is exited.
2504 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2505 break;
2506 }
2507 case AMDGPU::SI_RETURN: {
2508 const MachineFunction *MF = MBB.getParent();
2509 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2510 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2511 // Hiding the return address use with SI_RETURN may lead to extra kills in
2512 // the function and missing live-ins. We are fine in practice because callee
2513 // saved register handling ensures the register value is restored before
2514 // RET, but we need the undef flag here to appease the MachineVerifier
2515 // liveness checks.
2516 MachineInstrBuilder MIB =
2517 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2518 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2519
2520 MIB.copyImplicitOps(MI);
2521 MI.eraseFromParent();
2522 break;
2523 }
2524
2525 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2526 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2527 MI.setDesc(get(AMDGPU::S_MUL_U64));
2528 break;
2529
2530 case AMDGPU::S_GETPC_B64_pseudo:
2531 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2532 if (ST.hasGetPCZeroExtension()) {
2533 Register Dst = MI.getOperand(0).getReg();
2534 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2535 // Fix up hardware that does not sign-extend the 48-bit PC value by
2536 // inserting: s_sext_i32_i16 dsthi, dsthi
2537 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2538 DstHi)
2539 .addReg(DstHi);
2540 }
2541 break;
2542 }
2543 return true;
2544}
2545
2546void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2547 MachineBasicBlock::iterator I, Register DestReg,
2548 unsigned SubIdx, const MachineInstr &Orig,
2549 const TargetRegisterInfo &RI) const {
2550
2551 // Try shrinking the instruction to remat only the part needed for current
2552 // context.
2553 // TODO: Handle more cases.
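 // For example (illustrative): when only a 128-bit subregister of an
 // S_LOAD_DWORDX8_IMM result is used, the clone created below becomes an
 // S_LOAD_DWORDX4_IMM whose immediate offset is advanced by the subregister's
 // byte offset and whose memory operand is narrowed to match.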
2554 unsigned Opcode = Orig.getOpcode();
2555 switch (Opcode) {
2556 case AMDGPU::S_LOAD_DWORDX16_IMM:
2557 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2558 if (SubIdx != 0)
2559 break;
2560
2561 if (I == MBB.end())
2562 break;
2563
2564 if (I->isBundled())
2565 break;
2566
2567 // Look for a single use of the register that is also a subreg.
2568 Register RegToFind = Orig.getOperand(0).getReg();
2569 MachineOperand *UseMO = nullptr;
2570 for (auto &CandMO : I->operands()) {
2571 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2572 continue;
2573 if (UseMO) {
2574 UseMO = nullptr;
2575 break;
2576 }
2577 UseMO = &CandMO;
2578 }
2579 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2580 break;
2581
2582 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2583 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2584
2585 MachineFunction *MF = MBB.getParent();
2586 MachineRegisterInfo &MRI = MF->getRegInfo();
2587 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2588
2589 unsigned NewOpcode = -1;
2590 if (SubregSize == 256)
2591 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2592 else if (SubregSize == 128)
2593 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2594 else
2595 break;
2596
2597 const MCInstrDesc &TID = get(NewOpcode);
2598 const TargetRegisterClass *NewRC =
2599 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2600 MRI.setRegClass(DestReg, NewRC);
2601
2602 UseMO->setReg(DestReg);
2603 UseMO->setSubReg(AMDGPU::NoSubRegister);
2604
2605 // Use a smaller load with the desired size, possibly with updated offset.
2606 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2607 MI->setDesc(TID);
2608 MI->getOperand(0).setReg(DestReg);
2609 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2610 if (Offset) {
2611 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2612 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2613 OffsetMO->setImm(FinalOffset);
2614 }
2615 SmallVector<MachineMemOperand *> NewMMOs;
2616 for (const MachineMemOperand *MemOp : Orig.memoperands())
2617 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2618 SubregSize / 8));
2619 MI->setMemRefs(*MF, NewMMOs);
2620
2621 MBB.insert(I, MI);
2622 return;
2623 }
2624
2625 default:
2626 break;
2627 }
2628
2629 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2630}
2631
2632std::pair<MachineInstr*, MachineInstr*>
2633SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2634 assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2635
2636 if (ST.hasMovB64() &&
2638 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2639 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2640 return std::pair(&MI, nullptr);
2641 }
2642
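 // Otherwise split the 64-bit DPP move into two V_MOV_B32_dpp instructions,
 // one per 32-bit half, and recombine virtual destinations with the
 // REG_SEQUENCE built below.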
2643 MachineBasicBlock &MBB = *MI.getParent();
2644 DebugLoc DL = MBB.findDebugLoc(MI);
2645 MachineFunction *MF = MBB.getParent();
2646 MachineRegisterInfo &MRI = MF->getRegInfo();
2647 Register Dst = MI.getOperand(0).getReg();
2648 unsigned Part = 0;
2649 MachineInstr *Split[2];
2650
2651 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2652 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2653 if (Dst.isPhysical()) {
2654 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2655 } else {
2656 assert(MRI.isSSA());
2657 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2658 MovDPP.addDef(Tmp);
2659 }
2660
2661 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2662 const MachineOperand &SrcOp = MI.getOperand(I);
2663 assert(!SrcOp.isFPImm());
2664 if (SrcOp.isImm()) {
2665 APInt Imm(64, SrcOp.getImm());
2666 Imm.ashrInPlace(Part * 32);
2667 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2668 } else {
2669 assert(SrcOp.isReg());
2670 Register Src = SrcOp.getReg();
2671 if (Src.isPhysical())
2672 MovDPP.addReg(RI.getSubReg(Src, Sub));
2673 else
2674 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2675 }
2676 }
2677
2678 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2679 MovDPP.addImm(MO.getImm());
2680
2681 Split[Part] = MovDPP;
2682 ++Part;
2683 }
2684
2685 if (Dst.isVirtual())
2686 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2687 .addReg(Split[0]->getOperand(0).getReg())
2688 .addImm(AMDGPU::sub0)
2689 .addReg(Split[1]->getOperand(0).getReg())
2690 .addImm(AMDGPU::sub1);
2691
2692 MI.eraseFromParent();
2693 return std::pair(Split[0], Split[1]);
2694}
2695
2696std::optional<DestSourcePair>
2697SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2698 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2699 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2700
2701 return std::nullopt;
2702}
2703
2704bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2705 MachineOperand &Src0,
2706 unsigned Src0OpName,
2707 MachineOperand &Src1,
2708 unsigned Src1OpName) const {
2709 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2710 if (!Src0Mods)
2711 return false;
2712
2713 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2714 assert(Src1Mods &&
2715 "All commutable instructions have both src0 and src1 modifiers");
2716
2717 int Src0ModsVal = Src0Mods->getImm();
2718 int Src1ModsVal = Src1Mods->getImm();
2719
2720 Src1Mods->setImm(Src0ModsVal);
2721 Src0Mods->setImm(Src1ModsVal);
2722 return true;
2723}
2724
2725static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2726 MachineOperand &RegOp,
2727 MachineOperand &NonRegOp) {
2728 Register Reg = RegOp.getReg();
2729 unsigned SubReg = RegOp.getSubReg();
2730 bool IsKill = RegOp.isKill();
2731 bool IsDead = RegOp.isDead();
2732 bool IsUndef = RegOp.isUndef();
2733 bool IsDebug = RegOp.isDebug();
2734
2735 if (NonRegOp.isImm())
2736 RegOp.ChangeToImmediate(NonRegOp.getImm());
2737 else if (NonRegOp.isFI())
2738 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2739 else if (NonRegOp.isGlobal()) {
2740 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2741 NonRegOp.getTargetFlags());
2742 } else
2743 return nullptr;
2744
2745 // Make sure we don't reinterpret a subreg index in the target flags.
2746 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2747
2748 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2749 NonRegOp.setSubReg(SubReg);
2750
2751 return &MI;
2752}
2753
2754static MachineInstr *swapImmOperands(MachineInstr &MI,
2755 MachineOperand &NonRegOp1,
2756 MachineOperand &NonRegOp2) {
2757 unsigned TargetFlags = NonRegOp1.getTargetFlags();
2758 int64_t NonRegVal = NonRegOp1.getImm();
2759
2760 NonRegOp1.setImm(NonRegOp2.getImm());
2761 NonRegOp2.setImm(NonRegVal);
2762 NonRegOp1.setTargetFlags(NonRegOp2.getTargetFlags());
2763 NonRegOp2.setTargetFlags(TargetFlags);
2764 return &MI;
2765}
2766
2767bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0,
2768 const MachineOperand *MO0, unsigned OpIdx1,
2769 const MachineOperand *MO1) const {
2770 const MCInstrDesc &InstDesc = MI.getDesc();
2771 const MCOperandInfo &OpInfo0 = InstDesc.operands()[OpIdx0];
2772 const MCOperandInfo &OpInfo1 = InstDesc.operands()[OpIdx1];
2773 const TargetRegisterClass *DefinedRC1 =
2774 OpInfo1.RegClass != -1 ? RI.getRegClass(OpInfo1.RegClass) : nullptr;
2775 const TargetRegisterClass *DefinedRC0 =
2776 OpInfo0.RegClass != -1 ? RI.getRegClass(OpInfo0.RegClass) : nullptr;
2777
2778 unsigned Opc = MI.getOpcode();
2779 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2780
2781 // Make sure the swap doesn't breach the constant bus or literal limits.
2782 // It may move a literal to a position other than src0, which is not allowed
2783 // pre-gfx10. However, most test cases need literals in Src0 for VOP.
2784 // FIXME: After gfx9, a literal can be in a place other than Src0.
2785 if (isVALU(MI)) {
2786 if ((int)OpIdx0 == Src0Idx && !MO0->isReg() &&
2787 !isInlineConstant(*MO0, OpInfo1))
2788 return false;
2789 if ((int)OpIdx1 == Src0Idx && !MO1->isReg() &&
2790 !isInlineConstant(*MO1, OpInfo0))
2791 return false;
2792 }
2793
2794 if ((int)OpIdx1 != Src0Idx && MO0->isReg()) {
2795 if (!DefinedRC1)
2796 return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN;
2797 return isLegalRegOperand(MI, OpIdx1, *MO0);
2798 }
2799 if ((int)OpIdx0 != Src0Idx && MO1->isReg()) {
2800 if (!DefinedRC0)
2801 return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN;
2802 return isLegalRegOperand(MI, OpIdx0, *MO1);
2803 }
2804
2805 // No need to check 64-bit literals since swapping does not bring new
2806 // 64-bit literals into current instruction to fold to 32-bit
2807
2808 return isImmOperandLegal(MI, OpIdx1, *MO0);
2809}
2810
2811MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2812 unsigned Src0Idx,
2813 unsigned Src1Idx) const {
2814 assert(!NewMI && "this should never be used");
2815
2816 unsigned Opc = MI.getOpcode();
2817 int CommutedOpcode = commuteOpcode(Opc);
2818 if (CommutedOpcode == -1)
2819 return nullptr;
2820
2821 if (Src0Idx > Src1Idx)
2822 std::swap(Src0Idx, Src1Idx);
2823
2824 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2825 static_cast<int>(Src0Idx) &&
2826 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2827 static_cast<int>(Src1Idx) &&
2828 "inconsistency with findCommutedOpIndices");
2829
2830 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2831 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2832 if (!isLegalToSwap(MI, Src0Idx, &Src0, Src1Idx, &Src1)) {
2833 return nullptr;
2834 }
2835 MachineInstr *CommutedMI = nullptr;
2836 if (Src0.isReg() && Src1.isReg()) {
2837 // Be sure to copy the source modifiers to the right place.
2838 CommutedMI =
2839 TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2840 } else if (Src0.isReg() && !Src1.isReg()) {
2841 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2842 } else if (!Src0.isReg() && Src1.isReg()) {
2843 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2844 } else if (Src0.isImm() && Src1.isImm()) {
2845 CommutedMI = swapImmOperands(MI, Src0, Src1);
2846 } else {
2847 // FIXME: Found two non registers to commute. This does happen.
2848 return nullptr;
2849 }
2850
2851 if (CommutedMI) {
2852 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2853 Src1, AMDGPU::OpName::src1_modifiers);
2854
2855 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_sel, Src1,
2856 AMDGPU::OpName::src1_sel);
2857
2858 CommutedMI->setDesc(get(CommutedOpcode));
2859 }
2860
2861 return CommutedMI;
2862}
2863
2864// This needs to be implemented because the source modifiers may be inserted
2865// between the true commutable operands, and the base
2866// TargetInstrInfo::commuteInstruction uses it.
2867bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2868 unsigned &SrcOpIdx0,
2869 unsigned &SrcOpIdx1) const {
2870 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2871}
2872
2873bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2874 unsigned &SrcOpIdx0,
2875 unsigned &SrcOpIdx1) const {
2876 if (!Desc.isCommutable())
2877 return false;
2878
2879 unsigned Opc = Desc.getOpcode();
2880 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2881 if (Src0Idx == -1)
2882 return false;
2883
2884 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2885 if (Src1Idx == -1)
2886 return false;
2887
2888 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2889}
2890
2891bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2892 int64_t BrOffset) const {
2893 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2894 // block is unanalyzable.
2895 assert(BranchOp != AMDGPU::S_SETPC_B64);
2896
2897 // Convert to dwords.
2898 BrOffset /= 4;
2899
2900 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2901 // from the next instruction.
2902 BrOffset -= 1;
2903
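 // For example (illustrative): with the default 16-bit offset field this
 // allows roughly +/-2^15 dwords, i.e. about +/-128 KiB measured from the
 // instruction following the branch.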
2904 return isIntN(BranchOffsetBits, BrOffset);
2905}
2906
2907MachineBasicBlock *
2908SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2909 return MI.getOperand(0).getMBB();
2910}
2911
2912bool SIInstrInfo::hasDivergentBranch(const MachineBasicBlock *MBB) const {
2913 for (const MachineInstr &MI : MBB->terminators()) {
2914 if (MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2915 MI.getOpcode() == AMDGPU::SI_LOOP)
2916 return true;
2917 }
2918 return false;
2919}
2920
2921void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2922 MachineBasicBlock &DestBB,
2923 MachineBasicBlock &RestoreBB,
2924 const DebugLoc &DL, int64_t BrOffset,
2925 RegScavenger *RS) const {
2926 assert(RS && "RegScavenger required for long branching");
2927 assert(MBB.empty() &&
2928 "new block should be inserted for expanding unconditional branch");
2929 assert(MBB.pred_size() == 1);
2930 assert(RestoreBB.empty() &&
2931 "restore block should be inserted for restoring clobbered registers");
2932
2933 MachineFunction *MF = MBB.getParent();
2934 MachineRegisterInfo &MRI = MF->getRegInfo();
2935 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2936
2937 // FIXME: Virtual register workaround for RegScavenger not working with empty
2938 // blocks.
2939 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2940
2941 auto I = MBB.end();
2942
2943 // Note: as this is used after hazard recognizer we need to apply some hazard
2944 // workarounds directly.
2945 const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) ||
2947 auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
2948 if (FlushSGPRWrites)
2949 BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
2950 .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2951 };
2952
2953 // We need to compute the offset relative to the instruction immediately after
2954 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2955 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2956 ApplyHazardWorkarounds();
2957
2958 auto &MCCtx = MF->getContext();
2959 MCSymbol *PostGetPCLabel =
2960 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2961 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2962
2963 MCSymbol *OffsetLo =
2964 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2965 MCSymbol *OffsetHi =
2966 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2967 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2968 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2969 .addReg(PCReg, 0, AMDGPU::sub0)
2970 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2971 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2972 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2973 .addReg(PCReg, 0, AMDGPU::sub1)
2974 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2975 ApplyHazardWorkarounds();
2976
2977 // Insert the indirect branch after the other terminator.
2978 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2979 .addReg(PCReg);
2980
2981 // If a spill is needed for the pc register pair, we need to insert a spill
2982 // restore block right before the destination block, and insert a short branch
2983 // into the old destination block's fallthrough predecessor.
2984 // e.g.:
2985 //
2986 // s_cbranch_scc0 skip_long_branch:
2987 //
2988 // long_branch_bb:
2989 // spill s[8:9]
2990 // s_getpc_b64 s[8:9]
2991 // s_add_u32 s8, s8, restore_bb
2992 // s_addc_u32 s9, s9, 0
2993 // s_setpc_b64 s[8:9]
2994 //
2995 // skip_long_branch:
2996 // foo;
2997 //
2998 // .....
2999 //
3000 // dest_bb_fallthrough_predecessor:
3001 // bar;
3002 // s_branch dest_bb
3003 //
3004 // restore_bb:
3005 // restore s[8:9]
3006 // fallthrough dest_bb
3007 //
3008 // dest_bb:
3009 // buzz;
3010
3011 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
3012 Register Scav;
3013
3014 // If we've previously reserved a register for long branches, avoid running
3015 // the scavenger and just use that register.
3016 if (LongBranchReservedReg) {
3017 RS->enterBasicBlock(MBB);
3018 Scav = LongBranchReservedReg;
3019 } else {
3020 RS->enterBasicBlockEnd(MBB);
3021 Scav = RS->scavengeRegisterBackwards(
3022 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
3023 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
3024 }
3025 if (Scav) {
3026 RS->setRegUsed(Scav);
3027 MRI.replaceRegWith(PCReg, Scav);
3028 MRI.clearVirtRegs();
3029 } else {
3030 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
3031 // SGPR spill.
3032 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3033 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3034 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
3035 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
3036 MRI.clearVirtRegs();
3037 }
3038
3039 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
3040 // Now the distance can be computed.
3041 auto *Offset = MCBinaryExpr::createSub(
3042 MCSymbolRefExpr::create(DestLabel, MCCtx),
3043 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
3044 // Add offset assignments.
3045 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
3046 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
3047 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
3048 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
3049}
3050
3051unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
3052 switch (Cond) {
3053 case SIInstrInfo::SCC_TRUE:
3054 return AMDGPU::S_CBRANCH_SCC1;
3055 case SIInstrInfo::SCC_FALSE:
3056 return AMDGPU::S_CBRANCH_SCC0;
3057 case SIInstrInfo::VCCNZ:
3058 return AMDGPU::S_CBRANCH_VCCNZ;
3059 case SIInstrInfo::VCCZ:
3060 return AMDGPU::S_CBRANCH_VCCZ;
3061 case SIInstrInfo::EXECNZ:
3062 return AMDGPU::S_CBRANCH_EXECNZ;
3063 case SIInstrInfo::EXECZ:
3064 return AMDGPU::S_CBRANCH_EXECZ;
3065 default:
3066 llvm_unreachable("invalid branch predicate");
3067 }
3068}
3069
3070SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3071 switch (Opcode) {
3072 case AMDGPU::S_CBRANCH_SCC0:
3073 return SCC_FALSE;
3074 case AMDGPU::S_CBRANCH_SCC1:
3075 return SCC_TRUE;
3076 case AMDGPU::S_CBRANCH_VCCNZ:
3077 return VCCNZ;
3078 case AMDGPU::S_CBRANCH_VCCZ:
3079 return VCCZ;
3080 case AMDGPU::S_CBRANCH_EXECNZ:
3081 return EXECNZ;
3082 case AMDGPU::S_CBRANCH_EXECZ:
3083 return EXECZ;
3084 default:
3085 return INVALID_BR;
3086 }
3087}
3088
3089bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3090 MachineBasicBlock::iterator I,
3091 MachineBasicBlock *&TBB,
3092 MachineBasicBlock *&FBB,
3093 SmallVectorImpl<MachineOperand> &Cond,
3094 bool AllowModify) const {
3095 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3096 // Unconditional Branch
3097 TBB = I->getOperand(0).getMBB();
3098 return false;
3099 }
3100
3101 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3102 if (Pred == INVALID_BR)
3103 return true;
3104
3105 MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
3106 Cond.push_back(MachineOperand::CreateImm(Pred));
3107 Cond.push_back(I->getOperand(1)); // Save the branch register.
3108
3109 ++I;
3110
3111 if (I == MBB.end()) {
3112 // Conditional branch followed by fall-through.
3113 TBB = CondBB;
3114 return false;
3115 }
3116
3117 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3118 TBB = CondBB;
3119 FBB = I->getOperand(0).getMBB();
3120 return false;
3121 }
3122
3123 return true;
3124}
3125
3126bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3127 MachineBasicBlock *&FBB,
3128 SmallVectorImpl<MachineOperand> &Cond,
3129 bool AllowModify) const {
3130 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3131 auto E = MBB.end();
3132 if (I == E)
3133 return false;
3134
3135 // Skip over the instructions that are artificially terminators for special
3136 // exec management.
3137 while (I != E && !I->isBranch() && !I->isReturn()) {
3138 switch (I->getOpcode()) {
3139 case AMDGPU::S_MOV_B64_term:
3140 case AMDGPU::S_XOR_B64_term:
3141 case AMDGPU::S_OR_B64_term:
3142 case AMDGPU::S_ANDN2_B64_term:
3143 case AMDGPU::S_AND_B64_term:
3144 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3145 case AMDGPU::S_MOV_B32_term:
3146 case AMDGPU::S_XOR_B32_term:
3147 case AMDGPU::S_OR_B32_term:
3148 case AMDGPU::S_ANDN2_B32_term:
3149 case AMDGPU::S_AND_B32_term:
3150 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3151 break;
3152 case AMDGPU::SI_IF:
3153 case AMDGPU::SI_ELSE:
3154 case AMDGPU::SI_KILL_I1_TERMINATOR:
3155 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3156 // FIXME: It's messy that these need to be considered here at all.
3157 return true;
3158 default:
3159 llvm_unreachable("unexpected non-branch terminator inst");
3160 }
3161
3162 ++I;
3163 }
3164
3165 if (I == E)
3166 return false;
3167
3168 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3169}
3170
3171unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3172 int *BytesRemoved) const {
3173 unsigned Count = 0;
3174 unsigned RemovedSize = 0;
3175 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3176 // Skip over artificial terminators when removing instructions.
3177 if (MI.isBranch() || MI.isReturn()) {
3178 RemovedSize += getInstSizeInBytes(MI);
3179 MI.eraseFromParent();
3180 ++Count;
3181 }
3182 }
3183
3184 if (BytesRemoved)
3185 *BytesRemoved = RemovedSize;
3186
3187 return Count;
3188}
3189
3190// Copy the flags onto the implicit condition register operand.
3191static void preserveCondRegFlags(MachineOperand &CondReg,
3192 const MachineOperand &OrigCond) {
3193 CondReg.setIsUndef(OrigCond.isUndef());
3194 CondReg.setIsKill(OrigCond.isKill());
3195}
3196
3197unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
3198 MachineBasicBlock *TBB,
3199 MachineBasicBlock *FBB,
3200 ArrayRef<MachineOperand> Cond,
3201 const DebugLoc &DL,
3202 int *BytesAdded) const {
3203 if (!FBB && Cond.empty()) {
3204 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3205 .addMBB(TBB);
3206 if (BytesAdded)
3207 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3208 return 1;
3209 }
3210
3211 assert(TBB && Cond[0].isImm());
3212
3213 unsigned Opcode
3214 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3215
3216 if (!FBB) {
3217 MachineInstr *CondBr =
3218 BuildMI(&MBB, DL, get(Opcode))
3219 .addMBB(TBB);
3220
3221 // Copy the flags onto the implicit condition register operand.
3222 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3223 fixImplicitOperands(*CondBr);
3224
3225 if (BytesAdded)
3226 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3227 return 1;
3228 }
3229
3230 assert(TBB && FBB);
3231
3232 MachineInstr *CondBr =
3233 BuildMI(&MBB, DL, get(Opcode))
3234 .addMBB(TBB);
3235 fixImplicitOperands(*CondBr);
3236 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3237 .addMBB(FBB);
3238
3239 MachineOperand &CondReg = CondBr->getOperand(1);
3240 CondReg.setIsUndef(Cond[1].isUndef());
3241 CondReg.setIsKill(Cond[1].isKill());
3242
3243 if (BytesAdded)
3244 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3245
3246 return 2;
3247}
3248
3249bool SIInstrInfo::reverseBranchCondition(
3250 SmallVectorImpl<MachineOperand> &Cond) const {
3251 if (Cond.size() != 2) {
3252 return true;
3253 }
3254
3255 if (Cond[0].isImm()) {
3256 Cond[0].setImm(-Cond[0].getImm());
3257 return false;
3258 }
3259
3260 return true;
3261}
3262
3263bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3264 ArrayRef<MachineOperand> Cond,
3265 Register DstReg, Register TrueReg,
3266 Register FalseReg, int &CondCycles,
3267 int &TrueCycles, int &FalseCycles) const {
3268 switch (Cond[0].getImm()) {
3269 case VCCNZ:
3270 case VCCZ: {
3271 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3272 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3273 if (MRI.getRegClass(FalseReg) != RC)
3274 return false;
3275
3276 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3277 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3278
3279 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3280 return RI.hasVGPRs(RC) && NumInsts <= 6;
3281 }
3282 case SCC_TRUE:
3283 case SCC_FALSE: {
3284 // FIXME: We could insert for VGPRs if we could replace the original compare
3285 // with a vector one.
3286 const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3287 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3288 if (MRI.getRegClass(FalseReg) != RC)
3289 return false;
3290
3291 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3292
3293 // Sizes that are a multiple of 8 bytes (64 bits) can use s_cselect_b64.
3294 if (NumInsts % 2 == 0)
3295 NumInsts /= 2;
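 // e.g. (illustrative): a 128-bit SGPR value needs four 32-bit elements,
 // which folds to two s_cselect_b64 selects here.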
3296
3297 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3298 return RI.isSGPRClass(RC);
3299 }
3300 default:
3301 return false;
3302 }
3303}
3304
3305void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3306 MachineBasicBlock::iterator I, const DebugLoc &DL,
3307 Register DstReg, ArrayRef<MachineOperand> Cond,
3308 Register TrueReg, Register FalseReg) const {
3309 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3310 if (Pred == VCCZ || Pred == SCC_FALSE) {
3311 Pred = static_cast<BranchPredicate>(-Pred);
3312 std::swap(TrueReg, FalseReg);
3313 }
3314
3315 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3316 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3317 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3318
3319 if (DstSize == 32) {
3320 MachineInstr *Select;
3321 if (Pred == SCC_TRUE) {
3322 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3323 .addReg(TrueReg)
3324 .addReg(FalseReg);
3325 } else {
3326 // Instruction's operands are backwards from what is expected.
3327 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3328 .addReg(FalseReg)
3329 .addReg(TrueReg);
3330 }
3331
3332 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3333 return;
3334 }
3335
3336 if (DstSize == 64 && Pred == SCC_TRUE) {
3337 MachineInstr *Select =
3338 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3339 .addReg(TrueReg)
3340 .addReg(FalseReg);
3341
3342 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3343 return;
3344 }
3345
3346 static const int16_t Sub0_15[] = {
3347 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3348 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3349 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3350 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3351 };
3352
3353 static const int16_t Sub0_15_64[] = {
3354 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3355 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3356 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3357 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3358 };
3359
3360 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3361 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3362 const int16_t *SubIndices = Sub0_15;
3363 int NElts = DstSize / 32;
3364
3365 // 64-bit select is only available for SALU.
3366 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
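 // For example (illustrative): a 128-bit VGPR destination with a VCC
 // predicate expands to four V_CNDMASK_B32 selects feeding the REG_SEQUENCE
 // below, while an SGPR destination of the same size under SCC uses two
 // S_CSELECT_B64 selects.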
3367 if (Pred == SCC_TRUE) {
3368 if (NElts % 2) {
3369 SelOp = AMDGPU::S_CSELECT_B32;
3370 EltRC = &AMDGPU::SGPR_32RegClass;
3371 } else {
3372 SelOp = AMDGPU::S_CSELECT_B64;
3373 EltRC = &AMDGPU::SGPR_64RegClass;
3374 SubIndices = Sub0_15_64;
3375 NElts /= 2;
3376 }
3377 }
3378
3379 MachineInstrBuilder MIB = BuildMI(
3380 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3381
3382 I = MIB->getIterator();
3383
3384 SmallVector<Register, 8> Regs;
3385 for (int Idx = 0; Idx != NElts; ++Idx) {
3386 Register DstElt = MRI.createVirtualRegister(EltRC);
3387 Regs.push_back(DstElt);
3388
3389 unsigned SubIdx = SubIndices[Idx];
3390
3391 MachineInstr *Select;
3392 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3393 Select =
3394 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3395 .addReg(FalseReg, 0, SubIdx)
3396 .addReg(TrueReg, 0, SubIdx);
3397 } else {
3398 Select =
3399 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3400 .addReg(TrueReg, 0, SubIdx)
3401 .addReg(FalseReg, 0, SubIdx);
3402 }
3403
3404 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3406
3407 MIB.addReg(DstElt)
3408 .addImm(SubIdx);
3409 }
3410}
3411
3412bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3413 switch (MI.getOpcode()) {
3414 case AMDGPU::V_MOV_B16_t16_e32:
3415 case AMDGPU::V_MOV_B16_t16_e64:
3416 case AMDGPU::V_MOV_B32_e32:
3417 case AMDGPU::V_MOV_B32_e64:
3418 case AMDGPU::V_MOV_B64_PSEUDO:
3419 case AMDGPU::V_MOV_B64_e32:
3420 case AMDGPU::V_MOV_B64_e64:
3421 case AMDGPU::S_MOV_B32:
3422 case AMDGPU::S_MOV_B64:
3423 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3424 case AMDGPU::COPY:
3425 case AMDGPU::WWM_COPY:
3426 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3427 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3428 case AMDGPU::V_ACCVGPR_MOV_B32:
3429 return true;
3430 default:
3431 return false;
3432 }
3433}
3434
3435static constexpr unsigned ModifierOpNames[] = {
3436 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3437 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3438 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3439
3440void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3441 unsigned Opc = MI.getOpcode();
3442 for (unsigned Name : reverse(ModifierOpNames)) {
3443 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3444 if (Idx >= 0)
3445 MI.removeOperand(Idx);
3446 }
3447}
3448
3448bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3449 Register Reg, MachineRegisterInfo *MRI) const {
3451 if (!MRI->hasOneNonDBGUse(Reg))
3452 return false;
3453
3454 switch (DefMI.getOpcode()) {
3455 default:
3456 return false;
3457 case AMDGPU::V_MOV_B64_e32:
3458 case AMDGPU::S_MOV_B64:
3459 case AMDGPU::V_MOV_B64_PSEUDO:
3460 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3461 case AMDGPU::V_MOV_B32_e32:
3462 case AMDGPU::S_MOV_B32:
3463 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3464 break;
3465 }
3466
3467 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3468 assert(ImmOp);
3469 // FIXME: We could handle FrameIndex values here.
3470 if (!ImmOp->isImm())
3471 return false;
3472
3473 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3474 int64_t Imm = ImmOp->getImm();
3475 switch (UseOp.getSubReg()) {
3476 default:
3477 return Imm;
3478 case AMDGPU::sub0:
3479 return Lo_32(Imm);
3480 case AMDGPU::sub1:
3481 return Hi_32(Imm);
3482 case AMDGPU::lo16:
3483 return SignExtend64<16>(Imm);
3484 case AMDGPU::hi16:
3485 return SignExtend64<16>(Imm >> 16);
3486 case AMDGPU::sub1_lo16:
3487 return SignExtend64<16>(Imm >> 32);
3488 case AMDGPU::sub1_hi16:
3489 return SignExtend64<16>(Imm >> 48);
3490 }
3491 };
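 // Illustrative example for the lambda above: with Imm == 0x1122334455667788,
 // sub0 yields 0x55667788, sub1 yields 0x11223344, lo16 sign-extends 0x7788,
 // hi16 sign-extends 0x5566, sub1_lo16 sign-extends 0x3344, and sub1_hi16
 // sign-extends 0x1122.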
3492
3493 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3494
3495 unsigned Opc = UseMI.getOpcode();
3496 if (Opc == AMDGPU::COPY) {
3497 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3498
3499 Register DstReg = UseMI.getOperand(0).getReg();
3500 unsigned OpSize = getOpSize(UseMI, 0);
3501 bool Is16Bit = OpSize == 2;
3502 bool Is64Bit = OpSize == 8;
3503 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3504 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3505 : AMDGPU::V_MOV_B32_e32
3506 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3507 : AMDGPU::S_MOV_B32;
3508 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)),
3509 /*isSigned=*/true, /*implicitTrunc=*/true);
3510
3511 if (RI.isAGPR(*MRI, DstReg)) {
3512 if (Is64Bit || !isInlineConstant(Imm))
3513 return false;
3514 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3515 }
3516
3517 if (Is16Bit) {
3518 if (isVGPRCopy)
3519 return false; // Do not clobber vgpr_hi16
3520
3521 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3522 return false;
3523
3524 UseMI.getOperand(0).setSubReg(0);
3525 if (DstReg.isPhysical()) {
3526 DstReg = RI.get32BitRegister(DstReg);
3527 UseMI.getOperand(0).setReg(DstReg);
3528 }
3529 assert(UseMI.getOperand(1).getReg().isVirtual());
3530 }
3531
3532 const MCInstrDesc &NewMCID = get(NewOpc);
3533 if (DstReg.isPhysical() &&
3534 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3535 return false;
3536
3537 UseMI.setDesc(NewMCID);
3538 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3539 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3540 return true;
3541 }
3542
3543 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3544 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3545 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3546 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3547 Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
3548 // Don't fold if we are using source or output modifiers. The new VOP2
3549 // instructions don't have them.
3550 if (hasAnyModifiersSet(UseMI))
3551 return false;
3552
3553 // If this is a free constant, there's no reason to do this.
3554 // TODO: We could fold this here instead of letting SIFoldOperands do it
3555 // later.
3556 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3557
3558 // Any src operand can be used for the legality check.
3559 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3560 return false;
3561
3562 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3563 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3564 bool IsFMA =
3565 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3566 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3567 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
3568 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3569 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3570
3571 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3572 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3573 (Src1->isReg() && Src1->getReg() == Reg)) {
3574 MachineOperand *RegSrc =
3575 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3576 if (!RegSrc->isReg())
3577 return false;
3578 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3579 ST.getConstantBusLimit(Opc) < 2)
3580 return false;
3581
3582 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3583 return false;
3584
3585 // If src2 is also a literal constant then we have to choose which one to
3586 // fold. In general it is better to choose madak so that the other literal
3587 // can be materialized in an sgpr instead of a vgpr:
3588 // s_mov_b32 s0, literal
3589 // v_madak_f32 v0, s0, v0, literal
3590 // Instead of:
3591 // v_mov_b32 v1, literal
3592 // v_madmk_f32 v0, v0, literal, v1
3593 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3594 if (Def && Def->isMoveImmediate() &&
3595 !isInlineConstant(Def->getOperand(1)))
3596 return false;
3597
3598 unsigned NewOpc =
3599 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3600 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
3601 : AMDGPU::V_FMAMK_F16)
3602 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3603 if (pseudoToMCOpcode(NewOpc) == -1)
3604 return false;
3605
3606 // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3607 // would also require restricting their register classes. For now
3608 // just bail out.
3609 if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3610 return false;
3611
3612 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3613
3614 // FIXME: This would be a lot easier if we could return a new instruction
3615 // instead of having to modify in place.
3616
3617 Register SrcReg = RegSrc->getReg();
3618 unsigned SrcSubReg = RegSrc->getSubReg();
3619 Src0->setReg(SrcReg);
3620 Src0->setSubReg(SrcSubReg);
3621 Src0->setIsKill(RegSrc->isKill());
3622
3623 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3624 Opc == AMDGPU::V_FMAC_F32_e64 ||
3625 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3626 UseMI.untieRegOperand(
3627 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3628
3629 Src1->ChangeToImmediate(Imm);
3630
3631 removeModOperands(UseMI);
3632 UseMI.setDesc(get(NewOpc));
3633
3634 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3635 if (DeleteDef)
3636 DefMI.eraseFromParent();
3637
3638 return true;
3639 }
3640
3641 // Added part is the constant: Use v_madak_{f16, f32}.
3642 if (Src2->isReg() && Src2->getReg() == Reg) {
3643 if (ST.getConstantBusLimit(Opc) < 2) {
3644 // Not allowed to use constant bus for another operand.
3645 // We can however allow an inline immediate as src0.
3646 bool Src0Inlined = false;
3647 if (Src0->isReg()) {
3648 // Try to inline constant if possible.
3649 // If the def is a move immediate and this is its only use,
3650 // we save a VGPR here.
3651 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3652 if (Def && Def->isMoveImmediate() &&
3653 isInlineConstant(Def->getOperand(1)) &&
3654 MRI->hasOneUse(Src0->getReg())) {
3655 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3656 Src0Inlined = true;
3657 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3658 RI.isSGPRReg(*MRI, Src0->getReg())) {
3659 return false;
3660 }
3661 // VGPR is okay as Src0 - fallthrough
3662 }
3663
3664 if (Src1->isReg() && !Src0Inlined) {
3665 // We have one slot for inlinable constant so far - try to fill it
3666 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3667 if (Def && Def->isMoveImmediate() &&
3668 isInlineConstant(Def->getOperand(1)) &&
3669 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3670 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3671 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3672 return false;
3673 // VGPR is okay as Src1 - fallthrough
3674 }
3675 }
3676
3677 unsigned NewOpc =
3678 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3679 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
3680 : AMDGPU::V_FMAAK_F16)
3681 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3682 if (pseudoToMCOpcode(NewOpc) == -1)
3683 return false;
3684
3685 // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3686 // would also require restricting their register classes. For now
3687 // just bail out.
3688 if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3689 return false;
3690
3691 // FIXME: This would be a lot easier if we could return a new instruction
3692 // instead of having to modify in place.
3693
3694 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3695 Opc == AMDGPU::V_FMAC_F32_e64 ||
3696 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
3697 UseMI.untieRegOperand(
3698 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3699
3700 // ChangingToImmediate adds Src2 back to the instruction.
3701 Src2->ChangeToImmediate(getImmFor(*Src2));
3702
3703 // These come before src2.
3704 removeModOperands(UseMI);
3705 UseMI.setDesc(get(NewOpc));
3706 // It might happen that UseMI was commuted
3707 // and we now have an SGPR as SRC1. If so, 2 inline
3708 // constants and an SGPR are illegal.
3709 legalizeOperandsVOP2(*MRI, UseMI);
3710
3711 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3712 if (DeleteDef)
3713 DefMI.eraseFromParent();
3714
3715 return true;
3716 }
3717 }
3718
3719 return false;
3720}
3721
3722static bool
3723memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3724 ArrayRef<const MachineOperand *> BaseOps2) {
3725 if (BaseOps1.size() != BaseOps2.size())
3726 return false;
3727 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3728 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3729 return false;
3730 }
3731 return true;
3732}
3733
3734static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3735 LocationSize WidthB, int OffsetB) {
3736 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3737 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3738 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3739 return LowWidth.hasValue() &&
3740 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3741}
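// As a small worked example of the check above (values assumed for
// illustration): an access of width 4 at offset 0 and one of width 8 at offset 4
// are disjoint (0 + 4 <= 4), while width 8 at offset 0 and width 4 at offset 4
// are not (0 + 8 > 4); only in the former case may the callers treat the two
// memory instructions as independent.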
3742
3743bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3744 const MachineInstr &MIb) const {
3745 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3746 int64_t Offset0, Offset1;
3747 LocationSize Dummy0 = 0, Dummy1 = 0;
3748 bool Offset0IsScalable, Offset1IsScalable;
3749 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3750 Dummy0, &RI) ||
3751 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3752 Dummy1, &RI))
3753 return false;
3754
3755 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3756 return false;
3757
3758 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3759 // FIXME: Handle ds_read2 / ds_write2.
3760 return false;
3761 }
3762 LocationSize Width0 = MIa.memoperands().front()->getSize();
3763 LocationSize Width1 = MIb.memoperands().front()->getSize();
3764 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3765}
3766
3767bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3768 const MachineInstr &MIb) const {
3769 assert(MIa.mayLoadOrStore() &&
3770 "MIa must load from or modify a memory location");
3771 assert(MIb.mayLoadOrStore() &&
3772 "MIb must load from or modify a memory location");
3773
3774 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3775 return false;
3776
3777 // XXX - Can we relax this between address spaces?
3778 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3779 return false;
3780
3781 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3782 return false;
3783
3784 // TODO: Should we check the address space from the MachineMemOperand? That
3785 // would allow us to distinguish objects we know don't alias based on the
3786 // underlying address space, even if it was lowered to a different one,
3787 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3788 // buffer.
3789 if (isDS(MIa)) {
3790 if (isDS(MIb))
3791 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3792
3793 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3794 }
3795
3796 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3797 if (isMUBUF(MIb) || isMTBUF(MIb))
3798 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3799
3800 if (isFLAT(MIb))
3801 return isFLATScratch(MIb);
3802
3803 return !isSMRD(MIb);
3804 }
3805
3806 if (isSMRD(MIa)) {
3807 if (isSMRD(MIb))
3808 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3809
3810 if (isFLAT(MIb))
3811 return isFLATScratch(MIb);
3812
3813 return !isMUBUF(MIb) && !isMTBUF(MIb);
3814 }
3815
3816 if (isFLAT(MIa)) {
3817 if (isFLAT(MIb)) {
3818 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3819 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3820 return true;
3821
3822 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3823 }
3824
3825 return false;
3826 }
3827
3828 return false;
3829}
3830
3831static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3832 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3833 if (Reg.isPhysical())
3834 return false;
3835 auto *Def = MRI.getUniqueVRegDef(Reg);
3836 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3837 Imm = Def->getOperand(1).getImm();
3838 if (DefMI)
3839 *DefMI = Def;
3840 return true;
3841 }
3842 return false;
3843}
3844
3845static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3846 MachineInstr **DefMI = nullptr) {
3847 if (!MO->isReg())
3848 return false;
3849 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3850 const MachineRegisterInfo &MRI = MF->getRegInfo();
3851 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3852}
3853
3854static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3855 MachineInstr &NewMI) {
3856 if (LV) {
3857 unsigned NumOps = MI.getNumOperands();
3858 for (unsigned I = 1; I < NumOps; ++I) {
3859 MachineOperand &Op = MI.getOperand(I);
3860 if (Op.isReg() && Op.isKill())
3861 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3862 }
3863 }
3864}
3865
3866static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
3867 switch (Opc) {
3868 case AMDGPU::V_MAC_F16_e32:
3869 case AMDGPU::V_MAC_F16_e64:
3870 return AMDGPU::V_MAD_F16_e64;
3871 case AMDGPU::V_MAC_F32_e32:
3872 case AMDGPU::V_MAC_F32_e64:
3873 return AMDGPU::V_MAD_F32_e64;
3874 case AMDGPU::V_MAC_LEGACY_F32_e32:
3875 case AMDGPU::V_MAC_LEGACY_F32_e64:
3876 return AMDGPU::V_MAD_LEGACY_F32_e64;
3877 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3878 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3879 return AMDGPU::V_FMA_LEGACY_F32_e64;
3880 case AMDGPU::V_FMAC_F16_e32:
3881 case AMDGPU::V_FMAC_F16_e64:
3882 case AMDGPU::V_FMAC_F16_fake16_e64:
3883 return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
3884 : AMDGPU::V_FMA_F16_gfx9_e64;
3885 case AMDGPU::V_FMAC_F32_e32:
3886 case AMDGPU::V_FMAC_F32_e64:
3887 return AMDGPU::V_FMA_F32_e64;
3888 case AMDGPU::V_FMAC_F64_e32:
3889 case AMDGPU::V_FMAC_F64_e64:
3890 return AMDGPU::V_FMA_F64_e64;
3891 default:
3892 llvm_unreachable("invalid instruction");
3893 }
3894}
3895
3896MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3897 LiveVariables *LV,
3898 LiveIntervals *LIS) const {
3899 MachineBasicBlock &MBB = *MI.getParent();
3900 unsigned Opc = MI.getOpcode();
3901
3902 // Handle MFMA.
3903 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3904 if (NewMFMAOpc != -1) {
3905 MachineInstrBuilder MIB =
3906 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3907 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3908 MIB.add(MI.getOperand(I));
3909 updateLiveVariables(LV, MI, *MIB);
3910 if (LIS) {
3911 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3912 // SlotIndex of defs needs to be updated when converting to early-clobber
3913 MachineOperand &Def = MIB->getOperand(0);
3914 if (Def.isEarlyClobber() && Def.isReg() &&
3915 LIS->hasInterval(Def.getReg())) {
3916 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3917 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3918 auto &LI = LIS->getInterval(Def.getReg());
3919 auto UpdateDefIndex = [&](LiveRange &LR) {
3920 auto *S = LR.find(OldIndex);
3921 if (S != LR.end() && S->start == OldIndex) {
3922 assert(S->valno && S->valno->def == OldIndex);
3923 S->start = NewIndex;
3924 S->valno->def = NewIndex;
3925 }
3926 };
3927 UpdateDefIndex(LI);
3928 for (auto &SR : LI.subranges())
3929 UpdateDefIndex(SR);
3930 }
3931 }
3932 return MIB;
3933 }
3934
3935 if (SIInstrInfo::isWMMA(MI)) {
3936 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3937 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3938 .setMIFlags(MI.getFlags());
3939 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3940 MIB->addOperand(MI.getOperand(I));
3941
3942 updateLiveVariables(LV, MI, *MIB);
3943 if (LIS)
3944 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3945
3946 return MIB;
3947 }
3948
3949 assert(
3950 Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3951 "V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3952 "pre-RA");
3953
3954 // Handle MAC/FMAC.
3955 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3956 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3957 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
3958 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3959 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3960 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3961 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3962 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
3963 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3964 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3965 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3966 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3967 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3968 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3969 bool Src0Literal = false;
3970
3971 switch (Opc) {
3972 default:
3973 return nullptr;
3974 case AMDGPU::V_MAC_F16_e64:
3975 case AMDGPU::V_FMAC_F16_e64:
3976 case AMDGPU::V_FMAC_F16_fake16_e64:
3977 case AMDGPU::V_MAC_F32_e64:
3978 case AMDGPU::V_MAC_LEGACY_F32_e64:
3979 case AMDGPU::V_FMAC_F32_e64:
3980 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3981 case AMDGPU::V_FMAC_F64_e64:
3982 break;
3983 case AMDGPU::V_MAC_F16_e32:
3984 case AMDGPU::V_FMAC_F16_e32:
3985 case AMDGPU::V_MAC_F32_e32:
3986 case AMDGPU::V_MAC_LEGACY_F32_e32:
3987 case AMDGPU::V_FMAC_F32_e32:
3988 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3989 case AMDGPU::V_FMAC_F64_e32: {
3990 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3991 AMDGPU::OpName::src0);
3992 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3993 if (!Src0->isReg() && !Src0->isImm())
3994 return nullptr;
3995
3996 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3997 Src0Literal = true;
3998
3999 break;
4000 }
4001 }
4002
4003 MachineInstrBuilder MIB;
4004 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4005 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
4006 const MachineOperand *Src0Mods =
4007 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4008 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4009 const MachineOperand *Src1Mods =
4010 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
4011 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4012 const MachineOperand *Src2Mods =
4013 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
4014 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4015 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
4016 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
4017
4018 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
4019 !IsLegacy &&
4020 // If we have an SGPR input, we will violate the constant bus restriction.
4021 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
4022 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
4023 MachineInstr *DefMI;
4024 const auto killDef = [&]() -> void {
4025 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4026 // The only user is the instruction which will be killed.
4027 Register DefReg = DefMI->getOperand(0).getReg();
4028
4029 if (MRI.hasOneNonDBGUse(DefReg)) {
4030 // We cannot just remove the DefMI here, calling pass will crash.
4031 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
4032 DefMI->getOperand(0).setIsDead(true);
4033 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
4034 DefMI->removeOperand(I);
4035 if (LV)
4036 LV->getVarInfo(DefReg).AliveBlocks.clear();
4037 }
4038
4039 if (LIS) {
4040 LiveInterval &DefLI = LIS->getInterval(DefReg);
4041
4042 // We cannot delete the original instruction here, so hack out the use
4043 // in the original instruction with a dummy register so we can use
4044 // shrinkToUses to deal with any multi-use edge cases. Other targets do
4045 // not have the complexity of deleting a use to consider here.
4046 Register DummyReg = MRI.cloneVirtualRegister(DefReg);
4047 for (MachineOperand &MIOp : MI.uses()) {
4048 if (MIOp.isReg() && MIOp.getReg() == DefReg) {
4049 MIOp.setIsUndef(true);
4050 MIOp.setReg(DummyReg);
4051 }
4052 }
4053
4054 LIS->shrinkToUses(&DefLI);
4055 }
4056 };
4057
4058 int64_t Imm;
4059 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
4060 unsigned NewOpc =
4061 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16
4062 : AMDGPU::V_FMAAK_F16)
4063 : AMDGPU::V_FMAAK_F32)
4064 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
4065 if (pseudoToMCOpcode(NewOpc) != -1) {
4066 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4067 .add(*Dst)
4068 .add(*Src0)
4069 .add(*Src1)
4070 .addImm(Imm)
4071 .setMIFlags(MI.getFlags());
4072 updateLiveVariables(LV, MI, *MIB);
4073 if (LIS)
4074 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4075 killDef();
4076 return MIB;
4077 }
4078 }
4079 unsigned NewOpc =
4080 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16
4081 : AMDGPU::V_FMAMK_F16)
4082 : AMDGPU::V_FMAMK_F32)
4083 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
4084 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
4085 if (pseudoToMCOpcode(NewOpc) != -1) {
4086 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4087 .add(*Dst)
4088 .add(*Src0)
4089 .addImm(Imm)
4090 .add(*Src2)
4091 .setMIFlags(MI.getFlags());
4092 updateLiveVariables(LV, MI, *MIB);
4093
4094 if (LIS)
4095 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4096 killDef();
4097 return MIB;
4098 }
4099 }
4100 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4101 if (Src0Literal) {
4102 Imm = Src0->getImm();
4103 DefMI = nullptr;
4104 }
4105 if (pseudoToMCOpcode(NewOpc) != -1 &&
4106 isOperandLegal(
4107 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4108 Src1)) {
4109 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4110 .add(*Dst)
4111 .add(*Src1)
4112 .addImm(Imm)
4113 .add(*Src2)
4114 .setMIFlags(MI.getFlags());
4115 updateLiveVariables(LV, MI, *MIB);
4116
4117 if (LIS)
4118 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4119 if (DefMI)
4120 killDef();
4121 return MIB;
4122 }
4123 }
4124 }
4125
4126 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4127 // if VOP3 does not allow a literal operand.
4128 if (Src0Literal && !ST.hasVOP3Literal())
4129 return nullptr;
4130
4131 unsigned NewOpc = getNewFMAInst(ST, Opc);
4132
4133 if (pseudoToMCOpcode(NewOpc) == -1)
4134 return nullptr;
4135
4136 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4137 .add(*Dst)
4138 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4139 .add(*Src0)
4140 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4141 .add(*Src1)
4142 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4143 .add(*Src2)
4144 .addImm(Clamp ? Clamp->getImm() : 0)
4145 .addImm(Omod ? Omod->getImm() : 0)
4146 .setMIFlags(MI.getFlags());
4147 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4148 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4149 updateLiveVariables(LV, MI, *MIB);
4150 if (LIS)
4151 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4152 return MIB;
4153}
4154
4155// It's not generally safe to move VALU instructions across these since it will
4156// start using the register as a base index rather than directly.
4157// XXX - Why isn't hasSideEffects sufficient for these?
4158static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4159 switch (MI.getOpcode()) {
4160 case AMDGPU::S_SET_GPR_IDX_ON:
4161 case AMDGPU::S_SET_GPR_IDX_MODE:
4162 case AMDGPU::S_SET_GPR_IDX_OFF:
4163 return true;
4164 default:
4165 return false;
4166 }
4167}
4168
4169bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4170 const MachineBasicBlock *MBB,
4171 const MachineFunction &MF) const {
4172 // Skipping the check for SP writes in the base implementation. The reason it
4173 // was added was apparently due to compile time concerns.
4174 //
4175 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4176 // but is probably avoidable.
4177
4178 // Copied from base implementation.
4179 // Terminators and labels can't be scheduled around.
4180 if (MI.isTerminator() || MI.isPosition())
4181 return true;
4182
4183 // INLINEASM_BR can jump to another block
4184 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4185 return true;
4186
4187 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4188 return true;
4189
4190 // Target-independent instructions do not have an implicit-use of EXEC, even
4191 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4192 // boundaries prevents incorrect movements of such instructions.
4193 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4194 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4195 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4196 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4197 changesVGPRIndexingMode(MI);
4198}
4199
4200bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4201 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4202}
4203
4204bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4205 // Skip the full operand and register alias search modifiesRegister
4206 // does. There's only a handful of instructions that touch this, it's only an
4207 // implicit def, and doesn't alias any other registers.
4208 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4209}
4210
4211bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4212 unsigned Opcode = MI.getOpcode();
4213
4214 if (MI.mayStore() && isSMRD(MI))
4215 return true; // scalar store or atomic
4216
4217 // This will terminate the function when other lanes may need to continue.
4218 if (MI.isReturn())
4219 return true;
4220
4221 // These instructions cause shader I/O that may cause hardware lockups
4222 // when executed with an empty EXEC mask.
4223 //
4224 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4225 // EXEC = 0, but checking for that case here seems not worth it
4226 // given the typical code patterns.
4227 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4228 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4229 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4230 return true;
4231
4232 if (MI.isCall() || MI.isInlineAsm())
4233 return true; // conservative assumption
4234
4235 // Assume that barrier interactions are only intended with active lanes.
4236 if (isBarrier(Opcode))
4237 return true;
4238
4239 // A mode change is a scalar operation that influences vector instructions.
4240 if (modifiesModeRegister(MI))
4241 return true;
4242
4243 // These are like SALU instructions in terms of effects, so it's questionable
4244 // whether we should return true for those.
4245 //
4246 // However, executing them with EXEC = 0 causes them to operate on undefined
4247 // data, which we avoid by returning true here.
4248 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4249 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4250 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4251 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4252 return true;
4253
4254 return false;
4255}
4256
4257bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4258 const MachineInstr &MI) const {
4259 if (MI.isMetaInstruction())
4260 return false;
4261
4262 // This won't read exec if this is an SGPR->SGPR copy.
4263 if (MI.isCopyLike()) {
4264 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4265 return true;
4266
4267 // Make sure this isn't copying exec as a normal operand
4268 return MI.readsRegister(AMDGPU::EXEC, &RI);
4269 }
4270
4271 // Make a conservative assumption about the callee.
4272 if (MI.isCall())
4273 return true;
4274
4275 // Be conservative with any unhandled generic opcodes.
4276 if (!isTargetSpecificOpcode(MI.getOpcode()))
4277 return true;
4278
4279 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4280}
4281
4282bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4283 switch (Imm.getBitWidth()) {
4284 case 1: // This likely will be a condition code mask.
4285 return true;
4286
4287 case 32:
4288 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4289 ST.hasInv2PiInlineImm());
4290 case 64:
4291 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4292 ST.hasInv2PiInlineImm());
4293 case 16:
4294 return ST.has16BitInsts() &&
4295 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4296 ST.hasInv2PiInlineImm());
4297 default:
4298 llvm_unreachable("invalid bitwidth");
4299 }
4300}
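// For reference (summary added here, phrased from the checks above): the inline
// constants accepted are the integers -16..64 plus a small set of floating-point
// values (0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0 and, when the subtarget reports
// hasInv2PiInlineImm(), 1/(2*pi)) encoded at the operand's width; anything else
// must be emitted as a literal and costs an extra dword in the encoding.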
4301
4302bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4303 APInt IntImm = Imm.bitcastToAPInt();
4304 int64_t IntImmVal = IntImm.getSExtValue();
4305 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4306 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4307 default:
4308 llvm_unreachable("invalid fltSemantics");
4309 case APFloat::S_IEEEsingle:
4310 case APFloat::S_IEEEdouble:
4311 return isInlineConstant(IntImm);
4312 case APFloat::S_BFloat:
4313 return ST.has16BitInsts() &&
4314 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4315 case APFloat::S_IEEEhalf:
4316 return ST.has16BitInsts() &&
4317 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4318 }
4319}
4320
4321bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4322 uint8_t OperandType) const {
4323 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4324 if (!MO.isImm())
4325 return false;
4326
4327 // MachineOperand provides no way to tell the true operand size, since it only
4328 // records a 64-bit value. We need to know the size to determine if a 32-bit
4329 // floating point immediate bit pattern is legal for an integer immediate. It
4330 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4331
4332 int64_t Imm = MO.getImm();
4333 switch (OperandType) {
4346 int32_t Trunc = static_cast<int32_t>(Imm);
4347 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4348 }
4354 return AMDGPU::isInlinableLiteral64(MO.getImm(),
4355 ST.hasInv2PiInlineImm());
4359 // We would expect inline immediates to not be concerned with an integer/fp
4360 // distinction. However, in the case of 16-bit integer operations, the
4361 // "floating point" values appear to not work. It seems read the low 16-bits
4362 // of 32-bit immediates, which happens to always work for the integer
4363 // values.
4364 //
4365 // See llvm bugzilla 46302.
4366 //
4367 // TODO: Theoretically we could use op-sel to use the high bits of the
4368 // 32-bit FP values.
4386 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4387 // A few special case instructions have 16-bit operands on subtargets
4388 // where 16-bit instructions are not legal.
4389 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4390 // constants in these cases
4391 int16_t Trunc = static_cast<int16_t>(Imm);
4392 return ST.has16BitInsts() &&
4393 AMDGPU::isInlinableLiteralI16(Trunc, ST.hasInv2PiInlineImm());
4394 }
4395
4396 return false;
4397 }
4402 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4403 int16_t Trunc = static_cast<int16_t>(Imm);
4404 return ST.has16BitInsts() &&
4405 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4406 }
4407 return false;
4408 }
4411 return false;
4414 // Always embedded in the instruction for free.
4415 return true;
4425 // Just ignore anything else.
4426 return true;
4427 default:
4428 llvm_unreachable("invalid operand type");
4429 }
4430}
4431
4432static bool compareMachineOp(const MachineOperand &Op0,
4433 const MachineOperand &Op1) {
4434 if (Op0.getType() != Op1.getType())
4435 return false;
4436
4437 switch (Op0.getType()) {
4439 return Op0.getReg() == Op1.getReg();
4441 return Op0.getImm() == Op1.getImm();
4442 default:
4443 llvm_unreachable("Didn't expect to be comparing these operand types");
4444 }
4445}
4446
4447bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4448 const MachineOperand &MO) const {
4449 const MCInstrDesc &InstDesc = MI.getDesc();
4450 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4451
4452 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4453
4454 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4455 return true;
4456
4457 if (OpInfo.RegClass < 0)
4458 return false;
4459
4460 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4461 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4462 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4463 AMDGPU::OpName::src2))
4464 return false;
4465 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4466 }
4467
4468 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4469 return false;
4470
4471 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4472 return true;
4473
4474 return ST.hasVOP3Literal();
4475}
4476
4477bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4478 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4479 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4480 return false;
4481
4482 int Op32 = AMDGPU::getVOPe32(Opcode);
4483 if (Op32 == -1)
4484 return false;
4485
4486 return pseudoToMCOpcode(Op32) != -1;
4487}
4488
4489bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4490 // The src0_modifier operand is present on all instructions
4491 // that have modifiers.
4492
4493 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4494}
4495
4496bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4497 unsigned OpName) const {
4498 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4499 return Mods && Mods->getImm();
4500}
4501
4502bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4503 return any_of(ModifierOpNames,
4504 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4505}
4506
4507bool SIInstrInfo::canShrink(const MachineInstr &MI,
4508 const MachineRegisterInfo &MRI) const {
4509 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4510 // Can't shrink instruction with three operands.
4511 if (Src2) {
4512 switch (MI.getOpcode()) {
4513 default: return false;
4514
4515 case AMDGPU::V_ADDC_U32_e64:
4516 case AMDGPU::V_SUBB_U32_e64:
4517 case AMDGPU::V_SUBBREV_U32_e64: {
4518 const MachineOperand *Src1
4519 = getNamedOperand(MI, AMDGPU::OpName::src1);
4520 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4521 return false;
4522 // Additional verification is needed for sdst/src2.
4523 return true;
4524 }
4525 case AMDGPU::V_MAC_F16_e64:
4526 case AMDGPU::V_MAC_F32_e64:
4527 case AMDGPU::V_MAC_LEGACY_F32_e64:
4528 case AMDGPU::V_FMAC_F16_e64:
4529 case AMDGPU::V_FMAC_F16_fake16_e64:
4530 case AMDGPU::V_FMAC_F32_e64:
4531 case AMDGPU::V_FMAC_F64_e64:
4532 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4533 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4534 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4535 return false;
4536 break;
4537
4538 case AMDGPU::V_CNDMASK_B32_e64:
4539 break;
4540 }
4541 }
4542
4543 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4544 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4545 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4546 return false;
4547
4548 // We don't need to check src0, all input types are legal, so just make sure
4549 // src0 isn't using any modifiers.
4550 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4551 return false;
4552
4553 // Can it be shrunk to a valid 32 bit opcode?
4554 if (!hasVALU32BitEncoding(MI.getOpcode()))
4555 return false;
4556
4557 // Check output modifiers
4558 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4559 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4560 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel) &&
4561 // TODO: Can we avoid checking bound_ctrl/fi here?
4562 // They are only used by permlane*_swap special case.
4563 !hasModifiersSet(MI, AMDGPU::OpName::bound_ctrl) &&
4564 !hasModifiersSet(MI, AMDGPU::OpName::fi);
4565}
4566
4567// Set VCC operand with all flags from \p Orig, except for setting it as
4568// implicit.
4569static void copyFlagsToImplicitVCC(MachineInstr &MI,
4570 const MachineOperand &Orig) {
4571
4572 for (MachineOperand &Use : MI.implicit_operands()) {
4573 if (Use.isUse() &&
4574 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4575 Use.setIsUndef(Orig.isUndef());
4576 Use.setIsKill(Orig.isKill());
4577 return;
4578 }
4579 }
4580}
4581
4582MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4583 unsigned Op32) const {
4584 MachineBasicBlock *MBB = MI.getParent();
4585
4586 const MCInstrDesc &Op32Desc = get(Op32);
4587 MachineInstrBuilder Inst32 =
4588 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4589 .setMIFlags(MI.getFlags());
4590
4591 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4592 // For VOPC instructions, this is replaced by an implicit def of vcc.
4593
4594 // We assume the defs of the shrunk opcode are in the same order, and the
4595 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4596 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4597 Inst32.add(MI.getOperand(I));
4598
4599 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4600
4601 int Idx = MI.getNumExplicitDefs();
4602 for (const MachineOperand &Use : MI.explicit_uses()) {
4603 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4604 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4605 continue;
4606
4607 if (&Use == Src2) {
4608 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4609 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4610 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4611 // of vcc was already added during the initial BuildMI, but we
4612 // 1) may need to change vcc to vcc_lo to preserve the original register
4613 // 2) have to preserve the original flags.
4614 copyFlagsToImplicitVCC(*Inst32, *Src2);
4615 continue;
4616 }
4617 }
4618
4619 Inst32.add(Use);
4620 }
4621
4622 // FIXME: Losing implicit operands
4623 fixImplicitOperands(*Inst32);
4624 return Inst32;
4625}
4626
4627bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4628 const MachineOperand &MO,
4629 const MCOperandInfo &OpInfo) const {
4630 // Literal constants use the constant bus.
4631 if (!MO.isReg())
4632 return !isInlineConstant(MO, OpInfo);
4633
4634 if (!MO.isUse())
4635 return false;
4636
4637 if (MO.getReg().isVirtual())
4638 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4639
4640 // Null is free
4641 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4642 return false;
4643
4644 // SGPRs use the constant bus
4645 if (MO.isImplicit()) {
4646 return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4647 MO.getReg() == AMDGPU::VCC_LO;
4648 }
4649 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4650 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4651}
4652
4653static Register findImplicitSGPRRead(const MachineInstr &MI) {
4654 for (const MachineOperand &MO : MI.implicit_operands()) {
4655 // We only care about reads.
4656 if (MO.isDef())
4657 continue;
4658
4659 switch (MO.getReg()) {
4660 case AMDGPU::VCC:
4661 case AMDGPU::VCC_LO:
4662 case AMDGPU::VCC_HI:
4663 case AMDGPU::M0:
4664 case AMDGPU::FLAT_SCR:
4665 return MO.getReg();
4666
4667 default:
4668 break;
4669 }
4670 }
4671
4672 return Register();
4673}
4674
4675static bool shouldReadExec(const MachineInstr &MI) {
4676 if (SIInstrInfo::isVALU(MI)) {
4677 switch (MI.getOpcode()) {
4678 case AMDGPU::V_READLANE_B32:
4679 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4680 case AMDGPU::V_WRITELANE_B32:
4681 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4682 return false;
4683 }
4684
4685 return true;
4686 }
4687
4688 if (MI.isPreISelOpcode() ||
4689 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4690 SIInstrInfo::isSALU(MI) ||
4691 SIInstrInfo::isSMRD(MI))
4692 return false;
4693
4694 return true;
4695}
4696
4697static bool isRegOrFI(const MachineOperand &MO) {
4698 return MO.isReg() || MO.isFI();
4699}
4700
4701static bool isSubRegOf(const SIRegisterInfo &TRI,
4702 const MachineOperand &SuperVec,
4703 const MachineOperand &SubReg) {
4704 if (SubReg.getReg().isPhysical())
4705 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4706
4707 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4708 SubReg.getReg() == SuperVec.getReg();
4709}
4710
4711// Verify the illegal copy from vector register to SGPR for generic opcode COPY
4712bool SIInstrInfo::verifyCopy(const MachineInstr &MI,
4713 const MachineRegisterInfo &MRI,
4714 StringRef &ErrInfo) const {
4715 Register DstReg = MI.getOperand(0).getReg();
4716 Register SrcReg = MI.getOperand(1).getReg();
4717 // This is a check for copy from vector register to SGPR
4718 if (RI.isVectorRegister(MRI, SrcReg) && RI.isSGPRReg(MRI, DstReg)) {
4719 ErrInfo = "illegal copy from vector register to SGPR";
4720 return false;
4721 }
4722 return true;
4723}
4724
4725bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4726 StringRef &ErrInfo) const {
4727 uint16_t Opcode = MI.getOpcode();
4728 const MachineFunction *MF = MI.getParent()->getParent();
4729 const MachineRegisterInfo &MRI = MF->getRegInfo();
4730
4731 // FIXME: At this point the COPY verify is done only for non-ssa forms.
4732 // Find a better property to recognize the point where instruction selection
4733 // is just done.
4734 // We can only enforce this check after SIFixSGPRCopies pass so that the
4735 // illegal copies are legalized and thereafter we don't expect a pass
4736 // inserting similar copies.
4737 if (!MRI.isSSA() && MI.isCopy())
4738 return verifyCopy(MI, MRI, ErrInfo);
4739
4740 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4741 return true;
4742
4743 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4744 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4745 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4746 int Src3Idx = -1;
4747 if (Src0Idx == -1) {
4748 // VOPD V_DUAL_* instructions use different operand names.
4749 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4750 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4751 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4752 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4753 }
4754
4755 // Make sure the number of operands is correct.
4756 const MCInstrDesc &Desc = get(Opcode);
4757 if (!Desc.isVariadic() &&
4758 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4759 ErrInfo = "Instruction has wrong number of operands.";
4760 return false;
4761 }
4762
4763 if (MI.isInlineAsm()) {
4764 // Verify register classes for inlineasm constraints.
4765 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4766 I != E; ++I) {
4767 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4768 if (!RC)
4769 continue;
4770
4771 const MachineOperand &Op = MI.getOperand(I);
4772 if (!Op.isReg())
4773 continue;
4774
4775 Register Reg = Op.getReg();
4776 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4777 ErrInfo = "inlineasm operand has incorrect register class.";
4778 return false;
4779 }
4780 }
4781
4782 return true;
4783 }
4784
4785 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4786 ErrInfo = "missing memory operand from image instruction.";
4787 return false;
4788 }
4789
4790 // Make sure the register classes are correct.
4791 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4792 const MachineOperand &MO = MI.getOperand(i);
4793 if (MO.isFPImm()) {
4794 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4795 "all fp values to integers.";
4796 return false;
4797 }
4798
4799 int RegClass = Desc.operands()[i].RegClass;
4800
4801 switch (Desc.operands()[i].OperandType) {
4802 case MCOI::OPERAND_REGISTER:
4803 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4804 ErrInfo = "Illegal immediate value for operand.";
4805 return false;
4806 }
4807 break;
4812 break;
4824 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4825 ErrInfo = "Illegal immediate value for operand.";
4826 return false;
4827 }
4828 break;
4829 }
4831 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4832 ErrInfo = "Expected inline constant for operand.";
4833 return false;
4834 }
4835 break;
4838 // Check if this operand is an immediate.
4839 // FrameIndex operands will be replaced by immediates, so they are
4840 // allowed.
4841 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4842 ErrInfo = "Expected immediate, but got non-immediate";
4843 return false;
4844 }
4845 [[fallthrough]];
4846 default:
4847 continue;
4848 }
4849
4850 if (!MO.isReg())
4851 continue;
4852 Register Reg = MO.getReg();
4853 if (!Reg)
4854 continue;
4855
4856 // FIXME: Ideally we would have separate instruction definitions with the
4857 // aligned register constraint.
4858 // FIXME: We do not verify inline asm operands, but custom inline asm
4859 // verification is broken anyway
4860 if (ST.needsAlignedVGPRs()) {
4861 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4862 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4863 if (const TargetRegisterClass *SubRC =
4864 RI.getSubRegisterClass(RC, MO.getSubReg())) {
4865 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4866 if (RC)
4867 RC = SubRC;
4868 }
4869 }
4870
4871 // Check that this is the aligned version of the class.
4872 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4873 ErrInfo = "Subtarget requires even aligned vector registers";
4874 return false;
4875 }
4876 }
4877
4878 if (RegClass != -1) {
4879 if (Reg.isVirtual())
4880 continue;
4881
4882 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4883 if (!RC->contains(Reg)) {
4884 ErrInfo = "Operand has incorrect register class.";
4885 return false;
4886 }
4887 }
4888 }
4889
4890 // Verify SDWA
4891 if (isSDWA(MI)) {
4892 if (!ST.hasSDWA()) {
4893 ErrInfo = "SDWA is not supported on this target";
4894 return false;
4895 }
4896
4897 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4898
4899 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4900 if (OpIdx == -1)
4901 continue;
4902 const MachineOperand &MO = MI.getOperand(OpIdx);
4903
4904 if (!ST.hasSDWAScalar()) {
4905 // Only VGPRS on VI
4906 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4907 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4908 return false;
4909 }
4910 } else {
4911 // No immediates on GFX9
4912 if (!MO.isReg()) {
4913 ErrInfo =
4914 "Only reg allowed as operands in SDWA instructions on GFX9+";
4915 return false;
4916 }
4917 }
4918 }
4919
4920 if (!ST.hasSDWAOmod()) {
4921 // No omod allowed on VI
4922 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4923 if (OMod != nullptr &&
4924 (!OMod->isImm() || OMod->getImm() != 0)) {
4925 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4926 return false;
4927 }
4928 }
4929
4930 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4931 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4932 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4933 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4934 const MachineOperand *Src0ModsMO =
4935 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4936 unsigned Mods = Src0ModsMO->getImm();
4937 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4938 Mods & SISrcMods::SEXT) {
4939 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4940 return false;
4941 }
4942 }
4943
4944 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4945 if (isVOPC(BasicOpcode)) {
4946 if (!ST.hasSDWASdst() && DstIdx != -1) {
4947 // Only vcc allowed as dst on VI for VOPC
4948 const MachineOperand &Dst = MI.getOperand(DstIdx);
4949 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4950 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4951 return false;
4952 }
4953 } else if (!ST.hasSDWAOutModsVOPC()) {
4954 // No clamp allowed on GFX9 for VOPC
4955 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4956 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4957 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4958 return false;
4959 }
4960
4961 // No omod allowed on GFX9 for VOPC
4962 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4963 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4964 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4965 return false;
4966 }
4967 }
4968 }
4969
4970 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4971 if (DstUnused && DstUnused->isImm() &&
4972 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4973 const MachineOperand &Dst = MI.getOperand(DstIdx);
4974 if (!Dst.isReg() || !Dst.isTied()) {
4975 ErrInfo = "Dst register should have tied register";
4976 return false;
4977 }
4978
4979 const MachineOperand &TiedMO =
4980 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4981 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4982 ErrInfo =
4983 "Dst register should be tied to implicit use of preserved register";
4984 return false;
4985 }
4986 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
4987 ErrInfo = "Dst register should use same physical register as preserved";
4988 return false;
4989 }
4990 }
4991 }
4992
4993 // Verify MIMG / VIMAGE / VSAMPLE
4994 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4995 // Ensure that the return type used is large enough for all the options
4996 // being used TFE/LWE require an extra result register.
4997 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4998 if (DMask) {
4999 uint64_t DMaskImm = DMask->getImm();
5000 uint32_t RegCount =
5001 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
5002 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
5003 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
5004 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
5005
5006 // Adjust for packed 16 bit values
5007 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
5008 RegCount = divideCeil(RegCount, 2);
5009
5010 // Adjust if using LWE or TFE
5011 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
5012 RegCount += 1;
5013
5014 const uint32_t DstIdx =
5015 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
5016 const MachineOperand &Dst = MI.getOperand(DstIdx);
5017 if (Dst.isReg()) {
5018 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
5019 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
5020 if (RegCount > DstSize) {
5021 ErrInfo = "Image instruction returns too many registers for dst "
5022 "register class";
5023 return false;
5024 }
5025 }
5026 }
5027 }
5028
5029 // Verify VOP*. Ignore multiple sgpr operands on writelane.
5030 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
5031 unsigned ConstantBusCount = 0;
5032 bool UsesLiteral = false;
5033 const MachineOperand *LiteralVal = nullptr;
5034
5035 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
5036 if (ImmIdx != -1) {
5037 ++ConstantBusCount;
5038 UsesLiteral = true;
5039 LiteralVal = &MI.getOperand(ImmIdx);
5040 }
5041
5042 SmallVector<Register, 2> SGPRsUsed;
5043 Register SGPRUsed;
5044
5045 // Only look at the true operands. Only a real operand can use the constant
5046 // bus, and we don't want to check pseudo-operands like the source modifier
5047 // flags.
5048 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
5049 if (OpIdx == -1)
5050 continue;
5051 const MachineOperand &MO = MI.getOperand(OpIdx);
5052 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5053 if (MO.isReg()) {
5054 SGPRUsed = MO.getReg();
5055 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
5056 ++ConstantBusCount;
5057 SGPRsUsed.push_back(SGPRUsed);
5058 }
5059 } else if (!MO.isFI()) { // Treat FI like a register.
5060 if (!UsesLiteral) {
5061 ++ConstantBusCount;
5062 UsesLiteral = true;
5063 LiteralVal = &MO;
5064 } else if (!MO.isIdenticalTo(*LiteralVal)) {
5065 assert(isVOP2(MI) || isVOP3(MI));
5066 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
5067 return false;
5068 }
5069 }
5070 }
5071 }
5072
5073 SGPRUsed = findImplicitSGPRRead(MI);
5074 if (SGPRUsed) {
5075 // Implicit uses may safely overlap true operands
5076 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
5077 return !RI.regsOverlap(SGPRUsed, SGPR);
5078 })) {
5079 ++ConstantBusCount;
5080 SGPRsUsed.push_back(SGPRUsed);
5081 }
5082 }
5083
5084 // v_writelane_b32 is an exception from constant bus restriction:
5085 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
5086 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
5087 Opcode != AMDGPU::V_WRITELANE_B32) {
5088 ErrInfo = "VOP* instruction violates constant bus restriction";
5089 return false;
5090 }
5091
5092 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
5093 ErrInfo = "VOP3 instruction uses literal";
5094 return false;
5095 }
5096 }
5097
5098 // Special case for writelane - this can break the multiple constant bus rule,
5099 // but still can't use more than one SGPR register
5100 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
5101 unsigned SGPRCount = 0;
5102 Register SGPRUsed;
5103
5104 for (int OpIdx : {Src0Idx, Src1Idx}) {
5105 if (OpIdx == -1)
5106 break;
5107
5108 const MachineOperand &MO = MI.getOperand(OpIdx);
5109
5110 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
5111 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
5112 if (MO.getReg() != SGPRUsed)
5113 ++SGPRCount;
5114 SGPRUsed = MO.getReg();
5115 }
5116 }
5117 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5118 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5119 return false;
5120 }
5121 }
5122 }
5123
5124 // Verify misc. restrictions on specific instructions.
5125 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5126 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5127 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5128 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5129 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5130 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5131 if (!compareMachineOp(Src0, Src1) &&
5132 !compareMachineOp(Src0, Src2)) {
5133 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5134 return false;
5135 }
5136 }
5137 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5138 SISrcMods::ABS) ||
5139 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5140 SISrcMods::ABS) ||
5141 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5142 SISrcMods::ABS)) {
5143 ErrInfo = "ABS not allowed in VOP3B instructions";
5144 return false;
5145 }
5146 }
5147
5148 if (isSOP2(MI) || isSOPC(MI)) {
5149 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5150 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5151
5152 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5153 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5154 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5155 !Src0.isIdenticalTo(Src1)) {
5156 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5157 return false;
5158 }
5159 }
5160
5161 if (isSOPK(MI)) {
5162 const auto *Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5163 if (Desc.isBranch()) {
5164 if (!Op->isMBB()) {
5165 ErrInfo = "invalid branch target for SOPK instruction";
5166 return false;
5167 }
5168 } else {
5169 uint64_t Imm = Op->getImm();
5170 if (sopkIsZext(Opcode)) {
5171 if (!isUInt<16>(Imm)) {
5172 ErrInfo = "invalid immediate for SOPK instruction";
5173 return false;
5174 }
5175 } else {
5176 if (!isInt<16>(Imm)) {
5177 ErrInfo = "invalid immediate for SOPK instruction";
5178 return false;
5179 }
5180 }
5181 }
5182 }
5183
5184 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5185 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5186 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5187 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5188 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5189 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5190
5191 const unsigned StaticNumOps =
5192 Desc.getNumOperands() + Desc.implicit_uses().size();
5193 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5194
5195 // Allow additional implicit operands. This allows a fixup done by the post
5196 // RA scheduler where the main implicit operand is killed and implicit-defs
5197 // are added for sub-registers that remain live after this instruction.
5198 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5199 ErrInfo = "missing implicit register operands";
5200 return false;
5201 }
5202
5203 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5204 if (IsDst) {
5205 if (!Dst->isUse()) {
5206 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5207 return false;
5208 }
5209
5210 unsigned UseOpIdx;
5211 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5212 UseOpIdx != StaticNumOps + 1) {
5213 ErrInfo = "movrel implicit operands should be tied";
5214 return false;
5215 }
5216 }
5217
5218 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5219 const MachineOperand &ImpUse
5220 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5221 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5222 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5223 ErrInfo = "src0 should be subreg of implicit vector use";
5224 return false;
5225 }
5226 }
5227
5228 // Make sure we aren't losing exec uses in the td files. This mostly requires
5229 // being careful when using let Uses to try to add other use registers.
5230 if (shouldReadExec(MI)) {
5231 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5232 ErrInfo = "VALU instruction does not implicitly read exec mask";
5233 return false;
5234 }
5235 }
5236
5237 if (isSMRD(MI)) {
5238 if (MI.mayStore() &&
5239 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5240 // The register offset form of scalar stores may only use m0 as the
5241 // soffset register.
5242 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5243 if (Soff && Soff->getReg() != AMDGPU::M0) {
5244 ErrInfo = "scalar stores must use m0 as offset register";
5245 return false;
5246 }
5247 }
5248 }
5249
5250 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5251 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5252 if (Offset->getImm() != 0) {
5253 ErrInfo = "subtarget does not support offsets in flat instructions";
5254 return false;
5255 }
5256 }
5257
5258 if (isDS(MI) && !ST.hasGDS()) {
5259 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5260 if (GDSOp && GDSOp->getImm() != 0) {
5261 ErrInfo = "GDS is not supported on this subtarget";
5262 return false;
5263 }
5264 }
5265
5266 if (isImage(MI)) {
5267 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5268 if (DimOp) {
5269 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5270 AMDGPU::OpName::vaddr0);
5271 int RSrcOpName =
5272 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5273 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5274 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5275 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5276 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5277 const AMDGPU::MIMGDimInfo *Dim =
5278 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5279
5280 if (!Dim) {
5281 ErrInfo = "dim is out of range";
5282 return false;
5283 }
5284
5285 bool IsA16 = false;
5286 if (ST.hasR128A16()) {
5287 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5288 IsA16 = R128A16->getImm() != 0;
5289 } else if (ST.hasA16()) {
5290 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5291 IsA16 = A16->getImm() != 0;
5292 }
5293
5294 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5295
5296 unsigned AddrWords =
5297 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5298
5299 unsigned VAddrWords;
5300 if (IsNSA) {
5301 VAddrWords = RsrcIdx - VAddr0Idx;
5302 if (ST.hasPartialNSAEncoding() &&
5303 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5304 unsigned LastVAddrIdx = RsrcIdx - 1;
5305 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5306 }
5307 } else {
5308 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5309 if (AddrWords > 12)
5310 AddrWords = 16;
5311 }
5312
5313 if (VAddrWords != AddrWords) {
5314 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5315 << " but got " << VAddrWords << "\n");
5316 ErrInfo = "bad vaddr size";
5317 return false;
5318 }
5319 }
5320 }
5321
5322 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5323 if (DppCt) {
5324 using namespace AMDGPU::DPP;
5325
5326 unsigned DC = DppCt->getImm();
5327 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5328 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5329 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5330 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5331 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5332 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5333 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5334 ErrInfo = "Invalid dpp_ctrl value";
5335 return false;
5336 }
5337 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5339 ErrInfo = "Invalid dpp_ctrl value: "
5340 "wavefront shifts are not supported on GFX10+";
5341 return false;
5342 }
5343 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5345 ErrInfo = "Invalid dpp_ctrl value: "
5346 "broadcasts are not supported on GFX10+";
5347 return false;
5348 }
5349 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5351 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5352 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5353 !ST.hasGFX90AInsts()) {
5354 ErrInfo = "Invalid dpp_ctrl value: "
5355 "row_newbroadcast/row_share is not supported before "
5356 "GFX90A/GFX10";
5357 return false;
5358 }
5359 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5360 ErrInfo = "Invalid dpp_ctrl value: "
5361 "row_share and row_xmask are not supported before GFX10";
5362 return false;
5363 }
5364 }
5365
5366 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5368 ErrInfo = "Invalid dpp_ctrl value: "
5369 "DP ALU dpp only support row_newbcast";
5370 return false;
5371 }
5372 }
5373
5374 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5375 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5376 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5377 : AMDGPU::OpName::vdata;
5378 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5379 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5380 if (Data && !Data->isReg())
5381 Data = nullptr;
5382
5383 if (ST.hasGFX90AInsts()) {
5384 if (Dst && Data &&
5385 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5386 ErrInfo = "Invalid register class: "
5387 "vdata and vdst should be both VGPR or AGPR";
5388 return false;
5389 }
5390 if (Data && Data2 &&
5391 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5392 ErrInfo = "Invalid register class: "
5393 "both data operands should be VGPR or AGPR";
5394 return false;
5395 }
5396 } else {
5397 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5398 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5399 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5400 ErrInfo = "Invalid register class: "
5401 "agpr loads and stores not supported on this GPU";
5402 return false;
5403 }
5404 }
5405 }
5406
5407 if (ST.needsAlignedVGPRs()) {
5408 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5409 const MachineOperand *Op = getNamedOperand(MI, OpName);
5410 if (!Op)
5411 return true;
5412 Register Reg = Op->getReg();
5413 if (Reg.isPhysical())
5414 return !(RI.getHWRegIndex(Reg) & 1);
5415 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5416 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5417 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5418 };
5419
5420 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5421 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5422 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5423
5424 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5425 ErrInfo = "Subtarget requires even aligned vector registers "
5426 "for DS_GWS instructions";
5427 return false;
5428 }
5429 }
5430
5431 if (isMIMG(MI)) {
5432 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5433 ErrInfo = "Subtarget requires even aligned vector registers "
5434 "for vaddr operand of image instructions";
5435 return false;
5436 }
5437 }
5438 }
5439
5440 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5441 !ST.hasGFX90AInsts()) {
5442 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5443 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5444 ErrInfo = "Invalid register class: "
5445 "v_accvgpr_write with an SGPR is not supported on this GPU";
5446 return false;
5447 }
5448 }
5449
5450 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5451 const MachineOperand &SrcOp = MI.getOperand(1);
5452 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5453 ErrInfo = "pseudo expects only physical SGPRs";
5454 return false;
5455 }
5456 }
5457
5458 return true;
5459}
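// Note: the checks above run through the TargetInstrInfo::verifyInstruction
// hook, so the easiest way to exercise them on a test case is the machine
// verifier (illustrative invocation):
//   llc -mtriple=amdgcn -verify-machineinstrs < test.ll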
5460
5461// It is more readable to list mapped opcodes on the same line.
5462// clang-format off
5463
5465 switch (MI.getOpcode()) {
5466 default: return AMDGPU::INSTRUCTION_LIST_END;
5467 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5468 case AMDGPU::COPY: return AMDGPU::COPY;
5469 case AMDGPU::PHI: return AMDGPU::PHI;
5470 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5471 case AMDGPU::WQM: return AMDGPU::WQM;
5472 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5473 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5474 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5475 case AMDGPU::S_MOV_B32: {
5476 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5477 return MI.getOperand(1).isReg() ||
5478 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5479 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5480 }
5481 case AMDGPU::S_ADD_I32:
5482 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5483 case AMDGPU::S_ADDC_U32:
5484 return AMDGPU::V_ADDC_U32_e32;
5485 case AMDGPU::S_SUB_I32:
5486 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5487 // FIXME: These are not consistently handled, and selected when the carry is
5488 // used.
5489 case AMDGPU::S_ADD_U32:
5490 return AMDGPU::V_ADD_CO_U32_e32;
5491 case AMDGPU::S_SUB_U32:
5492 return AMDGPU::V_SUB_CO_U32_e32;
5493 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5494 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5495 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5496 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5497 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5498 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5499 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5500 case AMDGPU::S_XNOR_B32:
5501 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5502 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5503 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5504 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5505 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5506 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5507 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5508 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5509 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5510 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5511 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5512 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5513 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5514 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5515 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5516 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5517 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5518 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5519 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5520 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5521 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5522 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5523 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5524 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5525 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5526 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5527 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5528 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5529 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5530 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5531 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5532 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5533 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5534 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5535 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5536 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5537 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5538 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5539 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5540 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5541 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5542 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5543 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5544 case AMDGPU::S_CVT_F32_F16:
5545 case AMDGPU::S_CVT_HI_F32_F16:
5546 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F32_F16_t16_e64
5547 : AMDGPU::V_CVT_F32_F16_fake16_e64;
5548 case AMDGPU::S_CVT_F16_F32:
5549 return ST.useRealTrue16Insts() ? AMDGPU::V_CVT_F16_F32_t16_e64
5550 : AMDGPU::V_CVT_F16_F32_fake16_e64;
5551 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5552 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5553 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5554 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5555 case AMDGPU::S_CEIL_F16:
5556 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5557 : AMDGPU::V_CEIL_F16_fake16_e64;
5558 case AMDGPU::S_FLOOR_F16:
5559 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5560 : AMDGPU::V_FLOOR_F16_fake16_e64;
5561 case AMDGPU::S_TRUNC_F16:
5562 return AMDGPU::V_TRUNC_F16_fake16_e64;
5563 case AMDGPU::S_RNDNE_F16:
5564 return AMDGPU::V_RNDNE_F16_fake16_e64;
5565 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5566 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5567 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5568 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5569 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5570 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5571 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5572 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5573 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5574 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5575 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5576 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5577 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5578 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5579 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5580 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5581 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5582 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5583 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5584 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5585 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5586 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5587 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5588 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5589 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5590 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5591 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5592 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5593 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5594 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5595 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5596 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5597 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5598 case AMDGPU::S_CMP_LT_F16:
5599 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LT_F16_t16_e64
5600 : AMDGPU::V_CMP_LT_F16_fake16_e64;
5601 case AMDGPU::S_CMP_EQ_F16:
5602 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_EQ_F16_t16_e64
5603 : AMDGPU::V_CMP_EQ_F16_fake16_e64;
5604 case AMDGPU::S_CMP_LE_F16:
5605 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LE_F16_t16_e64
5606 : AMDGPU::V_CMP_LE_F16_fake16_e64;
5607 case AMDGPU::S_CMP_GT_F16:
5608 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GT_F16_t16_e64
5609 : AMDGPU::V_CMP_GT_F16_fake16_e64;
5610 case AMDGPU::S_CMP_LG_F16:
5611 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_LG_F16_t16_e64
5612 : AMDGPU::V_CMP_LG_F16_fake16_e64;
5613 case AMDGPU::S_CMP_GE_F16:
5614 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_GE_F16_t16_e64
5615 : AMDGPU::V_CMP_GE_F16_fake16_e64;
5616 case AMDGPU::S_CMP_O_F16:
5617 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_O_F16_t16_e64
5618 : AMDGPU::V_CMP_O_F16_fake16_e64;
5619 case AMDGPU::S_CMP_U_F16:
5620 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_U_F16_t16_e64
5621 : AMDGPU::V_CMP_U_F16_fake16_e64;
5622 case AMDGPU::S_CMP_NGE_F16:
5623 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGE_F16_t16_e64
5624 : AMDGPU::V_CMP_NGE_F16_fake16_e64;
5625 case AMDGPU::S_CMP_NLG_F16:
5626 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLG_F16_t16_e64
5627 : AMDGPU::V_CMP_NLG_F16_fake16_e64;
5628 case AMDGPU::S_CMP_NGT_F16:
5629 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NGT_F16_t16_e64
5630 : AMDGPU::V_CMP_NGT_F16_fake16_e64;
5631 case AMDGPU::S_CMP_NLE_F16:
5632 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLE_F16_t16_e64
5633 : AMDGPU::V_CMP_NLE_F16_fake16_e64;
5634 case AMDGPU::S_CMP_NEQ_F16:
5635 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NEQ_F16_t16_e64
5636 : AMDGPU::V_CMP_NEQ_F16_fake16_e64;
5637 case AMDGPU::S_CMP_NLT_F16:
5638 return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64
5639 : AMDGPU::V_CMP_NLT_F16_fake16_e64;
5640 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5641 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5642 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5643 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5644 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5645 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5646 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5647 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5648 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5649 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5650 }
5652 "Unexpected scalar opcode without corresponding vector one!");
5653}
5654
5655// clang-format on
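// Illustrative use of the mapping above (sketch): callers such as the
// SALU-to-VALU lowering query it and bail out when no vector equivalent
// exists, e.g.
//   unsigned NewOpc = TII->getVALUOp(MI);
//   if (NewOpc == AMDGPU::INSTRUCTION_LIST_END)
//     return; // no VALU counterpart; handle the instruction another way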
5656
5660 const DebugLoc &DL, Register Reg,
5661 bool IsSCCLive,
5662 SlotIndexes *Indexes) const {
5663 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5664 const SIInstrInfo *TII = ST.getInstrInfo();
5665 bool IsWave32 = ST.isWave32();
5666 if (IsSCCLive) {
5667 // Insert two move instructions, one to save the original value of EXEC and
5668 // the other to turn on all bits in EXEC. This is required because we can't
5669 // use the single S_OR_SAVEEXEC instruction here, as it clobbers SCC.
5670 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5671 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5672 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5673 .addReg(Exec, RegState::Kill);
5674 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5675 if (Indexes) {
5676 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5677 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5678 }
5679 } else {
5680 const unsigned OrSaveExec =
5681 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5682 auto SaveExec =
5683 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5684 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5685 if (Indexes)
5686 Indexes->insertMachineInstrInMaps(*SaveExec);
5687 }
5688}
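// Schematically, for a wave64 target the code above emits either
//   s_mov_b64 <Reg>, exec
//   s_mov_b64 exec, -1
// when SCC is live, or the single SCC-clobbering form
//   s_or_saveexec_b64 <Reg>, -1
// when it is not (wave32 uses the corresponding *_B32 opcodes).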
5689
5692 const DebugLoc &DL, Register Reg,
5693 SlotIndexes *Indexes) const {
5694 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5695 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5696 auto ExecRestoreMI =
5697 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5698 if (Indexes)
5699 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5700}
5701
5702static const TargetRegisterClass *
5704 const MachineRegisterInfo &MRI,
5705 const MCInstrDesc &TID, unsigned RCID,
5706 bool IsAllocatable) {
5707 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5708 (((TID.mayLoad() || TID.mayStore()) &&
5709 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5711 switch (RCID) {
5712 case AMDGPU::AV_32RegClassID:
5713 RCID = AMDGPU::VGPR_32RegClassID;
5714 break;
5715 case AMDGPU::AV_64RegClassID:
5716 RCID = AMDGPU::VReg_64RegClassID;
5717 break;
5718 case AMDGPU::AV_96RegClassID:
5719 RCID = AMDGPU::VReg_96RegClassID;
5720 break;
5721 case AMDGPU::AV_128RegClassID:
5722 RCID = AMDGPU::VReg_128RegClassID;
5723 break;
5724 case AMDGPU::AV_160RegClassID:
5725 RCID = AMDGPU::VReg_160RegClassID;
5726 break;
5727 case AMDGPU::AV_512RegClassID:
5728 RCID = AMDGPU::VReg_512RegClassID;
5729 break;
5730 default:
5731 break;
5732 }
5733 }
5734
5735 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5736}
5737
5739 unsigned OpNum, const TargetRegisterInfo *TRI,
5740 const MachineFunction &MF)
5741 const {
5742 if (OpNum >= TID.getNumOperands())
5743 return nullptr;
5744 auto RegClass = TID.operands()[OpNum].RegClass;
5745 bool IsAllocatable = false;
5747 // vdst and vdata should both be VGPR or AGPR; the same applies to DS
5748 // instructions with two data operands. Request a register class constrained
5749 // to VGPR only if both operands are present, as Machine Copy Propagation
5750 // (and possibly other passes) cannot check this constraint.
5751 //
5752 // The check is limited to FLAT and DS because atomics in non-flat encoding
5753 // have their vdst and vdata tied to be the same register.
5754 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5755 AMDGPU::OpName::vdst);
5756 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5757 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5758 : AMDGPU::OpName::vdata);
5759 if (DataIdx != -1) {
5760 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5761 TID.Opcode, AMDGPU::OpName::data1);
5762 }
5763 }
5764 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5765 IsAllocatable);
5766}
5767
5769 unsigned OpNo) const {
5770 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5771 const MCInstrDesc &Desc = get(MI.getOpcode());
5772 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5773 Desc.operands()[OpNo].RegClass == -1) {
5774 Register Reg = MI.getOperand(OpNo).getReg();
5775
5776 if (Reg.isVirtual())
5777 return MRI.getRegClass(Reg);
5778 return RI.getPhysRegBaseClass(Reg);
5779 }
5780
5781 unsigned RCID = Desc.operands()[OpNo].RegClass;
5782 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5783}
5784
5787 MachineBasicBlock *MBB = MI.getParent();
5788 MachineOperand &MO = MI.getOperand(OpIdx);
5790 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5791 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5792 unsigned Size = RI.getRegSizeInBits(*RC);
5793 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
5794 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
5795 : AMDGPU::V_MOV_B32_e32;
5796 if (MO.isReg())
5797 Opcode = AMDGPU::COPY;
5798 else if (RI.isSGPRClass(RC))
5799 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5800
5801 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5802 Register Reg = MRI.createVirtualRegister(VRC);
5804 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5805 MO.ChangeToRegister(Reg, false);
5806}
5807
5810 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5811 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5812 if (!SuperReg.getReg().isVirtual())
5813 return RI.getSubReg(SuperReg.getReg(), SubIdx);
5814
5815 MachineBasicBlock *MBB = MI->getParent();
5816 DebugLoc DL = MI->getDebugLoc();
5817 Register SubReg = MRI.createVirtualRegister(SubRC);
5818
5819 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
5820 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5821 .addReg(SuperReg.getReg(), 0, NewSubIdx);
5822 return SubReg;
5823}
5824
5827 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5828 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5829 if (Op.isImm()) {
5830 if (SubIdx == AMDGPU::sub0)
5831 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5832 if (SubIdx == AMDGPU::sub1)
5833 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5834
5835 llvm_unreachable("Unhandled register index for immediate");
5836 }
5837
5838 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5839 SubIdx, SubRC);
5840 return MachineOperand::CreateReg(SubReg, false);
5841}
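// For an immediate super-operand this is a plain 64-bit to 2 x 32-bit split,
// e.g. Op.getImm() == 0x1122334455667788 yields 0x55667788 for sub0 and
// 0x11223344 for sub1.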
5842
5843// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5844void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5845 assert(Inst.getNumExplicitOperands() == 3);
5846 MachineOperand Op1 = Inst.getOperand(1);
5847 Inst.removeOperand(1);
5848 Inst.addOperand(Op1);
5849}
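// E.g. explicit operands (vdst, src0, src1) become (vdst, src1, src0); only
// the operand order changes.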
5850
5852 const MCOperandInfo &OpInfo,
5853 const MachineOperand &MO) const {
5854 if (!MO.isReg())
5855 return false;
5856
5857 Register Reg = MO.getReg();
5858
5859 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5860 if (Reg.isPhysical())
5861 return DRC->contains(Reg);
5862
5863 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5864
5865 if (MO.getSubReg()) {
5866 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5867 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5868 if (!SuperRC)
5869 return false;
5870
5871 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5872 if (!DRC)
5873 return false;
5874 }
5875 return RC->hasSuperClassEq(DRC);
5876}
5877
5879 const MachineOperand &MO) const {
5880 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5881 const MCOperandInfo OpInfo = MI.getDesc().operands()[OpIdx];
5882 unsigned Opc = MI.getOpcode();
5883
5884 if (!isLegalRegOperand(MRI, OpInfo, MO))
5885 return false;
5886
5887 // check Accumulate GPR operand
5888 bool IsAGPR = RI.isAGPR(MRI, MO.getReg());
5889 if (IsAGPR && !ST.hasMAIInsts())
5890 return false;
5891 if (IsAGPR && (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5892 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5893 return false;
5894 // Atomics should have both vdst and vdata either vgpr or agpr.
5895 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5896 const int DataIdx = AMDGPU::getNamedOperandIdx(
5897 Opc, isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5898 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5899 MI.getOperand(DataIdx).isReg() &&
5900 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5901 return false;
5902 if ((int)OpIdx == DataIdx) {
5903 if (VDstIdx != -1 &&
5904 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5905 return false;
5906 // DS instructions with 2 src operands also must have tied RC.
5907 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
5908 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5909 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5910 return false;
5911 }
5912
5913 // Check V_ACCVGPR_WRITE_B32_e64
5914 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5915 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5916 RI.isSGPRReg(MRI, MO.getReg()))
5917 return false;
5918 return true;
5919}
5920
5922 const MCOperandInfo &OpInfo,
5923 const MachineOperand &MO) const {
5924 if (MO.isReg())
5925 return isLegalRegOperand(MRI, OpInfo, MO);
5926
5927 // Handle non-register types that are treated like immediates.
5928 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5929 return true;
5930}
5931
5932bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5933 const MachineOperand *MO) const {
5934 const MachineFunction &MF = *MI.getParent()->getParent();
5935 const MachineRegisterInfo &MRI = MF.getRegInfo();
5936 const MCInstrDesc &InstDesc = MI.getDesc();
5937 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5938 const TargetRegisterClass *DefinedRC =
5939 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5940 if (!MO)
5941 MO = &MI.getOperand(OpIdx);
5942
5943 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5944 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5945 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5946 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5947 return false;
5948
5950 if (MO->isReg())
5951 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5952
5953 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5954 if (i == OpIdx)
5955 continue;
5956 const MachineOperand &Op = MI.getOperand(i);
5957 if (Op.isReg()) {
5958 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5959 if (!SGPRsUsed.count(SGPR) &&
5960 // FIXME: This can access off the end of the operands() array.
5961 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5962 if (--ConstantBusLimit <= 0)
5963 return false;
5964 SGPRsUsed.insert(SGPR);
5965 }
5966 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5967 !isInlineConstant(Op, InstDesc.operands()[i])) {
5968 if (!LiteralLimit--)
5969 return false;
5970 if (--ConstantBusLimit <= 0)
5971 return false;
5972 }
5973 }
5974 } else if (ST.hasNoF16PseudoScalarTransInlineConstants() && !MO->isReg() &&
5975 isF16PseudoScalarTrans(MI.getOpcode()) &&
5976 isInlineConstant(*MO, OpInfo)) {
5977 return false;
5978 }
5979
5980 if (MO->isReg()) {
5981 if (!DefinedRC)
5982 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5983 return isLegalRegOperand(MI, OpIdx, *MO);
5984 }
5985
5986 if (MO->isImm()) {
5987 uint64_t Imm = MO->getImm();
5988 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5989 bool Is64BitOp = Is64BitFPOp ||
5993 if (Is64BitOp &&
5995 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5996 return false;
5997
5998 // FIXME: We can use sign extended 64-bit literals, but only for signed
5999 // operands. At the moment we do not know if an operand is signed.
6000 // Such operand will be encoded as its low 32 bits and then either
6001 // correctly sign extended or incorrectly zero extended by HW.
6002 if (!Is64BitFPOp && (int32_t)Imm < 0)
6003 return false;
6004 }
6005 }
6006
6007 // Handle non-register types that are treated like immediates.
6008 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
6009
6010 if (!DefinedRC) {
6011 // This operand expects an immediate.
6012 return true;
6013 }
6014
6015 return isImmOperandLegal(MI, OpIdx, *MO);
6016}
6017
6019 MachineInstr &MI) const {
6020 unsigned Opc = MI.getOpcode();
6021 const MCInstrDesc &InstrDesc = get(Opc);
6022
6023 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
6024 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6025
6026 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
6027 MachineOperand &Src1 = MI.getOperand(Src1Idx);
6028
6029 // If there is an implicit SGPR use, such as the VCC use for
6030 // v_addc_u32/v_subb_u32, we may only have one constant bus use before GFX10.
6031 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
6032 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
6033 RI.isSGPRReg(MRI, Src0.getReg()))
6034 legalizeOpWithMove(MI, Src0Idx);
6035
6036 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
6037 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
6038 // src0/src1 with V_READFIRSTLANE.
6039 if (Opc == AMDGPU::V_WRITELANE_B32) {
6040 const DebugLoc &DL = MI.getDebugLoc();
6041 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
6042 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6043 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6044 .add(Src0);
6045 Src0.ChangeToRegister(Reg, false);
6046 }
6047 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
6048 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6049 const DebugLoc &DL = MI.getDebugLoc();
6050 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6051 .add(Src1);
6052 Src1.ChangeToRegister(Reg, false);
6053 }
6054 return;
6055 }
6056
6057 // No VOP2 instructions support AGPRs.
6058 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
6059 legalizeOpWithMove(MI, Src0Idx);
6060
6061 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
6062 legalizeOpWithMove(MI, Src1Idx);
6063
6064 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
6065 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
6066 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
6067 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
6068 legalizeOpWithMove(MI, Src2Idx);
6069 }
6070
6071 // src0 of VOP2 instructions supports all operand types, so we don't need to
6072 // check its legality. If src1 is already legal, we don't need to do anything.
6073 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
6074 return;
6075
6076 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
6077 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
6078 // select is uniform.
6079 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
6080 RI.isVGPR(MRI, Src1.getReg())) {
6081 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6082 const DebugLoc &DL = MI.getDebugLoc();
6083 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6084 .add(Src1);
6085 Src1.ChangeToRegister(Reg, false);
6086 return;
6087 }
6088
6089 // We do not use commuteInstruction here because it is too aggressive and will
6090 // commute if it is possible. We only want to commute here if it improves
6091 // legality. This can be called a fairly large number of times so don't waste
6092 // compile time pointlessly swapping and checking legality again.
6093 if (HasImplicitSGPR || !MI.isCommutable()) {
6094 legalizeOpWithMove(MI, Src1Idx);
6095 return;
6096 }
6097
6098 // If src0 can be used as src1, commuting will make the operands legal.
6099 // Otherwise we have to give up and insert a move.
6100 //
6101 // TODO: Other immediate-like operand kinds could be commuted if there was a
6102 // MachineOperand::ChangeTo* for them.
6103 if ((!Src1.isImm() && !Src1.isReg()) ||
6104 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
6105 legalizeOpWithMove(MI, Src1Idx);
6106 return;
6107 }
6108
6109 int CommutedOpc = commuteOpcode(MI);
6110 if (CommutedOpc == -1) {
6111 legalizeOpWithMove(MI, Src1Idx);
6112 return;
6113 }
6114
6115 MI.setDesc(get(CommutedOpc));
6116
6117 Register Src0Reg = Src0.getReg();
6118 unsigned Src0SubReg = Src0.getSubReg();
6119 bool Src0Kill = Src0.isKill();
6120
6121 if (Src1.isImm())
6122 Src0.ChangeToImmediate(Src1.getImm());
6123 else if (Src1.isReg()) {
6124 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
6125 Src0.setSubReg(Src1.getSubReg());
6126 } else
6127 llvm_unreachable("Should only have register or immediate operands");
6128
6129 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
6130 Src1.setSubReg(Src0SubReg);
6132}
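// Net effect (sketch): an illegal src1 is fixed either by commuting it with a
// src0 that is legal in that slot or, failing that, by rewriting it through a
// VGPR with legalizeOpWithMove; the lane-select operands of V_READLANE_B32 /
// V_WRITELANE_B32 are instead made uniform with V_READFIRSTLANE_B32.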
6133
6134 // Legalize VOP3 operands. All operand types are supported for any operand,
6135 // but only one literal constant is allowed, and only starting from GFX10.
6137 MachineInstr &MI) const {
6138 unsigned Opc = MI.getOpcode();
6139
6140 int VOP3Idx[3] = {
6141 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
6142 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
6143 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
6144 };
6145
6146 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
6147 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
6148 // src1 and src2 must be scalar
6149 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
6150 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
6151 const DebugLoc &DL = MI.getDebugLoc();
6152 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
6153 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6154 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6155 .add(Src1);
6156 Src1.ChangeToRegister(Reg, false);
6157 }
6158 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
6159 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6160 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6161 .add(Src2);
6162 Src2.ChangeToRegister(Reg, false);
6163 }
6164 }
6165
6166 // Find the one SGPR operand we are allowed to use.
6167 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6168 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6169 SmallDenseSet<unsigned> SGPRsUsed;
6170 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6171 if (SGPRReg) {
6172 SGPRsUsed.insert(SGPRReg);
6173 --ConstantBusLimit;
6174 }
6175
6176 for (int Idx : VOP3Idx) {
6177 if (Idx == -1)
6178 break;
6179 MachineOperand &MO = MI.getOperand(Idx);
6180
6181 if (!MO.isReg()) {
6182 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6183 continue;
6184
6185 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6186 --LiteralLimit;
6187 --ConstantBusLimit;
6188 continue;
6189 }
6190
6191 --LiteralLimit;
6192 --ConstantBusLimit;
6194 continue;
6195 }
6196
6197 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6198 !isOperandLegal(MI, Idx, &MO)) {
6200 continue;
6201 }
6202
6203 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6204 continue; // VGPRs are legal
6205
6206 // We can use one SGPR in each VOP3 instruction prior to GFX10
6207 // and two starting from GFX10.
6208 if (SGPRsUsed.count(MO.getReg()))
6209 continue;
6210 if (ConstantBusLimit > 0) {
6211 SGPRsUsed.insert(MO.getReg());
6212 --ConstantBusLimit;
6213 continue;
6214 }
6215
6216 // If we make it this far, then the operand is not legal and we must
6217 // legalize it.
6219 }
6220
6221 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6222 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6223 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6224 legalizeOpWithMove(MI, VOP3Idx[2]);
6225}
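// Constant-bus example (sketch): before GFX10 only one SGPR may be read
// across src0..src2, so a VOP3 with two distinct SGPR sources keeps the first
// and the second is moved to a VGPR by legalizeOpWithMove; on GFX10+ the
// limit is higher and a VOP3 literal is allowed, but a literal still consumes
// a constant-bus slot, as the LiteralLimit/ConstantBusLimit bookkeeping above
// reflects.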
6226
6229 const TargetRegisterClass *DstRC /*=nullptr*/) const {
6230 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6231 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6232 if (DstRC)
6233 SRC = RI.getCommonSubClass(SRC, DstRC);
6234
6235 Register DstReg = MRI.createVirtualRegister(SRC);
6236 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6237
6238 if (RI.hasAGPRs(VRC)) {
6239 VRC = RI.getEquivalentVGPRClass(VRC);
6240 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6241 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6242 get(TargetOpcode::COPY), NewSrcReg)
6243 .addReg(SrcReg);
6244 SrcReg = NewSrcReg;
6245 }
6246
6247 if (SubRegs == 1) {
6248 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6249 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6250 .addReg(SrcReg);
6251 return DstReg;
6252 }
6253
6255 for (unsigned i = 0; i < SubRegs; ++i) {
6256 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6257 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6258 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6259 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6260 SRegs.push_back(SGPR);
6261 }
6262
6264 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6265 get(AMDGPU::REG_SEQUENCE), DstReg);
6266 for (unsigned i = 0; i < SubRegs; ++i) {
6267 MIB.addReg(SRegs[i]);
6268 MIB.addImm(RI.getSubRegFromChannel(i));
6269 }
6270 return DstReg;
6271}
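// Rough shape of the expansion for a 64-bit virtual-register source:
//   %lo:sgpr_32 = V_READFIRSTLANE_B32 %src.sub0
//   %hi:sgpr_32 = V_READFIRSTLANE_B32 %src.sub1
//   %dst        = REG_SEQUENCE %lo, sub0, %hi, sub1
// with one readfirstlane per 32-bit channel for wider sources.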
6272
6274 MachineInstr &MI) const {
6275
6276 // If the pointer is stored in VGPRs, then we need to move it to
6277 // SGPRs using v_readfirstlane. This is safe because we only select
6278 // loads with uniform pointers to SMRD instruction so we know the
6279 // pointer value is uniform.
6280 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6281 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6282 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6283 SBase->setReg(SGPR);
6284 }
6285 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6286 if (SOff && !RI.isSGPRReg(MRI, SOff->getReg())) {
6287 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6288 SOff->setReg(SGPR);
6289 }
6290}
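// E.g. an SMRD load whose base pointer was selected into a VGPR gets its
// sbase (and, if present, a non-SGPR soffset) replaced by the result of a
// V_READFIRSTLANE_B32 expansion.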
6291
6293 unsigned Opc = Inst.getOpcode();
6294 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6295 if (OldSAddrIdx < 0)
6296 return false;
6297
6299
6300 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6301 if (NewOpc < 0)
6303 if (NewOpc < 0)
6304 return false;
6305
6307 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6308 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6309 return false;
6310
6311 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6312 if (NewVAddrIdx < 0)
6313 return false;
6314
6315 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6316
6317 // Check vaddr; it must be zero or absent.
6318 MachineInstr *VAddrDef = nullptr;
6319 if (OldVAddrIdx >= 0) {
6320 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6321 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6322 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6323 !VAddrDef->getOperand(1).isImm() ||
6324 VAddrDef->getOperand(1).getImm() != 0)
6325 return false;
6326 }
6327
6328 const MCInstrDesc &NewDesc = get(NewOpc);
6329 Inst.setDesc(NewDesc);
6330
6331 // Callers expect iterator to be valid after this call, so modify the
6332 // instruction in place.
6333 if (OldVAddrIdx == NewVAddrIdx) {
6334 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6335 // Clear use list from the old vaddr holding a zero register.
6336 MRI.removeRegOperandFromUseList(&NewVAddr);
6337 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6338 Inst.removeOperand(OldSAddrIdx);
6339 // Update the use list with the pointer we have just moved from vaddr to
6340 // saddr position. Otherwise new vaddr will be missing from the use list.
6341 MRI.removeRegOperandFromUseList(&NewVAddr);
6342 MRI.addRegOperandToUseList(&NewVAddr);
6343 } else {
6344 assert(OldSAddrIdx == NewVAddrIdx);
6345
6346 if (OldVAddrIdx >= 0) {
6347 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6348 AMDGPU::OpName::vdst_in);
6349
6350 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6351 // it asserts. Untie the operands for now and retie them afterwards.
6352 if (NewVDstIn != -1) {
6353 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6354 Inst.untieRegOperand(OldVDstIn);
6355 }
6356
6357 Inst.removeOperand(OldVAddrIdx);
6358
6359 if (NewVDstIn != -1) {
6360 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6361 Inst.tieOperands(NewVDst, NewVDstIn);
6362 }
6363 }
6364 }
6365
6366 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6367 VAddrDef->eraseFromParent();
6368
6369 return true;
6370}
6371
6372// FIXME: Remove this when SelectionDAG is obsoleted.
6374 MachineInstr &MI) const {
6376 return;
6377
6378 // Fix up SGPR operands in VGPRs. We only select these when the DAG divergence
6379 // analysis thinks they are uniform, so a readfirstlane should be valid.
6380 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6381 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6382 return;
6383
6385 return;
6386
6387 const TargetRegisterClass *DeclaredRC = getRegClass(
6388 MI.getDesc(), SAddr->getOperandNo(), &RI, *MI.getParent()->getParent());
6389
6390 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI, DeclaredRC);
6391 SAddr->setReg(ToSGPR);
6392}
6393
6396 const TargetRegisterClass *DstRC,
6399 const DebugLoc &DL) const {
6400 Register OpReg = Op.getReg();
6401 unsigned OpSubReg = Op.getSubReg();
6402
6403 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6404 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6405
6406 // Check if operand is already the correct register class.
6407 if (DstRC == OpRC)
6408 return;
6409
6410 Register DstReg = MRI.createVirtualRegister(DstRC);
6411 auto Copy =
6412 BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).addReg(OpReg);
6413 Op.setReg(DstReg);
6414
6415 MachineInstr *Def = MRI.getVRegDef(OpReg);
6416 if (!Def)
6417 return;
6418
6419 // Try to eliminate the copy if it is copying an immediate value.
6420 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6421 foldImmediate(*Copy, *Def, OpReg, &MRI);
6422
6423 bool ImpDef = Def->isImplicitDef();
6424 while (!ImpDef && Def && Def->isCopy()) {
6425 if (Def->getOperand(1).getReg().isPhysical())
6426 break;
6427 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6428 ImpDef = Def && Def->isImplicitDef();
6429 }
6430 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6431 !ImpDef)
6432 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6433}
6434
6435// Emit the actual waterfall loop, executing the wrapped instruction for each
6436// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6437// iteration, in the worst case we execute 64 (once per lane).
6438static void
6441 MachineBasicBlock &LoopBB,
6442 MachineBasicBlock &BodyBB,
6443 const DebugLoc &DL,
6444 ArrayRef<MachineOperand *> ScalarOps) {
6445 MachineFunction &MF = *LoopBB.getParent();
6446 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6447 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6448 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6449 unsigned SaveExecOpc =
6450 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6451 unsigned XorTermOpc =
6452 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6453 unsigned AndOpc =
6454 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6455 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6456
6458 Register CondReg;
6459
6460 for (MachineOperand *ScalarOp : ScalarOps) {
6461 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6462 unsigned NumSubRegs = RegSize / 32;
6463 Register VScalarOp = ScalarOp->getReg();
6464
6465 if (NumSubRegs == 1) {
6466 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6467
6468 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6469 .addReg(VScalarOp);
6470
6471 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6472
6473 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6474 .addReg(CurReg)
6475 .addReg(VScalarOp);
6476
6477 // Combine the comparison results with AND.
6478 if (!CondReg) // First.
6479 CondReg = NewCondReg;
6480 else { // If not the first, we create an AND.
6481 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6482 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6483 .addReg(CondReg)
6484 .addReg(NewCondReg);
6485 CondReg = AndReg;
6486 }
6487
6488 // Update ScalarOp operand to use the SGPR ScalarOp.
6489 ScalarOp->setReg(CurReg);
6490 ScalarOp->setIsKill();
6491 } else {
6492 SmallVector<Register, 8> ReadlanePieces;
6493 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6494 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6495 "Unhandled register size");
6496
6497 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6498 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6499 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6500
6501 // Read the next variant <- also loop target.
6502 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6503 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6504
6505 // Read the next variant <- also loop target.
6506 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6507 .addReg(VScalarOp, VScalarOpUndef,
6508 TRI->getSubRegFromChannel(Idx + 1));
6509
6510 ReadlanePieces.push_back(CurRegLo);
6511 ReadlanePieces.push_back(CurRegHi);
6512
6513 // Comparison is to be done as 64-bit.
6514 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6515 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6516 .addReg(CurRegLo)
6517 .addImm(AMDGPU::sub0)
6518 .addReg(CurRegHi)
6519 .addImm(AMDGPU::sub1);
6520
6521 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6522 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6523 NewCondReg)
6524 .addReg(CurReg);
6525 if (NumSubRegs <= 2)
6526 Cmp.addReg(VScalarOp);
6527 else
6528 Cmp.addReg(VScalarOp, VScalarOpUndef,
6529 TRI->getSubRegFromChannel(Idx, 2));
6530
6531 // Combine the comparison results with AND.
6532 if (!CondReg) // First.
6533 CondReg = NewCondReg;
6534 else { // If not the first, we create an AND.
6535 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6536 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6537 .addReg(CondReg)
6538 .addReg(NewCondReg);
6539 CondReg = AndReg;
6540 }
6541 } // End for loop.
6542
6543 const auto *SScalarOpRC =
6544 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6545 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6546
6547 // Build scalar ScalarOp.
6548 auto Merge =
6549 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6550 unsigned Channel = 0;
6551 for (Register Piece : ReadlanePieces) {
6552 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6553 }
6554
6555 // Update ScalarOp operand to use the SGPR ScalarOp.
6556 ScalarOp->setReg(SScalarOp);
6557 ScalarOp->setIsKill();
6558 }
6559 }
6560
6561 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6562 MRI.setSimpleHint(SaveExec, CondReg);
6563
6564 // Update EXEC to matching lanes, saving original to SaveExec.
6565 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6566 .addReg(CondReg, RegState::Kill);
6567
6568 // The original instruction is here; we insert the terminators after it.
6569 I = BodyBB.end();
6570
6571 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6572 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6573 .addReg(Exec)
6574 .addReg(SaveExec);
6575
6576 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6577}
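// For a single 32-bit scalar operand the loop built above is roughly:
//   %s    = V_READFIRSTLANE_B32 %vgpr_op
//   %cond = V_CMP_EQ_U32_e64 %s, %vgpr_op
//   %save = S_AND_SAVEEXEC_B64 %cond          ; *_B32 on wave32
//   ...                                       ; the waterfalled instruction
//   $exec = S_XOR_B64_term $exec, %save
//   SI_WATERFALL_LOOP %LoopBB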
6578
6579// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6580// with SGPRs by iterating over all unique values across all lanes.
6581// Returns the loop basic block that now contains \p MI.
6582static MachineBasicBlock *
6586 MachineBasicBlock::iterator Begin = nullptr,
6587 MachineBasicBlock::iterator End = nullptr) {
6588 MachineBasicBlock &MBB = *MI.getParent();
6589 MachineFunction &MF = *MBB.getParent();
6590 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6591 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6593 if (!Begin.isValid())
6594 Begin = &MI;
6595 if (!End.isValid()) {
6596 End = &MI;
6597 ++End;
6598 }
6599 const DebugLoc &DL = MI.getDebugLoc();
6600 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6601 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6602 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
6603
6604 // Save SCC. Waterfall Loop may overwrite SCC.
6605 Register SaveSCCReg;
6606
6607 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6608 // rather than doing an unlimited scan everywhere.
6609 bool SCCNotDead =
6610 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6611 std::numeric_limits<unsigned>::max()) !=
6613 if (SCCNotDead) {
6614 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6615 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6616 .addImm(1)
6617 .addImm(0);
6618 }
6619
6620 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6621
6622 // Save the EXEC mask
6623 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6624
6625 // Killed uses in the instruction we are waterfalling around will be
6626 // incorrect due to the added control-flow.
6628 ++AfterMI;
6629 for (auto I = Begin; I != AfterMI; I++) {
6630 for (auto &MO : I->all_uses())
6631 MRI.clearKillFlags(MO.getReg());
6632 }
6633
6634 // To insert the loop we need to split the block. Move everything after this
6635 // point to a new block, and insert a new empty block between the two.
6638 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6640 ++MBBI;
6641
6642 MF.insert(MBBI, LoopBB);
6643 MF.insert(MBBI, BodyBB);
6644 MF.insert(MBBI, RemainderBB);
6645
6646 LoopBB->addSuccessor(BodyBB);
6647 BodyBB->addSuccessor(LoopBB);
6648 BodyBB->addSuccessor(RemainderBB);
6649
6650 // Move the instructions from Begin to MI into BodyBB, and the remainder of
6651 // the block into RemainderBB.
6652 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6653 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6654 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6655
6656 MBB.addSuccessor(LoopBB);
6657
6658 // Update dominators. We know that MBB immediately dominates LoopBB, that
6659 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6660 // RemainderBB. RemainderBB immediately dominates all of the successors
6661 // transferred to it from MBB that MBB used to properly dominate.
6662 if (MDT) {
6663 MDT->addNewBlock(LoopBB, &MBB);
6664 MDT->addNewBlock(BodyBB, LoopBB);
6665 MDT->addNewBlock(RemainderBB, BodyBB);
6666 for (auto &Succ : RemainderBB->successors()) {
6667 if (MDT->properlyDominates(&MBB, Succ)) {
6668 MDT->changeImmediateDominator(Succ, RemainderBB);
6669 }
6670 }
6671 }
6672
6673 emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
6674
6675 MachineBasicBlock::iterator First = RemainderBB->begin();
6676 // Restore SCC
6677 if (SCCNotDead) {
6678 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6679 .addReg(SaveSCCReg, RegState::Kill)
6680 .addImm(0);
6681 }
6682
6683 // Restore the EXEC mask
6684 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6685 return BodyBB;
6686}
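// The resulting control flow is (sketch):
//   MBB -> LoopBB -> BodyBB -> RemainderBB
//            ^---------'
// where BodyBB branches back to LoopBB while unprocessed lanes remain, and
// RemainderBB restores EXEC (and SCC when it was live).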
6687
6688// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6689static std::tuple<unsigned, unsigned>
6691 MachineBasicBlock &MBB = *MI.getParent();
6692 MachineFunction &MF = *MBB.getParent();
6694
6695 // Extract the ptr from the resource descriptor.
6696 unsigned RsrcPtr =
6697 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6698 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6699
6700 // Create an empty resource descriptor
6701 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6702 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6703 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6704 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6705 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6706
6707 // Zero64 = 0
6708 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6709 .addImm(0);
6710
6711 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6712 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6713 .addImm(Lo_32(RsrcDataFormat));
6714
6715 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6716 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6717 .addImm(Hi_32(RsrcDataFormat));
6718
6719 // NewSRsrc = {Zero64, SRsrcFormat}
6720 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6721 .addReg(Zero64)
6722 .addImm(AMDGPU::sub0_sub1)
6723 .addReg(SRsrcFormatLo)
6724 .addImm(AMDGPU::sub2)
6725 .addReg(SRsrcFormatHi)
6726 .addImm(AMDGPU::sub3);
6727
6728 return std::tuple(RsrcPtr, NewSRsrc);
6729}
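// The replacement descriptor built above has the layout
//   NewSRsrc = { 0, 0, Lo_32(RSRC_DATA_FORMAT), Hi_32(RSRC_DATA_FORMAT) }
// i.e. a null base pointer plus the default data format, while the original
// 64-bit pointer is returned separately in RsrcPtr.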
6730
6733 MachineDominatorTree *MDT) const {
6734 MachineFunction &MF = *MI.getParent()->getParent();
6736 MachineBasicBlock *CreatedBB = nullptr;
6737
6738 // Legalize VOP2
6739 if (isVOP2(MI) || isVOPC(MI)) {
6741 return CreatedBB;
6742 }
6743
6744 // Legalize VOP3
6745 if (isVOP3(MI)) {
6747 return CreatedBB;
6748 }
6749
6750 // Legalize SMRD
6751 if (isSMRD(MI)) {
6753 return CreatedBB;
6754 }
6755
6756 // Legalize FLAT
6757 if (isFLAT(MI)) {
6759 return CreatedBB;
6760 }
6761
6762 // Legalize REG_SEQUENCE and PHI
6763 // The register class of the operands must be the same type as the register
6764 // class of the output.
6765 if (MI.getOpcode() == AMDGPU::PHI) {
6766 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6767 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6768 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6769 continue;
6770 const TargetRegisterClass *OpRC =
6771 MRI.getRegClass(MI.getOperand(i).getReg());
6772 if (RI.hasVectorRegisters(OpRC)) {
6773 VRC = OpRC;
6774 } else {
6775 SRC = OpRC;
6776 }
6777 }
6778
6779 // If any of the operands are VGPR registers, then they must all be VGPRs;
6780 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6781 // them.
6782 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6783 if (!VRC) {
6784 assert(SRC);
6785 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6786 VRC = &AMDGPU::VReg_1RegClass;
6787 } else
6788 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6789 ? RI.getEquivalentAGPRClass(SRC)
6790 : RI.getEquivalentVGPRClass(SRC);
6791 } else {
6792 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6793 ? RI.getEquivalentAGPRClass(VRC)
6794 : RI.getEquivalentVGPRClass(VRC);
6795 }
6796 RC = VRC;
6797 } else {
6798 RC = SRC;
6799 }
6800
6801 // Update all the operands so they have the same type.
6802 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6803 MachineOperand &Op = MI.getOperand(I);
6804 if (!Op.isReg() || !Op.getReg().isVirtual())
6805 continue;
6806
6807 // MI is a PHI instruction.
6808 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6810
6811 // Avoid creating no-op copies with the same src and dst reg class. These
6812 // confuse some of the machine passes.
6813 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6814 }
6815 }
6816
6817 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6818 // VGPR dest type and SGPR sources, insert copies so all operands are
6819 // VGPRs. This seems to help operand folding / the register coalescer.
6820 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6821 MachineBasicBlock *MBB = MI.getParent();
6822 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6823 if (RI.hasVGPRs(DstRC)) {
6824 // Update all the operands so they are VGPR register classes. These may
6825 // not be the same register class because REG_SEQUENCE supports mixing
6826 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6827 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6828 MachineOperand &Op = MI.getOperand(I);
6829 if (!Op.isReg() || !Op.getReg().isVirtual())
6830 continue;
6831
6832 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6833 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6834 if (VRC == OpRC)
6835 continue;
6836
6837 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6838 Op.setIsKill();
6839 }
6840 }
6841
6842 return CreatedBB;
6843 }
6844
6845 // Legalize INSERT_SUBREG
6846 // src0 must have the same register class as dst
6847 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6848 Register Dst = MI.getOperand(0).getReg();
6849 Register Src0 = MI.getOperand(1).getReg();
6850 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6851 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6852 if (DstRC != Src0RC) {
6853 MachineBasicBlock *MBB = MI.getParent();
6854 MachineOperand &Op = MI.getOperand(1);
6855 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6856 }
6857 return CreatedBB;
6858 }
6859
6860 // Legalize SI_INIT_M0
6861 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6862 MachineOperand &Src = MI.getOperand(0);
6863 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6864 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6865 return CreatedBB;
6866 }
6867
6868 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6869 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6870 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6871 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6872 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6873 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
6874 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
6875 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
6876 MachineOperand &Src = MI.getOperand(1);
6877 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6878 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6879 return CreatedBB;
6880 }
6881
6882 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6883 //
6884 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6885 // scratch memory access. In both cases, the legalization never involves
6886 // conversion to the addr64 form.
6887 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
6888 (isMUBUF(MI) || isMTBUF(MI)))) {
6889 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6890 : AMDGPU::OpName::srsrc;
6891 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6892 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6893 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6894
6895 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6896 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6897 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6898 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6899
6900 return CreatedBB;
6901 }
6902
6903 // Legalize SI_CALL
6904 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6905 MachineOperand *Dest = &MI.getOperand(0);
6906 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6907 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the
6908 // following copies into the waterfall loop; copies from and to physical
6909 // registers also need to move into the loop block.
6910 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6911 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6912
6913 // Also move the copies to physical registers into the loop block
6914 MachineBasicBlock &MBB = *MI.getParent();
6915 MachineBasicBlock::iterator Start(&MI);
6916 while (Start->getOpcode() != FrameSetupOpcode)
6917 --Start;
6918 MachineBasicBlock::iterator End(&MI);
6919 while (End->getOpcode() != FrameDestroyOpcode)
6920 ++End;
6921 // Also include following copies of the return value
6922 ++End;
6923 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6924 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6925 ++End;
6926 CreatedBB =
6927 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6928 }
6929 }
6930
6931 // Legalize s_sleep_var.
6932 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6933 const DebugLoc &DL = MI.getDebugLoc();
6934 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6935 int Src0Idx =
6936 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6937 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6938 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6939 .add(Src0);
6940 Src0.ChangeToRegister(Reg, false);
6941 return nullptr;
6942 }
6943
6944 // Legalize MUBUF instructions.
6945 bool isSoffsetLegal = true;
6946 int SoffsetIdx =
6947 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6948 if (SoffsetIdx != -1) {
6949 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6950 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6951 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6952 isSoffsetLegal = false;
6953 }
6954 }
6955
6956 bool isRsrcLegal = true;
6957 int RsrcIdx =
6958 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6959 if (RsrcIdx != -1) {
6960 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6961 if (Rsrc->isReg() && !RI.isSGPRReg(MRI, Rsrc->getReg()))
6962 isRsrcLegal = false;
6963 }
6964
6965 // The operands are legal.
6966 if (isRsrcLegal && isSoffsetLegal)
6967 return CreatedBB;
6968
6969 if (!isRsrcLegal) {
6970 // Legalize a VGPR Rsrc
6971 //
6972 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6973 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6974 // a zero-value SRsrc.
6975 //
6976 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6977 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6978 // above.
6979 //
6980 // Otherwise we are on non-ADDR64 hardware, and/or we have
6981 // idxen/offen/bothen and we fall back to a waterfall loop.
6982
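// Illustrative sketch of the _ADDR64 path handled first below (register
// names are made up, not from the source): extractRsrcPtr() yields the
// 64-bit base RsrcPtr plus a scalar NewSRsrc with a zero-value base, then
//   newvaddr.lo = RsrcPtr.sub0 + vaddr.sub0   // V_ADD_CO_U32
//   newvaddr.hi = RsrcPtr.sub1 + vaddr.sub1   // V_ADDC_U32
// and the instruction keeps NewSRsrc instead of the VGPR rsrc, so no
// waterfall loop is required.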
6983 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6984 MachineBasicBlock &MBB = *MI.getParent();
6985
6986 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6987 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6988 // This is already an ADDR64 instruction so we need to add the pointer
6989 // extracted from the resource descriptor to the current value of VAddr.
6990 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6991 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6992 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6993
6994 const auto *BoolXExecRC = RI.getWaveMaskRegClass();
6995 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6996 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6997
6998 unsigned RsrcPtr, NewSRsrc;
6999 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7000
7001 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
7002 const DebugLoc &DL = MI.getDebugLoc();
7003 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
7004 .addDef(CondReg0)
7005 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7006 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
7007 .addImm(0);
7008
7009 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
7010 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
7011 .addDef(CondReg1, RegState::Dead)
7012 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7013 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
7014 .addReg(CondReg0, RegState::Kill)
7015 .addImm(0);
7016
7017 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7018 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
7019 .addReg(NewVAddrLo)
7020 .addImm(AMDGPU::sub0)
7021 .addReg(NewVAddrHi)
7022 .addImm(AMDGPU::sub1);
7023
7024 VAddr->setReg(NewVAddr);
7025 Rsrc->setReg(NewSRsrc);
7026 } else if (!VAddr && ST.hasAddr64()) {
7027 // This instruction is the _OFFSET variant, so we need to convert it to
7028 // ADDR64.
7030 "FIXME: Need to emit flat atomics here");
7031
7032 unsigned RsrcPtr, NewSRsrc;
7033 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
7034
7035 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7036 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
7037 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
7038 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7039 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
7040
7041 // Atomics with return have an additional tied operand and are
7042 // missing some of the special bits.
7043 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
7044 MachineInstr *Addr64;
7045
7046 if (!VDataIn) {
7047 // Regular buffer load / store.
7048 MachineInstrBuilder MIB =
7049 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7050 .add(*VData)
7051 .addReg(NewVAddr)
7052 .addReg(NewSRsrc)
7053 .add(*SOffset)
7054 .add(*Offset);
7055
7056 if (const MachineOperand *CPol =
7057 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
7058 MIB.addImm(CPol->getImm());
7059 }
7060
7061 if (const MachineOperand *TFE =
7062 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
7063 MIB.addImm(TFE->getImm());
7064 }
7065
7066 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
7067
7068 MIB.cloneMemRefs(MI);
7069 Addr64 = MIB;
7070 } else {
7071 // Atomics with return.
7072 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
7073 .add(*VData)
7074 .add(*VDataIn)
7075 .addReg(NewVAddr)
7076 .addReg(NewSRsrc)
7077 .add(*SOffset)
7078 .add(*Offset)
7079 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
7080 .cloneMemRefs(MI);
7081 }
7082
7083 MI.removeFromParent();
7084
7085 // NewVaddr = {NewVaddrHi, NewVaddrLo}
7086 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
7087 NewVAddr)
7088 .addReg(RsrcPtr, 0, AMDGPU::sub0)
7089 .addImm(AMDGPU::sub0)
7090 .addReg(RsrcPtr, 0, AMDGPU::sub1)
7091 .addImm(AMDGPU::sub1);
7092 } else {
7093 // Legalize a VGPR Rsrc and soffset together.
7094 if (!isSoffsetLegal) {
7095 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7096 CreatedBB =
7097 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
7098 return CreatedBB;
7099 }
7100 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
7101 return CreatedBB;
7102 }
7103 }
7104
7105 // Legalize a VGPR soffset.
7106 if (!isSoffsetLegal) {
7107 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
7108 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
7109 return CreatedBB;
7110 }
7111 return CreatedBB;
7112}
7113
7114void SIInstrWorklist::insert(MachineInstr *MI) {
7115 InstrList.insert(MI);
7116 // Add MBUF instructions to the deferred list.
7117 int RsrcIdx =
7118 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
7119 if (RsrcIdx != -1) {
7120 DeferredList.insert(MI);
7121 }
7122}
7123
7124bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
7125 return DeferredList.contains(MI);
7126}
7127
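// moveToVALU drains the main worklist first; MBUF instructions recorded in
// the deferred list by SIInstrWorklist::insert above are skipped on that pass
// and only processed once the worklist is empty, so their rsrc/soffset
// legalization runs after the surrounding SALU code has been rewritten.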
7128void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
7129 MachineDominatorTree *MDT) const {
7130
7131 while (!Worklist.empty()) {
7132 MachineInstr &Inst = *Worklist.top();
7133 Worklist.erase_top();
7134 // Skip MachineInstr in the deferred list.
7135 if (Worklist.isDeferred(&Inst))
7136 continue;
7137 moveToVALUImpl(Worklist, MDT, Inst);
7138 }
7139
7140 // Deferred list of instructions will be processed once
7141 // all the MachineInstr in the worklist are done.
7142 for (MachineInstr *Inst : Worklist.getDeferredList()) {
7143 moveToVALUImpl(Worklist, MDT, *Inst);
7144 assert(Worklist.empty() &&
7145 "Deferred MachineInstr are not supposed to re-populate worklist");
7146 }
7147}
7148
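// Schematic example of the common path through moveToVALUImpl (virtual
// register numbers are illustrative only):
//   %2:sgpr_32 = S_AND_B32 %0, %1, implicit-def $scc
// becomes
//   %3:vgpr_32 = V_AND_B32_e64 %0, %1
// with %3 replacing %2 everywhere and the users of %3 pushed onto the
// worklist so that they can be moved to the VALU as well.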
7149void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
7150 MachineDominatorTree *MDT,
7151 MachineInstr &Inst) const {
7152
7153 MachineBasicBlock *MBB = Inst.getParent();
7154 if (!MBB)
7155 return;
7156 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
7157 unsigned Opcode = Inst.getOpcode();
7158 unsigned NewOpcode = getVALUOp(Inst);
7159 // Handle some special cases
7160 switch (Opcode) {
7161 default:
7162 break;
7163 case AMDGPU::S_ADD_U64_PSEUDO:
7164 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
7165 break;
7166 case AMDGPU::S_SUB_U64_PSEUDO:
7167 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
7168 break;
7169 case AMDGPU::S_ADD_I32:
7170 case AMDGPU::S_SUB_I32: {
7171 // FIXME: The u32 versions currently selected use the carry.
7172 bool Changed;
7173 MachineBasicBlock *CreatedBBTmp = nullptr;
7174 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7175 if (Changed)
7176 return;
7177
7178 // Default handling
7179 break;
7180 }
7181
7182 case AMDGPU::S_MUL_U64:
7183 // Split s_mul_u64 into 32-bit vector multiplications.
7184 splitScalarSMulU64(Worklist, Inst, MDT);
7185 Inst.eraseFromParent();
7186 return;
7187
7188 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7189 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7190 // This is a special case of s_mul_u64 where all the operands are either
7191 // zero extended or sign extended.
7192 splitScalarSMulPseudo(Worklist, Inst, MDT);
7193 Inst.eraseFromParent();
7194 return;
7195
7196 case AMDGPU::S_AND_B64:
7197 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7198 Inst.eraseFromParent();
7199 return;
7200
7201 case AMDGPU::S_OR_B64:
7202 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7203 Inst.eraseFromParent();
7204 return;
7205
7206 case AMDGPU::S_XOR_B64:
7207 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7208 Inst.eraseFromParent();
7209 return;
7210
7211 case AMDGPU::S_NAND_B64:
7212 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7213 Inst.eraseFromParent();
7214 return;
7215
7216 case AMDGPU::S_NOR_B64:
7217 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7218 Inst.eraseFromParent();
7219 return;
7220
7221 case AMDGPU::S_XNOR_B64:
7222 if (ST.hasDLInsts())
7223 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7224 else
7225 splitScalar64BitXnor(Worklist, Inst, MDT);
7226 Inst.eraseFromParent();
7227 return;
7228
7229 case AMDGPU::S_ANDN2_B64:
7230 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7231 Inst.eraseFromParent();
7232 return;
7233
7234 case AMDGPU::S_ORN2_B64:
7235 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7236 Inst.eraseFromParent();
7237 return;
7238
7239 case AMDGPU::S_BREV_B64:
7240 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7241 Inst.eraseFromParent();
7242 return;
7243
7244 case AMDGPU::S_NOT_B64:
7245 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7246 Inst.eraseFromParent();
7247 return;
7248
7249 case AMDGPU::S_BCNT1_I32_B64:
7250 splitScalar64BitBCNT(Worklist, Inst);
7251 Inst.eraseFromParent();
7252 return;
7253
7254 case AMDGPU::S_BFE_I64:
7255 splitScalar64BitBFE(Worklist, Inst);
7256 Inst.eraseFromParent();
7257 return;
7258
7259 case AMDGPU::S_FLBIT_I32_B64:
7260 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7261 Inst.eraseFromParent();
7262 return;
7263 case AMDGPU::S_FF1_I32_B64:
7264 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7265 Inst.eraseFromParent();
7266 return;
7267
7268 case AMDGPU::S_LSHL_B32:
7269 if (ST.hasOnlyRevVALUShifts()) {
7270 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7271 swapOperands(Inst);
7272 }
7273 break;
7274 case AMDGPU::S_ASHR_I32:
7275 if (ST.hasOnlyRevVALUShifts()) {
7276 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7277 swapOperands(Inst);
7278 }
7279 break;
7280 case AMDGPU::S_LSHR_B32:
7281 if (ST.hasOnlyRevVALUShifts()) {
7282 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7283 swapOperands(Inst);
7284 }
7285 break;
7286 case AMDGPU::S_LSHL_B64:
7287 if (ST.hasOnlyRevVALUShifts()) {
7288 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7289 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7290 : AMDGPU::V_LSHLREV_B64_e64;
7291 swapOperands(Inst);
7292 }
7293 break;
7294 case AMDGPU::S_ASHR_I64:
7295 if (ST.hasOnlyRevVALUShifts()) {
7296 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7297 swapOperands(Inst);
7298 }
7299 break;
7300 case AMDGPU::S_LSHR_B64:
7301 if (ST.hasOnlyRevVALUShifts()) {
7302 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7303 swapOperands(Inst);
7304 }
7305 break;
7306
7307 case AMDGPU::S_ABS_I32:
7308 lowerScalarAbs(Worklist, Inst);
7309 Inst.eraseFromParent();
7310 return;
7311
7312 case AMDGPU::S_CBRANCH_SCC0:
7313 case AMDGPU::S_CBRANCH_SCC1: {
7314 // Clear unused bits of vcc
7315 Register CondReg = Inst.getOperand(1).getReg();
7316 bool IsSCC = CondReg == AMDGPU::SCC;
7317 Register VCC = RI.getVCC();
7318 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7319 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7320 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7321 .addReg(EXEC)
7322 .addReg(IsSCC ? VCC : CondReg);
7323 Inst.removeOperand(1);
7324 } break;
7325
7326 case AMDGPU::S_BFE_U64:
7327 case AMDGPU::S_BFM_B64:
7328 llvm_unreachable("Moving this op to VALU not implemented");
7329
7330 case AMDGPU::S_PACK_LL_B32_B16:
7331 case AMDGPU::S_PACK_LH_B32_B16:
7332 case AMDGPU::S_PACK_HL_B32_B16:
7333 case AMDGPU::S_PACK_HH_B32_B16:
7334 movePackToVALU(Worklist, MRI, Inst);
7335 Inst.eraseFromParent();
7336 return;
7337
7338 case AMDGPU::S_XNOR_B32:
7339 lowerScalarXnor(Worklist, Inst);
7340 Inst.eraseFromParent();
7341 return;
7342
7343 case AMDGPU::S_NAND_B32:
7344 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7345 Inst.eraseFromParent();
7346 return;
7347
7348 case AMDGPU::S_NOR_B32:
7349 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7350 Inst.eraseFromParent();
7351 return;
7352
7353 case AMDGPU::S_ANDN2_B32:
7354 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7355 Inst.eraseFromParent();
7356 return;
7357
7358 case AMDGPU::S_ORN2_B32:
7359 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7360 Inst.eraseFromParent();
7361 return;
7362
7363 // TODO: remove as soon as everything is ready
7364 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7365 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7366 // can only be selected from the uniform SDNode.
7367 case AMDGPU::S_ADD_CO_PSEUDO:
7368 case AMDGPU::S_SUB_CO_PSEUDO: {
7369 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7370 ? AMDGPU::V_ADDC_U32_e64
7371 : AMDGPU::V_SUBB_U32_e64;
7372 const auto *CarryRC = RI.getWaveMaskRegClass();
7373
7374 Register CarryInReg = Inst.getOperand(4).getReg();
7375 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7376 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7377 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7378 .addReg(CarryInReg);
7379 }
7380
7381 Register CarryOutReg = Inst.getOperand(1).getReg();
7382
7383 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7384 MRI.getRegClass(Inst.getOperand(0).getReg())));
7385 MachineInstr *CarryOp =
7386 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7387 .addReg(CarryOutReg, RegState::Define)
7388 .add(Inst.getOperand(2))
7389 .add(Inst.getOperand(3))
7390 .addReg(CarryInReg)
7391 .addImm(0);
7392 legalizeOperands(*CarryOp);
7393 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7394 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7395 Inst.eraseFromParent();
7396 }
7397 return;
7398 case AMDGPU::S_UADDO_PSEUDO:
7399 case AMDGPU::S_USUBO_PSEUDO: {
7400 const DebugLoc &DL = Inst.getDebugLoc();
7401 MachineOperand &Dest0 = Inst.getOperand(0);
7402 MachineOperand &Dest1 = Inst.getOperand(1);
7403 MachineOperand &Src0 = Inst.getOperand(2);
7404 MachineOperand &Src1 = Inst.getOperand(3);
7405
7406 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7407 ? AMDGPU::V_ADD_CO_U32_e64
7408 : AMDGPU::V_SUB_CO_U32_e64;
7409 const TargetRegisterClass *NewRC =
7410 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7411 Register DestReg = MRI.createVirtualRegister(NewRC);
7412 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7413 .addReg(Dest1.getReg(), RegState::Define)
7414 .add(Src0)
7415 .add(Src1)
7416 .addImm(0); // clamp bit
7417
7418 legalizeOperands(*NewInstr, MDT);
7419 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7420 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7421 Worklist);
7422 Inst.eraseFromParent();
7423 }
7424 return;
7425
7426 case AMDGPU::S_CSELECT_B32:
7427 case AMDGPU::S_CSELECT_B64:
7428 lowerSelect(Worklist, Inst, MDT);
7429 Inst.eraseFromParent();
7430 return;
7431 case AMDGPU::S_CMP_EQ_I32:
7432 case AMDGPU::S_CMP_LG_I32:
7433 case AMDGPU::S_CMP_GT_I32:
7434 case AMDGPU::S_CMP_GE_I32:
7435 case AMDGPU::S_CMP_LT_I32:
7436 case AMDGPU::S_CMP_LE_I32:
7437 case AMDGPU::S_CMP_EQ_U32:
7438 case AMDGPU::S_CMP_LG_U32:
7439 case AMDGPU::S_CMP_GT_U32:
7440 case AMDGPU::S_CMP_GE_U32:
7441 case AMDGPU::S_CMP_LT_U32:
7442 case AMDGPU::S_CMP_LE_U32:
7443 case AMDGPU::S_CMP_EQ_U64:
7444 case AMDGPU::S_CMP_LG_U64:
7445 case AMDGPU::S_CMP_LT_F32:
7446 case AMDGPU::S_CMP_EQ_F32:
7447 case AMDGPU::S_CMP_LE_F32:
7448 case AMDGPU::S_CMP_GT_F32:
7449 case AMDGPU::S_CMP_LG_F32:
7450 case AMDGPU::S_CMP_GE_F32:
7451 case AMDGPU::S_CMP_O_F32:
7452 case AMDGPU::S_CMP_U_F32:
7453 case AMDGPU::S_CMP_NGE_F32:
7454 case AMDGPU::S_CMP_NLG_F32:
7455 case AMDGPU::S_CMP_NGT_F32:
7456 case AMDGPU::S_CMP_NLE_F32:
7457 case AMDGPU::S_CMP_NEQ_F32:
7458 case AMDGPU::S_CMP_NLT_F32: {
7459 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7460 auto NewInstr =
7461 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7462 .setMIFlags(Inst.getFlags());
7463 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0_modifiers) >=
7464 0) {
7465 NewInstr
7466 .addImm(0) // src0_modifiers
7467 .add(Inst.getOperand(0)) // src0
7468 .addImm(0) // src1_modifiers
7469 .add(Inst.getOperand(1)) // src1
7470 .addImm(0); // clamp
7471 } else {
7472 NewInstr.add(Inst.getOperand(0)).add(Inst.getOperand(1));
7473 }
7474 legalizeOperands(*NewInstr, MDT);
7475 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7476 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7477 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7478 Inst.eraseFromParent();
7479 return;
7480 }
7481 case AMDGPU::S_CMP_LT_F16:
7482 case AMDGPU::S_CMP_EQ_F16:
7483 case AMDGPU::S_CMP_LE_F16:
7484 case AMDGPU::S_CMP_GT_F16:
7485 case AMDGPU::S_CMP_LG_F16:
7486 case AMDGPU::S_CMP_GE_F16:
7487 case AMDGPU::S_CMP_O_F16:
7488 case AMDGPU::S_CMP_U_F16:
7489 case AMDGPU::S_CMP_NGE_F16:
7490 case AMDGPU::S_CMP_NLG_F16:
7491 case AMDGPU::S_CMP_NGT_F16:
7492 case AMDGPU::S_CMP_NLE_F16:
7493 case AMDGPU::S_CMP_NEQ_F16:
7494 case AMDGPU::S_CMP_NLT_F16: {
7495 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7496 auto NewInstr =
7497 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7498 .setMIFlags(Inst.getFlags());
7499 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0_modifiers)) {
7500 NewInstr
7501 .addImm(0) // src0_modifiers
7502 .add(Inst.getOperand(0)) // src0
7503 .addImm(0) // src1_modifiers
7504 .add(Inst.getOperand(1)) // src1
7505 .addImm(0); // clamp
7506 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::op_sel))
7507 NewInstr.addImm(0); // op_sel0
7508 } else {
7509 NewInstr
7510 .add(Inst.getOperand(0))
7511 .add(Inst.getOperand(1));
7512 }
7513 legalizeOperands(*NewInstr, MDT);
7514 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7515 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7516 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7517 Inst.eraseFromParent();
7518 return;
7519 }
7520 case AMDGPU::S_CVT_HI_F32_F16: {
7521 const DebugLoc &DL = Inst.getDebugLoc();
7522 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7523 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7524 if (ST.useRealTrue16Insts()) {
7525 BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7526 .add(Inst.getOperand(1));
7527 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7528 .addImm(0) // src0_modifiers
7529 .addReg(TmpReg, 0, AMDGPU::hi16)
7530 .addImm(0) // clamp
7531 .addImm(0) // omod
7532 .addImm(0); // op_sel0
7533 } else {
7534 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7535 .addImm(16)
7536 .add(Inst.getOperand(1));
7537 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7538 .addImm(0) // src0_modifiers
7539 .addReg(TmpReg)
7540 .addImm(0) // clamp
7541 .addImm(0); // omod
7542 }
7543
7544 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7545 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7546 Inst.eraseFromParent();
7547 return;
7548 }
7549 case AMDGPU::S_MINIMUM_F32:
7550 case AMDGPU::S_MAXIMUM_F32:
7551 case AMDGPU::S_MINIMUM_F16:
7552 case AMDGPU::S_MAXIMUM_F16: {
7553 const DebugLoc &DL = Inst.getDebugLoc();
7554 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7555 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7556 .addImm(0) // src0_modifiers
7557 .add(Inst.getOperand(1))
7558 .addImm(0) // src1_modifiers
7559 .add(Inst.getOperand(2))
7560 .addImm(0) // clamp
7561 .addImm(0); // omod
7562 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7563
7564 legalizeOperands(*NewInstr, MDT);
7565 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7566 Inst.eraseFromParent();
7567 return;
7568 }
7569 }
7570
7571 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7572 // We cannot move this instruction to the VALU, so we should try to
7573 // legalize its operands instead.
7574 legalizeOperands(Inst, MDT);
7575 return;
7576 }
7577 // Handle converting generic instructions like COPY-to-SGPR into
7578 // COPY-to-VGPR.
7579 if (NewOpcode == Opcode) {
7580 Register DstReg = Inst.getOperand(0).getReg();
7581 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7582
7583 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7584 // hope for the best.
7585 if (Inst.isCopy() && DstReg.isPhysical() &&
7586 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7587 // TODO: Only works for 32 bit registers.
7588 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7589 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7590 .add(Inst.getOperand(1));
7591 Inst.eraseFromParent();
7592 return;
7593 }
7594
7595 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7596 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7597 // Instead of creating a copy where src and dst are the same register
7598 // class, we just replace all uses of dst with src. These kinds of
7599 // copies interfere with the heuristics MachineSink uses to decide
7600 // whether or not to split a critical edge, since the pass assumes
7601 // that copies will end up as machine instructions and not be
7602 // eliminated.
7603 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7604 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7605 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7606 Inst.getOperand(0).setReg(DstReg);
7607 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7608 // these are deleted later, but at -O0 it would leave a suspicious
7609 // looking illegal copy of an undef register.
7610 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7611 Inst.removeOperand(I);
7612 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7613 return;
7614 }
7615 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7616 MRI.replaceRegWith(DstReg, NewDstReg);
7617 legalizeOperands(Inst, MDT);
7618 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7619 return;
7620 }
7621
7622 // Use the new VALU Opcode.
7623 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7624 .setMIFlags(Inst.getFlags());
7625 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7626 // Intersperse VOP3 modifiers among the SALU operands.
7627 NewInstr->addOperand(Inst.getOperand(0));
7628 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7629 AMDGPU::OpName::src0_modifiers) >= 0)
7630 NewInstr.addImm(0);
7631 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7632 MachineOperand Src = Inst.getOperand(1);
7633 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7634 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7635 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7636 else
7637 NewInstr->addOperand(Src);
7638 }
7639
7640 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7641 // We are converting these to a BFE, so we need to add the missing
7642 // operands for the size and offset.
7643 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7644 NewInstr.addImm(0);
7645 NewInstr.addImm(Size);
7646 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7647 // The VALU version adds the second operand to the result, so insert an
7648 // extra 0 operand.
7649 NewInstr.addImm(0);
7650 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7651 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7652 // If we need to move this to VGPRs, we need to unpack the second
7653 // operand back into the 2 separate ones for bit offset and width.
7654 assert(OffsetWidthOp.isImm() &&
7655 "Scalar BFE is only implemented for constant width and offset");
7656 uint32_t Imm = OffsetWidthOp.getImm();
7657
7658 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7659 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
7660 NewInstr.addImm(Offset);
7661 NewInstr.addImm(BitWidth);
7662 } else {
7663 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7664 AMDGPU::OpName::src1_modifiers) >= 0)
7665 NewInstr.addImm(0);
7666 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7667 NewInstr->addOperand(Inst.getOperand(2));
7668 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7669 AMDGPU::OpName::src2_modifiers) >= 0)
7670 NewInstr.addImm(0);
7671 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7672 NewInstr->addOperand(Inst.getOperand(3));
7673 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7674 NewInstr.addImm(0);
7675 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7676 NewInstr.addImm(0);
7677 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7678 NewInstr.addImm(0);
7679 }
7680 } else {
7681 // Just copy the SALU operands.
7682 for (const MachineOperand &Op : Inst.explicit_operands())
7683 NewInstr->addOperand(Op);
7684 }
7685
7686 // Remove any references to SCC. Vector instructions can't read from it, and
7687 // we're just about to add the implicit use / defs of VCC, and we don't want
7688 // both.
7689 for (MachineOperand &Op : Inst.implicit_operands()) {
7690 if (Op.getReg() == AMDGPU::SCC) {
7691 // Only propagate through live-def of SCC.
7692 if (Op.isDef() && !Op.isDead())
7693 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7694 if (Op.isUse())
7695 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7696 }
7697 }
7698 Inst.eraseFromParent();
7699 Register NewDstReg;
7700 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7701 Register DstReg = NewInstr->getOperand(0).getReg();
7702 assert(DstReg.isVirtual());
7703 // Update the destination register class.
7704 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7705 assert(NewDstRC);
7706 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7707 MRI.replaceRegWith(DstReg, NewDstReg);
7708 }
7709 fixImplicitOperands(*NewInstr);
7710 // Legalize the operands
7711 legalizeOperands(*NewInstr, MDT);
7712 if (NewDstReg)
7713 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7714}
7715
7716// Add/sub require special handling to deal with carry outs.
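// On subtargets with hasAddNoCarry() this is a simple in-place rewrite,
// roughly (illustrative only):
//   %2:sgpr_32 = S_ADD_I32 %0, %1, implicit-def $scc
// becomes
//   %3:vgpr_32 = V_ADD_U32_e64 %0, %1, 0 (clamp)
// assuming SCC has no users. Without add-no-carry the helper returns false
// and the default handling in moveToVALUImpl applies.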
7717std::pair<bool, MachineBasicBlock *>
7718SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7719 MachineDominatorTree *MDT) const {
7720 if (ST.hasAddNoCarry()) {
7721 // Assume there is no user of scc since we don't select this in that case.
7722 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7723 // is used.
7724
7725 MachineBasicBlock &MBB = *Inst.getParent();
7726 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7727
7728 Register OldDstReg = Inst.getOperand(0).getReg();
7729 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7730
7731 unsigned Opc = Inst.getOpcode();
7732 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7733
7734 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7735 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7736
7737 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7738 Inst.removeOperand(3);
7739
7740 Inst.setDesc(get(NewOpc));
7741 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7742 Inst.addImplicitDefUseOperands(*MBB.getParent());
7743 MRI.replaceRegWith(OldDstReg, ResultReg);
7744 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7745
7746 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7747 return std::pair(true, NewBB);
7748 }
7749
7750 return std::pair(false, nullptr);
7751}
7752
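// Schematic example for lowerSelect (register names illustrative only):
//   %3:sgpr_32 = S_CSELECT_B32 %true, %false, implicit $scc
// becomes
//   %c = wave-mask copy/select of the SCC condition
//   %4:vgpr_32 = V_CNDMASK_B32_e64 0, %false, 0, %true, %c
// For the trivial "select cond, -1, 0" case where the condition is not SCC,
// no V_CNDMASK is built and the condition register is used directly.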
7753void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7754 MachineDominatorTree *MDT) const {
7755
7756 MachineBasicBlock &MBB = *Inst.getParent();
7757 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7758 MachineBasicBlock::iterator MII = Inst;
7759 DebugLoc DL = Inst.getDebugLoc();
7760
7761 MachineOperand &Dest = Inst.getOperand(0);
7762 MachineOperand &Src0 = Inst.getOperand(1);
7763 MachineOperand &Src1 = Inst.getOperand(2);
7764 MachineOperand &Cond = Inst.getOperand(3);
7765
7766 Register CondReg = Cond.getReg();
7767 bool IsSCC = (CondReg == AMDGPU::SCC);
7768
7769 // If this is a trivial select where the condition is effectively not SCC
7770 // (CondReg is a source of copy to SCC), then the select is semantically
7771 // equivalent to copying CondReg. Hence, there is no need to create
7772 // V_CNDMASK, we can just use that and bail out.
7773 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7774 (Src1.getImm() == 0)) {
7775 MRI.replaceRegWith(Dest.getReg(), CondReg);
7776 return;
7777 }
7778
7779 Register NewCondReg = CondReg;
7780 if (IsSCC) {
7781 const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
7782 NewCondReg = MRI.createVirtualRegister(TC);
7783
7784 // Now look for the closest SCC def if it is a copy
7785 // replacing the CondReg with the COPY source register
7786 bool CopyFound = false;
7787 for (MachineInstr &CandI :
7788 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7789 Inst.getParent()->rend())) {
7790 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7791 -1) {
7792 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7793 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7794 .addReg(CandI.getOperand(1).getReg());
7795 CopyFound = true;
7796 }
7797 break;
7798 }
7799 }
7800 if (!CopyFound) {
7801 // SCC def is not a copy
7802 // Insert a trivial select instead of creating a copy, because a copy from
7803 // SCC would semantically mean just copying a single bit, but we may need
7804 // the result to be a vector condition mask that needs preserving.
7805 unsigned Opcode =
7806 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
7807 auto NewSelect =
7808 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7809 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7810 }
7811 }
7812
7813 Register NewDestReg = MRI.createVirtualRegister(
7814 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7815 MachineInstr *NewInst;
7816 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7817 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7818 .addImm(0)
7819 .add(Src1) // False
7820 .addImm(0)
7821 .add(Src0) // True
7822 .addReg(NewCondReg);
7823 } else {
7824 NewInst =
7825 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7826 .add(Src1) // False
7827 .add(Src0) // True
7828 .addReg(NewCondReg);
7829 }
7830 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7831 legalizeOperands(*NewInst, MDT);
7832 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7833}
7834
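// lowerScalarAbs implements abs(x) as max(x, 0 - x) on the VALU, roughly
// (illustrative only):
//   %tmp:vgpr_32 = V_SUB_U32 0, %x
//   %res:vgpr_32 = V_MAX_I32_e64 %x, %tmp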
7835void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7836 MachineInstr &Inst) const {
7837 MachineBasicBlock &MBB = *Inst.getParent();
7838 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7839 MachineBasicBlock::iterator MII = Inst;
7840 DebugLoc DL = Inst.getDebugLoc();
7841
7842 MachineOperand &Dest = Inst.getOperand(0);
7843 MachineOperand &Src = Inst.getOperand(1);
7844 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7845 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7846
7847 unsigned SubOp = ST.hasAddNoCarry() ?
7848 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7849
7850 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7851 .addImm(0)
7852 .addReg(Src.getReg());
7853
7854 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7855 .addReg(Src.getReg())
7856 .addReg(TmpReg);
7857
7858 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7859 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7860}
7861
7862void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7863 MachineInstr &Inst) const {
7864 MachineBasicBlock &MBB = *Inst.getParent();
7865 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7866 MachineBasicBlock::iterator MII = Inst;
7867 const DebugLoc &DL = Inst.getDebugLoc();
7868
7869 MachineOperand &Dest = Inst.getOperand(0);
7870 MachineOperand &Src0 = Inst.getOperand(1);
7871 MachineOperand &Src1 = Inst.getOperand(2);
7872
7873 if (ST.hasDLInsts()) {
7874 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7875 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7876 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7877
7878 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7879 .add(Src0)
7880 .add(Src1);
7881
7882 MRI.replaceRegWith(Dest.getReg(), NewDest);
7883 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7884 } else {
7885 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7886 // invert either source and then perform the XOR. If either source is a
7887 // scalar register, then we can leave the inversion on the scalar unit to
7888 // achieve a better distribution of scalar and vector instructions.
7889 bool Src0IsSGPR = Src0.isReg() &&
7890 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7891 bool Src1IsSGPR = Src1.isReg() &&
7892 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7893 MachineInstr *Xor;
7894 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7895 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7896
7897 // Build a pair of scalar instructions and add them to the work list.
7898 // The next iteration over the work list will lower these to the vector
7899 // unit as necessary.
7900 if (Src0IsSGPR) {
7901 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7902 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7903 .addReg(Temp)
7904 .add(Src1);
7905 } else if (Src1IsSGPR) {
7906 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7907 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7908 .add(Src0)
7909 .addReg(Temp);
7910 } else {
7911 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7912 .add(Src0)
7913 .add(Src1);
7914 MachineInstr *Not =
7915 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7916 Worklist.insert(Not);
7917 }
7918
7919 MRI.replaceRegWith(Dest.getReg(), NewDest);
7920
7921 Worklist.insert(Xor);
7922
7923 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7924 }
7925}
7926
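// splitScalarNotBinop lowers S_NAND_B32 / S_NOR_B32 as the base operation
// followed by a negation, e.g. (illustrative only):
//   %r = S_NAND_B32 %a, %b
// becomes
//   %t = S_AND_B32 %a, %b
//   %r = S_NOT_B32 %t
// Both new instructions are put back on the worklist so they are themselves
// moved to the VALU on a later iteration.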
7927void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7928 MachineInstr &Inst,
7929 unsigned Opcode) const {
7930 MachineBasicBlock &MBB = *Inst.getParent();
7931 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7932 MachineBasicBlock::iterator MII = Inst;
7933 const DebugLoc &DL = Inst.getDebugLoc();
7934
7935 MachineOperand &Dest = Inst.getOperand(0);
7936 MachineOperand &Src0 = Inst.getOperand(1);
7937 MachineOperand &Src1 = Inst.getOperand(2);
7938
7939 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7940 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7941
7942 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7943 .add(Src0)
7944 .add(Src1);
7945
7946 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7947 .addReg(Interm);
7948
7949 Worklist.insert(&Op);
7950 Worklist.insert(&Not);
7951
7952 MRI.replaceRegWith(Dest.getReg(), NewDest);
7953 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7954}
7955
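// splitScalarBinOpN2 handles the *N2 forms, which negate the second source,
// e.g. (illustrative only): S_ANDN2_B32 %a, %b becomes S_NOT_B32 %b followed
// by S_AND_B32 %a, %notb, with both pieces re-queued on the worklist.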
7956void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7957 MachineInstr &Inst,
7958 unsigned Opcode) const {
7959 MachineBasicBlock &MBB = *Inst.getParent();
7960 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7961 MachineBasicBlock::iterator MII = Inst;
7962 const DebugLoc &DL = Inst.getDebugLoc();
7963
7964 MachineOperand &Dest = Inst.getOperand(0);
7965 MachineOperand &Src0 = Inst.getOperand(1);
7966 MachineOperand &Src1 = Inst.getOperand(2);
7967
7968 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7969 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7970
7971 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7972 .add(Src1);
7973
7974 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7975 .add(Src0)
7976 .addReg(Interm);
7977
7978 Worklist.insert(&Not);
7979 Worklist.insert(&Op);
7980
7981 MRI.replaceRegWith(Dest.getReg(), NewDest);
7982 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7983}
7984
7985void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7986 MachineInstr &Inst, unsigned Opcode,
7987 bool Swap) const {
7988 MachineBasicBlock &MBB = *Inst.getParent();
7989 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7990
7991 MachineOperand &Dest = Inst.getOperand(0);
7992 MachineOperand &Src0 = Inst.getOperand(1);
7993 DebugLoc DL = Inst.getDebugLoc();
7994
7995 MachineBasicBlock::iterator MII = Inst;
7996
7997 const MCInstrDesc &InstDesc = get(Opcode);
7998 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7999 MRI.getRegClass(Src0.getReg()) :
8000 &AMDGPU::SGPR_32RegClass;
8001
8002 const TargetRegisterClass *Src0SubRC =
8003 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8004
8005 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8006 AMDGPU::sub0, Src0SubRC);
8007
8008 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8009 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8010 const TargetRegisterClass *NewDestSubRC =
8011 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8012
8013 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8014 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
8015
8016 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8017 AMDGPU::sub1, Src0SubRC);
8018
8019 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8020 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
8021
8022 if (Swap)
8023 std::swap(DestSub0, DestSub1);
8024
8025 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8026 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8027 .addReg(DestSub0)
8028 .addImm(AMDGPU::sub0)
8029 .addReg(DestSub1)
8030 .addImm(AMDGPU::sub1);
8031
8032 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8033
8034 Worklist.insert(&LoHalf);
8035 Worklist.insert(&HiHalf);
8036
8037 // We don't need to legalizeOperands here because for a single operand, src0
8038 // will support any kind of input.
8039
8040 // Move all users of this moved value.
8041 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8042}
8043
8044// There is not a vector equivalent of s_mul_u64. For this reason, we need to
8045// split the s_mul_u64 into 32-bit vector multiplications.
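// The VALU expansion built below is roughly (illustrative only):
//   lo = V_MUL_LO_U32 a.lo, b.lo
//   hi = V_MUL_LO_U32 a.lo, b.hi + V_MUL_LO_U32 a.hi, b.lo
//        + V_MUL_HI_U32 a.lo, b.lo              // adds via V_ADD_U32
//   dst = REG_SEQUENCE lo, sub0, hi, sub1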
8046void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
8047 MachineInstr &Inst,
8048 MachineDominatorTree *MDT) const {
8049 MachineBasicBlock &MBB = *Inst.getParent();
8050 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8051
8052 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8053 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8054 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8055
8056 MachineOperand &Dest = Inst.getOperand(0);
8057 MachineOperand &Src0 = Inst.getOperand(1);
8058 MachineOperand &Src1 = Inst.getOperand(2);
8059 const DebugLoc &DL = Inst.getDebugLoc();
8060 MachineBasicBlock::iterator MII = Inst;
8061
8062 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8063 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8064 const TargetRegisterClass *Src0SubRC =
8065 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8066 if (RI.isSGPRClass(Src0SubRC))
8067 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8068 const TargetRegisterClass *Src1SubRC =
8069 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8070 if (RI.isSGPRClass(Src1SubRC))
8071 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8072
8073 // First, we extract the low 32-bit and high 32-bit values from each of the
8074 // operands.
8075 MachineOperand Op0L =
8076 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8077 MachineOperand Op1L =
8078 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8079 MachineOperand Op0H =
8080 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
8081 MachineOperand Op1H =
8082 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
8083
8084 // The multiplication is done as follows:
8085 //
8086 // Op1H Op1L
8087 // * Op0H Op0L
8088 // --------------------
8089 // Op1H*Op0L Op1L*Op0L
8090 // + Op1H*Op0H Op1L*Op0H
8091 // -----------------------------------------
8092 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
8093 //
8094 // We drop Op1H*Op0H because it only contributes to bits above bit 63, which
8095 // do not exist in the 64-bit result.
8096 // The low 32-bit value is Op1L*Op0L.
8097 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
8098
8099 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8100 MachineInstr *Op1L_Op0H =
8101 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
8102 .add(Op1L)
8103 .add(Op0H);
8104
8105 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8106 MachineInstr *Op1H_Op0L =
8107 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
8108 .add(Op1H)
8109 .add(Op0L);
8110
8111 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8112 MachineInstr *Carry =
8113 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
8114 .add(Op1L)
8115 .add(Op0L);
8116
8117 MachineInstr *LoHalf =
8118 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8119 .add(Op1L)
8120 .add(Op0L);
8121
8122 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8123 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
8124 .addReg(Op1L_Op0H_Reg)
8125 .addReg(Op1H_Op0L_Reg);
8126
8127 MachineInstr *HiHalf =
8128 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
8129 .addReg(AddReg)
8130 .addReg(CarryReg);
8131
8132 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8133 .addReg(DestSub0)
8134 .addImm(AMDGPU::sub0)
8135 .addReg(DestSub1)
8136 .addImm(AMDGPU::sub1);
8137
8138 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8139
8140 // Try to legalize the operands in case we need to swap the order to keep it
8141 // valid.
8142 legalizeOperands(*Op1L_Op0H, MDT);
8143 legalizeOperands(*Op1H_Op0L, MDT);
8144 legalizeOperands(*Carry, MDT);
8145 legalizeOperands(*LoHalf, MDT);
8146 legalizeOperands(*Add, MDT);
8147 legalizeOperands(*HiHalf, MDT);
8148
8149 // Move all users of this moved value.
8150 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8151}
8152
8153// Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
8154// multiplications.
8155void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
8156 MachineInstr &Inst,
8157 MachineDominatorTree *MDT) const {
8158 MachineBasicBlock &MBB = *Inst.getParent();
8159 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8160
8161 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8162 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8163 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8164
8165 MachineOperand &Dest = Inst.getOperand(0);
8166 MachineOperand &Src0 = Inst.getOperand(1);
8167 MachineOperand &Src1 = Inst.getOperand(2);
8168 const DebugLoc &DL = Inst.getDebugLoc();
8169 MachineBasicBlock::iterator MII = Inst;
8170
8171 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
8172 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
8173 const TargetRegisterClass *Src0SubRC =
8174 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8175 if (RI.isSGPRClass(Src0SubRC))
8176 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
8177 const TargetRegisterClass *Src1SubRC =
8178 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8179 if (RI.isSGPRClass(Src1SubRC))
8180 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
8181
8182 // First, we extract the low 32-bit and high 32-bit values from each of the
8183 // operands.
8184 MachineOperand Op0L =
8185 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
8186 MachineOperand Op1L =
8187 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
8188
8189 unsigned Opc = Inst.getOpcode();
8190 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
8191 ? AMDGPU::V_MUL_HI_U32_e64
8192 : AMDGPU::V_MUL_HI_I32_e64;
8193 MachineInstr *HiHalf =
8194 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
8195
8196 MachineInstr *LoHalf =
8197 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
8198 .add(Op1L)
8199 .add(Op0L);
8200
8201 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8202 .addReg(DestSub0)
8203 .addImm(AMDGPU::sub0)
8204 .addReg(DestSub1)
8205 .addImm(AMDGPU::sub1);
8206
8207 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8208
8209 // Try to legalize the operands in case we need to swap the order to keep it
8210 // valid.
8211 legalizeOperands(*HiHalf, MDT);
8212 legalizeOperands(*LoHalf, MDT);
8213
8214 // Move all users of this moved value.
8215 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8216}
8217
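// splitScalar64BitBinaryOp splits a 64-bit bitwise op into two 32-bit halves,
// e.g. (illustrative only):
//   %r:sreg_64 = S_OR_B64 %a, %b
// becomes
//   %lo = S_OR_B32 %a.sub0, %b.sub0
//   %hi = S_OR_B32 %a.sub1, %b.sub1
//   %r  = REG_SEQUENCE %lo, sub0, %hi, sub1
// with both halves re-queued so they are moved to the VALU afterwards.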
8218void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8219 MachineInstr &Inst, unsigned Opcode,
8220 MachineDominatorTree *MDT) const {
8221 MachineBasicBlock &MBB = *Inst.getParent();
8222 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8223
8224 MachineOperand &Dest = Inst.getOperand(0);
8225 MachineOperand &Src0 = Inst.getOperand(1);
8226 MachineOperand &Src1 = Inst.getOperand(2);
8227 DebugLoc DL = Inst.getDebugLoc();
8228
8229 MachineBasicBlock::iterator MII = Inst;
8230
8231 const MCInstrDesc &InstDesc = get(Opcode);
8232 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8233 MRI.getRegClass(Src0.getReg()) :
8234 &AMDGPU::SGPR_32RegClass;
8235
8236 const TargetRegisterClass *Src0SubRC =
8237 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8238 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8239 MRI.getRegClass(Src1.getReg()) :
8240 &AMDGPU::SGPR_32RegClass;
8241
8242 const TargetRegisterClass *Src1SubRC =
8243 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8244
8245 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8246 AMDGPU::sub0, Src0SubRC);
8247 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8248 AMDGPU::sub0, Src1SubRC);
8249 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8250 AMDGPU::sub1, Src0SubRC);
8251 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8252 AMDGPU::sub1, Src1SubRC);
8253
8254 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8255 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8256 const TargetRegisterClass *NewDestSubRC =
8257 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8258
8259 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8260 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8261 .add(SrcReg0Sub0)
8262 .add(SrcReg1Sub0);
8263
8264 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8265 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8266 .add(SrcReg0Sub1)
8267 .add(SrcReg1Sub1);
8268
8269 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8270 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8271 .addReg(DestSub0)
8272 .addImm(AMDGPU::sub0)
8273 .addReg(DestSub1)
8274 .addImm(AMDGPU::sub1);
8275
8276 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8277
8278 Worklist.insert(&LoHalf);
8279 Worklist.insert(&HiHalf);
8280
8281 // Move all users of this moved value.
8282 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8283}
8284
8285void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8286 MachineInstr &Inst,
8287 MachineDominatorTree *MDT) const {
8288 MachineBasicBlock &MBB = *Inst.getParent();
8289 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8290
8291 MachineOperand &Dest = Inst.getOperand(0);
8292 MachineOperand &Src0 = Inst.getOperand(1);
8293 MachineOperand &Src1 = Inst.getOperand(2);
8294 const DebugLoc &DL = Inst.getDebugLoc();
8295
8296 MachineBasicBlock::iterator MII = Inst;
8297
8298 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8299
8300 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8301
8302 MachineOperand* Op0;
8303 MachineOperand* Op1;
8304
8305 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8306 Op0 = &Src0;
8307 Op1 = &Src1;
8308 } else {
8309 Op0 = &Src1;
8310 Op1 = &Src0;
8311 }
8312
8313 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8314 .add(*Op0);
8315
8316 Register NewDest = MRI.createVirtualRegister(DestRC);
8317
8318 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8319 .addReg(Interm)
8320 .add(*Op1);
8321
8322 MRI.replaceRegWith(Dest.getReg(), NewDest);
8323
8324 Worklist.insert(&Xor);
8325}
8326
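// splitScalar64BitBCNT computes a 64-bit popcount with two chained 32-bit
// V_BCNT_U32_B32 operations, roughly (illustrative only):
//   %mid = V_BCNT_U32_B32 %src.sub0, 0
//   %res = V_BCNT_U32_B32 %src.sub1, %mid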
8327void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8328 MachineInstr &Inst) const {
8329 MachineBasicBlock &MBB = *Inst.getParent();
8330 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8331
8332 MachineBasicBlock::iterator MII = Inst;
8333 const DebugLoc &DL = Inst.getDebugLoc();
8334
8335 MachineOperand &Dest = Inst.getOperand(0);
8336 MachineOperand &Src = Inst.getOperand(1);
8337
8338 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8339 const TargetRegisterClass *SrcRC = Src.isReg() ?
8340 MRI.getRegClass(Src.getReg()) :
8341 &AMDGPU::SGPR_32RegClass;
8342
8343 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8344 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8345
8346 const TargetRegisterClass *SrcSubRC =
8347 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8348
8349 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8350 AMDGPU::sub0, SrcSubRC);
8351 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8352 AMDGPU::sub1, SrcSubRC);
8353
8354 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8355
8356 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8357
8358 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8359
8360 // We don't need to legalize operands here. src0 for either instruction can be
8361 // an SGPR, and the second input is unused or determined here.
8362 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8363}
8364
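// splitScalar64BitBFE only handles the 64-bit sign-extend-in-register form
// (offset 0, width <= 32); for width < 32 it emits roughly (illustrative
// only):
//   %lo = V_BFE_I32 %src.sub0, 0, width
//   %hi = V_ASHRREV_I32 31, %lo
//   %r  = REG_SEQUENCE %lo, sub0, %hi, sub1
// For width == 32 the low half is %src.sub0 itself and only the sign bits
// are recomputed for the high half.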
8365void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8366 MachineInstr &Inst) const {
8367 MachineBasicBlock &MBB = *Inst.getParent();
8368 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8369 MachineBasicBlock::iterator MII = Inst;
8370 const DebugLoc &DL = Inst.getDebugLoc();
8371
8372 MachineOperand &Dest = Inst.getOperand(0);
8373 uint32_t Imm = Inst.getOperand(2).getImm();
8374 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8375 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8376
8377 (void) Offset;
8378
8379 // Only sext_inreg cases handled.
8380 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8381 Offset == 0 && "Not implemented");
8382
8383 if (BitWidth < 32) {
8384 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8385 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8386 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8387
8388 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8389 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8390 .addImm(0)
8391 .addImm(BitWidth);
8392
8393 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8394 .addImm(31)
8395 .addReg(MidRegLo);
8396
8397 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8398 .addReg(MidRegLo)
8399 .addImm(AMDGPU::sub0)
8400 .addReg(MidRegHi)
8401 .addImm(AMDGPU::sub1);
8402
8403 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8404 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8405 return;
8406 }
8407
8408 MachineOperand &Src = Inst.getOperand(1);
8409 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8410 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8411
8412 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8413 .addImm(31)
8414 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8415
8416 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8417 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8418 .addImm(AMDGPU::sub0)
8419 .addReg(TmpReg)
8420 .addImm(AMDGPU::sub1);
8421
8422 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8423 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8424}
8425
8426void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8427 MachineInstr &Inst, unsigned Opcode,
8428 MachineDominatorTree *MDT) const {
8429 // (S_FLBIT_I32_B64 hi:lo) ->
8430 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8431 // (S_FF1_I32_B64 hi:lo) ->
8432 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
8433
8434 MachineBasicBlock &MBB = *Inst.getParent();
8435 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8436 MachineBasicBlock::iterator MII = Inst;
8437 const DebugLoc &DL = Inst.getDebugLoc();
8438
8439 MachineOperand &Dest = Inst.getOperand(0);
8440 MachineOperand &Src = Inst.getOperand(1);
8441
8442 const MCInstrDesc &InstDesc = get(Opcode);
8443
8444 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8445 unsigned OpcodeAdd =
8446 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8447
8448 const TargetRegisterClass *SrcRC =
8449 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8450 const TargetRegisterClass *SrcSubRC =
8451 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8452
8453 MachineOperand SrcRegSub0 =
8454 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8455 MachineOperand SrcRegSub1 =
8456 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8457
8458 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8459 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8460 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8461 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8462
8463 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8464
8465 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8466
8467 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8468 .addReg(IsCtlz ? MidReg1 : MidReg2)
8469 .addImm(32)
8470 .addImm(1); // enable clamp
8471
8472 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8473 .addReg(MidReg3)
8474 .addReg(IsCtlz ? MidReg2 : MidReg1);
8475
8476 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8477
8478 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8479}
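A host-side sketch (not part of this file) of the ctlz identity the expansion above relies on; the helper names are illustrative, and std::countl_zero (C++20) stands in for V_FFBH_U32 on non-zero inputs.

#include <algorithm>
#include <bit>
#include <cstdint>

// Sketch only: V_FFBH_U32 returns 0xffffffff for a zero input, so the
// saturating +32 (the clamp bit on the V_ADD) keeps the all-zero case at -1,
// matching S_FLBIT_I32_B64. S_FF1_I32_B64 (cttz) swaps the roles of hi and lo.
static uint32_t uaddsat32(uint32_t A, uint32_t B) {
  uint64_t Sum = uint64_t(A) + B;
  return Sum > 0xffffffffu ? 0xffffffffu : uint32_t(Sum);
}

static uint32_t ctlz64Sketch(uint32_t Hi, uint32_t Lo) {
  uint32_t FfbhHi = Hi ? uint32_t(std::countl_zero(Hi)) : 0xffffffffu;
  uint32_t FfbhLo = Lo ? uint32_t(std::countl_zero(Lo)) : 0xffffffffu;
  return std::min(FfbhHi, uaddsat32(FfbhLo, 32));
}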
8480
8481void SIInstrInfo::addUsersToMoveToVALUWorklist(
8483 SIInstrWorklist &Worklist) const {
8484 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8485 E = MRI.use_end(); I != E;) {
8486 MachineInstr &UseMI = *I->getParent();
8487
8488 unsigned OpNo = 0;
8489
8490 switch (UseMI.getOpcode()) {
8491 case AMDGPU::COPY:
8492 case AMDGPU::WQM:
8493 case AMDGPU::SOFT_WQM:
8494 case AMDGPU::STRICT_WWM:
8495 case AMDGPU::STRICT_WQM:
8496 case AMDGPU::REG_SEQUENCE:
8497 case AMDGPU::PHI:
8498 case AMDGPU::INSERT_SUBREG:
8499 break;
8500 default:
8501 OpNo = I.getOperandNo();
8502 break;
8503 }
8504
8505 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8506 Worklist.insert(&UseMI);
8507
8508 do {
8509 ++I;
8510 } while (I != E && I->getParent() == &UseMI);
8511 } else {
8512 ++I;
8513 }
8514 }
8515}
8516
8517void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8519 MachineInstr &Inst) const {
8520 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8522 MachineOperand &Src0 = Inst.getOperand(1);
8523 MachineOperand &Src1 = Inst.getOperand(2);
8524 const DebugLoc &DL = Inst.getDebugLoc();
8525
8526 switch (Inst.getOpcode()) {
8527 case AMDGPU::S_PACK_LL_B32_B16: {
8528 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8529 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8530
8531 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8532 // 0.
8533 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8534 .addImm(0xffff);
8535
8536 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8537 .addReg(ImmReg, RegState::Kill)
8538 .add(Src0);
8539
8540 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8541 .add(Src1)
8542 .addImm(16)
8543 .addReg(TmpReg, RegState::Kill);
8544 break;
8545 }
8546 case AMDGPU::S_PACK_LH_B32_B16: {
8547 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8548 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8549 .addImm(0xffff);
8550 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8551 .addReg(ImmReg, RegState::Kill)
8552 .add(Src0)
8553 .add(Src1);
8554 break;
8555 }
8556 case AMDGPU::S_PACK_HL_B32_B16: {
8557 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8558 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8559 .addImm(16)
8560 .add(Src0);
8561 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8562 .add(Src1)
8563 .addImm(16)
8564 .addReg(TmpReg, RegState::Kill);
8565 break;
8566 }
8567 case AMDGPU::S_PACK_HH_B32_B16: {
8568 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8569 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8570 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8571 .addImm(16)
8572 .add(Src0);
8573 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8574 .addImm(0xffff0000);
8575 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8576 .add(Src1)
8577 .addReg(ImmReg, RegState::Kill)
8578 .addReg(TmpReg, RegState::Kill);
8579 break;
8580 }
8581 default:
8582 llvm_unreachable("unhandled s_pack_* instruction");
8583 }
8584
8585 MachineOperand &Dest = Inst.getOperand(0);
8586 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8587 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8588}
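A plain-C++ sketch (not part of this file) of the bit patterns the four expansions above produce; the helper names are illustrative.

#include <cstdint>

// Sketch only: each S_PACK_* builds a 32-bit result from one 16-bit half of
// each source (L = low half, H = high half of the source register).
static uint32_t packLL(uint32_t S0, uint32_t S1) { // V_AND + V_LSHL_OR
  return (S1 << 16) | (S0 & 0xffffu);
}
static uint32_t packLH(uint32_t S0, uint32_t S1) { // V_BFI with 0xffff mask
  return (S1 & 0xffff0000u) | (S0 & 0xffffu);
}
static uint32_t packHL(uint32_t S0, uint32_t S1) { // V_LSHRREV + V_LSHL_OR
  return (S1 << 16) | (S0 >> 16);
}
static uint32_t packHH(uint32_t S0, uint32_t S1) { // V_LSHRREV + V_AND_OR
  return (S1 & 0xffff0000u) | (S0 >> 16);
}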
8589
8590void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8591 MachineInstr &SCCDefInst,
8592 SIInstrWorklist &Worklist,
8593 Register NewCond) const {
8594
8595 // Ensure that def inst defines SCC, which is still live.
8596 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8597 !Op.isDead() && Op.getParent() == &SCCDefInst);
8598 SmallVector<MachineInstr *, 4> CopyToDelete;
8599 // This assumes that all the users of SCC are in the same block
8600 // as the SCC def.
8601 for (MachineInstr &MI : // Skip the def inst itself.
8602 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8603 SCCDefInst.getParent()->end())) {
8604 // Check if SCC is used first.
8605 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8606 if (SCCIdx != -1) {
8607 if (MI.isCopy()) {
8608 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8609 Register DestReg = MI.getOperand(0).getReg();
8610
8611 MRI.replaceRegWith(DestReg, NewCond);
8612 CopyToDelete.push_back(&MI);
8613 } else {
8614
8615 if (NewCond.isValid())
8616 MI.getOperand(SCCIdx).setReg(NewCond);
8617
8618 Worklist.insert(&MI);
8619 }
8620 }
8621 // Exit if we find another SCC def.
8622 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8623 break;
8624 }
8625 for (auto &Copy : CopyToDelete)
8626 Copy->eraseFromParent();
8627}
8628
8629// Instructions that use SCC may be converted to VALU instructions. When that
8630// happens, the SCC register is changed to VCC_LO. The instruction that defines
8631// SCC must be changed to an instruction that defines VCC. This function makes
8632// sure that the instruction that defines SCC is added to the moveToVALU
8633// worklist.
8634void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8635 SIInstrWorklist &Worklist) const {
8636 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8637 // then there is nothing to do because the defining instruction has been
8638 // converted to a VALU already. If SCC then that instruction needs to be
8639 // converted to a VALU.
8640 for (MachineInstr &MI :
8641 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8642 SCCUseInst->getParent()->rend())) {
8643 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8644 break;
8645 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8646 Worklist.insert(&MI);
8647 break;
8648 }
8649 }
8650}
8651
8652const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8653 const MachineInstr &Inst) const {
8654 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8655
8656 switch (Inst.getOpcode()) {
8657 // For target instructions, getOpRegClass just returns the virtual register
8658 // class associated with the operand, so we need to find an equivalent VGPR
8659 // register class in order to move the instruction to the VALU.
8660 case AMDGPU::COPY:
8661 case AMDGPU::PHI:
8662 case AMDGPU::REG_SEQUENCE:
8663 case AMDGPU::INSERT_SUBREG:
8664 case AMDGPU::WQM:
8665 case AMDGPU::SOFT_WQM:
8666 case AMDGPU::STRICT_WWM:
8667 case AMDGPU::STRICT_WQM: {
8668 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8669 if (RI.isAGPRClass(SrcRC)) {
8670 if (RI.isAGPRClass(NewDstRC))
8671 return nullptr;
8672
8673 switch (Inst.getOpcode()) {
8674 case AMDGPU::PHI:
8675 case AMDGPU::REG_SEQUENCE:
8676 case AMDGPU::INSERT_SUBREG:
8677 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8678 break;
8679 default:
8680 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8681 }
8682
8683 if (!NewDstRC)
8684 return nullptr;
8685 } else {
8686 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8687 return nullptr;
8688
8689 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8690 if (!NewDstRC)
8691 return nullptr;
8692 }
8693
8694 return NewDstRC;
8695 }
8696 default:
8697 return NewDstRC;
8698 }
8699}
8700
8701// Find the one SGPR operand we are allowed to use.
8702Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8703 int OpIndices[3]) const {
8704 const MCInstrDesc &Desc = MI.getDesc();
8705
8706 // Find the one SGPR operand we are allowed to use.
8707 //
8708 // First we need to consider the instruction's operand requirements before
8709 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8710 // of VCC, but we are still bound by the constant bus requirement to only use
8711 // one.
8712 //
8713 // If the operand's class is an SGPR, we can never move it.
8714
8715 Register SGPRReg = findImplicitSGPRRead(MI);
8716 if (SGPRReg)
8717 return SGPRReg;
8718
8719 Register UsedSGPRs[3] = {Register()};
8720 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8721
8722 for (unsigned i = 0; i < 3; ++i) {
8723 int Idx = OpIndices[i];
8724 if (Idx == -1)
8725 break;
8726
8727 const MachineOperand &MO = MI.getOperand(Idx);
8728 if (!MO.isReg())
8729 continue;
8730
8731 // Is this operand statically required to be an SGPR based on the operand
8732 // constraints?
8733 const TargetRegisterClass *OpRC =
8734 RI.getRegClass(Desc.operands()[Idx].RegClass);
8735 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8736 if (IsRequiredSGPR)
8737 return MO.getReg();
8738
8739 // If this could be a VGPR or an SGPR, check the dynamic register class.
8740 Register Reg = MO.getReg();
8741 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8742 if (RI.isSGPRClass(RegRC))
8743 UsedSGPRs[i] = Reg;
8744 }
8745
8746 // We don't have a required SGPR operand, so we have a bit more freedom in
8747 // selecting operands to move.
8748
8749 // Try to select the most used SGPR. If an SGPR is equal to one of the
8750 // others, we choose that.
8751 //
8752 // e.g.
8753 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8754 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8755
8756 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8757 // prefer those.
8758
8759 if (UsedSGPRs[0]) {
8760 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8761 SGPRReg = UsedSGPRs[0];
8762 }
8763
8764 if (!SGPRReg && UsedSGPRs[1]) {
8765 if (UsedSGPRs[1] == UsedSGPRs[2])
8766 SGPRReg = UsedSGPRs[1];
8767 }
8768
8769 return SGPRReg;
8770}
8771
8773 unsigned OperandName) const {
8774 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8775 if (Idx == -1)
8776 return nullptr;
8777
8778 return &MI.getOperand(Idx);
8779}
8780
8786 return (Format << 44) |
8787 (1ULL << 56) | // RESOURCE_LEVEL = 1
8788 (3ULL << 60); // OOB_SELECT = 3
8789 }
8790
8791 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8792 if (ST.isAmdHsaOS()) {
8793 // Set ATC = 1. GFX9 doesn't have this bit.
8795 RsrcDataFormat |= (1ULL << 56);
8796
8797 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8798 // BTW, it disables TC L2 and therefore decreases performance.
8800 RsrcDataFormat |= (2ULL << 59);
8801 }
8802
8803 return RsrcDataFormat;
8804}
8805
8809 0xffffffff; // Size;
8810
8811 // GFX9 doesn't have ELEMENT_SIZE.
8813 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8814 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8815 }
8816
8817 // IndexStride encodes the wavefront size: 3 for wave64, 2 for wave32.
8818 uint64_t IndexStride = ST.isWave64() ? 3 : 2;
8819 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8820
8821 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8822 // Clear them unless we want a huge stride.
8825 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8826
8827 return Rsrc23;
8828}
8829
8831 unsigned Opc = MI.getOpcode();
8832
8833 return isSMRD(Opc);
8834}
8835
8837 return get(Opc).mayLoad() &&
8838 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8839}
8840
8842 int &FrameIndex) const {
8843 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8844 if (!Addr || !Addr->isFI())
8845 return Register();
8846
8847 assert(!MI.memoperands_empty() &&
8848 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8849
8850 FrameIndex = Addr->getIndex();
8851 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8852}
8853
8855 int &FrameIndex) const {
8856 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8857 assert(Addr && Addr->isFI());
8858 FrameIndex = Addr->getIndex();
8859 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8860}
8861
8863 int &FrameIndex) const {
8864 if (!MI.mayLoad())
8865 return Register();
8866
8867 if (isMUBUF(MI) || isVGPRSpill(MI))
8868 return isStackAccess(MI, FrameIndex);
8869
8870 if (isSGPRSpill(MI))
8871 return isSGPRStackAccess(MI, FrameIndex);
8872
8873 return Register();
8874}
8875
8877 int &FrameIndex) const {
8878 if (!MI.mayStore())
8879 return Register();
8880
8881 if (isMUBUF(MI) || isVGPRSpill(MI))
8882 return isStackAccess(MI, FrameIndex);
8883
8884 if (isSGPRSpill(MI))
8885 return isSGPRStackAccess(MI, FrameIndex);
8886
8887 return Register();
8888}
8889
8891 unsigned Size = 0;
8893 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8894 while (++I != E && I->isInsideBundle()) {
8895 assert(!I->isBundle() && "No nested bundle!");
8897 }
8898
8899 return Size;
8900}
8901
8903 unsigned Opc = MI.getOpcode();
8905 unsigned DescSize = Desc.getSize();
8906
8907 // If we have a definitive size, we can use it. Otherwise we need to inspect
8908 // the operands to know the size.
8909 if (isFixedSize(MI)) {
8910 unsigned Size = DescSize;
8911
8912 // If we hit the buggy offset, an extra nop will be inserted in MC so
8913 // estimate the worst case.
8914 if (MI.isBranch() && ST.hasOffset3fBug())
8915 Size += 4;
8916
8917 return Size;
8918 }
8919
8920 // Instructions may have a 32-bit literal encoded after them. Check
8921 // operands that could ever be literals.
8922 if (isVALU(MI) || isSALU(MI)) {
8923 if (isDPP(MI))
8924 return DescSize;
8925 bool HasLiteral = false;
8926 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8927 const MachineOperand &Op = MI.getOperand(I);
8928 const MCOperandInfo &OpInfo = Desc.operands()[I];
8929 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8930 HasLiteral = true;
8931 break;
8932 }
8933 }
8934 return HasLiteral ? DescSize + 4 : DescSize;
8935 }
8936
8937 // Check whether we have extra NSA words.
8938 if (isMIMG(MI)) {
8939 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8940 if (VAddr0Idx < 0)
8941 return 8;
8942
8943 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8944 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8945 }
8946
8947 switch (Opc) {
8948 case TargetOpcode::BUNDLE:
8949 return getInstBundleSize(MI);
8950 case TargetOpcode::INLINEASM:
8951 case TargetOpcode::INLINEASM_BR: {
8952 const MachineFunction *MF = MI.getParent()->getParent();
8953 const char *AsmStr = MI.getOperand(0).getSymbolName();
8954 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8955 }
8956 default:
8957 if (MI.isMetaInstruction())
8958 return 0;
8959 return DescSize;
8960 }
8961}
8962
8964 if (!isFLAT(MI))
8965 return false;
8966
8967 if (MI.memoperands_empty())
8968 return true;
8969
8970 for (const MachineMemOperand *MMO : MI.memoperands()) {
8971 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8972 return true;
8973 }
8974 return false;
8975}
8976
8979 static const std::pair<int, const char *> TargetIndices[] = {
8980 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8981 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8982 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8983 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8984 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8985 return ArrayRef(TargetIndices);
8986}
8987
8988/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
8989/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8992 const ScheduleDAG *DAG) const {
8993 return new GCNHazardRecognizer(DAG->MF);
8994}
8995
8996/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8997/// pass.
9000 return new GCNHazardRecognizer(MF);
9001}
9002
9003// Called during:
9004// - pre-RA scheduling and post-RA scheduling
9007 const ScheduleDAGMI *DAG) const {
9008 // Borrowed from Arm Target
9009 // We would like to restrict this hazard recognizer to only
9010 // post-RA scheduling; we can tell that we're post-RA because we don't
9011 // track VRegLiveness.
9012 if (!DAG->hasVRegLiveness())
9013 return new GCNHazardRecognizer(DAG->MF);
9015}
9016
9017std::pair<unsigned, unsigned>
9019 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
9020}
9021
9024 static const std::pair<unsigned, const char *> TargetFlags[] = {
9025 { MO_GOTPCREL, "amdgpu-gotprel" },
9026 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
9027 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
9028 { MO_REL32_LO, "amdgpu-rel32-lo" },
9029 { MO_REL32_HI, "amdgpu-rel32-hi" },
9030 { MO_ABS32_LO, "amdgpu-abs32-lo" },
9031 { MO_ABS32_HI, "amdgpu-abs32-hi" },
9032 };
9033
9034 return ArrayRef(TargetFlags);
9035}
9036
9039 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
9040 {
9041 {MONoClobber, "amdgpu-noclobber"},
9042 {MOLastUse, "amdgpu-last-use"},
9043 };
9044
9045 return ArrayRef(TargetFlags);
9046}
9047
9049 const MachineFunction &MF) const {
9051 assert(SrcReg.isVirtual());
9052 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
9053 return AMDGPU::WWM_COPY;
9054
9055 return AMDGPU::COPY;
9056}
9057
9059 Register Reg) const {
9060 // We need to handle instructions which may be inserted during register
9061 // allocation to handle the prolog. The initial prolog instruction may have
9062 // been separated from the start of the block by spills and copies that are
9063 // needed by the prolog. However, the insertions for scalar registers can
9064 // always be placed at the BB top as they are independent of the exec mask
9065 // value.
9066 const MachineFunction *MF = MI.getParent()->getParent();
9067 bool IsNullOrVectorRegister = true;
9068 if (Reg) {
9069 const MachineRegisterInfo &MRI = MF->getRegInfo();
9070 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
9071 }
9072
9073 uint16_t Opcode = MI.getOpcode();
9075 return IsNullOrVectorRegister &&
9076 (isSGPRSpill(Opcode) || isWWMRegSpillOpcode(Opcode) ||
9077 (Opcode == AMDGPU::IMPLICIT_DEF &&
9078 MFI->isWWMReg(MI.getOperand(0).getReg())) ||
9079 (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
9080 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
9081}
9082
9086 const DebugLoc &DL,
9087 Register DestReg) const {
9088 if (ST.hasAddNoCarry())
9089 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
9090
9092 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
9093 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
9094
9095 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9096 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9097}
9098
9101 const DebugLoc &DL,
9102 Register DestReg,
9103 RegScavenger &RS) const {
9104 if (ST.hasAddNoCarry())
9105 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
9106
9107 // If available, prefer to use vcc.
9108 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
9109 ? Register(RI.getVCC())
9111 *RI.getBoolRC(), I, /* RestoreAfter */ false,
9112 0, /* AllowSpill */ false);
9113
9114 // TODO: Users need to deal with this.
9115 if (!UnusedCarry.isValid())
9116 return MachineInstrBuilder();
9117
9118 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
9119 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
9120}
9121
9122bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
9123 switch (Opcode) {
9124 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
9125 case AMDGPU::SI_KILL_I1_TERMINATOR:
9126 return true;
9127 default:
9128 return false;
9129 }
9130}
9131
9133 switch (Opcode) {
9134 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9135 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9136 case AMDGPU::SI_KILL_I1_PSEUDO:
9137 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9138 default:
9139 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9140 }
9141}
9142
9143bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9144 return Imm <= getMaxMUBUFImmOffset(ST);
9145}
9146
9148 // The GFX12 field is a 24-bit signed byte offset; only its non-negative range is used here.
9149 const unsigned OffsetBits =
9150 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9151 return (1 << OffsetBits) - 1;
9152}
9153
9155 if (!ST.isWave32())
9156 return;
9157
9158 if (MI.isInlineAsm())
9159 return;
9160
9161 for (auto &Op : MI.implicit_operands()) {
9162 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9163 Op.setReg(AMDGPU::VCC_LO);
9164 }
9165}
9166
9168 if (!isSMRD(MI))
9169 return false;
9170
9171 // Check that it is using a buffer resource.
9172 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9173 if (Idx == -1) // e.g. s_memtime
9174 return false;
9175
9176 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9177 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9178}
9179
9180// Given Imm, split it into the values to put into the SOffset and ImmOffset
9181// fields in an MUBUF instruction. Return false if it is not possible (due to a
9182// hardware bug needing a workaround).
9183//
9184// The required alignment ensures that individual address components remain
9185// aligned if they are aligned to begin with. It also ensures that additional
9186// offsets within the given alignment can be added to the resulting ImmOffset.
9188 uint32_t &ImmOffset, Align Alignment) const {
9189 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9190 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9191 uint32_t Overflow = 0;
9192
9193 if (Imm > MaxImm) {
9194 if (Imm <= MaxImm + 64) {
9195 // Use an SOffset inline constant for 4..64
9196 Overflow = Imm - MaxImm;
9197 Imm = MaxImm;
9198 } else {
9199 // Try to keep the same value in SOffset for adjacent loads, so that
9200 // the corresponding register contents can be re-used.
9201 //
9202 // Load values with all low-bits (except for alignment bits) set into
9203 // SOffset, so that a larger range of values can be covered using
9204 // s_movk_i32.
9205 //
9206 // Atomic operations fail to work correctly when individual address
9207 // components are unaligned, even if their sum is aligned.
9208 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9209 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9210 Imm = Low;
9211 Overflow = High - Alignment.value();
9212 }
9213 }
9214
9215 if (Overflow > 0) {
9216 // There is a hardware bug in SI and CI which prevents address clamping in
9217 // MUBUF instructions from working correctly with SOffsets. The immediate
9218 // offset is unaffected.
9220 return false;
9221
9222 // It is not possible to set immediate in SOffset field on some targets.
9223 if (ST.hasRestrictedSOffset())
9224 return false;
9225 }
9226
9227 ImmOffset = Imm;
9228 SOffset = Overflow;
9229 return true;
9230}
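A host-side sketch (not part of this file) of the split above, assuming a 4095-byte immediate field and a power-of-two alignment, and omitting the SI/CI and restricted-SOffset bail-outs.

#include <cassert>
#include <cstdint>

// Sketch only: SOffset + ImmOffset always reconstructs the original offset,
// and ImmOffset never exceeds the immediate field.
static void splitMUBUFOffsetSketch(uint32_t Imm, uint32_t Alignment,
                                   uint32_t &SOffset, uint32_t &ImmOffset) {
  assert(Alignment && (Alignment & (Alignment - 1)) == 0 && "power of two");
  const uint32_t MaxOffset = 4095;                      // assumed 12-bit field
  const uint32_t MaxImm = MaxOffset & ~(Alignment - 1); // alignDown
  const uint32_t Orig = Imm;
  uint32_t Overflow = 0;
  if (Imm > MaxImm) {
    if (Imm <= MaxImm + 64) {
      Overflow = Imm - MaxImm; // small overflow fits an SOffset inline constant
      Imm = MaxImm;
    } else {
      // Keep the low (sub-alignment) bits in the immediate, the rest in SOffset.
      uint32_t High = (Imm + Alignment) & ~MaxOffset;
      uint32_t Low = (Imm + Alignment) & MaxOffset;
      Imm = Low;
      Overflow = High - Alignment;
    }
  }
  ImmOffset = Imm;
  SOffset = Overflow;
  assert(SOffset + ImmOffset == Orig && ImmOffset <= MaxOffset);
}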
9231
9232// Depending on the used address space and instructions, some immediate offsets
9233// are allowed and some are not.
9234// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9235// scratch instruction offsets can also be negative. On GFX12, offsets can be
9236// negative for all variants.
9237//
9238// There are several bugs related to these offsets:
9239// On gfx10.1, flat instructions that go into the global address space cannot
9240// use an offset.
9241//
9242// For scratch instructions, the address can be either an SGPR or a VGPR.
9243// The following offsets can be used, depending on the architecture (x means
9244// cannot be used):
9245// +----------------------------+------+------+
9246// | Address-Mode | SGPR | VGPR |
9247// +----------------------------+------+------+
9248// | gfx9 | | |
9249// | negative, 4-aligned offset | x | ok |
9250// | negative, unaligned offset | x | ok |
9251// +----------------------------+------+------+
9252// | gfx10 | | |
9253// | negative, 4-aligned offset | ok | ok |
9254// | negative, unaligned offset | ok | x |
9255// +----------------------------+------+------+
9256// | gfx10.3 | | |
9257// | negative, 4-aligned offset | ok | ok |
9258// | negative, unaligned offset | ok | ok |
9259// +----------------------------+------+------+
9260//
9261// This function ignores the addressing mode, so if an offset cannot be used in
9262// one addressing mode, it is considered illegal.
9263bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9264 uint64_t FlatVariant) const {
9265 // TODO: Should 0 be special cased?
9266 if (!ST.hasFlatInstOffsets())
9267 return false;
9268
9269 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9270 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9271 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9272 return false;
9273
9275 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9276 (Offset % 4) != 0) {
9277 return false;
9278 }
9279
9280 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9281 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9282 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9283}
9284
9285 // See the comment on SIInstrInfo::isLegalFLATOffset for what is legal and what is not.
9286std::pair<int64_t, int64_t>
9287SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9288 uint64_t FlatVariant) const {
9289 int64_t RemainderOffset = COffsetVal;
9290 int64_t ImmField = 0;
9291
9292 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9293 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9294
9295 if (AllowNegative) {
9296 // Use signed division by a power of two to truncate towards 0.
9297 int64_t D = 1LL << NumBits;
9298 RemainderOffset = (COffsetVal / D) * D;
9299 ImmField = COffsetVal - RemainderOffset;
9300
9302 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9303 (ImmField % 4) != 0) {
9304 // Make ImmField a multiple of 4
9305 RemainderOffset += ImmField % 4;
9306 ImmField -= ImmField % 4;
9307 }
9308 } else if (COffsetVal >= 0) {
9309 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9310 RemainderOffset = COffsetVal - ImmField;
9311 }
9312
9313 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9314 assert(RemainderOffset + ImmField == COffsetVal);
9315 return {ImmField, RemainderOffset};
9316}
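A sketch (not part of this file) of the signed split above, assuming a 13-bit signed offset field (so 12 usable magnitude bits) and omitting the scratch-alignment special case; the helper name is illustrative.

#include <cassert>
#include <cstdint>
#include <utility>

// Sketch only: signed division by 2^NumBits truncates toward zero, so the
// immediate field keeps the sign of the original offset and the remainder
// can be folded into the address computation.
static std::pair<int64_t, int64_t> splitFlatOffsetSketch(int64_t COffsetVal,
                                                         bool AllowNegative) {
  const unsigned NumBits = 12; // assumed usable magnitude bits
  int64_t RemainderOffset = COffsetVal;
  int64_t ImmField = 0;
  if (AllowNegative) {
    int64_t D = int64_t(1) << NumBits;
    RemainderOffset = (COffsetVal / D) * D;
    ImmField = COffsetVal - RemainderOffset;
  } else if (COffsetVal >= 0) {
    ImmField = COffsetVal & ((int64_t(1) << NumBits) - 1);
    RemainderOffset = COffsetVal - ImmField;
  }
  assert(RemainderOffset + ImmField == COffsetVal);
  return {ImmField, RemainderOffset};
}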
9317
9319 if (ST.hasNegativeScratchOffsetBug() &&
9320 FlatVariant == SIInstrFlags::FlatScratch)
9321 return false;
9322
9323 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9324}
9325
9326static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9327 switch (ST.getGeneration()) {
9328 default:
9329 break;
9332 return SIEncodingFamily::SI;
9335 return SIEncodingFamily::VI;
9342 }
9343 llvm_unreachable("Unknown subtarget generation!");
9344}
9345
9346bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9347 switch(MCOp) {
9348 // These opcodes use indirect register addressing so
9349 // they need special handling by codegen (currently missing).
9350 // Therefore it is too risky to allow these opcodes
9351 // to be selected by the DPP combiner or the SDWA peepholer.
9352 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9353 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9354 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9355 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9356 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9357 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9358 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9359 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9360 return true;
9361 default:
9362 return false;
9363 }
9364}
9365
9366#define GENERATE_RENAMED_GFX9_CASES(OPCODE) \
9367 case OPCODE##_dpp: \
9368 case OPCODE##_e32: \
9369 case OPCODE##_e64: \
9370 case OPCODE##_e64_dpp: \
9371 case OPCODE##_sdwa:
9372
9373static bool isRenamedInGFX9(int Opcode) {
9374 switch (Opcode) {
9375 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADDC_U32)
9376 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_CO_U32)
9377 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_ADD_U32)
9378 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBBREV_U32)
9379 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBB_U32)
9380 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_CO_U32)
9381 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUBREV_U32)
9382 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_CO_U32)
9383 GENERATE_RENAMED_GFX9_CASES(AMDGPU::V_SUB_U32)
9384 //
9385 case AMDGPU::V_DIV_FIXUP_F16_gfx9_e64:
9386 case AMDGPU::V_DIV_FIXUP_F16_gfx9_fake16_e64:
9387 case AMDGPU::V_FMA_F16_gfx9_e64:
9388 case AMDGPU::V_FMA_F16_gfx9_fake16_e64:
9389 case AMDGPU::V_INTERP_P2_F16:
9390 case AMDGPU::V_MAD_F16_e64:
9391 case AMDGPU::V_MAD_U16_e64:
9392 case AMDGPU::V_MAD_I16_e64:
9393 return true;
9394 default:
9395 return false;
9396 }
9397}
9398
9399int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9400 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9401
9402 unsigned Gen = subtargetEncodingFamily(ST);
9403
9406
9407 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9408 // subtarget has the UnpackedD16VMem feature.
9409 // TODO: remove this when we discard GFX80 encoding.
9410 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9412
9413 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9414 switch (ST.getGeneration()) {
9415 default:
9417 break;
9420 break;
9423 break;
9424 }
9425 }
9426
9427 if (isMAI(Opcode)) {
9428 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9429 if (MFMAOp != -1)
9430 Opcode = MFMAOp;
9431 }
9432
9433 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9434
9435 // -1 means that Opcode is already a native instruction.
9436 if (MCOp == -1)
9437 return Opcode;
9438
9439 if (ST.hasGFX90AInsts()) {
9440 uint16_t NMCOp = (uint16_t)-1;
9441 if (ST.hasGFX940Insts())
9443 if (NMCOp == (uint16_t)-1)
9445 if (NMCOp == (uint16_t)-1)
9447 if (NMCOp != (uint16_t)-1)
9448 MCOp = NMCOp;
9449 }
9450
9451 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9452 // no encoding in the given subtarget generation.
9453 if (MCOp == (uint16_t)-1)
9454 return -1;
9455
9456 if (isAsmOnlyOpcode(MCOp))
9457 return -1;
9458
9459 return MCOp;
9460}
9461
9462static
9464 assert(RegOpnd.isReg());
9465 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9466 getRegSubRegPair(RegOpnd);
9467}
9468
9471 assert(MI.isRegSequence());
9472 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9473 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9474 auto &RegOp = MI.getOperand(1 + 2 * I);
9475 return getRegOrUndef(RegOp);
9476 }
9478}
9479
9480// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9481// Following a subreg of reg:subreg isn't supported
9484 if (!RSR.SubReg)
9485 return false;
9486 switch (MI.getOpcode()) {
9487 default: break;
9488 case AMDGPU::REG_SEQUENCE:
9489 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9490 return true;
9491 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
9492 case AMDGPU::INSERT_SUBREG:
9493 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9494 // inserted the subreg we're looking for
9495 RSR = getRegOrUndef(MI.getOperand(2));
9496 else { // the subreg in the rest of the reg
9497 auto R1 = getRegOrUndef(MI.getOperand(1));
9498 if (R1.SubReg) // subreg of subreg isn't supported
9499 return false;
9500 RSR.Reg = R1.Reg;
9501 }
9502 return true;
9503 }
9504 return false;
9505}
9506
9509 assert(MRI.isSSA());
9510 if (!P.Reg.isVirtual())
9511 return nullptr;
9512
9513 auto RSR = P;
9514 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9515 while (auto *MI = DefInst) {
9516 DefInst = nullptr;
9517 switch (MI->getOpcode()) {
9518 case AMDGPU::COPY:
9519 case AMDGPU::V_MOV_B32_e32: {
9520 auto &Op1 = MI->getOperand(1);
9521 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9522 if (Op1.isUndef())
9523 return nullptr;
9524 RSR = getRegSubRegPair(Op1);
9525 DefInst = MRI.getVRegDef(RSR.Reg);
9526 }
9527 break;
9528 }
9529 default:
9530 if (followSubRegDef(*MI, RSR)) {
9531 if (!RSR.Reg)
9532 return nullptr;
9533 DefInst = MRI.getVRegDef(RSR.Reg);
9534 }
9535 }
9536 if (!DefInst)
9537 return MI;
9538 }
9539 return nullptr;
9540}
9541
9543 Register VReg,
9544 const MachineInstr &DefMI,
9545 const MachineInstr &UseMI) {
9546 assert(MRI.isSSA() && "Must be run on SSA");
9547
9548 auto *TRI = MRI.getTargetRegisterInfo();
9549 auto *DefBB = DefMI.getParent();
9550
9551 // Don't bother searching between blocks, although it is possible this block
9552 // doesn't modify exec.
9553 if (UseMI.getParent() != DefBB)
9554 return true;
9555
9556 const int MaxInstScan = 20;
9557 int NumInst = 0;
9558
9559 // Stop scan at the use.
9560 auto E = UseMI.getIterator();
9561 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9562 if (I->isDebugInstr())
9563 continue;
9564
9565 if (++NumInst > MaxInstScan)
9566 return true;
9567
9568 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9569 return true;
9570 }
9571
9572 return false;
9573}
9574
9576 Register VReg,
9577 const MachineInstr &DefMI) {
9578 assert(MRI.isSSA() && "Must be run on SSA");
9579
9580 auto *TRI = MRI.getTargetRegisterInfo();
9581 auto *DefBB = DefMI.getParent();
9582
9583 const int MaxUseScan = 10;
9584 int NumUse = 0;
9585
9586 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9587 auto &UseInst = *Use.getParent();
9588 // Don't bother searching between blocks, although it is possible this block
9589 // doesn't modify exec.
9590 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9591 return true;
9592
9593 if (++NumUse > MaxUseScan)
9594 return true;
9595 }
9596
9597 if (NumUse == 0)
9598 return false;
9599
9600 const int MaxInstScan = 20;
9601 int NumInst = 0;
9602
9603 // Stop scan when we have seen all the uses.
9604 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9605 assert(I != DefBB->end());
9606
9607 if (I->isDebugInstr())
9608 continue;
9609
9610 if (++NumInst > MaxInstScan)
9611 return true;
9612
9613 for (const MachineOperand &Op : I->operands()) {
9614 // We don't check reg masks here as they're used only on calls:
9615 // 1. EXEC is only considered const within one BB
9616 // 2. Call should be a terminator instruction if present in a BB
9617
9618 if (!Op.isReg())
9619 continue;
9620
9621 Register Reg = Op.getReg();
9622 if (Op.isUse()) {
9623 if (Reg == VReg && --NumUse == 0)
9624 return false;
9625 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9626 return true;
9627 }
9628 }
9629}
9630
9633 const DebugLoc &DL, Register Src, Register Dst) const {
9634 auto Cur = MBB.begin();
9635 if (Cur != MBB.end())
9636 do {
9637 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9638 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9639 ++Cur;
9640 } while (Cur != MBB.end() && Cur != LastPHIIt);
9641
9642 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9643 Dst);
9644}
9645
9648 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9649 if (InsPt != MBB.end() &&
9650 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9651 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9652 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9653 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9654 InsPt++;
9655 return BuildMI(MBB, InsPt, DL,
9656 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9657 : AMDGPU::S_MOV_B64_term),
9658 Dst)
9659 .addReg(Src, 0, SrcSubReg)
9660 .addReg(AMDGPU::EXEC, RegState::Implicit);
9661 }
9662 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9663 Dst);
9664}
9665
9666bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9667
9670 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9671 VirtRegMap *VRM) const {
9672 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9673 //
9674 // %0:sreg_32 = COPY $m0
9675 //
9676 // We explicitly chose SReg_32 for the virtual register so such a copy might
9677 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9678 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9679 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9680 // TargetInstrInfo::foldMemoryOperand() is going to try.
9681 // A similar issue also exists with spilling and reloading $exec registers.
9682 //
9683 // To prevent that, constrain the %0 register class here.
9684 if (isFullCopyInstr(MI)) {
9685 Register DstReg = MI.getOperand(0).getReg();
9686 Register SrcReg = MI.getOperand(1).getReg();
9687 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9688 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9690 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9691 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9692 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9693 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9694 return nullptr;
9695 }
9696 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9697 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9698 return nullptr;
9699 }
9700 }
9701 }
9702
9703 return nullptr;
9704}
9705
9707 const MachineInstr &MI,
9708 unsigned *PredCost) const {
9709 if (MI.isBundle()) {
9711 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9712 unsigned Lat = 0, Count = 0;
9713 for (++I; I != E && I->isBundledWithPred(); ++I) {
9714 ++Count;
9715 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9716 }
9717 return Lat + Count - 1;
9718 }
9719
9720 return SchedModel.computeInstrLatency(&MI);
9721}
9722
9725 unsigned opcode = MI.getOpcode();
9726 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9727 auto IID = GI->getIntrinsicID();
9732
9733 switch (IID) {
9734 case Intrinsic::amdgcn_if:
9735 case Intrinsic::amdgcn_else:
9736 // FIXME: Uniform if second result
9737 break;
9738 }
9739
9741 }
9742
9743 // Loads from the private and flat address spaces are divergent, because
9744 // threads can execute the load instruction with the same inputs and get
9745 // different results.
9746 //
9747 // All other loads are not divergent, because if threads issue loads with the
9748 // same arguments, they will always get the same result.
9749 if (opcode == AMDGPU::G_LOAD) {
9750 if (MI.memoperands_empty())
9751 return InstructionUniformity::NeverUniform; // conservative assumption
9752
9753 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9754 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9755 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9756 })) {
9757 // At least one MMO in a non-global address space.
9759 }
9761 }
9762
9763 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9764 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9765 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9766 AMDGPU::isGenericAtomic(opcode)) {
9768 }
9770}
9771
9774
9775 if (isNeverUniform(MI))
9777
9778 unsigned opcode = MI.getOpcode();
9779 if (opcode == AMDGPU::V_READLANE_B32 ||
9780 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9781 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9783
9784 if (isCopyInstr(MI)) {
9785 const MachineOperand &srcOp = MI.getOperand(1);
9786 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9787 const TargetRegisterClass *regClass =
9788 RI.getPhysRegBaseClass(srcOp.getReg());
9791 }
9793 }
9794
9795 // GMIR handling
9796 if (MI.isPreISelOpcode())
9798
9799 // Atomics are divergent because they are executed sequentially: when an
9800 // atomic operation refers to the same address in each thread, then each
9801 // thread after the first sees the value written by the previous thread as
9802 // the original value.
9803
9804 if (isAtomic(MI))
9806
9807 // Loads from the private and flat address spaces are divergent, because
9808 // threads can execute the load instruction with the same inputs and get
9809 // different results.
9810 if (isFLAT(MI) && MI.mayLoad()) {
9811 if (MI.memoperands_empty())
9812 return InstructionUniformity::NeverUniform; // conservative assumption
9813
9814 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9815 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9816 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9817 })) {
9818 // At least one MMO in a non-global address space.
9820 }
9821
9823 }
9824
9825 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9826 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9827
9828 // FIXME: It's conceptually broken to report this for an instruction, and not
9829 // a specific def operand. For inline asm in particular, there could be mixed
9830 // uniform and divergent results.
9831 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9832 const MachineOperand &SrcOp = MI.getOperand(I);
9833 if (!SrcOp.isReg())
9834 continue;
9835
9836 Register Reg = SrcOp.getReg();
9837 if (!Reg || !SrcOp.readsReg())
9838 continue;
9839
9840 // If RegBank is null, this is unassigned or an unallocatable special
9841 // register, which are all scalars.
9842 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9843 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9845 }
9846
9847 // TODO: The uniformity check conditions above can be rearranged for more
9848 // readability.
9849
9850 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9851 // currently turned into no-op COPYs by SelectionDAG ISel and are
9852 // therefore no longer recognizable.
9853
9855}
9856
9858 switch (MF.getFunction().getCallingConv()) {
9860 return 1;
9862 return 2;
9864 return 3;
9868 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9871 case CallingConv::C:
9872 case CallingConv::Fast:
9873 default:
9874 // Assume other calling conventions are various compute callable functions
9875 return 0;
9876 }
9877}
9878
9880 Register &SrcReg2, int64_t &CmpMask,
9881 int64_t &CmpValue) const {
9882 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9883 return false;
9884
9885 switch (MI.getOpcode()) {
9886 default:
9887 break;
9888 case AMDGPU::S_CMP_EQ_U32:
9889 case AMDGPU::S_CMP_EQ_I32:
9890 case AMDGPU::S_CMP_LG_U32:
9891 case AMDGPU::S_CMP_LG_I32:
9892 case AMDGPU::S_CMP_LT_U32:
9893 case AMDGPU::S_CMP_LT_I32:
9894 case AMDGPU::S_CMP_GT_U32:
9895 case AMDGPU::S_CMP_GT_I32:
9896 case AMDGPU::S_CMP_LE_U32:
9897 case AMDGPU::S_CMP_LE_I32:
9898 case AMDGPU::S_CMP_GE_U32:
9899 case AMDGPU::S_CMP_GE_I32:
9900 case AMDGPU::S_CMP_EQ_U64:
9901 case AMDGPU::S_CMP_LG_U64:
9902 SrcReg = MI.getOperand(0).getReg();
9903 if (MI.getOperand(1).isReg()) {
9904 if (MI.getOperand(1).getSubReg())
9905 return false;
9906 SrcReg2 = MI.getOperand(1).getReg();
9907 CmpValue = 0;
9908 } else if (MI.getOperand(1).isImm()) {
9909 SrcReg2 = Register();
9910 CmpValue = MI.getOperand(1).getImm();
9911 } else {
9912 return false;
9913 }
9914 CmpMask = ~0;
9915 return true;
9916 case AMDGPU::S_CMPK_EQ_U32:
9917 case AMDGPU::S_CMPK_EQ_I32:
9918 case AMDGPU::S_CMPK_LG_U32:
9919 case AMDGPU::S_CMPK_LG_I32:
9920 case AMDGPU::S_CMPK_LT_U32:
9921 case AMDGPU::S_CMPK_LT_I32:
9922 case AMDGPU::S_CMPK_GT_U32:
9923 case AMDGPU::S_CMPK_GT_I32:
9924 case AMDGPU::S_CMPK_LE_U32:
9925 case AMDGPU::S_CMPK_LE_I32:
9926 case AMDGPU::S_CMPK_GE_U32:
9927 case AMDGPU::S_CMPK_GE_I32:
9928 SrcReg = MI.getOperand(0).getReg();
9929 SrcReg2 = Register();
9930 CmpValue = MI.getOperand(1).getImm();
9931 CmpMask = ~0;
9932 return true;
9933 }
9934
9935 return false;
9936}
9937
9939 Register SrcReg2, int64_t CmpMask,
9940 int64_t CmpValue,
9941 const MachineRegisterInfo *MRI) const {
9942 if (!SrcReg || SrcReg.isPhysical())
9943 return false;
9944
9945 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9946 return false;
9947
9948 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9949 this](int64_t ExpectedValue, unsigned SrcSize,
9950 bool IsReversible, bool IsSigned) -> bool {
9951 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9952 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9953 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9954 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9955 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9956 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9957 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9958 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9959 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9960 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9961 //
9962 // Signed ge/gt are not used for the sign bit.
9963 //
9964 // If result of the AND is unused except in the compare:
9965 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9966 //
9967 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9968 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9969 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9970 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9971 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9972 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9973
9974 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9975 if (!Def || Def->getParent() != CmpInstr.getParent())
9976 return false;
9977
9978 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9979 Def->getOpcode() != AMDGPU::S_AND_B64)
9980 return false;
9981
9982 int64_t Mask;
9983 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9984 if (MO->isImm())
9985 Mask = MO->getImm();
9986 else if (!getFoldableImm(MO, Mask))
9987 return false;
9988 Mask &= maxUIntN(SrcSize);
9989 return isPowerOf2_64(Mask);
9990 };
9991
9992 MachineOperand *SrcOp = &Def->getOperand(1);
9993 if (isMask(SrcOp))
9994 SrcOp = &Def->getOperand(2);
9995 else if (isMask(&Def->getOperand(2)))
9996 SrcOp = &Def->getOperand(1);
9997 else
9998 return false;
9999
10000 // A valid Mask is required to have a single bit set, hence a non-zero and
10001 // power-of-two value. This verifies that we will not do 64-bit shift below.
10002 assert(llvm::has_single_bit<uint64_t>(Mask) && "Invalid mask.");
10003 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
10004 if (IsSigned && BitNo == SrcSize - 1)
10005 return false;
10006
10007 ExpectedValue <<= BitNo;
10008
10009 bool IsReversedCC = false;
10010 if (CmpValue != ExpectedValue) {
10011 if (!IsReversible)
10012 return false;
10013 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
10014 if (!IsReversedCC)
10015 return false;
10016 }
10017
10018 Register DefReg = Def->getOperand(0).getReg();
10019 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
10020 return false;
10021
10022 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
10023 I != E; ++I) {
10024 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
10025 I->killsRegister(AMDGPU::SCC, &RI))
10026 return false;
10027 }
10028
10029 MachineOperand *SccDef =
10030 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
10031 SccDef->setIsDead(false);
10032 CmpInstr.eraseFromParent();
10033
10034 if (!MRI->use_nodbg_empty(DefReg)) {
10035 assert(!IsReversedCC);
10036 return true;
10037 }
10038
10039 // Replace the AND, whose result is now unused, with an S_BITCMP.
10040 MachineBasicBlock *MBB = Def->getParent();
10041
10042 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
10043 : AMDGPU::S_BITCMP1_B32
10044 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
10045 : AMDGPU::S_BITCMP1_B64;
10046
10047 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
10048 .add(*SrcOp)
10049 .addImm(BitNo);
10050 Def->eraseFromParent();
10051
10052 return true;
10053 };
10054
10055 switch (CmpInstr.getOpcode()) {
10056 default:
10057 break;
10058 case AMDGPU::S_CMP_EQ_U32:
10059 case AMDGPU::S_CMP_EQ_I32:
10060 case AMDGPU::S_CMPK_EQ_U32:
10061 case AMDGPU::S_CMPK_EQ_I32:
10062 return optimizeCmpAnd(1, 32, true, false);
10063 case AMDGPU::S_CMP_GE_U32:
10064 case AMDGPU::S_CMPK_GE_U32:
10065 return optimizeCmpAnd(1, 32, false, false);
10066 case AMDGPU::S_CMP_GE_I32:
10067 case AMDGPU::S_CMPK_GE_I32:
10068 return optimizeCmpAnd(1, 32, false, true);
10069 case AMDGPU::S_CMP_EQ_U64:
10070 return optimizeCmpAnd(1, 64, true, false);
10071 case AMDGPU::S_CMP_LG_U32:
10072 case AMDGPU::S_CMP_LG_I32:
10073 case AMDGPU::S_CMPK_LG_U32:
10074 case AMDGPU::S_CMPK_LG_I32:
10075 return optimizeCmpAnd(0, 32, true, false);
10076 case AMDGPU::S_CMP_GT_U32:
10077 case AMDGPU::S_CMPK_GT_U32:
10078 return optimizeCmpAnd(0, 32, false, false);
10079 case AMDGPU::S_CMP_GT_I32:
10080 case AMDGPU::S_CMPK_GT_I32:
10081 return optimizeCmpAnd(0, 32, false, true);
10082 case AMDGPU::S_CMP_LG_U64:
10083 return optimizeCmpAnd(0, 64, true, false);
10084 }
10085
10086 return false;
10087}
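A small host-side check (not part of this file) of why the compare folding above is sound; the helper name and the sampled values are illustrative.

#include <cassert>
#include <cstdint>

// Sketch only: for a single-bit mask m, comparing (x & m) against m or 0 is
// fully determined by whether the AND result is zero, which is exactly what
// SCC records for s_and_b32/s_and_b64. The compare can therefore be deleted,
// and the AND replaced by an s_bitcmp when its result is otherwise unused.
static void checkSingleBitMaskFold() {
  for (unsigned N = 0; N < 32; ++N) {
    uint32_t M = 1u << N;
    const uint32_t Vals[] = {0, 1, M, M - 1, ~0u, 0x12345678};
    for (uint32_t X : Vals) {
      bool CmpEqM = (X & M) == M; // s_cmp_eq_u32 (s_and_b32 x, m), m
      bool CmpLg0 = (X & M) != 0; // s_cmp_lg_u32 (s_and_b32 x, m), 0
      bool AndScc = (X & M) != 0; // SCC produced by the s_and_b32 itself
      assert(CmpEqM == AndScc && CmpLg0 == AndScc);
    }
  }
}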
10088
10090 unsigned OpName) const {
10091 if (!ST.needsAlignedVGPRs())
10092 return;
10093
10094 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
10095 if (OpNo < 0)
10096 return;
10097 MachineOperand &Op = MI.getOperand(OpNo);
10098 if (getOpSize(MI, OpNo) > 4)
10099 return;
10100
10101 // Add implicit aligned super-reg to force alignment on the data operand.
10102 const DebugLoc &DL = MI.getDebugLoc();
10103 MachineBasicBlock *BB = MI.getParent();
10105 Register DataReg = Op.getReg();
10106 bool IsAGPR = RI.isAGPR(MRI, DataReg);
10107 Register Undef = MRI.createVirtualRegister(
10108 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
10109 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
10110 Register NewVR =
10111 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
10112 : &AMDGPU::VReg_64_Align2RegClass);
10113 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
10114 .addReg(DataReg, 0, Op.getSubReg())
10115 .addImm(AMDGPU::sub0)
10116 .addReg(Undef)
10117 .addImm(AMDGPU::sub1);
10118 Op.setReg(NewVR);
10119 Op.setSubReg(AMDGPU::sub0);
10120 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
10121}
10122
10124 if (isIGLP(*MI))
10125 return false;
10126
10128}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
TargetInstrInfo::RegSubRegPair RegSubRegPair
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
#define GENERATE_RENAMED_GFX9_CASES(OPCODE)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static MachineInstr * swapImmOperands(MachineInstr &MI, MachineOperand &NonRegOp1, MachineOperand &NonRegOp2)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:82
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static bool isRenamedInGFX9(int Opcode)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:73
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:78
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1542
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:171
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
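The ArrayRef accessors above (front, size, empty) give a non-owning, read-only view of a contiguous sequence. A tiny sketch under that assumption; the helper name is illustrative:

#include "llvm/ADT/ArrayRef.h"
#include <cstdint>

using namespace llvm;

// Illustrative helper: front() may only be called on a non-empty ArrayRef,
// so guard it with empty() first.
static int64_t firstOrZero(ArrayRef<int64_t> Values) {
  return Values.empty() ? 0 : Values.front();
}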
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:759
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:763
bool hasScalarCompareEq64() const
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:401
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:641
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:779
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasVALUMaskWriteHazard() const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:313
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:775
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:694
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:767
bool hasVALUReadSGPRHazard() const
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:354
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasNoF16PseudoScalarTransInlineConstants() const
Generation getGeneration() const
Definition: GCNSubtarget.h:327
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:946
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:746
bool hasAddr64() const
Definition: GCNSubtarget.h:391
bool isWave64() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:738
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition: LiveInterval.h:687
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
bool shrinkToUses(LiveInterval *li, SmallVectorImpl< MachineInstr * > *dead=nullptr)
After removing some uses of a register, shrink its live range to just the remaining uses.
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:542
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:612
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:622
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:222
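The MCExpr factory methods above are the pieces used to describe a branch offset symbolically at the MC layer. A minimal sketch assuming two already-created labels; the helper name and the 4-byte adjustment are illustrative, not taken from this file:

#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"

using namespace llvm;

// Hypothetical helper: build the expression (Dest - Src - 4) from two labels.
static const MCExpr *buildLabelDiffMinus4(MCSymbol *DestLabel,
                                          MCSymbol *SrcLabel, MCContext &Ctx) {
  const MCExpr *Dest = MCSymbolRefExpr::create(DestLabel, Ctx);
  const MCExpr *Src = MCSymbolRefExpr::create(SrcLabel, Ctx);
  const MCExpr *Diff = MCBinaryExpr::createSub(Dest, Src, Ctx);
  return MCBinaryExpr::createSub(Diff, MCConstantExpr::create(4, Ctx), Ctx);
}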
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:248
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:398
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
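The MachineInstrBuilder methods listed above are normally chained off BuildMI when constructing a new instruction. A minimal sketch assuming the caller already has a TargetInstrInfo reference and a move opcode; the helper name and its parameters are illustrative:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"

using namespace llvm;

// Hypothetical helper: emit "DestReg = <MovOpc> SrcReg" before I, marking the
// source register as killed.
static void emitMoveBefore(const TargetInstrInfo &TII, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           unsigned MovOpc, Register DestReg, Register SrcReg) {
  BuildMI(MBB, I, DL, TII.get(MovOpc), DestReg)  // def operand added first
      .addReg(SrcReg, getKillRegState(true));    // then the use operand
}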
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:578
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:697
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:821
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:806
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:788
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:499
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:705
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:392
int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
unsigned getOperandNo() const
Returns the index of this operand in the instruction that it belongs to.
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
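The MachineOperand mutators above rewrite an operand in place rather than rebuilding the instruction. A small sketch of folding a known constant, assuming the caller has already proven the value; the helper name is hypothetical:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"

using namespace llvm;

// Hypothetical fold: rewrite operand OpIdx of MI to the immediate Imm.
static void foldKnownImmediate(MachineInstr &MI, unsigned OpIdx, int64_t Imm) {
  MachineOperand &MO = MI.getOperand(OpIdx);
  if (MO.isReg())
    MO.ChangeToImmediate(Imm); // replace the register operand with MO_Immediate
  else if (MO.isImm())
    MO.setImm(Imm);            // already an immediate, just update the value
}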
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:115
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:801
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:563
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:513
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:933
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given.
Definition: SIInstrInfo.h:1174
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
bool isIGLP(unsigned Opcode) const
Definition: SIInstrInfo.h:974
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:645
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:553
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1306
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:545
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:658
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:417
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:505
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:521
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:613
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:529
void removeModOperands(MachineInstr &MI) const
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, bool RenamableDest=false, bool RenamableSrc=false) const override
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:597
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:457
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:579
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
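getNamedOperand resolves an operand by its TableGen name rather than by position, and returns nullptr when the opcode has no such operand. A small sketch assuming AMDGPU::OpName::src0 names the first source operand; the helper name is illustrative:

#include "SIInstrInfo.h"

using namespace llvm;

// Hypothetical query: does the first source operand of MI exist and carry an
// immediate value?
static bool src0IsImmediate(const SIInstrInfo &TII, MachineInstr &MI) {
  const MachineOperand *Src0 = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
  return Src0 && Src0->isImm(); // nullptr when the opcode has no src0 operand
}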
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:637
bool isGlobalMemoryObject(const MachineInstr *MI) const override
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:605
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:433
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:473
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:981
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:537
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:627
static bool isF16PseudoScalarTrans(unsigned Opcode)
Definition: SIInstrInfo.h:958
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:769
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:725
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns whether Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
static bool isWWMRegSpillOpcode(uint16_t Opcode)
Definition: SIInstrInfo.h:757
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:1027
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:589
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg, MachineInstr::MIFlag Flags=MachineInstr::NoFlags) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description or operand ind...
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI, const TargetRegisterClass *DstRC=nullptr) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool isLegalToSwap(const MachineInstr &MI, unsigned fromIdx, const MachineOperand *fromMO, unsigned toIdx, const MachineOperand *toMO) const
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:690
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:872
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:737
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:818
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:465
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:621
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:425
bool isBarrier(unsigned Opcode) const
Definition: SIInstrInfo.h:948
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1319
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:889
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:571
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:497
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:63
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool isWWMReg(Register Reg) const
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
bool isVectorRegister(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:65
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:237
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
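insertMachineInstrInMaps is how a newly created instruction gets registered when a pass keeps SlotIndexes up to date. A minimal sketch under that assumption; the helper name is illustrative:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/SlotIndexes.h"

using namespace llvm;

// Hypothetical pattern: passes that optionally track SlotIndexes register a
// freshly inserted instruction, and skip the update when no indexes are kept.
static void registerNewInstr(SlotIndexes *Indexes, MachineInstr &NewMI) {
  if (Indexes)
    Indexes->insertMachineInstrInMaps(NewMI);
}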
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool isGlobalMemoryObject(const MachineInstr *MI) const
Returns true if MI is an instruction we are unable to reason about (like a call or something with unm...
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst)
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1594
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1595
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1597
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:470
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:472
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:469
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:471
@ TI_CONSTDATA_START
Definition: AMDGPU.h:468
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1596
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1480
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
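llvm::all_of and llvm::any_of are the range wrappers used throughout this file for operand and memory-operand scans. A minimal sketch over an instruction's explicit operands; the predicate is illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Illustrative predicate: every explicit operand is either a register or an
// immediate.
static bool allOperandsRegOrImm(const MachineInstr &MI) {
  return llvm::all_of(MI.explicit_operands(), [](const MachineOperand &MO) {
    return MO.isReg() || MO.isImm();
  });
}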
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
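A minimal sketch of make_early_inc_range used to delete from a range while iterating it; removing debug instructions from a block is an illustrative assumption, not something this file does.

#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

// Erase each debug instruction; the iterator is advanced before the body
// runs, so eraseFromParent() does not invalidate the loop.
static void eraseDebugInstrs(llvm::MachineBasicBlock &MBB) {
  for (llvm::MachineInstr &MI : llvm::make_early_inc_range(MBB))
    if (MI.isDebugInstr())
      MI.eraseFromParent();
}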
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew modulo Align.
Definition: MathExtras.h:557
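A minimal sketch of alignDown with made-up numbers, including the optional Skew argument.

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void alignDownDemo() {
  assert(llvm::alignDown(23u, 16u) == 16u);     // largest multiple of 16 <= 23
  assert(llvm::alignDown(23u, 16u, 4u) == 20u); // largest value <= 23 that is 4 (mod 16)
}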
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:298
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
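A minimal sketch combining isPowerOf2_64 (listed above) with countr_zero on a made-up value: for a power of two, the trailing-zero count is its exponent.

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void powerOfTwoDemo() {
  uint64_t V = 64;
  assert(llvm::isPowerOf2_64(V));    // exactly one bit set
  assert(llvm::countr_zero(V) == 6); // 64 == 1 << 6
}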
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the start of kernel execution.
Definition: SIInstrInfo.h:43
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
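A minimal sketch of Log2_32 on made-up values.

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void log2Demo() {
  assert(llvm::Log2_32(1) == 0);
  assert(llvm::Log2_32(40) == 5); // floor(log2(40))
}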
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:156
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subreg-manipulation pseudos.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:161
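A minimal sketch showing Hi_32 (listed above) and Lo_32 splitting a made-up 64-bit constant into its two 32-bit halves.

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void splitImm64Demo() {
  uint64_t Imm = 0x1234ABCD00FF00FFull;
  assert(llvm::Hi_32(Imm) == 0x1234ABCDu); // upper 32 bits
  assert(llvm::Lo_32(Imm) == 0x00FF00FFu); // lower 32 bits
}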
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:405
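A minimal sketch of divideCeil with made-up operands.

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void divideCeilDemo() {
  assert(llvm::divideCeil(10u, 4u) == 3u); // ceil(10 / 4)
  assert(llvm::divideCeil(8u, 4u) == 2u);  // exact division is unchanged
}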
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:262
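A minimal sketch of isIntN with made-up values, e.g. checking whether a constant fits a signed 16-bit field.

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void isIntNDemo() {
  assert(llvm::isIntN(16, 32767));  // in [-32768, 32767]
  assert(!llvm::isIntN(16, 40000)); // too large for 16 signed bits
}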
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition: SIInstrInfo.h:39
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:47
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence, to answer the question: if the same instruction is executed by two threads in a convergent set of threads, will its result value(s) be uniform, i.e. the same on both threads?
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition: MathExtras.h:221
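A minimal sketch of maxUIntN with made-up widths.

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void maxUIntNDemo() {
  assert(llvm::maxUIntN(8) == 255u);
  assert(llvm::maxUIntN(16) == 65535u);
}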
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:210
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks through which this value is completely alive.
Definition: LiveVariables.h:83
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:51
MachineInstr * top() const
Definition: SIInstrInfo.h:56
bool empty() const
Definition: SIInstrInfo.h:66
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:75
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.