Skip to content

Commit 721cba4

Browse files
frederik-h and arsenm authored
[AMDGPU] SIPeepholeSDWA: Handle V_CNDMASK_B32_e64 (#137930)
The VOP3 form of the V_CNDMASK_B32 instruction takes a carry-in operand. The conversion to SDWA implies a conversion to VOP2 form which reads from VCC instead. Convert V_CNDMASK_B32_e64 instructions that might be converted to SDWA to V_CNDMASK_B32_e32 first and introduce a copy of the carry-in operand to VCC. Closes #133431. --------- Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
1 parent 9c9013f commit 721cba4

21 files changed

+2439
-1885
lines changed

llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp

+68-9
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ class SIPeepholeSDWA {
6262
std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
6363
void pseudoOpConvertToVOP2(MachineInstr &MI,
6464
const GCNSubtarget &ST) const;
65+
void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
6566
MachineInstr *createSDWAVersion(MachineInstr &MI);
6667
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
6768
void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
@@ -1037,7 +1038,8 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
10371038
return;
10381039
// Make sure VCC or its subregs are dead before MI.
10391040
MachineBasicBlock &MBB = *MI.getParent();
1040-
auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
1041+
MachineBasicBlock::LivenessQueryResult Liveness =
1042+
MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
10411043
if (Liveness != MachineBasicBlock::LQR_Dead)
10421044
return;
10431045
// Check if VCC is referenced in range of (MI,MISucc].
@@ -1061,6 +1063,52 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
10611063
MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
10621064
}
10631065

1066+
/// Try to convert an \p MI in VOP3 which takes an src2 carry-in
1067+
/// operand into the corresponding VOP2 form which expects the
1068+
/// argument in VCC. To this end, add an copy from the carry-in to
1069+
/// VCC. The conversion will only be applied if \p MI can be shrunk
1070+
/// to VOP2 and if VCC can be proven to be dead before \p MI.
1071+
void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
1072+
const GCNSubtarget &ST) const {
1073+
assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
1074+
1075+
LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
1076+
if (!TII->canShrink(MI, *MRI)) {
1077+
LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
1078+
return;
1079+
}
1080+
1081+
const MachineOperand &CarryIn =
1082+
*TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1083+
Register CarryReg = CarryIn.getReg();
1084+
MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
1085+
if (!CarryDef) {
1086+
LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
1087+
return;
1088+
}
1089+
1090+
// Make sure VCC or its subregs are dead before MI.
1091+
MCRegister Vcc = TRI->getVCC();
1092+
MachineBasicBlock &MBB = *MI.getParent();
1093+
MachineBasicBlock::LivenessQueryResult Liveness =
1094+
MBB.computeRegisterLiveness(TRI, Vcc, MI);
1095+
if (Liveness != MachineBasicBlock::LQR_Dead) {
1096+
LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
1097+
return;
1098+
}
1099+
1100+
BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);
1101+
1102+
auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
1103+
TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
1104+
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
1105+
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
1106+
.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
1107+
.setMIFlags(MI.getFlags());
1108+
LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
1109+
MI.eraseFromParent();
1110+
}
1111+
10641112
namespace {
10651113
bool isConvertibleToSDWA(MachineInstr &MI,
10661114
const GCNSubtarget &ST,
@@ -1070,6 +1118,11 @@ bool isConvertibleToSDWA(MachineInstr &MI,
10701118
if (TII->isSDWA(Opc))
10711119
return true;
10721120

1121+
// Can only be handled after earlier conversion to
1122+
// AMDGPU::V_CNDMASK_B32_e32 which is not always possible.
1123+
if (Opc == AMDGPU::V_CNDMASK_B32_e64)
1124+
return false;
1125+
10731126
// Check if this instruction has opcode that supports SDWA
10741127
if (AMDGPU::getSDWAOp(Opc) == -1)
10751128
Opc = AMDGPU::getVOPe32(Opc);
@@ -1108,10 +1161,6 @@ bool isConvertibleToSDWA(MachineInstr &MI,
11081161
if (TII->pseudoToMCOpcode(Opc) == -1)
11091162
return false;
11101163

1111-
// FIXME: has SDWA but require handling of implicit VCC use
1112-
if (Opc == AMDGPU::V_CNDMASK_B32_e32)
1113-
return false;
1114-
11151164
if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
11161165
if (!Src0->isReg() && !Src0->isImm())
11171166
return false;
@@ -1266,7 +1315,9 @@ MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
12661315
SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
12671316
}
12681317

1269-
return SDWAInst.getInstr();
1318+
MachineInstr *Ret = SDWAInst.getInstr();
1319+
TII->fixImplicitOperands(*Ret);
1320+
return Ret;
12701321
}
12711322

12721323
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
@@ -1384,10 +1435,18 @@ bool SIPeepholeSDWA::run(MachineFunction &MF) {
13841435
for (const auto &OperandPair : SDWAOperands) {
13851436
const auto &Operand = OperandPair.second;
13861437
MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
1387-
if (PotentialMI &&
1388-
(PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
1389-
PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
1438+
if (!PotentialMI)
1439+
continue;
1440+
1441+
switch (PotentialMI->getOpcode()) {
1442+
case AMDGPU::V_ADD_CO_U32_e64:
1443+
case AMDGPU::V_SUB_CO_U32_e64:
13901444
pseudoOpConvertToVOP2(*PotentialMI, ST);
1445+
break;
1446+
case AMDGPU::V_CNDMASK_B32_e64:
1447+
convertVcndmaskToVOP2(*PotentialMI, ST);
1448+
break;
1449+
};
13911450
}
13921451
SDWAOperands.clear();
13931452

llvm/test/CodeGen/AMDGPU/bf16.ll

+74-101
Original file line numberDiff line numberDiff line change
@@ -38481,10 +38481,7 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
3848138481
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
3848238482
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
3848338483
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
38484-
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
38485-
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
38486-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
38487-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
38484+
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3848838485
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3848938486
; GFX8-NEXT: s_setpc_b64 s[30:31]
3849038487
;
@@ -38494,9 +38491,7 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
3849438491
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
3849538492
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
3849638493
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
38497-
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
38498-
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
38499-
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
38494+
; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3850038495
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
3850138496
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
3850238497
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -38505,11 +38500,9 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
3850538500
; GFX10: ; %bb.0:
3850638501
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3850738502
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
38508-
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v1
38509-
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
3851038503
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
3851138504
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
38512-
; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo
38505+
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3851338506
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
3851438507
; GFX10-NEXT: s_setpc_b64 s[30:31]
3851538508
;
@@ -38577,44 +38570,37 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
3857738570
; GFX8-LABEL: v_vselect_v2bf16:
3857838571
; GFX8: ; %bb.0:
3857938572
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38580-
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
3858138573
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
38582-
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
38583-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
38584-
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
38585-
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
38574+
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
3858638575
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
38587-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
38588-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
38576+
; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
38577+
; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
38578+
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3858938579
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3859038580
; GFX8-NEXT: s_setpc_b64 s[30:31]
3859138581
;
3859238582
; GFX9-LABEL: v_vselect_v2bf16:
3859338583
; GFX9: ; %bb.0:
3859438584
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38595-
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
3859638585
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
38597-
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
38598-
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
38599-
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
38600-
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
38586+
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
3860138587
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
38602-
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
38588+
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
38589+
; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
38590+
; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
3860338591
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
3860438592
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
3860538593
; GFX9-NEXT: s_setpc_b64 s[30:31]
3860638594
;
3860738595
; GFX10-LABEL: v_vselect_v2bf16:
3860838596
; GFX10: ; %bb.0:
3860938597
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38610-
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
3861138598
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
38612-
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
38613-
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v3
38614-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
38615-
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
38599+
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
3861638600
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
38617-
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
38601+
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v0
38602+
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
38603+
; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v2, s4
3861838604
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
3861938605
; GFX10-NEXT: s_setpc_b64 s[30:31]
3862038606
;
@@ -38771,13 +38757,12 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
3877138757
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
3877238758
; GFX8-NEXT: v_mov_b32_e32 v1, s3
3877338759
; GFX8-NEXT: v_mov_b32_e32 v2, s2
38760+
; GFX8-NEXT: v_mov_b32_e32 v3, s1
38761+
; GFX8-NEXT: v_mov_b32_e32 v4, s0
3877438762
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
38775-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
38776-
; GFX8-NEXT: v_mov_b32_e32 v1, s1
38777-
; GFX8-NEXT: v_mov_b32_e32 v2, s0
38778-
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
38779-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
38780-
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
38763+
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
38764+
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
38765+
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3878138766
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
3878238767
; GFX8-NEXT: ; return to shader part epilog
3878338768
;
@@ -38882,14 +38867,13 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
3888238867
; GFX8: ; %bb.0:
3888338868
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
3888438869
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
38885-
; GFX8-NEXT: v_mov_b32_e32 v2, s3
38886-
; GFX8-NEXT: v_mov_b32_e32 v3, s2
3888738870
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
38888-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
38871+
; GFX8-NEXT: v_mov_b32_e32 v1, s3
38872+
; GFX8-NEXT: v_mov_b32_e32 v2, s2
38873+
; GFX8-NEXT: v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3888938874
; GFX8-NEXT: v_mov_b32_e32 v2, s1
3889038875
; GFX8-NEXT: v_mov_b32_e32 v3, s0
3889138876
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
38892-
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
3889338877
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
3889438878
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
3889538879
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
@@ -40792,48 +40776,42 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
4079240776
; GFX9-LABEL: v_vselect_v4bf16:
4079340777
; GFX9: ; %bb.0:
4079440778
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40795-
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
40796-
; GFX9-NEXT: v_and_b32_e32 v3, 1, v3
40797-
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
40798-
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
40799-
; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc
40800-
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
40801-
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7
40802-
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
4080340779
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
40804-
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
40780+
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
40781+
; GFX9-NEXT: v_and_b32_e32 v1, 1, v3
40782+
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
40783+
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
40784+
; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
40785+
; GFX9-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4080540786
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
40787+
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1
4080640788
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc
40807-
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
40808-
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v6
40809-
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
40810-
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
40789+
; GFX9-NEXT: s_mov_b64 vcc, s[4:5]
40790+
; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7]
40791+
; GFX9-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4081140792
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
40812-
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
40813-
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4
40793+
; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4
40794+
; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4
4081440795
; GFX9-NEXT: s_setpc_b64 s[30:31]
4081540796
;
4081640797
; GFX10-LABEL: v_vselect_v4bf16:
4081740798
; GFX10: ; %bb.0:
4081840799
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40819-
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
40820-
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
40821-
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
4082240800
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
40823-
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v4
40824-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
40825-
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v6
40826-
; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc_lo
40827-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
40828-
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
40829-
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
40830-
; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
40831-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
40832-
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
40801+
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
40802+
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
4083340803
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
40834-
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
40835-
; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo
40836-
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
40804+
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1
40805+
; GFX10-NEXT: v_and_b32_e32 v1, 1, v2
40806+
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v0
40807+
; GFX10-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
40808+
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
40809+
; GFX10-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
40810+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
40811+
; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v4, s5
40812+
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc_lo
40813+
; GFX10-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
40814+
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
4083740815
; GFX10-NEXT: s_setpc_b64 s[30:31]
4083840816
;
4083940817
; GFX11TRUE16-LABEL: v_vselect_v4bf16:
@@ -41081,42 +41059,37 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
4108141059
; GFX10-LABEL: v_vselect_v8bf16:
4108241060
; GFX10: ; %bb.0:
4108341061
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41084-
; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
41085-
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
41086-
; GFX10-NEXT: v_and_b32_e32 v5, 1, v5
41087-
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
41088-
; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v10
41089-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
41090-
; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v14
41091-
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
4109241062
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
41093-
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
41094-
; GFX10-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc_lo
41095-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
4109641063
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
41097-
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
41098-
; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
41099-
; GFX10-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc_lo
41100-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
41101-
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v8
41102-
; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v12
41103-
; GFX10-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo
41104-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
41105-
; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo
41064+
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
41065+
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
41066+
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
41067+
; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v1
41068+
; GFX10-NEXT: v_and_b32_e32 v1, 1, v5
41069+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
41070+
; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v3
41071+
; GFX10-NEXT: v_and_b32_e32 v3, 1, v6
41072+
; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 1, v1
41073+
; GFX10-NEXT: v_and_b32_e32 v1, 1, v4
41074+
; GFX10-NEXT: v_cndmask_b32_sdwa v4, v15, v11, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
41075+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
41076+
; GFX10-NEXT: v_cndmask_b32_e32 v5, v14, v10, vcc_lo
41077+
; GFX10-NEXT: s_mov_b32 vcc_lo, s6
41078+
; GFX10-NEXT: v_cndmask_b32_sdwa v6, v14, v10, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
41079+
; GFX10-NEXT: s_mov_b32 vcc_lo, s5
41080+
; GFX10-NEXT: v_cndmask_b32_sdwa v1, v13, v9, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4110641081
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
41107-
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
41108-
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
4110941082
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc_lo
41110-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
41111-
; GFX10-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo
41083+
; GFX10-NEXT: s_mov_b32 vcc_lo, s4
41084+
; GFX10-NEXT: v_cndmask_b32_sdwa v7, v12, v8, vcc_lo dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
41085+
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
41086+
; GFX10-NEXT: v_perm_b32 v0, v7, v0, 0x5040100
41087+
; GFX10-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo
4111241088
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
41113-
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
41114-
; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo
41115-
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
41116-
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
41117-
; GFX10-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo
41118-
; GFX10-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
41119-
; GFX10-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
41089+
; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
41090+
; GFX10-NEXT: v_cndmask_b32_e32 v3, v15, v11, vcc_lo
41091+
; GFX10-NEXT: v_perm_b32 v2, v6, v5, 0x5040100
41092+
; GFX10-NEXT: v_perm_b32 v3, v4, v3, 0x5040100
4112041093
; GFX10-NEXT: s_setpc_b64 s[30:31]
4112141094
;
4112241095
; GFX11TRUE16-LABEL: v_vselect_v8bf16:

0 commit comments

Comments
 (0)