[llvm-exegesis] parallel snippet generator: avoid Read-After-Write pitfail for instrs w/ tied variables

As it is being discussed in https://github.com/llvm/llvm-project/issues/59325,
at least for the instructions with tied variables,
when trying to parallelize the instructions,
register selection is rather bad, and may either
use a register which we have used for def,
or vice versa.

That introduces serialization, and leads to
overly pessimistic inverse throughput measurement.

The new implementation avoids that,

New result:
```
$ ninja llvm-exegesis && ./bin/llvm-exegesis --mode=inverse_throughput --opcode-name=VFMADD132PDr --max-configs-per-opcode=9182
ninja: no work to do.
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-4af034.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VFMADD132PDr XMM3 XMM3 XMM4 XMM8'
    - 'VFMADD132PDr XMM5 XMM5 XMM14 XMM7'
    - 'VFMADD132PDr XMM10 XMM10 XMM11 XMM15'
    - 'VFMADD132PDr XMM13 XMM13 XMM15 XMM15'
    - 'VFMADD132PDr XMM12 XMM12 XMM11 XMM1'
    - 'VFMADD132PDr XMM0 XMM0 XMM6 XMM9'
    - 'VFMADD132PDr XMM2 XMM2 XMM15 XMM11'
  config:          ''
  register_initial_values:
    - 'XMM3=0x0'
    - 'XMM4=0x0'
    - 'XMM8=0x0'
    - 'MXCSR=0x0'
    - 'XMM5=0x0'
    - 'XMM14=0x0'
    - 'XMM7=0x0'
    - 'XMM10=0x0'
    - 'XMM11=0x0'
    - 'XMM15=0x0'
    - 'XMM13=0x0'
    - 'XMM12=0x0'
    - 'XMM1=0x0'
    - 'XMM0=0x0'
    - 'XMM6=0x0'
    - 'XMM9=0x0'
    - 'XMM2=0x0'
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:
  - { key: inverse_throughput, value: 0.6403, per_snippet_value: 4.4821 }
error:           ''
info:            instruction has tied variables, avoiding Read-After-Write issue, picking random def and use registers not aliasing each other, randomizing registers for uses
assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F1C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F24244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F04244883C4104883EC04C70424801F0000C5F8AE14244883C4044883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F2C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F34244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F14244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F1C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F2C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F24244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F0C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F04244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F34244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F0C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F14244883C410C4C2D998D8C4E28998EFC442A198D7C4428198EFC462A198E1C4C2C998C1C4C28198D3C4C2D998D8C4E28998EFC442A198D7C4428198EFC462A198E1C4C2C998C1C4C28198D3C4C2D998D8C4E28998EFC442A198D7C4428198EFC462A198E1C4C2C998C1C4C28198D3C4C2D998D8C4E28998EFC442A198D7C4428198EFC462A198E1C4C2C998C1C4C28198D3C3
...
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-f05c2f.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VFMADD132PDr XMM15 XMM15 XMM11 XMM2'
    - 'VFMADD132PDr XMM5 XMM5 XMM11 XMM2'
    - 'VFMADD132PDr XMM14 XMM14 XMM11 XMM2'
    - 'VFMADD132PDr XMM4 XMM4 XMM11 XMM2'
    - 'VFMADD132PDr XMM8 XMM8 XMM11 XMM2'
    - 'VFMADD132PDr XMM3 XMM3 XMM11 XMM2'
    - 'VFMADD132PDr XMM10 XMM10 XMM11 XMM2'
    - 'VFMADD132PDr XMM7 XMM7 XMM11 XMM2'
    - 'VFMADD132PDr XMM13 XMM13 XMM11 XMM2'
    - 'VFMADD132PDr XMM9 XMM9 XMM11 XMM2'
    - 'VFMADD132PDr XMM1 XMM1 XMM11 XMM2'
    - 'VFMADD132PDr XMM6 XMM6 XMM11 XMM2'
    - 'VFMADD132PDr XMM0 XMM0 XMM11 XMM2'
    - 'VFMADD132PDr XMM12 XMM12 XMM11 XMM2'
  config:          ''
  register_initial_values:
    - 'XMM15=0x0'
    - 'XMM11=0x0'
    - 'XMM2=0x0'
    - 'MXCSR=0x0'
    - 'XMM5=0x0'
    - 'XMM14=0x0'
    - 'XMM4=0x0'
    - 'XMM8=0x0'
    - 'XMM3=0x0'
    - 'XMM10=0x0'
    - 'XMM7=0x0'
    - 'XMM13=0x0'
    - 'XMM9=0x0'
    - 'XMM1=0x0'
    - 'XMM6=0x0'
    - 'XMM0=0x0'
    - 'XMM12=0x0'
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:
  - { key: inverse_throughput, value: 0.5312, per_snippet_value: 7.4368 }
error:           ''
info:            instruction has tied variables, avoiding Read-After-Write issue, picking random def and use registers not aliasing each other, one unique register for each use position
assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F1C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F14244883C4104883EC04C70424801F0000C5F8AE14244883C4044883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F2C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F34244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F24244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F04244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F1C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F14244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F2C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F0C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F0C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F34244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F04244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F24244883C410C462A198FAC4E2A198EAC462A198F2C4E2A198E2C462A198C2C4E2A198DAC462A198D2C4E2A198FAC462A198EAC462A198CAC4E2A198CAC4E2A198F2C4E2A198C2C462A198E2C462A198FAC4E2A198EAC462A198F2C4E2A198E2C462A198C2C4E2A198DAC462A198D2C4E2A198FAC462A198EAC462A198CAC4E2A198CAC4E2A198F2C4E2A198C2C462A198E2C462A198FAC4E2A198EAC462A198F2C4E2A198E2C462A198C2C4E2A198DAC462A198D2C4E2A198FAC462A198EAC462A198CAC4E2A198CAC4E2A198F2C4E2A198C2C462A198E2C462A198FAC4E2A198EAC462A198F2C4E2A198E2C462A198C2C4E2A198DAC462A198D2C4E2A198FAC462A198EAC462A198CAC4E2A198CAC4E2A198F2C4E2A198C2C462A198E2C3
...
Check generated assembly with: /usr/bin/objdump -d /tmp/snippet-c32060.o
---
mode:            inverse_throughput
key:
  instructions:
    - 'VFMADD132PDr XMM10 XMM10 XMM6 XMM6'
    - 'VFMADD132PDr XMM8 XMM8 XMM6 XMM6'
    - 'VFMADD132PDr XMM12 XMM12 XMM6 XMM6'
    - 'VFMADD132PDr XMM9 XMM9 XMM6 XMM6'
    - 'VFMADD132PDr XMM7 XMM7 XMM6 XMM6'
    - 'VFMADD132PDr XMM1 XMM1 XMM6 XMM6'
    - 'VFMADD132PDr XMM0 XMM0 XMM6 XMM6'
    - 'VFMADD132PDr XMM5 XMM5 XMM6 XMM6'
    - 'VFMADD132PDr XMM11 XMM11 XMM6 XMM6'
    - 'VFMADD132PDr XMM2 XMM2 XMM6 XMM6'
    - 'VFMADD132PDr XMM15 XMM15 XMM6 XMM6'
    - 'VFMADD132PDr XMM3 XMM3 XMM6 XMM6'
    - 'VFMADD132PDr XMM14 XMM14 XMM6 XMM6'
    - 'VFMADD132PDr XMM4 XMM4 XMM6 XMM6'
    - 'VFMADD132PDr XMM13 XMM13 XMM6 XMM6'
  config:          ''
  register_initial_values:
    - 'XMM10=0x0'
    - 'XMM6=0x0'
    - 'MXCSR=0x0'
    - 'XMM8=0x0'
    - 'XMM12=0x0'
    - 'XMM9=0x0'
    - 'XMM7=0x0'
    - 'XMM1=0x0'
    - 'XMM0=0x0'
    - 'XMM5=0x0'
    - 'XMM11=0x0'
    - 'XMM2=0x0'
    - 'XMM15=0x0'
    - 'XMM3=0x0'
    - 'XMM14=0x0'
    - 'XMM4=0x0'
    - 'XMM13=0x0'
cpu_name:        znver3
llvm_triple:     x86_64-unknown-linux-gnu
num_repetitions: 10000
measurements:
  - { key: inverse_throughput, value: 0.5311, per_snippet_value: 7.9665 }
error:           ''
info:            instruction has tied variables, avoiding Read-After-Write issue, picking random def and use registers not aliasing each other, reusing the same register for all uses
assembled_snippet: 4883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F14244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F34244883C4104883EC04C70424801F0000C5F8AE14244883C4044883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F04244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F24244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F0C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F0C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F04244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F2C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F1C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F14244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F3C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F1C244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F34244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C5FA6F24244883C4104883EC10C7042400000000C744240400000000C744240800000000C744240C00000000C57A6F2C244883C410C462C998D6C462C998C6C462C998E6C462C998CEC4E2C998FEC4E2C998CEC4E2C998C6C4E2C998EEC462C998DEC4E2C998D6C462C998FEC4E2C998DEC462C998F6C4E2C998E6C462C998EEC462C998D6C462C998C6C462C998E6C462C998CEC4E2C998FEC4E2C998CEC4E2C998C6C4E2C998EEC462C998DEC4E2C998D6C462C998FEC4E2C998DEC462C998F6C4E2C998E6C462C998EEC462C998D6C462C998C6C462C998E6C462C998CEC4E2C998FEC4E2C998CEC4E2C998C6C4E2C998EEC462C998DEC4E2C998D6C462C998FEC4E2C998DEC462C998F6C4E2C998E6C462C998EEC462C998D6C462C998C6C462C998E6C462C998CEC4E2C998FEC4E2C998CEC4E2C998C6C4E2C998EEC462C998DEC4E2C998D6C462C998FEC4E2C998DEC462C998F6C4E2C998E6C462C998EEC3
...
```

Reviewed By: courbet

Differential Revision: https://reviews.llvm.org/D139283
This commit is contained in:
Roman Lebedev
2022-12-06 17:55:35 +03:00
parent 26e50c4c4d
commit 2ffe225d11
7 changed files with 343 additions and 163 deletions

View File

@@ -11,10 +11,19 @@
namespace llvm {
namespace exegesis {
CodeTemplate::CodeTemplate(const CodeTemplate &) = default;
CodeTemplate::CodeTemplate(CodeTemplate &&) = default;
CodeTemplate &CodeTemplate::operator=(CodeTemplate &&) = default;
CodeTemplate &CodeTemplate::operator=(const CodeTemplate &) = default;
CodeTemplate CodeTemplate::clone() const {
CodeTemplate CT = *this;
return CT;
}
InstructionTemplate::InstructionTemplate(const Instruction *Instr)
: Instr(Instr), VariableValues(Instr->Variables.size()) {}

View File

@@ -119,8 +119,8 @@ struct CodeTemplate {
CodeTemplate(CodeTemplate &&); // default
CodeTemplate &operator=(CodeTemplate &&); // default
CodeTemplate(const CodeTemplate &) = delete;
CodeTemplate &operator=(const CodeTemplate &) = delete;
CodeTemplate clone() const;
ExecutionMode Execution = ExecutionMode::UNKNOWN;
// See InstructionBenchmarkKey.::Config.
@@ -132,6 +132,10 @@ struct CodeTemplate {
// If the template uses the provided scratch memory, the register in which
// the pointer to this memory is passed in to the function.
unsigned ScratchSpacePointerInReg = 0;
private:
CodeTemplate(const CodeTemplate &); // default
CodeTemplate &operator=(const CodeTemplate &); // default
};
} // namespace exegesis

View File

@@ -79,13 +79,12 @@
namespace llvm {
namespace exegesis {
static SmallVector<const Variable *, 8>
getVariablesWithTiedOperands(const Instruction &Instr) {
static bool hasVariablesWithTiedOperands(const Instruction &Instr) {
SmallVector<const Variable *, 8> Result;
for (const auto &Var : Instr.Variables)
if (Var.hasTiedOperands())
Result.push_back(&Var);
return Result;
return true;
return false;
}
ParallelSnippetGenerator::~ParallelSnippetGenerator() = default;
@@ -114,43 +113,181 @@ void ParallelSnippetGenerator::instantiateMemoryOperands(
"not enough scratch space");
}
static std::vector<InstructionTemplate> generateSnippetUsingStaticRenaming(
const LLVMState &State, const InstructionTemplate &IT,
const ArrayRef<const Variable *> TiedVariables,
const BitVector &ForbiddenRegisters) {
std::vector<InstructionTemplate> Instructions;
// Assign registers to variables in a round-robin manner. This is simple but
// ensures that the most register-constrained variable does not get starved.
std::vector<BitVector> PossibleRegsForVar;
for (const Variable *Var : TiedVariables) {
assert(Var);
const Operand &Op = IT.getInstr().getPrimaryOperand(*Var);
assert(Op.isReg());
BitVector PossibleRegs = Op.getRegisterAliasing().sourceBits();
remove(PossibleRegs, ForbiddenRegisters);
PossibleRegsForVar.push_back(std::move(PossibleRegs));
enum class RegRandomizationStrategy : uint8_t {
PickRandomRegs,
SingleStaticRegPerOperand,
SingleStaticReg,
FIRST = PickRandomRegs,
LAST = SingleStaticReg,
};
} // namespace exegesis
template <> struct enum_iteration_traits<exegesis::RegRandomizationStrategy> {
static constexpr bool is_iterable = true;
};
namespace exegesis {
const char *getDescription(RegRandomizationStrategy S) {
switch (S) {
case RegRandomizationStrategy::PickRandomRegs:
return "randomizing registers";
case RegRandomizationStrategy::SingleStaticRegPerOperand:
return "one unique register for each position";
case RegRandomizationStrategy::SingleStaticReg:
return "reusing the same register for all positions";
}
SmallVector<int, 2> Iterators(TiedVariables.size(), 0);
while (true) {
InstructionTemplate TmpIT = IT;
// Find a possible register for each variable in turn, marking the
// register as taken.
for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) {
const int NextPossibleReg =
PossibleRegsForVar[VarId].find_next(Iterators[VarId]);
if (NextPossibleReg <= 0) {
return Instructions;
}
TmpIT.getValueFor(*TiedVariables[VarId]) =
MCOperand::createReg(NextPossibleReg);
// Bump iterator.
Iterators[VarId] = NextPossibleReg;
// Prevent other variables from using the register.
for (BitVector &OtherPossibleRegs : PossibleRegsForVar) {
OtherPossibleRegs.reset(NextPossibleReg);
llvm_unreachable("Unknown UseRegRandomizationStrategy enum");
}
static std::variant<std::nullopt_t, MCOperand, Register>
generateSingleRegisterForInstrAvoidingDefUseOverlap(
const LLVMState &State, const BitVector &ForbiddenRegisters,
const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
const BitVector &Uses, const BitVector &Defs, const InstructionTemplate &IT,
const Operand &Op, const ArrayRef<InstructionTemplate> Instructions,
RegRandomizationStrategy S) {
const Instruction &Instr = IT.getInstr();
assert(Op.isReg() && Op.isExplicit() && !Op.isMemory() &&
!IT.getValueFor(Op).isValid());
assert((!Op.isUse() || !Op.isTied()) &&
"Not expecting to see a tied use reg");
if (Op.isUse()) {
switch (S) {
case RegRandomizationStrategy::PickRandomRegs:
break;
case RegRandomizationStrategy::SingleStaticReg:
case RegRandomizationStrategy::SingleStaticRegPerOperand: {
if (!Instructions.empty())
return Instructions.front().getValueFor(Op);
if (S != RegRandomizationStrategy::SingleStaticReg)
break;
BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
if (std::optional<int> CommonBit =
getFirstCommonBit(PossibleRegisters, UseAliases))
return *CommonBit;
break;
}
}
}
BitVector PossibleRegisters = Op.getRegisterAliasing().sourceBits();
remove(PossibleRegisters, ForbiddenRegisters);
if (Op.isDef()) {
remove(PossibleRegisters, ImplicitUseAliases);
const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
remove(PossibleRegisters, UseAliases);
}
if (Op.isUse()) {
remove(PossibleRegisters, ImplicitDefAliases);
// NOTE: in general, using same reg for multiple Use's is fine.
if (S == RegRandomizationStrategy::SingleStaticRegPerOperand) {
const BitVector UseAliases = getAliasedBits(State.getRegInfo(), Uses);
remove(PossibleRegisters, UseAliases);
}
}
bool IsDefWithTiedUse =
Instr.Variables[Op.getVariableIndex()].hasTiedOperands();
if (Op.isUse() || IsDefWithTiedUse) {
// Now, important bit: if we have used some register for def,
// then we can not use that same register for *any* use,
// be it either an untied use, or an use tied to a def.
// But def-ing same regs is fine, as long as there are no uses!
const BitVector DefsAliases = getAliasedBits(State.getRegInfo(), Defs);
remove(PossibleRegisters, DefsAliases);
}
if (!PossibleRegisters.any())
return std::nullopt;
return randomBit(PossibleRegisters);
}
static std::optional<InstructionTemplate>
generateSingleSnippetForInstrAvoidingDefUseOverlap(
const LLVMState &State, const BitVector &ForbiddenRegisters,
const BitVector &ImplicitUseAliases, const BitVector &ImplicitDefAliases,
BitVector &Uses, BitVector &Defs, InstructionTemplate IT,
const ArrayRef<InstructionTemplate> Instructions,
RegRandomizationStrategy S) {
const Instruction &Instr = IT.getInstr();
for (const Operand &Op : Instr.Operands) {
if (!Op.isReg() || !Op.isExplicit() || Op.isMemory() ||
IT.getValueFor(Op).isValid())
continue;
assert((!Op.isUse() || !Op.isTied()) && "Will not get tied uses.");
std::variant<std::nullopt_t, MCOperand, Register> R =
generateSingleRegisterForInstrAvoidingDefUseOverlap(
State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
Uses, Defs, IT, Op, Instructions, S);
if (std::holds_alternative<std::nullopt_t>(R))
return {};
MCOperand MCOp;
if (std::holds_alternative<MCOperand>(R))
MCOp = std::get<MCOperand>(R);
else {
Register RandomReg = std::get<Register>(R);
if (Op.isDef())
Defs.set(RandomReg);
if (Op.isUse())
Uses.set(RandomReg);
MCOp = MCOperand::createReg(RandomReg);
}
IT.getValueFor(Op) = MCOp;
}
return IT;
}
static std::vector<InstructionTemplate>
generateSnippetForInstrAvoidingDefUseOverlap(
const LLVMState &State, const InstructionTemplate &IT,
RegRandomizationStrategy S, const BitVector &ForbiddenRegisters) {
// We don't want to accidentally serialize the instruction,
// so we must be sure that we don't pick a def that is an implicit use,
// or a use that is an implicit def, so record implicit regs now.
BitVector ImplicitUses(State.getRegInfo().getNumRegs());
BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
for (const auto &Op : IT.getInstr().Operands) {
if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
assert(Op.isImplicitReg() && "Not an implicit register operand?");
if (Op.isUse())
ImplicitUses.set(Op.getImplicitReg());
else {
assert(Op.isDef() && "Not a use and not a def?");
ImplicitDefs.set(Op.getImplicitReg());
}
}
Instructions.push_back(std::move(TmpIT));
}
const BitVector ImplicitUseAliases =
getAliasedBits(State.getRegInfo(), ImplicitUses);
const BitVector ImplicitDefAliases =
getAliasedBits(State.getRegInfo(), ImplicitDefs);
BitVector Defs(State.getRegInfo().getNumRegs());
BitVector Uses(State.getRegInfo().getNumRegs());
std::vector<InstructionTemplate> Instructions;
while (true) {
std::optional<InstructionTemplate> TmpIT =
generateSingleSnippetForInstrAvoidingDefUseOverlap(
State, ForbiddenRegisters, ImplicitUseAliases, ImplicitDefAliases,
Uses, Defs, IT, Instructions, S);
if (!TmpIT)
return Instructions;
Instructions.push_back(std::move(*TmpIT));
if (!hasVariablesWithTiedOperands(IT.getInstr()))
return Instructions;
assert(Instructions.size() <= 128 && "Stuck in endless loop?");
}
}
@@ -177,78 +314,41 @@ ParallelSnippetGenerator::generateCodeTemplates(
instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
return getSingleton(std::move(CT));
}
const auto TiedVariables = getVariablesWithTiedOperands(Instr);
if (!TiedVariables.empty()) {
CT.Info = "instruction has tied variables, using static renaming.";
CT.Instructions = generateSnippetUsingStaticRenaming(
State, Variant, TiedVariables, ForbiddenRegisters);
instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
return getSingleton(std::move(CT));
std::vector<CodeTemplate> Result;
bool HasTiedOperands = hasVariablesWithTiedOperands(Instr);
// If there are no tied operands, then we don't want to "saturate backedge",
// and the template we will produce will have only a single instruction.
unsigned NumUntiedUseRegs = count_if(Instr.Operands, [](const Operand &Op) {
return Op.isReg() && Op.isExplicit() && !Op.isMemory() && Op.isUse() &&
!Op.isTied();
});
SmallVector<RegRandomizationStrategy, 3> Strategies;
if (HasTiedOperands || NumUntiedUseRegs >= 3)
Strategies.push_back(RegRandomizationStrategy::PickRandomRegs);
if (HasTiedOperands || NumUntiedUseRegs >= 2)
Strategies.push_back(RegRandomizationStrategy::SingleStaticRegPerOperand);
Strategies.push_back(RegRandomizationStrategy::SingleStaticReg);
for (RegRandomizationStrategy S : Strategies) {
CodeTemplate CurrCT = CT.clone();
CurrCT.Info =
Twine("instruction has ")
.concat(HasTiedOperands ? "" : "no ")
.concat("tied variables, avoiding "
"Read-After-Write issue, picking random def and use "
"registers not aliasing each other, for uses, ")
.concat(getDescription(S))
.str();
CurrCT.Instructions = generateSnippetForInstrAvoidingDefUseOverlap(
State, Variant, S, ForbiddenRegisters);
if (CurrCT.Instructions.empty())
return make_error<StringError>(
Twine("Failed to produce any snippet via: ").concat(CurrCT.Info),
inconvertibleErrorCode());
instantiateMemoryOperands(CurrCT.ScratchSpacePointerInReg,
CurrCT.Instructions);
Result.push_back(std::move(CurrCT));
}
// No tied variables, we pick random values for defs.
// We don't want to accidentally serialize the instruction,
// so we must be sure that we don't pick a def that is an implicit use,
// or a use that is an implicit def, so record implicit regs now.
BitVector ImplicitUses(State.getRegInfo().getNumRegs());
BitVector ImplicitDefs(State.getRegInfo().getNumRegs());
for (const auto &Op : Instr.Operands) {
if (Op.isReg() && Op.isImplicit() && !Op.isMemory()) {
assert(Op.isImplicitReg() && "Not an implicit register operand?");
if (Op.isUse())
ImplicitUses.set(Op.getImplicitReg());
else {
assert(Op.isDef() && "Not a use and not a def?");
ImplicitDefs.set(Op.getImplicitReg());
}
}
}
const auto ImplicitUseAliases =
getAliasedBits(State.getRegInfo(), ImplicitUses);
const auto ImplicitDefAliases =
getAliasedBits(State.getRegInfo(), ImplicitDefs);
BitVector Defs(State.getRegInfo().getNumRegs());
for (const auto &Op : Instr.Operands) {
if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
// Do not use forbidden registers and regs that are implicitly used.
// Note that we don't try to avoid using implicit defs explicitly.
remove(PossibleRegisters, ForbiddenRegisters);
remove(PossibleRegisters, ImplicitUseAliases);
if (!PossibleRegisters.any())
return make_error<StringError>(
Twine("no available registers:\ncandidates:\n")
.concat(debugString(State.getRegInfo(),
Op.getRegisterAliasing().sourceBits()))
.concat("\nforbidden:\n")
.concat(debugString(State.getRegInfo(), ForbiddenRegisters))
.concat("\nimplicit use:\n")
.concat(debugString(State.getRegInfo(), ImplicitUseAliases)),
inconvertibleErrorCode());
const auto RandomReg = randomBit(PossibleRegisters);
Defs.set(RandomReg);
Variant.getValueFor(Op) = MCOperand::createReg(RandomReg);
}
}
// And pick random use values that are not reserved and don't alias with defs.
// Note that we don't try to avoid using implicit uses explicitly.
const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
for (const auto &Op : Instr.Operands) {
if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
remove(PossibleRegisters, ForbiddenRegisters);
remove(PossibleRegisters, DefAliases);
remove(PossibleRegisters, ImplicitDefAliases);
assert(PossibleRegisters.any() && "No register left to choose from");
const auto RandomReg = randomBit(PossibleRegisters);
Variant.getValueFor(Op) = MCOperand::createReg(RandomReg);
}
}
CT.Info =
"instruction has no tied variables picking Uses different from defs";
CT.Instructions.push_back(std::move(Variant));
instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
return getSingleton(std::move(CT));
return Result;
}
constexpr const size_t ParallelSnippetGenerator::kMinNumDifferentAddresses;

View File

@@ -213,6 +213,15 @@ size_t randomBit(const BitVector &Vector) {
return *Itr;
}
std::optional<int> getFirstCommonBit(const BitVector &A, const BitVector &B) {
BitVector Intersect = A;
Intersect &= B;
int idx = Intersect.find_first();
if (idx != -1)
return idx;
return {};
}
void setRandomAliasing(const AliasingConfigurations &AliasingConfigurations,
InstructionTemplate &DefIB, InstructionTemplate &UseIB) {
assert(!AliasingConfigurations.empty());

View File

@@ -92,6 +92,9 @@ size_t randomIndex(size_t Max);
// Precondition: Vector must have at least one bit set.
size_t randomBit(const BitVector &Vector);
// Picks a first bit that is common to these two vectors.
std::optional<int> getFirstCommonBit(const BitVector &A, const BitVector &B);
// Picks a random configuration, then selects a random def and a random use from
// it and finally set the selected values in the provided InstructionInstances.
void setRandomAliasing(const AliasingConfigurations &AliasingConfigurations,

View File

@@ -116,17 +116,17 @@ TEST_F(PPCParallelSnippetGeneratorTest, MemoryUse) {
const unsigned Opcode = PPC::LDX;
const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
ASSERT_THAT(CodeTemplates, SizeIs(1));
const auto &CT = CodeTemplates[0];
EXPECT_THAT(CT.Info, HasSubstr("instruction has no tied variables picking "
"Uses different from defs"));
EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
ASSERT_THAT(CT.Instructions,
SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses));
const InstructionTemplate &IT = CT.Instructions[0];
EXPECT_THAT(IT.getOpcode(), Opcode);
ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
EXPECT_EQ(IT.getVariableValues()[1].getReg(), PPC::X1);
EXPECT_EQ(IT.getVariableValues()[2].getReg(), PPC::X13);
for (const auto &CT : CodeTemplates) {
EXPECT_THAT(CT.Info, HasSubstr("instruction has no tied variables"));
EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
ASSERT_THAT(CT.Instructions,
SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses));
const InstructionTemplate &IT = CT.Instructions[0];
EXPECT_THAT(IT.getOpcode(), Opcode);
ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
EXPECT_EQ(IT.getVariableValues()[1].getReg(), PPC::X1);
EXPECT_EQ(IT.getVariableValues()[2].getReg(), PPC::X13);
}
}
} // namespace

View File

@@ -14,6 +14,7 @@
#include "SerialSnippetGenerator.h"
#include "TestBase.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/SetOperations.h"
#include <unordered_set>
@@ -26,8 +27,10 @@ namespace {
using testing::AnyOf;
using testing::ElementsAre;
using testing::Ge;
using testing::Gt;
using testing::HasSubstr;
using testing::IsEmpty;
using testing::Not;
using testing::SizeIs;
using testing::UnorderedElementsAre;
@@ -232,7 +235,7 @@ TEST_F(X86ParallelSnippetGeneratorTest, SerialInstruction) {
ASSERT_THAT(IT.getVariableValues(), SizeIs(0));
}
TEST_F(X86ParallelSnippetGeneratorTest, StaticRenaming) {
TEST_F(X86ParallelSnippetGeneratorTest, ReadAfterWrite_CMOV32rr) {
// CMOV32rr has tied variables, we enumerate the possible values to execute
// as many in parallel as possible.
@@ -248,19 +251,69 @@ TEST_F(X86ParallelSnippetGeneratorTest, StaticRenaming) {
// - hasAliasingRegisters
const unsigned Opcode = X86::CMOV32rr;
const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
ASSERT_THAT(CodeTemplates, SizeIs(1));
const auto &CT = CodeTemplates[0];
EXPECT_THAT(CT.Info, HasSubstr("static renaming"));
EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
constexpr const unsigned kInstructionCount = 15;
ASSERT_THAT(CT.Instructions, SizeIs(kInstructionCount));
std::unordered_set<unsigned> AllDefRegisters;
for (const auto &IT : CT.Instructions) {
ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
AllDefRegisters.insert(IT.getVariableValues()[0].getReg());
ASSERT_THAT(CodeTemplates, SizeIs(3));
for (const auto &CT : CodeTemplates) {
EXPECT_THAT(CT.Info, HasSubstr("avoiding Read-After-Write issue"));
EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
ASSERT_GT(CT.Instructions.size(), 1U);
std::unordered_set<unsigned> AllDefRegisters;
std::unordered_set<unsigned> AllUseRegisters;
for (const auto &IT : CT.Instructions) {
ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
AllDefRegisters.insert(IT.getVariableValues()[0].getReg());
AllUseRegisters.insert(IT.getVariableValues()[1].getReg());
}
EXPECT_THAT(AllDefRegisters, SizeIs(CT.Instructions.size()))
<< "Each instruction writes to a different register";
EXPECT_THAT(AllUseRegisters, Not(IsEmpty()))
<< "In total, some other registers are used";
auto AllDefAndUseRegs = AllUseRegisters;
llvm::set_intersect(AllDefAndUseRegs, AllDefRegisters); // A := A ^ B
EXPECT_THAT(AllDefAndUseRegs, IsEmpty())
<< "No instruction uses any register defined by any of the "
"instructions";
}
}
TEST_F(X86ParallelSnippetGeneratorTest, ReadAfterWrite_VFMADD132PDr) {
// VFMADD132PDr has tied variables, we enumerate the possible values
// to execute as many in parallel as possible.
// - VFMADD132PDr
// - Op0 Explicit Def RegClass(XMM)
// - Op1 Explicit Use RegClass(XMM) TiedToOp0
// - Op2 Explicit Use RegClass(XMM)
// - Op3 Explicit Use RegClass(XMM)
// - Var0 [Op0,Op1]
// - Var1 [Op2]
// - Var2 [Op3]
// - hasTiedRegisters (execution is always serial)
// - hasAliasingRegisters
const unsigned Opcode = X86::VFMADD132PDr;
const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
ASSERT_THAT(CodeTemplates, SizeIs(3));
for (const auto &CT : CodeTemplates) {
EXPECT_THAT(CT.Info, HasSubstr("avoiding Read-After-Write issue"));
EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
ASSERT_GT(CT.Instructions.size(), 1U);
std::unordered_set<unsigned> AllDefRegisters;
std::unordered_set<unsigned> AllUseRegisters;
for (const auto &IT : CT.Instructions) {
ASSERT_THAT(IT.getVariableValues(), SizeIs(3));
AllDefRegisters.insert(IT.getVariableValues()[0].getReg());
AllUseRegisters.insert(IT.getVariableValues()[1].getReg());
AllUseRegisters.insert(IT.getVariableValues()[2].getReg());
}
EXPECT_THAT(AllDefRegisters, SizeIs(CT.Instructions.size()))
<< "Each instruction writes to a different register";
EXPECT_THAT(AllUseRegisters, Not(IsEmpty()))
<< "In total, some other registers are used";
auto AllDefAndUseRegs = AllUseRegisters;
llvm::set_intersect(AllDefAndUseRegs, AllDefRegisters); // A := A ^ B
EXPECT_THAT(AllDefAndUseRegs, IsEmpty())
<< "No instruction uses any register defined by any of the "
"instructions";
}
EXPECT_THAT(AllDefRegisters, SizeIs(kInstructionCount))
<< "Each instruction writes to a different register";
}
TEST_F(X86ParallelSnippetGeneratorTest, NoTiedVariables) {
@@ -280,21 +333,22 @@ TEST_F(X86ParallelSnippetGeneratorTest, NoTiedVariables) {
// - hasAliasingRegisters
const unsigned Opcode = X86::CMOV_GR32;
const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
ASSERT_THAT(CodeTemplates, SizeIs(1));
const auto &CT = CodeTemplates[0];
EXPECT_THAT(CT.Info, HasSubstr("no tied variables"));
EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
ASSERT_THAT(CT.Instructions, SizeIs(1));
const InstructionTemplate &IT = CT.Instructions[0];
EXPECT_THAT(IT.getOpcode(), Opcode);
ASSERT_THAT(IT.getVariableValues(), SizeIs(4));
EXPECT_THAT(IT.getVariableValues()[0].getReg(),
Not(IT.getVariableValues()[1].getReg()))
<< "Def is different from first Use";
EXPECT_THAT(IT.getVariableValues()[0].getReg(),
Not(IT.getVariableValues()[2].getReg()))
<< "Def is different from second Use";
EXPECT_THAT(IT.getVariableValues()[3], IsInvalid());
ASSERT_THAT(CodeTemplates, SizeIs(2));
for (const auto &CT : CodeTemplates) {
EXPECT_THAT(CT.Info, HasSubstr("no tied variables"));
EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
ASSERT_THAT(CT.Instructions, SizeIs(1));
const InstructionTemplate &IT = CT.Instructions[0];
EXPECT_THAT(IT.getOpcode(), Opcode);
ASSERT_THAT(IT.getVariableValues(), SizeIs(4));
EXPECT_THAT(IT.getVariableValues()[0].getReg(),
Not(IT.getVariableValues()[1].getReg()))
<< "Def is different from first Use";
EXPECT_THAT(IT.getVariableValues()[0].getReg(),
Not(IT.getVariableValues()[2].getReg()))
<< "Def is different from second Use";
EXPECT_THAT(IT.getVariableValues()[3], IsInvalid());
}
}
TEST_F(X86ParallelSnippetGeneratorTest, MemoryUse) {
@@ -317,18 +371,19 @@ TEST_F(X86ParallelSnippetGeneratorTest, MemoryUse) {
const unsigned Opcode = X86::MOV32rm;
const auto CodeTemplates = checkAndGetCodeTemplates(Opcode);
ASSERT_THAT(CodeTemplates, SizeIs(1));
const auto &CT = CodeTemplates[0];
EXPECT_THAT(CT.Info, HasSubstr("no tied variables"));
EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
ASSERT_THAT(CT.Instructions,
SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses));
const InstructionTemplate &IT = CT.Instructions[0];
EXPECT_THAT(IT.getOpcode(), Opcode);
ASSERT_THAT(IT.getVariableValues(), SizeIs(6));
EXPECT_EQ(IT.getVariableValues()[2].getImm(), 1);
EXPECT_EQ(IT.getVariableValues()[3].getReg(), 0u);
EXPECT_EQ(IT.getVariableValues()[4].getImm(), 0);
EXPECT_EQ(IT.getVariableValues()[5].getReg(), 0u);
for (const auto &CT : CodeTemplates) {
EXPECT_THAT(CT.Info, HasSubstr("no tied variables"));
EXPECT_THAT(CT.Execution, ExecutionMode::UNKNOWN);
ASSERT_THAT(CT.Instructions,
SizeIs(ParallelSnippetGenerator::kMinNumDifferentAddresses));
const InstructionTemplate &IT = CT.Instructions[0];
EXPECT_THAT(IT.getOpcode(), Opcode);
ASSERT_THAT(IT.getVariableValues(), SizeIs(6));
EXPECT_EQ(IT.getVariableValues()[2].getImm(), 1);
EXPECT_EQ(IT.getVariableValues()[3].getReg(), 0u);
EXPECT_EQ(IT.getVariableValues()[4].getImm(), 0);
EXPECT_EQ(IT.getVariableValues()[5].getReg(), 0u);
}
}
TEST_F(X86ParallelSnippetGeneratorTest, MOV16ms) {
@@ -362,7 +417,7 @@ TEST_F(X86ParallelSnippetGeneratorTest,
auto Err = Generator.generateCodeTemplates(&Instr, AllRegisters);
EXPECT_FALSE((bool)Err);
EXPECT_THAT(toString(Err.takeError()),
testing::HasSubstr("no available registers"));
testing::HasSubstr("Failed to produce any snippet"));
}
class X86FakeSnippetGenerator : public SnippetGenerator {