fix: do not mask the LSB of the upper part of the timestamp

Related-To: NEO-12637
Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
This commit is contained in:
Maciej Plewka
2024-12-27 11:16:46 +00:00
committed by Compute-Runtime-Automation
parent b5f3b0eba9
commit c703e0bf6e
12 changed files with 112 additions and 79 deletions

View File

@@ -307,7 +307,7 @@ struct CommandListCoreFamily : public CommandListImp {
ze_result_t programSyncBuffer(Kernel &kernel, NEO::Device &device, const ze_group_count_t &threadGroupDimensions, size_t &patchIndex);
void programRegionGroupBarrier(Kernel &kernel, const ze_group_count_t &threadGroupDimensions, size_t localRegionSize, size_t &patchIndex);
void appendWriteKernelTimestamp(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool maskLsb, bool workloadPartition, bool copyOperation);
void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds, bool maskLsb, uint32_t mask, bool workloadPartition, bool copyOperation);
void adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds, bool workloadPartition, bool copyOperation);
void appendEventForProfiling(Event *event, CommandToPatchContainer *outTimeStampSyncCmds, bool beforeWalker, bool skipBarrierForEndProfiling, bool skipAddingEventToResidency, bool copyOperation);
void appendEventForProfilingCopyCommand(Event *event, bool beforeWalker);
void appendSignalEventPostWalker(Event *event, void **syncCmdBuffer, CommandToPatchContainer *outTimeStampSyncCmds, bool skipBarrierForEndProfiling, bool skipAddingEventToResidency, bool copyOperation);

View File

@@ -2932,7 +2932,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendWriteKernelTimestamp(Event *eve
outTimeStampSyncCmds->push_back(ctxCmd);
}
adjustWriteKernelTimestamp(globalAddress, contextAddress, baseAddr, outTimeStampSyncCmds, maskLsb, mask, workloadPartition, copyOperation);
adjustWriteKernelTimestamp(globalAddress, contextAddress, baseAddr, outTimeStampSyncCmds, workloadPartition, copyOperation);
}
template <GFXCORE_FAMILY gfxCoreFamily>

View File

@@ -16,7 +16,7 @@ inline NEO::PreemptionMode CommandListCoreFamily<gfxCoreFamily>::obtainKernelPre
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds, bool maskLsb,
uint32_t mask, bool workloadPartition, bool copyOperation) {}
void CommandListCoreFamily<gfxCoreFamily>::adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds,
bool workloadPartition, bool copyOperation) {}
} // namespace L0

View File

@@ -38,7 +38,7 @@ size_t CommandListCoreFamily<gfxCoreFamily>::getReserveSshSize() {
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds, bool maskLsb, uint32_t mask,
void CommandListCoreFamily<gfxCoreFamily>::adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds,
bool workloadPartition, bool copyOperation) {}
template <GFXCORE_FAMILY gfxCoreFamily>

View File

@@ -22,7 +22,7 @@ inline NEO::PreemptionMode CommandListCoreFamily<gfxCoreFamily>::obtainKernelPre
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds, bool maskLsb, uint32_t mask,
void CommandListCoreFamily<gfxCoreFamily>::adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, uint64_t baseAddress, CommandToPatchContainer *outTimeStampSyncCmds,
bool workloadPartition, bool copyOperation) {
uint64_t globalHighAddress = globalAddress + sizeof(uint32_t);
uint64_t contextHighAddress = contextAddress + sizeof(uint32_t);
@@ -38,13 +38,8 @@ void CommandListCoreFamily<gfxCoreFamily>::adjustWriteKernelTimestamp(uint64_t g
contextPostSyncCmdBuffer = &contextPostSyncCmd;
}
if (maskLsb) {
NEO::EncodeMathMMIO<GfxFamily>::encodeBitwiseAndVal(commandContainer, RegisterOffsets::globalTimestampUn, mask, globalHighAddress, workloadPartition, globalPostSyncCmdBuffer, copyOperation);
NEO::EncodeMathMMIO<GfxFamily>::encodeBitwiseAndVal(commandContainer, RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, mask, contextHighAddress, workloadPartition, contextPostSyncCmdBuffer, copyOperation);
} else {
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), RegisterOffsets::globalTimestampUn, globalHighAddress, workloadPartition, globalPostSyncCmdBuffer, copyOperation);
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, contextHighAddress, workloadPartition, contextPostSyncCmdBuffer, copyOperation);
}
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), RegisterOffsets::globalTimestampUn, globalHighAddress, workloadPartition, globalPostSyncCmdBuffer, copyOperation);
NEO::EncodeStoreMMIO<GfxFamily>::encode(*commandContainer.getCommandStream(), RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, contextHighAddress, workloadPartition, contextPostSyncCmdBuffer, copyOperation);
if (outTimeStampSyncCmds != nullptr) {
CommandToPatch ctxCmd;

View File

@@ -70,7 +70,8 @@ void validateTimestampRegisters(GenCmdList &cmdList,
uint64_t firstStoreRegMemAddress,
uint32_t secondLoadRegisterRegSrcAddress,
uint64_t secondStoreRegMemAddress,
bool workloadPartition);
bool workloadPartition,
bool useMask);
struct ModuleMutableCommandListFixture : public ModuleImmutableDataFixture {
void setUp() {

View File

@@ -32,7 +32,8 @@ void validateTimestampRegisters(GenCmdList &cmdList,
uint64_t firstStoreRegMemAddress,
uint32_t secondLoadRegisterRegSrcAddress,
uint64_t secondStoreRegMemAddress,
bool workloadPartition) {
bool workloadPartition,
bool useMask) {
using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
using MI_MATH = typename FamilyType::MI_MATH;
@@ -40,35 +41,46 @@ void validateTimestampRegisters(GenCmdList &cmdList,
constexpr uint32_t mask = 0xfffffffe;
auto itor = find<MI_LOAD_REGISTER_REG *>(startIt, cmdList.end());
auto itor = useMask ? find<MI_LOAD_REGISTER_REG *>(startIt, cmdList.end()) : find<MI_STORE_REGISTER_MEM *>(startIt, cmdList.end());
if (useMask) {
{
ASSERT_NE(cmdList.end(), itor);
auto cmdLoadReg = genCmdCast<MI_LOAD_REGISTER_REG *>(*itor);
EXPECT_EQ(firstLoadRegisterRegSrcAddress, cmdLoadReg->getSourceRegisterAddress());
EXPECT_EQ(RegisterOffsets::csGprR13, cmdLoadReg->getDestinationRegisterAddress());
}
{
ASSERT_NE(cmdList.end(), itor);
auto cmdLoadReg = genCmdCast<MI_LOAD_REGISTER_REG *>(*itor);
EXPECT_EQ(firstLoadRegisterRegSrcAddress, cmdLoadReg->getSourceRegisterAddress());
EXPECT_EQ(RegisterOffsets::csGprR13, cmdLoadReg->getDestinationRegisterAddress());
}
itor++;
{
ASSERT_NE(cmdList.end(), itor);
auto cmdLoadImm = genCmdCast<MI_LOAD_REGISTER_IMM *>(*itor);
EXPECT_EQ(RegisterOffsets::csGprR14, cmdLoadImm->getRegisterOffset());
EXPECT_EQ(mask, cmdLoadImm->getDataDword());
}
itor++;
{
ASSERT_NE(cmdList.end(), itor);
auto cmdLoadImm = genCmdCast<MI_LOAD_REGISTER_IMM *>(*itor);
EXPECT_EQ(RegisterOffsets::csGprR14, cmdLoadImm->getRegisterOffset());
EXPECT_EQ(mask, cmdLoadImm->getDataDword());
}
itor++;
{
ASSERT_NE(cmdList.end(), itor);
auto cmdMath = genCmdCast<MI_MATH *>(*itor);
EXPECT_EQ(3u, cmdMath->DW0.BitField.DwordLength);
}
itor++;
{
ASSERT_NE(cmdList.end(), itor);
auto cmdMath = genCmdCast<MI_MATH *>(*itor);
EXPECT_EQ(3u, cmdMath->DW0.BitField.DwordLength);
}
itor++;
{
itor++;
{
ASSERT_NE(cmdList.end(), itor);
auto cmdMem = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
EXPECT_EQ(RegisterOffsets::csGprR12, cmdMem->getRegisterAddress());
EXPECT_EQ(firstStoreRegMemAddress, cmdMem->getMemoryAddress());
if (workloadPartition) {
EXPECT_TRUE(UnitTestHelper<FamilyType>::getWorkloadPartitionForStoreRegisterMemCmd(*cmdMem));
} else {
EXPECT_FALSE(UnitTestHelper<FamilyType>::getWorkloadPartitionForStoreRegisterMemCmd(*cmdMem));
}
}
} else {
ASSERT_NE(cmdList.end(), itor);
auto cmdMem = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
EXPECT_EQ(RegisterOffsets::csGprR12, cmdMem->getRegisterAddress());
EXPECT_EQ(RegisterOffsets::globalTimestampUn, cmdMem->getRegisterAddress());
EXPECT_EQ(firstStoreRegMemAddress, cmdMem->getMemoryAddress());
if (workloadPartition) {
EXPECT_TRUE(UnitTestHelper<FamilyType>::getWorkloadPartitionForStoreRegisterMemCmd(*cmdMem));
@@ -78,33 +90,45 @@ void validateTimestampRegisters(GenCmdList &cmdList,
}
itor++;
{
ASSERT_NE(cmdList.end(), itor);
auto cmdLoadReg = genCmdCast<MI_LOAD_REGISTER_REG *>(*itor);
EXPECT_EQ(secondLoadRegisterRegSrcAddress, cmdLoadReg->getSourceRegisterAddress());
EXPECT_EQ(RegisterOffsets::csGprR13, cmdLoadReg->getDestinationRegisterAddress());
}
if (useMask) {
{
ASSERT_NE(cmdList.end(), itor);
auto cmdLoadReg = genCmdCast<MI_LOAD_REGISTER_REG *>(*itor);
EXPECT_EQ(secondLoadRegisterRegSrcAddress, cmdLoadReg->getSourceRegisterAddress());
EXPECT_EQ(RegisterOffsets::csGprR13, cmdLoadReg->getDestinationRegisterAddress());
}
itor++;
{
ASSERT_NE(cmdList.end(), itor);
auto cmdLoadImm = genCmdCast<MI_LOAD_REGISTER_IMM *>(*itor);
EXPECT_EQ(RegisterOffsets::csGprR14, cmdLoadImm->getRegisterOffset());
EXPECT_EQ(mask, cmdLoadImm->getDataDword());
}
itor++;
{
ASSERT_NE(cmdList.end(), itor);
auto cmdLoadImm = genCmdCast<MI_LOAD_REGISTER_IMM *>(*itor);
EXPECT_EQ(RegisterOffsets::csGprR14, cmdLoadImm->getRegisterOffset());
EXPECT_EQ(mask, cmdLoadImm->getDataDword());
}
itor++;
{
ASSERT_NE(cmdList.end(), itor);
auto cmdMath = genCmdCast<MI_MATH *>(*itor);
EXPECT_EQ(3u, cmdMath->DW0.BitField.DwordLength);
}
itor++;
{
ASSERT_NE(cmdList.end(), itor);
auto cmdMath = genCmdCast<MI_MATH *>(*itor);
EXPECT_EQ(3u, cmdMath->DW0.BitField.DwordLength);
}
itor++;
{
itor++;
{
ASSERT_NE(cmdList.end(), itor);
auto cmdMem = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
EXPECT_EQ(RegisterOffsets::csGprR12, cmdMem->getRegisterAddress());
EXPECT_EQ(secondStoreRegMemAddress, cmdMem->getMemoryAddress());
if (workloadPartition) {
EXPECT_TRUE(UnitTestHelper<FamilyType>::getWorkloadPartitionForStoreRegisterMemCmd(*cmdMem));
} else {
EXPECT_FALSE(UnitTestHelper<FamilyType>::getWorkloadPartitionForStoreRegisterMemCmd(*cmdMem));
}
}
} else {
ASSERT_NE(cmdList.end(), itor);
auto cmdMem = genCmdCast<MI_STORE_REGISTER_MEM *>(*itor);
EXPECT_EQ(RegisterOffsets::csGprR12, cmdMem->getRegisterAddress());
EXPECT_EQ(RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, cmdMem->getRegisterAddress());
EXPECT_EQ(secondStoreRegMemAddress, cmdMem->getMemoryAddress());
if (workloadPartition) {
EXPECT_TRUE(UnitTestHelper<FamilyType>::getWorkloadPartitionForStoreRegisterMemCmd(*cmdMem));
@@ -112,6 +136,7 @@ void validateTimestampRegisters(GenCmdList &cmdList,
EXPECT_FALSE(UnitTestHelper<FamilyType>::getWorkloadPartitionForStoreRegisterMemCmd(*cmdMem));
}
}
itor++;
startIt = itor;
}

View File

@@ -347,7 +347,7 @@ struct MultiTileCommandListAppendBarrierFixture : public MultiTileCommandListFix
size_t timestampRegisters = 2 * (sizeof(MI_LOAD_REGISTER_REG) + sizeof(MI_LOAD_REGISTER_IMM) +
NEO::EncodeMath<FamilyType>::streamCommandSize + sizeof(MI_STORE_REGISTER_MEM));
if (NEO::UnitTestHelper<FamilyType>::timestampRegisterHighAddress()) {
timestampRegisters *= 2;
timestampRegisters += 2 * sizeof(MI_STORE_REGISTER_MEM);
}
size_t postBarrierSynchronization = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForSingleBarrier(false) +
@@ -385,6 +385,7 @@ struct MultiTileCommandListAppendBarrierFixture : public MultiTileCommandListFix
begin,
RegisterOffsets::globalTimestampLdw, globalStartAddress,
RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextStartAddress,
true,
true);
auto barrierOffset = timestampRegisters;
@@ -419,6 +420,7 @@ struct MultiTileCommandListAppendBarrierFixture : public MultiTileCommandListFix
begin,
RegisterOffsets::globalTimestampLdw, globalEndAddress,
RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextEndAddress,
true,
true);
}
};

View File

@@ -813,13 +813,15 @@ HWTEST2_F(AppendMemoryCopyTests,
begin,
RegisterOffsets::globalTimestampLdw, globalStartAddress,
RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextStartAddress,
false);
false,
true);
validateTimestampRegisters<FamilyType>(cmdList,
secondWalker,
RegisterOffsets::globalTimestampLdw, globalEndAddress,
RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextEndAddress,
false);
false,
true);
}
HWTEST2_F(AppendMemoryCopyTests,
@@ -871,13 +873,15 @@ HWTEST2_F(AppendMemoryCopyTests,
begin,
RegisterOffsets::globalTimestampLdw, globalStartAddress,
RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextStartAddress,
false);
false,
true);
validateTimestampRegisters<FamilyType>(cmdList,
thirdWalker,
RegisterOffsets::globalTimestampLdw, globalEndAddress,
RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextEndAddress,
false);
false,
true);
}
HWTEST2_F(AppendMemoryCopyTests, givenCopyCommandListImmediateWithDummyBlitWaWhenCopyMemoryRegionThenDummyBlitIsNotProgrammedButIsRequiredForNextFlushProgramming, IsAtLeastXeHpCore) {

View File

@@ -829,6 +829,7 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
startCmdList,
RegisterOffsets::globalTimestampLdw, globalStartAddress,
RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextStartAddress,
true,
true);
if (UnitTestHelper<FamilyType>::timestampRegisterHighAddress()) {
@@ -837,14 +838,16 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
validateTimestampRegisters<FamilyType>(cmdList,
startCmdList,
RegisterOffsets::globalTimestampUn, globalStartAddressHigh,
0x23AC, contextStartAddressHigh,
true);
RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, contextStartAddressHigh,
true,
false);
}
validateTimestampRegisters<FamilyType>(cmdList,
startCmdList,
RegisterOffsets::globalTimestampLdw, globalEndAddress,
RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextEndAddress,
true,
true);
if (UnitTestHelper<FamilyType>::timestampRegisterHighAddress()) {
@@ -853,8 +856,9 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
validateTimestampRegisters<FamilyType>(cmdList,
startCmdList,
RegisterOffsets::globalTimestampUn, globalEndAddressHigh,
0x23AC, contextEndAddressHigh,
true);
RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, contextEndAddressHigh,
true,
false);
}
}

View File

@@ -295,13 +295,15 @@ HWTEST2_F(AppendFillTest,
begin,
RegisterOffsets::globalTimestampLdw, globalStartAddress,
RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextStartAddress,
false);
false,
true);
validateTimestampRegisters<FamilyType>(cmdList,
secondWalker,
RegisterOffsets::globalTimestampLdw, globalEndAddress,
RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextEndAddress,
false);
false,
true);
}
HWTEST2_F(AppendFillTest,
@@ -349,13 +351,15 @@ HWTEST2_F(AppendFillTest,
begin,
RegisterOffsets::globalTimestampLdw, globalStartAddress,
RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextStartAddress,
false);
false,
true);
validateTimestampRegisters<FamilyType>(cmdList,
secondWalker,
RegisterOffsets::globalTimestampLdw, globalEndAddress,
RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextEndAddress,
false);
false,
true);
}
} // namespace ult

View File

@@ -98,17 +98,15 @@ struct CommandListXe2AndLaterFixture : public DeviceFixture {
uint64_t contextAddress = ptrOffset(baseAddr, contextOffset);
if (useMask) {
ASSERT_EQ(8u, srmCommands.size());
ASSERT_EQ(6u, srmCommands.size());
validateSrmCommand<FamilyType>(reinterpret_cast<MI_STORE_REGISTER_MEM *>(*srmCommands[0]), globalAddress, RegisterOffsets::csGprR12);
validateSrmCommand<FamilyType>(reinterpret_cast<MI_STORE_REGISTER_MEM *>(*srmCommands[1]), contextAddress, RegisterOffsets::csGprR12);
validateSrmCommand<FamilyType>(reinterpret_cast<MI_STORE_REGISTER_MEM *>(*srmCommands[2]), globalAddress + sizeof(uint32_t), RegisterOffsets::csGprR12);
validateSrmCommand<FamilyType>(reinterpret_cast<MI_STORE_REGISTER_MEM *>(*srmCommands[3]), contextAddress + sizeof(uint32_t), RegisterOffsets::csGprR12);
validateSrmCommand<FamilyType>(reinterpret_cast<MI_STORE_REGISTER_MEM *>(*srmCommands[2]), globalAddress + sizeof(uint32_t), RegisterOffsets::globalTimestampUn);
validateSrmCommand<FamilyType>(reinterpret_cast<MI_STORE_REGISTER_MEM *>(*srmCommands[3]), contextAddress + sizeof(uint32_t), RegisterOffsets::gpThreadTimeRegAddressOffsetHigh);
validateLrrCommand<FamilyType>(reinterpret_cast<MI_LOAD_REGISTER_REG *>(*srmCommands[4]), RegisterOffsets::globalTimestampLdw);
validateLrrCommand<FamilyType>(reinterpret_cast<MI_LOAD_REGISTER_REG *>(*srmCommands[5]), RegisterOffsets::gpThreadTimeRegAddressOffsetLow);
validateLrrCommand<FamilyType>(reinterpret_cast<MI_LOAD_REGISTER_REG *>(*srmCommands[6]), RegisterOffsets::globalTimestampUn);
validateLrrCommand<FamilyType>(reinterpret_cast<MI_LOAD_REGISTER_REG *>(*srmCommands[7]), RegisterOffsets::gpThreadTimeRegAddressOffsetHigh);
} else {
ASSERT_EQ(4u, srmCommands.size());