From e09ac446c48d89ac7a2fa3e1f541262f47243b47 Mon Sep 17 00:00:00 2001 From: Young Jin Yoon Date: Fri, 4 Dec 2020 08:15:15 -0800 Subject: [PATCH] Mask bit 0 of timestamp for event profiling Related-to: LOCI-1161 Signed-off-by: Young Jin Yoon --- level_zero/core/source/cmdlist/cmdlist_hw.h | 2 +- level_zero/core/source/cmdlist/cmdlist_hw.inl | 19 +++++--- .../sources/cmdlist/test_cmdlist_2.cpp | 26 +++++----- .../test_cmdlist_append_launch_kernel.cpp | 26 +++++----- .../command_container/command_encoder.h | 9 ++++ .../command_container/command_encoder.inl | 28 +++++++++++ .../unit_test/encoders/test_encode_math.cpp | 48 +++++++++++++++++++ 7 files changed, 126 insertions(+), 32 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 5f91939290..39f8026e49 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -211,7 +211,7 @@ struct CommandListCoreFamily : CommandListImp { const void **pRanges); ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]); - void appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker); + void appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker, bool maskLsb); void appendEventForProfiling(ze_event_handle_t hEvent, bool beforeWalker); void appendEventForProfilingCopyCommand(ze_event_handle_t hEvent, bool beforeWalker); void appendSignalEventPostWalker(ze_event_handle_t hEvent); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 8fdb0f2e73..59ca347f65 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -1295,7 +1295,7 @@ void CommandListCoreFamily::appendEventForProfilingCopyCommand(ze if (!beforeWalker) { NEO::EncodeMiFlushDW::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, false, false); } - appendWriteKernelTimestamp(hEvent, beforeWalker); + appendWriteKernelTimestamp(hEvent, beforeWalker, false); } template @@ -1441,15 +1441,20 @@ ze_result_t CommandListCoreFamily::appendWaitOnEvents(uint32_t nu } template -void CommandListCoreFamily::appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker) { +void CommandListCoreFamily::appendWriteKernelTimestamp(ze_event_handle_t hEvent, bool beforeWalker, bool maskLsb) { + constexpr uint32_t mask = 0xfffffffe; auto event = Event::fromHandle(hEvent); auto baseAddr = event->getGpuAddress(); auto contextOffset = beforeWalker ? offsetof(TimestampPacketStorage::Packet, contextStart) : offsetof(TimestampPacketStorage::Packet, contextEnd); auto globalOffset = beforeWalker ? offsetof(TimestampPacketStorage::Packet, globalStart) : offsetof(TimestampPacketStorage::Packet, globalEnd); - - NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), REG_GLOBAL_TIMESTAMP_LDW, ptrOffset(baseAddr, globalOffset)); - NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, ptrOffset(baseAddr, contextOffset)); + if (maskLsb) { + NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, REG_GLOBAL_TIMESTAMP_LDW, mask, ptrOffset(baseAddr, globalOffset)); + NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, mask, ptrOffset(baseAddr, contextOffset)); + } else { + NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), REG_GLOBAL_TIMESTAMP_LDW, ptrOffset(baseAddr, globalOffset)); + NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, ptrOffset(baseAddr, contextOffset)); + } } template @@ -1469,14 +1474,14 @@ void CommandListCoreFamily::appendEventForProfiling(ze_event_hand commandContainer.addToResidencyContainer(&event->getAllocation()); if (beforeWalker) { - appendWriteKernelTimestamp(hEvent, beforeWalker); + appendWriteKernelTimestamp(hEvent, beforeWalker, true); } else { NEO::PipeControlArgs args; args.dcFlushEnable = true; NEO::MemorySynchronizationCommands::addPipeControl(*commandContainer.getCommandStream(), args); - appendWriteKernelTimestamp(hEvent, beforeWalker); + appendWriteKernelTimestamp(hEvent, beforeWalker, true); args.dcFlushEnable = (!event->signalScope) ? false : true; if (args.dcFlushEnable) { diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index 5909dd5ac5..afde66fd77 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -461,7 +461,7 @@ HWTEST2_F(CommandListCreate, givenCommandListWhenAppendMemoryFillCalledThenAppen HWTEST2_F(CommandListCreate, givenCommandListWhenTimestampPassedToMemoryCopyThenAppendProfilingCalledOnceBeforeAndAfterCommand, Platforms) { using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; - using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; + using MI_LOAD_REGISTER_REG = typename GfxFamily::MI_LOAD_REGISTER_REG; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; MockAppendMemoryCopy commandList; @@ -487,26 +487,30 @@ HWTEST2_F(CommandListCreate, givenCommandListWhenTimestampPassedToMemoryCopyThen ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( cmdList, ptrOffset(commandList.commandContainer.getCommandStream()->getCpuBase(), 0), commandList.commandContainer.getCommandStream()->getUsed())); - auto itor = find(cmdList.begin(), cmdList.end()); + auto itor = find(cmdList.begin(), cmdList.end()); EXPECT_NE(cmdList.end(), itor); - auto cmd = genCmdCast(*itor); - EXPECT_EQ(cmd->getRegisterAddress(), REG_GLOBAL_TIMESTAMP_LDW); + auto cmd = genCmdCast(*itor); + EXPECT_EQ(cmd->getSourceRegisterAddress(), REG_GLOBAL_TIMESTAMP_LDW); + itor++; + itor = find(itor, cmdList.end()); EXPECT_NE(cmdList.end(), itor); - cmd = genCmdCast(*itor); - EXPECT_EQ(cmd->getRegisterAddress(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); + cmd = genCmdCast(*itor); + EXPECT_EQ(cmd->getSourceRegisterAddress(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); itor = find(itor, cmdList.end()); EXPECT_NE(cmdList.end(), itor); - itor = find(itor, cmdList.end()); + itor = find(itor, cmdList.end()); EXPECT_NE(cmdList.end(), itor); - cmd = genCmdCast(*itor); - EXPECT_EQ(cmd->getRegisterAddress(), REG_GLOBAL_TIMESTAMP_LDW); + cmd = genCmdCast(*itor); + EXPECT_EQ(cmd->getSourceRegisterAddress(), REG_GLOBAL_TIMESTAMP_LDW); + itor++; + itor = find(itor, cmdList.end()); EXPECT_NE(cmdList.end(), itor); - cmd = genCmdCast(*itor); - EXPECT_EQ(cmd->getRegisterAddress(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); + cmd = genCmdCast(*itor); + EXPECT_EQ(cmd->getSourceRegisterAddress(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); } HWTEST2_F(CommandListCreate, givenCommandListWhenMemoryCopyWithSignalEventsThenSemaphoreWaitAndPipeControlAreFound, Platforms) { diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp index a58c5ffb13..04004b30b7 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp @@ -284,7 +284,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTimestampEventsWhenAppendingKernel using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; - using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; + using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG; Mock<::L0::Kernel> kernel; ze_result_t returnValue; @@ -312,19 +312,19 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTimestampEventsWhenAppendingKernel EXPECT_TRUE(FamilyType::PARSE::parseCommandBuffer( cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), usedSpaceAfter)); - auto itor = find(cmdList.begin(), cmdList.end()); + auto itor = find(cmdList.begin(), cmdList.end()); ASSERT_NE(cmdList.end(), itor); { - auto cmd = genCmdCast(*itor); - EXPECT_EQ(REG_GLOBAL_TIMESTAMP_LDW, cmd->getRegisterAddress()); + auto cmd = genCmdCast(*itor); + EXPECT_EQ(REG_GLOBAL_TIMESTAMP_LDW, cmd->getSourceRegisterAddress()); } itor++; - itor = find(itor, cmdList.end()); + itor = find(itor, cmdList.end()); ASSERT_NE(cmdList.end(), itor); { - auto cmd = genCmdCast(*itor); - EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, cmd->getRegisterAddress()); + auto cmd = genCmdCast(*itor); + EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, cmd->getSourceRegisterAddress()); } itor++; @@ -341,19 +341,19 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTimestampEventsWhenAppendingKernel } itor++; - itor = find(itor, cmdList.end()); + itor = find(itor, cmdList.end()); ASSERT_NE(cmdList.end(), itor); { - auto cmd = genCmdCast(*itor); - EXPECT_EQ(REG_GLOBAL_TIMESTAMP_LDW, cmd->getRegisterAddress()); + auto cmd = genCmdCast(*itor); + EXPECT_EQ(REG_GLOBAL_TIMESTAMP_LDW, cmd->getSourceRegisterAddress()); } itor++; - itor = find(itor, cmdList.end()); + itor = find(itor, cmdList.end()); EXPECT_NE(cmdList.end(), itor); { - auto cmd = genCmdCast(*itor); - EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, cmd->getRegisterAddress()); + auto cmd = genCmdCast(*itor); + EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, cmd->getSourceRegisterAddress()); } { diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index f06267aae6..7ce1413536 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -114,6 +114,10 @@ struct EncodeMath { AluRegisters firstOperandRegister, AluRegisters secondOperandRegister, AluRegisters finalResultRegister); + static void bitwiseAnd(CommandContainer &container, + AluRegisters firstOperandRegister, + AluRegisters secondOperandRegister, + AluRegisters finalResultRegister); }; template @@ -128,6 +132,11 @@ struct EncodeMathMMIO { static void encodeGreaterThanPredicate(CommandContainer &container, uint64_t lhsVal, uint32_t rhsVal); + static void encodeBitwiseAndVal(CommandContainer &container, + uint32_t regOffset, + uint32_t immVal, + uint64_t dstAddress); + static void encodeAlu(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters srcA, AluRegisters srcB, AluRegisters op, AluRegisters dest, AluRegisters result); static void encodeAluSubStoreCarry(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters regA, AluRegisters regB, AluRegisters finalResultRegister); diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 3e4b2a6c1d..4df5c85538 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -133,6 +133,21 @@ void EncodeMathMMIO::encodeGreaterThanPredicate(CommandContainer &contai EncodeSetMMIO::encodeREG(container, CS_PREDICATE_RESULT, CS_GPR_R2); } +/* + * Compute bitwise AND between a register value from regOffset and immVal + * and store it into dstAddress. + */ +template +void EncodeMathMMIO::encodeBitwiseAndVal(CommandContainer &container, uint32_t regOffset, uint32_t immVal, uint64_t dstAddress) { + EncodeSetMMIO::encodeREG(container, CS_GPR_R0, regOffset); + EncodeSetMMIO::encodeIMM(container, CS_GPR_R1, immVal, true); + EncodeMath::bitwiseAnd(container, AluRegisters::R_0, + AluRegisters::R_1, + AluRegisters::R_2); + EncodeStoreMMIO::encode(*container.getCommandStream(), + CS_GPR_R2, dstAddress); +} + /* * encodeAlu() performs operations that leave a state including the result of * an operation such as the carry flag, and the accu flag with subtraction and @@ -247,6 +262,19 @@ void EncodeMath::addition(CommandContainer &container, finalResultRegister); } +template +void EncodeMath::bitwiseAnd(CommandContainer &container, + AluRegisters firstOperandRegister, + AluRegisters secondOperandRegister, + AluRegisters finalResultRegister) { + uint32_t *cmd = EncodeMath::commandReserve(container); + + EncodeMathMMIO::encodeAluAnd(reinterpret_cast(cmd), + firstOperandRegister, + secondOperandRegister, + finalResultRegister); +} + template inline void EncodeSetMMIO::encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap) { LriHelper::program(container.getCommandStream(), diff --git a/shared/test/unit_test/encoders/test_encode_math.cpp b/shared/test/unit_test/encoders/test_encode_math.cpp index 5abd067941..1d7c74d497 100644 --- a/shared/test/unit_test/encoders/test_encode_math.cpp +++ b/shared/test/unit_test/encoders/test_encode_math.cpp @@ -169,6 +169,54 @@ HWTEST_F(CommandEncoderMathTest, commandReserve) { static_cast(NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1)); } +HWTEST_F(CommandEncoderMathTest, givenOffsetAndValueWhenEncodeBitwiseAndValIsCalledThenContainerHasCorrectMathCommands) { + using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG; + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + using MI_MATH = typename FamilyType::MI_MATH; + using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; + + GenCmdList commands; + CommandContainer cmdContainer; + cmdContainer.initialize(pDevice); + constexpr uint32_t regOffset = 0x2000u; + constexpr uint32_t immVal = 0xbaau; + constexpr uint64_t dstAddress = 0xDEADCAF0u; + EncodeMathMMIO::encodeBitwiseAndVal(cmdContainer, regOffset, immVal, dstAddress); + + CmdParse::parseCommandBuffer(commands, + ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), + cmdContainer.getCommandStream()->getUsed()); + + auto itor = find(commands.begin(), commands.end()); + + // load regOffset to R0 + EXPECT_NE(commands.end(), itor); + auto cmdLoadReg = genCmdCast(*itor); + EXPECT_EQ(cmdLoadReg->getSourceRegisterAddress(), regOffset); + EXPECT_EQ(cmdLoadReg->getDestinationRegisterAddress(), CS_GPR_R0); + + // load immVal to R1 + itor++; + EXPECT_NE(commands.end(), itor); + auto cmdLoadImm = genCmdCast(*itor); + EXPECT_EQ(cmdLoadImm->getRegisterOffset(), CS_GPR_R1); + EXPECT_EQ(cmdLoadImm->getDataDword(), immVal); + + // encodeAluAnd should have its own unit tests, so we only check + // that the MI_MATH exists and length is set to 3u + itor++; + EXPECT_NE(commands.end(), itor); + auto cmdMath = genCmdCast(*itor); + EXPECT_EQ(cmdMath->DW0.BitField.DwordLength, 3u); + + // store R2 to address + itor++; + EXPECT_NE(commands.end(), itor); + auto cmdMem = genCmdCast(*itor); + EXPECT_EQ(cmdMem->getRegisterAddress(), CS_GPR_R2); + EXPECT_EQ(cmdMem->getMemoryAddress(), dstAddress); +} + HWTEST_F(CommandEncoderMathTest, setGroupSizeIndirect) { using MI_MATH = typename FamilyType::MI_MATH; using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE;