mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-24 12:23:05 +08:00
feature: synchronize patch preamble with wait commands before patching
Related-To: NEO-16140 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
8ab463f47c
commit
0b65b86ccb
@@ -521,12 +521,15 @@ struct CommandList : _ze_command_list_handle_t {
|
|||||||
return activeScratchPatchElements;
|
return activeScratchPatchElements;
|
||||||
}
|
}
|
||||||
bool isDualStreamCopyOffloadOperation(bool offloadOperation) const { return (getCopyOffloadModeForOperation(offloadOperation) == CopyOffloadModes::dualStream); }
|
bool isDualStreamCopyOffloadOperation(bool offloadOperation) const { return (getCopyOffloadModeForOperation(offloadOperation) == CopyOffloadModes::dualStream); }
|
||||||
void saveLatestTagAndTaskCount(uint64_t tagGpuAddress, TaskCountType submittedTaskCount) {
|
void saveLatestTagAndTaskCount(NEO::GraphicsAllocation *tagGpuAllocation, TaskCountType submittedTaskCount) {
|
||||||
this->latestTagGpuAddress = tagGpuAddress;
|
this->latesTagGpuAllocation = tagGpuAllocation;
|
||||||
this->latestTaskCount = submittedTaskCount;
|
this->latestTaskCount = submittedTaskCount;
|
||||||
}
|
}
|
||||||
uint64_t getLatestTagGpuAddress() const {
|
uint64_t getLatestTagGpuAddress() const {
|
||||||
return this->latestTagGpuAddress;
|
return this->latesTagGpuAllocation == nullptr ? 0 : this->latesTagGpuAllocation->getGpuAddress();
|
||||||
|
}
|
||||||
|
NEO::GraphicsAllocation *getLatestTagGpuAllocation() const {
|
||||||
|
return this->latesTagGpuAllocation;
|
||||||
}
|
}
|
||||||
TaskCountType getLatestTaskCount() const {
|
TaskCountType getLatestTaskCount() const {
|
||||||
return this->latestTaskCount;
|
return this->latestTaskCount;
|
||||||
@@ -563,7 +566,6 @@ struct CommandList : _ze_command_list_handle_t {
|
|||||||
NEO::L1CachePolicy l1CachePolicyData{};
|
NEO::L1CachePolicy l1CachePolicyData{};
|
||||||
NEO::EncodeDummyBlitWaArgs dummyBlitWa{};
|
NEO::EncodeDummyBlitWaArgs dummyBlitWa{};
|
||||||
|
|
||||||
uint64_t latestTagGpuAddress = 0;
|
|
||||||
int64_t currentSurfaceStateBaseAddress = NEO::StreamProperty64::initValue;
|
int64_t currentSurfaceStateBaseAddress = NEO::StreamProperty64::initValue;
|
||||||
int64_t currentDynamicStateBaseAddress = NEO::StreamProperty64::initValue;
|
int64_t currentDynamicStateBaseAddress = NEO::StreamProperty64::initValue;
|
||||||
int64_t currentIndirectObjectBaseAddress = NEO::StreamProperty64::initValue;
|
int64_t currentIndirectObjectBaseAddress = NEO::StreamProperty64::initValue;
|
||||||
@@ -571,6 +573,7 @@ struct CommandList : _ze_command_list_handle_t {
|
|||||||
|
|
||||||
TaskCountType latestTaskCount = 0;
|
TaskCountType latestTaskCount = 0;
|
||||||
|
|
||||||
|
NEO::GraphicsAllocation *latesTagGpuAllocation = nullptr;
|
||||||
ze_context_handle_t hContext = nullptr;
|
ze_context_handle_t hContext = nullptr;
|
||||||
CommandQueue *cmdQImmediate = nullptr;
|
CommandQueue *cmdQImmediate = nullptr;
|
||||||
CommandQueue *cmdQImmediateCopyOffload = nullptr;
|
CommandQueue *cmdQImmediateCopyOffload = nullptr;
|
||||||
|
|||||||
@@ -154,7 +154,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
|
|||||||
|
|
||||||
this->inOrderPatchCmds.clear();
|
this->inOrderPatchCmds.clear();
|
||||||
this->totalNoopSpace = 0;
|
this->totalNoopSpace = 0;
|
||||||
this->latestTagGpuAddress = 0;
|
this->latesTagGpuAllocation = nullptr;
|
||||||
this->latestTaskCount = 0;
|
this->latestTaskCount = 0;
|
||||||
|
|
||||||
return ZE_RESULT_SUCCESS;
|
return ZE_RESULT_SUCCESS;
|
||||||
|
|||||||
@@ -1880,7 +1880,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::appendCommandLists(ui
|
|||||||
requireTaskCountUpdate,
|
requireTaskCountUpdate,
|
||||||
&mainAppendLock,
|
&mainAppendLock,
|
||||||
&mainLockForIndirect);
|
&mainLockForIndirect);
|
||||||
queueImp->saveTagAndTaskCountForCommandLists(numCommandLists, phCommandLists, queueImp->getCsr()->getTagAllocation()->getGpuAddress(), queueImp->getTaskCount());
|
queueImp->saveTagAndTaskCountForCommandLists(numCommandLists, phCommandLists, queueImp->getCsr()->getTagAllocation(), queueImp->getTaskCount());
|
||||||
return retCode;
|
return retCode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -43,11 +43,11 @@ bool CommandQueue::frontEndTrackingEnabled() const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void CommandQueue::saveTagAndTaskCountForCommandLists(uint32_t numCommandLists, ze_command_list_handle_t *commandListHandles,
|
void CommandQueue::saveTagAndTaskCountForCommandLists(uint32_t numCommandLists, ze_command_list_handle_t *commandListHandles,
|
||||||
uint64_t tagGpuAddress, TaskCountType submittedTaskCount) {
|
NEO::GraphicsAllocation *tagGpuAllocation, TaskCountType submittedTaskCount) {
|
||||||
if (this->saveWaitForPreamble) {
|
if (this->saveWaitForPreamble) {
|
||||||
for (uint32_t i = 0; i < numCommandLists; i++) {
|
for (uint32_t i = 0; i < numCommandLists; i++) {
|
||||||
auto commandList = CommandList::fromHandle(commandListHandles[i]);
|
auto commandList = CommandList::fromHandle(commandListHandles[i]);
|
||||||
commandList->saveLatestTagAndTaskCount(tagGpuAddress, submittedTaskCount);
|
commandList->saveLatestTagAndTaskCount(tagGpuAllocation, submittedTaskCount);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -410,8 +410,9 @@ void CommandQueueImp::makeResidentForResidencyContainer(const NEO::ResidencyCont
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool CommandQueueImp::checkNeededPatchPreambleWait(uint64_t tagGpuAddress) {
|
bool CommandQueueImp::checkNeededPatchPreambleWait(CommandList *commandList) {
|
||||||
return this->saveWaitForPreamble && (getCsr()->getTagAllocation()->getGpuAddress() != tagGpuAddress);
|
uint64_t tagGpuAddress = commandList->getLatestTagGpuAddress();
|
||||||
|
return this->saveWaitForPreamble && (tagGpuAddress != 0) && (getCsr()->getTagAllocation()->getGpuAddress() != tagGpuAddress);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace L0
|
} // namespace L0
|
||||||
|
|||||||
@@ -101,7 +101,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
|
|||||||
return this->saveWaitForPreamble;
|
return this->saveWaitForPreamble;
|
||||||
}
|
}
|
||||||
void saveTagAndTaskCountForCommandLists(uint32_t numCommandLists, ze_command_list_handle_t *commandListHandles,
|
void saveTagAndTaskCountForCommandLists(uint32_t numCommandLists, ze_command_list_handle_t *commandListHandles,
|
||||||
uint64_t tagGpuAddress, TaskCountType submittedTaskCount);
|
NEO::GraphicsAllocation *tagGpuAllocation, TaskCountType submittedTaskCount);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
bool frontEndTrackingEnabled() const;
|
bool frontEndTrackingEnabled() const;
|
||||||
|
|||||||
@@ -159,10 +159,12 @@ struct CommandQueueHw : public CommandQueueImp {
|
|||||||
inline size_t estimateCommandListPatchPreamble(CommandListExecutionContext &ctx, uint32_t numCommandLists);
|
inline size_t estimateCommandListPatchPreamble(CommandListExecutionContext &ctx, uint32_t numCommandLists);
|
||||||
inline size_t estimateCommandListPatchPreambleFrontEndCmd(CommandListExecutionContext &ctx, CommandList *commandList);
|
inline size_t estimateCommandListPatchPreambleFrontEndCmd(CommandListExecutionContext &ctx, CommandList *commandList);
|
||||||
inline void getCommandListPatchPreambleData(CommandListExecutionContext &ctx, CommandList *commandList);
|
inline void getCommandListPatchPreambleData(CommandListExecutionContext &ctx, CommandList *commandList);
|
||||||
|
inline size_t estimateCommandListPatchPreambleWaitSync(CommandListExecutionContext &ctx, CommandList *commandList);
|
||||||
inline size_t estimateTotalPatchPreambleData(CommandListExecutionContext &ctx);
|
inline size_t estimateTotalPatchPreambleData(CommandListExecutionContext &ctx);
|
||||||
inline void retrivePatchPreambleSpace(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream);
|
inline void retrivePatchPreambleSpace(CommandListExecutionContext &ctx, NEO::LinearStream &commandStream);
|
||||||
inline void dispatchPatchPreambleEnding(CommandListExecutionContext &ctx);
|
inline void dispatchPatchPreambleEnding(CommandListExecutionContext &ctx);
|
||||||
inline void dispatchPatchPreambleInOrderNoop(CommandListExecutionContext &ctx, CommandList *commandList);
|
inline void dispatchPatchPreambleInOrderNoop(CommandListExecutionContext &ctx, CommandList *commandList);
|
||||||
|
inline void dispatchPatchPreambleCommandListWaitSync(CommandListExecutionContext &ctx, CommandList *commandList);
|
||||||
inline size_t estimateCommandListResidencySize(CommandList *commandList);
|
inline size_t estimateCommandListResidencySize(CommandList *commandList);
|
||||||
inline void setFrontEndStateProperties(CommandListExecutionContext &ctx);
|
inline void setFrontEndStateProperties(CommandListExecutionContext &ctx);
|
||||||
inline void handleScratchSpaceAndUpdateGSBAStateDirtyFlag(CommandListExecutionContext &ctx);
|
inline void handleScratchSpaceAndUpdateGSBAStateDirtyFlag(CommandListExecutionContext &ctx);
|
||||||
|
|||||||
@@ -209,6 +209,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegularHeapless(
|
|||||||
|
|
||||||
ctx.childGpuAddressPositionBeforeDynamicPreamble = (*streamForDispatch).getCurrentGpuAddressPosition();
|
ctx.childGpuAddressPositionBeforeDynamicPreamble = (*streamForDispatch).getCurrentGpuAddressPosition();
|
||||||
|
|
||||||
|
this->dispatchPatchPreambleCommandListWaitSync(ctx, commandList);
|
||||||
|
|
||||||
this->patchCommands(*commandList, ctx);
|
this->patchCommands(*commandList, ctx);
|
||||||
this->programOneCmdListBatchBufferStart(commandList, *streamForDispatch, ctx);
|
this->programOneCmdListBatchBufferStart(commandList, *streamForDispatch, ctx);
|
||||||
|
|
||||||
@@ -266,6 +268,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateStreamSizeForExecuteCommandListsRe
|
|||||||
getCommandListPatchPreambleData(ctx, cmdList);
|
getCommandListPatchPreambleData(ctx, cmdList);
|
||||||
|
|
||||||
linearStreamSizeEstimate += estimateCommandListPatchPreambleFrontEndCmd(ctx, cmdList);
|
linearStreamSizeEstimate += estimateCommandListPatchPreambleFrontEndCmd(ctx, cmdList);
|
||||||
|
linearStreamSizeEstimate += estimateCommandListPatchPreambleWaitSync(ctx, cmdList);
|
||||||
linearStreamSizeEstimate += estimateCommandListSecondaryStart(cmdList);
|
linearStreamSizeEstimate += estimateCommandListSecondaryStart(cmdList);
|
||||||
}
|
}
|
||||||
linearStreamSizeEstimate += estimateTotalPatchPreambleData(ctx);
|
linearStreamSizeEstimate += estimateTotalPatchPreambleData(ctx);
|
||||||
@@ -431,6 +434,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
this->dispatchPatchPreambleCommandListWaitSync(ctx, commandList);
|
||||||
|
|
||||||
this->patchCommands(*commandList, ctx);
|
this->patchCommands(*commandList, ctx);
|
||||||
this->programOneCmdListBatchBufferStart(commandList, *streamForDispatch, ctx);
|
this->programOneCmdListBatchBufferStart(commandList, *streamForDispatch, ctx);
|
||||||
|
|
||||||
@@ -493,6 +498,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
|
|||||||
fenceRequired |= commandList->isTaskCountUpdateFenceRequired();
|
fenceRequired |= commandList->isTaskCountUpdateFenceRequired();
|
||||||
|
|
||||||
linearStreamSizeEstimate += estimateCommandListSecondaryStart(commandList);
|
linearStreamSizeEstimate += estimateCommandListSecondaryStart(commandList);
|
||||||
|
linearStreamSizeEstimate += estimateCommandListPatchPreambleWaitSync(ctx, commandList);
|
||||||
}
|
}
|
||||||
|
|
||||||
linearStreamSizeEstimate += this->estimateCommandListPatchPreamble(ctx, numCommandLists);
|
linearStreamSizeEstimate += this->estimateCommandListPatchPreamble(ctx, numCommandLists);
|
||||||
@@ -522,6 +528,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsCopyOnly(
|
|||||||
auto commandList = CommandList::fromHandle(phCommandLists[i]);
|
auto commandList = CommandList::fromHandle(phCommandLists[i]);
|
||||||
ctx.childGpuAddressPositionBeforeDynamicPreamble = (*streamForDispatch).getCurrentGpuAddressPosition();
|
ctx.childGpuAddressPositionBeforeDynamicPreamble = (*streamForDispatch).getCurrentGpuAddressPosition();
|
||||||
|
|
||||||
|
this->dispatchPatchPreambleCommandListWaitSync(ctx, commandList);
|
||||||
|
|
||||||
this->programOneCmdListBatchBufferStart(commandList, *streamForDispatch, ctx);
|
this->programOneCmdListBatchBufferStart(commandList, *streamForDispatch, ctx);
|
||||||
this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList);
|
this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList);
|
||||||
this->dispatchPatchPreambleInOrderNoop(ctx, commandList);
|
this->dispatchPatchPreambleInOrderNoop(ctx, commandList);
|
||||||
@@ -924,6 +932,21 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPatchPreambleFrontEndCm
|
|||||||
return encodeSize;
|
return encodeSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||||
|
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListPatchPreambleWaitSync(CommandListExecutionContext &ctx, CommandList *commandList) {
|
||||||
|
using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
|
||||||
|
size_t waitSize = 0;
|
||||||
|
if (this->patchingPreamble) {
|
||||||
|
bool needWait = this->checkNeededPatchPreambleWait(commandList);
|
||||||
|
if (needWait) {
|
||||||
|
waitSize = NEO::EncodeSemaphore<GfxFamily>::getSizeMiSemaphoreWait();
|
||||||
|
waitSize += (2 * sizeof(MI_LOAD_REGISTER_IMM));
|
||||||
|
}
|
||||||
|
ctx.bufferSpaceForPatchPreamble += waitSize;
|
||||||
|
}
|
||||||
|
return waitSize;
|
||||||
|
}
|
||||||
|
|
||||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||||
inline size_t CommandQueueHw<gfxCoreFamily>::estimateTotalPatchPreambleData(CommandListExecutionContext &ctx) {
|
inline size_t CommandQueueHw<gfxCoreFamily>::estimateTotalPatchPreambleData(CommandListExecutionContext &ctx) {
|
||||||
size_t encodeSize = 0;
|
size_t encodeSize = 0;
|
||||||
@@ -1007,6 +1030,49 @@ void CommandQueueHw<gfxCoreFamily>::dispatchPatchPreambleInOrderNoop(CommandList
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||||
|
void CommandQueueHw<gfxCoreFamily>::dispatchPatchPreambleCommandListWaitSync(CommandListExecutionContext &ctx, CommandList *commandList) {
|
||||||
|
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
|
||||||
|
using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
|
||||||
|
using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
|
||||||
|
|
||||||
|
if (this->patchingPreamble) {
|
||||||
|
if (this->checkNeededPatchPreambleWait(commandList)) {
|
||||||
|
constexpr uint32_t firstRegister = RegisterOffsets::csGprR0;
|
||||||
|
constexpr uint32_t secondRegister = RegisterOffsets::csGprR0 + 4;
|
||||||
|
|
||||||
|
auto waitValue = commandList->getLatestTaskCount();
|
||||||
|
|
||||||
|
NEO::LriHelper<GfxFamily>::program(reinterpret_cast<MI_LOAD_REGISTER_IMM *>(ctx.currentPatchPreambleBuffer),
|
||||||
|
firstRegister,
|
||||||
|
getLowPart(waitValue),
|
||||||
|
true,
|
||||||
|
this->isCopyOnlyCommandQueue);
|
||||||
|
ctx.currentPatchPreambleBuffer = ptrOffset(ctx.currentPatchPreambleBuffer, sizeof(MI_LOAD_REGISTER_IMM));
|
||||||
|
|
||||||
|
NEO::LriHelper<GfxFamily>::program(reinterpret_cast<MI_LOAD_REGISTER_IMM *>(ctx.currentPatchPreambleBuffer),
|
||||||
|
secondRegister,
|
||||||
|
getHighPart(waitValue),
|
||||||
|
true,
|
||||||
|
this->isCopyOnlyCommandQueue);
|
||||||
|
ctx.currentPatchPreambleBuffer = ptrOffset(ctx.currentPatchPreambleBuffer, sizeof(MI_LOAD_REGISTER_IMM));
|
||||||
|
|
||||||
|
NEO::EncodeSemaphore<GfxFamily>::programMiSemaphoreWait(reinterpret_cast<MI_SEMAPHORE_WAIT *>(ctx.currentPatchPreambleBuffer),
|
||||||
|
commandList->getLatestTagGpuAddress(),
|
||||||
|
commandList->getLatestTaskCount(),
|
||||||
|
COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD,
|
||||||
|
false,
|
||||||
|
true,
|
||||||
|
GfxFamily::isQwordInOrderCounter,
|
||||||
|
GfxFamily::isQwordInOrderCounter,
|
||||||
|
false);
|
||||||
|
ctx.currentPatchPreambleBuffer = ptrOffset(ctx.currentPatchPreambleBuffer, NEO::EncodeSemaphore<GfxFamily>::getSizeMiSemaphoreWait());
|
||||||
|
|
||||||
|
this->csr->makeResident(*commandList->getLatestTagGpuAllocation());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||||
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListResidencySize(CommandList *commandList) {
|
size_t CommandQueueHw<gfxCoreFamily>::estimateCommandListResidencySize(CommandList *commandList) {
|
||||||
return commandList->getCmdContainer().getResidencyContainer().size();
|
return commandList->getCmdContainer().getResidencyContainer().size();
|
||||||
@@ -1079,6 +1145,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateLinearStreamSizeComplementary(
|
|||||||
|
|
||||||
getCommandListPatchPreambleData(ctx, cmdList);
|
getCommandListPatchPreambleData(ctx, cmdList);
|
||||||
linearStreamSizeEstimate += estimateCommandListPatchPreambleFrontEndCmd(ctx, cmdList);
|
linearStreamSizeEstimate += estimateCommandListPatchPreambleFrontEndCmd(ctx, cmdList);
|
||||||
|
linearStreamSizeEstimate += estimateCommandListPatchPreambleWaitSync(ctx, cmdList);
|
||||||
linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(frontEndStateDirty, cmdList,
|
linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(frontEndStateDirty, cmdList,
|
||||||
streamProperties, requiredStreamState, finalStreamState,
|
streamProperties, requiredStreamState, finalStreamState,
|
||||||
cmdListState.requiredState,
|
cmdListState.requiredState,
|
||||||
@@ -1423,10 +1490,6 @@ void CommandQueueHw<gfxCoreFamily>::programOneCmdListBatchBufferStartSecondaryBa
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx.containsParentImmediateStream) {
|
|
||||||
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferEnd(commandContainer);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||||
@@ -1609,7 +1672,7 @@ void CommandQueueHw<gfxCoreFamily>::updateTaskCountAndPostSync(bool isDispatchTa
|
|||||||
this->taskCount = this->csr->peekTaskCount();
|
this->taskCount = this->csr->peekTaskCount();
|
||||||
this->csr->setLatestFlushedTaskCount(this->taskCount);
|
this->csr->setLatestFlushedTaskCount(this->taskCount);
|
||||||
|
|
||||||
this->saveTagAndTaskCountForCommandLists(numCommandLists, commandListHandles, this->csr->getTagAllocation()->getGpuAddress(), this->taskCount);
|
this->saveTagAndTaskCountForCommandLists(numCommandLists, commandListHandles, this->csr->getTagAllocation(), this->taskCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||||
|
|||||||
@@ -120,7 +120,7 @@ struct CommandQueueImp : public CommandQueue {
|
|||||||
}
|
}
|
||||||
void makeResidentForResidencyContainer(const NEO::ResidencyContainer &residencyContainer);
|
void makeResidentForResidencyContainer(const NEO::ResidencyContainer &residencyContainer);
|
||||||
|
|
||||||
bool checkNeededPatchPreambleWait(uint64_t tagGpuAddress);
|
bool checkNeededPatchPreambleWait(CommandList *commandList);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
MOCKABLE_VIRTUAL NEO::SubmissionStatus submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr,
|
MOCKABLE_VIRTUAL NEO::SubmissionStatus submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr,
|
||||||
|
|||||||
@@ -9,6 +9,7 @@
|
|||||||
#include "shared/source/helpers/pause_on_gpu_properties.h"
|
#include "shared/source/helpers/pause_on_gpu_properties.h"
|
||||||
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
|
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
|
||||||
#include "shared/test/common/helpers/unit_test_helper.h"
|
#include "shared/test/common/helpers/unit_test_helper.h"
|
||||||
|
#include "shared/test/common/libult/ult_command_stream_receiver.h"
|
||||||
#include "shared/test/common/mocks/mock_bindless_heaps_helper.h"
|
#include "shared/test/common/mocks/mock_bindless_heaps_helper.h"
|
||||||
#include "shared/test/common/test_macros/hw_test.h"
|
#include "shared/test/common/test_macros/hw_test.h"
|
||||||
|
|
||||||
@@ -1321,31 +1322,39 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleAndSavingW
|
|||||||
EXPECT_TRUE(commandQueue->getPatchingPreamble());
|
EXPECT_TRUE(commandQueue->getPatchingPreamble());
|
||||||
EXPECT_FALSE(commandQueue->getSaveWaitForPreamble());
|
EXPECT_FALSE(commandQueue->getSaveWaitForPreamble());
|
||||||
|
|
||||||
uint64_t expectedGpuAddress = commandQueue->getCsr()->getTagAllocation()->getGpuAddress();
|
NEO::GraphicsAllocation *expectedGpuAllocation = commandQueue->getCsr()->getTagAllocation();
|
||||||
TaskCountType expectedTaskCount = 0x456;
|
TaskCountType expectedTaskCount = 0x456;
|
||||||
|
uint64_t expectedGpuAddress = expectedGpuAllocation->getGpuAddress();
|
||||||
|
|
||||||
commandQueue->saveTagAndTaskCountForCommandLists(1, &commandListHandle, expectedGpuAddress, expectedTaskCount);
|
commandQueue->saveTagAndTaskCountForCommandLists(1, &commandListHandle, expectedGpuAllocation, expectedTaskCount);
|
||||||
// save and wait is disabled, so nothing to be saved
|
// save and wait is disabled, so nothing to be saved
|
||||||
EXPECT_EQ(0u, commandList->getLatestTagGpuAddress());
|
EXPECT_EQ(0u, commandList->getLatestTagGpuAddress());
|
||||||
EXPECT_EQ(0u, commandList->getLatestTaskCount());
|
EXPECT_EQ(0u, commandList->getLatestTaskCount());
|
||||||
|
|
||||||
EXPECT_FALSE(commandQueue->checkNeededPatchPreambleWait(expectedGpuAddress));
|
EXPECT_FALSE(commandQueue->checkNeededPatchPreambleWait(commandList));
|
||||||
|
|
||||||
commandQueue->setPatchingPreamble(true, true);
|
commandQueue->setPatchingPreamble(true, true);
|
||||||
EXPECT_TRUE(commandQueue->getPatchingPreamble());
|
EXPECT_TRUE(commandQueue->getPatchingPreamble());
|
||||||
EXPECT_TRUE(commandQueue->getSaveWaitForPreamble());
|
EXPECT_TRUE(commandQueue->getSaveWaitForPreamble());
|
||||||
|
|
||||||
EXPECT_TRUE(commandQueue->checkNeededPatchPreambleWait(expectedGpuAddress + 1000));
|
EXPECT_FALSE(commandQueue->checkNeededPatchPreambleWait(commandList));
|
||||||
EXPECT_FALSE(commandQueue->checkNeededPatchPreambleWait(expectedGpuAddress));
|
|
||||||
|
|
||||||
commandQueue->saveTagAndTaskCountForCommandLists(1, &commandListHandle, expectedGpuAddress, expectedTaskCount);
|
commandQueue->saveTagAndTaskCountForCommandLists(1, &commandListHandle, expectedGpuAllocation, expectedTaskCount);
|
||||||
// save and wait is now enabled
|
// save and wait is now enabled
|
||||||
EXPECT_EQ(expectedGpuAddress, commandList->getLatestTagGpuAddress());
|
EXPECT_EQ(expectedGpuAddress, commandList->getLatestTagGpuAddress());
|
||||||
EXPECT_EQ(expectedTaskCount, commandList->getLatestTaskCount());
|
EXPECT_EQ(expectedTaskCount, commandList->getLatestTaskCount());
|
||||||
|
|
||||||
|
EXPECT_FALSE(commandQueue->checkNeededPatchPreambleWait(commandList));
|
||||||
|
|
||||||
|
MockGraphicsAllocation otherTagAllocation(nullptr, expectedGpuAddress + 0x1000, 1);
|
||||||
|
|
||||||
|
commandQueue->saveTagAndTaskCountForCommandLists(1, &commandListHandle, &otherTagAllocation, expectedTaskCount);
|
||||||
|
EXPECT_TRUE(commandQueue->checkNeededPatchPreambleWait(commandList));
|
||||||
|
|
||||||
commandList->reset();
|
commandList->reset();
|
||||||
EXPECT_EQ(0u, commandList->getLatestTagGpuAddress());
|
EXPECT_EQ(0u, commandList->getLatestTagGpuAddress());
|
||||||
EXPECT_EQ(0u, commandList->getLatestTaskCount());
|
EXPECT_EQ(0u, commandList->getLatestTaskCount());
|
||||||
|
EXPECT_EQ(nullptr, commandList->getLatestTagGpuAllocation());
|
||||||
|
|
||||||
commandList->destroy();
|
commandList->destroy();
|
||||||
commandQueue->destroy();
|
commandQueue->destroy();
|
||||||
@@ -1376,7 +1385,8 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleAndSavingW
|
|||||||
commandList->close();
|
commandList->close();
|
||||||
|
|
||||||
commandQueue->setPatchingPreamble(true, true);
|
commandQueue->setPatchingPreamble(true, true);
|
||||||
commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr);
|
returnValue = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr);
|
||||||
|
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||||
|
|
||||||
uint64_t expectedGpuAddress = commandQueue->getCsr()->getTagAllocation()->getGpuAddress();
|
uint64_t expectedGpuAddress = commandQueue->getCsr()->getTagAllocation()->getGpuAddress();
|
||||||
TaskCountType expectedTaskCount = commandQueue->getTaskCount();
|
TaskCountType expectedTaskCount = commandQueue->getTaskCount();
|
||||||
@@ -1409,7 +1419,8 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleAndSavingW
|
|||||||
commandList->close();
|
commandList->close();
|
||||||
|
|
||||||
immediateCmdList->setPatchingPreamble(true, true);
|
immediateCmdList->setPatchingPreamble(true, true);
|
||||||
immediateCmdList->appendCommandLists(1, &commandListHandle, nullptr, 0, nullptr);
|
returnValue = immediateCmdList->appendCommandLists(1, &commandListHandle, nullptr, 0, nullptr);
|
||||||
|
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
|
||||||
|
|
||||||
uint64_t expectedGpuAddress = immediateQueue->getCsr()->getTagAllocation()->getGpuAddress();
|
uint64_t expectedGpuAddress = immediateQueue->getCsr()->getTagAllocation()->getGpuAddress();
|
||||||
TaskCountType expectedTaskCount = immediateQueue->getTaskCount();
|
TaskCountType expectedTaskCount = immediateQueue->getTaskCount();
|
||||||
@@ -1420,5 +1431,114 @@ HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleAndSavingW
|
|||||||
commandList->destroy();
|
commandList->destroy();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Executes the same regular command list three times on a queue with patch
// preamble and wait-data saving enabled. The first two submissions must not
// contain MI_SEMAPHORE_WAIT; after the command list is told it last ran against
// a different tag allocation, the third submission must contain two LRIs and one
// semaphore wait placed ahead of all MI_STORE_DATA_IMM commands.
HWTEST_F(CommandQueueExecuteCommandListsSimpleTest, givenPatchPreambleAndSavingWaitDataWhenCmdListExecutedByQueueThenSemaphoreDispatchedWhenNeeded) {
    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
    using COMPARE_OPERATION = typename FamilyType::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;

    ze_result_t returnValue;
    ze_command_queue_desc_t queueDesc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
    queueDesc.ordinal = 0u;
    queueDesc.index = 0u;
    queueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
    queueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;

    auto createdQueue = CommandQueue::create(productFamily,
                                             device,
                                             neoDevice->getDefaultEngine().commandStreamReceiver,
                                             &queueDesc,
                                             false,
                                             false,
                                             false,
                                             returnValue);
    WhiteBox<L0::CommandQueue> *commandQueue = whiteboxCast(createdQueue);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);

    // A tag allocation whose GPU address differs from the one owned by the queue's CSR.
    const auto foreignTagAddress = commandQueue->getCsr()->getTagAllocation()->getGpuAddress() + 0x1000;
    MockGraphicsAllocation otherTagAllocation(nullptr, foreignTagAddress, 1);

    auto commandList = CommandList::create(productFamily, device, NEO::EngineGroupType::compute, 0u, returnValue, false);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);

    ze_command_list_handle_t commandListHandle = commandList->toHandle();
    commandList->close();
    commandQueue->setPatchingPreamble(true, true);

    GenCmdList cmdList;
    size_t usedSpaceBefore = 0;
    size_t usedSpaceAfter = 0;

    // Submits the command list once and records the queue stream usage around the submission.
    auto submitOnce = [&]() {
        usedSpaceBefore = commandQueue->commandStream.getUsed();
        returnValue = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, nullptr);
        usedSpaceAfter = commandQueue->commandStream.getUsed();
    };

    // Re-parses only the commands added to the queue stream by the latest submission.
    auto parseLatestSubmission = [&]() {
        cmdList.clear();
        return FamilyType::Parse::parseCommandBuffer(cmdList,
                                                     ptrOffset(commandQueue->commandStream.getCpuBase(), usedSpaceBefore),
                                                     usedSpaceAfter - usedSpaceBefore);
    };

    // first execution of command list, no prior history and no semaphore required
    submitOnce();
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
    ASSERT_TRUE(parseLatestSubmission());

    auto semWaitCmds = findAll<MI_SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
    EXPECT_EQ(0u, semWaitCmds.size());

    // second execution of command list, same tag allocation and no semaphore required
    submitOnce();
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
    ASSERT_TRUE(parseLatestSubmission());

    semWaitCmds = findAll<MI_SEMAPHORE_WAIT *>(cmdList.begin(), cmdList.end());
    EXPECT_EQ(0u, semWaitCmds.size());

    // change tag allocation to simulate previous execution on different context
    constexpr uint32_t otherTaskCount = 0x123;
    commandList->saveLatestTagAndTaskCount(&otherTagAllocation, otherTaskCount);

    auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(commandQueue->getCsr());
    ultCsr->storeMakeResidentAllocations = true;

    // third execution of command list, different tag allocation and semaphore required
    submitOnce();
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
    ASSERT_TRUE(parseLatestSubmission());

    auto lriCmds = findAll<MI_LOAD_REGISTER_IMM *>(cmdList.begin(), cmdList.end());
    ASSERT_EQ(2u, lriCmds.size());

    auto lowDwordLri = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(*lriCmds[0]);
    EXPECT_EQ(RegisterOffsets::csGprR0, lowDwordLri->getRegisterOffset());
    EXPECT_EQ(getLowPart(otherTaskCount), lowDwordLri->getDataDword());

    auto highDwordLri = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(*lriCmds[1]);
    EXPECT_EQ(RegisterOffsets::csGprR0 + 4, highDwordLri->getRegisterOffset());
    EXPECT_EQ(getHighPart(otherTaskCount), highDwordLri->getDataDword());

    // the semaphore is searched for starting from the second register load
    semWaitCmds = findAll<MI_SEMAPHORE_WAIT *>(lriCmds[1], cmdList.end());
    ASSERT_EQ(1u, semWaitCmds.size());
    auto semWaitCmd = reinterpret_cast<MI_SEMAPHORE_WAIT *>(*semWaitCmds[0]);

    EXPECT_EQ(otherTagAllocation.getGpuAddress(), semWaitCmd->getSemaphoreGraphicsAddress());
    EXPECT_EQ(COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, semWaitCmd->getCompareOperation());

    EXPECT_TRUE(ultCsr->isMadeResident(&otherTagAllocation));

    // verify that all sdi patching commands are after wait synchronize
    auto sdiCmds = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), semWaitCmds[0]);
    EXPECT_EQ(0u, sdiCmds.size());

    sdiCmds = findAll<MI_STORE_DATA_IMM *>(semWaitCmds[0], cmdList.end());
    EXPECT_NE(0u, sdiCmds.size());

    commandList->destroy();
    commandQueue->destroy();
}
|
||||||
|
|
||||||
} // namespace ult
|
} // namespace ult
|
||||||
} // namespace L0
|
} // namespace L0
|
||||||
|
|||||||
Reference in New Issue
Block a user