diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index af1451731c..e18f532735 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -488,6 +488,7 @@ void CommandQueueHw::processDeviceEnqueue(DeviceQueueHw *d *devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE), parentKernel, (uint32_t)multiDispatchInfo.size(), + getGpgpuCommandStreamReceiver().getTagAllocation()->getGpuAddress(), taskCount, hwTimeStamps); diff --git a/runtime/device_queue/device_queue.cpp b/runtime/device_queue/device_queue.cpp index 3dcb8ae158..dbd1869808 100644 --- a/runtime/device_queue/device_queue.cpp +++ b/runtime/device_queue/device_queue.cpp @@ -144,16 +144,17 @@ void DeviceQueue::initDeviceQueue() { igilEventPool->m_size = caps.maxOnDeviceEvents; } -void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, TagNode *hwTimeStamp) { +void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, + uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNode *hwTimeStamp) { setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentCount); - addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, taskCount); + addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, tagAddress, taskCount); } void DeviceQueue::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) { return; } -void DeviceQueue::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint32_t taskCount) { +void DeviceQueue::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) { return; } diff --git a/runtime/device_queue/device_queue.h b/runtime/device_queue/device_queue.h index 6494cb67ca..3886694fbb 100644 --- a/runtime/device_queue/device_queue.h +++ b/runtime/device_queue/device_queue.h @@ -68,10 +68,10 @@ class DeviceQueue : public BaseObject<_device_queue> { size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet); - void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, TagNode *hwTimeStamp); + void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNode *hwTimeStamp); virtual void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount); - virtual void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint32_t taskCount); + virtual void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount); MOCKABLE_VIRTUAL bool isEMCriticalSectionFree() { auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer()); diff --git a/runtime/device_queue/device_queue_hw.h b/runtime/device_queue/device_queue_hw.h index 74bf96d0a9..737772f1fc 100644 --- a/runtime/device_queue/device_queue_hw.h +++ b/runtime/device_queue/device_queue_hw.h @@ -57,7 +57,7 @@ class DeviceQueueHw : public DeviceQueue { void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override; - void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint32_t taskCount) override; + void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) override; void resetDeviceQueue() override; void dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) override; diff --git a/runtime/device_queue/device_queue_hw_base.inl b/runtime/device_queue/device_queue_hw_base.inl index 0dc5d7ee69..cec433761a 100644 --- a/runtime/device_queue/device_queue_hw_base.inl +++ b/runtime/device_queue/device_queue_hw_base.inl @@ -99,7 +99,7 @@ void DeviceQueueHw::initPipeControl(PIPE_CONTROL *pc) { } template -void DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint32_t taskCount) { +void DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) { // CleanUp Section auto offset = slbCS.getUsed(); auto alignmentSize = alignUp(offset, MemoryConstants::pageSize) - offset; @@ -127,8 +127,6 @@ void DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKer PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, criticalSectionAddress, ExecutionModelCriticalSection::Free, false, device->getHardwareInfo()); - uint64_t tagAddress = reinterpret_cast(device->getDefaultEngine().commandStreamReceiver->getTagAddress()); - PipeControlHelper::obtainPipeControlAndProgramPostSyncOperation(slbCS, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, tagAddress, taskCount, false, device->getHardwareInfo()); diff --git a/runtime/helpers/task_information.cpp b/runtime/helpers/task_information.cpp index 952de32e2b..8366e57ac4 100644 --- a/runtime/helpers/task_information.cpp +++ b/runtime/helpers/task_information.cpp @@ -143,7 +143,8 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate if (executionModelKernel) { uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1; - devQueue->setupExecutionModelDispatch(*ssh, *dsh, kernel, kernelCount, taskCount, timestamp); + devQueue->setupExecutionModelDispatch(*ssh, *dsh, kernel, kernelCount, + commandStreamReceiver.getTagAllocation()->getGpuAddress(), taskCount, timestamp); BuiltIns &builtIns = *this->kernel->getDevice().getExecutionEnvironment()->getBuiltIns(); SchedulerKernel &scheduler = builtIns.getSchedulerKernel(commandQueue.getContext()); diff --git a/unit_tests/device_queue/device_queue_hw_tests.cpp b/unit_tests/device_queue/device_queue_hw_tests.cpp index 1b5737bee1..3ab9c90906 100644 --- a/unit_tests/device_queue/device_queue_hw_tests.cpp +++ b/unit_tests/device_queue/device_queue_hw_tests.cpp @@ -300,7 +300,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DeviceQueueSlb, cleanupSection) { uint32_t taskCount = 7; mockDeviceQueueHw->buildSlbDummyCommands(); - mockDeviceQueueHw->addExecutionModelCleanUpSection(mockParentKernel, nullptr, taskCount); + uint64_t tagAddress = 0x123450000; + mockDeviceQueueHw->addExecutionModelCleanUpSection(mockParentKernel, nullptr, tagAddress, taskCount); HardwareParse hwParser; auto *slbCS = mockDeviceQueueHw->getSlbCS(); @@ -329,6 +330,21 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DeviceQueueSlb, cleanupSection) { auto pipeControlItor = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); EXPECT_NE(hwParser.cmdList.end(), pipeControlItor); + bool tagWriteFound = false; + while (auto pipeControlCmd = genCmdCast(*(++pipeControlItor))) { + if (pipeControlCmd->getPostSyncOperation() == PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + auto expectedAddressLow = static_cast(tagAddress & 0x0000FFFFFFFFULL); + auto expectedAddressHigh = static_cast(tagAddress >> 32); + + if ((expectedAddressLow == pipeControlCmd->getAddress()) && (expectedAddressHigh == pipeControlCmd->getAddressHigh())) { + tagWriteFound = true; + break; + } + } + } + + EXPECT_TRUE(tagWriteFound); + auto bbEndItor = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); EXPECT_NE(hwParser.cmdList.end(), bbEndItor); MI_BATCH_BUFFER_END *bbEnd = (MI_BATCH_BUFFER_END *)*bbEndItor; @@ -355,7 +371,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DeviceQueueSlb, AddEMCleanupSectionWithProfiling) { auto hwTimeStamp = pCommandQueue->getGpgpuCommandStreamReceiver().getEventTsAllocator()->getTag(); mockDeviceQueueHw->buildSlbDummyCommands(); - mockDeviceQueueHw->addExecutionModelCleanUpSection(mockParentKernel, hwTimeStamp, taskCount); + mockDeviceQueueHw->addExecutionModelCleanUpSection(mockParentKernel, hwTimeStamp, 0x123, taskCount); uint64_t eventTimestampAddr = igilCmdQueue->m_controls.m_EventTimestampAddress; uint64_t contextCompleteAddr = hwTimeStamp->getGpuAddress() + offsetof(HwTimeStamps, ContextCompleteTS); @@ -673,7 +689,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TheSimplestDeviceQueueFixture, addExecutionModelClea mockDeviceQueueHw->buildSlbDummyCommands(); EXPECT_FALSE(mockDeviceQueueHw->addMediaStateClearCmdsCalled); - mockDeviceQueueHw->addExecutionModelCleanUpSection(mockParentKernel.get(), nullptr, taskCount); + mockDeviceQueueHw->addExecutionModelCleanUpSection(mockParentKernel.get(), nullptr, 0x123, taskCount); EXPECT_TRUE(mockDeviceQueueHw->addMediaStateClearCmdsCalled); } diff --git a/unit_tests/device_queue/device_queue_tests.cpp b/unit_tests/device_queue/device_queue_tests.cpp index ef88b01b52..1f45ec7a3a 100644 --- a/unit_tests/device_queue/device_queue_tests.cpp +++ b/unit_tests/device_queue/device_queue_tests.cpp @@ -27,7 +27,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DeviceQueueSimpleTest, setupExecutionModelDispatchDo size_t size = 20; IndirectHeap ssh(buffer, size); IndirectHeap dsh(buffer, size); - devQueue.setupExecutionModelDispatch(ssh, dsh, nullptr, 0, 0, 0); + devQueue.setupExecutionModelDispatch(ssh, dsh, nullptr, 0, 0, 0x123, 0); EXPECT_EQ(0u, ssh.getUsed()); diff --git a/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp b/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp index c7da186cfd..526852d38a 100644 --- a/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp +++ b/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp @@ -56,10 +56,10 @@ class MockDeviceQueueHwWithCriticalSectionRelease : public DeviceQueueHw *hwTimeStamp, uint32_t taskCount) override { + void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) override { cleanupSectionAdded = true; timestampAddedInCleanupSection = hwTimeStamp ? hwTimeStamp->tagForCpuAccess : nullptr; - return BaseClass::addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, taskCount); + return BaseClass::addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, tagAddress, taskCount); } void dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) override { schedulerDispatched = true;