From cc1f4bed609e61139bbe7ee5b4f95aaaf076d9e3 Mon Sep 17 00:00:00 2001 From: Pawel Wilma Date: Thu, 20 Dec 2018 16:32:47 +0100 Subject: [PATCH] Revert "Use GPU instead of CPU address in programming commands for HwTim(...)" This reverts commit 6202b2222bb5ac24104a2f607f861b0a9179a94d. "Use GPU instead of CPU address in programming commands for HwTimeStamps" Change-Id: I085382d95538ae41068a21c628d606039bf9cdf0 --- runtime/command_queue/enqueue_common.h | 4 ++-- runtime/command_queue/gpgpu_walker.h | 4 ++-- runtime/command_queue/gpgpu_walker.inl | 10 +++++----- runtime/command_queue/hardware_interface.h | 6 +++--- runtime/command_queue/hardware_interface.inl | 2 +- .../command_queue/hardware_interface_base.inl | 4 ++-- runtime/device_queue/device_queue.cpp | 4 ++-- runtime/device_queue/device_queue.h | 6 ++---- runtime/device_queue/device_queue_hw.h | 2 +- runtime/device_queue/device_queue_hw.inl | 5 ++--- runtime/event/event.cpp | 2 +- runtime/helpers/task_information.h | 4 +--- .../device_queue/device_queue_hw_tests.cpp | 18 ++++++++---------- .../submit_blocked_parent_kernel_tests.cpp | 12 ++++++------ 14 files changed, 38 insertions(+), 45 deletions(-) diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index e211340414..8991964c72 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -146,7 +146,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, auto devQueue = this->getContext().getDefaultDeviceQueue(); DeviceQueueHw *devQueueHw = castToObject>(devQueue); - TagNode *hwTimeStamps = nullptr; + HwTimeStamps *hwTimeStamps = nullptr; auto commandStreamRecieverOwnership = getCommandStreamReceiver().obtainUniqueOwnership(); @@ -230,7 +230,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } if (this->isProfilingEnabled()) { // Get allocation for timestamps - hwTimeStamps = eventBuilder.getEvent()->getHwTimeStampNode(); + hwTimeStamps = 
eventBuilder.getEvent()->getHwTimeStampNode()->tag; if (this->isPerfCountersEnabled()) { hwPerfCounter = eventBuilder.getEvent()->getHwPerfCounterNode()->tag; // PERF COUNTER: copy current configuration from queue to event diff --git a/runtime/command_queue/gpgpu_walker.h b/runtime/command_queue/gpgpu_walker.h index 0e02d2b601..8e02d4a4da 100644 --- a/runtime/command_queue/gpgpu_walker.h +++ b/runtime/command_queue/gpgpu_walker.h @@ -139,11 +139,11 @@ class GpgpuWalkerHelper { const iOpenCL::SPatchThreadPayload &threadPayload); static void dispatchProfilingCommandsStart( - TagNode &hwTimeStamps, + HwTimeStamps &hwTimeStamps, OCLRT::LinearStream *commandStream); static void dispatchProfilingCommandsEnd( - TagNode &hwTimeStamps, + HwTimeStamps &hwTimeStamps, OCLRT::LinearStream *commandStream); static void dispatchPerfCountersNoopidRegisterCommands( diff --git a/runtime/command_queue/gpgpu_walker.inl b/runtime/command_queue/gpgpu_walker.inl index c16deec6e2..4b6275f10d 100644 --- a/runtime/command_queue/gpgpu_walker.inl +++ b/runtime/command_queue/gpgpu_walker.inl @@ -101,17 +101,17 @@ void GpgpuWalkerHelper::addAluReadModifyWriteRegister( template void GpgpuWalkerHelper::dispatchProfilingCommandsStart( - TagNode &hwTimeStamps, + HwTimeStamps &hwTimeStamps, OCLRT::LinearStream *commandStream) { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; // PIPE_CONTROL for global timestamp - uint64_t TimeStampAddress = hwTimeStamps.getGraphicsAllocation()->getGpuAddress() + ptrDiff(&hwTimeStamps.tag->GlobalStartTS, hwTimeStamps.tag); + uint64_t TimeStampAddress = reinterpret_cast(&(hwTimeStamps.GlobalStartTS)); PipeControlHelper::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, TimeStampAddress, 0llu); //MI_STORE_REGISTER_MEM for context local timestamp - TimeStampAddress = hwTimeStamps.getGraphicsAllocation()->getGpuAddress() + ptrDiff(&hwTimeStamps.tag->ContextStartTS, hwTimeStamps.tag); + 
TimeStampAddress = reinterpret_cast(&(hwTimeStamps.ContextStartTS)); //low part auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); @@ -122,7 +122,7 @@ void GpgpuWalkerHelper::dispatchProfilingCommandsStart( template void GpgpuWalkerHelper::dispatchProfilingCommandsEnd( - TagNode &hwTimeStamps, + HwTimeStamps &hwTimeStamps, OCLRT::LinearStream *commandStream) { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; @@ -133,7 +133,7 @@ void GpgpuWalkerHelper::dispatchProfilingCommandsEnd( pPipeControlCmd->setCommandStreamerStallEnable(true); //MI_STORE_REGISTER_MEM for context local timestamp - uint64_t TimeStampAddress = hwTimeStamps.getGraphicsAllocation()->getGpuAddress() + ptrDiff(&hwTimeStamps.tag->ContextEndTS, hwTimeStamps.tag); + uint64_t TimeStampAddress = reinterpret_cast(&(hwTimeStamps.ContextEndTS)); //low part auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); diff --git a/runtime/command_queue/hardware_interface.h b/runtime/command_queue/hardware_interface.h index bc46e4ff82..ceb34d310c 100644 --- a/runtime/command_queue/hardware_interface.h +++ b/runtime/command_queue/hardware_interface.h @@ -40,7 +40,7 @@ class HardwareInterface { cl_uint numEventsInWaitList, const cl_event *eventWaitList, KernelOperation **blockedCommandsData, - TagNode *hwTimeStamps, + HwTimeStamps *hwTimeStamps, HwPerfCounter *hwPerfCounter, TimestampPacketContainer *previousTimestampPacketNodes, TimestampPacketContainer *currentTimestampPacketNodes, @@ -69,13 +69,13 @@ class HardwareInterface { static void dispatchProfilingPerfStartCommands( const DispatchInfo &dispatchInfo, const MultiDispatchInfo &multiDispatchInfo, - TagNode *hwTimeStamps, + HwTimeStamps *hwTimeStamps, HwPerfCounter *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue); static void dispatchProfilingPerfEndCommands( - TagNode *hwTimeStamps, + HwTimeStamps *hwTimeStamps, HwPerfCounter 
*hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue); diff --git a/runtime/command_queue/hardware_interface.inl b/runtime/command_queue/hardware_interface.inl index 021bcb8dec..03aeb30261 100644 --- a/runtime/command_queue/hardware_interface.inl +++ b/runtime/command_queue/hardware_interface.inl @@ -19,7 +19,7 @@ void HardwareInterface::dispatchWalker( cl_uint numEventsInWaitList, const cl_event *eventWaitList, KernelOperation **blockedCommandsData, - TagNode *hwTimeStamps, + HwTimeStamps *hwTimeStamps, HwPerfCounter *hwPerfCounter, TimestampPacketContainer *previousTimestampPacketNodes, TimestampPacketContainer *currentTimestampPacketNodes, diff --git a/runtime/command_queue/hardware_interface_base.inl b/runtime/command_queue/hardware_interface_base.inl index 33c5645794..528a5d36a6 100644 --- a/runtime/command_queue/hardware_interface_base.inl +++ b/runtime/command_queue/hardware_interface_base.inl @@ -59,7 +59,7 @@ template inline void HardwareInterface::dispatchProfilingPerfStartCommands( const DispatchInfo &dispatchInfo, const MultiDispatchInfo &multiDispatchInfo, - TagNode *hwTimeStamps, + HwTimeStamps *hwTimeStamps, HwPerfCounter *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue) { @@ -77,7 +77,7 @@ inline void HardwareInterface::dispatchProfilingPerfStartCommands( template inline void HardwareInterface::dispatchProfilingPerfEndCommands( - TagNode *hwTimeStamps, + HwTimeStamps *hwTimeStamps, HwPerfCounter *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue) { diff --git a/runtime/device_queue/device_queue.cpp b/runtime/device_queue/device_queue.cpp index 7065a33f1c..68580259dd 100644 --- a/runtime/device_queue/device_queue.cpp +++ b/runtime/device_queue/device_queue.cpp @@ -143,7 +143,7 @@ void DeviceQueue::initDeviceQueue() { igilEventPool->m_size = caps.maxOnDeviceEvents; } -void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel 
*parentKernel, uint32_t parentCount, uint32_t taskCount, TagNode *hwTimeStamp) { +void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, HwTimeStamps *hwTimeStamp) { setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentCount); addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, taskCount); } @@ -152,7 +152,7 @@ void DeviceQueue::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHea return; } -void DeviceQueue::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint32_t taskCount) { +void DeviceQueue::addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) { return; } diff --git a/runtime/device_queue/device_queue.h b/runtime/device_queue/device_queue.h index a8311781d4..de8765bcc8 100644 --- a/runtime/device_queue/device_queue.h +++ b/runtime/device_queue/device_queue.h @@ -22,8 +22,6 @@ class Event; struct MultiDispatchInfo; class SchedulerKernel; struct HwTimeStamps; -template -struct TagNode; template <> struct OpenCLObjectMapper<_device_queue> { @@ -68,10 +66,10 @@ class DeviceQueue : public BaseObject<_device_queue> { size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet); - void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, TagNode *hwTimeStamp); + void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, HwTimeStamps *hwTimeStamp); virtual void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount); - virtual void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint32_t taskCount); + virtual void 
addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount); MOCKABLE_VIRTUAL bool isEMCriticalSectionFree() { auto igilCmdQueue = reinterpret_cast(queueBuffer->getUnderlyingBuffer()); diff --git a/runtime/device_queue/device_queue_hw.h b/runtime/device_queue/device_queue_hw.h index c7921922e0..ea5135fdd4 100644 --- a/runtime/device_queue/device_queue_hw.h +++ b/runtime/device_queue/device_queue_hw.h @@ -55,7 +55,7 @@ class DeviceQueueHw : public DeviceQueue { void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override; - void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint32_t taskCount) override; + void addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) override; void resetDeviceQueue() override; void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) override; diff --git a/runtime/device_queue/device_queue_hw.inl b/runtime/device_queue/device_queue_hw.inl index 52e27d3596..8e51a656ed 100644 --- a/runtime/device_queue/device_queue_hw.inl +++ b/runtime/device_queue/device_queue_hw.inl @@ -12,7 +12,6 @@ #include "runtime/helpers/preamble.h" #include "runtime/helpers/string.h" #include "runtime/memory_manager/memory_manager.h" -#include "runtime/utilities/tag_allocator.h" namespace OCLRT { template @@ -202,7 +201,7 @@ void DeviceQueueHw::buildSlbDummyCommands() { } template -void DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode *hwTimeStamp, uint32_t taskCount) { +void DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) { // CleanUp Section auto offset = slbCS.getUsed(); auto alignmentSize = alignUp(offset, MemoryConstants::pageSize) - offset; @@ -216,7 +215,7 @@ void 
DeviceQueueHw::addExecutionModelCleanUpSection(Kernel *parentKer using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; if (hwTimeStamp != nullptr) { - uint64_t TimeStampAddress = hwTimeStamp->getGraphicsAllocation()->getGpuAddress() + ptrDiff(&hwTimeStamp->tag->ContextCompleteTS, hwTimeStamp->tag); + uint64_t TimeStampAddress = (uint64_t)((uintptr_t) & (hwTimeStamp->ContextCompleteTS)); igilQueue->m_controls.m_EventTimestampAddress = TimeStampAddress; addProfilingEndCmds(TimeStampAddress); diff --git a/runtime/event/event.cpp b/runtime/event/event.cpp index 4bf0702aee..2d25a502a4 100644 --- a/runtime/event/event.cpp +++ b/runtime/event/event.cpp @@ -451,7 +451,7 @@ void Event::submitCommand(bool abortTasks) { if ((this->isProfilingEnabled()) && (this->cmdQueue != nullptr)) { if (timeStampNode) { this->cmdQueue->getCommandStreamReceiver().makeResident(*timeStampNode->getGraphicsAllocation()); - cmdToProcess->timestamp = timeStampNode; + cmdToProcess->timestamp = timeStampNode->tag; } if (profilingCpuPath) { setSubmitTimeStamp(); diff --git a/runtime/helpers/task_information.h b/runtime/helpers/task_information.h index 16ef736700..abde814543 100644 --- a/runtime/helpers/task_information.h +++ b/runtime/helpers/task_information.h @@ -26,8 +26,6 @@ class Surface; class PrintfHandler; struct HwTimeStamps; class TimestampPacketContainer; -template -struct TagNode; enum MapOperationType { MAP, @@ -44,7 +42,7 @@ class Command : public IFNode { virtual LinearStream *getCommandStream() { return nullptr; } - TagNode *timestamp = nullptr; + HwTimeStamps *timestamp = nullptr; CompletionStamp completionStamp = {}; }; diff --git a/unit_tests/device_queue/device_queue_hw_tests.cpp b/unit_tests/device_queue/device_queue_hw_tests.cpp index 0bf90281d7..ec180e9b27 100644 --- a/unit_tests/device_queue/device_queue_hw_tests.cpp +++ b/unit_tests/device_queue/device_queue_hw_tests.cpp @@ -7,7 +7,6 @@ #include "hw_cmds.h" #include "runtime/helpers/options.h" -#include 
"runtime/utilities/tag_allocator.h" #include "unit_tests/fixtures/device_host_queue_fixture.h" #include "unit_tests/fixtures/execution_model_fixture.h" #include "unit_tests/helpers/hw_parse.h" @@ -353,19 +352,18 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DeviceQueueSlb, AddEMCleanupSectionWithProfiling) { MockParentKernel *mockParentKernel = MockParentKernel::create(*pContext); uint32_t taskCount = 7; - auto hwTimeStamp = pCommandQueue->getCommandStreamReceiver().getEventTsAllocator()->getTag(); + HwTimeStamps hwTimeStamp; mockDeviceQueueHw->buildSlbDummyCommands(); - mockDeviceQueueHw->addExecutionModelCleanUpSection(mockParentKernel, hwTimeStamp, taskCount); + mockDeviceQueueHw->addExecutionModelCleanUpSection(mockParentKernel, &hwTimeStamp, taskCount); - uint32_t eventTimestampAddrLow = static_cast(igilCmdQueue->m_controls.m_EventTimestampAddress & 0xFFFFFFFF); - uint32_t eventTimestampAddrHigh = static_cast((igilCmdQueue->m_controls.m_EventTimestampAddress & 0xFFFFFFFF00000000) >> 32); + uint32_t eventTimestampLow = (uint32_t)(igilCmdQueue->m_controls.m_EventTimestampAddress & 0xFFFFFFFF); + uint32_t eventTimestampHigh = (uint32_t)((igilCmdQueue->m_controls.m_EventTimestampAddress & 0xFFFFFFFF00000000) >> 32); - uint64_t contextCompleteAddr = hwTimeStamp->getGraphicsAllocation()->getGpuAddress() + ptrDiff(&hwTimeStamp->tag->ContextCompleteTS, hwTimeStamp->tag); - uint32_t contextCompleteAddrLow = static_cast(contextCompleteAddr & 0xFFFFFFFF); - uint32_t contextCompleteAddrHigh = static_cast((contextCompleteAddr & 0xFFFFFFFF00000000) >> 32); + uint32_t contextCompleteLow = (uint32_t)((uint64_t)((uintptr_t)(&hwTimeStamp.ContextCompleteTS)) & 0xFFFFFFFF); + uint32_t contextCompleteHigh = (uint32_t)(((uint64_t)((uintptr_t)(&hwTimeStamp.ContextCompleteTS)) & 0xFFFFFFFF00000000) >> 32); - EXPECT_EQ(contextCompleteAddrLow, eventTimestampAddrLow); - EXPECT_EQ(contextCompleteAddrHigh, eventTimestampAddrHigh); + EXPECT_EQ(contextCompleteLow, eventTimestampLow); + 
EXPECT_EQ(contextCompleteHigh, eventTimestampHigh); HardwareParse hwParser; auto *slbCS = mockDeviceQueueHw->getSlbCS(); diff --git a/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp b/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp index 4122645e2d..e60c2f16a1 100644 --- a/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp +++ b/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp @@ -8,7 +8,6 @@ #include "runtime/command_queue/gpgpu_walker.h" #include "runtime/command_queue/hardware_interface.h" #include "runtime/event/hw_timestamps.h" -#include "runtime/utilities/tag_allocator.h" #include "runtime/helpers/kernel_commands.h" #include "runtime/helpers/task_information.h" #include "unit_tests/mocks/mock_command_queue.h" @@ -57,9 +56,9 @@ class MockDeviceQueueHwWithCriticalSectionRelease : public DeviceQueueHw *hwTimeStamp, uint32_t taskCount) override { + void addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) override { cleanupSectionAdded = true; - timestampAddedInCleanupSection = hwTimeStamp ? 
hwTimeStamp->tag : nullptr; + timestampAddedInCleanupSection = hwTimeStamp; return BaseClass::addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, taskCount); } void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) override { @@ -250,12 +249,13 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenBlockedParentKernelWithProfilingW std::vector surfaces; auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1); - auto timestamp = pCmdQ->getCommandStreamReceiver().getEventTsAllocator()->getTag(); - cmdComputeKernel->timestamp = timestamp; + HwTimeStamps timestamp; + + cmdComputeKernel->timestamp = &timestamp; cmdComputeKernel->submit(0, false); EXPECT_TRUE(mockDevQueue.cleanupSectionAdded); - EXPECT_EQ(mockDevQueue.timestampAddedInCleanupSection, timestamp->tag); + EXPECT_EQ(mockDevQueue.timestampAddedInCleanupSection, &timestamp); delete cmdComputeKernel; delete parentKernel;