Revert "Use GPU instead of CPU address in programming commands for HwTimeStamps"

This reverts commit 6202b2222b.
"Use GPU instead of CPU address in programming commands for HwTimeStamps"

Change-Id: I085382d95538ae41068a21c628d606039bf9cdf0
This commit is contained in:
Pawel Wilma 2018-12-20 16:32:47 +01:00 committed by sys_ocldev
parent 1e011f9a08
commit cc1f4bed60
14 changed files with 38 additions and 45 deletions

View File

@ -146,7 +146,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
auto devQueue = this->getContext().getDefaultDeviceQueue();
DeviceQueueHw<GfxFamily> *devQueueHw = castToObject<DeviceQueueHw<GfxFamily>>(devQueue);
TagNode<HwTimeStamps> *hwTimeStamps = nullptr;
HwTimeStamps *hwTimeStamps = nullptr;
auto commandStreamRecieverOwnership = getCommandStreamReceiver().obtainUniqueOwnership();
@ -230,7 +230,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
}
if (this->isProfilingEnabled()) {
// Get allocation for timestamps
hwTimeStamps = eventBuilder.getEvent()->getHwTimeStampNode();
hwTimeStamps = eventBuilder.getEvent()->getHwTimeStampNode()->tag;
if (this->isPerfCountersEnabled()) {
hwPerfCounter = eventBuilder.getEvent()->getHwPerfCounterNode()->tag;
// PERF COUNTER: copy current configuration from queue to event

View File

@ -139,11 +139,11 @@ class GpgpuWalkerHelper {
const iOpenCL::SPatchThreadPayload &threadPayload);
static void dispatchProfilingCommandsStart(
TagNode<HwTimeStamps> &hwTimeStamps,
HwTimeStamps &hwTimeStamps,
OCLRT::LinearStream *commandStream);
static void dispatchProfilingCommandsEnd(
TagNode<HwTimeStamps> &hwTimeStamps,
HwTimeStamps &hwTimeStamps,
OCLRT::LinearStream *commandStream);
static void dispatchPerfCountersNoopidRegisterCommands(

View File

@ -101,17 +101,17 @@ void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
TagNode<HwTimeStamps> &hwTimeStamps,
HwTimeStamps &hwTimeStamps,
OCLRT::LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
// PIPE_CONTROL for global timestamp
uint64_t TimeStampAddress = hwTimeStamps.getGraphicsAllocation()->getGpuAddress() + ptrDiff(&hwTimeStamps.tag->GlobalStartTS, hwTimeStamps.tag);
uint64_t TimeStampAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.GlobalStartTS));
PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, TimeStampAddress, 0llu);
//MI_STORE_REGISTER_MEM for context local timestamp
TimeStampAddress = hwTimeStamps.getGraphicsAllocation()->getGpuAddress() + ptrDiff(&hwTimeStamps.tag->ContextStartTS, hwTimeStamps.tag);
TimeStampAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.ContextStartTS));
//low part
auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
@ -122,7 +122,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
template <typename GfxFamily>
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
TagNode<HwTimeStamps> &hwTimeStamps,
HwTimeStamps &hwTimeStamps,
OCLRT::LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
@ -133,7 +133,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
pPipeControlCmd->setCommandStreamerStallEnable(true);
//MI_STORE_REGISTER_MEM for context local timestamp
uint64_t TimeStampAddress = hwTimeStamps.getGraphicsAllocation()->getGpuAddress() + ptrDiff(&hwTimeStamps.tag->ContextEndTS, hwTimeStamps.tag);
uint64_t TimeStampAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.ContextEndTS));
//low part
auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));

View File

@ -40,7 +40,7 @@ class HardwareInterface {
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
KernelOperation **blockedCommandsData,
TagNode<HwTimeStamps> *hwTimeStamps,
HwTimeStamps *hwTimeStamps,
HwPerfCounter *hwPerfCounter,
TimestampPacketContainer *previousTimestampPacketNodes,
TimestampPacketContainer *currentTimestampPacketNodes,
@ -69,13 +69,13 @@ class HardwareInterface {
static void dispatchProfilingPerfStartCommands(
const DispatchInfo &dispatchInfo,
const MultiDispatchInfo &multiDispatchInfo,
TagNode<HwTimeStamps> *hwTimeStamps,
HwTimeStamps *hwTimeStamps,
HwPerfCounter *hwPerfCounter,
LinearStream *commandStream,
CommandQueue &commandQueue);
static void dispatchProfilingPerfEndCommands(
TagNode<HwTimeStamps> *hwTimeStamps,
HwTimeStamps *hwTimeStamps,
HwPerfCounter *hwPerfCounter,
LinearStream *commandStream,
CommandQueue &commandQueue);

View File

@ -19,7 +19,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
KernelOperation **blockedCommandsData,
TagNode<HwTimeStamps> *hwTimeStamps,
HwTimeStamps *hwTimeStamps,
HwPerfCounter *hwPerfCounter,
TimestampPacketContainer *previousTimestampPacketNodes,
TimestampPacketContainer *currentTimestampPacketNodes,

View File

@ -59,7 +59,7 @@ template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
const DispatchInfo &dispatchInfo,
const MultiDispatchInfo &multiDispatchInfo,
TagNode<HwTimeStamps> *hwTimeStamps,
HwTimeStamps *hwTimeStamps,
HwPerfCounter *hwPerfCounter,
LinearStream *commandStream,
CommandQueue &commandQueue) {
@ -77,7 +77,7 @@ inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
TagNode<HwTimeStamps> *hwTimeStamps,
HwTimeStamps *hwTimeStamps,
HwPerfCounter *hwPerfCounter,
LinearStream *commandStream,
CommandQueue &commandQueue) {

View File

@ -143,7 +143,7 @@ void DeviceQueue::initDeviceQueue() {
igilEventPool->m_size = caps.maxOnDeviceEvents;
}
void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, TagNode<HwTimeStamps> *hwTimeStamp) {
void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, HwTimeStamps *hwTimeStamp) {
setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentCount);
addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, taskCount);
}
@ -152,7 +152,7 @@ void DeviceQueue::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHea
return;
}
void DeviceQueue::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint32_t taskCount) {
void DeviceQueue::addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) {
return;
}

View File

@ -22,8 +22,6 @@ class Event;
struct MultiDispatchInfo;
class SchedulerKernel;
struct HwTimeStamps;
template <class T>
struct TagNode;
template <>
struct OpenCLObjectMapper<_device_queue> {
@ -68,10 +66,10 @@ class DeviceQueue : public BaseObject<_device_queue> {
size_t paramValueSize, void *paramValue,
size_t *paramValueSizeRet);
void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, TagNode<HwTimeStamps> *hwTimeStamp);
void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint32_t taskCount, HwTimeStamps *hwTimeStamp);
virtual void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount);
virtual void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint32_t taskCount);
virtual void addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount);
MOCKABLE_VIRTUAL bool isEMCriticalSectionFree() {
auto igilCmdQueue = reinterpret_cast<IGIL_CommandQueue *>(queueBuffer->getUnderlyingBuffer());

View File

@ -55,7 +55,7 @@ class DeviceQueueHw : public DeviceQueue {
void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override;
void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint32_t taskCount) override;
void addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) override;
void resetDeviceQueue() override;
void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) override;

View File

@ -12,7 +12,6 @@
#include "runtime/helpers/preamble.h"
#include "runtime/helpers/string.h"
#include "runtime/memory_manager/memory_manager.h"
#include "runtime/utilities/tag_allocator.h"
namespace OCLRT {
template <typename GfxFamily>
@ -202,7 +201,7 @@ void DeviceQueueHw<GfxFamily>::buildSlbDummyCommands() {
}
template <typename GfxFamily>
void DeviceQueueHw<GfxFamily>::addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint32_t taskCount) {
void DeviceQueueHw<GfxFamily>::addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) {
// CleanUp Section
auto offset = slbCS.getUsed();
auto alignmentSize = alignUp(offset, MemoryConstants::pageSize) - offset;
@ -216,7 +215,7 @@ void DeviceQueueHw<GfxFamily>::addExecutionModelCleanUpSection(Kernel *parentKer
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
if (hwTimeStamp != nullptr) {
uint64_t TimeStampAddress = hwTimeStamp->getGraphicsAllocation()->getGpuAddress() + ptrDiff(&hwTimeStamp->tag->ContextCompleteTS, hwTimeStamp->tag);
uint64_t TimeStampAddress = (uint64_t)((uintptr_t) & (hwTimeStamp->ContextCompleteTS));
igilQueue->m_controls.m_EventTimestampAddress = TimeStampAddress;
addProfilingEndCmds(TimeStampAddress);

View File

@ -451,7 +451,7 @@ void Event::submitCommand(bool abortTasks) {
if ((this->isProfilingEnabled()) && (this->cmdQueue != nullptr)) {
if (timeStampNode) {
this->cmdQueue->getCommandStreamReceiver().makeResident(*timeStampNode->getGraphicsAllocation());
cmdToProcess->timestamp = timeStampNode;
cmdToProcess->timestamp = timeStampNode->tag;
}
if (profilingCpuPath) {
setSubmitTimeStamp();

View File

@ -26,8 +26,6 @@ class Surface;
class PrintfHandler;
struct HwTimeStamps;
class TimestampPacketContainer;
template <class T>
struct TagNode;
enum MapOperationType {
MAP,
@ -44,7 +42,7 @@ class Command : public IFNode<Command> {
virtual LinearStream *getCommandStream() {
return nullptr;
}
TagNode<HwTimeStamps> *timestamp = nullptr;
HwTimeStamps *timestamp = nullptr;
CompletionStamp completionStamp = {};
};

View File

@ -7,7 +7,6 @@
#include "hw_cmds.h"
#include "runtime/helpers/options.h"
#include "runtime/utilities/tag_allocator.h"
#include "unit_tests/fixtures/device_host_queue_fixture.h"
#include "unit_tests/fixtures/execution_model_fixture.h"
#include "unit_tests/helpers/hw_parse.h"
@ -353,19 +352,18 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DeviceQueueSlb, AddEMCleanupSectionWithProfiling) {
MockParentKernel *mockParentKernel = MockParentKernel::create(*pContext);
uint32_t taskCount = 7;
auto hwTimeStamp = pCommandQueue->getCommandStreamReceiver().getEventTsAllocator()->getTag();
HwTimeStamps hwTimeStamp;
mockDeviceQueueHw->buildSlbDummyCommands();
mockDeviceQueueHw->addExecutionModelCleanUpSection(mockParentKernel, hwTimeStamp, taskCount);
mockDeviceQueueHw->addExecutionModelCleanUpSection(mockParentKernel, &hwTimeStamp, taskCount);
uint32_t eventTimestampAddrLow = static_cast<uint32_t>(igilCmdQueue->m_controls.m_EventTimestampAddress & 0xFFFFFFFF);
uint32_t eventTimestampAddrHigh = static_cast<uint32_t>((igilCmdQueue->m_controls.m_EventTimestampAddress & 0xFFFFFFFF00000000) >> 32);
uint32_t eventTimestampLow = (uint32_t)(igilCmdQueue->m_controls.m_EventTimestampAddress & 0xFFFFFFFF);
uint32_t eventTimestampHigh = (uint32_t)((igilCmdQueue->m_controls.m_EventTimestampAddress & 0xFFFFFFFF00000000) >> 32);
uint64_t contextCompleteAddr = hwTimeStamp->getGraphicsAllocation()->getGpuAddress() + ptrDiff(&hwTimeStamp->tag->ContextCompleteTS, hwTimeStamp->tag);
uint32_t contextCompleteAddrLow = static_cast<uint32_t>(contextCompleteAddr & 0xFFFFFFFF);
uint32_t contextCompleteAddrHigh = static_cast<uint32_t>((contextCompleteAddr & 0xFFFFFFFF00000000) >> 32);
uint32_t contextCompleteLow = (uint32_t)((uint64_t)((uintptr_t)(&hwTimeStamp.ContextCompleteTS)) & 0xFFFFFFFF);
uint32_t contextCompleteHigh = (uint32_t)(((uint64_t)((uintptr_t)(&hwTimeStamp.ContextCompleteTS)) & 0xFFFFFFFF00000000) >> 32);
EXPECT_EQ(contextCompleteAddrLow, eventTimestampAddrLow);
EXPECT_EQ(contextCompleteAddrHigh, eventTimestampAddrHigh);
EXPECT_EQ(contextCompleteLow, eventTimestampLow);
EXPECT_EQ(contextCompleteHigh, eventTimestampHigh);
HardwareParse hwParser;
auto *slbCS = mockDeviceQueueHw->getSlbCS();

View File

@ -8,7 +8,6 @@
#include "runtime/command_queue/gpgpu_walker.h"
#include "runtime/command_queue/hardware_interface.h"
#include "runtime/event/hw_timestamps.h"
#include "runtime/utilities/tag_allocator.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/task_information.h"
#include "unit_tests/mocks/mock_command_queue.h"
@ -57,9 +56,9 @@ class MockDeviceQueueHwWithCriticalSectionRelease : public DeviceQueueHw<GfxFami
indirectStateSetup = true;
return BaseClass::setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentIDCount);
}
void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint32_t taskCount) override {
void addExecutionModelCleanUpSection(Kernel *parentKernel, HwTimeStamps *hwTimeStamp, uint32_t taskCount) override {
cleanupSectionAdded = true;
timestampAddedInCleanupSection = hwTimeStamp ? hwTimeStamp->tag : nullptr;
timestampAddedInCleanupSection = hwTimeStamp;
return BaseClass::addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, taskCount);
}
void dispatchScheduler(CommandQueue &cmdQ, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) override {
@ -250,12 +249,13 @@ HWTEST_F(ParentKernelCommandQueueFixture, givenBlockedParentKernelWithProfilingW
std::vector<Surface *> surfaces;
auto *cmdComputeKernel = new CommandComputeKernel(*pCmdQ, std::unique_ptr<KernelOperation>(blockedCommandData), surfaces, false, false, false, nullptr, preemptionMode, parentKernel, 1);
auto timestamp = pCmdQ->getCommandStreamReceiver().getEventTsAllocator()->getTag();
cmdComputeKernel->timestamp = timestamp;
HwTimeStamps timestamp;
cmdComputeKernel->timestamp = &timestamp;
cmdComputeKernel->submit(0, false);
EXPECT_TRUE(mockDevQueue.cleanupSectionAdded);
EXPECT_EQ(mockDevQueue.timestampAddedInCleanupSection, timestamp->tag);
EXPECT_EQ(mockDevQueue.timestampAddedInCleanupSection, &timestamp);
delete cmdComputeKernel;
delete parentKernel;