Use MI_SEMAPHORE_WAIT command for event synchronization

Related-To: NEO-5508
Signed-off-by: Krzysztof Gibala <krzysztof.gibala@intel.com>
This commit is contained in:
Krzysztof Gibala
2021-03-11 13:48:04 +00:00
committed by Compute-Runtime-Automation
parent 2b956651a7
commit b01b8ba5ac
17 changed files with 395 additions and 165 deletions

View File

@ -585,46 +585,6 @@ bool CommandQueue::validateCapabilityForOperation(cl_command_queue_capabilities_
return operationValid && waitListValid && outEventValid;
}
// Handles wait-list events that were produced on a root device other than this queue's one.
//
// For every other root device index known to the context, the highest task count among the
// wait-list events whose queue lives on that device is waited for on the command stream
// receiver of the last such queue seen. When at least one such event exists,
// isEventWaitListFromPreviousRootDevice is set to true and waitListCurrentRootDeviceIndex is
// filled with the remaining events: those without a command queue and those whose queue is on
// this queue's root device. Otherwise the flag is false and the output list is left untouched.
void CommandQueue::waitForEventsFromDifferentRootDeviceIndex(cl_uint numEventsInWaitList, const cl_event *eventWaitList,
                                                             StackVec<cl_event, 8> &waitListCurrentRootDeviceIndex, bool &isEventWaitListFromPreviousRootDevice) {
    isEventWaitListFromPreviousRootDevice = false;

    for (auto &otherRootDeviceIndex : context->getRootDeviceIndices()) {
        // Events from this queue's own root device are not waited for here.
        if (this->getDevice().getRootDeviceIndex() == otherRootDeviceIndex) {
            continue;
        }

        CommandQueue *foreignQueue = nullptr;
        auto highestForeignTaskCount = 0u;

        for (auto idx = 0u; idx < numEventsInWaitList; ++idx) {
            auto waitEvent = castToObject<Event>(eventWaitList[idx]);
            auto producerQueue = waitEvent->getCommandQueue();
            if (!producerQueue || producerQueue->getDevice().getRootDeviceIndex() != otherRootDeviceIndex) {
                continue;
            }
            highestForeignTaskCount = std::max(highestForeignTaskCount, waitEvent->peekTaskCount());
            foreignQueue = producerQueue;
            isEventWaitListFromPreviousRootDevice = true;
        }

        // A zero task count means there is nothing to wait for on that device.
        if (highestForeignTaskCount != 0u) {
            foreignQueue->getCommandStreamReceiver(false).waitForCompletionWithTimeout(false, 0, highestForeignTaskCount);
        }
    }

    if (!isEventWaitListFromPreviousRootDevice) {
        return;
    }

    // Build the reduced wait list: queue-less events plus events from this root device.
    for (auto idx = 0u; idx < numEventsInWaitList; ++idx) {
        auto waitEvent = castToObject<Event>(eventWaitList[idx]);
        auto producerQueue = waitEvent->getCommandQueue();
        const bool keepEvent = !producerQueue ||
                               producerQueue->getDevice().getRootDeviceIndex() == this->getDevice().getRootDeviceIndex();
        if (keepEvent) {
            waitListCurrentRootDeviceIndex.push_back(static_cast<cl_event>(eventWaitList[idx]));
        }
    }
}
cl_uint CommandQueue::getQueueFamilyIndex() const {
if (isQueueFamilySelected()) {
return queueFamilyIndex;

View File

@ -303,8 +303,6 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
bool validateCapability(cl_command_queue_capabilities_intel capability) const;
bool validateCapabilitiesForEventWaitList(cl_uint numEventsInWaitList, const cl_event *waitList) const;
bool validateCapabilityForOperation(cl_command_queue_capabilities_intel capability, cl_uint numEventsInWaitList, const cl_event *waitList, const cl_event *outEvent) const;
void waitForEventsFromDifferentRootDeviceIndex(cl_uint numEventsInWaitList, const cl_event *eventWaitList,
StackVec<cl_event, 8> &waitListCurrentRootDeviceIndex, bool &isEventWaitListFromPreviousRootDevice);
cl_uint getQueueFamilyIndex() const;
cl_uint getQueueIndexWithinFamily() const { return queueIndexWithinFamily; }
bool isQueueFamilySelected() const { return queueFamilySelected; }

View File

@ -159,16 +159,6 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
return;
}
StackVec<cl_event, 8> waitListCurrentRootDeviceIndex;
bool isEventWaitListFromPreviousRootDevice = false;
if (context->getRootDeviceIndices().size() > 1u) {
waitForEventsFromDifferentRootDeviceIndex(numEventsInWaitList, eventWaitList, waitListCurrentRootDeviceIndex, isEventWaitListFromPreviousRootDevice);
}
const cl_event *eventWaitListCurrentRootDevice = isEventWaitListFromPreviousRootDevice ? waitListCurrentRootDeviceIndex.data() : eventWaitList;
cl_uint numEventsInWaitListCurrentRootDevice = isEventWaitListFromPreviousRootDevice ? static_cast<cl_uint>(waitListCurrentRootDeviceIndex.size()) : numEventsInWaitList;
Kernel *parentKernel = multiDispatchInfo.peekParentKernel();
auto devQueue = this->getContext().getDefaultDeviceQueue();
DeviceQueueHw<GfxFamily> *devQueueHw = castToObject<DeviceQueueHw<GfxFamily>>(devQueue);
@ -187,7 +177,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
auto blockQueue = false;
auto taskLevel = 0u;
obtainTaskLevelAndBlockedStatus(taskLevel, numEventsInWaitListCurrentRootDevice, eventWaitListCurrentRootDevice, blockQueue, commandType);
obtainTaskLevelAndBlockedStatus(taskLevel, numEventsInWaitList, eventWaitList, blockQueue, commandType);
if (parentKernel && !blockQueue) {
while (!devQueueHw->isEMCriticalSectionFree())
@ -203,14 +193,16 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
}
TimestampPacketDependencies timestampPacketDependencies;
EventsRequest eventsRequest(numEventsInWaitListCurrentRootDevice, eventWaitListCurrentRootDevice, event);
EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
CsrDependencies csrDeps;
BlitPropertiesContainer blitPropertiesContainer;
eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, getGpgpuCommandStreamReceiver());
bool enqueueWithBlitAuxTranslation = isBlitAuxTranslationRequired(multiDispatchInfo);
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
eventsRequest.fillCsrDependencies(csrDeps, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
auto allocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
size_t nodesCount = 0u;
@ -227,7 +219,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
if (nodesCount > 0) {
obtainNewTimestampPacketNodes(nodesCount, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, false);
csrDeps.push_back(&timestampPacketDependencies.previousEnqueueNodes);
csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
}
}
@ -235,6 +227,8 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
blockedCommandsData, surfacesForResidency, numSurfaceForResidency);
auto commandStreamStart = commandStream.getUsed();
TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStream, csrDeps);
if (enqueueWithBlitAuxTranslation) {
processDispatchForBlitAuxTranslation(multiDispatchInfo, blitPropertiesContainer, timestampPacketDependencies,
eventsRequest, blockQueue);
@ -269,7 +263,10 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
}
}
if (flushDependenciesForNonKernelCommand) {
TimestampPacketHelper::programCsrDependencies<GfxFamily>(commandStream, csrDeps, getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices());
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(
commandStream,
csrDeps,
getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices());
}
}
@ -325,10 +322,10 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
auto maxTaskCountCurrentRootDevice = this->taskCount;
for (auto eventId = 0u; eventId < numEventsInWaitListCurrentRootDevice; eventId++) {
auto event = castToObject<Event>(eventWaitListCurrentRootDevice[eventId]);
for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) {
auto event = castToObject<Event>(eventWaitList[eventId]);
if (!event->isUserEvent() && !event->isExternallySynchronized()) {
if (event->getCommandQueue() && event->getCommandQueue()->getDevice().getRootDeviceIndex() == this->getDevice().getRootDeviceIndex()) {
maxTaskCountCurrentRootDevice = std::max(maxTaskCountCurrentRootDevice, event->peekTaskCount());
}
}
@ -467,12 +464,12 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(const Mu
auto blitProperties = ClBlitProperties::constructProperties(blitDirection, *blitCommandStreamReceiver,
multiDispatchInfo.peekBuiltinOpParams());
if (!queueBlocked) {
eventsRequest.fillCsrDependencies(blitProperties.csrDependencies, *blitCommandStreamReceiver,
CsrDependencies::DependenciesType::All);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(blitProperties.csrDependencies, *blitCommandStreamReceiver,
CsrDependencies::DependenciesType::All);
blitProperties.csrDependencies.push_back(&timestampPacketDependencies.cacheFlushNodes);
blitProperties.csrDependencies.push_back(&timestampPacketDependencies.previousEnqueueNodes);
blitProperties.csrDependencies.push_back(&timestampPacketDependencies.barrierNodes);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.cacheFlushNodes);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.barrierNodes);
}
auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
@ -537,7 +534,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForBlitAuxTranslation(const Multi
if (!queueBlocked) {
CsrDependencies csrDeps;
eventsRequest.fillCsrDependencies(csrDeps, *getBcsCommandStreamReceiver(), CsrDependencies::DependenciesType::All);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, *getBcsCommandStreamReceiver(), CsrDependencies::DependenciesType::All);
BlitProperties::setupDependenciesForAuxTranslation(blitPropertiesContainer, timestampPacketDependencies,
*this->timestampPacketContainer, csrDeps,
getGpgpuCommandStreamReceiver(), *getBcsCommandStreamReceiver());
@ -550,7 +547,10 @@ void CommandQueueHw<GfxFamily>::processDispatchForCacheFlush(Surface **surfaces,
LinearStream *commandStream,
CsrDependencies &csrDeps) {
TimestampPacketHelper::programCsrDependencies<GfxFamily>(*commandStream, csrDeps, getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices());
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(
*commandStream,
csrDeps,
getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices());
uint64_t postSyncAddress = 0;
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
@ -813,7 +813,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = specialPipelineSelectMode;
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
}
@ -1027,7 +1027,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
1u); //numDevicesInContext
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
}
@ -1106,7 +1106,7 @@ void CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispat
BlitPropertiesContainer blitPropertiesContainer;
CsrDependencies csrDeps;
eventsRequest.fillCsrDependencies(csrDeps, *getBcsCommandStreamReceiver(), CsrDependencies::DependenciesType::All);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, *getBcsCommandStreamReceiver(), CsrDependencies::DependenciesType::All);
auto allocator = getBcsCommandStreamReceiver()->getTimestampPacketAllocator();
if (isCacheFlushForBcsRequired() && isGpgpuSubmissionForBcsRequired(blockQueue)) {
@ -1118,7 +1118,7 @@ void CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispat
}
obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, true);
csrDeps.push_back(&timestampPacketDependencies.previousEnqueueNodes);
csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
LinearStream *gpgpuCommandStream = {};
size_t gpgpuCommandStreamStart = {};

View File

@ -219,6 +219,9 @@ size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, c
if (DebugManager.flags.GpuScratchRegWriteAfterWalker.get() != -1) {
expectedSizeCS += sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM);
}
expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<GfxFamily>(csrDeps);
return expectedSizeCS;
}

View File

@ -107,7 +107,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
}
auto numSupportedDevices = commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices();
TimestampPacketHelper::programCsrDependencies<GfxFamily>(*commandStream, csrDependencies, numSupportedDevices);
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(*commandStream, csrDependencies, numSupportedDevices);
dsh->align(EncodeStates<GfxFamily>::alignInterfaceDescriptorData);

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2020 Intel Corporation
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -17,7 +17,7 @@
namespace NEO {
void EventsRequest::fillCsrDependencies(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const {
void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const {
for (cl_uint i = 0; i < this->numEventsInWaitList; i++) {
auto event = castToObjectOrAbort<Event>(this->eventWaitList[i]);
if (event->isUserEvent()) {
@ -35,7 +35,26 @@ void EventsRequest::fillCsrDependencies(CsrDependencies &csrDeps, CommandStreamR
(CsrDependencies::DependenciesType::All == depsType);
if (pushDependency) {
csrDeps.push_back(timestampPacketContainer);
csrDeps.timestampPacketContainer.push_back(timestampPacketContainer);
}
}
}
// Records task-count dependencies for wait-list events produced on a different root device
// than currentCsr.
//
// User events and events without a command queue are skipped. For every remaining event whose
// queue is on another root device, the event's task count and the tag address of the producing
// queue's command stream receiver are appended to csrDeps.taskCountContainer, and the graphics
// allocation of the producer's tags multi-allocation for currentCsr's root device index is
// appended to currentCsr's residency allocations.
void EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const {
    for (cl_uint eventIdx = 0; eventIdx < this->numEventsInWaitList; ++eventIdx) {
        auto waitEvent = castToObjectOrAbort<Event>(this->eventWaitList[eventIdx]);
        if (waitEvent->isUserEvent()) {
            continue;
        }

        auto producerQueue = waitEvent->getCommandQueue();
        if (!producerQueue || producerQueue->getDevice().getRootDeviceIndex() == currentCsr.getRootDeviceIndex()) {
            continue;
        }

        auto &producerCsr = producerQueue->getCommandStreamReceiver(false);

        const auto awaitedTaskCount = waitEvent->peekTaskCount();
        const auto producerTagAddress = reinterpret_cast<uint64_t>(producerCsr.getTagAddress());
        csrDeps.taskCountContainer.push_back({awaitedTaskCount, producerTagAddress});

        auto tagAllocation = producerCsr.getTagsMultiAllocation()->getGraphicsAllocation(currentCsr.getRootDeviceIndex());
        currentCsr.getResidencyAllocations().push_back(tagAllocation);
    }
}
@ -43,7 +62,6 @@ void EventsRequest::fillCsrDependencies(CsrDependencies &csrDeps, CommandStreamR
TransferProperties::TransferProperties(MemObj *memObj, cl_command_type cmdType, cl_map_flags mapFlags, bool blocking,
size_t *offsetPtr, size_t *sizePtr, void *ptr, bool doTransferOnCpu, uint32_t rootDeviceIndex)
: memObj(memObj), ptr(ptr), cmdType(cmdType), mapFlags(mapFlags), blocking(blocking), doTransferOnCpu(doTransferOnCpu) {
// no size or offset passed for unmap operation
if (cmdType != CL_COMMAND_UNMAP_MEM_OBJECT) {
if (memObj->peekClMemObjType() == CL_MEM_OBJECT_BUFFER) {

View File

@ -24,7 +24,8 @@ struct EventsRequest {
EventsRequest(cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *outEvent)
: numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), outEvent(outEvent) {}
void fillCsrDependencies(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const;
void fillCsrDependenciesForTimestampPacketContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr, CsrDependencies::DependenciesType depsType) const;
void fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const;
cl_uint numEventsInWaitList;
const cl_event *eventWaitList;

View File

@ -205,7 +205,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
if (kernelOperation->blitPropertiesContainer.size() > 0) {
auto &bcsCsr = *commandQueue.getBcsCommandStreamReceiver();
CsrDependencies csrDeps;
eventsRequest.fillCsrDependencies(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, bcsCsr, CsrDependencies::DependenciesType::All);
BlitProperties::setupDependenciesForAuxTranslation(kernelOperation->blitPropertiesContainer, *timestampPacketDependencies,
*currentTimestampPacketNodes, csrDeps,
@ -246,7 +246,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
kernel->areMultipleSubDevicesInContext()); //areMultipleSubDevicesInContext
if (timestampPacketDependencies) {
eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies->barrierNodes;
}
dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = kernel->requiresSpecialPipelineSelectMode();
@ -303,10 +303,10 @@ void CommandWithoutKernel::dispatchBlitOperation() {
UNRECOVERABLE_IF(kernelOperation->blitPropertiesContainer.size() != 1);
auto &blitProperties = *kernelOperation->blitPropertiesContainer.begin();
eventsRequest.fillCsrDependencies(blitProperties.csrDependencies, *bcsCsr, CsrDependencies::DependenciesType::All);
blitProperties.csrDependencies.push_back(&timestampPacketDependencies->cacheFlushNodes);
blitProperties.csrDependencies.push_back(&timestampPacketDependencies->previousEnqueueNodes);
blitProperties.csrDependencies.push_back(&timestampPacketDependencies->barrierNodes);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(blitProperties.csrDependencies, *bcsCsr, CsrDependencies::DependenciesType::All);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->cacheFlushNodes);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->previousEnqueueNodes);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->barrierNodes);
blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0];
auto bcsTaskCount = bcsCsr->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled());
@ -372,7 +372,7 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
UNRECOVERABLE_IF(!kernelOperation->blitEnqueue && !commandStreamReceiver.peekTimestampPacketWriteEnabled());
eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
makeTimestampPacketsResident(commandStreamReceiver);
gtpinNotifyPreFlushTask(&commandQueue);

View File

@ -29,6 +29,8 @@
#include "opencl/test/unit_test/mocks/mock_platform.h"
#include "opencl/test/unit_test/mocks/mock_program.h"
#include "opencl/test/unit_test/mocks/mock_submissions_aggregator.h"
#include "opencl/test/unit_test/mocks/mock_svm_manager.h"
#include "opencl/test/unit_test/test_macros/test_checks_ocl.h"
#include "test.h"
using namespace NEO;
@ -1944,7 +1946,11 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenGpuIsIdleWhenCsrIsEnabledToFl
*commandStreamReceiver.getTagAddress() = 2u;
}
TEST(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyArePassedToMarkerThenCsrsAreWaitingForEventsFromPreviousDevices) {
using MultiRootDeviceCommandStreamReceiverTests = CommandStreamReceiverFlushTaskTests;
HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyArePassedToEnqueueWithoutSubmissionThenCsIsWaitingForEventsFromPreviousDevices) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
auto deviceFactory = std::make_unique<UltClDeviceFactory>(4, 0);
auto device1 = deviceFactory->rootDevices[1];
auto device2 = deviceFactory->rootDevices[2];
@ -1968,11 +1974,10 @@ TEST(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDev
Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6);
Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
Event event6(pCmdQ3, CL_COMMAND_NDRANGE_KERNEL, 7, 21);
Event event7(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
Event event5(pCmdQ3, CL_COMMAND_NDRANGE_KERNEL, 7, 21);
Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
UserEvent userEvent1(&pCmdQ1->getContext());
UserEvent userEvent2(&pCmdQ2->getContext());
@ -1987,42 +1992,34 @@ TEST(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDev
&event4,
&event5,
&event6,
&event7,
&userEvent1,
&userEvent2,
};
cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]);
{
cl_event eventWaitList[] =
{
&event1,
&event3,
&event4,
};
cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]);
pCmdQ1->enqueueMarkerWithWaitList(
numEventsInWaitList,
eventWaitList,
nullptr);
EXPECT_EQ(0u, mockCsr1->waitForCompletionWithTimeoutCalled);
EXPECT_EQ(0u, mockCsr2->waitForCompletionWithTimeoutCalled);
EXPECT_EQ(0u, mockCsr3->waitForCompletionWithTimeoutCalled);
}
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
{
pCmdQ1->enqueueMarkerWithWaitList(
numEventsInWaitList,
eventWaitList,
nullptr);
EXPECT_EQ(3u, semaphores.size());
EXPECT_EQ(0u, mockCsr1->waitForCompletionWithTimeoutCalled);
EXPECT_EQ(1u, mockCsr2->waitForCompletionWithTimeoutCalled);
EXPECT_EQ(1u, mockCsr3->waitForCompletionWithTimeoutCalled);
auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
EXPECT_EQ(21u, semaphoreCmd1->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ3->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
auto semaphoreCmd2 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[2]));
EXPECT_EQ(7u, semaphoreCmd2->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress());
}
{
@ -2031,20 +2028,250 @@ TEST(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDev
eventWaitList,
nullptr);
EXPECT_EQ(1u, mockCsr1->waitForCompletionWithTimeoutCalled);
EXPECT_EQ(1u, mockCsr2->waitForCompletionWithTimeoutCalled);
EXPECT_EQ(2u, mockCsr3->waitForCompletionWithTimeoutCalled);
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ2->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(3u, semaphores.size());
auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
auto semaphoreCmd2 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[2]));
EXPECT_EQ(21u, semaphoreCmd2->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ3->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd2->getSemaphoreGraphicsAddress());
}
{
cl_event eventWaitList[] =
{
&event1,
&event2,
&event5,
&userEvent1,
};
cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]);
pCmdQ3->enqueueMarkerWithWaitList(
numEventsInWaitList,
eventWaitList,
nullptr);
EXPECT_EQ(2u, mockCsr1->waitForCompletionWithTimeoutCalled);
EXPECT_EQ(2u, mockCsr2->waitForCompletionWithTimeoutCalled);
EXPECT_EQ(2u, mockCsr3->waitForCompletionWithTimeoutCalled);
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ3->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(1u, semaphores.size());
auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
}
}
using MultiRootDeviceCommandStreamReceiverBufferTests = MultiRootDeviceFixture;
// Enqueues the same kernel on two queues (special queues 1 and 2 of the fixture context) with a
// wait list that mixes events created on both queues, one queue-less event and two completed
// user events. After each enqueue the queue's command stream is parsed and the test checks
// which MI_SEMAPHORE_WAIT commands were programmed: their count, data dword and address.
HWTEST_F(MultiRootDeviceCommandStreamReceiverBufferTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyArePassedToEnqueueWithSubmissionThenCsIsWaitingForEventsFromPreviousDevices) {
// NOTE(review): device1/device2/context are not declared here - presumably fixture members; confirm.
REQUIRE_SVM_OR_SKIP(device1);
REQUIRE_SVM_OR_SKIP(device2);
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
cl_int retVal = 0;
size_t offset = 0;
size_t size = 1;
auto pCmdQ1 = context.get()->getSpecialQueue(1u);
auto pCmdQ2 = context.get()->getSpecialQueue(2u);
std::unique_ptr<MockProgram> program(Program::createBuiltInFromSource<MockProgram>("FillBufferBytes", context.get(), context.get()->getDevices(), &retVal));
program->build(program->getDevices(), nullptr, false);
std::unique_ptr<MockKernel> kernel(Kernel::create<MockKernel>(program.get(), program->getKernelInfoForKernel("FillBufferBytes"), *context.get()->getDevice(0), &retVal));
// Host memory backing the mock SVM allocation; released with alignedFree at the end of the test.
size_t svmSize = 4096;
void *svmPtr = alignedMalloc(svmSize, MemoryConstants::pageSize);
MockGraphicsAllocation svmAlloc(svmPtr, svmSize);
// Judging by the expectations below, the last constructor argument is the value that ends up
// as the semaphore data dword (presumably the event's task count - confirm against Event).
Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
Event event4(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
UserEvent userEvent1(&pCmdQ1->getContext());
UserEvent userEvent2(&pCmdQ2->getContext());
// User events are completed up front.
userEvent1.setStatus(CL_COMPLETE);
userEvent2.setStatus(CL_COMPLETE);
// The same mixed wait list is passed to both enqueues below.
cl_event eventWaitList[] =
{
&event1,
&event2,
&event3,
&event4,
&event5,
&userEvent1,
&userEvent2,
};
cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]);
{
// Enqueue on pCmdQ1: expects exactly two semaphores, one per event created on pCmdQ2
// (event4 -> 4, event5 -> 7), both targeting the tag address of pCmdQ2's CSR.
kernel->setSvmKernelExecInfo(&svmAlloc);
retVal = pCmdQ1->enqueueKernel(
kernel.get(),
1,
&offset,
&size,
&size,
numEventsInWaitList,
eventWaitList,
nullptr);
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
// NOTE(review): the size check is non-fatal (EXPECT), yet the elements are indexed right after.
EXPECT_EQ(2u, semaphores.size());
auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
EXPECT_EQ(4u, semaphoreCmd0->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
EXPECT_EQ(7u, semaphoreCmd1->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
}
{
// Enqueue on pCmdQ2: expects exactly two semaphores, one per event created on pCmdQ1
// (event1 -> 15, event3 -> 20), both targeting the tag address of pCmdQ1's CSR.
kernel->setSvmKernelExecInfo(&svmAlloc);
retVal = pCmdQ2->enqueueKernel(
kernel.get(),
1,
&offset,
&size,
&size,
numEventsInWaitList,
eventWaitList,
nullptr);
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ2->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(2u, semaphores.size());
auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
EXPECT_EQ(15u, semaphoreCmd0->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
auto semaphoreCmd1 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]));
EXPECT_EQ(20u, semaphoreCmd1->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getCommandStreamReceiver(false).getTagAddress()), semaphoreCmd1->getSemaphoreGraphicsAddress());
}
alignedFree(svmPtr);
}
// Enqueues markers with mixed wait lists on two queues of a two-root-device context and checks
// that the number of MI_SEMAPHORE_WAIT commands found in the queue's command stream matches the
// size reported by TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer for the
// same wait list (count * sizeof(MI_SEMAPHORE_WAIT)).
HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyArePassedToMarkerThenMiSemaphoreWaitCommandSizeIsIncluded) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
// Root devices at index 1 and 2 of the factory are used.
auto deviceFactory = std::make_unique<UltClDeviceFactory>(3, 0);
auto device1 = deviceFactory->rootDevices[1];
auto device2 = deviceFactory->rootDevices[2];
// NOTE(review): the raw `new` results are handed to resetCommandStreamReceiver - presumably the
// device takes ownership; confirm.
auto mockCsr1 = new MockCommandStreamReceiver(*device1->executionEnvironment, device1->getRootDeviceIndex(), device1->getDeviceBitfield());
auto mockCsr2 = new MockCommandStreamReceiver(*device2->executionEnvironment, device2->getRootDeviceIndex(), device2->getDeviceBitfield());
device1->resetCommandStreamReceiver(mockCsr1);
device2->resetCommandStreamReceiver(mockCsr2);
cl_device_id devices[] = {device1, device2};
auto context = std::make_unique<MockContext>(ClDeviceVector(devices, 2), false);
auto pCmdQ1 = context.get()->getSpecialQueue(1u);
auto pCmdQ2 = context.get()->getSpecialQueue(2u);
// The kernel / dispatch info objects below are set up but not referenced again in this test.
MockKernelWithInternals mockKernel(ClDeviceVector(devices, 2));
DispatchInfo dispatchInfo;
MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel);
dispatchInfo.setKernel(mockKernel.mockKernel);
multiDispatchInfo.push(dispatchInfo);
// event1, event3 and event4 are created on pCmdQ1; event5 and event6 on pCmdQ2; event2 has no queue.
Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6);
Event event4(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 4, 20);
Event event5(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 3, 4);
Event event6(pCmdQ2, CL_COMMAND_NDRANGE_KERNEL, 2, 7);
UserEvent userEvent1(&pCmdQ1->getContext());
UserEvent userEvent2(&pCmdQ2->getContext());
userEvent1.setStatus(CL_COMPLETE);
userEvent2.setStatus(CL_COMPLETE);
{
// Marker on pCmdQ1 with a wait list holding only pCmdQ1 events, the queue-less event and user
// events: no semaphore is expected and the reported required size is zero.
cl_event eventWaitList[] =
{
&event1,
&event2,
&event3,
&event4,
&userEvent1,
&userEvent2,
};
cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]);
pCmdQ1->enqueueMarkerWithWaitList(
numEventsInWaitList,
eventWaitList,
nullptr);
// Rebuild the dependencies for the same wait list to query the size estimate directly.
EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr);
CsrDependencies csrDeps;
eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ1->getCommandStreamReceiver(false));
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(0u, semaphores.size());
EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<FamilyType>(csrDeps));
}
{
// Marker on pCmdQ2 with a wait list that additionally contains three pCmdQ1 events
// (event1, event3, event4): three semaphores and three times the command size are expected.
cl_event eventWaitList[] =
{
&event1,
&event2,
&event3,
&event4,
&event5,
&event6,
&userEvent1,
};
cl_uint numEventsInWaitList = sizeof(eventWaitList) / sizeof(eventWaitList[0]);
pCmdQ2->enqueueMarkerWithWaitList(
numEventsInWaitList,
eventWaitList,
nullptr);
EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, nullptr);
CsrDependencies csrDeps;
eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ2->getCommandStreamReceiver(false));
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ2->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(3u, semaphores.size());
EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<FamilyType>(csrDeps));
}
}

View File

@ -804,8 +804,8 @@ HWTEST_F(BcsTests, givenBltSizeAndCsrDependenciesWhenEstimatingCommandSizeThenAd
MockTimestampPacketContainer timestamp0(*csr.getTimestampPacketAllocator(), numberNodesPerContainer);
MockTimestampPacketContainer timestamp1(*csr.getTimestampPacketAllocator(), numberNodesPerContainer);
csrDependencies.push_back(&timestamp0);
csrDependencies.push_back(&timestamp1);
csrDependencies.timestampPacketContainer.push_back(&timestamp0);
csrDependencies.timestampPacketContainer.push_back(&timestamp1);
size_t cmdsSizePerBlit = sizeof(typename FamilyType::XY_COPY_BLT) + sizeof(typename FamilyType::MI_ARB_CHECK);

View File

@ -220,8 +220,8 @@ HWTEST_F(BcsTests, givenCsrDependenciesWhenProgrammingCommandStreamThenAddSemaph
MockTimestampPacketContainer timestamp0(*csr.getTimestampPacketAllocator(), numberNodesPerContainer);
MockTimestampPacketContainer timestamp1(*csr.getTimestampPacketAllocator(), numberNodesPerContainer);
blitProperties.csrDependencies.push_back(&timestamp0);
blitProperties.csrDependencies.push_back(&timestamp1);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestamp0);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestamp1);
blitBuffer(&csr, blitProperties, true);
@ -278,8 +278,8 @@ HWTEST_F(BcsTests, givenMultipleBlitPropertiesWhenDispatchingThenProgramCommands
MockTimestampPacketContainer timestamp1(*csr.getTimestampPacketAllocator(), 1);
MockTimestampPacketContainer timestamp2(*csr.getTimestampPacketAllocator(), 1);
blitProperties1.csrDependencies.push_back(&timestamp1);
blitProperties2.csrDependencies.push_back(&timestamp2);
blitProperties1.csrDependencies.timestampPacketContainer.push_back(&timestamp1);
blitProperties2.csrDependencies.timestampPacketContainer.push_back(&timestamp2);
BlitPropertiesContainer blitPropertiesContainer;
blitPropertiesContainer.push_back(blitProperties1);
@ -1248,8 +1248,8 @@ HWTEST_F(BcsTests, givenBlitterDirectSubmissionEnabledWhenProgrammingBlitterThen
MockTimestampPacketContainer timestamp0(*csr.getTimestampPacketAllocator(), numberNodesPerContainer);
MockTimestampPacketContainer timestamp1(*csr.getTimestampPacketAllocator(), numberNodesPerContainer);
blitProperties.csrDependencies.push_back(&timestamp0);
blitProperties.csrDependencies.push_back(&timestamp1);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestamp0);
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestamp1);
blitBuffer(&csr, blitProperties, true);
@ -1564,4 +1564,4 @@ TEST(BcsConstantsTests, givenBlitConstantsThenTheyHaveDesiredValues) {
EXPECT_EQ(BlitterConstants::maxBlitHeight, 0x4000u);
EXPECT_EQ(BlitterConstants::maxBlitSetWidth, 0x1FF80u);
EXPECT_EQ(BlitterConstants::maxBlitSetHeight, 0x1FFC0u);
}
}

View File

@ -449,14 +449,14 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledAndOoqWhenEstimat
EventsRequest eventsRequest(numEventsOnWaitlist, waitlist, nullptr);
CsrDependencies csrDeps;
eventsRequest.fillCsrDependencies(
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(
csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0);
auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize;
size_t sizeForNodeDependency = 0;
for (auto timestampPacketContainer : csrDeps) {
for (auto timestampPacketContainer : csrDeps.timestampPacketContainer) {
for (auto &node : timestampPacketContainer->peekNodes()) {
sizeForNodeDependency += TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependency<FamilyType>(*node);
}
@ -499,13 +499,13 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStr
EventsRequest eventsRequest(numEventsOnWaitlist, waitlist, nullptr);
CsrDependencies csrDeps;
eventsRequest.fillCsrDependencies(csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*mockCmdQHw, csrDeps, false, false, false, multiDispatchInfo, nullptr, 0);
auto sizeWithEnabled = mockCmdQHw->requestedCmdStreamSize;
size_t sizeForNodeDependency = 0;
for (auto timestampPacketContainer : csrDeps) {
for (auto timestampPacketContainer : csrDeps.timestampPacketContainer) {
for (auto &node : timestampPacketContainer->peekNodes()) {
sizeForNodeDependency += TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependency<FamilyType>(*node);
}
@ -531,8 +531,8 @@ HWTEST_F(TimestampPacketTests, givenEventsRequestWithEventsWithoutTimestampsWhen
EventsRequest eventsRequest(numEventsOnWaitlist, waitlist, nullptr);
CsrDependencies csrDepsEmpty;
eventsRequest.fillCsrDependencies(csrDepsEmpty, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
EXPECT_EQ(0u, csrDepsEmpty.size());
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDepsEmpty, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
EXPECT_EQ(0u, csrDepsEmpty.timestampPacketContainer.size());
device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
MockTimestampPacketContainer timestamp1(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 1);
@ -559,12 +559,12 @@ HWTEST_F(TimestampPacketTests, givenEventsRequestWithEventsWithoutTimestampsWhen
cl_event waitlist2[] = {&event1, &eventWithEmptyTimestampContainer2, &event3, &eventWithEmptyTimestampContainer4, &event5};
EventsRequest eventsRequest2(numEventsOnWaitlist, waitlist2, nullptr);
CsrDependencies csrDepsSize3;
eventsRequest2.fillCsrDependencies(csrDepsSize3, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
eventsRequest2.fillCsrDependenciesForTimestampPacketContainer(csrDepsSize3, device->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
EXPECT_EQ(3u, csrDepsSize3.size());
EXPECT_EQ(3u, csrDepsSize3.timestampPacketContainer.size());
size_t sizeForNodeDependency = 0;
for (auto timestampPacketContainer : csrDepsSize3) {
for (auto timestampPacketContainer : csrDepsSize3.timestampPacketContainer) {
for (auto &node : timestampPacketContainer->peekNodes()) {
sizeForNodeDependency += TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependency<FamilyType>(*node);
}
@ -794,11 +794,11 @@ HWTEST_F(TimestampPacketTests, givenEventsRequestWhenEstimatingStreamSizeForCsrT
auto sizeWithoutEvents = csr.getRequiredCmdStreamSize(flags, device->getDevice());
eventsRequest.fillCsrDependencies(flags.csrDependencies, csr, NEO::CsrDependencies::DependenciesType::OutOfCsr);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(flags.csrDependencies, csr, NEO::CsrDependencies::DependenciesType::OutOfCsr);
auto sizeWithEvents = csr.getRequiredCmdStreamSize(flags, device->getDevice());
size_t sizeForNodeDependency = 0;
for (auto timestampPacketContainer : flags.csrDependencies) {
for (auto timestampPacketContainer : flags.csrDependencies.timestampPacketContainer) {
for (auto &node : timestampPacketContainer->peekNodes()) {
sizeForNodeDependency += TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependency<FamilyType>(*node);
}
@ -842,11 +842,11 @@ HWTEST_F(TimestampPacketTests, givenEventsRequestWhenEstimatingStreamSizeForDiff
auto sizeWithoutEvents = csr.getRequiredCmdStreamSize(flags, device->getDevice());
eventsRequest.fillCsrDependencies(flags.csrDependencies, csr, NEO::CsrDependencies::DependenciesType::OutOfCsr);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(flags.csrDependencies, csr, NEO::CsrDependencies::DependenciesType::OutOfCsr);
auto sizeWithEvents = csr.getRequiredCmdStreamSize(flags, device->getDevice());
size_t sizeForNodeDependency = 0;
for (auto timestampPacketContainer : flags.csrDependencies) {
for (auto timestampPacketContainer : flags.csrDependencies.timestampPacketContainer) {
for (auto &node : timestampPacketContainer->peekNodes()) {
sizeForNodeDependency += TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependency<FamilyType>(*node);
}
@ -991,8 +991,8 @@ HWTEST_F(TimestampPacketTests, givenAllDependencyTypesModeWhenFillingFromDiffere
EventsRequest eventsRequest(eventsOnWaitlist, waitlist, nullptr);
CsrDependencies csrDependencies;
eventsRequest.fillCsrDependencies(csrDependencies, csr1, CsrDependencies::DependenciesType::All);
EXPECT_EQ(static_cast<size_t>(eventsOnWaitlist), csrDependencies.size());
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDependencies, csr1, CsrDependencies::DependenciesType::All);
EXPECT_EQ(static_cast<size_t>(eventsOnWaitlist), csrDependencies.timestampPacketContainer.size());
}
HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledOnDifferentCSRsFromOneDeviceWhenEnqueueingThenProgramSemaphoresOnCsrStream) {
@ -1177,7 +1177,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenDispatchingTh
EventsRequest eventsRequest(eventsOnWaitlist, waitlist, nullptr);
CsrDependencies csrDeps;
eventsRequest.fillCsrDependencies(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
HardwareInterface<FamilyType>::dispatchWalker(
*mockCmdQ,
@ -1260,7 +1260,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledOnDifferentCSRsFr
EventsRequest eventsRequest(eventsOnWaitlist, waitlist, nullptr);
CsrDependencies csrDeps;
eventsRequest.fillCsrDependencies(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(csrDeps, mockCmdQ->getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr);
HardwareInterface<FamilyType>::dispatchWalker(
*mockCmdQ,
@ -1769,12 +1769,12 @@ HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingMarkerWi
verifySemaphore(genCmdCast<MI_SEMAPHORE_WAIT *>(*(csrSemaphores[0])), node2.getNode(0), 0);
auto queueSemaphores = findAll<MI_SEMAPHORE_WAIT *>(hwParserCmdQ.cmdList.begin(), hwParserCmdQ.cmdList.end());
auto expectedQueueSemaphoresCount = 1u;
auto expectedQueueSemaphoresCount = 2u;
if (UnitTestHelper<FamilyType>::isAdditionalMiSemaphoreWaitRequired(device->getHardwareInfo())) {
expectedQueueSemaphoresCount += 2;
}
EXPECT_EQ(expectedQueueSemaphoresCount, queueSemaphores.size());
verifySemaphore(genCmdCast<MI_SEMAPHORE_WAIT *>(*(queueSemaphores[0])), node1.getNode(0), 0);
verifySemaphore(genCmdCast<MI_SEMAPHORE_WAIT *>(*(queueSemaphores[1])), node1.getNode(0), 0);
}
HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingBarrierWithoutKernelThenInheritTimestampPacketsAndProgramSemaphores) {
@ -1812,12 +1812,12 @@ HWTEST_F(TimestampPacketTests, givenWaitlistAndOutputEventWhenEnqueueingBarrierW
verifySemaphore(genCmdCast<MI_SEMAPHORE_WAIT *>(*(csrSemaphores[0])), node2.getNode(0), 0);
auto queueSemaphores = findAll<MI_SEMAPHORE_WAIT *>(hwParserCmdQ.cmdList.begin(), hwParserCmdQ.cmdList.end());
auto expectedQueueSemaphoresCount = 1u;
auto expectedQueueSemaphoresCount = 2u;
if (UnitTestHelper<FamilyType>::isAdditionalMiSemaphoreWaitRequired(device->getHardwareInfo())) {
expectedQueueSemaphoresCount += 2;
}
EXPECT_EQ(expectedQueueSemaphoresCount, queueSemaphores.size());
verifySemaphore(genCmdCast<MI_SEMAPHORE_WAIT *>(*(queueSemaphores[0])), node1.getNode(0), 0);
verifySemaphore(genCmdCast<MI_SEMAPHORE_WAIT *>(*(queueSemaphores[1])), node1.getNode(0), 0);
}
HWTEST_F(TimestampPacketTests, givenEmptyWaitlistAndNoOutputEventWhenEnqueueingMarkerThenDoNothing) {

View File

@ -327,7 +327,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
auto &commandStreamCSR = this->getCS(getRequiredCmdStreamSizeAligned(dispatchFlags, device));
auto commandStreamStartCSR = commandStreamCSR.getUsed();
TimestampPacketHelper::programCsrDependencies<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies, getOsContext().getNumSupportedDevices());
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies, getOsContext().getNumSupportedDevices());
if (stallingPipeControlOnNextFlushRequired) {
programStallingPipeControlForBarrier(commandStreamCSR, dispatchFlags);
@ -1016,7 +1016,7 @@ uint32_t CommandStreamReceiverHw<GfxFamily>::blitBuffer(const BlitPropertiesCont
programEnginePrologue(commandStream);
for (auto &blitProperties : blitPropertiesContainer) {
TimestampPacketHelper::programCsrDependencies<GfxFamily>(commandStream, blitProperties.csrDependencies, getOsContext().getNumSupportedDevices());
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, blitProperties.csrDependencies, getOsContext().getNumSupportedDevices());
if (blitProperties.outputTimestampPacket && profilingEnabled) {
BlitCommandsHelper<GfxFamily>::encodeProfilingStartMmios(commandStream, *blitProperties.outputTimestampPacket);

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -12,7 +12,7 @@
namespace NEO {
void CsrDependencies::makeResident(CommandStreamReceiver &commandStreamReceiver) const {
for (auto &timestampPacketContainer : *this) {
for (auto &timestampPacketContainer : timestampPacketContainer) {
timestampPacketContainer->makeResident(commandStreamReceiver);
}
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -13,7 +13,7 @@ namespace NEO {
class TimestampPacketContainer;
class CommandStreamReceiver;
class CsrDependencies : public StackVec<TimestampPacketContainer *, 32> {
class CsrDependencies {
public:
enum class DependenciesType {
OnCsr,
@ -21,6 +21,9 @@ class CsrDependencies : public StackVec<TimestampPacketContainer *, 32> {
All
};
StackVec<std::pair<uint32_t, uint64_t>, 32> taskCountContainer;
StackVec<TimestampPacketContainer *, 32> timestampPacketContainer;
void makeResident(CommandStreamReceiver &commandStreamReceiver) const;
};
} // namespace NEO

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -144,15 +144,15 @@ void BlitProperties::setupDependenciesForAuxTranslation(BlitPropertiesContainer
timestampPacketDependencies.barrierNodes.add(nodesAllocator->getTag());
// wait for barrier and events before AuxToNonAux
blitPropertiesContainer[0].csrDependencies.push_back(&timestampPacketDependencies.barrierNodes);
blitPropertiesContainer[0].csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.barrierNodes);
for (auto dep : depsFromEvents) {
blitPropertiesContainer[0].csrDependencies.push_back(dep);
for (auto dep : depsFromEvents.timestampPacketContainer) {
blitPropertiesContainer[0].csrDependencies.timestampPacketContainer.push_back(dep);
}
// wait for NDR before NonAuxToAux
blitPropertiesContainer[numObjects].csrDependencies.push_back(&timestampPacketDependencies.cacheFlushNodes);
blitPropertiesContainer[numObjects].csrDependencies.push_back(&kernelTimestamps);
blitPropertiesContainer[numObjects].csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies.cacheFlushNodes);
blitPropertiesContainer[numObjects].csrDependencies.timestampPacketContainer.push_back(&kernelTimestamps);
}
} // namespace NEO

View File

@ -6,8 +6,8 @@
*/
#pragma once
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/command_stream/csr_deps.h"
#include "shared/source/helpers/aux_translation.h"
#include "shared/source/helpers/hw_helper.h"
@ -183,14 +183,29 @@ struct TimestampPacketHelper {
}
template <typename GfxFamily>
static void programCsrDependencies(LinearStream &cmdStream, const CsrDependencies &csrDependencies, uint32_t numSupportedDevices) {
for (auto timestampPacketContainer : csrDependencies) {
static void programCsrDependenciesForTimestampPacketContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies, uint32_t numSupportedDevices) {
for (auto timestampPacketContainer : csrDependencies.timestampPacketContainer) {
for (auto &node : timestampPacketContainer->peekNodes()) {
TimestampPacketHelper::programSemaphoreWithImplicitDependency<GfxFamily>(cmdStream, *node, numSupportedDevices);
}
}
}
// Appends one MI_SEMAPHORE_WAIT command to cmdStream for every entry of
// csrDependencies.taskCountContainer.
//
// Each entry is a std::pair<uint32_t, uint64_t>; the first member is passed to
// the encoder as the compare data and the second member as the compare address,
// using the SAD_GREATER_THAN_OR_EQUAL_SDD compare operation.
//
// cmdStream       - stream that receives the encoded semaphore commands
// csrDependencies - dependencies whose taskCountContainer is walked; it is not modified
//
// NOTE(review): the doubled "ForFor" in the name looks like a typo; it is kept
// here because renaming would require updating every caller.
template <typename GfxFamily>
static void programCsrDependenciesForForTaskCountContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies) {
    using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;

    // Walk the container in place: taking a by-value copy of the whole StackVec
    // on every call is unnecessary because the entries are only read.
    for (const auto &[taskCountPreviousRootDevice, tagAddressPreviousRootDevice] : csrDependencies.taskCountContainer) {
        EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(cmdStream,
                                                              static_cast<uint64_t>(tagAddressPreviousRootDevice),
                                                              taskCountPreviousRootDevice,
                                                              COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
    }
}
template <typename GfxFamily, AuxTranslationDirection auxTranslationDirection>
static void programSemaphoreWithImplicitDependencyForAuxTranslation(LinearStream &cmdStream,
const TimestampPacketDependencies *timestampPacketDependencies,
@ -241,7 +256,7 @@ struct TimestampPacketHelper {
template <typename GfxFamily>
static size_t getRequiredCmdStreamSize(const CsrDependencies &csrDependencies) {
size_t totalCommandsSize = 0;
for (auto timestampPacketContainer : csrDependencies) {
for (auto timestampPacketContainer : csrDependencies.timestampPacketContainer) {
for (auto &node : timestampPacketContainer->peekNodes()) {
totalCommandsSize += getRequiredCmdStreamSizeForNodeDependency<GfxFamily>(*node);
}
@ -249,6 +264,11 @@ struct TimestampPacketHelper {
return totalCommandsSize;
}
// Returns sizeof(MI_SEMAPHORE_WAIT) for the given GfxFamily multiplied by the
// number of entries currently stored in csrDependencies.taskCountContainer.
template <typename GfxFamily>
static size_t getRequiredCmdStreamSizeForTaskCountContainer(const CsrDependencies &csrDependencies) {
    using SemaphoreCmd = typename GfxFamily::MI_SEMAPHORE_WAIT;

    const auto entryCount = csrDependencies.taskCountContainer.size();
    return entryCount * sizeof(SemaphoreCmd);
}
};
} // namespace NEO