Correct semaphore wait programming for cross device dependencies

When an event is blocked by a blocked user event, program the semaphore wait
when the user event is unblocked.

Related-To: NEO-3691

Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2021-06-14 15:33:53 +00:00
committed by Compute-Runtime-Automation
parent 7a2d3d6369
commit e027178c37
9 changed files with 369 additions and 56 deletions

View File

@ -84,7 +84,7 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr
}
if (bcsAllowed) {
auto &selectorCopyEngine = device->getDeviceById(0)->getSelectorCopyEngine();
bcsEngine = &device->getDeviceById(0)->getEngine(EngineHelpers::getBcsEngineType(hwInfo, selectorCopyEngine), EngineUsage::Regular);
bcsEngine = device->getDeviceById(0)->getDevice().tryGetEngine(EngineHelpers::getBcsEngineType(hwInfo, selectorCopyEngine), EngineUsage::Regular);
}
}
@ -783,13 +783,18 @@ bool CommandQueue::isBlockedCommandStreamRequired(uint32_t commandType, const Ev
return true;
}
if ((CL_COMMAND_BARRIER == commandType || CL_COMMAND_MARKER == commandType) &&
getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
if (CL_COMMAND_BARRIER == commandType || CL_COMMAND_MARKER == commandType) {
auto timestampPacketWriteEnabled = getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled();
if (timestampPacketWriteEnabled || context->getRootDeviceIndices().size() > 1) {
for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
auto waitlistEvent = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
if (waitlistEvent->getTimestampPacketNodes()) {
return true;
for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) {
auto waitlistEvent = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
if (timestampPacketWriteEnabled && waitlistEvent->getTimestampPacketNodes()) {
return true;
}
if (waitlistEvent->getCommandQueue() && waitlistEvent->getCommandQueue()->getDevice().getRootDeviceIndex() != this->getDevice().getRootDeviceIndex()) {
return true;
}
}
}
}

View File

@ -982,13 +982,12 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
(uint32_t)multiDispatchInfo.size());
}
if (storeTimestampPackets) {
for (cl_uint i = 0; i < eventsRequest.numEventsInWaitList; i++) {
auto event = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
event->incRefInternal();
}
command->setTimestampPacketNode(*timestampPacketContainer, std::move(timestampPacketDependencies));
command->setEventsRequest(eventsRequest);
} else if (this->context->getRootDeviceIndices().size() > 1) {
command->setEventsRequest(eventsRequest);
}
outEvent->setCommand(std::move(command));
eventBuilder->addParentEvents(ArrayRef<const cl_event>(eventsRequest.eventWaitList, eventsRequest.numEventsInWaitList));

View File

@ -48,7 +48,7 @@ void EventsRequest::fillCsrDependenciesForTimestampPacketContainer(CsrDependenci
void EventsRequest::fillCsrDependenciesForTaskCountContainer(CsrDependencies &csrDeps, CommandStreamReceiver &currentCsr) const {
for (cl_uint i = 0; i < this->numEventsInWaitList; i++) {
auto event = castToObjectOrAbort<Event>(this->eventWaitList[i]);
if (event->isUserEvent()) {
if (event->isUserEvent() || CompletionStamp::notReady == event->peekTaskCount()) {
continue;
}

View File

@ -245,6 +245,10 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, //useGlobalAtomics
kernel->areMultipleSubDevicesInContext()); //areMultipleSubDevicesInContext
if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
}
if (timestampPacketDependencies) {
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies->barrierNodes;
@ -309,6 +313,10 @@ void CommandWithoutKernel::dispatchBlitOperation() {
blitProperties.csrDependencies.timestampPacketContainer.push_back(&timestampPacketDependencies->barrierNodes);
blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0];
if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
eventsRequest.fillCsrDependenciesForTaskCountContainer(blitProperties.csrDependencies, *bcsCsr);
}
auto bcsTaskCount = bcsCsr->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled());
commandQueue.updateBcsTaskCount(bcsTaskCount);
@ -329,6 +337,7 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
return completionStamp;
}
auto barrierNodes = timestampPacketDependencies ? &timestampPacketDependencies->barrierNodes : nullptr;
auto lockCSR = commandStreamReceiver.obtainUniqueOwnership();
auto enqueueOperationType = EnqueueProperties::Operation::DependencyResolveOnGpu;
@ -336,14 +345,15 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
if (kernelOperation->blitEnqueue) {
enqueueOperationType = EnqueueProperties::Operation::Blit;
UNRECOVERABLE_IF(!barrierNodes);
if (commandStreamReceiver.isStallingPipeControlOnNextFlushRequired()) {
timestampPacketDependencies->barrierNodes.add(commandStreamReceiver.getTimestampPacketAllocator()->getTag());
barrierNodes->add(commandStreamReceiver.getTimestampPacketAllocator()->getTag());
}
}
DispatchFlags dispatchFlags(
{}, //csrDependencies
&timestampPacketDependencies->barrierNodes, //barrierTimestampPacketNodes
barrierNodes, //barrierTimestampPacketNodes
{}, //pipelineSelectArgs
commandQueue.flushStamp->getStampReference(), //flushStampReference
commandQueue.getThrottle(), //throttle
@ -370,7 +380,11 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
false, //useGlobalAtomics
1u); //numDevicesInContext
UNRECOVERABLE_IF(!kernelOperation->blitEnqueue && !commandStreamReceiver.peekTimestampPacketWriteEnabled());
UNRECOVERABLE_IF(!kernelOperation->blitEnqueue && !commandStreamReceiver.peekTimestampPacketWriteEnabled() && commandQueue.getContext().getRootDeviceIndices().size() == 1);
if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
}
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr);
makeTimestampPacketsResident(commandStreamReceiver);
@ -402,6 +416,10 @@ void Command::setEventsRequest(EventsRequest &eventsRequest) {
auto size = eventsRequest.numEventsInWaitList * sizeof(cl_event);
memcpy_s(&eventsWaitlist[0], size, eventsRequest.eventWaitList, size);
this->eventsRequest.eventWaitList = &eventsWaitlist[0];
for (cl_uint i = 0; i < eventsRequest.numEventsInWaitList; i++) {
auto event = castToObjectOrAbort<Event>(eventsRequest.eventWaitList[i]);
event->incRefInternal();
}
}
}
@ -414,12 +432,9 @@ void Command::setTimestampPacketNode(TimestampPacketContainer &current, Timestam
}
Command::~Command() {
auto &commandStreamReceiver = commandQueue.getGpgpuCommandStreamReceiver();
if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) {
for (cl_event &eventFromWaitList : eventsWaitlist) {
auto event = castToObjectOrAbort<Event>(eventFromWaitList);
event->decRefInternal();
}
for (cl_event &eventFromWaitList : eventsWaitlist) {
auto event = castToObjectOrAbort<Event>(eventFromWaitList);
event->decRefInternal();
}
}
@ -427,7 +442,7 @@ void Command::makeTimestampPacketsResident(CommandStreamReceiver &commandStreamR
if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) {
for (cl_event &eventFromWaitList : eventsWaitlist) {
auto event = castToObjectOrAbort<Event>(eventFromWaitList);
if (event->getTimestampPacketNodes()) {
if (event->getTimestampPacketNodes() && event->getCommandQueue()->getClDevice().getRootDeviceIndex() == commandStreamReceiver.getRootDeviceIndex()) {
event->getTimestampPacketNodes()->makeResident(commandStreamReceiver);
}
}

View File

@ -1196,9 +1196,9 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandStreamReceiverFlushTaskTests, GivenBlockingWh
HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelRequiringDCFlushWhenUnblockedThenDCFlushIsAdded) {
typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
MockContext ctx(pClDevice);
CommandQueueHw<FamilyType> commandQueue(&ctx, pClDevice, 0, false);
auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
commandStreamReceiver.timestampPacketWriteEnabled = false;
CommandQueueHw<FamilyType> commandQueue(&ctx, pClDevice, 0, false);
cl_event blockingEvent;
MockEvent<UserEvent> mockEvent(&ctx);
blockingEvent = &mockEvent;

View File

@ -37,9 +37,9 @@ typedef UltCommandStreamReceiverTest CommandStreamReceiverFlushTaskTests;
HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelNotRequiringDCFlushWhenUnblockedThenDCFlushIsNotAdded) {
typedef typename FamilyType::PIPE_CONTROL PIPE_CONTROL;
MockContext ctx(pClDevice);
CommandQueueHw<FamilyType> commandQueue(&ctx, pClDevice, 0, false);
auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
commandStreamReceiver.timestampPacketWriteEnabled = false;
CommandQueueHw<FamilyType> commandQueue(&ctx, pClDevice, 0, false);
cl_event blockingEvent;
MockEvent<UserEvent> mockEvent(&ctx);
blockingEvent = &mockEvent;

View File

@ -250,32 +250,35 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo
}
}
HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyArePassedToMarkerThenMiSemaphoreWaitCommandSizeIsIncluded) {
// Test fixture shared by the cross-device dependency tests.
// SetUp builds one context that spans rootDevices[1] and rootDevices[2] of the
// device factory and caches the context's special queues obtained with
// arguments 1u and 2u (presumably the root device indices — TODO confirm).
struct CrossDeviceDependenciesTests : public ::testing::Test {
void SetUp() override {
// presumably (3, 0) means 3 root devices and 0 sub-devices — verify against UltClDeviceFactory
deviceFactory = std::make_unique<UltClDeviceFactory>(3, 0);
auto device1 = deviceFactory->rootDevices[1];
auto device2 = deviceFactory->rootDevices[2];
cl_device_id devices[] = {device1, device2};
// rootDevices[0] is created by the factory but is not added to the context.
context = std::make_unique<MockContext>(ClDeviceVector(devices, 2), false);
pCmdQ1 = context.get()->getSpecialQueue(1u);
pCmdQ2 = context.get()->getSpecialQueue(2u);
}
// Nothing to do explicitly: deviceFactory and context are unique_ptr members.
void TearDown() override {
}
std::unique_ptr<UltClDeviceFactory> deviceFactory;
std::unique_ptr<MockContext> context;
// Queue pointers obtained from the context in SetUp; individual tests may reassign them.
CommandQueue *pCmdQ1 = nullptr;
CommandQueue *pCmdQ2 = nullptr;
};
HWTEST_F(CrossDeviceDependenciesTests, givenMultipleEventInMultiRootDeviceEnvironmentWhenTheyArePassedToMarkerThenMiSemaphoreWaitCommandSizeIsIncluded) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
auto deviceFactory = std::make_unique<UltClDeviceFactory>(3, 0);
auto device1 = deviceFactory->rootDevices[1];
auto device2 = deviceFactory->rootDevices[2];
auto mockCsr1 = new MockCommandStreamReceiver(*device1->executionEnvironment, device1->getRootDeviceIndex(), device1->getDeviceBitfield());
auto mockCsr2 = new MockCommandStreamReceiver(*device2->executionEnvironment, device2->getRootDeviceIndex(), device2->getDeviceBitfield());
device1->resetCommandStreamReceiver(mockCsr1);
device2->resetCommandStreamReceiver(mockCsr2);
cl_device_id devices[] = {device1, device2};
auto context = std::make_unique<MockContext>(ClDeviceVector(devices, 2), false);
auto pCmdQ1 = context.get()->getSpecialQueue(1u);
auto pCmdQ2 = context.get()->getSpecialQueue(2u);
MockKernelWithInternals mockKernel(ClDeviceVector(devices, 2));
DispatchInfo dispatchInfo;
MultiDispatchInfo multiDispatchInfo(mockKernel.mockKernel);
dispatchInfo.setKernel(mockKernel.mockKernel);
multiDispatchInfo.push(dispatchInfo);
Event event1(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 5, 15);
Event event2(nullptr, CL_COMMAND_NDRANGE_KERNEL, 6, 16);
Event event3(pCmdQ1, CL_COMMAND_NDRANGE_KERNEL, 1, 6);
@ -309,11 +312,7 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo
CsrDependencies csrDeps;
eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ1->getCommandStreamReceiver(false));
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(0u, semaphores.size());
EXPECT_EQ(0u, csrDeps.taskCountContainer.size());
EXPECT_EQ(0u, TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<FamilyType>(csrDeps));
}
@ -339,13 +338,301 @@ HWTEST_F(MultiRootDeviceCommandStreamReceiverTests, givenMultipleEventInMultiRoo
CsrDependencies csrDeps;
eventsRequest.fillCsrDependenciesForTaskCountContainer(csrDeps, pCmdQ2->getCommandStreamReceiver(false));
EXPECT_EQ(3u, csrDeps.taskCountContainer.size());
EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<FamilyType>(csrDeps));
}
}
// Builds a chain userEvent1 -> marker on pCmdQ1 -> read buffer on pCmdQ2 -> marker on pCmdQ1
// while userEvent1 is still pending, checks that no MI_SEMAPHORE_WAIT is found in the
// queues' command streams at that point, then completes userEvent1 and checks the
// semaphores found in each queue's gpgpu command stream receiver stream.
HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventWhenProgrammingCrossDeviceDependenciesForGpgpuCsrThenProgramSemaphoreWaitOnUnblockingEvent) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
UserEvent userEvent1(&pCmdQ1->getContext());
cl_event outputEvent1{};
cl_event inputEvent1 = &userEvent1;
// Marker on queue 1 that waits on the not-yet-completed user event.
pCmdQ1->enqueueMarkerWithWaitList(
1,
&inputEvent1,
&outputEvent1);
auto event1 = castToObject<Event>(outputEvent1);
ASSERT_NE(nullptr, event1);
EXPECT_EQ(CompletionStamp::notReady, event1->peekTaskCount());
cl_int retVal = CL_INVALID_PLATFORM;
auto buffer = Buffer::create(context.get(), 0, MemoryConstants::pageSize, nullptr, retVal);
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_NE(nullptr, buffer);
char hostPtr[MemoryConstants::pageSize]{};
cl_event outputEvent2{};
// Non-blocking read on queue 2 that waits on the marker event produced by queue 1.
pCmdQ2->enqueueReadBuffer(buffer, CL_FALSE, 0, MemoryConstants::pageSize, hostPtr, nullptr,
1,
&outputEvent1,
&outputEvent2);
{
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ2->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
// NOTE(review): the next two expectations contradict the 0u check that follows, and
// csrDeps is not declared anywhere in this test — they look like removed lines of the
// diff that were rendered inline; confirm against the real file.
EXPECT_EQ(3u, semaphores.size());
EXPECT_EQ(3u * sizeof(MI_SEMAPHORE_WAIT), TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<FamilyType>(csrDeps));
EXPECT_EQ(0u, semaphores.size());
}
auto event2 = castToObject<Event>(outputEvent2);
ASSERT_NE(nullptr, event2);
EXPECT_EQ(CompletionStamp::notReady, event2->peekTaskCount());
// Marker on queue 1 that waits on the read enqueued on queue 2.
pCmdQ1->enqueueMarkerWithWaitList(
1,
&outputEvent2,
nullptr);
{
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(0u, semaphores.size());
}
// Complete the user event, drop the test's references to the output events and
// wait for both queues before inspecting the CSR streams.
userEvent1.setStatus(CL_COMPLETE);
event1->release();
event2->release();
pCmdQ1->finish();
pCmdQ2->finish();
{
// Queue 1's gpgpu CSR stream: exactly one semaphore, addressed at queue 2's CSR tag address.
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ1->getGpgpuCommandStreamReceiver().getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(1u, semaphores.size());
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress());
}
{
// Queue 2's gpgpu CSR stream: exactly one semaphore, addressed at queue 1's CSR tag address.
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ2->getGpgpuCommandStreamReceiver().getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(1u, semaphores.size());
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
EXPECT_EQ(0u, semaphoreCmd->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress());
}
buffer->release();
}
// Same dependency chain as the cross-device variant, but every enqueue goes to pCmdQ1:
// userEvent1 -> marker -> read buffer -> marker. The test expects no MI_SEMAPHORE_WAIT
// in pCmdQ1's stream while blocked, and none in its gpgpu CSR stream after the user
// event completes.
HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventWhenProgrammingSingleDeviceDependenciesForGpgpuCsrThenNoSemaphoreWaitIsProgrammed) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
UserEvent userEvent1(&pCmdQ1->getContext());
cl_event outputEvent1{};
cl_event inputEvent1 = &userEvent1;
// Marker that waits on the not-yet-completed user event.
pCmdQ1->enqueueMarkerWithWaitList(
1,
&inputEvent1,
&outputEvent1);
auto event1 = castToObject<Event>(outputEvent1);
ASSERT_NE(nullptr, event1);
EXPECT_EQ(CompletionStamp::notReady, event1->peekTaskCount());
cl_int retVal = CL_INVALID_PLATFORM;
auto buffer = Buffer::create(context.get(), 0, MemoryConstants::pageSize, nullptr, retVal);
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_NE(nullptr, buffer);
char hostPtr[MemoryConstants::pageSize]{};
cl_event outputEvent2{};
// Non-blocking read on the same queue, waiting on the marker's event.
pCmdQ1->enqueueReadBuffer(buffer, CL_FALSE, 0, MemoryConstants::pageSize, hostPtr, nullptr,
1,
&outputEvent1,
&outputEvent2);
{
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(0u, semaphores.size());
}
auto event2 = castToObject<Event>(outputEvent2);
ASSERT_NE(nullptr, event2);
EXPECT_EQ(CompletionStamp::notReady, event2->peekTaskCount());
// Second marker on the same queue, waiting on the read's event.
pCmdQ1->enqueueMarkerWithWaitList(
1,
&outputEvent2,
nullptr);
{
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ1->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(0u, semaphores.size());
}
// Complete the user event, drop the test's references to the output events and
// wait for the queue before inspecting the CSR stream.
userEvent1.setStatus(CL_COMPLETE);
event1->release();
event2->release();
pCmdQ1->finish();
{
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ1->getGpgpuCommandStreamReceiver().getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(0u, semaphores.size());
}
buffer->release();
}
// Blitter variant of the cross-device test. Builds the chain
// userEvent1 -> marker (Q1) -> read buffer (Q2) -> read buffer (Q1) -> marker (Q2)
// while userEvent1 is pending, then completes it and checks the MI_SEMAPHORE_WAIT
// commands found in the gpgpu and BCS command stream receiver streams of both queues.
// The test is skipped when queue 1 has no BCS command stream receiver.
HWTEST_F(CrossDeviceDependenciesTests, givenWaitListWithEventBlockedByUserEventWhenProgrammingCrossDeviceDependenciesForBlitCsrThenProgramSemaphoreWaitOnUnblockingEvent) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
DebugManagerStateRestore restorer;
DebugManager.flags.EnableBlitterForEnqueueOperations.set(true);
// Mark blitter operations as supported in every root device environment.
for (auto &rootDeviceEnvironment : deviceFactory->rootDevices[0]->getExecutionEnvironment()->rootDeviceEnvironments) {
rootDeviceEnvironment->getMutableHardwareInfo()->capabilityTable.blitterOperationsSupported = true;
}
// Replace the fixture's queues with queues created after the settings above were applied.
auto clCmdQ1 = clCreateCommandQueue(context.get(), deviceFactory->rootDevices[1], {}, nullptr);
auto clCmdQ2 = clCreateCommandQueue(context.get(), deviceFactory->rootDevices[2], {}, nullptr);
pCmdQ1 = castToObject<CommandQueue>(clCmdQ1);
pCmdQ2 = castToObject<CommandQueue>(clCmdQ2);
ASSERT_NE(nullptr, pCmdQ1);
ASSERT_NE(nullptr, pCmdQ2);
if (!pCmdQ1->getBcsCommandStreamReceiver()) {
// Release the queues created above before skipping.
pCmdQ1->release();
pCmdQ2->release();
GTEST_SKIP();
}
UserEvent userEvent1(&pCmdQ1->getContext());
cl_event outputEvent1{};
cl_event inputEvent1 = &userEvent1;
// Marker on queue 1 that waits on the not-yet-completed user event.
pCmdQ1->enqueueMarkerWithWaitList(
1,
&inputEvent1,
&outputEvent1);
auto event1 = castToObject<Event>(outputEvent1);
ASSERT_NE(nullptr, event1);
EXPECT_EQ(CompletionStamp::notReady, event1->peekTaskCount());
cl_int retVal = CL_INVALID_PLATFORM;
auto buffer = Buffer::create(context.get(), 0, MemoryConstants::pageSize, nullptr, retVal);
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_NE(nullptr, buffer);
char hostPtr[MemoryConstants::pageSize]{};
cl_event outputEvent2{};
// Non-blocking read on queue 2 that waits on queue 1's marker event.
pCmdQ2->enqueueReadBuffer(buffer, CL_FALSE, 0, MemoryConstants::pageSize, hostPtr, nullptr,
1,
&outputEvent1,
&outputEvent2);
auto event2 = castToObject<Event>(outputEvent2);
ASSERT_NE(nullptr, event2);
EXPECT_EQ(CompletionStamp::notReady, event2->peekTaskCount());
{
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ2->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(0u, semaphores.size());
}
cl_event outputEvent3{};
// Non-blocking read on queue 1 that waits on queue 2's read event.
pCmdQ1->enqueueReadBuffer(buffer, CL_FALSE, 0, MemoryConstants::pageSize, hostPtr, nullptr,
1,
&outputEvent2,
&outputEvent3);
auto event3 = castToObject<Event>(outputEvent3);
ASSERT_NE(nullptr, event3);
EXPECT_EQ(CompletionStamp::notReady, event3->peekTaskCount());
{
// NOTE(review): this inspects pCmdQ2's stream although the enqueue just above
// targeted pCmdQ1 — confirm this is intended.
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ2->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(0u, semaphores.size());
}
// Marker on queue 2 that waits on queue 1's read event.
pCmdQ2->enqueueMarkerWithWaitList(
1,
&outputEvent3,
nullptr);
{
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ2->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(0u, semaphores.size());
}
// Complete the user event, drop the test's references to the output events and
// wait for both queues before inspecting the CSR streams.
userEvent1.setStatus(CL_COMPLETE);
event1->release();
event2->release();
event3->release();
pCmdQ1->finish();
pCmdQ2->finish();
{
// Queue 1's gpgpu CSR stream: exactly one semaphore, addressed at queue 2's gpgpu CSR tag address.
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ1->getGpgpuCommandStreamReceiver().getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(1u, semaphores.size());
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ2->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd->getSemaphoreGraphicsAddress());
}
{
// Queue 1's BCS CSR stream: only a lower bound (at least one semaphore) is checked.
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ1->getBcsCommandStreamReceiver()->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_LE(1u, semaphores.size());
}
{
// Queue 2's gpgpu CSR stream: two semaphores are expected; only the first one's
// data dword and address are verified.
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ2->getGpgpuCommandStreamReceiver().getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_EQ(2u, semaphores.size());
auto semaphoreCmd0 = genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]));
EXPECT_EQ(0u, semaphoreCmd0->getSemaphoreDataDword());
EXPECT_EQ(reinterpret_cast<uint64_t>(pCmdQ1->getGpgpuCommandStreamReceiver().getTagAddress()), semaphoreCmd0->getSemaphoreGraphicsAddress());
}
{
// Queue 2's BCS CSR stream: only a lower bound (at least one semaphore) is checked.
HardwareParse csHwParser;
csHwParser.parseCommands<FamilyType>(pCmdQ2->getBcsCommandStreamReceiver()->getCS(0));
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(csHwParser.cmdList.begin(), csHwParser.cmdList.end());
EXPECT_LE(1u, semaphores.size());
}
buffer->release();
// These queues were created by this test via clCreateCommandQueue, so it releases them itself.
pCmdQ1->release();
pCmdQ2->release();
}
HWTEST_F(CommandStreamReceiverFlushTaskTests, givenStaticPartitioningEnabledWhenFlushingTaskThenWorkPartitionAllocationIsMadeResident) {

View File

@ -317,6 +317,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
auto commandStreamStartCSR = commandStreamCSR.getUsed();
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);
TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStreamCSR, dispatchFlags.csrDependencies);
if (stallingPipeControlOnNextFlushRequired) {
programStallingPipeControlForBarrier(commandStreamCSR, dispatchFlags);
@ -795,6 +796,7 @@ size_t CommandStreamReceiverHw<GfxFamily>::getRequiredCmdStreamSize(const Dispat
}
size += TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(dispatchFlags.csrDependencies);
size += TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<GfxFamily>(dispatchFlags.csrDependencies);
if (stallingPipeControlOnNextFlushRequired) {
auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes;
@ -1010,6 +1012,7 @@ uint32_t CommandStreamReceiverHw<GfxFamily>::blitBuffer(const BlitPropertiesCont
for (auto &blitProperties : blitPropertiesContainer) {
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, blitProperties.csrDependencies);
TimestampPacketHelper::programCsrDependenciesForForTaskCountContainer<GfxFamily>(commandStream, blitProperties.csrDependencies);
if (blitProperties.outputTimestampPacket && profilingEnabled) {
BlitCommandsHelper<GfxFamily>::encodeProfilingStartMmios(commandStream, *blitProperties.outputTimestampPacket);

View File

@ -118,7 +118,11 @@ size_t BlitCommandsHelper<GfxFamily>::estimateBlitCommandsSize(const Vec3<size_t
auto sizePerBlit = (sizeof(typename GfxFamily::XY_COPY_BLT) + estimatePostBlitCommandSize());
return TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(csrDependencies) + (sizePerBlit * nBlits) + timestampCmdSize + estimatePreBlitCommandSize();
return TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(csrDependencies) +
TimestampPacketHelper::getRequiredCmdStreamSizeForTaskCountContainer<GfxFamily>(csrDependencies) +
(sizePerBlit * nBlits) +
timestampCmdSize +
estimatePreBlitCommandSize();
}
template <typename GfxFamily>