Add cache flush for blit enqueues

Change-Id: I31dbeed9973c5077bf79ea7c7534b2430bca5083
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2020-02-27 13:29:15 +01:00
committed by sys_ocldev
parent ec647dabe9
commit db012c9d5c
6 changed files with 76 additions and 7 deletions

View File

@@ -208,9 +208,13 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo);
}
if (blitEnqueue && !blockQueue && getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()) {
if (blitEnqueue) {
auto allocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
timestampPacketDependencies.barrierNodes.add(allocator->getTag());
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
if (!blockQueue && getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()) {
timestampPacketDependencies.barrierNodes.add(allocator->getTag());
}
}
if (nodesCount > 0) {
@@ -461,12 +465,21 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(const Mu
eventsRequest.fillCsrDependencies(blitProperties.csrDependencies, *blitCommandStreamReceiver,
CsrDependencies::DependenciesType::All);
blitProperties.csrDependencies.push_back(&timestampPacketDependencies.cacheFlushNodes);
blitProperties.csrDependencies.push_back(&timestampPacketDependencies.previousEnqueueNodes);
blitProperties.csrDependencies.push_back(&timestampPacketDependencies.barrierNodes);
}
auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
blitProperties.outputTimestampPacket = currentTimestampPacketNode;
auto cacheFlushTimestampPacketGpuAddress = timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]->getGpuAddress() +
offsetof(TimestampPacketStorage, packets[0].contextEnd);
MemorySynchronizationCommands<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(
commandStream, GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
cacheFlushTimestampPacketGpuAddress, 0, true, device->getHardwareInfo());
TimestampPacketHelper::programSemaphoreWithImplicitDependency<GfxFamily>(commandStream, *currentTimestampPacketNode);
return blitProperties;
@@ -920,6 +933,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
if (timestampPacketContainer) {
timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
}
for (auto surface : CreateRange(surfaces, surfaceCount)) {

View File

@@ -210,7 +210,9 @@ void GpgpuWalkerHelper<GfxFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxF
template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
if (blitEnqueue) {
return TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<GfxFamily>();
auto &hwInfo = commandQueue.getDevice().getHardwareInfo();
return TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<GfxFamily>() +
MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
}
size_t expectedSizeCS = 0;
Kernel *parentKernel = multiDispatchInfo.peekParentKernel();

View File

@@ -285,6 +285,7 @@ void CommandWithoutKernel::dispatchBlitOperation() {
UNRECOVERABLE_IF(kernelOperation->blitPropertiesContainer.size() != 1);
auto &blitProperties = *kernelOperation->blitPropertiesContainer.begin();
eventsRequest.fillCsrDependencies(blitProperties.csrDependencies, *bcsCsr, CsrDependencies::DependenciesType::All);
blitProperties.csrDependencies.push_back(&timestampPacketDependencies->cacheFlushNodes);
blitProperties.csrDependencies.push_back(&timestampPacketDependencies->previousEnqueueNodes);
blitProperties.csrDependencies.push_back(&timestampPacketDependencies->barrierNodes);
blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0];
@@ -403,6 +404,7 @@ void Command::makeTimestampPacketsResident(CommandStreamReceiver &commandStreamR
currentTimestampPacketNodes->makeResident(commandStreamReceiver);
}
if (timestampPacketDependencies) {
timestampPacketDependencies->cacheFlushNodes.makeResident(commandStreamReceiver);
timestampPacketDependencies->previousEnqueueNodes.makeResident(commandStreamReceiver);
}
}

View File

@@ -175,6 +175,8 @@ HWTEST_F(DispatchFlagsTests, givenBlitEnqueueWhenDispatchingCommandsWithoutKerne
multiDispatchInfo.setBuiltinOpParams(builtinOpParams);
mockCmdQ->obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, true);
timestampPacketDependencies.cacheFlushNodes.add(mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag());
BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(multiDispatchInfo, timestampPacketDependencies,
eventsRequest, mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false);
@@ -214,6 +216,7 @@ HWTEST_F(DispatchFlagsTests, givenN1EnabledWhenDispatchingWithoutKernelTheAllowO
multiDispatchInfo.setBuiltinOpParams(builtinOpParams);
mockCmdQ->obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, true);
timestampPacketDependencies.cacheFlushNodes.add(mockCmdQ->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag());
BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(multiDispatchInfo, timestampPacketDependencies,
eventsRequest, mockCmdQ->getCS(0), CL_COMMAND_READ_BUFFER, false);
BlitPropertiesContainer blitPropertiesContainer;

View File

@@ -992,6 +992,50 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenReadBufferEnqueueWhenProgrammingCommandS
EXPECT_EQ(initialTaskCount + 1, queueCsr->peekTaskCount());
}
// Verifies the cache-flush handshake between the GPGPU queue and the blitter (BCS):
//  - the command stream of the queue contains a PIPE_CONTROL with a post-sync write
//    (non-zero address), DC flush and command streamer stall enabled, writing 0,
//  - the first MI_SEMAPHORE_WAIT found in the BCS command stream waits on exactly
//    the address written by that PIPE_CONTROL.
HWTEST_TEMPLATED_F(BcsBufferTests, givenBlitEnqueueWhenProgrammingCmdBufferThenWaitForCacheFlushFromBcs) {
    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    auto cmdQ = clUniquePtr(new MockCommandQueueHw<FamilyType>(bcsMockContext.get(), device.get(), nullptr));
    auto bcsCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(cmdQ->getBcsCommandStreamReceiver());

    cl_int retVal = CL_SUCCESS;
    auto buffer = clUniquePtr<Buffer>(Buffer::create(bcsMockContext.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal));
    // Fail the test cleanly instead of dereferencing a null buffer below.
    ASSERT_NE(nullptr, buffer.get());
    buffer->forceDisallowCPUCopy = true;

    void *hostPtr = reinterpret_cast<void *>(0x12340000);

    cmdQ->enqueueWriteBuffer(buffer.get(), true, 0, 1, hostPtr, nullptr, 0, nullptr, nullptr);

    HardwareParse hwParserGpGpu;
    HardwareParse hwParserBcs;
    hwParserGpGpu.parseCommands<FamilyType>(*cmdQ->peekCommandStream());
    hwParserBcs.parseCommands<FamilyType>(bcsCsr->commandStream);

    auto gpgpuPipeControls = findAll<PIPE_CONTROL *>(hwParserGpGpu.cmdList.begin(), hwParserGpGpu.cmdList.end());

    // The first PIPE_CONTROL carrying a non-zero post-sync address is taken as the cache flush.
    uint64_t cacheFlushWriteAddress = 0;
    for (auto &pipeControl : gpgpuPipeControls) {
        auto pipeControlCmd = genCmdCast<PIPE_CONTROL *>(*pipeControl);

        uint64_t addressHigh = static_cast<uint64_t>(pipeControlCmd->getAddressHigh()) << 32;
        uint64_t addressLow = pipeControlCmd->getAddress();
        cacheFlushWriteAddress = addressHigh | addressLow;

        if (cacheFlushWriteAddress != 0) {
            EXPECT_TRUE(pipeControlCmd->getDcFlushEnable());
            EXPECT_TRUE(pipeControlCmd->getCommandStreamerStallEnable());
            EXPECT_EQ(0u, pipeControlCmd->getImmediateData());
            break;
        }
    }
    EXPECT_NE(0u, cacheFlushWriteAddress);

    auto bcsSemaphores = findAll<MI_SEMAPHORE_WAIT *>(hwParserBcs.cmdList.begin(), hwParserBcs.cmdList.end());
    // Indexing an empty result would be undefined behaviour; abort the test with a failure instead.
    ASSERT_NE(0u, bcsSemaphores.size());

    auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*bcsSemaphores[0]);
    EXPECT_EQ(cacheFlushWriteAddress, semaphoreCmd->getSemaphoreGraphicsAddress());
}
HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlitEnqueueThenWaitPipeControlOnBcsEngine) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
@@ -1033,8 +1077,8 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlitEnq
bcsHwParser.parseCommands<FamilyType>(bcsCsr->commandStream);
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(bcsHwParser.cmdList.begin(), bcsHwParser.cmdList.end());
EXPECT_EQ(UnitTestHelper<FamilyType>::isSynchronizationWArequired(device->getHardwareInfo()) ? 3u : 1u, semaphores.size());
EXPECT_EQ(pipeControlWriteAddress, genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]))->getSemaphoreGraphicsAddress());
EXPECT_EQ(UnitTestHelper<FamilyType>::isSynchronizationWArequired(device->getHardwareInfo()) ? 4u : 2u, semaphores.size());
EXPECT_EQ(pipeControlWriteAddress, genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]))->getSemaphoreGraphicsAddress());
}
HWTEST_TEMPLATED_F(BcsBufferTests, givenBarrierWhenReleasingMultipleBlockedEnqueuesThenProgramBarrierOnce) {
@@ -1111,7 +1155,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlocked
bcsHwParser.parseCommands<FamilyType>(bcsCsr->commandStream);
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(bcsHwParser.cmdList.begin(), bcsHwParser.cmdList.end());
EXPECT_EQ(UnitTestHelper<FamilyType>::isSynchronizationWArequired(device->getHardwareInfo()) ? 3u : 1u, semaphores.size());
EXPECT_EQ(UnitTestHelper<FamilyType>::isSynchronizationWArequired(device->getHardwareInfo()) ? 4u : 2u, semaphores.size());
cmdQ->isQueueBlocked();
}
@@ -1121,13 +1165,16 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferOperationWithoutKernelWhenEstimati
CsrDependencies csrDependencies;
MultiDispatchInfo multiDispatchInfo;
auto &hwInfo = cmdQ->getDevice().getHardwareInfo();
auto readBufferCmdsSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_READ_BUFFER, csrDependencies, false, false,
true, *cmdQ, multiDispatchInfo);
auto writeBufferCmdsSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_WRITE_BUFFER, csrDependencies, false, false,
true, *cmdQ, multiDispatchInfo);
auto copyBufferCmdsSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_COPY_BUFFER, csrDependencies, false, false,
true, *cmdQ, multiDispatchInfo);
auto expectedSize = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<FamilyType>();
auto expectedSize = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<FamilyType>() +
MemorySynchronizationCommands<FamilyType>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
EXPECT_EQ(expectedSize, readBufferCmdsSize);
EXPECT_EQ(expectedSize, writeBufferCmdsSize);

View File

@@ -95,6 +95,7 @@ class TimestampPacketContainer : public NonCopyableClass {
};
struct TimestampPacketDependencies : public NonCopyableClass {
TimestampPacketContainer cacheFlushNodes;
TimestampPacketContainer previousEnqueueNodes;
TimestampPacketContainer barrierNodes;
TimestampPacketContainer auxToNonAuxNodes;