Check if cache flush for BCS is required

Change-Id: Ia36856c46fe7da7a72dae14e2543456fb30ec409
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2020-02-28 09:07:07 +01:00
committed by sys_ocldev
parent baf80c28ec
commit 4c781c1b98
7 changed files with 99 additions and 24 deletions

View File

@@ -394,6 +394,8 @@ class CommandQueueHw : public CommandQueue {
bool isCacheFlushCommand(uint32_t commandType) const override;
MOCKABLE_VIRTUAL bool isCacheFlushForBcsRequired() const;
protected:
// Hook with an intentionally empty body: the base implementation ignores both
// the command type and the dispatch info.
// NOTE(review): presumably exists so test mocks can observe enqueues (it is
// MOCKABLE_VIRTUAL) - confirm against the callers.
MOCKABLE_VIRTUAL void enqueueHandlerHook(const unsigned int commandType, const MultiDispatchInfo &dispatchInfo){};
size_t calculateHostPtrSizeForImage(const size_t *region, size_t rowPitch, size_t slicePitch, Image *image);

View File

@@ -123,6 +123,11 @@ bool CommandQueueHw<Family>::forceStateless(size_t size) {
return size >= 4ull * MemoryConstants::gigaByte;
}
// Tells whether a cache flush has to be programmed for BCS (blitter) enqueues.
// The base implementation unconditionally answers "yes"; the method is declared
// MOCKABLE_VIRTUAL so derived queues (e.g. test mocks) can answer differently.
template <typename Family>
bool CommandQueueHw<Family>::isCacheFlushForBcsRequired() const {
    constexpr bool cacheFlushRequired = true;
    return cacheFlushRequired;
}
template <typename Family>
void CommandQueueHw<Family>::setupBlitAuxTranslation(MultiDispatchInfo &multiDispatchInfo) {
multiDispatchInfo.begin()->dispatchInitCommands.registerMethod(

View File

@@ -211,7 +211,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
if (blitEnqueue) {
auto allocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator();
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
if (isCacheFlushForBcsRequired()) {
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
}
if (!blockQueue && getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()) {
timestampPacketDependencies.barrierNodes.add(allocator->getTag());
}
@@ -473,12 +475,14 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(const Mu
auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
blitProperties.outputTimestampPacket = currentTimestampPacketNode;
auto cacheFlushTimestampPacketGpuAddress = timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]->getGpuAddress() +
offsetof(TimestampPacketStorage, packets[0].contextEnd);
if (isCacheFlushForBcsRequired()) {
auto cacheFlushTimestampPacketGpuAddress = timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]->getGpuAddress() +
offsetof(TimestampPacketStorage, packets[0].contextEnd);
MemorySynchronizationCommands<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(
commandStream, GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
cacheFlushTimestampPacketGpuAddress, 0, true, device->getHardwareInfo());
MemorySynchronizationCommands<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(
commandStream, GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
cacheFlushTimestampPacketGpuAddress, 0, true, device->getHardwareInfo());
}
TimestampPacketHelper::programSemaphoreWithImplicitDependency<GfxFamily>(commandStream, *currentTimestampPacketNode);

View File

@@ -17,6 +17,7 @@
#include "shared/source/utilities/tag_allocator.h"
#include "opencl/source/command_queue/command_queue.h"
#include "opencl/source/command_queue/command_queue_hw.h"
#include "opencl/source/command_queue/gpgpu_walker.h"
#include "opencl/source/command_queue/local_id_gen.h"
#include "opencl/source/device/device_info.h"
@@ -209,12 +210,20 @@ void GpgpuWalkerHelper<GfxFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxF
template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
size_t expectedSizeCS = 0;
if (blitEnqueue) {
auto &hwInfo = commandQueue.getDevice().getHardwareInfo();
return TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<GfxFamily>() +
MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
auto &commandQueueHw = static_cast<CommandQueueHw<GfxFamily> &>(commandQueue);
size_t expectedSizeCS = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<GfxFamily>();
if (commandQueueHw.isCacheFlushForBcsRequired()) {
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
}
return expectedSizeCS;
}
size_t expectedSizeCS = 0;
Kernel *parentKernel = multiDispatchInfo.peekParentKernel();
for (auto &dispatchInfo : multiDispatchInfo) {
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, dispatchInfo.getKernel());

View File

@@ -99,6 +99,12 @@ HWTEST_F(CommandQueueHwTest, WhenEnqueuingBlockedMapUnmapOperationThenVirtualEve
pHwQ->virtualEvent = nullptr;
}
// The default hardware command queue must report that a cache flush for BCS is required.
HWTEST_F(CommandQueueHwTest, givenCommandQueueWhenAskingForCacheFlushOnBcsThenReturnTrue) {
    auto &hwQueue = *static_cast<CommandQueueHw<FamilyType> *>(pCmdQ);

    const bool flushRequired = hwQueue.isCacheFlushForBcsRequired();

    EXPECT_TRUE(flushRequired);
}
HWTEST_F(CommandQueueHwTest, givenBlockedMapBufferCallWhenMemObjectIsPassedToCommandThenItsRefCountIsBeingIncreased) {
CommandQueueHw<FamilyType> *pHwQ = reinterpret_cast<CommandQueueHw<FamilyType> *>(pCmdQ);
MockBuffer buffer;

View File

@@ -729,6 +729,9 @@ struct BcsBufferTests : public ::testing::Test {
template <typename FamilyType>
void TearDownT() {}
template <typename FamilyType>
void waitForCacheFlushFromBcsTest(MockCommandQueueHw<FamilyType> &commandQueue);
DebugManagerStateRestore restore;
std::unique_ptr<OsContext> bcsOsContext;
@@ -992,25 +995,26 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenReadBufferEnqueueWhenProgrammingCommandS
EXPECT_EQ(initialTaskCount + 1, queueCsr->peekTaskCount());
}
HWTEST_TEMPLATED_F(BcsBufferTests, givenBlitEnqueueWhenProgrammingCmdBufferThenWaitForCacheFlushFromBcs) {
template <typename FamilyType>
void BcsBufferTests::waitForCacheFlushFromBcsTest(MockCommandQueueHw<FamilyType> &commandQueue) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
auto cmdQ = clUniquePtr(new MockCommandQueueHw<FamilyType>(bcsMockContext.get(), device.get(), nullptr));
bool isCacheFlushForBcsRequired = commandQueue.isCacheFlushForBcsRequired();
auto bcsCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(cmdQ->getBcsCommandStreamReceiver());
auto bcsCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(commandQueue.getBcsCommandStreamReceiver());
cl_int retVal = CL_SUCCESS;
auto buffer = clUniquePtr<Buffer>(Buffer::create(bcsMockContext.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal));
buffer->forceDisallowCPUCopy = true;
void *hostPtr = reinterpret_cast<void *>(0x12340000);
cmdQ->enqueueWriteBuffer(buffer.get(), true, 0, 1, hostPtr, nullptr, 0, nullptr, nullptr);
commandQueue.enqueueWriteBuffer(buffer.get(), true, 0, 1, hostPtr, nullptr, 0, nullptr, nullptr);
HardwareParse hwParserGpGpu;
HardwareParse hwParserBcs;
hwParserGpGpu.parseCommands<FamilyType>(*cmdQ->peekCommandStream());
hwParserGpGpu.parseCommands<FamilyType>(*commandQueue.peekCommandStream());
hwParserBcs.parseCommands<FamilyType>(bcsCsr->commandStream);
auto gpgpuPipeControls = findAll<PIPE_CONTROL *>(hwParserGpGpu.cmdList.begin(), hwParserGpGpu.cmdList.end());
@@ -1024,16 +1028,37 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBlitEnqueueWhenProgrammingCmdBufferThenW
if (cacheFlushWriteAddress != 0) {
EXPECT_TRUE(pipeControlCmd->getDcFlushEnable());
EXPECT_TRUE(pipeControlCmd->getCommandStreamerStallEnable());
EXPECT_EQ(0u, pipeControlCmd->getImmediateData());
EXPECT_EQ(isCacheFlushForBcsRequired, 0u == pipeControlCmd->getImmediateData());
break;
}
}
EXPECT_NE(0u, cacheFlushWriteAddress);
auto bcsSemaphores = findAll<MI_SEMAPHORE_WAIT *>(hwParserBcs.cmdList.begin(), hwParserBcs.cmdList.end());
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*bcsSemaphores[0]);
size_t additionalSemaphores = UnitTestHelper<FamilyType>::isSynchronizationWArequired(device->getHardwareInfo()) ? 2 : 0;
EXPECT_EQ(cacheFlushWriteAddress, semaphoreCmd->getSemaphoreGraphicsAddress());
if (isCacheFlushForBcsRequired) {
EXPECT_NE(0u, cacheFlushWriteAddress);
EXPECT_EQ(1u + additionalSemaphores, bcsSemaphores.size());
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*bcsSemaphores[0]);
EXPECT_EQ(cacheFlushWriteAddress, semaphoreCmd->getSemaphoreGraphicsAddress());
} else {
EXPECT_EQ(additionalSemaphores, bcsSemaphores.size());
}
}
// Runs the shared blit scenario with the mock queue forced to report that a
// cache flush for BCS IS required.
HWTEST_TEMPLATED_F(BcsBufferTests, givenCommandQueueWithCacheFlushRequirementWhenProgrammingCmdBufferThenWaitForCacheFlushFromBcs) {
    auto queue = clUniquePtr(new MockCommandQueueHw<FamilyType>(bcsMockContext.get(), device.get(), nullptr));

    auto &flushOverride = queue->overrideIsCacheFlushForBcsRequired;
    flushOverride.returnValue = true;
    flushOverride.enabled = true;

    waitForCacheFlushFromBcsTest<FamilyType>(*queue);
}
// Runs the shared blit scenario with the mock queue forced to report that a
// cache flush for BCS is NOT required. Despite the "ThenWaitFor..." suffix in
// the test name, the shared helper then expects no cache-flush semaphore on BCS
// (only the optional synchronization-workaround semaphores).
HWTEST_TEMPLATED_F(BcsBufferTests, givenCommandQueueWithoutCacheFlushRequirementWhenProgrammingCmdBufferThenWaitForCacheFlushFromBcs) {
    auto queue = clUniquePtr(new MockCommandQueueHw<FamilyType>(bcsMockContext.get(), device.get(), nullptr));

    auto &flushOverride = queue->overrideIsCacheFlushForBcsRequired;
    flushOverride.returnValue = false;
    flushOverride.enabled = true;

    waitForCacheFlushFromBcsTest<FamilyType>(*queue);
}
HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlitEnqueueThenWaitPipeControlOnBcsEngine) {
@@ -1075,10 +1100,15 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlitEnq
HardwareParse bcsHwParser;
bcsHwParser.parseCommands<FamilyType>(bcsCsr->commandStream);
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(bcsHwParser.cmdList.begin(), bcsHwParser.cmdList.end());
EXPECT_EQ(UnitTestHelper<FamilyType>::isSynchronizationWArequired(device->getHardwareInfo()) ? 4u : 2u, semaphores.size());
EXPECT_EQ(pipeControlWriteAddress, genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]))->getSemaphoreGraphicsAddress());
if (cmdQ->isCacheFlushForBcsRequired()) {
EXPECT_EQ(UnitTestHelper<FamilyType>::isSynchronizationWArequired(device->getHardwareInfo()) ? 4u : 2u, semaphores.size());
EXPECT_EQ(pipeControlWriteAddress, genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[1]))->getSemaphoreGraphicsAddress());
} else {
EXPECT_EQ(UnitTestHelper<FamilyType>::isSynchronizationWArequired(device->getHardwareInfo()) ? 3u : 1u, semaphores.size());
EXPECT_EQ(pipeControlWriteAddress, genCmdCast<MI_SEMAPHORE_WAIT *>(*(semaphores[0]))->getSemaphoreGraphicsAddress());
}
}
HWTEST_TEMPLATED_F(BcsBufferTests, givenBarrierWhenReleasingMultipleBlockedEnqueuesThenProgramBarrierOnce) {
@@ -1155,7 +1185,12 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlocked
bcsHwParser.parseCommands<FamilyType>(bcsCsr->commandStream);
auto semaphores = findAll<MI_SEMAPHORE_WAIT *>(bcsHwParser.cmdList.begin(), bcsHwParser.cmdList.end());
EXPECT_EQ(UnitTestHelper<FamilyType>::isSynchronizationWArequired(device->getHardwareInfo()) ? 4u : 2u, semaphores.size());
if (cmdQ->isCacheFlushForBcsRequired()) {
EXPECT_EQ(UnitTestHelper<FamilyType>::isSynchronizationWArequired(device->getHardwareInfo()) ? 4u : 2u, semaphores.size());
} else {
EXPECT_EQ(UnitTestHelper<FamilyType>::isSynchronizationWArequired(device->getHardwareInfo()) ? 3u : 1u, semaphores.size());
}
cmdQ->isQueueBlocked();
}
@@ -1173,8 +1208,11 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferOperationWithoutKernelWhenEstimati
true, *cmdQ, multiDispatchInfo);
auto copyBufferCmdsSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_COPY_BUFFER, csrDependencies, false, false,
true, *cmdQ, multiDispatchInfo);
auto expectedSize = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<FamilyType>() +
MemorySynchronizationCommands<FamilyType>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
auto expectedSize = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<FamilyType>();
if (cmdQ->isCacheFlushForBcsRequired()) {
expectedSize += MemorySynchronizationCommands<FamilyType>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
}
EXPECT_EQ(expectedSize, readBufferCmdsSize);
EXPECT_EQ(expectedSize, writeBufferCmdsSize);

View File

@@ -159,6 +159,13 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
return BaseClass::waitUntilComplete(taskCountToWait, flushStampToWait, useQuickKmdSleep);
}
// Mock override: when the test has enabled overrideIsCacheFlushForBcsRequired,
// return the value the test configured; otherwise defer to the real
// CommandQueueHw implementation.
bool isCacheFlushForBcsRequired() const override {
    const auto &forced = overrideIsCacheFlushForBcsRequired;
    return forced.enabled ? forced.returnValue : BaseClass::isCacheFlushForBcsRequired();
}
unsigned int lastCommandType;
std::vector<Kernel *> lastEnqueuedKernels;
MultiDispatchInfo storedMultiDispatchInfo;
@@ -169,6 +176,10 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
bool notifyEnqueueReadBufferCalled = false;
bool notifyEnqueueReadImageCalled = false;
bool cpuDataTransferHandlerCalled = false;
// Test knob that lets a test force the result of isCacheFlushForBcsRequired().
struct OverrideReturnValue {
    // When false (the default) the override is inactive and the base class decides.
    bool enabled{false};
    // Value reported while the override is enabled.
    bool returnValue{false};
} overrideIsCacheFlushForBcsRequired;
BuiltinOpParams kernelParams;
std::atomic<uint32_t> latestTaskCountWaited{std::numeric_limits<uint32_t>::max()};