Optimize BCS flushing scheme [1/n]

Change-Id: Ia192d24196e46fc281c401c241044f3429c16693
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2020-06-24 13:32:09 +02:00
committed by sys_ocldev
parent c5f3fe5987
commit c1dc8a8c3c
8 changed files with 203 additions and 90 deletions

View File

@ -686,4 +686,12 @@ void CommandQueue::aubCaptureHook(bool &blocking, bool &clearAllDependencies, co
}
}
}
// Decides whether a BCS (blit) enqueue also needs a submission to the gpgpu engine.
// Controlled by the ForceGpgpuSubmissionForBcsEnqueue debug flag:
//  - any value other than 0 (including the default): always required,
//  - 0: required only when the latest sent enqueue was neither Blit nor None.
bool CommandQueue::isGpgpuSubmissionForBcsRequired() const {
    // Guard clause: the optimization is only considered when the flag is explicitly 0.
    if (DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.get() != 0) {
        return true;
    }

    const bool previousWasBlit = (latestSentEnqueueType == EnqueueProperties::Operation::Blit);
    const bool previousWasNone = (latestSentEnqueueType == EnqueueProperties::Operation::None);

    // A gpgpu submission is needed only if the latest sent enqueue was some other operation type.
    return !(previousWasBlit || previousWasNone);
}
} // namespace NEO

View File

@ -11,6 +11,7 @@
#include "opencl/source/event/event.h"
#include "opencl/source/helpers/base_object.h"
#include "opencl/source/helpers/dispatch_info.h"
#include "opencl/source/helpers/enqueue_properties.h"
#include "opencl/source/helpers/task_information.h"
#include <atomic>
@ -336,6 +337,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
bool blitEnqueueAllowed(cl_command_type cmdType) const;
void aubCaptureHook(bool &blocking, bool &clearAllDependencies, const MultiDispatchInfo &multiDispatchInfo);
virtual bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const = 0;
bool isGpgpuSubmissionForBcsRequired() const;
Context *context = nullptr;
ClDevice *device = nullptr;
@ -347,6 +349,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
QueuePriority priority = QueuePriority::MEDIUM;
QueueThrottle throttle = QueueThrottle::MEDIUM;
EnqueueProperties::Operation latestSentEnqueueType = EnqueueProperties::Operation::None;
uint64_t sliceCount = QueueSliceCount::defaultSliceCount;
uint32_t bcsTaskCount = 0;

View File

@ -213,8 +213,11 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo);
}
if (isCacheFlushForBcsRequired() && (blitEnqueue || enqueueWithBlitAuxTranslation)) {
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
if (isCacheFlushForBcsRequired()) {
// Cache flush for aux translation is always required (if supported)
if ((blitEnqueue && isGpgpuSubmissionForBcsRequired()) || (enqueueWithBlitAuxTranslation)) {
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
}
}
if (blitEnqueue && !blockQueue && getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()) {
@ -344,6 +347,8 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
if (eventBuilder.getEvent()) {
eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
}
this->latestSentEnqueueType = enqueueProperties.operation;
}
updateFromCompletionStamp(completionStamp);
@ -485,21 +490,22 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(const Mu
auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
blitProperties.outputTimestampPacket = currentTimestampPacketNode;
if (isCacheFlushForBcsRequired()) {
auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]);
PipeControlArgs args(true);
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
commandStream,
GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
cacheFlushTimestampPacketGpuAddress,
0,
device->getHardwareInfo(),
args);
if (isGpgpuSubmissionForBcsRequired()) {
if (isCacheFlushForBcsRequired()) {
auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]);
PipeControlArgs args(true);
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
commandStream,
GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
cacheFlushTimestampPacketGpuAddress,
0,
device->getHardwareInfo(),
args);
}
TimestampPacketHelper::programSemaphoreWithImplicitDependency<GfxFamily>(commandStream, *currentTimestampPacketNode,
getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices());
}
TimestampPacketHelper::programSemaphoreWithImplicitDependency<GfxFamily>(commandStream, *currentTimestampPacketNode,
getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices());
return blitProperties;
}
@ -947,60 +953,70 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
EventBuilder &eventBuilder,
uint32_t taskLevel) {
if (timestampPacketContainer) {
timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
CompletionStamp completionStamp = {this->taskCount, this->taskLevel, this->flushStamp->peekStamp()};
bool flushGpgpuCsr = true;
if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && !isGpgpuSubmissionForBcsRequired()) {
flushGpgpuCsr = false;
}
for (auto surface : CreateRange(surfaces, surfaceCount)) {
surface->makeResident(getGpgpuCommandStreamReceiver());
}
if (flushGpgpuCsr) {
if (timestampPacketContainer) {
timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
}
TimeStampData submitTimeStamp;
if (eventBuilder.getEvent() && isProfilingEnabled() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
eventBuilder.getEvent()->getTimestampPacketNodes()->makeResident(getGpgpuCommandStreamReceiver());
}
for (auto surface : CreateRange(surfaces, surfaceCount)) {
surface->makeResident(getGpgpuCommandStreamReceiver());
}
DispatchFlags dispatchFlags(
{}, //csrDependencies
&timestampPacketDependencies.barrierNodes, //barrierTimestampPacketNodes
{}, //pipelineSelectArgs
flushStamp->getStampReference(), //flushStampReference
getThrottle(), //throttle
device->getPreemptionMode(), //preemptionMode
GrfConfig::DefaultGrfNumber, //numGrfRequired
L3CachingSettings::l3CacheOn, //l3CacheSettings
ThreadArbitrationPolicy::NotPresent, //threadArbitrationPolicy
getSliceCount(), //sliceCount
blocking, //blocking
false, //dcFlush
false, //useSLM
true, //guardCommandBufferWithPipeControl
false, //GSBA32BitRequired
false, //requiresCoherency
false, //lowPriority
(enqueueProperties.operation == EnqueueProperties::Operation::Blit), //implicitFlush
getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed
false, //epilogueRequired
false //usePerDssBackedBuffer
);
TimeStampData submitTimeStamp;
if (eventBuilder.getEvent() && isProfilingEnabled() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
eventBuilder.getEvent()->getTimestampPacketNodes()->makeResident(getGpgpuCommandStreamReceiver());
}
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
DispatchFlags dispatchFlags(
{}, //csrDependencies
&timestampPacketDependencies.barrierNodes, //barrierTimestampPacketNodes
{}, //pipelineSelectArgs
flushStamp->getStampReference(), //flushStampReference
getThrottle(), //throttle
device->getPreemptionMode(), //preemptionMode
GrfConfig::DefaultGrfNumber, //numGrfRequired
L3CachingSettings::l3CacheOn, //l3CacheSettings
ThreadArbitrationPolicy::NotPresent, //threadArbitrationPolicy
getSliceCount(), //sliceCount
blocking, //blocking
false, //dcFlush
false, //useSLM
true, //guardCommandBufferWithPipeControl
false, //GSBA32BitRequired
false, //requiresCoherency
false, //lowPriority
(enqueueProperties.operation == EnqueueProperties::Operation::Blit), //implicitFlush
getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed
false, //epilogueRequired
false //usePerDssBackedBuffer
);
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
}
completionStamp = getGpgpuCommandStreamReceiver().flushTask(
commandStream,
commandStreamStart,
getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
taskLevel,
dispatchFlags,
getDevice());
}
CompletionStamp completionStamp = getGpgpuCommandStreamReceiver().flushTask(
commandStream,
commandStreamStart,
getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
taskLevel,
dispatchFlags,
getDevice());
if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) {
UNRECOVERABLE_IF(!enqueueProperties.blitPropertiesContainer);

View File

@ -12,6 +12,7 @@ namespace NEO {
struct EnqueueProperties {
enum class Operation {
None,
Blit,
ExplicitCacheFlush,
EnqueueWithoutSubmission,

View File

@ -118,9 +118,9 @@ struct BlitEnqueueTests : public ::testing::Test {
}
template <typename Family>
GenCmdList getCmdList(LinearStream &linearStream) {
GenCmdList getCmdList(LinearStream &linearStream, size_t offset) {
HardwareParse hwParser;
hwParser.parseCommands<Family>(linearStream);
hwParser.parseCommands<Family>(linearStream, offset);
return hwParser.cmdList;
}
@ -225,8 +225,8 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenConstruct
// Gpgpu command buffer
{
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0));
auto cmdListQueue = getCmdList<FamilyType>(commandQueue->getCS(0));
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0), 0);
auto cmdListQueue = getCmdList<FamilyType>(commandQueue->getCS(0), 0);
// Barrier
expectPipeControl<FamilyType>(cmdListCsr.begin(), cmdListCsr.end());
@ -247,7 +247,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenConstruct
// BCS command buffer
{
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
// Barrier
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdList.begin(), cmdList.end());
@ -298,9 +298,9 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenConstruct
// Gpgpu command buffer
{
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0));
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0), 0);
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(gpgpuCsr);
auto cmdListQueue = getCmdList<FamilyType>(*ultCsr->lastFlushedCommandStream);
auto cmdListQueue = getCmdList<FamilyType>(*ultCsr->lastFlushedCommandStream, 0);
// Barrier
expectPipeControl<FamilyType>(cmdListCsr.begin(), cmdListCsr.end());
@ -321,7 +321,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenConstruct
// BCS command buffer
{
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
// Barrier
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdList.begin(), cmdList.end());
@ -357,7 +357,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0));
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0), 0);
auto pipeControl = expectPipeControl<FamilyType>(cmdListCsr.begin(), cmdListCsr.end());
auto pipeControlCmd = genCmdCast<PIPE_CONTROL *>(*pipeControl);
@ -365,7 +365,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
uint64_t high = pipeControlCmd->getAddressHigh();
uint64_t barrierGpuAddress = (high << 32) | low;
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
auto semaphore = expectCommand<MI_SEMAPHORE_WAIT>(cmdList.begin(), cmdList.end());
verifySemaphore<FamilyType>(semaphore, barrierGpuAddress);
}
@ -385,7 +385,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
uint64_t auxToNonAuxOutputAddress[2] = {};
uint64_t nonAuxToAuxOutputAddress[2] = {};
{
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0));
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
auto cmdFound = expectCommand<XY_COPY_BLT>(cmdListBcs.begin(), cmdListBcs.end());
@ -409,7 +409,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
}
{
auto cmdListQueue = getCmdList<FamilyType>(commandQueue->getCS(0));
auto cmdListQueue = getCmdList<FamilyType>(commandQueue->getCS(0), 0);
// Aux to NonAux
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdListQueue.begin(), cmdListQueue.end());
@ -446,7 +446,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
auto kernelNode = mockCmdQ->timestampPacketContainer->peekNodes()[0];
auto kernelNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*kernelNode);
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
// Aux to nonAux
auto cmdFound = expectCommand<XY_COPY_BLT>(cmdList.begin(), cmdList.end());
@ -470,8 +470,8 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = true;
mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0));
auto cmdListQueue = getCmdList<FamilyType>(mockCmdQ->getCS(0));
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
auto cmdListQueue = getCmdList<FamilyType>(mockCmdQ->getCS(0), 0);
uint64_t cacheFlushWriteAddress = 0;
@ -521,7 +521,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
auto eventDependencyAddress = TimestampPacketHelper::getContextEndGpuAddress(*eventDependency);
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
// Barrier
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdList.begin(), cmdList.end());
@ -550,7 +550,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenOutEventWhenDispatchingThenAssi
auto &eventNodes = event->getTimestampPacketNodes()->peekNodes();
EXPECT_EQ(3u, eventNodes.size());
auto cmdListQueue = getCmdList<FamilyType>(commandQueue->getCS(0));
auto cmdListQueue = getCmdList<FamilyType>(commandQueue->getCS(0), 0);
auto cmdFound = expectCommand<WALKER_TYPE>(cmdListQueue.begin(), cmdListQueue.end());
@ -657,7 +657,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 1, waitlist, nullptr);
userEvent.setStatus(CL_COMPLETE);
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0));
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0), 0);
auto pipeControl = expectPipeControl<FamilyType>(cmdListCsr.begin(), cmdListCsr.end());
auto pipeControlCmd = genCmdCast<PIPE_CONTROL *>(*pipeControl);
@ -665,7 +665,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
uint64_t high = pipeControlCmd->getAddressHigh();
uint64_t barrierGpuAddress = (high << 32) | low;
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
auto semaphore = expectCommand<MI_SEMAPHORE_WAIT>(cmdList.begin(), cmdList.end());
verifySemaphore<FamilyType>(semaphore, barrierGpuAddress);
@ -691,7 +691,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
auto eventDependencyAddress = TimestampPacketHelper::getContextEndGpuAddress(*eventDependency);
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
// Barrier
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdList.begin(), cmdList.end());
@ -723,7 +723,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
auto kernelNode = mockCmdQ->timestampPacketContainer->peekNodes()[0];
auto kernelNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*kernelNode);
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
// Aux to nonAux
auto cmdFound = expectCommand<XY_COPY_BLT>(cmdList.begin(), cmdList.end());
@ -756,7 +756,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
uint64_t auxToNonAuxOutputAddress[2] = {};
uint64_t nonAuxToAuxOutputAddress[2] = {};
{
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0));
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
auto cmdFound = expectCommand<XY_COPY_BLT>(cmdListBcs.begin(), cmdListBcs.end());
@ -781,7 +781,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
{
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(gpgpuCsr);
auto cmdListQueue = getCmdList<FamilyType>(*ultCsr->lastFlushedCommandStream);
auto cmdListQueue = getCmdList<FamilyType>(*ultCsr->lastFlushedCommandStream, 0);
// Aux to NonAux
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdListQueue.begin(), cmdListQueue.end());
@ -837,8 +837,8 @@ HWTEST_TEMPLATED_F(BlitEnqueueWithNoTimestampPacketTests, givenNoTimestampPacket
commandQueue->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, bufferSize, cpuBuffer, nullptr, 0, nullptr, nullptr);
commandQueue->finish();
auto bcsCommands = getCmdList<FamilyType>(bcsCsr->getCS(0));
auto ccsCommands = getCmdList<FamilyType>(commandQueue->getCS(0));
auto bcsCommands = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
auto ccsCommands = getCmdList<FamilyType>(commandQueue->getCS(0), 0);
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(bcsCommands.begin(), bcsCommands.end());
@ -1024,4 +1024,86 @@ HWTEST_TEMPLATED_F(BlitEnqueueFlushTests, givenDebugFlagSetWhenCheckingBcsCacheF
DebugManager.flags.ForceCacheFlushForBcs.set(1);
EXPECT_TRUE(mockCommandQueue->isCacheFlushForBcsRequired());
}
// Fixture alias for tests that run with ForceGpgpuSubmissionForBcsEnqueue set to 0.
using BlitEnqueueWithDisabledGpgpuSubmissionTests = BlitEnqueueTests<1>;
// Verifies that, with ForceGpgpuSubmissionForBcsEnqueue = 0, a blit enqueue bumps the gpgpu CSR
// task count only when the latest sent enqueue was a gpgpu kernel; back-to-back blits
// (or a blit as the very first enqueue) leave the gpgpu task count unchanged.
HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenDebugFlagSetWhenDoingBcsCopyThenSubmitToGpgpuOnlyIfPreviousEnqueueWasGpgpu) {
DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(0);
auto mockCommandQueue = static_cast<MockCommandQueueHw<FamilyType> *>(commandQueue.get());
// A fresh queue has not sent anything yet.
EXPECT_EQ(EnqueueProperties::Operation::None, mockCommandQueue->latestSentEnqueueType);
auto buffer = createBuffer(1, false);
// NOTE(review): presumably forces the write to go through the blitter instead of a CPU copy - confirm.
buffer->forceDisallowCPUCopy = true;
int hostPtr = 0;
// First blit: latest type was None, so the gpgpu task count stays at 0.
commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType);
EXPECT_EQ(0u, gpgpuCsr->peekTaskCount());
// Second blit: latest type was Blit, gpgpu task count still 0.
commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType);
EXPECT_EQ(0u, gpgpuCsr->peekTaskCount());
// Kernel enqueue: recorded as GpuKernel and counted on the gpgpu CSR.
commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(EnqueueProperties::Operation::GpuKernel, mockCommandQueue->latestSentEnqueueType);
EXPECT_EQ(1u, gpgpuCsr->peekTaskCount());
// Blit right after a kernel: the gpgpu task count increases to 2.
commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType);
EXPECT_EQ(2u, gpgpuCsr->peekTaskCount());
// Blit after a blit again: the gpgpu task count stays at 2.
commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType);
EXPECT_EQ(2u, gpgpuCsr->peekTaskCount());
}
// Verifies that, with ForceGpgpuSubmissionForBcsEnqueue = 0 and the cache-flush-for-BCS override
// forced to true, a blit enqueued right after a kernel is counted on the gpgpu CSR, the queue
// command stream gets a PIPE_CONTROL with DC flush + CS stall and a non-zero post-sync address,
// and the BCS command stream contains an MI_SEMAPHORE_WAIT and an XY_COPY_BLT.
HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenDebugFlagSetWhenDoingBcsCopyThatRequiresCacheFlushThenSubmitToGpgpu) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT;
DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(0);
auto mockCommandQueue = static_cast<MockCommandQueueHw<FamilyType> *>(commandQueue.get());
// Make the mock queue report that a cache flush is required for BCS.
mockCommandQueue->overrideIsCacheFlushForBcsRequired.enabled = true;
mockCommandQueue->overrideIsCacheFlushForBcsRequired.returnValue = true;
auto buffer = createBuffer(1, false);
// NOTE(review): presumably forces the write to go through the blitter instead of a CPU copy - confirm.
buffer->forceDisallowCPUCopy = true;
int hostPtr = 0;
// enqueue kernel to force gpgpu submission on write buffer
commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(1u, gpgpuCsr->peekTaskCount());
// Remember how much of the queue stream the kernel used, so that parsing below starts at the blit's commands.
auto offset = mockCommandQueue->getCS(0).getUsed();
commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(2u, gpgpuCsr->peekTaskCount());
// BCS stream is parsed from the beginning; the queue stream only from the saved offset.
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
auto cmdListQueue = getCmdList<FamilyType>(mockCommandQueue->getCS(0), offset);
uint64_t cacheFlushWriteAddress = 0;
{
// Cache-flush PIPE_CONTROL on the queue stream; its 64-bit address is rebuilt from the low/high parts.
auto cmdFound = expectPipeControl<FamilyType>(cmdListQueue.begin(), cmdListQueue.end());
auto pipeControlCmd = genCmdCast<PIPE_CONTROL *>(*cmdFound);
EXPECT_TRUE(pipeControlCmd->getDcFlushEnable());
EXPECT_TRUE(pipeControlCmd->getCommandStreamerStallEnable());
uint64_t low = pipeControlCmd->getAddress();
uint64_t high = pipeControlCmd->getAddressHigh();
cacheFlushWriteAddress = (high << 32) | low;
EXPECT_NE(0u, cacheFlushWriteAddress);
}
{
// NOTE(review): verifySemaphore presumably checks that the semaphore waits on the cache flush address - confirm against the helper.
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdListBcs.begin(), cmdListBcs.end());
verifySemaphore<FamilyType>(cmdFound, cacheFlushWriteAddress);
// The copy itself must also be present in the BCS stream.
cmdFound = expectCommand<XY_COPY_BLT>(cmdListBcs.begin(), cmdListBcs.end());
EXPECT_NE(cmdListBcs.end(), cmdFound);
}
}
} // namespace NEO

View File

@ -193,6 +193,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
using BaseClass::commandQueueProperties;
using BaseClass::commandStream;
using BaseClass::gpgpuEngine;
using BaseClass::latestSentEnqueueType;
using BaseClass::obtainCommandStream;
using BaseClass::obtainNewTimestampPacketNodes;
using BaseClass::requiresCacheFlushAfterWalker;

View File

@ -171,4 +171,5 @@ EnableCrossDeviceAccess = -1
PauseOnBlitCopy = -1
ForceImplicitFlush = 0
OverrideRevision = -1
ForceCacheFlushForBcs = -1
ForceCacheFlushForBcs = -1
ForceGpgpuSubmissionForBcsEnqueue = -1

View File

@ -58,6 +58,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverridePreemptionSurfaceSizeInMb, -1, "-1: defa
DECLARE_DEBUG_VARIABLE(int32_t, OverrideLeastOccupiedBank, -1, "-1: default, >=0 Override least occupied bank with value")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideRevision, -1, "-1: default, >=0: Revision id")
DECLARE_DEBUG_VARIABLE(int32_t, ForceCacheFlushForBcs, -1, "Force cache flush from gpgpu engine before dispatching BCS copy. -1: default, 1: enabled, 0: disabled")
DECLARE_DEBUG_VARIABLE(int32_t, ForceGpgpuSubmissionForBcsEnqueue, -1, "-1: Default, 1: Submit gpgpu command buffer with cache flushing and completion synchronization, 0: Do nothing, if possible")
DECLARE_DEBUG_VARIABLE(bool, EnableDebugBreak, true, "Enable DEBUG_BREAKs")
DECLARE_DEBUG_VARIABLE(bool, FlushAllCaches, false, "pipe controls between enqueues flush all possible caches")
DECLARE_DEBUG_VARIABLE(bool, MakeEachEnqueueBlocking, false, "equivalent of finish after each enqueue")