mirror of
https://github.com/intel/compute-runtime.git
synced 2025-09-10 12:53:42 +08:00
Optimize BCS flushing scheme [1/n]
Change-Id: Ia192d24196e46fc281c401c241044f3429c16693 Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:

committed by
sys_ocldev

parent
c5f3fe5987
commit
c1dc8a8c3c
@ -686,4 +686,12 @@ void CommandQueue::aubCaptureHook(bool &blocking, bool &clearAllDependencies, co
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool CommandQueue::isGpgpuSubmissionForBcsRequired() const {
|
||||
if (DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.get() == 0) {
|
||||
return (latestSentEnqueueType != EnqueueProperties::Operation::Blit) && (latestSentEnqueueType != EnqueueProperties::Operation::None);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
} // namespace NEO
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "opencl/source/event/event.h"
|
||||
#include "opencl/source/helpers/base_object.h"
|
||||
#include "opencl/source/helpers/dispatch_info.h"
|
||||
#include "opencl/source/helpers/enqueue_properties.h"
|
||||
#include "opencl/source/helpers/task_information.h"
|
||||
|
||||
#include <atomic>
|
||||
@ -336,6 +337,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
bool blitEnqueueAllowed(cl_command_type cmdType) const;
|
||||
void aubCaptureHook(bool &blocking, bool &clearAllDependencies, const MultiDispatchInfo &multiDispatchInfo);
|
||||
virtual bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const = 0;
|
||||
bool isGpgpuSubmissionForBcsRequired() const;
|
||||
|
||||
Context *context = nullptr;
|
||||
ClDevice *device = nullptr;
|
||||
@ -347,6 +349,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
|
||||
QueuePriority priority = QueuePriority::MEDIUM;
|
||||
QueueThrottle throttle = QueueThrottle::MEDIUM;
|
||||
EnqueueProperties::Operation latestSentEnqueueType = EnqueueProperties::Operation::None;
|
||||
uint64_t sliceCount = QueueSliceCount::defaultSliceCount;
|
||||
uint32_t bcsTaskCount = 0;
|
||||
|
||||
|
@ -213,8 +213,11 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo);
|
||||
}
|
||||
|
||||
if (isCacheFlushForBcsRequired() && (blitEnqueue || enqueueWithBlitAuxTranslation)) {
|
||||
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
|
||||
if (isCacheFlushForBcsRequired()) {
|
||||
// Cache flush for aux translation is always required (if supported)
|
||||
if ((blitEnqueue && isGpgpuSubmissionForBcsRequired()) || (enqueueWithBlitAuxTranslation)) {
|
||||
timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag());
|
||||
}
|
||||
}
|
||||
|
||||
if (blitEnqueue && !blockQueue && getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()) {
|
||||
@ -344,6 +347,8 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
if (eventBuilder.getEvent()) {
|
||||
eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
|
||||
}
|
||||
|
||||
this->latestSentEnqueueType = enqueueProperties.operation;
|
||||
}
|
||||
updateFromCompletionStamp(completionStamp);
|
||||
|
||||
@ -485,21 +490,22 @@ BlitProperties CommandQueueHw<GfxFamily>::processDispatchForBlitEnqueue(const Mu
|
||||
auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0);
|
||||
blitProperties.outputTimestampPacket = currentTimestampPacketNode;
|
||||
|
||||
if (isCacheFlushForBcsRequired()) {
|
||||
auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]);
|
||||
PipeControlArgs args(true);
|
||||
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
|
||||
commandStream,
|
||||
GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
|
||||
cacheFlushTimestampPacketGpuAddress,
|
||||
0,
|
||||
device->getHardwareInfo(),
|
||||
args);
|
||||
if (isGpgpuSubmissionForBcsRequired()) {
|
||||
if (isCacheFlushForBcsRequired()) {
|
||||
auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketDependencies.cacheFlushNodes.peekNodes()[0]);
|
||||
PipeControlArgs args(true);
|
||||
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
|
||||
commandStream,
|
||||
GfxFamily::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
|
||||
cacheFlushTimestampPacketGpuAddress,
|
||||
0,
|
||||
device->getHardwareInfo(),
|
||||
args);
|
||||
}
|
||||
|
||||
TimestampPacketHelper::programSemaphoreWithImplicitDependency<GfxFamily>(commandStream, *currentTimestampPacketNode,
|
||||
getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices());
|
||||
}
|
||||
|
||||
TimestampPacketHelper::programSemaphoreWithImplicitDependency<GfxFamily>(commandStream, *currentTimestampPacketNode,
|
||||
getGpgpuCommandStreamReceiver().getOsContext().getNumSupportedDevices());
|
||||
|
||||
return blitProperties;
|
||||
}
|
||||
|
||||
@ -947,60 +953,70 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
|
||||
EventBuilder &eventBuilder,
|
||||
uint32_t taskLevel) {
|
||||
|
||||
if (timestampPacketContainer) {
|
||||
timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
|
||||
timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
|
||||
timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
|
||||
CompletionStamp completionStamp = {this->taskCount, this->taskLevel, this->flushStamp->peekStamp()};
|
||||
bool flushGpgpuCsr = true;
|
||||
|
||||
if ((enqueueProperties.operation == EnqueueProperties::Operation::Blit) && !isGpgpuSubmissionForBcsRequired()) {
|
||||
flushGpgpuCsr = false;
|
||||
}
|
||||
|
||||
for (auto surface : CreateRange(surfaces, surfaceCount)) {
|
||||
surface->makeResident(getGpgpuCommandStreamReceiver());
|
||||
}
|
||||
if (flushGpgpuCsr) {
|
||||
if (timestampPacketContainer) {
|
||||
timestampPacketContainer->makeResident(getGpgpuCommandStreamReceiver());
|
||||
timestampPacketDependencies.previousEnqueueNodes.makeResident(getGpgpuCommandStreamReceiver());
|
||||
timestampPacketDependencies.cacheFlushNodes.makeResident(getGpgpuCommandStreamReceiver());
|
||||
}
|
||||
|
||||
TimeStampData submitTimeStamp;
|
||||
if (eventBuilder.getEvent() && isProfilingEnabled() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
|
||||
eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
|
||||
eventBuilder.getEvent()->getTimestampPacketNodes()->makeResident(getGpgpuCommandStreamReceiver());
|
||||
}
|
||||
for (auto surface : CreateRange(surfaces, surfaceCount)) {
|
||||
surface->makeResident(getGpgpuCommandStreamReceiver());
|
||||
}
|
||||
|
||||
DispatchFlags dispatchFlags(
|
||||
{}, //csrDependencies
|
||||
&timestampPacketDependencies.barrierNodes, //barrierTimestampPacketNodes
|
||||
{}, //pipelineSelectArgs
|
||||
flushStamp->getStampReference(), //flushStampReference
|
||||
getThrottle(), //throttle
|
||||
device->getPreemptionMode(), //preemptionMode
|
||||
GrfConfig::DefaultGrfNumber, //numGrfRequired
|
||||
L3CachingSettings::l3CacheOn, //l3CacheSettings
|
||||
ThreadArbitrationPolicy::NotPresent, //threadArbitrationPolicy
|
||||
getSliceCount(), //sliceCount
|
||||
blocking, //blocking
|
||||
false, //dcFlush
|
||||
false, //useSLM
|
||||
true, //guardCommandBufferWithPipeControl
|
||||
false, //GSBA32BitRequired
|
||||
false, //requiresCoherency
|
||||
false, //lowPriority
|
||||
(enqueueProperties.operation == EnqueueProperties::Operation::Blit), //implicitFlush
|
||||
getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed
|
||||
false, //epilogueRequired
|
||||
false //usePerDssBackedBuffer
|
||||
);
|
||||
TimeStampData submitTimeStamp;
|
||||
if (eventBuilder.getEvent() && isProfilingEnabled() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp);
|
||||
eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp);
|
||||
eventBuilder.getEvent()->getTimestampPacketNodes()->makeResident(getGpgpuCommandStreamReceiver());
|
||||
}
|
||||
|
||||
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
|
||||
dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
|
||||
DispatchFlags dispatchFlags(
|
||||
{}, //csrDependencies
|
||||
&timestampPacketDependencies.barrierNodes, //barrierTimestampPacketNodes
|
||||
{}, //pipelineSelectArgs
|
||||
flushStamp->getStampReference(), //flushStampReference
|
||||
getThrottle(), //throttle
|
||||
device->getPreemptionMode(), //preemptionMode
|
||||
GrfConfig::DefaultGrfNumber, //numGrfRequired
|
||||
L3CachingSettings::l3CacheOn, //l3CacheSettings
|
||||
ThreadArbitrationPolicy::NotPresent, //threadArbitrationPolicy
|
||||
getSliceCount(), //sliceCount
|
||||
blocking, //blocking
|
||||
false, //dcFlush
|
||||
false, //useSLM
|
||||
true, //guardCommandBufferWithPipeControl
|
||||
false, //GSBA32BitRequired
|
||||
false, //requiresCoherency
|
||||
false, //lowPriority
|
||||
(enqueueProperties.operation == EnqueueProperties::Operation::Blit), //implicitFlush
|
||||
getGpgpuCommandStreamReceiver().isNTo1SubmissionModelEnabled(), //outOfOrderExecutionAllowed
|
||||
false, //epilogueRequired
|
||||
false //usePerDssBackedBuffer
|
||||
);
|
||||
|
||||
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
eventsRequest.fillCsrDependencies(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);
|
||||
dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver());
|
||||
}
|
||||
|
||||
completionStamp = getGpgpuCommandStreamReceiver().flushTask(
|
||||
commandStream,
|
||||
commandStreamStart,
|
||||
getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
|
||||
getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
|
||||
getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
|
||||
taskLevel,
|
||||
dispatchFlags,
|
||||
getDevice());
|
||||
}
|
||||
CompletionStamp completionStamp = getGpgpuCommandStreamReceiver().flushTask(
|
||||
commandStream,
|
||||
commandStreamStart,
|
||||
getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
|
||||
getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
|
||||
getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
|
||||
taskLevel,
|
||||
dispatchFlags,
|
||||
getDevice());
|
||||
|
||||
if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) {
|
||||
UNRECOVERABLE_IF(!enqueueProperties.blitPropertiesContainer);
|
||||
|
@ -12,6 +12,7 @@ namespace NEO {
|
||||
|
||||
struct EnqueueProperties {
|
||||
enum class Operation {
|
||||
None,
|
||||
Blit,
|
||||
ExplicitCacheFlush,
|
||||
EnqueueWithoutSubmission,
|
||||
|
@ -118,9 +118,9 @@ struct BlitEnqueueTests : public ::testing::Test {
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
GenCmdList getCmdList(LinearStream &linearStream) {
|
||||
GenCmdList getCmdList(LinearStream &linearStream, size_t offset) {
|
||||
HardwareParse hwParser;
|
||||
hwParser.parseCommands<Family>(linearStream);
|
||||
hwParser.parseCommands<Family>(linearStream, offset);
|
||||
|
||||
return hwParser.cmdList;
|
||||
}
|
||||
@ -225,8 +225,8 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenConstruct
|
||||
|
||||
// Gpgpu command buffer
|
||||
{
|
||||
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0));
|
||||
auto cmdListQueue = getCmdList<FamilyType>(commandQueue->getCS(0));
|
||||
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0), 0);
|
||||
auto cmdListQueue = getCmdList<FamilyType>(commandQueue->getCS(0), 0);
|
||||
|
||||
// Barrier
|
||||
expectPipeControl<FamilyType>(cmdListCsr.begin(), cmdListCsr.end());
|
||||
@ -247,7 +247,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenConstruct
|
||||
|
||||
// BCS command buffer
|
||||
{
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
|
||||
|
||||
// Barrier
|
||||
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdList.begin(), cmdList.end());
|
||||
@ -298,9 +298,9 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenConstruct
|
||||
|
||||
// Gpgpu command buffer
|
||||
{
|
||||
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0));
|
||||
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0), 0);
|
||||
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(gpgpuCsr);
|
||||
auto cmdListQueue = getCmdList<FamilyType>(*ultCsr->lastFlushedCommandStream);
|
||||
auto cmdListQueue = getCmdList<FamilyType>(*ultCsr->lastFlushedCommandStream, 0);
|
||||
|
||||
// Barrier
|
||||
expectPipeControl<FamilyType>(cmdListCsr.begin(), cmdListCsr.end());
|
||||
@ -321,7 +321,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenConstruct
|
||||
|
||||
// BCS command buffer
|
||||
{
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
|
||||
|
||||
// Barrier
|
||||
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdList.begin(), cmdList.end());
|
||||
@ -357,7 +357,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
|
||||
commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
|
||||
|
||||
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0));
|
||||
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0), 0);
|
||||
auto pipeControl = expectPipeControl<FamilyType>(cmdListCsr.begin(), cmdListCsr.end());
|
||||
auto pipeControlCmd = genCmdCast<PIPE_CONTROL *>(*pipeControl);
|
||||
|
||||
@ -365,7 +365,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
uint64_t high = pipeControlCmd->getAddressHigh();
|
||||
uint64_t barrierGpuAddress = (high << 32) | low;
|
||||
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
|
||||
auto semaphore = expectCommand<MI_SEMAPHORE_WAIT>(cmdList.begin(), cmdList.end());
|
||||
verifySemaphore<FamilyType>(semaphore, barrierGpuAddress);
|
||||
}
|
||||
@ -385,7 +385,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
uint64_t auxToNonAuxOutputAddress[2] = {};
|
||||
uint64_t nonAuxToAuxOutputAddress[2] = {};
|
||||
{
|
||||
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0));
|
||||
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
|
||||
|
||||
auto cmdFound = expectCommand<XY_COPY_BLT>(cmdListBcs.begin(), cmdListBcs.end());
|
||||
|
||||
@ -409,7 +409,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
}
|
||||
|
||||
{
|
||||
auto cmdListQueue = getCmdList<FamilyType>(commandQueue->getCS(0));
|
||||
auto cmdListQueue = getCmdList<FamilyType>(commandQueue->getCS(0), 0);
|
||||
|
||||
// Aux to NonAux
|
||||
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdListQueue.begin(), cmdListQueue.end());
|
||||
@ -446,7 +446,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
auto kernelNode = mockCmdQ->timestampPacketContainer->peekNodes()[0];
|
||||
auto kernelNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*kernelNode);
|
||||
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
|
||||
|
||||
// Aux to nonAux
|
||||
auto cmdFound = expectCommand<XY_COPY_BLT>(cmdList.begin(), cmdList.end());
|
||||
@ -470,8 +470,8 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = true;
|
||||
mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
|
||||
|
||||
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0));
|
||||
auto cmdListQueue = getCmdList<FamilyType>(mockCmdQ->getCS(0));
|
||||
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
|
||||
auto cmdListQueue = getCmdList<FamilyType>(mockCmdQ->getCS(0), 0);
|
||||
|
||||
uint64_t cacheFlushWriteAddress = 0;
|
||||
|
||||
@ -521,7 +521,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
|
||||
auto eventDependencyAddress = TimestampPacketHelper::getContextEndGpuAddress(*eventDependency);
|
||||
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
|
||||
|
||||
// Barrier
|
||||
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdList.begin(), cmdList.end());
|
||||
@ -550,7 +550,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenOutEventWhenDispatchingThenAssi
|
||||
auto &eventNodes = event->getTimestampPacketNodes()->peekNodes();
|
||||
EXPECT_EQ(3u, eventNodes.size());
|
||||
|
||||
auto cmdListQueue = getCmdList<FamilyType>(commandQueue->getCS(0));
|
||||
auto cmdListQueue = getCmdList<FamilyType>(commandQueue->getCS(0), 0);
|
||||
|
||||
auto cmdFound = expectCommand<WALKER_TYPE>(cmdListQueue.begin(), cmdListQueue.end());
|
||||
|
||||
@ -657,7 +657,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 1, waitlist, nullptr);
|
||||
userEvent.setStatus(CL_COMPLETE);
|
||||
|
||||
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0));
|
||||
auto cmdListCsr = getCmdList<FamilyType>(gpgpuCsr->getCS(0), 0);
|
||||
auto pipeControl = expectPipeControl<FamilyType>(cmdListCsr.begin(), cmdListCsr.end());
|
||||
auto pipeControlCmd = genCmdCast<PIPE_CONTROL *>(*pipeControl);
|
||||
|
||||
@ -665,7 +665,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
uint64_t high = pipeControlCmd->getAddressHigh();
|
||||
uint64_t barrierGpuAddress = (high << 32) | low;
|
||||
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
|
||||
auto semaphore = expectCommand<MI_SEMAPHORE_WAIT>(cmdList.begin(), cmdList.end());
|
||||
verifySemaphore<FamilyType>(semaphore, barrierGpuAddress);
|
||||
|
||||
@ -691,7 +691,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
|
||||
auto eventDependencyAddress = TimestampPacketHelper::getContextEndGpuAddress(*eventDependency);
|
||||
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
|
||||
|
||||
// Barrier
|
||||
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdList.begin(), cmdList.end());
|
||||
@ -723,7 +723,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
auto kernelNode = mockCmdQ->timestampPacketContainer->peekNodes()[0];
|
||||
auto kernelNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*kernelNode);
|
||||
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0));
|
||||
auto cmdList = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
|
||||
|
||||
// Aux to nonAux
|
||||
auto cmdFound = expectCommand<XY_COPY_BLT>(cmdList.begin(), cmdList.end());
|
||||
@ -756,7 +756,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
uint64_t auxToNonAuxOutputAddress[2] = {};
|
||||
uint64_t nonAuxToAuxOutputAddress[2] = {};
|
||||
{
|
||||
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0));
|
||||
auto cmdListBcs = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
|
||||
|
||||
auto cmdFound = expectCommand<XY_COPY_BLT>(cmdListBcs.begin(), cmdListBcs.end());
|
||||
|
||||
@ -781,7 +781,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructing
|
||||
|
||||
{
|
||||
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(gpgpuCsr);
|
||||
auto cmdListQueue = getCmdList<FamilyType>(*ultCsr->lastFlushedCommandStream);
|
||||
auto cmdListQueue = getCmdList<FamilyType>(*ultCsr->lastFlushedCommandStream, 0);
|
||||
|
||||
// Aux to NonAux
|
||||
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(cmdListQueue.begin(), cmdListQueue.end());
|
||||
@ -837,8 +837,8 @@ HWTEST_TEMPLATED_F(BlitEnqueueWithNoTimestampPacketTests, givenNoTimestampPacket
|
||||
commandQueue->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, bufferSize, cpuBuffer, nullptr, 0, nullptr, nullptr);
|
||||
commandQueue->finish();
|
||||
|
||||
auto bcsCommands = getCmdList<FamilyType>(bcsCsr->getCS(0));
|
||||
auto ccsCommands = getCmdList<FamilyType>(commandQueue->getCS(0));
|
||||
auto bcsCommands = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
|
||||
auto ccsCommands = getCmdList<FamilyType>(commandQueue->getCS(0), 0);
|
||||
|
||||
auto cmdFound = expectCommand<MI_SEMAPHORE_WAIT>(bcsCommands.begin(), bcsCommands.end());
|
||||
|
||||
@ -1024,4 +1024,86 @@ HWTEST_TEMPLATED_F(BlitEnqueueFlushTests, givenDebugFlagSetWhenCheckingBcsCacheF
|
||||
DebugManager.flags.ForceCacheFlushForBcs.set(1);
|
||||
EXPECT_TRUE(mockCommandQueue->isCacheFlushForBcsRequired());
|
||||
}
|
||||
|
||||
using BlitEnqueueWithDisabledGpgpuSubmissionTests = BlitEnqueueTests<1>;
|
||||
|
||||
// With ForceGpgpuSubmissionForBcsEnqueue set to 0, a blit enqueue is expected to bump the
// gpgpu CSR task count only when the enqueue sent right before it was not a blit.
HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenDebugFlagSetWhenDoingBcsCopyThenSubmitToGpgpuOnlyIfPreviousEnqueueWasGpgpu) {
    DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(0);

    auto mockQueue = static_cast<MockCommandQueueHw<FamilyType> *>(commandQueue.get());
    EXPECT_EQ(EnqueueProperties::Operation::None, mockQueue->latestSentEnqueueType);

    int hostData = 0;
    auto dstBuffer = createBuffer(1, false);
    dstBuffer->forceDisallowCPUCopy = true;

    // Issues one non-blocking write and checks both the recorded enqueue type
    // and the resulting gpgpu task count.
    auto writeBufferAndVerify = [&](uint32_t expectedGpgpuTaskCount) {
        commandQueue->enqueueWriteBuffer(dstBuffer.get(), false, 0, 1, &hostData, nullptr, 0, nullptr, nullptr);
        EXPECT_EQ(EnqueueProperties::Operation::Blit, mockQueue->latestSentEnqueueType);
        EXPECT_EQ(expectedGpgpuTaskCount, gpgpuCsr->peekTaskCount());
    };

    // Nothing was sent before, then blit follows blit: gpgpu task count stays at 0.
    writeBufferAndVerify(0u);
    writeBufferAndVerify(0u);

    commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
    EXPECT_EQ(EnqueueProperties::Operation::GpuKernel, mockQueue->latestSentEnqueueType);
    EXPECT_EQ(1u, gpgpuCsr->peekTaskCount());

    // First blit after the kernel raises the gpgpu task count, the following one does not.
    writeBufferAndVerify(2u);
    writeBufferAndVerify(2u);
}
|
||||
|
||||
// With ForceGpgpuSubmissionForBcsEnqueue set to 0 and the cache flush for BCS forced on,
// a blit that follows a kernel enqueue is expected to bump the gpgpu task count, to place
// a flushing PIPE_CONTROL in the queue's command stream, and to make the BCS stream wait
// on the address that PIPE_CONTROL writes to.
HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenDebugFlagSetWhenDoingBcsCopyThatRequiresCacheFlushThenSubmitToGpgpu) {
    using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;

    DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(0);

    auto mockQueue = static_cast<MockCommandQueueHw<FamilyType> *>(commandQueue.get());
    mockQueue->overrideIsCacheFlushForBcsRequired.enabled = true;
    mockQueue->overrideIsCacheFlushForBcsRequired.returnValue = true;

    int hostData = 0;
    auto dstBuffer = createBuffer(1, false);
    dstBuffer->forceDisallowCPUCopy = true;

    // enqueue kernel to force gpgpu submission on write buffer
    commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
    EXPECT_EQ(1u, gpgpuCsr->peekTaskCount());

    // Remember where the kernel's commands end, so only the blit's commands get parsed below.
    auto queueStreamOffset = mockQueue->getCS(0).getUsed();

    commandQueue->enqueueWriteBuffer(dstBuffer.get(), false, 0, 1, &hostData, nullptr, 0, nullptr, nullptr);
    EXPECT_EQ(2u, gpgpuCsr->peekTaskCount());

    auto bcsCommands = getCmdList<FamilyType>(bcsCsr->getCS(0), 0);
    auto queueCommands = getCmdList<FamilyType>(mockQueue->getCS(0), queueStreamOffset);

    // Queue stream: stalling PIPE_CONTROL with DC flush that writes to a non-zero address.
    auto pipeControlIt = expectPipeControl<FamilyType>(queueCommands.begin(), queueCommands.end());
    auto pipeControl = genCmdCast<PIPE_CONTROL *>(*pipeControlIt);

    EXPECT_TRUE(pipeControl->getDcFlushEnable());
    EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());

    uint64_t addressLow = pipeControl->getAddress();
    uint64_t addressHigh = pipeControl->getAddressHigh();
    uint64_t cacheFlushWriteAddress = (addressHigh << 32) | addressLow;
    EXPECT_NE(0u, cacheFlushWriteAddress);

    // BCS stream: semaphore on the cache flush address, and the copy itself is present.
    auto semaphoreIt = expectCommand<MI_SEMAPHORE_WAIT>(bcsCommands.begin(), bcsCommands.end());
    verifySemaphore<FamilyType>(semaphoreIt, cacheFlushWriteAddress);

    auto blitIt = expectCommand<XY_COPY_BLT>(bcsCommands.begin(), bcsCommands.end());
    EXPECT_NE(bcsCommands.end(), blitIt);
}
|
||||
|
||||
} // namespace NEO
|
||||
|
@ -193,6 +193,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
|
||||
using BaseClass::commandQueueProperties;
|
||||
using BaseClass::commandStream;
|
||||
using BaseClass::gpgpuEngine;
|
||||
using BaseClass::latestSentEnqueueType;
|
||||
using BaseClass::obtainCommandStream;
|
||||
using BaseClass::obtainNewTimestampPacketNodes;
|
||||
using BaseClass::requiresCacheFlushAfterWalker;
|
||||
|
@ -171,4 +171,5 @@ EnableCrossDeviceAccess = -1
|
||||
PauseOnBlitCopy = -1
|
||||
ForceImplicitFlush = 0
|
||||
OverrideRevision = -1
|
||||
ForceCacheFlushForBcs = -1
|
||||
ForceCacheFlushForBcs = -1
|
||||
ForceGpgpuSubmissionForBcsEnqueue = -1
|
@ -58,6 +58,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverridePreemptionSurfaceSizeInMb, -1, "-1: defa
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, OverrideLeastOccupiedBank, -1, "-1: default, >=0 Override least occupied bank with value")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, OverrideRevision, -1, "-1: default, >=0: Revision id")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceCacheFlushForBcs, -1, "Force cache flush from gpgpu engine before dispatching BCS copy. -1: default, 1: enabled, 0: disabled")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceGpgpuSubmissionForBcsEnqueue, -1, "-1: Default, 1: Submit gpgpu command buffer with cache flushing and completion synchronization, 0: Do nothing, if possible")
|
||||
DECLARE_DEBUG_VARIABLE(bool, EnableDebugBreak, true, "Enable DEBUG_BREAKs")
|
||||
DECLARE_DEBUG_VARIABLE(bool, FlushAllCaches, false, "pipe controls between enqueues flush all possible caches")
|
||||
DECLARE_DEBUG_VARIABLE(bool, MakeEachEnqueueBlocking, false, "equivalent of finish after each enqueue")
|
||||
|
Reference in New Issue
Block a user