Add missing cache flush

Resolves: NEO-6505

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2021-12-06 10:01:46 +00:00
committed by Compute-Runtime-Automation
parent 0346a5679f
commit 09d2ffb9ed
11 changed files with 85 additions and 45 deletions

View File

@ -54,7 +54,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommandListImm
false, //useSingleSubdevice false, //useSingleSubdevice
false, //useGlobalAtomics false, //useGlobalAtomics
this->device->getNEODevice()->getNumGenericSubDevices() > 1, //areMultipleSubDevicesInContext this->device->getNEODevice()->getNumGenericSubDevices() > 1, //areMultipleSubDevicesInContext
false //memoryMigrationRequired false, //memoryMigrationRequired
false //textureCacheFlush
); );
this->commandContainer.removeDuplicatesFromResidencyContainer(); this->commandContainer.removeDuplicatesFromResidencyContainer();

View File

@ -685,6 +685,10 @@ uint32_t CommandQueue::peekBcsTaskCount(aub_stream::EngineType bcsEngineType) co
return state.taskCount; return state.taskCount;
} }
// Reports whether the given command type needs the texture cache flush flag.
// True only for CL_COMMAND_COPY_IMAGE, and only while the GPGPU command
// stream receiver reports direct submission as enabled.
bool CommandQueue::isTextureCacheFlushNeeded(uint32_t commandType) const {
    if (commandType != CL_COMMAND_COPY_IMAGE) {
        // Any other command type: the CSR is not queried at all.
        return false;
    }
    return getGpgpuCommandStreamReceiver().isDirectSubmissionEnabled();
}
IndirectHeap &CommandQueue::getIndirectHeap(IndirectHeap::Type heapType, size_t minRequiredSize) { IndirectHeap &CommandQueue::getIndirectHeap(IndirectHeap::Type heapType, size_t minRequiredSize) {
return getGpgpuCommandStreamReceiver().getIndirectHeap(heapType, minRequiredSize); return getGpgpuCommandStreamReceiver().getIndirectHeap(heapType, minRequiredSize);
} }

View File

@ -45,14 +45,6 @@ enum class QueuePriority {
HIGH HIGH
}; };
// Decides whether an enqueue should request a DC flush.
// Returns true when commandType is READ_BUFFER, READ_BUFFER_RECT, READ_IMAGE
// or SVM_MAP, or when a non-null printfHandler is supplied.
inline bool shouldFlushDC(uint32_t commandType, PrintfHandler *printfHandler) {
    const bool isFlushingCommand = commandType == CL_COMMAND_READ_BUFFER ||
                                   commandType == CL_COMMAND_READ_BUFFER_RECT ||
                                   commandType == CL_COMMAND_READ_IMAGE ||
                                   commandType == CL_COMMAND_SVM_MAP;
    return isFlushingCommand || printfHandler != nullptr;
}
template <> template <>
struct OpenCLObjectMapper<_cl_command_queue> { struct OpenCLObjectMapper<_cl_command_queue> {
typedef class CommandQueue DerivedType; typedef class CommandQueue DerivedType;
@ -371,6 +363,17 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
void providePerformanceHint(TransferProperties &transferProperties); void providePerformanceHint(TransferProperties &transferProperties);
bool queueDependenciesClearRequired() const; bool queueDependenciesClearRequired() const;
bool blitEnqueueAllowed(const CsrSelectionArgs &args) const; bool blitEnqueueAllowed(const CsrSelectionArgs &args) const;
bool isTextureCacheFlushNeeded(uint32_t commandType) const;
// Decides whether an enqueue should request a DC flush.
// Returns true when commandType is READ_BUFFER, READ_BUFFER_RECT, READ_IMAGE
// or SVM_MAP, when a non-null printfHandler is supplied, or when
// isTextureCacheFlushNeeded(commandType) reports true.
inline bool shouldFlushDC(uint32_t commandType, PrintfHandler *printfHandler) const {
    switch (commandType) {
    case CL_COMMAND_READ_BUFFER:
    case CL_COMMAND_READ_BUFFER_RECT:
    case CL_COMMAND_READ_IMAGE:
    case CL_COMMAND_SVM_MAP:
        return true;
    default:
        break;
    }
    // The texture-cache query runs only when nothing above already decided.
    return printfHandler != nullptr || isTextureCacheFlushNeeded(commandType);
}
MOCKABLE_VIRTUAL bool blitEnqueueImageAllowed(const size_t *origin, const size_t *region, const Image &image) const; MOCKABLE_VIRTUAL bool blitEnqueueImageAllowed(const size_t *origin, const size_t *region, const Image &image) const;
void aubCaptureHook(bool &blocking, bool &clearAllDependencies, const MultiDispatchInfo &multiDispatchInfo); void aubCaptureHook(bool &blocking, bool &clearAllDependencies, const MultiDispatchInfo &multiDispatchInfo);
virtual bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const = 0; virtual bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const = 0;

View File

@ -892,8 +892,8 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
kernel->isSingleSubdevicePreferred(), //useSingleSubdevice kernel->isSingleSubdevicePreferred(), //useSingleSubdevice
useGlobalAtomics, //useGlobalAtomics useGlobalAtomics, //useGlobalAtomics
kernel->areMultipleSubDevicesInContext(), //areMultipleSubDevicesInContext kernel->areMultipleSubDevicesInContext(), //areMultipleSubDevicesInContext
kernel->requiresMemoryMigration() //memoryMigrationRequired kernel->requiresMemoryMigration(), //memoryMigrationRequired
); isTextureCacheFlushNeeded(commandType)); //textureCacheFlush
dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired; dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired;
dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = specialPipelineSelectMode; dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = specialPipelineSelectMode;
@ -1116,7 +1116,8 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
false, //useSingleSubdevice false, //useSingleSubdevice
false, //useGlobalAtomics false, //useGlobalAtomics
context->containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext context->containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext
false); //memoryMigrationRequired false, //memoryMigrationRequired
false); //textureCacheFlush
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr); eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr);

View File

@ -79,7 +79,8 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
false, //useSingleSubdevice false, //useSingleSubdevice
false, //useGlobalAtomics false, //useGlobalAtomics
false, //areMultipleSubDevicesInContext false, //areMultipleSubDevicesInContext
false); //memoryMigrationRequired false, //memoryMigrationRequired
false); //textureCacheFlush
DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady); DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady);
@ -246,7 +247,8 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
kernel->isSingleSubdevicePreferred(), //useSingleSubdevice kernel->isSingleSubdevicePreferred(), //useSingleSubdevice
kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, //useGlobalAtomics kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, //useGlobalAtomics
kernel->areMultipleSubDevicesInContext(), //areMultipleSubDevicesInContext kernel->areMultipleSubDevicesInContext(), //areMultipleSubDevicesInContext
kernel->requiresMemoryMigration()); //memoryMigrationRequired kernel->requiresMemoryMigration(), //memoryMigrationRequired
false); //textureCacheFlush
if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);
@ -388,7 +390,8 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
false, //useSingleSubdevice false, //useSingleSubdevice
false, //useGlobalAtomics false, //useGlobalAtomics
commandQueue.getContext().containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext commandQueue.getContext().containsMultipleSubDevices(rootDeviceIndex), //areMultipleSubDevicesInContext
false); //memoryMigrationRequired false, //memoryMigrationRequired
false); //textureCacheFlush
if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { if (commandQueue.getContext().getRootDeviceIndices().size() > 1) {
eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver);

View File

@ -379,6 +379,29 @@ HWTEST_F(CommandQueueCommandStreamTest, givenCommandQueueThatWaitsOnAbortedUserE
EXPECT_EQ(100u, cmdQ.taskLevel); EXPECT_EQ(100u, cmdQ.taskLevel);
} }
// Checks isTextureCacheFlushNeeded for a range of command types with the
// mock CSR's directSubmissionAvailable flag toggled on and off: only
// CL_COMMAND_COPY_IMAGE with the flag set is expected to return true.
HWTEST_F(CommandQueueCommandStreamTest, WhenCheckIsTextureCacheFlushNeededThenReturnProperValue) {
    MockContext context;
    auto mockDevice = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(nullptr));
    MockCommandQueue cmdQ(&context, mockDevice.get(), 0, false);
    auto &commandStreamReceiver = mockDevice->getUltCommandStreamReceiver<FamilyType>();

    // A command type outside the loop range below, checked with the CSR flag left untouched.
    EXPECT_FALSE(cmdQ.isTextureCacheFlushNeeded(CL_COMMAND_COPY_BUFFER_RECT));

    for (auto commandType = CL_COMMAND_NDRANGE_KERNEL; commandType < CL_COMMAND_RELEASE_GL_OBJECTS; commandType++) {
        const bool isCopyImage = (commandType == CL_COMMAND_COPY_IMAGE);

        // Flag set: only the image copy is expected to need the flush.
        commandStreamReceiver.directSubmissionAvailable = true;
        EXPECT_EQ(isCopyImage, cmdQ.isTextureCacheFlushNeeded(commandType));

        // Flag cleared: no command type is expected to need it.
        commandStreamReceiver.directSubmissionAvailable = false;
        EXPECT_FALSE(cmdQ.isTextureCacheFlushNeeded(commandType));
    }
}
TEST_F(CommandQueueCommandStreamTest, GivenValidCommandQueueWhenGettingCommandStreamThenValidObjectIsReturned) { TEST_F(CommandQueueCommandStreamTest, GivenValidCommandQueueWhenGettingCommandStreamThenValidObjectIsReturned) {
const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0}; const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, 0, 0};
MockCommandQueue commandQueue(context.get(), pClDevice, props, false); MockCommandQueue commandQueue(context.get(), pClDevice, props, false);

View File

@ -83,6 +83,6 @@ struct ComputeModeRequirements : public ::testing::Test {
CommandStreamReceiver *csr = nullptr; CommandStreamReceiver *csr = nullptr;
std::unique_ptr<MockDevice> device; std::unique_ptr<MockDevice> device;
DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, AdditionalKernelExecInfo::NotApplicable, KernelExecutionType::NotApplicable, MemoryCompressionState::NotApplicable, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false}; DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, AdditionalKernelExecInfo::NotApplicable, KernelExecutionType::NotApplicable, MemoryCompressionState::NotApplicable, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false};
GraphicsAllocation *alloc = nullptr; GraphicsAllocation *alloc = nullptr;
}; };

View File

@ -26,6 +26,7 @@ class MockCommandQueue : public CommandQueue {
using CommandQueue::device; using CommandQueue::device;
using CommandQueue::gpgpuEngine; using CommandQueue::gpgpuEngine;
using CommandQueue::isCopyOnly; using CommandQueue::isCopyOnly;
using CommandQueue::isTextureCacheFlushNeeded;
using CommandQueue::obtainNewTimestampPacketNodes; using CommandQueue::obtainNewTimestampPacketNodes;
using CommandQueue::overrideEngine; using CommandQueue::overrideEngine;
using CommandQueue::queueCapabilities; using CommandQueue::queueCapabilities;

View File

@ -217,6 +217,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
PipeControlArgs args(dispatchFlags.dcFlush); PipeControlArgs args(dispatchFlags.dcFlush);
args.notifyEnable = isUsedNotifyEnableForPostSync(); args.notifyEnable = isUsedNotifyEnableForPostSync();
args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired; args.tlbInvalidation |= dispatchFlags.memoryMigrationRequired;
args.textureCacheInvalidationEnable |= dispatchFlags.textureCacheFlush;
args.workloadPartitionOffset = this->activePartitions > 1 && this->staticWorkPartitioningEnabled; args.workloadPartitionOffset = this->activePartitions > 1 && this->staticWorkPartitioningEnabled;
MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation( MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
commandStreamTask, commandStreamTask,

View File

@ -55,34 +55,35 @@ struct DispatchFlags {
KernelExecutionType kernelExecutionTypeP, MemoryCompressionState memoryCompressionStateP, KernelExecutionType kernelExecutionTypeP, MemoryCompressionState memoryCompressionStateP,
uint64_t sliceCountP, bool blockingP, bool dcFlushP, bool useSLMP, bool guardCommandBufferWithPipeControlP, bool gsba32BitRequiredP, uint64_t sliceCountP, bool blockingP, bool dcFlushP, bool useSLMP, bool guardCommandBufferWithPipeControlP, bool gsba32BitRequiredP,
bool requiresCoherencyP, bool lowPriorityP, bool implicitFlushP, bool outOfOrderExecutionAllowedP, bool epilogueRequiredP, bool requiresCoherencyP, bool lowPriorityP, bool implicitFlushP, bool outOfOrderExecutionAllowedP, bool epilogueRequiredP,
bool usePerDSSbackedBufferP, bool useSingleSubdeviceP, bool useGlobalAtomicsP, bool areMultipleSubDevicesInContextP, bool memoryMigrationRequiredP) : csrDependencies(csrDependenciesP), bool usePerDSSbackedBufferP, bool useSingleSubdeviceP, bool useGlobalAtomicsP, bool areMultipleSubDevicesInContextP, bool memoryMigrationRequiredP, bool textureCacheFlush) : csrDependencies(csrDependenciesP),
barrierTimestampPacketNodes(barrierTimestampPacketNodesP), barrierTimestampPacketNodes(barrierTimestampPacketNodesP),
pipelineSelectArgs(pipelineSelectArgsP), pipelineSelectArgs(pipelineSelectArgsP),
flushStampReference(flushStampReferenceP), flushStampReference(flushStampReferenceP),
throttle(throttleP), throttle(throttleP),
preemptionMode(preemptionModeP), preemptionMode(preemptionModeP),
numGrfRequired(numGrfRequiredP), numGrfRequired(numGrfRequiredP),
l3CacheSettings(l3CacheSettingsP), l3CacheSettings(l3CacheSettingsP),
threadArbitrationPolicy(threadArbitrationPolicyP), threadArbitrationPolicy(threadArbitrationPolicyP),
additionalKernelExecInfo(additionalKernelExecInfoP), additionalKernelExecInfo(additionalKernelExecInfoP),
kernelExecutionType(kernelExecutionTypeP), kernelExecutionType(kernelExecutionTypeP),
memoryCompressionState(memoryCompressionStateP), memoryCompressionState(memoryCompressionStateP),
sliceCount(sliceCountP), sliceCount(sliceCountP),
blocking(blockingP), blocking(blockingP),
dcFlush(dcFlushP), dcFlush(dcFlushP),
useSLM(useSLMP), useSLM(useSLMP),
guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP), guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP),
gsba32BitRequired(gsba32BitRequiredP), gsba32BitRequired(gsba32BitRequiredP),
requiresCoherency(requiresCoherencyP), requiresCoherency(requiresCoherencyP),
lowPriority(lowPriorityP), lowPriority(lowPriorityP),
implicitFlush(implicitFlushP), implicitFlush(implicitFlushP),
outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP), outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP),
epilogueRequired(epilogueRequiredP), epilogueRequired(epilogueRequiredP),
usePerDssBackedBuffer(usePerDSSbackedBufferP), usePerDssBackedBuffer(usePerDSSbackedBufferP),
useSingleSubdevice(useSingleSubdeviceP), useSingleSubdevice(useSingleSubdeviceP),
useGlobalAtomics(useGlobalAtomicsP), useGlobalAtomics(useGlobalAtomicsP),
areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP), areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP),
memoryMigrationRequired(memoryMigrationRequiredP){}; memoryMigrationRequired(memoryMigrationRequiredP),
textureCacheFlush(textureCacheFlush){};
CsrDependencies csrDependencies; CsrDependencies csrDependencies;
TimestampPacketContainer *barrierTimestampPacketNodes = nullptr; TimestampPacketContainer *barrierTimestampPacketNodes = nullptr;
@ -113,6 +114,7 @@ struct DispatchFlags {
bool useGlobalAtomics = false; bool useGlobalAtomics = false;
bool areMultipleSubDevicesInContext = false; bool areMultipleSubDevicesInContext = false;
bool memoryMigrationRequired = false; bool memoryMigrationRequired = false;
bool textureCacheFlush = false;
}; };
struct CsrSizeRequestFlags { struct CsrSizeRequestFlags {

View File

@ -41,7 +41,8 @@ struct DispatchFlagsHelper {
false, //useSingleSubdevice false, //useSingleSubdevice
false, //useGlobalAtomics false, //useGlobalAtomics
false, //areMultipleSubDevicesInContext false, //areMultipleSubDevicesInContext
false //memoryMigrationRequired false, //memoryMigrationRequired
false //textureCacheFlush
); );
} }
}; };