feature: add heapless and global stateless scratch address patching

Related-To: NEO-10381

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2024-04-02 22:53:24 +00:00
committed by Compute-Runtime-Automation
parent 78885ae1fe
commit 73d558058c
27 changed files with 318 additions and 24 deletions

View File

@@ -73,6 +73,7 @@ struct EncodeDispatchKernelArgs {
bool dcFlushEnable = false;
bool isHeaplessModeEnabled = false;
bool interruptEvent = false;
bool immediateScratchAddressPatching = false;
bool requiresSystemMemoryFence() const {
return (isHostScopeSignalEvent && isKernelUsingSystemAllocation);

View File

@@ -332,15 +332,22 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
auto address = heap->getHeapGpuBase() + offsetThreadData;
std::memcpy(inlineDataPointer + indirectDataPointerAddress.offset, &address, indirectDataPointerAddress.pointerSize);
auto requiredScratchSlot0Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
auto requiredScratchSlot1Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[1];
auto csr = args.device->getDefaultEngine().commandStreamReceiver;
auto ssh = container.getIndirectHeap(HeapType::surfaceState);
if (args.immediateScratchAddressPatching) {
auto requiredScratchSlot0Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
auto requiredScratchSlot1Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[1];
auto csr = args.device->getDefaultEngine().commandStreamReceiver;
NEO::IndirectHeap *ssh = nullptr;
if (csr->getGlobalStatelessHeapAllocation() != nullptr) {
ssh = csr->getGlobalStatelessHeap();
} else {
ssh = args.surfaceStateHeap ? args.surfaceStateHeap : container.getIndirectHeap(HeapType::surfaceState);
}
uint64_t scratchAddress = 0u;
EncodeDispatchKernel<Family>::template setScratchAddress<heaplessModeEnabled>(scratchAddress, requiredScratchSlot0Size, requiredScratchSlot1Size, ssh, *csr);
auto scratchPointerAddress = kernelDescriptor.payloadMappings.implicitArgs.scratchPointerAddress;
std::memcpy(inlineDataPointer + scratchPointerAddress.offset, &scratchAddress, scratchPointerAddress.pointerSize);
uint64_t scratchAddress = 0u;
EncodeDispatchKernel<Family>::template setScratchAddress<heaplessModeEnabled>(scratchAddress, requiredScratchSlot0Size, requiredScratchSlot1Size, ssh, *csr);
auto scratchPointerAddress = kernelDescriptor.payloadMappings.implicitArgs.scratchPointerAddress;
std::memcpy(inlineDataPointer + scratchPointerAddress.offset, &scratchAddress, scratchPointerAddress.pointerSize);
}
} else {
walkerCmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData));
walkerCmd.setIndirectDataLength(sizeThreadData);

View File

@@ -87,7 +87,7 @@ if(SUPPORT_DG2_AND_LATER)
)
endif()
if(NOT SUPPORT_HEAPLESS)
if(NOT SUPPORTED_HEAPLESS)
list(APPEND NEO_CORE_COMMAND_STREAM
${CMAKE_CURRENT_SOURCE_DIR}/command_stream_receiver_hw_heap_addressing.inl
)

View File

@@ -303,7 +303,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushImmediateTask(
flushData.stateComputeModeFullConfigurationNeeded = getStateComputeModeDirty();
flushData.stateBaseAddressFullConfigurationNeeded = getGSBAStateDirty();
if (dispatchFlags.sshCpuBase != nullptr && (this->requiredScratchSlot0Size > 0 || this->requiredScratchSlot1Size > 0)) {
if (!this->heaplessModeEnabled && dispatchFlags.sshCpuBase != nullptr && (this->requiredScratchSlot0Size > 0 || this->requiredScratchSlot1Size > 0)) {
bool checkFeStateDirty = false;
bool checkSbaStateDirty = false;
scratchSpaceController->setRequiredScratchSpace(dispatchFlags.sshCpuBase,

View File

@@ -100,6 +100,7 @@ struct UnitTestHelper {
static bool findStateCacheFlushPipeControl(LinearStream &csrStream);
static void verifyDummyBlitWa(const RootDeviceEnvironment *rootDeviceEnvironment, GenCmdList::iterator &cmdIterator);
static GenCmdList::iterator findWalkerCmd(GenCmdList::iterator begin, GenCmdList::iterator end, bool heapless);
};
} // namespace NEO

View File

@@ -154,4 +154,9 @@ template <typename GfxFamily>
void UnitTestHelper<GfxFamily>::verifyDummyBlitWa(const RootDeviceEnvironment *rootDeviceEnvironment, GenCmdList::iterator &cmdIterator) {
}
// Searches the parsed command range [begin, end) for a GPGPU_WALKER command and
// returns whatever find<> yields for that range.
// The heapless argument is not consulted by this variant.
template <typename GfxFamily>
GenCmdList::iterator UnitTestHelper<GfxFamily>::findWalkerCmd(GenCmdList::iterator begin, GenCmdList::iterator end, bool heapless) {
    using WalkerType = typename GfxFamily::GPGPU_WALKER;
    return find<WalkerType *>(begin, end);
}
} // namespace NEO

View File

@@ -138,4 +138,9 @@ void UnitTestHelper<GfxFamily>::verifyDummyBlitWa(const RootDeviceEnvironment *r
}
}
// Searches the parsed command range [begin, end) for a COMPUTE_WALKER command and
// returns whatever find<> yields for that range.
// The heapless argument is not consulted by this variant.
template <typename GfxFamily>
GenCmdList::iterator UnitTestHelper<GfxFamily>::findWalkerCmd(GenCmdList::iterator begin, GenCmdList::iterator end, bool heapless) {
    using WalkerType = typename GfxFamily::COMPUTE_WALKER;
    return find<WalkerType *>(begin, end);
}
} // namespace NEO

View File

@@ -88,7 +88,6 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
using BaseClass::sshState;
using BaseClass::staticWorkPartitioningEnabled;
using BaseClass::streamProperties;
using BaseClass::wasSubmittedToSingleSubdevice;
using BaseClass::CommandStreamReceiver::activePartitions;
using BaseClass::CommandStreamReceiver::activePartitionsConfig;
@@ -115,6 +114,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
using BaseClass::CommandStreamReceiver::globalFenceAllocation;
using BaseClass::CommandStreamReceiver::gpuHangCheckPeriod;
using BaseClass::CommandStreamReceiver::gsbaFor32BitProgrammed;
using BaseClass::CommandStreamReceiver::heaplessModeEnabled;
using BaseClass::CommandStreamReceiver::immWritePostSyncWriteOffset;
using BaseClass::CommandStreamReceiver::initDirectSubmission;
using BaseClass::CommandStreamReceiver::internalAllocationStorage;

View File

@@ -5095,3 +5095,30 @@ HWTEST_F(CommandStreamReceiverHwHeaplessTest, whenHeaplessCommandStreamReceiverF
EXPECT_ANY_THROW(csr->flushImmediateTaskStateless(commandStream, 0, csr->recordedImmediateDispatchFlags, *pDevice));
EXPECT_ANY_THROW(csr->handleImmediateFlushStatelessAllocationsResidency(0, commandStream));
}
// Checks that, with heaplessModeEnabled set on the CSR, a flushImmediateTask issued after
// non-zero scratch sizes were requested produces no CFE_STATE command in the CSR stream
// and leaves the slot 0 scratch allocation unset.
HWTEST2_F(CommandStreamReceiverHwTest,
          givenImmediateFlushTaskInHeaplessModeWhenNextDispatchRequiresScratchSpaceThenNoScratchIsAllocated,
          IsAtLeastXeHpCore) {
    using CFE_STATE = typename FamilyType::CFE_STATE;

    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
    csr.storeMakeResidentAllocations = true;
    csr.heaplessModeEnabled = true;

    // First flush happens before any scratch size is requested.
    csr.flushImmediateTask(commandStream, commandStream.getUsed(), immediateFlushTaskFlags, *pDevice);

    // Request scratch (0x100, 0) and remember the current end of the CSR stream so that
    // only the commands appended by the second flush are parsed below.
    csr.setRequiredScratchSizes(0x100, 0);
    const size_t csrStreamOffset = csr.commandStream.getUsed();

    csr.flushImmediateTask(commandStream, commandStream.getUsed(), immediateFlushTaskFlags, *pDevice);

    HardwareParse parser;
    parser.parseCommands<FamilyType>(csr.commandStream, csrStreamOffset);

    auto cfeStateCmd = parser.getCommand<CFE_STATE>();
    ASSERT_EQ(nullptr, cfeStateCmd);
    EXPECT_EQ(nullptr, csr.getScratchSpaceController()->getScratchSpaceSlot0Allocation());
}

View File

@@ -68,6 +68,7 @@ EncodeDispatchKernelArgs CommandEncodeStatesFixture::createDefaultDispatchKernel
false, // dcFlushEnable
false, // isHeaplessModeEnabled
false, // interruptEvent
false, // immediateScratchAddressPatching
};
return args;