diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 7e64e56e87..754f6cf909 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -115,7 +115,9 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( size_t linearStreamSizeEstimate = this->estimateLinearStreamSizeInitial(ctx); - this->handleScratchSpaceAndUpdateGSBAStateDirtyFlag(ctx); + if (this->heaplessModeEnabled == false) { + this->handleScratchSpaceAndUpdateGSBAStateDirtyFlag(ctx); + } this->setFrontEndStateProperties(ctx); linearStreamSizeEstimate += this->estimateLinearStreamSizeComplementary(ctx, commandListHandles, numCommandLists); diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index bd21414faa..cc575f2848 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -11,6 +11,7 @@ #include "shared/source/command_stream/command_stream_receiver.h" #include "shared/source/command_stream/linear_stream.h" #include "shared/source/command_stream/preemption.h" +#include "shared/source/command_stream/scratch_space_controller.h" #include "shared/source/command_stream/stream_properties.h" #include "shared/source/debug_settings/debug_settings_manager.h" #include "shared/source/execution_environment/root_device_environment.h" @@ -320,12 +321,37 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } if constexpr (heaplessModeEnabled) { + auto requiredScratchSlot0Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[0]; + auto requiredScratchSlot1Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[1]; + uint64_t scratchAddress = 0; + if (requiredScratchSlot0Size > 0 || requiredScratchSlot1Size > 0) { + auto csr = 
args.device->getDefaultEngine().commandStreamReceiver; + auto scratchController = csr->getScratchSpaceController(); + bool gsbaState = false; + bool frontEndState = false; + auto ssh = container.getIndirectHeap(HeapType::surfaceState); + scratchController->setRequiredScratchSpace(ssh->getCpuBase(), 0, requiredScratchSlot0Size, requiredScratchSlot1Size, + csr->peekTaskCount(), csr->getOsContext(), gsbaState, frontEndState); + + if (scratchController->getScratchSpaceSlot0Allocation()) { + csr->makeResident(*scratchController->getScratchSpaceSlot0Allocation()); + } + if (scratchController->getScratchSpaceSlot1Allocation()) { + csr->makeResident(*scratchController->getScratchSpaceSlot1Allocation()); + } + + scratchAddress = ssh->getGpuBase() + scratchController->getScratchPatchAddress(); + } + auto inlineDataPointer = reinterpret_cast<char *>(walkerCmd.getInlineDataPointer()); auto indirectDataPointerAddress = kernelDescriptor.payloadMappings.implicitArgs.indirectDataPointerAddress; auto heap = container.getIndirectHeap(HeapType::indirectObject); auto address = heap->getHeapGpuBase() + offsetThreadData; std::memcpy(inlineDataPointer + indirectDataPointerAddress.offset, &address, indirectDataPointerAddress.pointerSize); + auto scratchPointerAddress = kernelDescriptor.payloadMappings.implicitArgs.scratchPointerAddress; + std::memcpy(inlineDataPointer + scratchPointerAddress.offset, &scratchAddress, scratchPointerAddress.pointerSize); + } else { walkerCmd.setIndirectDataStartAddress(static_cast<uint32_t>(offsetThreadData)); walkerCmd.setIndirectDataLength(sizeThreadData);