mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-05 09:09:04 +08:00
Program additional VFE_STATE instructions when needed
Additional VFE_STATEs may be programmed when appending kernels to a command list and when the command list is executed. Related-To: NEO-4940, NEO-4574 Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
9b12dc4390
commit
3a2281bf77
@@ -16,6 +16,8 @@
|
||||
#include <level_zero/ze_api.h>
|
||||
#include <level_zero/zet_api.h>
|
||||
|
||||
#include "stream_properties.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
struct _ze_command_list_handle_t {};
|
||||
@@ -37,6 +39,17 @@ struct CommandList : _ze_command_list_handle_t {
|
||||
static CommandList *allocate(uint32_t numIddsPerBlock) { return new Type(numIddsPerBlock); }
|
||||
};
|
||||
|
||||
// Describes a single command that a command list has queued for patching.
// Entries are plain aggregates; the member order (destination, command, type)
// is relied upon by brace-initialization at the call sites.
struct CommandToPatch {
    // Discriminates what pCommand points to.
    enum CommandType {
        FrontEndState, // pCommand points to a front-end (VFE) state command
        Invalid        // default for an entry that has not been filled in
    };
    // Location the command is meant to end up at (space reserved in a command stream).
    void *pDestination = nullptr;
    // Type-erased command payload; the concrete type is selected by `type`.
    void *pCommand = nullptr;
    CommandType type = Invalid;
};
|
||||
using CommandsToPatch = StackVec<CommandToPatch, 1>;
|
||||
|
||||
virtual ze_result_t close() = 0;
|
||||
virtual ze_result_t destroy() = 0;
|
||||
virtual ze_result_t appendEventReset(ze_event_handle_t hEvent) = 0;
|
||||
@@ -200,6 +213,16 @@ struct CommandList : _ze_command_list_handle_t {
|
||||
|
||||
virtual ze_result_t setSyncModeQueue(bool syncMode) = 0;
|
||||
|
||||
// Read-only access to requiredStreamState, the stream properties this command
// list recorded as required (reset to a default-constructed state by reset()).
const NEO::StreamProperties &getRequiredStreamState() {
    return requiredStreamState;
}
|
||||
// Read-only access to finalStreamState, the stream properties this command
// list recorded as being in effect after its last tracked update.
const NEO::StreamProperties &getFinalStreamState() {
    return finalStreamState;
}
|
||||
// Read-only access to the commands this command list has queued for patching.
// The list keeps ownership of the queued entries.
const CommandsToPatch &getCommandsToPatch() {
    return commandsToPatch;
}
|
||||
|
||||
protected:
|
||||
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
|
||||
uint32_t commandListPerThreadScratchSize = 0u;
|
||||
@@ -211,6 +234,10 @@ struct CommandList : _ze_command_list_handle_t {
|
||||
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);
|
||||
NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize);
|
||||
bool containsStatelessUncachedResource = false;
|
||||
|
||||
NEO::StreamProperties requiredStreamState{};
|
||||
NEO::StreamProperties finalStreamState{};
|
||||
CommandsToPatch commandsToPatch{};
|
||||
};
|
||||
|
||||
using CommandListAllocatorFn = CommandList *(*)(uint32_t);
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "level_zero/core/source/cmdlist/cmdlist_imp.h"
|
||||
|
||||
#include "igfxfmid.h"
|
||||
#include "stream_properties.h"
|
||||
|
||||
namespace NEO {
|
||||
enum class ImageType;
|
||||
@@ -42,6 +43,7 @@ struct CommandListCoreFamily : CommandListImp {
|
||||
using CommandListImp::CommandListImp;
|
||||
ze_result_t initialize(Device *device, NEO::EngineGroupType engineGroupType) override;
|
||||
virtual void programL3(bool isSLMused);
|
||||
~CommandListCoreFamily() override;
|
||||
|
||||
ze_result_t close() override;
|
||||
ze_result_t appendEventReset(ze_event_handle_t hEvent) override;
|
||||
@@ -211,6 +213,8 @@ struct CommandListCoreFamily : CommandListImp {
|
||||
bool isCooperative);
|
||||
ze_result_t appendLaunchKernelSplit(ze_kernel_handle_t hKernel, const ze_group_count_t *pThreadGroupDimensions, ze_event_handle_t hEvent);
|
||||
ze_result_t prepareIndirectParams(const ze_group_count_t *pThreadGroupDimensions);
|
||||
void updateStreamProperties(Kernel &kernel);
|
||||
void clearCommandsToPatch();
|
||||
|
||||
void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes,
|
||||
const void **pRanges);
|
||||
@@ -228,6 +232,8 @@ struct CommandListCoreFamily : CommandListImp {
|
||||
uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region);
|
||||
MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize);
|
||||
ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
|
||||
|
||||
bool containsAnyKernel = false;
|
||||
};
|
||||
|
||||
template <PRODUCT_FAMILY gfxProductFamily>
|
||||
|
||||
@@ -54,6 +54,11 @@ inline ze_result_t parseErrorCode(NEO::ErrorCode returnValue) {
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
CommandListCoreFamily<gfxCoreFamily>::~CommandListCoreFamily() {
|
||||
clearCommandsToPatch();
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
void CommandListCoreFamily<gfxCoreFamily>::programThreadArbitrationPolicy(Device *device) {
|
||||
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
|
||||
@@ -77,6 +82,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
|
||||
unifiedMemoryControls.indirectSharedAllocationsAllowed = false;
|
||||
commandListPreemptionMode = device->getDevicePreemptionMode();
|
||||
commandListPerThreadScratchSize = 0u;
|
||||
requiredStreamState = {};
|
||||
finalStreamState = requiredStreamState;
|
||||
containsAnyKernel = false;
|
||||
clearCommandsToPatch();
|
||||
|
||||
if (!isCopyOnly()) {
|
||||
if (!NEO::ApiSpecificConfig::getBindlessConfiguration()) {
|
||||
@@ -1828,6 +1837,47 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::prepareIndirectParams(const ze
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
void CommandListCoreFamily<gfxCoreFamily>::updateStreamProperties(Kernel &kernel) {
|
||||
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
|
||||
using VFE_STATE_TYPE = typename GfxFamily::VFE_STATE_TYPE;
|
||||
|
||||
if (!containsAnyKernel) {
|
||||
requiredStreamState.setCooperativeKernelProperties(kernel.usesSyncBuffer(), device->getHwInfo());
|
||||
finalStreamState = requiredStreamState;
|
||||
containsAnyKernel = true;
|
||||
return;
|
||||
}
|
||||
|
||||
auto &hwInfo = device->getHwInfo();
|
||||
auto programVfe = finalStreamState.setCooperativeKernelProperties(kernel.usesSyncBuffer(), hwInfo);
|
||||
if (programVfe) {
|
||||
auto pVfeStateAddress = NEO::PreambleHelper<GfxFamily>::getSpaceForVfeState(commandContainer.getCommandStream(), hwInfo, engineGroupType);
|
||||
auto pVfeState = new VFE_STATE_TYPE;
|
||||
NEO::PreambleHelper<GfxFamily>::programVfeState(pVfeState, hwInfo, 0, 0, device->getMaxNumHwThreads(),
|
||||
NEO::AdditionalKernelExecInfo::NotApplicable, finalStreamState);
|
||||
commandsToPatch.push_back({pVfeStateAddress, pVfeState, CommandToPatch::FrontEndState});
|
||||
}
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
void CommandListCoreFamily<gfxCoreFamily>::clearCommandsToPatch() {
|
||||
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
|
||||
using VFE_STATE_TYPE = typename GfxFamily::VFE_STATE_TYPE;
|
||||
|
||||
for (auto &commandToPatch : commandsToPatch) {
|
||||
switch (commandToPatch.type) {
|
||||
case CommandList::CommandToPatch::FrontEndState:
|
||||
UNRECOVERABLE_IF(commandToPatch.pCommand == nullptr);
|
||||
delete reinterpret_cast<VFE_STATE_TYPE *>(commandToPatch.pCommand);
|
||||
break;
|
||||
default:
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
}
|
||||
commandsToPatch.clear();
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t CommandListCoreFamily<gfxCoreFamily>::setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]) {
|
||||
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
|
||||
|
||||
@@ -101,6 +101,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
|
||||
kernel->getKernelDescriptor().kernelMetadata.kernelName.c_str());
|
||||
}
|
||||
|
||||
updateStreamProperties(*kernel);
|
||||
|
||||
NEO::EncodeDispatchKernel<GfxFamily>::encode(commandContainer,
|
||||
reinterpret_cast<const void *>(pThreadGroupDimensions),
|
||||
isIndirect,
|
||||
@@ -140,4 +142,5 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
} // namespace L0
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "level_zero/core/source/cmdqueue/cmdqueue_imp.h"
|
||||
|
||||
#include "igfxfmid.h"
|
||||
#include "stream_properties.h"
|
||||
|
||||
namespace L0 {
|
||||
|
||||
@@ -39,6 +40,8 @@ struct CommandQueueHw : public CommandQueueImp {
|
||||
size_t estimateStateBaseAddressCmdSize();
|
||||
MOCKABLE_VIRTUAL void programFrontEnd(uint64_t scratchAddress, uint32_t perThreadScratchSpaceSize, NEO::LinearStream &commandStream);
|
||||
|
||||
size_t estimateFrontEndCmdSizeForMultipleCommandLists(bool isFrontEndStateDirty, uint32_t numCommandLists,
|
||||
ze_command_list_handle_t *phCommandLists);
|
||||
size_t estimateFrontEndCmdSize();
|
||||
size_t estimatePipelineSelect();
|
||||
void programPipelineSelect(NEO::LinearStream &commandStream);
|
||||
@@ -50,6 +53,9 @@ struct CommandQueueHw : public CommandQueueImp {
|
||||
uint32_t perThreadScratchSpaceSize);
|
||||
|
||||
bool getPreemptionCmdProgramming() override;
|
||||
void patchCommands(CommandList &commandList, uint64_t scratchAddress);
|
||||
|
||||
NEO::StreamProperties streamProperties{};
|
||||
};
|
||||
|
||||
} // namespace L0
|
||||
|
||||
@@ -174,10 +174,11 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
linearStreamSizeEstimate += sizeof(MI_BATCH_BUFFER_END);
|
||||
}
|
||||
|
||||
auto &hwInfo = device->getHwInfo();
|
||||
if (hFence) {
|
||||
fence = Fence::fromHandle(hFence);
|
||||
spaceForResidency += residencyContainerSpaceForFence;
|
||||
linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(device->getHwInfo());
|
||||
linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
|
||||
}
|
||||
|
||||
spaceForResidency += residencyContainerSpaceForTagWrite;
|
||||
@@ -201,9 +202,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
linearStreamSizeEstimate += estimatePipelineSelect();
|
||||
}
|
||||
|
||||
if (frontEndStateDirty) {
|
||||
linearStreamSizeEstimate += estimateFrontEndCmdSize();
|
||||
}
|
||||
linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(frontEndStateDirty, numCommandLists, phCommandLists);
|
||||
|
||||
if (gsbaStateDirty) {
|
||||
linearStreamSizeEstimate += estimateStateBaseAddressCmdSize();
|
||||
@@ -216,7 +215,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
linearStreamSizeEstimate += NEO::SWTagsManager::estimateSpaceForSWTags<GfxFamily>();
|
||||
}
|
||||
|
||||
linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(device->getHwInfo());
|
||||
linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
|
||||
size_t alignedSize = alignUp<size_t>(linearStreamSizeEstimate, minCmdBufferPtrAlign);
|
||||
size_t padding = alignedSize - linearStreamSizeEstimate;
|
||||
reserveLinearStreamSize(alignedSize);
|
||||
@@ -256,9 +255,6 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
commandQueueDebugCmdsProgrammed = true;
|
||||
}
|
||||
|
||||
if (frontEndStateDirty) {
|
||||
programFrontEnd(scratchSpaceController->getScratchPatchAddress(), scratchSpaceController->getPerThreadScratchSpaceSize(), child);
|
||||
}
|
||||
if (gsbaStateDirty) {
|
||||
auto indirectHeap = CommandList::fromHandle(phCommandLists[0])->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT);
|
||||
programStateBaseAddress(scratchSpaceController->calculateNewGSH(), indirectHeap->getGraphicsAllocation()->isAllocatedInLocalMemoryPool(), child);
|
||||
@@ -332,6 +328,22 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
statePreemption = commandListPreemption;
|
||||
}
|
||||
|
||||
if (!isCopyOnlyCommandQueue) {
|
||||
auto &requiredStreamState = commandList->getRequiredStreamState();
|
||||
auto programVfe = streamProperties.setCooperativeKernelProperties(requiredStreamState.getCooperativeKernelProperties(), hwInfo);
|
||||
if (frontEndStateDirty) {
|
||||
programVfe = true;
|
||||
frontEndStateDirty = false;
|
||||
}
|
||||
if (programVfe) {
|
||||
programFrontEnd(scratchSpaceController->getScratchPatchAddress(), scratchSpaceController->getPerThreadScratchSpaceSize(), child);
|
||||
}
|
||||
auto &finalStreamState = commandList->getFinalStreamState();
|
||||
streamProperties.setCooperativeKernelProperties(finalStreamState.getCooperativeKernelProperties(), hwInfo);
|
||||
}
|
||||
|
||||
patchCommands(*commandList, scratchSpaceController->getScratchPatchAddress());
|
||||
|
||||
for (size_t iter = 0; iter < cmdBufferCount; iter++) {
|
||||
auto allocation = cmdBufferAllocations[iter];
|
||||
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&child, allocation->getGpuAddress(), true);
|
||||
@@ -381,7 +393,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
||||
child, POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
|
||||
fence->getGpuAddress(),
|
||||
Fence::STATE_SIGNALED,
|
||||
device->getHwInfo(),
|
||||
hwInfo,
|
||||
args);
|
||||
}
|
||||
}
|
||||
@@ -445,6 +457,33 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSize() {
|
||||
return NEO::PreambleHelper<GfxFamily>::getVFECommandsSize();
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSizeForMultipleCommandLists(
|
||||
bool isFrontEndStateDirty, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists) {
|
||||
|
||||
auto streamPropertiesCopy = streamProperties;
|
||||
auto singleFrontEndCmdSize = estimateFrontEndCmdSize();
|
||||
auto &hwInfo = device->getHwInfo();
|
||||
size_t estimatedSize = 0;
|
||||
|
||||
for (size_t i = 0; i < numCommandLists; i++) {
|
||||
auto commandList = CommandList::fromHandle(phCommandLists[i]);
|
||||
auto &requiredStreamState = commandList->getRequiredStreamState();
|
||||
auto isVfeRequired = streamPropertiesCopy.setCooperativeKernelProperties(requiredStreamState.getCooperativeKernelProperties(), hwInfo);
|
||||
if (isFrontEndStateDirty) {
|
||||
isVfeRequired = true;
|
||||
isFrontEndStateDirty = false;
|
||||
}
|
||||
if (isVfeRequired) {
|
||||
estimatedSize += singleFrontEndCmdSize;
|
||||
}
|
||||
auto &finalStreamState = commandList->getFinalStreamState();
|
||||
streamPropertiesCopy.setCooperativeKernelProperties(finalStreamState.getCooperativeKernelProperties(), hwInfo);
|
||||
}
|
||||
|
||||
return estimatedSize;
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
size_t CommandQueueHw<gfxCoreFamily>::estimatePipelineSelect() {
|
||||
|
||||
|
||||
@@ -121,4 +121,10 @@ void CommandQueueHw<gfxCoreFamily>::handleScratchSpace(NEO::ResidencyContainer &
|
||||
}
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint64_t scratchAddress) {
|
||||
auto &commandsToPatch = commandList.getCommandsToPatch();
|
||||
UNRECOVERABLE_IF(!commandsToPatch.empty());
|
||||
}
|
||||
|
||||
} // namespace L0
|
||||
|
||||
Reference in New Issue
Block a user