Program additional VFE_STATE instructions when needed

Additional VFE_STATEs may be programmed when appending kernels to a
command list and when the command list is executed.

Related-To: NEO-4940, NEO-4574

Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
This commit is contained in:
Filip Hazubski
2021-04-23 17:27:40 +00:00
committed by Compute-Runtime-Automation
parent 9b12dc4390
commit 3a2281bf77
13 changed files with 200 additions and 11 deletions

View File

@@ -16,6 +16,8 @@
#include <level_zero/ze_api.h>
#include <level_zero/zet_api.h>
#include "stream_properties.h"
#include <vector>
struct _ze_command_list_handle_t {};
@@ -37,6 +39,17 @@ struct CommandList : _ze_command_list_handle_t {
static CommandList *allocate(uint32_t numIddsPerBlock) { return new Type(numIddsPerBlock); }
};
// One entry of a CommandsToPatch list: a command object paired with a
// destination pointer and a tag telling which kind of command it is.
// Member order matters: entries are built with positional aggregate
// initialisation ({pDestination, pCommand, type}).
struct CommandToPatch {
// Kind of command stored in the entry.
enum CommandType {
FrontEndState,
Invalid
};
// For FrontEndState entries: the pointer returned by getSpaceForVfeState()
// for the command list's command stream.
// NOTE(review): presumably where pCommand is copied when patching - confirm.
void *pDestination = nullptr;
// For FrontEndState entries: a VFE_STATE_TYPE object allocated with new in
// updateStreamProperties() and deleted in clearCommandsToPatch().
void *pCommand = nullptr;
// Stays Invalid unless the creator of the entry sets it explicitly.
CommandType type = Invalid;
};
using CommandsToPatch = StackVec<CommandToPatch, 1>;
virtual ze_result_t close() = 0;
virtual ze_result_t destroy() = 0;
virtual ze_result_t appendEventReset(ze_event_handle_t hEvent) = 0;
@@ -200,6 +213,16 @@ struct CommandList : _ze_command_list_handle_t {
virtual ze_result_t setSyncModeQueue(bool syncMode) = 0;
// Returns a read-only reference to the requiredStreamState member.
const NEO::StreamProperties &getRequiredStreamState() {
return requiredStreamState;
}
// Returns a read-only reference to the finalStreamState member.
const NEO::StreamProperties &getFinalStreamState() {
return finalStreamState;
}
// Returns a read-only reference to the list of commands recorded in
// commandsToPatch.
const CommandsToPatch &getCommandsToPatch() {
return commandsToPatch;
}
protected:
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
uint32_t commandListPerThreadScratchSize = 0u;
@@ -211,6 +234,10 @@ struct CommandList : _ze_command_list_handle_t {
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);
NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize);
bool containsStatelessUncachedResource = false;
NEO::StreamProperties requiredStreamState{};
NEO::StreamProperties finalStreamState{};
CommandsToPatch commandsToPatch{};
};
using CommandListAllocatorFn = CommandList *(*)(uint32_t);

View File

@@ -11,6 +11,7 @@
#include "level_zero/core/source/cmdlist/cmdlist_imp.h"
#include "igfxfmid.h"
#include "stream_properties.h"
namespace NEO {
enum class ImageType;
@@ -42,6 +43,7 @@ struct CommandListCoreFamily : CommandListImp {
using CommandListImp::CommandListImp;
ze_result_t initialize(Device *device, NEO::EngineGroupType engineGroupType) override;
virtual void programL3(bool isSLMused);
~CommandListCoreFamily() override;
ze_result_t close() override;
ze_result_t appendEventReset(ze_event_handle_t hEvent) override;
@@ -211,6 +213,8 @@ struct CommandListCoreFamily : CommandListImp {
bool isCooperative);
ze_result_t appendLaunchKernelSplit(ze_kernel_handle_t hKernel, const ze_group_count_t *pThreadGroupDimensions, ze_event_handle_t hEvent);
ze_result_t prepareIndirectParams(const ze_group_count_t *pThreadGroupDimensions);
void updateStreamProperties(Kernel &kernel);
void clearCommandsToPatch();
void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes,
const void **pRanges);
@@ -228,6 +232,8 @@ struct CommandListCoreFamily : CommandListImp {
uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region);
MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize);
ze_result_t addEventsToCmdList(uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents);
bool containsAnyKernel = false;
};
template <PRODUCT_FAMILY gfxProductFamily>

View File

@@ -54,6 +54,11 @@ inline ze_result_t parseErrorCode(NEO::ErrorCode returnValue) {
return ZE_RESULT_SUCCESS;
}
// Destructor: releases the command objects still referenced from
// commandsToPatch by delegating to clearCommandsToPatch().
template <GFXCORE_FAMILY gfxCoreFamily>
CommandListCoreFamily<gfxCoreFamily>::~CommandListCoreFamily() {
clearCommandsToPatch();
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::programThreadArbitrationPolicy(Device *device) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
@@ -77,6 +82,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
unifiedMemoryControls.indirectSharedAllocationsAllowed = false;
commandListPreemptionMode = device->getDevicePreemptionMode();
commandListPerThreadScratchSize = 0u;
requiredStreamState = {};
finalStreamState = requiredStreamState;
containsAnyKernel = false;
clearCommandsToPatch();
if (!isCopyOnly()) {
if (!NEO::ApiSpecificConfig::getBindlessConfiguration()) {
@@ -1828,6 +1837,47 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::prepareIndirectParams(const ze
return ZE_RESULT_SUCCESS;
}
// Updates the tracked stream properties for a kernel that is about to be
// appended to this command list.
//
// First kernel: its cooperative-kernel setting (kernel.usesSyncBuffer()) is
// stored in requiredStreamState and copied to finalStreamState.
// Later kernels: only finalStreamState is updated. When that update reports a
// change, space for a VFE state is taken from the command stream, a
// VFE_STATE_TYPE object is allocated and programmed, and both pointers are
// recorded in commandsToPatch as a FrontEndState entry.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::updateStreamProperties(Kernel &kernel) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using VFE_STATE_TYPE = typename GfxFamily::VFE_STATE_TYPE;

    auto &hwInfo = device->getHwInfo();

    if (containsAnyKernel) {
        const auto vfeChanged = finalStreamState.setCooperativeKernelProperties(kernel.usesSyncBuffer(), hwInfo);
        if (vfeChanged) {
            auto reservedVfeSpace = NEO::PreambleHelper<GfxFamily>::getSpaceForVfeState(commandContainer.getCommandStream(), hwInfo, engineGroupType);

            // Ownership of this allocation passes to commandsToPatch; it is
            // released by clearCommandsToPatch().
            auto vfeStateCopy = new VFE_STATE_TYPE;
            NEO::PreambleHelper<GfxFamily>::programVfeState(vfeStateCopy, hwInfo, 0, 0, device->getMaxNumHwThreads(),
                                                            NEO::AdditionalKernelExecInfo::NotApplicable, finalStreamState);

            commandsToPatch.push_back({reservedVfeSpace, vfeStateCopy, CommandToPatch::FrontEndState});
        }
        return;
    }

    // No kernel appended yet: this kernel defines the required state.
    requiredStreamState.setCooperativeKernelProperties(kernel.usesSyncBuffer(), hwInfo);
    finalStreamState = requiredStreamState;
    containsAnyKernel = true;
}
// Frees every command object referenced from commandsToPatch and empties the
// list.
//
// FrontEndState entries must carry a non-null pCommand, which is deleted as a
// VFE_STATE_TYPE. Any other entry type trips UNRECOVERABLE_IF.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::clearCommandsToPatch() {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using VFE_STATE_TYPE = typename GfxFamily::VFE_STATE_TYPE;

    for (auto &entry : commandsToPatch) {
        switch (entry.type) {
        case CommandList::CommandToPatch::FrontEndState: {
            UNRECOVERABLE_IF(entry.pCommand == nullptr);
            // pCommand is stored as void *; restore the real type before deleting.
            auto vfeState = static_cast<VFE_STATE_TYPE *>(entry.pCommand);
            delete vfeState;
            break;
        }
        default:
            // Unknown entry type: its pCommand cannot be released safely.
            UNRECOVERABLE_IF(true);
        }
    }

    commandsToPatch.clear();
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;

View File

@@ -101,6 +101,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
kernel->getKernelDescriptor().kernelMetadata.kernelName.c_str());
}
updateStreamProperties(*kernel);
NEO::EncodeDispatchKernel<GfxFamily>::encode(commandContainer,
reinterpret_cast<const void *>(pThreadGroupDimensions),
isIndirect,
@@ -140,4 +142,5 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
return ZE_RESULT_SUCCESS;
}
} // namespace L0

View File

@@ -16,6 +16,7 @@
#include "level_zero/core/source/cmdqueue/cmdqueue_imp.h"
#include "igfxfmid.h"
#include "stream_properties.h"
namespace L0 {
@@ -39,6 +40,8 @@ struct CommandQueueHw : public CommandQueueImp {
size_t estimateStateBaseAddressCmdSize();
MOCKABLE_VIRTUAL void programFrontEnd(uint64_t scratchAddress, uint32_t perThreadScratchSpaceSize, NEO::LinearStream &commandStream);
size_t estimateFrontEndCmdSizeForMultipleCommandLists(bool isFrontEndStateDirty, uint32_t numCommandLists,
ze_command_list_handle_t *phCommandLists);
size_t estimateFrontEndCmdSize();
size_t estimatePipelineSelect();
void programPipelineSelect(NEO::LinearStream &commandStream);
@@ -50,6 +53,9 @@ struct CommandQueueHw : public CommandQueueImp {
uint32_t perThreadScratchSpaceSize);
bool getPreemptionCmdProgramming() override;
void patchCommands(CommandList &commandList, uint64_t scratchAddress);
NEO::StreamProperties streamProperties{};
};
} // namespace L0

View File

@@ -174,10 +174,11 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
linearStreamSizeEstimate += sizeof(MI_BATCH_BUFFER_END);
}
auto &hwInfo = device->getHwInfo();
if (hFence) {
fence = Fence::fromHandle(hFence);
spaceForResidency += residencyContainerSpaceForFence;
linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(device->getHwInfo());
linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
}
spaceForResidency += residencyContainerSpaceForTagWrite;
@@ -201,9 +202,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
linearStreamSizeEstimate += estimatePipelineSelect();
}
if (frontEndStateDirty) {
linearStreamSizeEstimate += estimateFrontEndCmdSize();
}
linearStreamSizeEstimate += estimateFrontEndCmdSizeForMultipleCommandLists(frontEndStateDirty, numCommandLists, phCommandLists);
if (gsbaStateDirty) {
linearStreamSizeEstimate += estimateStateBaseAddressCmdSize();
@@ -216,7 +215,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
linearStreamSizeEstimate += NEO::SWTagsManager::estimateSpaceForSWTags<GfxFamily>();
}
linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(device->getHwInfo());
linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
size_t alignedSize = alignUp<size_t>(linearStreamSizeEstimate, minCmdBufferPtrAlign);
size_t padding = alignedSize - linearStreamSizeEstimate;
reserveLinearStreamSize(alignedSize);
@@ -256,9 +255,6 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
commandQueueDebugCmdsProgrammed = true;
}
if (frontEndStateDirty) {
programFrontEnd(scratchSpaceController->getScratchPatchAddress(), scratchSpaceController->getPerThreadScratchSpaceSize(), child);
}
if (gsbaStateDirty) {
auto indirectHeap = CommandList::fromHandle(phCommandLists[0])->commandContainer.getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT);
programStateBaseAddress(scratchSpaceController->calculateNewGSH(), indirectHeap->getGraphicsAllocation()->isAllocatedInLocalMemoryPool(), child);
@@ -332,6 +328,22 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
statePreemption = commandListPreemption;
}
if (!isCopyOnlyCommandQueue) {
auto &requiredStreamState = commandList->getRequiredStreamState();
auto programVfe = streamProperties.setCooperativeKernelProperties(requiredStreamState.getCooperativeKernelProperties(), hwInfo);
if (frontEndStateDirty) {
programVfe = true;
frontEndStateDirty = false;
}
if (programVfe) {
programFrontEnd(scratchSpaceController->getScratchPatchAddress(), scratchSpaceController->getPerThreadScratchSpaceSize(), child);
}
auto &finalStreamState = commandList->getFinalStreamState();
streamProperties.setCooperativeKernelProperties(finalStreamState.getCooperativeKernelProperties(), hwInfo);
}
patchCommands(*commandList, scratchSpaceController->getScratchPatchAddress());
for (size_t iter = 0; iter < cmdBufferCount; iter++) {
auto allocation = cmdBufferAllocations[iter];
NEO::EncodeBatchBufferStartOrEnd<GfxFamily>::programBatchBufferStart(&child, allocation->getGpuAddress(), true);
@@ -381,7 +393,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
child, POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
fence->getGpuAddress(),
Fence::STATE_SIGNALED,
device->getHwInfo(),
hwInfo,
args);
}
}
@@ -445,6 +457,33 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSize() {
return NEO::PreambleHelper<GfxFamily>::getVFECommandsSize();
}
// Estimates how many bytes of front-end (VFE) commands are counted for
// executing the given command lists.
//
// Works on a copy of streamProperties, so the queue's own state is not
// modified. For each command list, the copy is updated with the list's
// required cooperative-kernel properties; one front-end command is counted
// when that update reports a change, or - for the first list only - when
// isFrontEndStateDirty is set. The copy is then advanced to the list's final
// properties before the next list is examined.
//
// Returns the accumulated size (0 when numCommandLists is 0).
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimateFrontEndCmdSizeForMultipleCommandLists(
    bool isFrontEndStateDirty, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists) {

    auto simulatedProperties = streamProperties;
    const auto sizePerFrontEndCmd = estimateFrontEndCmdSize();
    auto &hwInfo = device->getHwInfo();

    size_t totalSize = 0;
    for (size_t listIndex = 0; listIndex < numCommandLists; ++listIndex) {
        auto currentList = CommandList::fromHandle(phCommandLists[listIndex]);

        auto &requiredState = currentList->getRequiredStreamState();
        const auto stateChanged = simulatedProperties.setCooperativeKernelProperties(requiredState.getCooperativeKernelProperties(), hwInfo);

        // The dirty flag forces exactly one front-end command and is then consumed.
        const bool countFrontEnd = stateChanged || isFrontEndStateDirty;
        isFrontEndStateDirty = false;

        if (countFrontEnd) {
            totalSize += sizePerFrontEndCmd;
        }

        auto &finalState = currentList->getFinalStreamState();
        simulatedProperties.setCooperativeKernelProperties(finalState.getCooperativeKernelProperties(), hwInfo);
    }

    return totalSize;
}
template <GFXCORE_FAMILY gfxCoreFamily>
size_t CommandQueueHw<gfxCoreFamily>::estimatePipelineSelect() {

View File

@@ -121,4 +121,10 @@ void CommandQueueHw<gfxCoreFamily>::handleScratchSpace(NEO::ResidencyContainer &
}
}
// Implementation of patchCommands that performs no patching: it only checks
// that the command list recorded nothing to patch and trips UNRECOVERABLE_IF
// otherwise. scratchAddress is not used here.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandQueueHw<gfxCoreFamily>::patchCommands(CommandList &commandList, uint64_t scratchAddress) {
auto &commandsToPatch = commandList.getCommandsToPatch();
// A non-empty list is treated as an unrecoverable error.
UNRECOVERABLE_IF(!commandsToPatch.empty());
}
} // namespace L0