Add heap sharing to immediate command lists

This change is intended to be used in immediate command lists that are
using flush task functionality.
With this change all immediate command lists using the same csr will consume
shared allocations for dsh and ssh heaps. This will decrease the number of SBA
commands dispatched when multiple command lists coexist and dispatch kernels.
With this change a new SBA command should be dispatched only when the current
heap allocation is exhausted.
The functionality is currently disabled and available under a debug key.
It will later be enabled by default for all immediate command lists
with flush task functionality enabled.

Related-To: NEO-7142

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2022-09-26 22:28:10 +00:00
committed by Compute-Runtime-Automation
parent 71bef6094d
commit 3d92186362
35 changed files with 671 additions and 93 deletions

View File

@@ -297,6 +297,7 @@ struct CommandList : _ze_command_list_handle_t {
bool commandListSLMEnabled = false;
bool requiresQueueUncachedMocs = false;
bool isBcsSplitNeeded = false;
bool immediateCmdListHeapSharing = false;
protected:
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);

View File

@@ -135,6 +135,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::initialize(Device *device, NEO
commandContainer.setFlushTaskUsedForImmediate(this->isFlushTaskSubmissionEnabled);
}
if (this->immediateCmdListHeapSharing) {
commandContainer.setImmediateCmdListCsr(this->csr);
commandContainer.setNumIddPerBlock(1);
}
commandContainer.setReservedSshSize(getReserveSshSize());
DeviceImp *deviceImp = static_cast<DeviceImp *>(device);
auto returnValue = commandContainer.initialize(deviceImp->getActiveDevice(), deviceImp->allocationsForReuse.get(), !isCopyOnly());

View File

@@ -43,8 +43,14 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
if (kernelDescriptor.kernelAttributes.flags.isInvalid) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
appendEventForProfiling(event, true, false);
const auto kernelImmutableData = kernel->getImmutableData();
if (this->immediateCmdListHeapSharing) {
auto kernelInfo = kernelImmutableData->getKernelInfo();
commandContainer.ensureHeapSizePrepared(
NEO::EncodeDispatchKernel<GfxFamily>::getSizeRequiredSsh(*kernelInfo),
NEO::EncodeDispatchKernel<GfxFamily>::getSizeRequiredDsh(*kernelInfo));
}
appendEventForProfiling(event, true, false);
auto perThreadScratchSize = std::max<std::uint32_t>(this->getCommandListPerThreadScratchSize(),
kernel->getImmutableData()->getDescriptor().kernelAttributes.perThreadScratchSize[0]);
this->setCommandListPerThreadScratchSize(perThreadScratchSize);
@@ -147,7 +153,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
NEO::EncodeDispatchKernel<GfxFamily>::encode(commandContainer, dispatchKernelArgs, getLogicalStateHelper());
this->containsStatelessUncachedResource = dispatchKernelArgs.requiresUncachedMocs;
if (neoDevice->getDebugger()) {
if (neoDevice->getDebugger() && !this->immediateCmdListHeapSharing) {
auto *ssh = commandContainer.getIndirectHeap(NEO::HeapType::SURFACE_STATE);
auto surfaceStateSpace = neoDevice->getDebugger()->getDebugSurfaceReservedSurfaceState(*ssh);
auto surfaceState = GfxFamily::cmdInitRenderSurfaceState;

View File

@@ -21,10 +21,10 @@ constexpr size_t maxImmediateCommandSize = 4 * MemoryConstants::kiloByte;
template <GFXCORE_FAMILY gfxCoreFamily>
struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFamily> {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using BaseClass = CommandListCoreFamily<gfxCoreFamily>;
using BaseClass::executeCommandListImmediate;
using BaseClass::BaseClass;
using BaseClass::executeCommandListImmediate;
ze_result_t appendLaunchKernel(ze_kernel_handle_t kernelHandle,
const ze_group_count_t *threadGroupDimensions,

View File

@@ -7,6 +7,8 @@
#pragma once
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/command_stream_receiver_hw.h"
#include "shared/source/command_stream/wait_status.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/hw_info.h"
@@ -143,6 +145,32 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommandListImm
this->csr->makeResident(*this->device->getDebugSurface());
}
NEO::Device *neoDevice = this->device->getNEODevice();
if (neoDevice->getDebugger() && this->immediateCmdListHeapSharing) {
auto csrHw = static_cast<NEO::CommandStreamReceiverHw<GfxFamily> *>(this->csr);
auto sshStateCopy = csrHw->getSshState();
bool sshDirty = sshStateCopy.updateAndCheck(ssh);
if (sshDirty) {
auto surfaceStateSpace = neoDevice->getDebugger()->getDebugSurfaceReservedSurfaceState(*ssh);
auto surfaceState = GfxFamily::cmdInitRenderSurfaceState;
NEO::EncodeSurfaceStateArgs args;
args.outMemory = &surfaceState;
args.graphicsAddress = this->device->getDebugSurface()->getGpuAddress();
args.size = this->device->getDebugSurface()->getUnderlyingBufferSize();
args.mocs = this->device->getMOCS(false, false);
args.numAvailableDevices = neoDevice->getNumGenericSubDevices();
args.allocation = this->device->getDebugSurface();
args.gmmHelper = neoDevice->getGmmHelper();
args.useGlobalAtomics = false;
args.areMultipleSubDevicesInContext = false;
args.isDebuggerActive = true;
NEO::EncodeSurfaceState<GfxFamily>::encodeBuffer(args);
*reinterpret_cast<typename GfxFamily::RENDER_SURFACE_STATE *>(surfaceStateSpace) = surfaceState;
}
}
auto completionStamp = this->csr->flushTask(
*commandStream,
commandStreamStart,

View File

@@ -139,6 +139,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
if (kernelDescriptor.kernelAttributes.flags.isInvalid) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
if (this->immediateCmdListHeapSharing) {
auto kernelInfo = kernelImmutableData->getKernelInfo();
commandContainer.ensureHeapSizePrepared(
NEO::EncodeDispatchKernel<GfxFamily>::getSizeRequiredSsh(*kernelInfo),
NEO::EncodeDispatchKernel<GfxFamily>::getSizeRequiredDsh(*kernelInfo));
}
commandListPerThreadScratchSize = std::max<uint32_t>(commandListPerThreadScratchSize, kernelDescriptor.kernelAttributes.perThreadScratchSize[0]);
commandListPerThreadPrivateScratchSize = std::max<uint32_t>(commandListPerThreadPrivateScratchSize, kernelDescriptor.kernelAttributes.perThreadScratchSize[1]);
@@ -265,7 +271,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
}
}
if (neoDevice->getDebugger()) {
if (neoDevice->getDebugger() && !this->immediateCmdListHeapSharing) {
auto *ssh = commandContainer.getIndirectHeap(NEO::HeapType::SURFACE_STATE);
auto surfaceStateSpace = neoDevice->getDebugger()->getDebugSurfaceReservedSurfaceState(*ssh);
auto surfaceState = GfxFamily::cmdInitRenderSurfaceState;

View File

@@ -127,6 +127,7 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device
UNRECOVERABLE_IF(nullptr == csr);
commandList = static_cast<CommandListImp *>((*allocator)(CommandList::commandListimmediateIddsPerBlock));
commandList->csr = csr;
commandList->internalUsage = internalUsage;
commandList->cmdListType = CommandListType::TYPE_IMMEDIATE;
commandList->isSyncModeQueue = (desc->mode == ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS);
@@ -135,6 +136,7 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device
if (NEO::DebugManager.flags.EnableFlushTaskSubmission.get() != -1) {
commandList->isFlushTaskSubmissionEnabled = !!NEO::DebugManager.flags.EnableFlushTaskSubmission.get();
}
commandList->immediateCmdListHeapSharing = L0HwHelper::enableImmediateCmdListHeapSharing(commandList->isFlushTaskSubmissionEnabled);
}
returnValue = commandList->initialize(device, engineGroupType, desc->flags);
if (returnValue != ZE_RESULT_SUCCESS) {
@@ -151,7 +153,6 @@ CommandList *CommandList::createImmediate(uint32_t productFamily, Device *device
}
commandList->cmdQImmediate = commandQueue;
commandList->csr = csr;
commandList->isTbxMode = (csr->getType() == NEO::CommandStreamReceiverType::CSR_TBX) || (csr->getType() == NEO::CommandStreamReceiverType::CSR_TBX_WITH_AUB);
commandList->commandListPreemptionMode = device->getDevicePreemptionMode();

View File

@@ -1092,7 +1092,9 @@ Device *Device::create(DriverHandle *driverHandle, NEO::Device *neoDevice, bool
device->getSourceLevelDebugger()
->notifyNewDevice(osInterface ? osInterface->getDriverModel()->getDeviceHandle() : 0);
}
device->createSysmanHandle(isSubDevice);
if (device->getNEODevice()->getAllEngines()[0].commandStreamReceiver->getType() == NEO::CommandStreamReceiverType::CSR_HW) {
device->createSysmanHandle(isSubDevice);
}
device->resourcesReleased = false;
device->populateSubDeviceCopyEngineGroups();

View File

@@ -39,4 +39,12 @@ bool L0HwHelper::enableStateComputeModeTracking() {
return defaultValue;
}
// Returns whether immediate command lists should share CSR-owned DSH/SSH heaps.
// Disabled by default; the EnableImmediateCmdListHeapSharing debug key, when set
// to any value other than -1, overrides the default.
// NOTE(review): cmdlistSupport is currently unused — presumably it will gate the
// default once the feature is enabled by default; confirm with follow-up change.
bool L0HwHelper::enableImmediateCmdListHeapSharing(bool cmdlistSupport) {
    const auto debugKeyValue = NEO::DebugManager.flags.EnableImmediateCmdListHeapSharing.get();
    if (debugKeyValue != -1) {
        return debugKeyValue != 0;
    }
    return false;
}
} // namespace L0

View File

@@ -33,6 +33,7 @@ class L0HwHelper {
static bool enableFrontEndStateTracking();
static bool enablePipelineSelectStateTracking();
static bool enableStateComputeModeTracking();
static bool enableImmediateCmdListHeapSharing(bool cmdlistSupport);
virtual void setAdditionalGroupProperty(ze_command_queue_group_properties_t &groupProperty, NEO::EngineGroupT &group) const = 0;
virtual L0::Event *createEvent(L0::EventPool *eventPool, const ze_event_desc_t *desc, L0::Device *device) const = 0;

View File

@@ -385,6 +385,11 @@ int main(int argc, char *argv[]) {
verbose = isVerbose(argc, argv);
bool useSyncQueue = isSyncQueueEnabled(argc, argv);
bool commandListShared = isCommandListShared(argc, argv);
bool commandListCoexist = isParamEnabled(argc, argv, "-o", "--coexists");
if (commandListCoexist) {
std::cerr << "Command List coexists between tests" << std::endl;
commandListShared = false;
}
bool aubMode = isAubMode(argc, argv);
ze_context_handle_t context = nullptr;
@@ -410,18 +415,43 @@ int main(int argc, char *argv[]) {
SUCCESS_OR_TERMINATE(zeCommandListCreateImmediate(context, device0, &cmdQueueDesc, &cmdList));
}
ze_command_list_handle_t cmdListStandardMemoryCopy = nullptr;
ze_command_list_handle_t cmdListMemoryCopyRegion = nullptr;
ze_command_list_handle_t cmdListLaunchGpuKernel = nullptr;
if (commandListCoexist) {
ze_command_queue_desc_t cmdQueueDesc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC};
cmdQueueDesc.pNext = nullptr;
cmdQueueDesc.flags = 0;
cmdQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
cmdQueueDesc.ordinal = getCommandQueueOrdinal(device0);
cmdQueueDesc.index = 0;
selectQueueMode(cmdQueueDesc, useSyncQueue);
SUCCESS_OR_TERMINATE(zeCommandListCreateImmediate(context, device0, &cmdQueueDesc, &cmdListStandardMemoryCopy));
SUCCESS_OR_TERMINATE(zeCommandListCreateImmediate(context, device0, &cmdQueueDesc, &cmdListMemoryCopyRegion));
SUCCESS_OR_TERMINATE(zeCommandListCreateImmediate(context, device0, &cmdQueueDesc, &cmdListLaunchGpuKernel));
cmdList = cmdListStandardMemoryCopy;
}
std::string currentTest;
currentTest = "Standard Memory Copy";
testAppendMemoryCopy(context, device0, useSyncQueue, outputValidationSuccessful, cmdList);
printResult(aubMode, outputValidationSuccessful, blackBoxName, currentTest);
if (outputValidationSuccessful || aubMode) {
if (commandListCoexist) {
cmdList = cmdListMemoryCopyRegion;
}
currentTest = "Memory Copy Region";
testAppendMemoryCopyRegion(context, device0, useSyncQueue, outputValidationSuccessful, cmdList);
printResult(aubMode, outputValidationSuccessful, blackBoxName, currentTest);
}
if (outputValidationSuccessful || aubMode) {
if (commandListCoexist) {
cmdList = cmdListLaunchGpuKernel;
}
currentTest = "Launch GPU Kernel";
testAppendGpuKernel(context, device0, useSyncQueue, outputValidationSuccessful, cmdList);
printResult(aubMode, outputValidationSuccessful, blackBoxName, currentTest);
@@ -430,6 +460,11 @@ int main(int argc, char *argv[]) {
if (commandListShared) {
SUCCESS_OR_TERMINATE(zeCommandListDestroy(cmdList));
}
if (commandListCoexist) {
SUCCESS_OR_TERMINATE(zeCommandListDestroy(cmdListStandardMemoryCopy));
SUCCESS_OR_TERMINATE(zeCommandListDestroy(cmdListMemoryCopyRegion));
SUCCESS_OR_TERMINATE(zeCommandListDestroy(cmdListLaunchGpuKernel));
}
SUCCESS_OR_TERMINATE(zeContextDestroy(context));

View File

@@ -94,7 +94,7 @@ void ModuleMutableCommandListFixture::setUp(uint32_t revision) {
false,
returnValue));
NEO::EngineGroupType engineGroupType = NEO::HwHelper::get(device->getHwInfo().platform.eRenderCoreFamily).getEngineGroupType(neoDevice->getDefaultEngine().getEngineType(), neoDevice->getDefaultEngine().getEngineUsage(), device->getHwInfo());
engineGroupType = NEO::HwHelper::get(device->getHwInfo().platform.eRenderCoreFamily).getEngineGroupType(neoDevice->getDefaultEngine().getEngineType(), neoDevice->getDefaultEngine().getEngineUsage(), device->getHwInfo());
commandList.reset(whiteboxCast(CommandList::create(productFamily, device, engineGroupType, 0u, returnValue)));
commandListImmediate.reset(whiteboxCast(CommandList::createImmediate(productFamily, device, &queueDesc, false, engineGroupType, returnValue)));
@@ -131,5 +131,11 @@ void CmdListStateComputeModeStateFixture::setUp() {
ModuleMutableCommandListFixture::setUp();
}
// Force-enables flush-task submission and immediate command list heap sharing
// via debug keys BEFORE the base fixture runs, so the immediate command list it
// creates is initialized with CSR-shared DSH/SSH heaps.
void ImmediateCmdListSharedHeapsFixture::setUp() {
    DebugManager.flags.EnableFlushTaskSubmission.set(1);
    DebugManager.flags.EnableImmediateCmdListHeapSharing.set(1);
    ModuleMutableCommandListFixture::setUp();
}
} // namespace ult
} // namespace L0

View File

@@ -78,6 +78,7 @@ struct ModuleMutableCommandListFixture : public ModuleImmutableDataFixture {
std::unique_ptr<L0::ult::CommandList> commandListImmediate;
std::unique_ptr<ModuleImmutableDataFixture::MockKernel> kernel;
L0::ult::CommandQueue *commandQueue;
NEO::EngineGroupType engineGroupType;
};
struct MultiReturnCommandListFixture : public ModuleMutableCommandListFixture {
@@ -117,5 +118,11 @@ struct CmdListLargeGrfFixture : public CmdListStateComputeModeStateFixture {
void testBody();
};
// Fixture variant that force-enables flush-task submission and immediate
// command list heap sharing via debug keys before creating the command lists.
struct ImmediateCmdListSharedHeapsFixture : public ModuleMutableCommandListFixture {
    void setUp();
    // Restores the debug flags modified in setUp when the fixture is torn down.
    DebugManagerStateRestore restorer;
};
} // namespace ult
} // namespace L0

View File

@@ -126,11 +126,15 @@ struct ModuleImmutableDataFixture : public DeviceFixture {
public:
using KernelImp::crossThreadData;
using KernelImp::crossThreadDataSize;
using KernelImp::dynamicStateHeapData;
using KernelImp::dynamicStateHeapDataSize;
using KernelImp::kernelArgHandlers;
using KernelImp::kernelHasIndirectAccess;
using KernelImp::kernelRequiresGenerationOfLocalIdsByRuntime;
using KernelImp::privateMemoryGraphicsAllocation;
using KernelImp::requiredWorkgroupOrder;
using KernelImp::surfaceStateHeapData;
using KernelImp::surfaceStateHeapDataSize;
MockKernel(MockModule *mockModule) : WhiteBox<L0::KernelImp>(mockModule) {
}

View File

@@ -59,6 +59,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
using BaseClass::getAllocationFromHostPtrMap;
using BaseClass::getHostPtrAlloc;
using BaseClass::hostPtrMap;
using BaseClass::immediateCmdListHeapSharing;
using BaseClass::indirectAllocationsAllowed;
using BaseClass::initialize;
using BaseClass::partitionCount;
@@ -123,6 +124,7 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
using BaseClass::csr;
using BaseClass::finalStreamState;
using BaseClass::frontEndStateTracking;
using BaseClass::immediateCmdListHeapSharing;
using BaseClass::isFlushTaskSubmissionEnabled;
using BaseClass::partitionCount;
using BaseClass::pipelineSelectStateTracking;
@@ -134,9 +136,11 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
template <GFXCORE_FAMILY gfxCoreFamily>
struct MockCommandListImmediate : public CommandListCoreFamilyImmediate<gfxCoreFamily> {
using CommandListCoreFamilyImmediate<gfxCoreFamily>::requiredStreamState;
using CommandListCoreFamilyImmediate<gfxCoreFamily>::containsAnyKernel;
using CommandListCoreFamilyImmediate<gfxCoreFamily>::indirectAllocationsAllowed;
using BaseClass = CommandListCoreFamilyImmediate<gfxCoreFamily>;
using BaseClass::containsAnyKernel;
using BaseClass::immediateCmdListHeapSharing;
using BaseClass::indirectAllocationsAllowed;
using BaseClass::requiredStreamState;
};
template <>
@@ -148,6 +152,7 @@ struct WhiteBox<::L0::CommandList> : public ::L0::CommandListImp {
using BaseClass::commandListPreemptionMode;
using BaseClass::csr;
using BaseClass::frontEndStateTracking;
using BaseClass::immediateCmdListHeapSharing;
using BaseClass::initialize;
using BaseClass::isFlushTaskSubmissionEnabled;
using BaseClass::nonImmediateLogicalStateHelper;

View File

@@ -45,6 +45,8 @@ struct WhiteBox<::L0::Kernel> : public ::L0::KernelImp {
using ::L0::KernelImp::createPrintfBuffer;
using ::L0::KernelImp::crossThreadData;
using ::L0::KernelImp::crossThreadDataSize;
using ::L0::KernelImp::dynamicStateHeapData;
using ::L0::KernelImp::dynamicStateHeapDataSize;
using ::L0::KernelImp::groupSize;
using ::L0::KernelImp::kernelImmData;
using ::L0::KernelImp::kernelRequiresGenerationOfLocalIdsByRuntime;
@@ -76,6 +78,8 @@ struct WhiteBoxKernelHw : public KernelHw<gfxCoreFamily> {
using ::L0::KernelImp::createPrintfBuffer;
using ::L0::KernelImp::crossThreadData;
using ::L0::KernelImp::crossThreadDataSize;
using ::L0::KernelImp::dynamicStateHeapData;
using ::L0::KernelImp::dynamicStateHeapDataSize;
using ::L0::KernelImp::groupSize;
using ::L0::KernelImp::kernelImmData;
using ::L0::KernelImp::kernelRequiresGenerationOfLocalIdsByRuntime;

View File

@@ -5,6 +5,10 @@
*
*/
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_command_stream_receiver.h"
#include "shared/test/common/mocks/ult_device_factory.h"
#include "shared/test/common/test_macros/hw_test.h"
@@ -716,5 +720,165 @@ HWTEST2_F(CommandListTest, givenCmdListWithNoIndirectAccessWhenExecutingCommandL
commandList->cmdQImmediate = oldCommandQueue;
}
using ImmediateCmdListSharedHeapsTest = Test<ImmediateCmdListSharedHeapsFixture>;

// Verifies that two immediate command lists on the same CSR reuse the CSR's
// shared DSH/SSH heaps: the first kernel dispatch programs SBA once against the
// shared heap bases, and a second dispatch from a coexisting command list emits
// no further SBA commands while still consuming space from the same heaps.
HWTEST2_F(ImmediateCmdListSharedHeapsTest, givenMultipleCommandListsUsingSharedHeapsWhenDispatchingKernelThenExpectSingleSbaCommandAndHeapsReused, IsAtLeastSkl) {
    using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using SAMPLER_STATE = typename FamilyType::SAMPLER_STATE;
    using SAMPLER_BORDER_COLOR_STATE = typename FamilyType::SAMPLER_BORDER_COLOR_STATE;

    auto &hwInfo = device->getHwInfo();
    // Some products require an extra SBA command as a workaround.
    uint32_t expectedSbaCount = 1;
    auto &hwInfoConfig = *NEO::HwInfoConfig::get(hwInfo.platform.eProductFamily);
    if (hwInfoConfig.isAdditionalStateBaseAddressWARequired(hwInfo)) {
        expectedSbaCount++;
    }

    // DSH exists only when the platform supports images (samplers) or the
    // family reserves additional DSH space.
    bool dshPresent = hwInfo.capabilityTable.supportsImages || NEO::UnitTestHelper<FamilyType>::getAdditionalDshSize() > 0;
    if (dshPresent) {
        // Give the kernel a sampler table so the dispatch consumes DSH space.
        mockKernelImmData->kernelInfo->kernelDescriptor.payloadMappings.samplerTable.numSamplers = 2;
        mockKernelImmData->kernelInfo->kernelDescriptor.payloadMappings.samplerTable.tableOffset = sizeof(SAMPLER_BORDER_COLOR_STATE);
        mockKernelImmData->kernelInfo->kernelDescriptor.payloadMappings.samplerTable.borderColor = 0;

        kernel->dynamicStateHeapDataSize = static_cast<uint32_t>(sizeof(SAMPLER_STATE) * 2 + mockKernelImmData->kernelInfo->kernelDescriptor.payloadMappings.samplerTable.tableOffset);
        kernel->dynamicStateHeapData.reset(new uint8_t[kernel->dynamicStateHeapDataSize]);
        mockKernelImmData->mockKernelDescriptor->payloadMappings.samplerTable = mockKernelImmData->kernelInfo->kernelDescriptor.payloadMappings.samplerTable;
    }

    // Give the kernel a binding table so the dispatch consumes SSH space.
    mockKernelImmData->kernelInfo->heapInfo.SurfaceStateHeapSize = static_cast<uint32_t>(sizeof(RENDER_SURFACE_STATE) + sizeof(uint32_t));
    mockKernelImmData->mockKernelDescriptor->payloadMappings.bindingTable.numEntries = 1;
    mockKernelImmData->mockKernelDescriptor->payloadMappings.bindingTable.tableOffset = 0x40;
    mockKernelImmData->mockKernelDescriptor->kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindfulAndStateless;
    kernel->surfaceStateHeapDataSize = mockKernelImmData->kernelInfo->heapInfo.SurfaceStateHeapSize;
    kernel->surfaceStateHeapData.reset(new uint8_t[kernel->surfaceStateHeapDataSize]);

    EXPECT_TRUE(commandListImmediate->isFlushTaskSubmissionEnabled);
    EXPECT_TRUE(commandListImmediate->immediateCmdListHeapSharing);

    auto &cmdContainer = commandListImmediate->commandContainer;
    EXPECT_EQ(1u, cmdContainer.getNumIddPerBlock());
    EXPECT_TRUE(cmdContainer.immediateCmdListSharedHeap(HeapType::DYNAMIC_STATE));
    EXPECT_TRUE(cmdContainer.immediateCmdListSharedHeap(HeapType::SURFACE_STATE));

    auto &ultCsr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
    auto &csrStream = ultCsr.commandStream;

    const ze_group_count_t groupCount{1, 1, 1};
    CmdListKernelLaunchParams launchParams = {};
    auto result = ZE_RESULT_SUCCESS;

    auto csrDshHeap = &ultCsr.getIndirectHeap(HeapType::DYNAMIC_STATE, MemoryConstants::pageSize64k);
    auto csrSshHeap = &ultCsr.getIndirectHeap(HeapType::SURFACE_STATE, MemoryConstants::pageSize64k);

    size_t dshUsed = csrDshHeap->getUsed();
    size_t sshUsed = csrSshHeap->getUsed();
    size_t csrUsedBefore = csrStream.getUsed();
    // First dispatch: expected to program SBA against the CSR-shared heaps.
    result = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    size_t csrUsedAfter = csrStream.getUsed();

    // The command list's container must point at the CSR-owned heaps.
    NEO::IndirectHeap *containerDshHeap = cmdContainer.getIndirectHeap(HeapType::DYNAMIC_STATE);
    NEO::IndirectHeap *containerSshHeap = cmdContainer.getIndirectHeap(HeapType::SURFACE_STATE);
    if (dshPresent) {
        EXPECT_EQ(csrDshHeap, containerDshHeap);
    } else {
        EXPECT_EQ(nullptr, containerDshHeap);
    }
    EXPECT_EQ(csrSshHeap, containerSshHeap);

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList,
        ptrOffset(csrStream.getCpuBase(), csrUsedBefore),
        (csrUsedAfter - csrUsedBefore)));
    auto sbaCmds = findAll<STATE_BASE_ADDRESS *>(cmdList.begin(), cmdList.end());
    ASSERT_EQ(expectedSbaCount, sbaCmds.size());

    // SBA must carry the shared heaps' GPU base addresses.
    auto &sbaCmd = *genCmdCast<STATE_BASE_ADDRESS *>(*sbaCmds[0]);
    if (dshPresent) {
        EXPECT_TRUE(sbaCmd.getDynamicStateBaseAddressModifyEnable());
        EXPECT_EQ(csrDshHeap->getHeapGpuBase(), sbaCmd.getDynamicStateBaseAddress());
    } else {
        EXPECT_FALSE(sbaCmd.getDynamicStateBaseAddressModifyEnable());
        EXPECT_EQ(0u, sbaCmd.getDynamicStateBaseAddress());
    }
    EXPECT_TRUE(sbaCmd.getSurfaceStateBaseAddressModifyEnable());
    EXPECT_EQ(csrSshHeap->getHeapGpuBase(), sbaCmd.getSurfaceStateBaseAddress());

    // Heap consumption must be non-zero and within the per-kernel estimates.
    dshUsed = csrDshHeap->getUsed() - dshUsed;
    sshUsed = csrSshHeap->getUsed() - sshUsed;
    if (dshPresent) {
        EXPECT_LT(0u, dshUsed);
    } else {
        EXPECT_EQ(0u, dshUsed);
    }
    EXPECT_LT(0u, sshUsed);

    size_t dshEstimated = NEO::EncodeDispatchKernel<FamilyType>::getSizeRequiredDsh(*kernel->getImmutableData()->getKernelInfo());
    size_t sshEstimated = NEO::EncodeDispatchKernel<FamilyType>::getSizeRequiredSsh(*kernel->getImmutableData()->getKernelInfo());
    EXPECT_GE(dshEstimated, dshUsed);
    EXPECT_GE(sshEstimated, sshUsed);

    // Create a second (coexisting) immediate command list on the same CSR.
    ze_command_queue_desc_t queueDesc{};
    queueDesc.ordinal = 0u;
    queueDesc.index = 0u;
    queueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
    std::unique_ptr<L0::ult::CommandList> commandListImmediateCoexisting;
    commandListImmediateCoexisting.reset(whiteboxCast(CommandList::createImmediate(productFamily, device, &queueDesc, false, engineGroupType, result)));

    auto &cmdContainerCoexisting = commandListImmediateCoexisting->commandContainer;
    EXPECT_EQ(1u, cmdContainerCoexisting.getNumIddPerBlock());
    EXPECT_TRUE(cmdContainerCoexisting.immediateCmdListSharedHeap(HeapType::DYNAMIC_STATE));
    EXPECT_TRUE(cmdContainerCoexisting.immediateCmdListSharedHeap(HeapType::SURFACE_STATE));

    dshUsed = csrDshHeap->getUsed();
    sshUsed = csrSshHeap->getUsed();
    csrUsedBefore = csrStream.getUsed();
    // Second dispatch: must reuse the shared heaps without new SBA programming.
    result = commandListImmediateCoexisting->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    csrUsedAfter = csrStream.getUsed();

    auto containerDshHeapCoexisting = cmdContainerCoexisting.getIndirectHeap(HeapType::DYNAMIC_STATE);
    auto containerSshHeapCoexisting = cmdContainerCoexisting.getIndirectHeap(HeapType::SURFACE_STATE);
    if (dshPresent) {
        EXPECT_EQ(csrDshHeap, containerDshHeapCoexisting);
    } else {
        EXPECT_EQ(nullptr, containerDshHeapCoexisting);
    }
    EXPECT_EQ(csrSshHeap, containerSshHeapCoexisting);

    cmdList.clear();
    sbaCmds.clear();
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList,
        ptrOffset(csrStream.getCpuBase(), csrUsedBefore),
        (csrUsedAfter - csrUsedBefore)));
    // No new SBA command is expected for the coexisting command list.
    sbaCmds = findAll<STATE_BASE_ADDRESS *>(cmdList.begin(), cmdList.end());
    EXPECT_EQ(0u, sbaCmds.size());

    // The second dispatch still consumed space from the same shared heaps.
    dshUsed = csrDshHeap->getUsed() - dshUsed;
    sshUsed = csrSshHeap->getUsed() - sshUsed;
    if (dshPresent) {
        EXPECT_LT(0u, dshUsed);
    } else {
        EXPECT_EQ(0u, dshUsed);
    }
    EXPECT_LT(0u, sshUsed);
    EXPECT_GE(dshEstimated, dshUsed);
    EXPECT_GE(sshEstimated, sshUsed);
}
} // namespace ult
} // namespace L0

View File

@@ -15,6 +15,7 @@
#include "level_zero/core/source/cmdlist/cmdlist.h"
#include "level_zero/core/source/event/event.h"
#include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
#include "level_zero/core/test/unit_tests/fixtures/module_fixture.h"
#include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
#include "level_zero/core/test/unit_tests/mocks/mock_kernel.h"
#include "level_zero/core/test/unit_tests/sources/debugger/l0_debugger_fixture.h"
@@ -569,5 +570,64 @@ HWTEST2_F(L0DebuggerTest, givenXeHpOrXeHpgCoreAndDebugIsActiveThenDisableL3Cache
INSTANTIATE_TEST_CASE_P(SBAModesForDebugger, L0DebuggerParameterizedTests, ::testing::Values(0, 1));
// Exposes protected KernelImmutableData members so the test can inject its own
// kernel info, kernel descriptor and ISA allocation into a mock kernel.
struct MockKernelImmutableData : public KernelImmutableData {
    using KernelImmutableData::isaGraphicsAllocation;
    using KernelImmutableData::kernelDescriptor;
    using KernelImmutableData::kernelInfo;
    MockKernelImmutableData(L0::Device *device) : KernelImmutableData(device) {}
};
// Verifies that with flush-task submission and shared immediate command list
// heaps enabled, the debugger's reserved debug surface state at the base of the
// shared SSH is written on the first kernel append (new heap) and is NOT
// rewritten on a subsequent append that reuses the same heap.
HWTEST2_F(L0DebuggerTest, givenFlushTaskSubmissionAndSharedHeapsEnabledWhenAppendingKernelUsingNewHeapThenDebugSurfaceIsProgrammedOnce, IsAtLeastGen12lp) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    DebugManagerStateRestore restorer;
    NEO::DebugManager.flags.EnableFlushTaskSubmission.set(true);
    NEO::DebugManager.flags.EnableImmediateCmdListHeapSharing.set(1);

    ze_command_queue_desc_t queueDesc = {};
    ze_result_t returnValue = ZE_RESULT_SUCCESS;
    auto commandList = CommandList::createImmediate(productFamily, device, &queueDesc, false, NEO::EngineGroupType::RenderCompute, returnValue);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    EXPECT_TRUE(commandList->isFlushTaskSubmissionEnabled);
    EXPECT_TRUE(commandList->immediateCmdListHeapSharing);

    // Build a minimal kernel backed by mock immutable data.
    auto kernelInfo = std::make_unique<NEO::KernelInfo>();
    auto kernelDescriptor = std::make_unique<NEO::KernelDescriptor>();
    auto kernelImmData = std::make_unique<MockKernelImmutableData>(device);
    kernelImmData->kernelInfo = kernelInfo.get();
    kernelImmData->kernelDescriptor = kernelDescriptor.get();
    kernelImmData->isaGraphicsAllocation.reset(new MockGraphicsAllocation());
    Mock<::L0::Kernel> kernel;
    kernel.kernelImmData = kernelImmData.get();

    CmdListKernelLaunchParams launchParams = {};
    ze_group_count_t groupCount{1, 1, 1};
    // First append: the shared SSH becomes active and the debug surface state
    // is expected to be encoded at the reserved slot at the heap's CPU base.
    returnValue = commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, 0, nullptr, launchParams);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);

    auto csrHeap = &commandList->csr->getIndirectHeap(NEO::HeapType::SURFACE_STATE, 0);
    ASSERT_NE(nullptr, csrHeap);
    auto debugSurfaceState = reinterpret_cast<RENDER_SURFACE_STATE *>(csrHeap->getCpuBase());
    ASSERT_NE(debugSurfaceState, nullptr);
    auto debugSurface = static_cast<::L0::DeviceImp *>(device)->getDebugSurface();
    ASSERT_NE(debugSurface, nullptr);
    ASSERT_EQ(debugSurface->getGpuAddress(), debugSurfaceState->getSurfaceBaseAddress());

    // Clear the reserved slot; a second append reusing the same (non-dirty)
    // heap must not rewrite the debug surface state.
    memset(debugSurfaceState, 0, sizeof(*debugSurfaceState));
    returnValue = commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, 0, nullptr, launchParams);
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    ASSERT_EQ(0u, debugSurfaceState->getSurfaceBaseAddress());

    // Release the mock ISA allocation before destroying the command list.
    kernelImmData->isaGraphicsAllocation.reset(nullptr);
    commandList->destroy();
}
} // namespace ult
} // namespace L0

View File

@@ -96,6 +96,9 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat
if (!hardwareInfo.capabilityTable.supportsImages && IndirectHeap::Type::DYNAMIC_STATE == i) {
continue;
}
if (immediateCmdListSharedHeap(static_cast<HeapType>(i))) {
continue;
}
allocationIndirectHeaps[i] = heapHelper->getHeapAllocation(i,
heapSize,
alignedSize,
@@ -185,32 +188,40 @@ void *CommandContainer::getHeapSpaceAllowGrow(HeapType heapType,
size_t size) {
auto indirectHeap = getIndirectHeap(heapType);
if (indirectHeap->getAvailableSpace() < size) {
size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace();
newSize *= 2;
newSize = std::max(newSize, indirectHeap->getAvailableSpace() + size);
newSize = alignUp(newSize, MemoryConstants::pageSize);
auto oldAlloc = getIndirectHeapAllocation(heapType);
auto newAlloc = getHeapHelper()->getHeapAllocation(heapType, newSize, MemoryConstants::pageSize, device->getRootDeviceIndex());
UNRECOVERABLE_IF(!oldAlloc);
UNRECOVERABLE_IF(!newAlloc);
auto oldBase = indirectHeap->getHeapGpuBase();
indirectHeap->replaceGraphicsAllocation(newAlloc);
indirectHeap->replaceBuffer(newAlloc->getUnderlyingBuffer(),
newAlloc->getUnderlyingBufferSize());
auto newBase = indirectHeap->getHeapGpuBase();
getResidencyContainer().push_back(newAlloc);
getDeallocationContainer().push_back(oldAlloc);
setIndirectHeapAllocation(heapType, newAlloc);
if (oldBase != newBase) {
setHeapDirty(heapType);
if (immediateCmdListSharedHeap(heapType)) {
UNRECOVERABLE_IF(indirectHeap == nullptr);
UNRECOVERABLE_IF(indirectHeap->getAvailableSpace() < size);
getResidencyContainer().push_back(indirectHeap->getGraphicsAllocation());
} else {
if (indirectHeap->getAvailableSpace() < size) {
size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace();
newSize *= 2;
newSize = std::max(newSize, indirectHeap->getAvailableSpace() + size);
newSize = alignUp(newSize, MemoryConstants::pageSize);
auto oldAlloc = getIndirectHeapAllocation(heapType);
auto newAlloc = getHeapHelper()->getHeapAllocation(heapType, newSize, MemoryConstants::pageSize, device->getRootDeviceIndex());
UNRECOVERABLE_IF(!oldAlloc);
UNRECOVERABLE_IF(!newAlloc);
auto oldBase = indirectHeap->getHeapGpuBase();
indirectHeap->replaceGraphicsAllocation(newAlloc);
indirectHeap->replaceBuffer(newAlloc->getUnderlyingBuffer(),
newAlloc->getUnderlyingBufferSize());
auto newBase = indirectHeap->getHeapGpuBase();
getResidencyContainer().push_back(newAlloc);
getDeallocationContainer().push_back(oldAlloc);
setIndirectHeapAllocation(heapType, newAlloc);
if (oldBase != newBase) {
setHeapDirty(heapType);
}
}
}
return indirectHeap->getSpace(size);
}
IndirectHeap *CommandContainer::getHeapWithRequiredSizeAndAlignment(HeapType heapType, size_t sizeRequired, size_t alignment) {
auto indirectHeap = getIndirectHeap(heapType);
UNRECOVERABLE_IF(indirectHeap == nullptr);
auto sizeRequested = sizeRequired;
auto heapBuffer = indirectHeap->getSpace(0);
@@ -218,27 +229,32 @@ IndirectHeap *CommandContainer::getHeapWithRequiredSizeAndAlignment(HeapType hea
sizeRequested += alignment;
}
if (indirectHeap->getAvailableSpace() < sizeRequested) {
size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace();
newSize = alignUp(newSize, MemoryConstants::pageSize);
auto oldAlloc = getIndirectHeapAllocation(heapType);
auto newAlloc = getHeapHelper()->getHeapAllocation(heapType, newSize, MemoryConstants::pageSize, device->getRootDeviceIndex());
UNRECOVERABLE_IF(!oldAlloc);
UNRECOVERABLE_IF(!newAlloc);
auto oldBase = indirectHeap->getHeapGpuBase();
indirectHeap->replaceGraphicsAllocation(newAlloc);
indirectHeap->replaceBuffer(newAlloc->getUnderlyingBuffer(),
newAlloc->getUnderlyingBufferSize());
auto newBase = indirectHeap->getHeapGpuBase();
getResidencyContainer().push_back(newAlloc);
getDeallocationContainer().push_back(oldAlloc);
setIndirectHeapAllocation(heapType, newAlloc);
if (oldBase != newBase) {
setHeapDirty(heapType);
}
if (heapType == HeapType::SURFACE_STATE) {
indirectHeap->getSpace(reservedSshSize);
sshAllocations.push_back(oldAlloc);
if (immediateCmdListSharedHeap(heapType)) {
UNRECOVERABLE_IF(indirectHeap->getAvailableSpace() < sizeRequested);
getResidencyContainer().push_back(indirectHeap->getGraphicsAllocation());
} else {
if (indirectHeap->getAvailableSpace() < sizeRequested) {
size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace();
newSize = alignUp(newSize, MemoryConstants::pageSize);
auto oldAlloc = getIndirectHeapAllocation(heapType);
auto newAlloc = getHeapHelper()->getHeapAllocation(heapType, newSize, MemoryConstants::pageSize, device->getRootDeviceIndex());
UNRECOVERABLE_IF(!oldAlloc);
UNRECOVERABLE_IF(!newAlloc);
auto oldBase = indirectHeap->getHeapGpuBase();
indirectHeap->replaceGraphicsAllocation(newAlloc);
indirectHeap->replaceBuffer(newAlloc->getUnderlyingBuffer(),
newAlloc->getUnderlyingBufferSize());
auto newBase = indirectHeap->getHeapGpuBase();
getResidencyContainer().push_back(newAlloc);
getDeallocationContainer().push_back(oldAlloc);
setIndirectHeapAllocation(heapType, newAlloc);
if (oldBase != newBase) {
setHeapDirty(heapType);
}
if (heapType == HeapType::SURFACE_STATE) {
indirectHeap->getSpace(reservedSshSize);
sshAllocations.push_back(oldAlloc);
}
}
}
@@ -329,7 +345,19 @@ void CommandContainer::prepareBindfulSsh() {
}
IndirectHeap *CommandContainer::getIndirectHeap(HeapType heapType) {
return indirectHeaps[heapType].get();
if (immediateCmdListSharedHeap(heapType)) {
return heapType == HeapType::SURFACE_STATE ? sharedSshCsrHeap : sharedDshCsrHeap;
} else {
return indirectHeaps[heapType].get();
}
}
// Binds the shared, CSR-owned indirect heaps used by immediate command lists.
// The CSR hands out its current SSH/DSH sized for the requested amounts, so
// every immediate command list on the same CSR consumes the same allocations.
// SSH is always (re)bound, even for a zero request; DSH is bound on demand
// only (dshRequiredSize > 0). Precondition: immediateCmdListCsr is set via
// setImmediateCmdListCsr() before calling.
void CommandContainer::ensureHeapSizePrepared(size_t sshRequiredSize, size_t dshRequiredSize) {
    auto *csr = this->immediateCmdListCsr;
    this->sharedSshCsrHeap = &csr->getIndirectHeap(HeapType::SURFACE_STATE, sshRequiredSize);
    if (dshRequiredSize > 0) {
        this->sharedDshCsrHeap = &csr->getIndirectHeap(HeapType::DYNAMIC_STATE, dshRequiredSize);
    }
}
} // namespace NEO

View File

@@ -17,6 +17,7 @@
#include <vector>
namespace NEO {
class CommandStreamReceiver;
class Device;
class GraphicsAllocation;
class LinearStream;
@@ -94,13 +95,20 @@ class CommandContainer : public NonCopyableOrMovableClass {
void setIddBlock(void *iddBlock) { this->iddBlock = iddBlock; }
void *getIddBlock() { return iddBlock; }
uint32_t getNumIddPerBlock() const { return numIddsPerBlock; }
void setNumIddPerBlock(uint32_t value) { numIddsPerBlock = value; }
void setReservedSshSize(size_t reserveSize) {
reservedSshSize = reserveSize;
}
bool getFlushTaskUsedForImmediate() const { return isFlushTaskUsedForImmediate; }
void setFlushTaskUsedForImmediate(bool flushTaskUsedForImmediate) { isFlushTaskUsedForImmediate = flushTaskUsedForImmediate; }
void setImmediateCmdListCsr(CommandStreamReceiver *newValue) {
this->immediateCmdListCsr = newValue;
}
bool immediateCmdListSharedHeap(HeapType heapType) {
return (this->immediateCmdListCsr != nullptr && (heapType == HeapType::DYNAMIC_STATE || heapType == HeapType::SURFACE_STATE));
}
void ensureHeapSizePrepared(size_t sshRequiredSize, size_t dshRequiredSize);
HeapContainer sshAllocations;
uint64_t currentLinearStreamStartOffset = 0u;
uint32_t slmSize = std::numeric_limits<uint32_t>::max();
@@ -129,6 +137,9 @@ class CommandContainer : public NonCopyableOrMovableClass {
Device *device = nullptr;
AllocationsList *reusableAllocationList = nullptr;
size_t reservedSshSize = 0;
CommandStreamReceiver *immediateCmdListCsr = nullptr;
IndirectHeap *sharedSshCsrHeap = nullptr;
IndirectHeap *sharedDshCsrHeap = nullptr;
uint32_t dirtyHeaps = std::numeric_limits<uint32_t>::max();
uint32_t numIddsPerBlock = 64;

View File

@@ -30,6 +30,7 @@ class IndirectHeap;
class LogicalStateHelper;
class Gmm;
struct HardwareInfo;
struct KernelInfo;
struct StateComputeModeProperties;
struct EncodeDispatchKernelArgs {
@@ -112,6 +113,10 @@ struct EncodeDispatchKernel {
static void adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const HardwareInfo &hwInfo);
static constexpr bool shouldUpdateGlobalAtomics(bool &currentVal, bool refVal, bool updateCurrent);
static size_t getSizeRequiredDsh(const KernelInfo &kernelInfo);
static size_t getSizeRequiredSsh(const KernelInfo &kernelInfo);
inline static uint32_t additionalSizeRequiredDsh();
};
template <typename GfxFamily>
@@ -121,8 +126,8 @@ struct EncodeStates {
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
using SAMPLER_BORDER_COLOR_STATE = typename GfxFamily::SAMPLER_BORDER_COLOR_STATE;
static const uint32_t alignIndirectStatePointer = MemoryConstants::cacheLineSize;
static const size_t alignInterfaceDescriptorData = MemoryConstants::cacheLineSize;
static constexpr uint32_t alignIndirectStatePointer = MemoryConstants::cacheLineSize;
static constexpr size_t alignInterfaceDescriptorData = MemoryConstants::cacheLineSize;
static uint32_t copySamplerState(IndirectHeap *dsh,
uint32_t samplerStateOffset,

View File

@@ -27,6 +27,7 @@
#include "shared/source/kernel/implicit_args.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/source/program/kernel_info.h"
#include "encode_surface_state.inl"
@@ -698,6 +699,39 @@ void EncodeDispatchKernel<Family>::adjustInterfaceDescriptorData(INTERFACE_DESCR
template <typename Family>
constexpr bool EncodeDispatchKernel<Family>::shouldUpdateGlobalAtomics(bool &currentVal, bool refVal, bool updateCurrent) { return false; }
/// Computes the dynamic state heap space a kernel dispatch needs: border
/// color data followed by the sampler state array (when samplers are
/// present), plus any platform-specific extra portion reported by
/// additionalSizeRequiredDsh(), each rounded up to its required alignment.
template <typename Family>
size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelInfo &kernelInfo) {
    using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA;
    const auto &samplerTable = kernelInfo.kernelDescriptor.payloadMappings.samplerTable;
    const auto extraDshSize = additionalSizeRequiredDsh();

    // Without samplers only the platform-specific extra portion is consumed.
    if (samplerTable.numSamplers == 0U) {
        return alignUp(extraDshSize, EncodeStates<Family>::alignInterfaceDescriptorData);
    }

    // Border color data precedes the sampler table inside the DSH.
    size_t dshSize = samplerTable.tableOffset - samplerTable.borderColor;
    dshSize = alignUp(dshSize, EncodeStates<Family>::alignIndirectStatePointer);
    dshSize += samplerTable.numSamplers * sizeof(typename Family::SAMPLER_STATE);
    dshSize = alignUp(dshSize, INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);

    if (extraDshSize > 0) {
        dshSize = alignUp(dshSize + extraDshSize, EncodeStates<Family>::alignInterfaceDescriptorData);
    }
    return dshSize;
}
/// Returns the surface state heap space a kernel dispatch needs: the
/// kernel's SSH payload rounded up to the binding-table pointer alignment.
template <typename Family>
size_t EncodeDispatchKernel<Family>::getSizeRequiredSsh(const KernelInfo &kernelInfo) {
    using BINDING_TABLE_STATE = typename Family::BINDING_TABLE_STATE;
    const size_t rawSshSize = kernelInfo.heapInfo.SurfaceStateHeapSize;
    return alignUp(rawSshSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
}
template <typename Family>
void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws) {
for (int i = 0; i < 3; ++i) {

View File

@@ -104,13 +104,13 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, args.preemptionMode);
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? args.device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
UNRECOVERABLE_IF(!heap);
uint32_t samplerStateOffset = 0;
uint32_t samplerCount = 0;
if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? args.device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
UNRECOVERABLE_IF(!heap);
samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
samplerStateOffset = EncodeStates<Family>::copySamplerState(heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
kernelDescriptor.payloadMappings.samplerTable.numSamplers,
@@ -539,4 +539,9 @@ void EncodeDispatchKernel<Family>::setupPostSyncMocs(WALKER_TYPE &walkerCmd, con
template <typename Family>
void EncodeDispatchKernel<Family>::adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const HardwareInfo &hwInfo) {}
// On this family the interface descriptor data lives in the DSH, so each
// dispatch must reserve room for one INTERFACE_DESCRIPTOR_DATA in addition
// to sampler/border-color state (consumed by getSizeRequiredDsh()).
template <typename Family>
uint32_t EncodeDispatchKernel<Family>::additionalSizeRequiredDsh() {
    return sizeof(typename Family::INTERFACE_DESCRIPTOR_DATA);
}
} // namespace NEO

View File

@@ -127,13 +127,13 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
PreemptionHelper::programInterfaceDescriptorDataPreemption<Family>(&idd, args.preemptionMode);
if constexpr (Family::supportsSampler) {
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? args.device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
UNRECOVERABLE_IF(!heap);
uint32_t samplerStateOffset = 0;
uint32_t samplerCount = 0;
if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
auto heap = ApiSpecificConfig::getBindlessConfiguration() ? args.device->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH) : container.getIndirectHeap(HeapType::DYNAMIC_STATE);
UNRECOVERABLE_IF(!heap);
samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
samplerStateOffset = EncodeStates<Family>::copySamplerState(
heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
@@ -768,4 +768,9 @@ inline void EncodeStoreMMIO<Family>::appendFlags(MI_STORE_REGISTER_MEM *storeReg
template <typename Family>
void EncodeDispatchKernel<Family>::adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const HardwareInfo &hwInfo) {}
// This family requires no extra DSH space beyond sampler/border-color state;
// the interface descriptor is not allocated from the DSH here (contrast with
// the variant returning sizeof(INTERFACE_DESCRIPTOR_DATA)).
template <typename Family>
uint32_t EncodeDispatchKernel<Family>::additionalSizeRequiredDsh() {
    return 0u;
}
} // namespace NEO

View File

@@ -140,6 +140,10 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
}
void initializeDeviceWithFirstSubmission() override;
// Returns a reference to the SSH base-address dirty-state tracker.
// Callers (e.g. tests) can copy it to probe updateAndCheck() behavior
// without mutating the receiver's own tracked state.
HeapDirtyState &getSshState() {
    return sshState;
}
protected:
void programPreemption(LinearStream &csr, DispatchFlags &dispatchFlags);
void programL3(LinearStream &csr, uint32_t &newL3Config);

View File

@@ -342,7 +342,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
if (stallingCommandsOnNextFlushRequired) {
programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags);
}
const bool hasDsh = hwInfo.capabilityTable.supportsImages;
const bool hasDsh = hwInfo.capabilityTable.supportsImages && dsh != nullptr;
bool dshDirty = hasDsh ? dshState.updateAndCheck(dsh) : false;
bool iohDirty = iohState.updateAndCheck(ioh);
bool sshDirty = sshState.updateAndCheck(ssh);

View File

@@ -407,6 +407,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableDrmCompletionFence, -1, "Enables DRM compl
DECLARE_DEBUG_VARIABLE(int32_t, UseDrmCompletionFenceForAllAllocations, -1, "Uses DRM completion fence for all allocations, -1:default (disabled), 0:disable, 1:enable")
DECLARE_DEBUG_VARIABLE(int32_t, EnableChipsetUniqueUUID, -1, "Enables retrieving chipset unique UUID using telemetry, -1:default (disabled), 0:disable, 1:enable")
DECLARE_DEBUG_VARIABLE(int32_t, EnableFlushTaskSubmission, -1, "Driver uses csr flushTask for immediate commandlist submissions, -1:default (enabled), 0:disabled, 1:enabled")
DECLARE_DEBUG_VARIABLE(int32_t, EnableImmediateCmdListHeapSharing, -1, "Immediate command lists using flush task use current csr heap instead private cmd list heap, -1:default (disabled), 0:disabled, 1:enabled")
DECLARE_DEBUG_VARIABLE(int32_t, EnableBcsSwControlWa, -1, "Enable BCS WA via BCSSWCONTROL MMIO. -1: default, 0: disabled, 1: if src in system mem, 2: if dst in system mem, 3: if src and dst in system mem, 4: always")
/* IMPLICIT SCALING */

View File

@@ -91,6 +91,7 @@ struct UnitTestHelper {
static bool getDisableFusionStateFromFrontEndCommand(const typename GfxFamily::VFE_STATE_TYPE &feCmd);
static bool getComputeDispatchAllWalkerFromFrontEndCommand(const typename GfxFamily::VFE_STATE_TYPE &feCmd);
static bool getSystolicFlagValueFromPipelineSelectCommand(const typename GfxFamily::PIPELINE_SELECT &pipelineSelectCmd);
static size_t getAdditionalDshSize();
};
} // namespace NEO

View File

@@ -92,4 +92,9 @@ bool UnitTestHelper<GfxFamily>::getSystolicFlagValueFromPipelineSelectCommand(co
return false;
}
// Test helper mirroring EncodeDispatchKernel::additionalSizeRequiredDsh() for
// families whose interface descriptor data is placed in the DSH: reports one
// INTERFACE_DESCRIPTOR_DATA of extra space so size expectations match.
template <typename GfxFamily>
size_t UnitTestHelper<GfxFamily>::getAdditionalDshSize() {
    return sizeof(typename GfxFamily::INTERFACE_DESCRIPTOR_DATA);
}
} // namespace NEO

View File

@@ -121,4 +121,9 @@ bool UnitTestHelper<GfxFamily>::getSystolicFlagValueFromPipelineSelectCommand(co
return pipelineSelectCmd.getSystolicModeEnable();
}
// Test helper mirroring EncodeDispatchKernel::additionalSizeRequiredDsh() for
// families that need no extra DSH space beyond sampler state.
template <typename GfxFamily>
size_t UnitTestHelper<GfxFamily>::getAdditionalDshSize() {
    return 0;
}
} // namespace NEO

View File

@@ -172,6 +172,7 @@ EnableUsmConcurrentAccessSupport = 0
EnableSharedSystemUsmSupport = -1
EnablePassInlineData = -1
ForceFineGrainedSVMSupport = -1
EnableImmediateCmdListHeapSharing = -1
ForcePipeSupport = -1
ForceSystemMemoryPlacement = 0
ForceNonSystemMemoryPlacement = 0

View File

@@ -18,26 +18,17 @@ using namespace NEO;
constexpr uint32_t defaultNumIddsPerBlock = 64;
class CommandContainerTest : public DeviceFixture,
public ::testing::Test {
using CommandContainerFixture = DeviceFixture;
using CommandContainerTest = Test<CommandContainerFixture>;
class MyMockCommandContainer : public CommandContainer {
public:
void SetUp() override {
::testing::Test::SetUp();
DeviceFixture::setUp();
}
void TearDown() override {
DeviceFixture::tearDown();
::testing::Test::TearDown();
}
using CommandContainer::allocationIndirectHeaps;
using CommandContainer::dirtyHeaps;
using CommandContainer::getTotalCmdBufferSize;
};
struct CommandContainerHeapStateTests : public ::testing::Test {
class MyMockCommandContainer : public CommandContainer {
public:
using CommandContainer::dirtyHeaps;
};
MyMockCommandContainer myCommandContainer;
};
@@ -795,23 +786,18 @@ TEST_F(CommandContainerTest, givenCmdContainerWhenContainerIsInitializedThenStre
TEST_F(CommandContainerTest, GivenCmdContainerAndDebugFlagWhenContainerIsInitializedThenStreamSizeEqualsAlignedTotalCmdBuffSizeDecreasedOfReservedSize) {
DebugManagerStateRestore restorer;
class MyCommandContainer : public CommandContainer {
public:
using CommandContainer::getTotalCmdBufferSize;
};
DebugManager.flags.OverrideCmdListCmdBufferSizeInKb.set(0);
MyCommandContainer cmdContainer;
MyMockCommandContainer cmdContainer;
cmdContainer.initialize(pDevice, nullptr, true);
size_t alignedSize = alignUp<size_t>(cmdContainer.getTotalCmdBufferSize(), MemoryConstants::pageSize64k);
EXPECT_EQ(cmdContainer.getCommandStream()->getMaxAvailableSpace(), alignedSize - MyCommandContainer::cmdBufferReservedSize);
EXPECT_EQ(cmdContainer.getCommandStream()->getMaxAvailableSpace(), alignedSize - MyMockCommandContainer::cmdBufferReservedSize);
auto newSizeInKB = 512;
DebugManager.flags.OverrideCmdListCmdBufferSizeInKb.set(newSizeInKB);
MyCommandContainer cmdContainer2;
MyMockCommandContainer cmdContainer2;
cmdContainer2.initialize(pDevice, nullptr, true);
alignedSize = alignUp<size_t>(cmdContainer.getTotalCmdBufferSize(), MemoryConstants::pageSize64k);
EXPECT_EQ(cmdContainer2.getCommandStream()->getMaxAvailableSpace(), alignedSize - MyCommandContainer::cmdBufferReservedSize);
EXPECT_EQ(cmdContainer2.getCommandStream()->getMaxAvailableSpace(), alignedSize - MyMockCommandContainer::cmdBufferReservedSize);
}
TEST_F(CommandContainerTest, givenCmdContainerWhenAlocatingNextCmdBufferThenStreamSizeEqualAlignedTotalCmdBuffSizeDecreasedOfReservedSize) {
@@ -841,15 +827,68 @@ TEST_F(CommandContainerTest, givenCmdContainerWhenCloseAndAllocateNextCommandBuf
}
TEST_F(CommandContainerTest, GivenCmdContainerWhenContainerIsInitializedThenSurfaceStateIndirectHeapSizeIsCorrect) {
class MyCommandContainer : public CommandContainer {
public:
using CommandContainer::allocationIndirectHeaps;
};
MyCommandContainer cmdContainer;
MyMockCommandContainer cmdContainer;
cmdContainer.initialize(pDevice, nullptr, true);
auto size = cmdContainer.allocationIndirectHeaps[IndirectHeap::Type::SURFACE_STATE]->getUnderlyingBufferSize();
constexpr size_t expectedHeapSize = MemoryConstants::pageSize64k;
EXPECT_EQ(expectedHeapSize, size);
}
// Lifecycle test for shared-CSR-heap mode: with an immediate CSR attached,
// the container exposes no private DSH/SSH. Before ensureHeapSizePrepared()
// the heap getters return nullptr and space requests are unrecoverable;
// after preparation, space is carved sequentially from the CSR-owned heaps,
// and exhausting a shared heap throws instead of growing it (growth must go
// through a new ensureHeapSizePrepared() call).
TEST_F(CommandContainerTest, givenCmdContainerHasImmediateCsrWhenGettingHeapWithoutEnsuringSpaceThenExpectNullptrReturnedOrUnrecoverable) {
    CommandContainer cmdContainer;
    cmdContainer.setImmediateCmdListCsr(pDevice->getDefaultEngine().commandStreamReceiver);
    cmdContainer.setNumIddPerBlock(1);

    auto code = cmdContainer.initialize(pDevice, nullptr, true);
    EXPECT_EQ(CommandContainer::ErrorCode::SUCCESS, code);

    // Shared heaps not bound yet: getters are null, space requests throw.
    EXPECT_EQ(nullptr, cmdContainer.getIndirectHeap(HeapType::DYNAMIC_STATE));
    EXPECT_EQ(nullptr, cmdContainer.getIndirectHeap(HeapType::SURFACE_STATE));

    EXPECT_THROW(cmdContainer.getHeapSpaceAllowGrow(HeapType::DYNAMIC_STATE, 0), std::exception);
    EXPECT_THROW(cmdContainer.getHeapWithRequiredSizeAndAlignment(HeapType::DYNAMIC_STATE, 0, 0), std::exception);
    EXPECT_THROW(cmdContainer.getHeapSpaceAllowGrow(HeapType::SURFACE_STATE, 0), std::exception);
    EXPECT_THROW(cmdContainer.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, 0, 0), std::exception);

    // Zero-size prepare binds SSH only; DSH stays unbound and keeps throwing.
    cmdContainer.ensureHeapSizePrepared(0, 0);
    EXPECT_EQ(nullptr, cmdContainer.getIndirectHeap(HeapType::DYNAMIC_STATE));
    EXPECT_NE(nullptr, cmdContainer.getIndirectHeap(HeapType::SURFACE_STATE));

    EXPECT_THROW(cmdContainer.getHeapSpaceAllowGrow(HeapType::DYNAMIC_STATE, 0), std::exception);
    EXPECT_THROW(cmdContainer.getHeapWithRequiredSizeAndAlignment(HeapType::DYNAMIC_STATE, 0, 0), std::exception);
    EXPECT_NO_THROW(cmdContainer.getHeapSpaceAllowGrow(HeapType::SURFACE_STATE, 0));
    EXPECT_NO_THROW(cmdContainer.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, 0, 0));

    // Non-zero prepare binds both shared heaps.
    cmdContainer.ensureHeapSizePrepared(4 * MemoryConstants::kiloByte, 4 * MemoryConstants::kiloByte);
    auto dshHeap = cmdContainer.getIndirectHeap(HeapType::DYNAMIC_STATE);
    EXPECT_NE(nullptr, dshHeap);
    auto sshHeap = cmdContainer.getIndirectHeap(HeapType::SURFACE_STATE);
    EXPECT_NE(nullptr, sshHeap);

    // Space is carved from the current heap offsets, in order.
    size_t sizeUsedDsh = dshHeap->getUsed();
    size_t sizeUsedSsh = sshHeap->getUsed();

    void *dshPtr = cmdContainer.getHeapSpaceAllowGrow(HeapType::DYNAMIC_STATE, 64);
    void *sshPtr = cmdContainer.getHeapSpaceAllowGrow(HeapType::SURFACE_STATE, 64);

    EXPECT_EQ(ptrOffset(dshHeap->getCpuBase(), sizeUsedDsh), dshPtr);
    EXPECT_EQ(ptrOffset(sshHeap->getCpuBase(), sizeUsedSsh), sshPtr);

    // Aligned requests resolve to the same shared heap objects.
    auto alignedHeapDsh = cmdContainer.getHeapWithRequiredSizeAndAlignment(HeapType::DYNAMIC_STATE, 128, 128);
    auto alignedHeapSsh = cmdContainer.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, 128, 128);

    EXPECT_EQ(dshHeap, alignedHeapDsh);
    EXPECT_EQ(sshHeap, alignedHeapSsh);

    // Nearly exhaust both heaps: further requests must throw, not grow.
    dshHeap->getSpace(dshHeap->getAvailableSpace() - 32);
    sshHeap->getSpace(sshHeap->getAvailableSpace() - 32);

    EXPECT_THROW(cmdContainer.getHeapSpaceAllowGrow(HeapType::DYNAMIC_STATE, 64), std::exception);
    EXPECT_THROW(cmdContainer.getHeapWithRequiredSizeAndAlignment(HeapType::DYNAMIC_STATE, 64, 64), std::exception);
    EXPECT_THROW(cmdContainer.getHeapSpaceAllowGrow(HeapType::SURFACE_STATE, 64), std::exception);
    EXPECT_THROW(cmdContainer.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, 64, 64), std::exception);
}

View File

@@ -2124,3 +2124,27 @@ using SystolicSupport = IsAnyProducts<IGFX_ALDERLAKE_P, IGFX_XE_HP_SDV, IGFX_DG2
HWTEST2_F(CommandStreamReceiverSystolicTests, givenSystolicModeChangedWhenFlushTaskCalledThenSystolicStateIsUpdated, SystolicSupport) {
testBody<FamilyType>();
}
// Verifies getSshState() returns the tracker by value-copyable state: a copy
// sees a fresh heap as dirty once, then clean; and a second snapshot taken
// afterwards is again dirty for the same heap, proving the first copy's
// updateAndCheck() calls did not mutate the CSR's own tracked SSH state.
HWTEST_F(CommandStreamReceiverTest, givenSshDirtyStateWhenUpdatingStateWithNewHeapThenExpectDirtyStateTrue) {
    MockGraphicsAllocation allocation{};
    allocation.gpuAddress = 0xABC000;
    allocation.size = 0x1000;
    IndirectHeap dummyHeap(&allocation, false);

    // First copy: new heap is dirty on first check, clean on repeat.
    auto dirtyStateCopy = static_cast<CommandStreamReceiverHw<FamilyType> *>(commandStreamReceiver)->getSshState();

    bool check = dirtyStateCopy.updateAndCheck(&dummyHeap);
    EXPECT_TRUE(check);

    check = dirtyStateCopy.updateAndCheck(&dummyHeap);
    EXPECT_FALSE(check);

    // Second snapshot: CSR state was untouched, so the heap is dirty again.
    auto dirtyState = static_cast<CommandStreamReceiverHw<FamilyType> *>(commandStreamReceiver)->getSshState();

    check = dirtyState.updateAndCheck(&dummyHeap);
    EXPECT_TRUE(check);

    check = dirtyState.updateAndCheck(&dummyHeap);
    EXPECT_FALSE(check);
}

View File

@@ -17,6 +17,7 @@
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/helpers/gtest_helpers.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/test_macros/test.h"
#include "shared/test/unit_test/fixtures/command_container_fixture.h"
@@ -1324,3 +1325,67 @@ HWTEST_F(BindlessCommandEncodeStatesTest, givenBindlessModeDisabledelWithSampler
EXPECT_EQ(std::find(cmdContainer->getResidencyContainer().begin(), cmdContainer->getResidencyContainer().end(), pDevice->getBindlessHeapsHelper()->getHeap(BindlessHeapsHelper::GLOBAL_DSH)->getGraphicsAllocation()), cmdContainer->getResidencyContainer().end());
}
// Checks getSizeRequiredDsh() against independently computed expectations for
// three sampler-table shapes: no samplers, samplers without border color data,
// and samplers preceded by border color data. Platforms that keep interface
// descriptor data in the DSH (getAdditionalDshSize() > 0) add that on top.
HWTEST_F(CommandEncodeStatesTest, givenKernelInfoWhenGettingRequiredDshSpaceThenReturnCorrectValues) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using SAMPLER_STATE = typename FamilyType::SAMPLER_STATE;

    size_t additionalSize = UnitTestHelper<FamilyType>::getAdditionalDshSize();
    size_t expectedSize = alignUp(additionalSize, EncodeStates<FamilyType>::alignInterfaceDescriptorData);

    // Case 1: no samplers — only the platform-specific portion (may be zero).
    kernelInfo.kernelDescriptor.payloadMappings.samplerTable.numSamplers = 0;
    size_t size = EncodeDispatchKernel<FamilyType>::getSizeRequiredDsh(kernelInfo);
    EXPECT_EQ(expectedSize, size);

    // Case 2: two samplers, no border color state (tableOffset == borderColor).
    kernelInfo.kernelDescriptor.payloadMappings.samplerTable.numSamplers = 2;
    kernelInfo.kernelDescriptor.payloadMappings.samplerTable.tableOffset = 0;
    kernelInfo.kernelDescriptor.payloadMappings.samplerTable.borderColor = 0;
    // Sampler array aligned to the sampler-state-pointer requirement.
    size_t alignedSamplers = alignUp(2 * sizeof(SAMPLER_STATE), INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
    // Platforms with DSH-resident interface descriptors append one IDD.
    if (additionalSize > 0) {
        expectedSize = alignUp(alignedSamplers + additionalSize, EncodeStates<FamilyType>::alignInterfaceDescriptorData);
    } else {
        expectedSize = alignedSamplers;
    }
    size = EncodeDispatchKernel<FamilyType>::getSizeRequiredDsh(kernelInfo);
    EXPECT_EQ(expectedSize, size);

    // Case 3: three samplers preceded by 32 bytes of border color state.
    kernelInfo.kernelDescriptor.payloadMappings.samplerTable.numSamplers = 3;
    kernelInfo.kernelDescriptor.payloadMappings.samplerTable.tableOffset = 32;
    // Border color state aligned first, then the sampler array on top.
    alignedSamplers = alignUp(alignUp(32, EncodeStates<FamilyType>::alignIndirectStatePointer) + 3 * sizeof(SAMPLER_STATE), INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
    // Platforms with DSH-resident interface descriptors append one IDD.
    if (additionalSize > 0) {
        expectedSize = alignUp(alignedSamplers + additionalSize, EncodeStates<FamilyType>::alignInterfaceDescriptorData);
    } else {
        expectedSize = alignedSamplers;
    }
    size = EncodeDispatchKernel<FamilyType>::getSizeRequiredDsh(kernelInfo);
    EXPECT_EQ(expectedSize, size);
}
// Checks getSizeRequiredSsh(): zero for an empty surface state heap, and the
// raw SSH payload aligned to the surface-state-pointer requirement otherwise.
HWTEST_F(CommandEncodeStatesTest, givenKernelInfoWhenGettingRequiredSshSpaceThenReturnCorrectValues) {
    using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE;
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    // Case 1: no surface states at all.
    kernelInfo.heapInfo.SurfaceStateHeapSize = 0;
    size_t size = EncodeDispatchKernel<FamilyType>::getSizeRequiredSsh(kernelInfo);
    EXPECT_EQ(0u, size);

    // Case 2: two surface states plus two binding-table indices, aligned up.
    kernelInfo.heapInfo.SurfaceStateHeapSize = 2 * sizeof(RENDER_SURFACE_STATE) + 2 * sizeof(uint32_t);
    size_t expectedSize = alignUp(kernelInfo.heapInfo.SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
    size = EncodeDispatchKernel<FamilyType>::getSizeRequiredSsh(kernelInfo);
    EXPECT_EQ(expectedSize, size);
}

View File

@@ -8,6 +8,7 @@
#pragma once
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/source/program/kernel_info.h"
#include "shared/test/common/fixtures/device_fixture.h"
#include "shared/test/common/test_macros/hw_test.h"
@@ -44,6 +45,7 @@ class CommandEncodeStatesFixture : public DeviceFixture {
}
KernelDescriptor descriptor;
KernelInfo kernelInfo;
std::unique_ptr<MyMockCommandContainer> cmdContainer;
};