performance: Add debug flag to set BCS split minimal size

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2023-05-16 11:15:43 +00:00
committed by Compute-Runtime-Automation
parent eb8e69d2d4
commit 0e758e4bb5
11 changed files with 44 additions and 3 deletions

View File

@@ -379,7 +379,7 @@ struct CommandList : _ze_command_list_handle_t {
NEO::CommandStreamReceiver *csr = nullptr;
Device *device = nullptr;
inline static constexpr size_t minimalSizeForBcsSplit = 4 * MemoryConstants::megaByte;
size_t minimalSizeForBcsSplit = 4 * MemoryConstants::megaByte;
size_t cmdListCurrentStartOffset = 0;
unsigned long numThreads = 1u;

View File

@@ -34,6 +34,9 @@
namespace L0 {
// Constructs the command list, forwarding numIddsPerBlock to commandContainer,
// and applies the optional SplitBcsSize debug override to minimalSizeForBcsSplit.
//
// SplitBcsSize is interpreted in kilobytes. Negative values leave the in-class
// default of minimalSizeForBcsSplit untouched.
CommandList::CommandList(uint32_t numIddsPerBlock) : commandContainer(numIddsPerBlock) {
    // Read the flag once so the check and the assignment see the same value.
    const int32_t splitBcsSizeInKb = NEO::DebugManager.flags.SplitBcsSize.get();

    // Only non-negative values are documented as valid sizes. Checking ">= 0"
    // instead of "!= -1" keeps other negative inputs (e.g. -2) from being
    // multiplied into the threshold (presumably wrapping to a huge unsigned
    // value, assuming MemoryConstants::kiloByte is an unsigned type).
    if (splitBcsSizeInKb >= 0) {
        this->minimalSizeForBcsSplit = static_cast<size_t>(splitBcsSizeInKb) * MemoryConstants::kiloByte;
    }
}
CommandListAllocatorFn commandListFactory[IGFX_MAX_PRODUCT] = {};

View File

@@ -226,6 +226,7 @@ struct WhiteBox<::L0::CommandList> : public ::L0::CommandListImp {
using BaseClass::isFlushTaskSubmissionEnabled;
using BaseClass::isSyncModeQueue;
using BaseClass::isTbxMode;
using BaseClass::minimalSizeForBcsSplit;
using BaseClass::nonImmediateLogicalStateHelper;
using BaseClass::partitionCount;
using BaseClass::pipelineSelectStateTracking;

View File

@@ -1363,6 +1363,21 @@ TEST_F(CommandListCreate, GivenGpuHangWhenCreatingImmCmdListWithSyncModeAndAppen
static_cast<WhiteBox<::L0::CommandQueue> *>(whiteBoxCmdList->cmdQImmediate)->csr = oldCsr;
}
// Verifies that an immediate command list created while the SplitBcsSize debug
// flag is set stores flag value * kiloByte in minimalSizeForBcsSplit.
TEST_F(CommandListCreate, givenSplitBcsSizeWhenCreateCommandListThenProperSizeSet) {
    constexpr int32_t splitBcsSizeInKb = 120;

    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsSize.set(splitBcsSizeInKb);

    ze_command_queue_desc_t desc = {};
    ze_result_t returnValue;
    std::unique_ptr<L0::CommandList> commandList(
        CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));

    // Creation must have succeeded before the member is inspected.
    ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
    ASSERT_NE(nullptr, commandList);

    // The cast exposes the otherwise inaccessible minimalSizeForBcsSplit member.
    auto *exposedCmdList = static_cast<CommandList *>(commandList.get());
    EXPECT_EQ(exposedCmdList->minimalSizeForBcsSplit, splitBcsSizeInKb * MemoryConstants::kiloByte);
}
HWTEST_F(CommandListCreate, GivenGpuHangWhenCreatingImmediateCommandListAndAppendingSignalEventsThenDeviceLostIsReturned) {
DebugManagerStateRestore restorer;
DebugManager.flags.EnableFlushTaskSubmission.set(1);

View File

@@ -439,6 +439,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
BcsInfoMask splitEngines = EngineHelpers::oddLinkedCopyEnginesMask;
BcsInfoMask h2dEngines = NEO::EngineHelpers::h2dCopyEngineMask;
BcsInfoMask d2hEngines = NEO::EngineHelpers::d2hCopyEngineMask;
size_t minimalSizeForBcsSplit = 16 * MemoryConstants::megaByte;
LinearStream *commandStream = nullptr;

View File

@@ -39,6 +39,9 @@ class CommandQueueHw : public CommandQueue {
ClDevice *device,
const cl_queue_properties *properties,
bool internalUsage) : BaseClass(context, device, properties, internalUsage) {
if (DebugManager.flags.SplitBcsSize.get() != -1) {
this->minimalSizeForBcsSplit = DebugManager.flags.SplitBcsSize.get() * MemoryConstants::kiloByte;
}
auto clPriority = getCmdQueueProperties<cl_queue_priority_khr>(properties, CL_QUEUE_PRIORITY_KHR);

View File

@@ -1217,8 +1217,6 @@ size_t CommandQueueHw<GfxFamily>::calculateHostPtrSizeForImage(const size_t *reg
template <typename GfxFamily>
bool CommandQueueHw<GfxFamily>::isSplitEnqueueBlitNeeded(TransferDirection transferDirection, size_t transferSize, CommandStreamReceiver &csr) {
constexpr size_t minimalSizeForBcsSplit = 16 * MemoryConstants::megaByte;
auto bcsSplit = getDevice().isBcsSplitSupported() &&
csr.getOsContext().getEngineType() == aub_stream::EngineType::ENGINE_BCS &&
transferSize >= minimalSizeForBcsSplit &&

View File

@@ -348,6 +348,23 @@ HWTEST_F(IoqCommandQueueHwBlitTest, givenSplitBcsCopyWhenCheckIsSplitEnqueueBlit
}
}
// Verifies that the SplitBcsSize debug flag is picked up when a queue is
// constructed: the fixture queue (this->pCmdQ) is expected not to split a
// 150 KB transfer, while a queue constructed after the flag is set to 100
// is expected to split it and to report a 100 KB threshold.
HWTEST_F(IoqCommandQueueHwBlitTest, givenSplitBcsSizeSetWhenCheckIsSplitEnqueueBlitNeededThenReturnProperValue) {
    constexpr int32_t splitBcsSizeInKb = 100;
    constexpr auto bcsEngine = aub_stream::EngineType::ENGINE_BCS;
    const size_t transferSize = 150 * MemoryConstants::kiloByte;

    DebugManagerStateRestore restorer;
    DebugManager.flags.SplitBcsCopy.set(1);
    DebugManager.flags.SplitBcsSize.set(splitBcsSizeInKb);

    VariableBackup<UltHwConfig> backup{&ultHwConfig};
    ultHwConfig.useBlitSplit = true;

    // Queue provided by the fixture: 150 KB must not trigger a split here.
    auto *fixtureQueue = static_cast<CommandQueueHw<FamilyType> *>(this->pCmdQ);
    auto &fixtureBcsCsr = *fixtureQueue->getBcsCommandStreamReceiver(bcsEngine);
    EXPECT_FALSE(fixtureQueue->isSplitEnqueueBlitNeeded(TransferDirection::HostToLocal, transferSize, fixtureBcsCsr));

    {
        // Queue constructed with the flag already set: 150 KB must trigger a split.
        MockCommandQueueHw<FamilyType> queue(this->pContext, this->pClDevice, nullptr);
        auto &queueBcsCsr = *queue.getBcsCommandStreamReceiver(bcsEngine);
        EXPECT_TRUE(queue.isSplitEnqueueBlitNeeded(TransferDirection::HostToLocal, transferSize, queueBcsCsr));
        EXPECT_EQ(queue.minimalSizeForBcsSplit, splitBcsSizeInKb * MemoryConstants::kiloByte);
    }
}
char hostPtr[16 * MemoryConstants::megaByte];
struct BcsSplitBufferTraits {
enum { flags = CL_MEM_READ_WRITE };

View File

@@ -255,6 +255,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
using BaseClass::gpgpuEngine;
using BaseClass::isBlitAuxTranslationRequired;
using BaseClass::latestSentEnqueueType;
using BaseClass::minimalSizeForBcsSplit;
using BaseClass::obtainCommandStream;
using BaseClass::obtainNewTimestampPacketNodes;
using BaseClass::overrideEngine;

View File

@@ -319,6 +319,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, DeferCmdQGpgpuInitialization, -1, "-1: default,
DECLARE_DEBUG_VARIABLE(int32_t, DeferCmdQBcsInitialization, -1, "-1: default, 0:disabled, 1: enabled.")
DECLARE_DEBUG_VARIABLE(int32_t, PreferInternalBcsEngine, -1, "-1: default, 0:disabled, 1: enabled. When enabled use internal BCS engine for internal transfers, when disabled use regular engine")
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsCopy, -1, "-1: default, 0:disabled, 1: enabled. When enqueues copy to main copy engine then split between even linked copy engines")
// Value is in KB: consumers multiply it by MemoryConstants::kiloByte.
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsSize, -1, "-1: default, >=0: Size to apply BCS split from in KB")
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMask, 0, "0: default, >0: bitmask: indicates bcs engines for split")
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMaskH2D, 0, "0: default, >0: bitmask: indicates bcs engines for H2D split")
DECLARE_DEBUG_VARIABLE(int32_t, SplitBcsMaskD2H, 0, "0: default, >0: bitmask: indicates bcs engines for D2H split")

View File

@@ -419,6 +419,7 @@ AssignBCSAtEnqueue = -1
DeferCmdQGpgpuInitialization = -1
DeferCmdQBcsInitialization = -1
SplitBcsCopy = -1
SplitBcsSize = -1
SplitBcsMask = 0
SplitBcsMaskH2D = 0
SplitBcsMaskD2H = 0