performance: add skeleton method to cmdlist immediate flush task

Related-To: NEO-7808

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2023-06-28 18:05:36 +00:00
committed by Compute-Runtime-Automation
parent a6fee8994d
commit 21823af419
19 changed files with 104 additions and 0 deletions

View File

@@ -323,6 +323,7 @@ struct CommandListCoreFamily : CommandListImp {
void postInitComputeSetup();
NEO::PreemptionMode obtainKernelPreemptionMode(Kernel *kernel);
// Base implementation: reports relaxed ordering dispatch as not allowed, regardless of numWaitEvents.
// Declared virtual so that derived command list types can supply their own policy.
virtual bool isRelaxedOrderingDispatchAllowed(uint32_t numWaitEvents) const { return false; }
// Customization hook for choosing the flush method; the base implementation intentionally does nothing.
// Invoked from initialize() only when flush task submission is enabled.
virtual void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {}
};
template <PRODUCT_FAMILY gfxProductFamily>

View File

@@ -194,6 +194,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::initialize(Device *device, NEO
if (this->isFlushTaskSubmissionEnabled) {
commandContainer.setFlushTaskUsedForImmediate(this->isFlushTaskSubmissionEnabled);
commandContainer.setNumIddPerBlock(1);
this->setupFlushMethod(rootDeviceEnvironment);
}
if (this->immediateCmdListHeapSharing) {

View File

@@ -153,6 +153,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
ze_result_t executeCommandListImmediateWithFlushTaskImpl(bool performMigration, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, CommandQueue *cmdQ);
NEO::CompletionStamp flushRegularTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies);
// Immediate flush task variant of flushRegularTask; takes the same parameter list and returns a completion stamp.
NEO::CompletionStamp flushImmediateRegularTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies);
NEO::CompletionStamp flushBcsTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies, NEO::CommandStreamReceiver *csr);
void checkAvailableSpace(uint32_t numEvents, bool hasRelaxedOrderingDependencies);
@@ -183,6 +184,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
void printKernelsPrintfOutput(bool hangDetected);
ze_result_t synchronizeInOrderExecution(uint64_t timeout) const;
bool hasStallingCmdsForRelaxedOrdering(uint32_t numWaitEvents, bool relaxedOrderingDispatch);
// Points computeFlushMethod at flushImmediateRegularTask when L0GfxCoreHelper::useImmediateComputeFlushTask()
// reports true for the given root device environment; otherwise computeFlushMethod is left untouched.
void setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) override;
MOCKABLE_VIRTUAL void checkAssert();
ComputeFlushMethodType computeFlushMethod = nullptr;

View File

@@ -109,6 +109,11 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushBcsTask
return csr->flushBcsTask(cmdStreamTask, taskStartOffset, dispatchBcsFlags, this->device->getHwInfo());
}
template <GFXCORE_FAMILY gfxCoreFamily>
NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediateRegularTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies) {
    // Skeleton implementation: nothing is submitted and none of the arguments are consumed.
    // The returned stamp carries the task count that encodes SubmissionStatus::UNSUPPORTED.
    const auto unsupportedTaskCount = NEO::CompletionStamp::getTaskCountFromSubmissionStatusError(NEO::SubmissionStatus::UNSUPPORTED);
    return {unsupportedTaskCount};
}
template <GFXCORE_FAMILY gfxCoreFamily>
NEO::CompletionStamp CommandListCoreFamilyImmediate<gfxCoreFamily>::flushRegularTask(NEO::LinearStream &cmdStreamTask, size_t taskStartOffset, bool hasStallingCmds, bool hasRelaxedOrderingDependencies) {
NEO::DispatchFlags dispatchFlags(
@@ -1155,4 +1160,11 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::synchronizeInOrderExe
return status;
}
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamilyImmediate<gfxCoreFamily>::setupFlushMethod(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
    // Keep whatever flush method is already configured unless the immediate flush task is selected
    // (by debug key or by platform default) for this root device environment.
    const bool immediateFlushSelected = L0GfxCoreHelper::useImmediateComputeFlushTask(rootDeviceEnvironment);
    if (!immediateFlushSelected) {
        return;
    }

    this->computeFlushMethod = &CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediateRegularTask;
}
} // namespace L0

View File

@@ -112,6 +112,14 @@ bool L0GfxCoreHelper::dispatchCmdListBatchBufferAsPrimary(const NEO::RootDeviceE
return value && allowPrimary;
}
// Decides whether the immediate compute flush task should be used.
// The UseImmediateFlushTask debug key wins whenever it differs from -1 (0 -> false, any other value -> true);
// with the key at -1 the answer is the platform default reported by the L0 gfx core helper.
bool L0GfxCoreHelper::useImmediateComputeFlushTask(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) {
    const auto debugKeyValue = NEO::DebugManager.flags.UseImmediateFlushTask.get();
    if (debugKeyValue == -1) {
        return rootDeviceEnvironment.getHelper<L0GfxCoreHelper>().platformSupportsImmediateComputeFlushTask();
    }
    return debugKeyValue != 0;
}
} // namespace L0
template <>

View File

@@ -51,6 +51,7 @@ class L0GfxCoreHelper : public NEO::ApiGfxCoreHelper {
static bool useSignalAllEventPackets(const NEO::HardwareInfo &hwInfo);
static NEO::HeapAddressModel getHeapAddressModel(const NEO::RootDeviceEnvironment &rootDeviceEnvironment);
static bool dispatchCmdListBatchBufferAsPrimary(const NEO::RootDeviceEnvironment &rootDeviceEnvironment, bool allowPrimary);
// Returns whether the immediate compute flush task should be used: the UseImmediateFlushTask debug key
// decides when it is not -1, otherwise the result of platformSupportsImmediateComputeFlushTask() is used.
static bool useImmediateComputeFlushTask(const NEO::RootDeviceEnvironment &rootDeviceEnvironment);
virtual void setAdditionalGroupProperty(ze_command_queue_group_properties_t &groupProperty, NEO::EngineGroupT &group) const = 0;
virtual L0::Event *createEvent(L0::EventPool *eventPool, const ze_event_desc_t *desc, L0::Device *device) const = 0;
@@ -76,6 +77,7 @@ class L0GfxCoreHelper : public NEO::ApiGfxCoreHelper {
virtual uint32_t getEventBaseMaxPacketCount(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) const = 0;
virtual NEO::HeapAddressModel getPlatformHeapAddressModel() const = 0;
virtual std::vector<uint32_t> getSupportedNumGrfs() const = 0;
// Per-platform default for the immediate compute flush task; consulted by useImmediateComputeFlushTask()
// when the UseImmediateFlushTask debug key is left at -1.
virtual bool platformSupportsImmediateComputeFlushTask() const = 0;
protected:
L0GfxCoreHelper() = default;
@@ -112,6 +114,7 @@ class L0GfxCoreHelperHw : public L0GfxCoreHelper {
uint32_t getEventBaseMaxPacketCount(const NEO::RootDeviceEnvironment &rootDeviceEnvironment) const override;
NEO::HeapAddressModel getPlatformHeapAddressModel() const override;
std::vector<uint32_t> getSupportedNumGrfs() const override;
bool platformSupportsImmediateComputeFlushTask() const override;
protected:
L0GfxCoreHelperHw() = default;

View File

@@ -69,4 +69,9 @@ bool L0GfxCoreHelperHw<Family>::platformSupportsPrimaryBatchBufferCmdList() cons
return false;
}
template <typename Family>
bool L0GfxCoreHelperHw<Family>::platformSupportsImmediateComputeFlushTask() const {
    // Platform default for the families built from this file: immediate compute flush task is off.
    constexpr bool immediateComputeFlushTaskByDefault = false;
    return immediateComputeFlushTaskByDefault;
}
} // namespace L0

View File

@@ -82,4 +82,9 @@ bool L0GfxCoreHelperHw<Family>::platformSupportsPrimaryBatchBufferCmdList() cons
return true;
}
template <typename Family>
bool L0GfxCoreHelperHw<Family>::platformSupportsImmediateComputeFlushTask() const {
    // Platform default for the families built from this file: immediate compute flush task is off.
    constexpr bool immediateComputeFlushTaskByDefault = false;
    return immediateComputeFlushTaskByDefault;
}
} // namespace L0

View File

@@ -367,5 +367,11 @@ void PrimaryBatchBufferPreamblelessCmdListFixture::tearDown() {
PrimaryBatchBufferCmdListFixture::tearDown();
}
// Enables the UseImmediateFlushTask debug key and then runs the base fixture setup.
// The key is set first, presumably so that command lists created by the base setUp() observe the
// override during their initialization - the base setUp() body is outside this view.
// NOTE(review): no DebugManagerStateRestore is visible in this fixture; confirm a base class restores the key.
void ImmediateFlushTaskCmdListFixture::setUp() {
DebugManager.flags.UseImmediateFlushTask.set(1);
ModuleMutableCommandListFixture::setUp();
}
} // namespace ult
} // namespace L0

View File

@@ -304,5 +304,9 @@ struct PrimaryBatchBufferPreamblelessCmdListFixture : public PrimaryBatchBufferC
std::unique_ptr<L0::ult::CommandList> commandList3;
};
// Command list test fixture that turns on the UseImmediateFlushTask debug key before the
// ModuleMutableCommandListFixture setup runs.
struct ImmediateFlushTaskCmdListFixture : public ModuleMutableCommandListFixture {
// Sets the debug key to 1, then delegates to ModuleMutableCommandListFixture::setUp().
void setUp();
};
} // namespace ult
} // namespace L0

View File

@@ -1888,5 +1888,17 @@ HWTEST2_F(RayTracingCmdListTest,
ultCsr->isMadeResident(rtAllocation, residentCount);
}
using ImmediateFlushTaskCmdListTests = Test<ImmediateFlushTaskCmdListFixture>;

// With the immediate flush task selected by the fixture, appending a kernel to the immediate
// command list is expected to fail.
// NOTE(review): the test name says "Unsupported" while the asserted code is
// ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY; the task-count-to-ze_result translation is outside this
// view - confirm the mapping is intended.
HWTEST2_F(ImmediateFlushTaskCmdListTests,
          givenInitialVersionOfImmediateFlushTaskWhenImmediateFlushTaskSelectedThenUnsupportedErrorReturned,
          IsAtLeastXeHpCore) {
    CmdListKernelLaunchParams kernelLaunchParams = {};
    ze_group_count_t dispatchDimensions{1, 1, 1};

    const auto appendStatus = commandListImmediate->appendLaunchKernel(kernel->toHandle(), &dispatchDimensions, nullptr, 0, nullptr, kernelLaunchParams, false);

    EXPECT_EQ(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, appendStatus);
}
} // namespace ult
} // namespace L0

View File

@@ -889,5 +889,34 @@ TEST_F(L0GfxCoreHelperTest, givenL0GfxCoreHelperWhenGettingDefaultCmdlistPrimary
EXPECT_EQ(l0GfxCoreHelper.platformSupportsPrimaryBatchBufferCmdList(), L0GfxCoreHelper::dispatchCmdListBatchBufferAsPrimary(rootDeviceEnvironment, true));
}
// Platforms matched by IsAtMostGen12lp must report the immediate compute flush task as off by default.
HWTEST2_F(L0GfxCoreHelperTest, givenL0GfxCoreHelperOnGenPlatformsWhenGettingPlatformUseImmediateFlushTaskThenReturnFalse, IsAtMostGen12lp) {
    MockExecutionEnvironment mockEnvironment;
    auto &rootDeviceEnvironment = *mockEnvironment.rootDeviceEnvironments[0];

    const bool platformDefault = rootDeviceEnvironment.getHelper<L0GfxCoreHelper>().platformSupportsImmediateComputeFlushTask();

    EXPECT_FALSE(platformDefault);
}
// Without touching the UseImmediateFlushTask debug key, the selected setting must equal the platform default.
TEST_F(L0GfxCoreHelperTest, givenL0GfxCoreHelperWhenGettingDefaultUseImmediateFlushTaskThenUsePlatformDefaultSetting) {
    MockExecutionEnvironment mockEnvironment;
    auto &rootDeviceEnvironment = *mockEnvironment.rootDeviceEnvironments[0];

    const bool platformDefault = rootDeviceEnvironment.getHelper<L0GfxCoreHelper>().platformSupportsImmediateComputeFlushTask();
    const bool selectedSetting = L0GfxCoreHelper::useImmediateComputeFlushTask(rootDeviceEnvironment);

    EXPECT_EQ(platformDefault, selectedSetting);
}
// The UseImmediateFlushTask debug key, when set to 0 or 1, must dictate the selected setting.
TEST_F(L0GfxCoreHelperTest, givenL0GfxCoreHelperUsingOverrideDebugKeyWhenGettingUseImmediateFlushTaskThenUseDbgKeyValue) {
    DebugManagerStateRestore restorer; // puts the debug key back once the test finishes
    MockExecutionEnvironment mockEnvironment;
    const auto &rootDeviceEnvironment = *mockEnvironment.rootDeviceEnvironments[0];

    DebugManager.flags.UseImmediateFlushTask.set(0);
    const bool selectedWithKeyDisabled = L0GfxCoreHelper::useImmediateComputeFlushTask(rootDeviceEnvironment);
    EXPECT_FALSE(selectedWithKeyDisabled);

    DebugManager.flags.UseImmediateFlushTask.set(1);
    const bool selectedWithKeyEnabled = L0GfxCoreHelper::useImmediateComputeFlushTask(rootDeviceEnvironment);
    EXPECT_TRUE(selectedWithKeyEnabled);
}
} // namespace ult
} // namespace L0

View File

@@ -69,5 +69,10 @@ XE_HPC_CORETEST_F(L0GfxCoreHelperTestXeHpc, GivenXeHpcWhenCheckingL0HelperForCmd
EXPECT_TRUE(l0GfxCoreHelper.platformSupportsPrimaryBatchBufferCmdList());
}
// XeHpc core: the platform default for the immediate compute flush task is expected to be off.
XE_HPC_CORETEST_F(L0GfxCoreHelperTestXeHpc, GivenXeHpcWhenCheckingL0HelperForPlatformSupportsImmediateFlushTaskThenReturnFalse) {
    const bool platformDefault = getHelper<L0GfxCoreHelper>().platformSupportsImmediateComputeFlushTask();

    EXPECT_FALSE(platformDefault);
}
} // namespace ult
} // namespace L0

View File

@@ -71,5 +71,10 @@ XE_HPG_CORETEST_F(L0GfxCoreHelperTestXeHpg, GivenXeHpgWhenCheckingL0HelperForCmd
EXPECT_TRUE(l0GfxCoreHelper.platformSupportsPrimaryBatchBufferCmdList());
}
// XeHpg core: the platform default for the immediate compute flush task is expected to be off.
XE_HPG_CORETEST_F(L0GfxCoreHelperTestXeHpg, GivenXeHpgWhenCheckingL0HelperForPlatformSupportsImmediateFlushTaskThenReturnFalse) {
    const bool platformDefault = getHelper<L0GfxCoreHelper>().platformSupportsImmediateComputeFlushTask();

    EXPECT_FALSE(platformDefault);
}
} // namespace ult
} // namespace L0

View File

@@ -1032,6 +1032,8 @@ TaskCountType CompletionStamp::getTaskCountFromSubmissionStatusError(SubmissionS
return CompletionStamp::outOfDeviceMemory;
case SubmissionStatus::FAILED:
return CompletionStamp::failed;
case SubmissionStatus::UNSUPPORTED:
return CompletionStamp::unsupported;
default:
return 0;
}

View File

@@ -337,6 +337,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, ReuseKernelBinaries, -1, "-1: default, 0:disable
DECLARE_DEBUG_VARIABLE(int32_t, SetAmountOfReusableAllocations, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver will fill reusable allocation lists with given amount of command buffers and heaps at initialization of immediate command list.")
DECLARE_DEBUG_VARIABLE(int32_t, UseHighAlignmentForHeapExtended, -1, "-1: default, 0:disabled, > 1: enabled. If enabled, driver aligns HEAP_EXTENDED allocations to GPU VA that is next power of 2 for a given size, if disables GPU VA is using 2MB/64KB alignment.")
DECLARE_DEBUG_VARIABLE(int32_t, DispatchCmdlistCmdBufferPrimary, -1, "-1: default, 0: dispatch command buffers as secondary, 1: dispatch command buffers as primary and chain")
DECLARE_DEBUG_VARIABLE(int32_t, UseImmediateFlushTask, -1, "-1: default, 0: use regular flush task, 1: use immediate flush task")
/*DIRECT SUBMISSION FLAGS*/
DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. Enables direct submission of command buffers bypassing KMD")

View File

@@ -23,6 +23,7 @@ struct CompletionStamp {
FlushStamp flushStamp;
static constexpr TaskCountType notReady = std::numeric_limits<TaskCountType>::max() - 0xF;
static constexpr TaskCountType unsupported = std::numeric_limits<TaskCountType>::max() - 0xE;
static constexpr TaskCountType failed = std::numeric_limits<TaskCountType>::max() - 0x6;
static constexpr TaskCountType gpuHang = std::numeric_limits<TaskCountType>::max() - 0x5;
static constexpr TaskCountType outOfDeviceMemory = std::numeric_limits<TaskCountType>::max() - 0x4;

View File

@@ -315,6 +315,7 @@ ForceMultiGpuAtomics = -1
ForceBufferCompressionFormat = -1
ExperimentalSetWalkerPartitionCount = 0
EnableStatelessCompressionWithUnifiedMemory = 0
UseImmediateFlushTask = -1
EnableMultiGpuAtomicsOptimization = 1
EnableHwGenerationLocalIds = -1
WalkerPartitionPreferHighestDimension = -1

View File

@@ -2672,6 +2672,7 @@ TEST(CommandStreamReceiverSimpleTest, whenTranslatingSubmissionStatusToTaskCount
EXPECT_EQ(CompletionStamp::outOfHostMemory, CompletionStamp::getTaskCountFromSubmissionStatusError(SubmissionStatus::OUT_OF_HOST_MEMORY));
EXPECT_EQ(CompletionStamp::outOfDeviceMemory, CompletionStamp::getTaskCountFromSubmissionStatusError(SubmissionStatus::OUT_OF_MEMORY));
EXPECT_EQ(CompletionStamp::failed, CompletionStamp::getTaskCountFromSubmissionStatusError(SubmissionStatus::FAILED));
EXPECT_EQ(CompletionStamp::unsupported, CompletionStamp::getTaskCountFromSubmissionStatusError(SubmissionStatus::UNSUPPORTED));
}
HWTEST_F(CommandStreamReceiverHwTest, givenFailureOnFlushWhenFlushingBcsTaskThenErrorIsPropagated) {