feature: new multitile post sync layout for immediate write [2/n]

No functional changes in this commit. This is prework.

Related-To: NEO-7966

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-06-07 15:06:16 +00:00
committed by Compute-Runtime-Automation
parent 112bbec6e9
commit 3d49658f50
21 changed files with 195 additions and 4 deletions

View File

@@ -284,6 +284,7 @@ struct CommandListCoreFamily : CommandListImp {
void appendComputeBarrierCommand();
NEO::PipeControlArgs createBarrierFlags();
void appendMultiTileBarrier(NEO::Device &neoDevice);
void appendDispatchOffsetRegister(bool workloadPartitionEvent, bool beforeProfilingCmds);
size_t estimateBufferSizeMultiTileBarrier(const NEO::RootDeviceEnvironment &rootDeviceEnvironment);
uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region);
MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocationData(Device *device, const void *buffer, uint64_t bufferSize, bool hostCopyAllowed);

View File

@@ -2320,6 +2320,9 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(Event *event,
}
commandContainer.addToResidencyContainer(&event->getAllocation(this->device));
bool workloadPartition = isTimestampEventForMultiTile(event);
appendDispatchOffsetRegister(workloadPartition, true);
if (beforeWalker) {
event->resetKernelCountAndPacketUsedCount();
@@ -2339,9 +2342,10 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(Event *event,
uint64_t baseAddr = event->getGpuAddress(this->device);
NEO::MemorySynchronizationCommands<GfxFamily>::addAdditionalSynchronization(*commandContainer.getCommandStream(), baseAddr, false, rootDeviceEnvironment);
bool workloadPartition = isTimestampEventForMultiTile(event);
appendWriteKernelTimestamp(event, beforeWalker, true, workloadPartition);
}
appendDispatchOffsetRegister(workloadPartition, false);
}
}

View File

@@ -293,4 +293,8 @@ inline NEO::PreemptionMode CommandListCoreFamily<gfxCoreFamily>::obtainKernelPre
return NEO::PreemptionHelper::taskPreemptionMode(device->getDevicePreemptionMode(), flags);
}
// Variant of appendDispatchOffsetRegister with an empty body: both arguments are
// ignored and nothing is appended to the command list.
// NOTE(review): presumably the variant for platforms that never program the
// post-sync offset register - confirm against the build configuration.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendDispatchOffsetRegister(bool workloadPartitionEvent, bool beforeProfilingCmds) {
}
} // namespace L0

View File

@@ -481,4 +481,13 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfilingAllWalkers(Eve
}
}
// Dispatches the post-sync offset register into the command stream.
// Nothing is emitted unless workloadPartitionEvent is true and
// ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled() reports true.
// beforeProfilingCmds selects the programmed value: the timestamp post-sync offset
// when true, the immediate-write post-sync offset when false.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendDispatchOffsetRegister(bool workloadPartitionEvent, bool beforeProfilingCmds) {
    using ScalingDispatch = NEO::ImplicitScalingDispatch<GfxFamily>;

    if (!workloadPartitionEvent || !NEO::ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled()) {
        return;
    }

    auto postSyncOffset = beforeProfilingCmds ? ScalingDispatch::getTimeStampPostSyncOffset()
                                              : ScalingDispatch::getImmediateWritePostSyncOffset();
    auto &cmdStream = *commandContainer.getCommandStream();
    ScalingDispatch::dispatchOffsetRegister(cmdStream, postSyncOffset);
}
} // namespace L0

View File

@@ -119,6 +119,9 @@ struct Event : _ze_event_handle_t {
size_t getSinglePacketSize() const {
return singlePacketSize;
}
void setSinglePacketSize(size_t size) {
singlePacketSize = size;
}
size_t getTimestampSizeInDw() const {
return timestampSizeInDw;
}

View File

@@ -6,6 +6,7 @@
*/
#pragma once
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/timestamp_packet.h"
#include "level_zero/core/source/event/event.h"
@@ -33,6 +34,10 @@ struct EventImp : public Event {
globalEndOffset = NEO::TimestampPackets<TagSizeT>::getGlobalEndOffset();
timestampSizeInDw = (sizeof(TagSizeT) / sizeof(uint32_t));
singlePacketSize = NEO::TimestampPackets<TagSizeT>::getSinglePacketSize();
if (NEO::ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled()) {
singlePacketSize = sizeof(uint64_t);
}
}
~EventImp() override {}

View File

@@ -30,6 +30,7 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *
if (eventPool->isEventPoolTimestampFlagSet()) {
event->setEventTimestampFlag(true);
event->setSinglePacketSize(NEO::TimestampPackets<TagSizeT>::getSinglePacketSize());
}
auto &hwInfo = neoDevice->getHardwareInfo();
@@ -51,7 +52,7 @@ Event *Event::create(EventPool *eventPool, const ze_event_desc_t *desc, Device *
event->kernelEventCompletionData =
std::make_unique<KernelEventCompletionData<TagSizeT>[]>(event->maxKernelCount);
bool useContextEndOffset = eventPool->isImplicitScalingCapableFlagSet();
bool useContextEndOffset = eventPool->isImplicitScalingCapableFlagSet() && !NEO::ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled();
int32_t overrideUseContextEndOffset = NEO::DebugManager.flags.UseContextEndOffsetForEventCompletion.get();
if (overrideUseContextEndOffset != -1) {
useContextEndOffset = !!overrideUseContextEndOffset;

View File

@@ -35,6 +35,10 @@ bool ApiSpecificConfig::isDeviceAllocationCacheEnabled() {
return false;
}
// Reports whether the dynamic post-sync allocation layout is enabled.
// True only when the EnableDynamicPostSyncAllocLayout debug flag reads exactly 1;
// any other value (including the -1 default) yields false.
bool ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled() {
    const auto flagValue = NEO::DebugManager.flags.EnableDynamicPostSyncAllocLayout.get();
    return flagValue == 1;
}
// Identifies the API served by this translation unit: always ApiSpecificConfig::L0.
ApiSpecificConfig::ApiType ApiSpecificConfig::getApiType() {
    const auto selectedApi = ApiSpecificConfig::L0;
    return selectedApi;
}

View File

@@ -1421,6 +1421,52 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, MultiTileCommandListAppendLaunchKernelXeHpCoreTest,
EXPECT_EQ(4u, commandList->partitionCount);
}
// With EnableDynamicPostSyncAllocLayout set to 1, launches a kernel that signals an event
// created from a pool without the kernel-timestamp flag (HOST_VISIBLE only) and checks that
// the walker keeps workload partitioning enabled and uses the WRITE_IMMEDIATE_DATA
// post-sync operation.
// NOTE(review): the debug flag is set without a DebugManagerStateRestore visible in this
// test; presumably the fixture restores it - confirm.
HWCMDTEST_F(IGFX_XE_HP_CORE, MultiTileCommandListAppendLaunchKernelXeHpCoreTest, givenDebugVariableSetWhenUsingNonTimestampEventThenDontOverridePostSyncMode) {
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;

    DebugManager.flags.EnableDynamicPostSyncAllocLayout.set(1);

    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
    eventPoolDesc.count = 1;

    ze_event_desc_t eventDesc = {};
    eventDesc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
    eventDesc.index = 0;

    auto deviceHandle = device->toHandle();
    ze_result_t result = ZE_RESULT_SUCCESS;
    std::unique_ptr<L0::EventPool> eventPool(EventPool::create(device->getDriverHandle(), context, 1, &deviceHandle, &eventPoolDesc, result));
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
    std::unique_ptr<L0::Event> event(Event::create<typename FamilyType::TimestampPacketType>(eventPool.get(), &eventDesc, device));
    EXPECT_FALSE(event->isUsingContextEndOffset());

    ze_event_handle_t hEventHandle = event->toHandle();
    ze_group_count_t groupCount{256, 1, 1};
    CmdListKernelLaunchParams launchParams = {};
    result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, hEventHandle, 0, nullptr, launchParams, false);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    EXPECT_EQ(4u, event->getPacketsInUse());
    EXPECT_EQ(4u, commandList->partitionCount);

    GenCmdList cmdList;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
        cmdList, commandList->getCmdContainer().getCommandStream()->getCpuBase(), commandList->getCmdContainer().getCommandStream()->getUsed()));

    auto itorWalker = find<WALKER_TYPE *>(cmdList.begin(), cmdList.end());
    // Stop here if no walker was parsed: dereferencing the end iterator below would be
    // undefined behaviour instead of a clean test failure.
    ASSERT_NE(cmdList.end(), itorWalker);
    auto cmd = genCmdCast<WALKER_TYPE *>(*itorWalker);
    ASSERT_NE(nullptr, cmd);
    EXPECT_TRUE(cmd->getWorkloadPartitionEnable());

    auto &postSync = cmd->getPostSync();
    EXPECT_EQ(POSTSYNC_DATA::OPERATION_WRITE_IMMEDIATE_DATA, postSync.getOperation());
}
HWTEST2_F(MultiTileCommandListAppendLaunchKernelXeHpCoreTest, givenCooperativeKernelWhenAppendingKernelsThenSetProperPartitionSize, IsAtLeastXeHpCore) {
ze_group_count_t groupCount{16, 1, 1};

View File

@@ -827,6 +827,9 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
void testAppendSignalEventForProfiling() {
using FamilyType = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
bool dynamicAllocSize = (ImplicitScalingDispatch<FamilyType>::getImmediateWritePostSyncOffset() != ImplicitScalingDispatch<FamilyType>::getTimeStampPostSyncOffset());
auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
auto engineType = copyOnly == 1 ? NEO::EngineGroupType::Copy : NEO::EngineGroupType::Compute;
@@ -860,6 +863,14 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
ptrOffset(cmdStream->getCpuBase(), sizeBefore),
(sizeAfter - sizeBefore)));
if (dynamicAllocSize) {
auto lriCmd = genCmdCast<MI_LOAD_REGISTER_IMM *>(*cmdList.begin());
ASSERT_NE(nullptr, lriCmd);
EXPECT_EQ(NEO::PartitionRegisters<FamilyType>::addressOffsetCCSOffset, lriCmd->getRegisterOffset());
EXPECT_EQ(NEO::ImplicitScalingDispatch<FamilyType>::getTimeStampPostSyncOffset(), lriCmd->getDataDword());
}
auto itorStoreDataImm = findAll<MI_STORE_DATA_IMM *>(cmdList.begin(), cmdList.end());
if constexpr (limitEventPacketes == 1) {
@@ -887,6 +898,14 @@ struct CommandListSignalAllEventPacketFixture : public ModuleFixture {
gpuAddress += (event->getSinglePacketSize() * commandList->partitionCount);
}
}
if (dynamicAllocSize) {
auto lriCmd = genCmdCast<MI_LOAD_REGISTER_IMM *>(*cmdList.rbegin());
ASSERT_NE(nullptr, lriCmd);
EXPECT_EQ(NEO::PartitionRegisters<FamilyType>::addressOffsetCCSOffset, lriCmd->getRegisterOffset());
EXPECT_EQ(NEO::ImplicitScalingDispatch<FamilyType>::getImmediateWritePostSyncOffset(), lriCmd->getDataDword());
}
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -1304,6 +1323,19 @@ HWTEST2_F(MultiTileCommandListSignalAllEventPacketTest, givenSignalPacketsEventW
testAppendSignalEventForProfiling<gfxCoreFamily>();
}
struct MultiTileCommandListSignalAllocLayoutTest : public MultiTileCommandListSignalAllEventPacketTest {
void SetUp() override {
DebugManager.flags.EnableDynamicPostSyncAllocLayout.set(1);
MultiTileCommandListSignalAllEventPacketTest::SetUp();
}
};
// Checks that the immediate-write and timestamp post-sync offsets differ under this
// fixture, then runs the shared testAppendSignalEventForProfiling scenario.
HWTEST2_F(MultiTileCommandListSignalAllocLayoutTest, givenDynamicLayoutEnabledWhenAppendEventForProfilingCalledThenProgramOffsetMmio, IsAtLeastXeHpCore) {
    using Dispatch = ImplicitScalingDispatch<FamilyType>;

    const auto immediateWriteOffset = Dispatch::getImmediateWritePostSyncOffset();
    const auto timestampOffset = Dispatch::getTimeStampPostSyncOffset();
    EXPECT_NE(immediateWriteOffset, timestampOffset);

    testAppendSignalEventForProfiling<gfxCoreFamily>();
}
// Runs the fixture's testAppendSignalEventPostAppendCall scenario for the current core
// family, passing 0 as its only argument.
// NOTE(review): the meaning of the 0 argument is defined by the helper, which is not
// visible here - confirm against its declaration.
HWTEST2_F(MultiTileCommandListSignalAllEventPacketTest, givenSignalPacketsEventWhenAppendSignalImmediateEventThenAllPacketCompletionDispatched, IsAtLeastXeHpCore) {
testAppendSignalEventPostAppendCall<gfxCoreFamily>(0);
}

View File

@@ -2790,6 +2790,39 @@ HWTEST_F(EventSizeTests, givenDebugFlagwhenCreatingEventPoolThenUseCorrectSizeAn
}
}
// With EnableDynamicPostSyncAllocLayout set to 1, creates one event from a pool with the
// kernel-timestamp flag and one from a pool without it, then checks that the timestamp
// event keeps the TimestampPackets single-packet size while the regular event reports
// sizeof(uint64_t).
HWTEST_F(EventTests, givenDebugFlagSetWhenCreatingNonTimestampEventsThenPacketsSizeIsQword) {
    DebugManagerStateRestore restore;
    DebugManager.flags.EnableDynamicPostSyncAllocLayout.set(1);

    ze_result_t result = ZE_RESULT_SUCCESS;
    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.count = 1;
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
    const ze_event_desc_t eventDesc = {ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, 0, 0};

    // Fail fast on pool creation problems instead of dereferencing a null pool below.
    std::unique_ptr<L0::EventPool> timestampPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    ASSERT_NE(nullptr, timestampPool.get());

    eventPoolDesc.flags = 0;
    std::unique_ptr<L0::EventPool> regularPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    ASSERT_NE(nullptr, regularPool.get());

    ze_event_handle_t timestampEventHandle = nullptr;
    ze_event_handle_t regularEventHandle = nullptr;
    timestampPool->createEvent(&eventDesc, &timestampEventHandle);
    regularPool->createEvent(&eventDesc, &regularEventHandle);

    // Owning wrappers call destroy() on every exit path, including the fatal
    // assertions below; a null event is never passed to the deleter.
    auto destroyEvent = [](L0::Event *eventToDestroy) { eventToDestroy->destroy(); };
    std::unique_ptr<L0::Event, decltype(destroyEvent)> timestampEvent(Event::fromHandle(timestampEventHandle), destroyEvent);
    std::unique_ptr<L0::Event, decltype(destroyEvent)> regularEvent(Event::fromHandle(regularEventHandle), destroyEvent);
    ASSERT_NE(nullptr, timestampEvent.get());
    ASSERT_NE(nullptr, regularEvent.get());

    EXPECT_EQ(NEO::TimestampPackets<typename FamilyType::TimestampPacketType>::getSinglePacketSize(), timestampEvent->getSinglePacketSize());
    EXPECT_EQ(sizeof(uint64_t), regularEvent->getSinglePacketSize());
}
HWTEST_F(EventTests,
WhenHostEventSyncThenExpectDownloadEventAllocationWithEachQuery) {
std::map<GraphicsAllocation *, uint32_t> downloadAllocationTrack;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2022 Intel Corporation
* Copyright (C) 2021-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -7,6 +7,7 @@
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "level_zero/core/source/compiler_interface/l0_reg_path.h"
@@ -46,6 +47,16 @@ TEST(ApiSpecificConfigL0Tests, WhenCheckingIfDeviceAllocationCacheIsEnabledThenR
EXPECT_FALSE(ApiSpecificConfig::isDeviceAllocationCacheEnabled());
}
// The query reports false before the debug flag is touched and true once
// EnableDynamicPostSyncAllocLayout is set to 1.
TEST(ApiSpecificConfigL0Tests, GivenDebugFlagSetWhenCheckingIfDynamicPostSyncAllocLayoutEnabledThenReturnTrue) {
    DebugManagerStateRestore restore;

    const bool enabledBeforeOverride = ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled();
    EXPECT_FALSE(enabledBeforeOverride);

    DebugManager.flags.EnableDynamicPostSyncAllocLayout.set(1);

    const bool enabledAfterOverride = ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled();
    EXPECT_TRUE(enabledAfterOverride);
}
// ImplicitScaling::apiSupport is expected to be true in this test binary.
TEST(ImplicitScalingApiTests, givenLevelZeroApiUsedThenSupportEnabled) {
    const bool implicitScalingSupported = ImplicitScaling::apiSupport;
    EXPECT_TRUE(implicitScalingSupported);
}

View File

@@ -35,6 +35,10 @@ bool ApiSpecificConfig::isDeviceAllocationCacheEnabled() {
return false;
}
// This variant unconditionally reports the dynamic post-sync allocation layout as
// disabled; no debug flag is consulted.
bool ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled() {
    constexpr bool dynamicLayoutEnabled = false;
    return dynamicLayoutEnabled;
}
// Identifies the API served by this translation unit: always ApiSpecificConfig::OCL.
ApiSpecificConfig::ApiType ApiSpecificConfig::getApiType() {
    const auto selectedApi = ApiSpecificConfig::OCL;
    return selectedApi;
}

View File

@@ -49,6 +49,16 @@ TEST(ApiSpecificConfigOclTests, WhenCheckingIfDeviceAllocationCacheIsEnabledThen
EXPECT_FALSE(ApiSpecificConfig::isDeviceAllocationCacheEnabled());
}
// The query reports false both before and after EnableDynamicPostSyncAllocLayout is
// set to 1.
TEST(ApiSpecificConfigOclTests, WhenCheckingIfDynamicPostSyncAllocLayoutEnabledThenReturnFalse) {
    DebugManagerStateRestore restore;

    const bool enabledBeforeOverride = ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled();
    EXPECT_FALSE(enabledBeforeOverride);

    DebugManager.flags.EnableDynamicPostSyncAllocLayout.set(1);

    const bool enabledAfterOverride = ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled();
    EXPECT_FALSE(enabledAfterOverride);
}
TEST(ApiSpecificConfigOclTests, givenEnableStatelessCompressionWhenProvidingSvmGpuAllocationThenPreferCompressedBuffer) {
DebugManagerStateRestore dbgRestorer;
DebugManager.flags.RenderCompressedBuffersEnabled.set(1);

View File

@@ -313,7 +313,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
if (args.partitionCount > 1 && !args.isInternal) {
const uint64_t workPartitionAllocationGpuVa = args.device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
if (args.eventAddress != 0) {
if (args.eventAddress != 0 && !NEO::ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled()) {
postSync.setOperation(POSTSYNC_DATA::OPERATION_WRITE_TIMESTAMP);
}
ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream,

View File

@@ -10,6 +10,7 @@
#include "shared/source/command_container/walker_partition_xehp_and_later.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/memory_manager/graphics_allocation.h"
@@ -235,6 +236,9 @@ inline void ImplicitScalingDispatch<GfxFamily>::dispatchOffsetRegister(LinearStr
// Returns the post-sync offset used for immediate writes:
// sizeof(uint64_t) when ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled() is true,
// otherwise GfxCoreHelperHw<GfxFamily>::getSingleTimestampPacketSizeHw(), both cast to uint32_t.
template <typename GfxFamily>
inline uint32_t ImplicitScalingDispatch<GfxFamily>::getImmediateWritePostSyncOffset() {
    if (!ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled()) {
        return static_cast<uint32_t>(GfxCoreHelperHw<GfxFamily>::getSingleTimestampPacketSizeHw());
    }

    constexpr auto qwordSize = static_cast<uint32_t>(sizeof(uint64_t));
    return qwordSize;
}

View File

@@ -242,6 +242,7 @@ DECLARE_DEBUG_VARIABLE(int64_t, OverrideEventSynchronizeTimeout, -1, "-1: defaul
DECLARE_DEBUG_VARIABLE(int32_t, ForceTlbFlush, -1, "-1: default, 0: Tlb flush disabled, 1: Tlb Flush enabled")
DECLARE_DEBUG_VARIABLE(int32_t, DebugSetMemoryDiagnosticsDelay, -1, "-1: default, >=0: delay time in minutes necessary for completion of Memory diagnostics")
DECLARE_DEBUG_VARIABLE(int32_t, EnableDeviceStateVerification, -1, "-1: default, 0: disable, 1: enable check of device state before submit on Windows")
DECLARE_DEBUG_VARIABLE(int32_t, EnableDynamicPostSyncAllocLayout, -1, "-1: default, 0: Keep Timestamp size layout, 1: Use write immediate layout (qword) and switch dynamically to TS for profiling")
/*LOGGING FLAGS*/
DECLARE_DEBUG_VARIABLE(int32_t, PrintDriverDiagnostics, -1, "prints driver diagnostics messages to standard output, value corresponds to hint level")

View File

@@ -18,6 +18,7 @@ struct ApiSpecificConfig {
static bool getGlobalBindlessHeapConfiguration();
static bool getBindlessMode();
static bool isDeviceAllocationCacheEnabled();
static bool isDynamicPostSyncAllocLayoutEnabled();
static ApiType getApiType();
static std::string getName();
static uint64_t getReducedMaxAllocSize(uint64_t maxAllocSize);

View File

@@ -533,4 +533,5 @@ OverrideHwIpVersion = -1
PrintGlobalTimestampInNs = 0
EnableDeviceStateVerification = -1
VfBarResourceAllocationWa = 1
EnableDynamicPostSyncAllocLayout = -1
# Please don't edit below this line

View File

@@ -6,6 +6,7 @@
*/
#include "shared/source/command_container/walker_partition_interface.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/pipe_control_args.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
@@ -1563,3 +1564,15 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
auto bbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStartList.begin());
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
}
// The immediate-write post-sync offset equals the single timestamp packet size before the
// flag is set. After setting EnableDynamicPostSyncAllocLayout to 1 it equals
// sizeof(uint64_t) if the API-specific config reports the layout as enabled, and stays at
// the timestamp packet size otherwise.
// NOTE(review): the flag is set without a DebugManagerStateRestore visible in this test;
// presumably the fixture restores it - confirm.
HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, givenDebugFlagSetWhenCheckingImmWriteOffsetThenReturnQwordSize) {
    using Dispatch = ImplicitScalingDispatch<FamilyType>;

    EXPECT_EQ(static_cast<uint32_t>(GfxCoreHelperHw<FamilyType>::getSingleTimestampPacketSizeHw()), Dispatch::getImmediateWritePostSyncOffset());

    DebugManager.flags.EnableDynamicPostSyncAllocLayout.set(1);

    const uint32_t expectedOffset = ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled()
                                        ? static_cast<uint32_t>(sizeof(uint64_t))
                                        : static_cast<uint32_t>(GfxCoreHelperHw<FamilyType>::getSingleTimestampPacketSizeHw());
    EXPECT_EQ(expectedOffset, Dispatch::getImmediateWritePostSyncOffset());
}

View File

@@ -55,6 +55,10 @@ bool ApiSpecificConfig::isDeviceAllocationCacheEnabled() {
return false;
}
// Reports whether the dynamic post-sync allocation layout is enabled.
// True only when the EnableDynamicPostSyncAllocLayout debug flag reads exactly 1.
bool ApiSpecificConfig::isDynamicPostSyncAllocLayoutEnabled() {
    const auto flagValue = NEO::DebugManager.flags.EnableDynamicPostSyncAllocLayout.get();
    return flagValue == 1;
}
// Returns the current value of apiTypeForUlts.
// NOTE(review): apiTypeForUlts is declared outside this view; presumably a test-controlled
// variable that lets unit tests choose the reported API type - confirm.
ApiSpecificConfig::ApiType ApiSpecificConfig::getApiType() {
return apiTypeForUlts;
}