Refactor and modularize walker partition code
Related-To: NEO-6244
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
parent e82c2e4653
commit b65d8909e4
@@ -10,5 +10,7 @@
 namespace NEO {
 namespace ImplicitScaling {
 bool apiSupport = false;
-}
+bool semaphoreProgrammingRequired = false;
+bool crossTileAtomicSynchronization = true;
+} // namespace ImplicitScaling
 } // namespace NEO

@@ -10,5 +10,7 @@
 namespace NEO {
 namespace ImplicitScaling {
 bool apiSupport = true;
-}
+bool semaphoreProgrammingRequired = false;
+bool crossTileAtomicSynchronization = true;
+} // namespace ImplicitScaling
 } // namespace NEO

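The two hunks above are the whole per-API configuration mechanism: each driver links exactly one translation unit defining these globals (one path leaves apiSupport false, the OpenCL path sets it true, as asserted by the ImplicitScalingApiTests hunk further down), so shared NEO code can consult the flags without knowing which API is active. A minimal sketch of the pattern; only the variable names come from this diff, the surrounding file layout is illustrative:

    // one definition per API-specific translation unit (sketch)
    namespace NEO {
    namespace ImplicitScaling {
    bool apiSupport = true; // this API opts into implicit scaling
    bool semaphoreProgrammingRequired = false;
    bool crossTileAtomicSynchronization = true;
    } // namespace ImplicitScaling
    } // namespace NEO

    // shared code reads whichever values were linked in, via the extern declarations
    bool allowImplicitScaling = NEO::ImplicitScaling::apiSupport;
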
@@ -88,7 +88,6 @@ set(IGDRCL_SRCS_tests_command_queue
     ${CMAKE_CURRENT_SOURCE_DIR}/ooq_task_tests.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/read_write_buffer_cpu_copy.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/sync_buffer_handler_tests.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_tests_xehp_and_later.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/work_group_size_tests.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/zero_size_enqueue_tests.cpp
 )

@@ -98,7 +97,10 @@ if(TESTS_XEHP_AND_LATER)
     ${CMAKE_CURRENT_SOURCE_DIR}/dispatch_walker_tests_xehp_and_later.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_media_kernel_xehp_and_later.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/enqueue_resource_barier_tests_xehp_and_later.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_tests_xehp_and_later.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_fixture_xehp_and_later.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_fixture_xehp_and_later.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_tests_xehp_and_later_1.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/walker_partition_tests_xehp_and_later_2.cpp
 )
 endif()

@@ -1072,8 +1072,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenWalkerPart
     MockClDevice *device = deviceFactory.rootDevices[0];
     MockContext context{device};
 
-    auto synchronizeBeforeExecution = false;
-    auto staticPartitioning = false;
     auto cmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(&context, device, nullptr);
     auto &csr = cmdQ->getUltCommandStreamReceiver();
 
@@ -1087,35 +1085,39 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenWalkerPart
     DispatchInfo dispatchInfo{};
     dispatchInfo.setNumberOfWorkgroups({32, 1, 1});
 
-    synchronizeBeforeExecution = false;
+    WalkerPartition::WalkerPartitionArgs testArgs = {};
+    testArgs.initializeWparidRegister = true;
+    testArgs.crossTileAtomicSynchronization = true;
+    testArgs.usePipeControlStall = true;
+    testArgs.partitionCount = 2u;
+    testArgs.tileCount = static_cast<uint32_t>(device->getDeviceBitfield().count());
+
     DebugManager.flags.SynchronizeWalkerInWparidMode.set(0);
-    staticPartitioning = false;
+    testArgs.staticPartitioning = false;
+    testArgs.synchronizeBeforeExecution = false;
     csr.staticWorkPartitioningEnabled = false;
-    auto partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(false, 16u, synchronizeBeforeExecution, false, staticPartitioning, false);
+    auto partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(testArgs);
     auto returnedSize = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo);
     EXPECT_EQ(returnedSize, partitionSize + baseSize);
 
-    synchronizeBeforeExecution = false;
     DebugManager.flags.SynchronizeWalkerInWparidMode.set(0);
-    staticPartitioning = true;
+    testArgs.staticPartitioning = true;
     csr.staticWorkPartitioningEnabled = true;
-    partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(false, 16u, synchronizeBeforeExecution, false, staticPartitioning, false);
+    partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(testArgs);
     returnedSize = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo);
     EXPECT_EQ(returnedSize, partitionSize + baseSize);
 
-    synchronizeBeforeExecution = true;
     DebugManager.flags.SynchronizeWalkerInWparidMode.set(1);
-    staticPartitioning = false;
+    testArgs.synchronizeBeforeExecution = true;
+    testArgs.staticPartitioning = false;
     csr.staticWorkPartitioningEnabled = false;
-    partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(false, 16u, synchronizeBeforeExecution, false, staticPartitioning, false);
+    partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(testArgs);
     returnedSize = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo);
     EXPECT_EQ(returnedSize, partitionSize + baseSize);
 
-    synchronizeBeforeExecution = true;
     DebugManager.flags.SynchronizeWalkerInWparidMode.set(1);
-    staticPartitioning = true;
+    testArgs.synchronizeBeforeExecution = true;
+    testArgs.staticPartitioning = true;
     csr.staticWorkPartitioningEnabled = true;
-    partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(false, 16u, synchronizeBeforeExecution, false, staticPartitioning, false);
+    partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(testArgs);
     returnedSize = EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_NDRANGE_KERNEL, false, false, *cmdQ.get(), kernel->mockKernel, dispatchInfo);
     EXPECT_EQ(returnedSize, partitionSize + baseSize);
 }

@@ -1167,7 +1169,14 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, whenQueueIsMul
                     HardwareCommandsHelper<FamilyType>::getSizeRequiredCS() +
                     EncodeMemoryPrefetch<FamilyType>::getSizeForMemoryPrefetch(kernel->kernelInfo.heapInfo.KernelHeapSize);
 
-    auto partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(false, 16u, false, false, false, false);
+    WalkerPartition::WalkerPartitionArgs testArgs = {};
+    testArgs.initializeWparidRegister = true;
+    testArgs.usePipeControlStall = true;
+    testArgs.crossTileAtomicSynchronization = true;
+    testArgs.partitionCount = 16u;
+    testArgs.tileCount = static_cast<uint32_t>(device->getDeviceBitfield().count());
+
+    auto partitionSize = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(testArgs);
 
     DispatchInfo dispatchInfo{};
     dispatchInfo.setNumberOfWorkgroups({32, 1, 1});

@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "opencl/test/unit_test/command_queue/walker_partition_fixture_xehp_and_later.h"
+
+void WalkerPartitionTests::SetUp() {
+    cmdBufferAddress = cmdBuffer;
+
+    testArgs.synchronizeBeforeExecution = false;
+    testArgs.nativeCrossTileAtomicSync = false;
+    testArgs.initializeWparidRegister = true;
+    testArgs.usePipeControlStall = true;
+    testArgs.crossTileAtomicSynchronization = true;
+}
+
+void WalkerPartitionTests::TearDown() {
+    auto initialCommandBufferPointer = cmdBuffer;
+    if (checkForProperCmdBufferAddressOffset) {
+        EXPECT_EQ(ptrDiff(cmdBufferAddress, initialCommandBufferPointer), totalBytesProgrammed);
+    }
+}

@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2021 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/command_container/walker_partition_xehp_and_later.h"
+#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
+#include "shared/test/common/helpers/unit_test_helper.h"
+
+#include "test.h"
+
+using namespace WalkerPartition;
+
+struct WalkerPartitionTests : public ::testing::Test {
+    void SetUp() override;
+
+    void TearDown() override;
+
+    template <typename GfxFamily>
+    auto createWalker(uint64_t postSyncAddress) {
+        WalkerPartition::COMPUTE_WALKER<GfxFamily> walker;
+        walker = GfxFamily::cmdInitGpgpuWalker;
+        walker.setPartitionType(COMPUTE_WALKER<GfxFamily>::PARTITION_TYPE::PARTITION_TYPE_X);
+        auto &postSync = walker.getPostSync();
+        postSync.setOperation(POSTSYNC_DATA<GfxFamily>::OPERATION::OPERATION_WRITE_TIMESTAMP);
+        postSync.setDestinationAddress(postSyncAddress);
+        return walker;
+    }
+
+    char cmdBuffer[4096u];
+    WalkerPartition::WalkerPartitionArgs testArgs = {};
+    void *cmdBufferAddress = nullptr;
+    uint32_t totalBytesProgrammed = 0u;
+    bool checkForProperCmdBufferAddressOffset = true;
+};
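
A sketch of how the split test files are expected to drive this fixture: createWalker produces a partitioned COMPUTE_WALKER with a timestamp post-sync, the test programs commands through cmdBufferAddress, and TearDown (previous hunk) asserts that the pointer advanced by exactly totalBytesProgrammed. The test name and post-sync address below are made up for illustration; the helper signatures come from this diff:

    HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, givenTwoPartitionsWhenWalkerIsProgrammedThenBytesAreAccounted) {
        testArgs.partitionCount = 2u;
        testArgs.tileCount = 2u;

        uint64_t postSyncAddress = 0xff000000ull; // arbitrary GPU VA for the sketch
        auto walker = createWalker<FamilyType>(postSyncAddress);

        // Each program* helper advances cmdBufferAddress and totalBytesProgrammed together;
        // TearDown then checks ptrDiff(cmdBufferAddress, cmdBuffer) == totalBytesProgrammed.
        programPartitionedWalker<FamilyType>(cmdBufferAddress, totalBytesProgrammed, &walker, testArgs.partitionCount);
    }
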
[3 file diffs suppressed because they are too large]

@@ -14,3 +14,11 @@ using namespace NEO;
 TEST(ImplicitScalingApiTests, givenOpenClApiUsedThenSupportEnabled) {
     EXPECT_TRUE(ImplicitScaling::apiSupport);
 }
+
+TEST(ImplicitScalingApiTests, givenOpenClApiUsedThenSemaphoreProgrammingRequiredIsFalse) {
+    EXPECT_FALSE(ImplicitScaling::semaphoreProgrammingRequired);
+}
+
+TEST(ImplicitScalingApiTests, givenOpenClApiUsedThenCrossTileAtomicSynchronization) {
+    EXPECT_TRUE(ImplicitScaling::crossTileAtomicSynchronization);
+}

@@ -266,8 +266,8 @@ ForceWorkgroupSize1x1x1 = -1
 ForceThreadGroupDispatchSize = -1
 ForceStatelessL1CachingPolicy = -1
 ForceMemoryBankIndexOverride = -1
-ExperimentalSynchronizeWithSemaphores = -1
-ExperimentalForceCrossAtomicSynchronization = -1
+SynchronizeWithSemaphores = -1
+UseCrossAtomicSynchronization = -1
 EnableStatelessCompression = -1
 EnableMultiTileCompression = -1
 EnablePrivateScratchSlot1 = -1

@@ -313,7 +313,7 @@ OverrideUseKmdWaitFunction = -1
 EnableCacheFlushAfterWalkerForAllQueues = -1
 Force32BitDriverSupport = -1
 OverrideCmdQueueSynchronousMode = -1
-ExperimentalUseAtomicsForNativeSectionCleanup = -1
+UseAtomicsForNativeSectionCleanup = -1
 HBMSizePerTileInGigabytes = 0
 OverrideSystolicPipelineSelect = -1
 OverrideSystolicInComputeWalker = -1

@@ -324,6 +324,9 @@ DoNotFreeResources = 0
 OverrideGmmResourceUsageField = -1
 LogAllocationType = 0
 ProgramAdditionalPipeControlBeforeStateComputeModeCommand = 0
+ProgramNativeCleanup = -1
+WparidRegisterProgramming = -1
+UsePipeControlAfterPartitionedWalker = -1
 OverrideBufferSuitableForRenderCompression = -1
 AllowMixingRegularAndCooperativeKernels = 0
 AllowPatchingVfeStateInCommandLists = 0

@@ -34,12 +34,55 @@ bool ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired() {
     return synchronizeBeforeExecution;
 }
 
+bool ImplicitScalingHelper::isSemaphoreProgrammingRequired() {
+    auto semaphoreProgrammingRequired = ImplicitScaling::semaphoreProgrammingRequired;
+    if (NEO::DebugManager.flags.SynchronizeWithSemaphores.get() == 1) {
+        semaphoreProgrammingRequired = true;
+    }
+    return semaphoreProgrammingRequired;
+}
+
+bool ImplicitScalingHelper::isCrossTileAtomicRequired() {
+    auto crossTileAtomicSynchronization = ImplicitScaling::crossTileAtomicSynchronization;
+    if (NEO::DebugManager.flags.UseCrossAtomicSynchronization.get() == 0) {
+        crossTileAtomicSynchronization = false;
+    }
+    return crossTileAtomicSynchronization;
+}
+
 bool ImplicitScalingHelper::useAtomicsForNativeCleanup() {
     bool useAtomics = false;
-    int overrideUseAtomics = DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.get();
+    int overrideUseAtomics = DebugManager.flags.UseAtomicsForNativeSectionCleanup.get();
     if (overrideUseAtomics != -1) {
         useAtomics = !!(overrideUseAtomics);
     }
     return useAtomics;
 }
+
+bool ImplicitScalingHelper::programNativeCleanup(bool defaultNativeCleanup) {
+    int overrideProgramNativeCleanup = DebugManager.flags.ProgramNativeCleanup.get();
+    if (overrideProgramNativeCleanup != -1) {
+        defaultNativeCleanup = !!(overrideProgramNativeCleanup);
+    }
+    return defaultNativeCleanup;
+}
+
+bool ImplicitScalingHelper::initWparidRegister() {
+    bool initWparidRegister = true;
+    int overrideInitWparidRegister = DebugManager.flags.WparidRegisterProgramming.get();
+    if (overrideInitWparidRegister != -1) {
+        initWparidRegister = !!(overrideInitWparidRegister);
+    }
+    return initWparidRegister;
+}
+
+bool ImplicitScalingHelper::usePipeControl() {
+    bool usePipeControl = true;
+    int overrideUsePipeControl = DebugManager.flags.UsePipeControlAfterPartitionedWalker.get();
+    if (overrideUsePipeControl != -1) {
+        usePipeControl = !!(overrideUsePipeControl);
+    }
+    return usePipeControl;
+}
 
 } // namespace NEO

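The useAtomicsForNativeCleanup, programNativeCleanup, initWparidRegister, and usePipeControl helpers above all apply the same tri-state convention used by NEO debug variables: -1 keeps the compiled-in default, 0 forces the behavior off, 1 forces it on (isSemaphoreProgrammingRequired and isCrossTileAtomicRequired only honor one side of the override). Reduced to a generic sketch; the helper name here is illustrative, not part of this commit:

    // Tri-state override: -1 = keep default, 0 = force false, 1 = force true.
    bool applyTriState(bool defaultValue, int32_t flagValue) {
        if (flagValue != -1) {
            return flagValue != 0;
        }
        return defaultValue;
    }

    // usePipeControl() above is equivalent to:
    //   applyTriState(true, DebugManager.flags.UsePipeControlAfterPartitionedWalker.get());
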
@@ -15,12 +15,22 @@ class LinearStream;
 
 namespace ImplicitScaling {
 extern bool apiSupport;
-}
+extern bool semaphoreProgrammingRequired;
+extern bool crossTileAtomicSynchronization;
+
+constexpr uint32_t partitionAddressOffsetDwords = 2u;
+constexpr uint32_t partitionAddressOffset = sizeof(uint32_t) * partitionAddressOffsetDwords;
+} // namespace ImplicitScaling
 
 struct ImplicitScalingHelper {
     static bool isImplicitScalingEnabled(const DeviceBitfield &devices, bool preCondition);
+    static bool isSemaphoreProgrammingRequired();
+    static bool isCrossTileAtomicRequired();
     static bool isSynchronizeBeforeExecutionRequired();
     static bool useAtomicsForNativeCleanup();
+    static bool programNativeCleanup(bool defaultNativeCleanup);
+    static bool initWparidRegister();
+    static bool usePipeControl();
 };
 
 template <typename GfxFamily>

@@ -20,6 +20,7 @@ size_t ImplicitScalingDispatch<GfxFamily>::getSize(bool nativeCrossTileAtomicSyn
     typename GfxFamily::COMPUTE_WALKER::PARTITION_TYPE partitionType{};
     bool staticPartitioning = false;
+    const uint32_t tileCount = static_cast<uint32_t>(devices.count());
 
     const uint32_t partitionCount = WalkerPartition::computePartitionCountAndPartitionType<GfxFamily>(tileCount,
                                                                                                       preferStaticPartitioning,
                                                                                                       groupStart,
@@ -28,15 +29,21 @@
                                                                                                       &partitionType,
                                                                                                       &staticPartitioning);
+    UNRECOVERABLE_IF(staticPartitioning && (tileCount != partitionCount));
+    WalkerPartition::WalkerPartitionArgs args = {};
 
-    auto synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
-    const bool useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
-    return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(false,
-                                                                                                16u,
-                                                                                                synchronizeBeforeExecution,
-                                                                                                nativeCrossTileAtomicSync,
-                                                                                                staticPartitioning,
-                                                                                                useAtomicsForNativeCleanup));
+    args.partitionCount = partitionCount;
+    args.tileCount = tileCount;
+    args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
+    args.useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
+    args.nativeCrossTileAtomicSync = ImplicitScalingHelper::programNativeCleanup(nativeCrossTileAtomicSync);
+    args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
+    args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
+    args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
+    args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
+    args.emitBatchBufferEnd = false;
+    args.staticPartitioning = staticPartitioning;
+
+    return static_cast<size_t>(WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args));
 }
 
 template <typename GfxFamily>

@@ -54,36 +61,43 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
 
     bool staticPartitioning = false;
     partitionCount = WalkerPartition::computePartitionCountAndSetPartitionType<GfxFamily>(&walkerCmd, tileCount, preferStaticPartitioning, usesImages, &staticPartitioning);
-    const bool synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
-    const bool useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
+
+    WalkerPartition::WalkerPartitionArgs args = {};
+    args.workPartitionAllocationGpuVa = workPartitionAllocationGpuVa;
+    args.partitionCount = partitionCount;
+    args.tileCount = tileCount;
+    args.synchronizeBeforeExecution = ImplicitScalingHelper::isSynchronizeBeforeExecutionRequired();
+    args.useAtomicsForNativeCleanup = ImplicitScalingHelper::useAtomicsForNativeCleanup();
+    args.nativeCrossTileAtomicSync = ImplicitScalingHelper::programNativeCleanup(nativeCrossTileAtomicSync);
+    args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
+    args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
+    args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired();
+    args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
+    args.emitBatchBufferEnd = false;
+    args.secondaryBatchBuffer = useSecondaryBatchBuffer;
+    args.staticPartitioning = staticPartitioning;
 
     if (staticPartitioning) {
         UNRECOVERABLE_IF(tileCount != partitionCount);
         WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
                                                                                 commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
                                                                                 &walkerCmd,
                                                                                 totalProgrammedSize,
-                                                                                partitionCount,
-                                                                                tileCount,
-                                                                                synchronizeBeforeExecution,
-                                                                                useSecondaryBatchBuffer,
-                                                                                nativeCrossTileAtomicSync,
-                                                                                workPartitionAllocationGpuVa,
-                                                                                useAtomicsForNativeCleanup);
+                                                                                args);
     } else {
         if (DebugManager.flags.ExperimentalSetWalkerPartitionCount.get()) {
             partitionCount = DebugManager.flags.ExperimentalSetWalkerPartitionCount.get();
             if (partitionCount == 1u) {
                 walkerCmd.setPartitionType(GfxFamily::COMPUTE_WALKER::PARTITION_TYPE::PARTITION_TYPE_DISABLED);
             }
+            args.partitionCount = partitionCount;
         }
 
         WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
                                                                                  commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
-                                                                                 &walkerCmd, totalProgrammedSize,
-                                                                                 partitionCount, tileCount,
-                                                                                 false, synchronizeBeforeExecution, useSecondaryBatchBuffer,
-                                                                                 nativeCrossTileAtomicSync,
-                                                                                 useAtomicsForNativeCleanup);
+                                                                                 &walkerCmd,
+                                                                                 totalProgrammedSize,
+                                                                                 args);
     }
     commandStream.getSpace(totalProgrammedSize);
 }

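dispatchCommands above relies on a peek-then-commit idiom of LinearStream, visible at the end of the hunk: getSpace(0u) returns the current write pointer without reserving anything, the construct* helper writes through that raw pointer while accumulating totalProgrammedSize, and only afterwards is the stream advanced by the real amount. The idiom in isolation (a sketch; only the LinearStream calls are taken from this diff):

    // Peek: current write position, no space consumed yet.
    void *writePtr = commandStream.getSpace(0u);

    uint32_t totalProgrammedSize = 0u;
    // ... write commands through writePtr, adding each command's size
    //     to totalProgrammedSize ...

    // Commit: advance the stream by exactly what was written.
    commandStream.getSpace(totalProgrammedSize);
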
@@ -18,6 +18,22 @@
 
 namespace WalkerPartition {
 
+struct WalkerPartitionArgs {
+    uint64_t workPartitionAllocationGpuVa = 0;
+    uint32_t partitionCount = 0;
+    uint32_t tileCount = 0;
+    bool emitBatchBufferEnd = false;
+    bool secondaryBatchBuffer = false;
+    bool synchronizeBeforeExecution = false;
+    bool crossTileAtomicSynchronization = false;
+    bool semaphoreProgrammingRequired = false;
+    bool staticPartitioning = false;
+    bool nativeCrossTileAtomicSync = false;
+    bool useAtomicsForNativeCleanup = false;
+    bool initializeWparidRegister = false;
+    bool usePipeControlStall = false;
+};
+
 template <typename GfxFamily>
 using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
 template <typename GfxFamily>

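The struct above is the heart of the refactor: the long positional parameter lists on estimateSpaceRequiredInCommandBuffer and the two construct* functions collapse into one bundle of named fields, so a call site no longer encodes meaning by argument order. Compare the call shapes, condensed from the call sites elsewhere in this diff:

    // Before: six positional arguments, four of them bool.
    // estimateSpaceRequiredInCommandBuffer<GfxFamily>(false, 16u, synchronizeBeforeExecution,
    //                                                 nativeCrossTileAtomicSync, staticPartitioning,
    //                                                 useAtomicsForNativeCleanup);

    // After: every knob is named at the call site and defaults to off.
    WalkerPartition::WalkerPartitionArgs args = {};
    args.partitionCount = 16u;
    args.staticPartitioning = staticPartitioning;
    args.synchronizeBeforeExecution = synchronizeBeforeExecution;
    args.nativeCrossTileAtomicSync = nativeCrossTileAtomicSync;
    args.useAtomicsForNativeCleanup = useAtomicsForNativeCleanup;
    auto size = WalkerPartition::estimateSpaceRequiredInCommandBuffer<GfxFamily>(args);
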
@@ -73,22 +89,6 @@ Command *putCommand(void *&inputAddress, uint32_t &totalBytesProgrammed) {
     return commandToReturn;
 }
 
-bool inline isSemaphoreProgrammingRequired() {
-    auto semaphoreProgrammingRequired = false;
-    if (NEO::DebugManager.flags.ExperimentalSynchronizeWithSemaphores.get() == 1) {
-        semaphoreProgrammingRequired = true;
-    }
-    return semaphoreProgrammingRequired;
-}
-
-bool inline isCrossTileAtomicRequired() {
-    auto crossTileAtomicSynchronization = true;
-    if (NEO::DebugManager.flags.ExperimentalForceCrossAtomicSynchronization.get() == 0) {
-        crossTileAtomicSynchronization = false;
-    }
-    return crossTileAtomicSynchronization;
-}
-
 template <typename GfxFamily>
 uint32_t computePartitionCountAndPartitionType(uint32_t preferredMinimalPartitionCount,
                                                bool preferStaticPartitioning,

@@ -349,25 +349,55 @@ void programStoreMemImmediateDword(void *&inputAddress, uint32_t &totalBytesProg
     *storeDataImmediate = cmd;
 }
 
+template <typename GfxFamily>
+uint64_t computeNativeCrossTileSyncControlSectionSize(bool useAtomicsForNativeCleanup) {
+    if (useAtomicsForNativeCleanup) {
+        return sizeof(MI_ATOMIC<GfxFamily>);
+    } else {
+        return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
+    }
+}
+
 template <typename GfxFamily>
 void programNativeCrossTileSyncControl(void *&inputAddress,
                                        uint32_t &totalBytesProgrammed,
-                                       uint64_t finalSyncTileCountField,
+                                       uint64_t address,
                                        bool useAtomicsForNativeCleanup) {
     if (useAtomicsForNativeCleanup) {
         programMiAtomic<GfxFamily>(inputAddress,
                                    totalBytesProgrammed,
-                                   finalSyncTileCountField,
+                                   address,
                                    false,
                                    MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_MOVE);
     } else {
         programStoreMemImmediateDword<GfxFamily>(inputAddress,
                                                  totalBytesProgrammed,
-                                                 finalSyncTileCountField,
+                                                 address,
                                                  0u);
     }
 }
+
+template <typename GfxFamily>
+uint64_t computeTilesSynchronizationWithAtomicsSectionSize() {
+    return sizeof(MI_ATOMIC<GfxFamily>) +
+           sizeof(MI_SEMAPHORE_WAIT<GfxFamily>);
+}
+
+template <typename GfxFamily>
+void programTilesSynchronizationWithAtomics(void *&currentBatchBufferPointer,
+                                            uint32_t &totalBytesProgrammed,
+                                            uint64_t atomicAddress,
+                                            uint32_t tileCount) {
+    programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
+    programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
+}
+
+template <typename GfxFamily>
+uint64_t computeNativeCrossTileSyncCleanupSectionSize(size_t fieldsForCleanupCount, bool useAtomicsForNativeCleanup) {
+    return fieldsForCleanupCount * computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) +
+           2 * computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
+}
 
 template <typename GfxFamily>
 void programNativeCrossTileSyncCleanup(void *&inputAddress,
                                        uint32_t &totalBytesProgrammed,
@@ -377,28 +407,18 @@ void programNativeCrossTileSyncCleanup(void *&inputAddress,
                                        uint32_t tileCount,
                                        bool useAtomicsForNativeCleanup) {
     // Synchronize tiles, so the fields are not cleared while still in use
-    programMiAtomic<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
-    programWaitForSemaphore<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
+    programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, tileCount);
 
     for (auto fieldIndex = 0u; fieldIndex < fieldsForCleanupCount; fieldIndex++) {
         const uint64_t addressForCleanup = baseAddressForCleanup + fieldIndex * sizeof(uint32_t);
-        if (useAtomicsForNativeCleanup) {
-            programMiAtomic<GfxFamily>(inputAddress,
-                                       totalBytesProgrammed,
-                                       addressForCleanup,
-                                       false,
-                                       MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_MOVE);
-        } else {
-            programStoreMemImmediateDword<GfxFamily>(inputAddress,
-                                                     totalBytesProgrammed,
-                                                     addressForCleanup,
-                                                     0u);
-        }
+        programNativeCrossTileSyncControl<GfxFamily>(inputAddress,
+                                                     totalBytesProgrammed,
+                                                     addressForCleanup,
+                                                     useAtomicsForNativeCleanup);
     }
 
     //this synchronization point ensures that all tiles finished zeroing and will fairly access control section atomic variables
-    programMiAtomic<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
-    programWaitForSemaphore<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
+    programTilesSynchronizationWithAtomics<GfxFamily>(inputAddress, totalBytesProgrammed, finalSyncTileCountAddress, 2 * tileCount);
 }
 
 template <typename GfxFamily>

@@ -412,15 +432,6 @@ void programTilesSynchronizationWithPostSyncs(void *&currentBatchBufferPointer,
     }
 }
 
-template <typename GfxFamily>
-void programTilesSynchronizationWithAtomics(void *&currentBatchBufferPointer,
-                                            uint32_t &totalBytesProgrammed,
-                                            uint64_t atomicAddress,
-                                            uint32_t tileCount) {
-    programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
-    programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
-}
-
 template <typename GfxFamily>
 uint64_t computeWalkerSectionSize() {
     return sizeof(BATCH_BUFFER_START<GfxFamily>) +

@@ -428,46 +439,30 @@ uint64_t computeWalkerSectionSize() {
 }
 
 template <typename GfxFamily>
-uint64_t computeNativeCrossTileSyncControlSectionSize(bool useAtomicsForNativeCleanup) {
-    if (useAtomicsForNativeCleanup) {
-        return sizeof(MI_ATOMIC<GfxFamily>);
-    } else {
-        return sizeof(MI_STORE_DATA_IMM<GfxFamily>);
-    }
-}
-
-template <typename GfxFamily>
-uint64_t computeNativeCrossTileSyncCleanupSectionSize(size_t fieldsForCleanupCount, bool useAtomicsForNativeCleanup) {
-    return fieldsForCleanupCount * computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) +
-           2 * sizeof(MI_ATOMIC<GfxFamily>) +
-           2 * sizeof(MI_SEMAPHORE_WAIT<GfxFamily>);
-}
-
-template <typename GfxFamily>
-uint64_t computeControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync, bool useAtomicsForNativeCleanup) {
-    auto synchronizationCount = (synchronizeBeforeExecution) ? 2u : 1u;
-    if (!isCrossTileAtomicRequired() && !nativeCrossTileAtomicSync) {
-        synchronizationCount--;
-    }
-
-    return sizeof(LOAD_REGISTER_IMM<GfxFamily>) +
-           sizeof(MI_ATOMIC<GfxFamily>) * (1u + synchronizationCount) +
-           sizeof(LOAD_REGISTER_REG<GfxFamily>) +
-           sizeof(MI_SET_PREDICATE<GfxFamily>) * 2 +
-           sizeof(BATCH_BUFFER_START<GfxFamily>) * 2 +
-           sizeof(PIPE_CONTROL<GfxFamily>) +
-           sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * synchronizationCount +
-           (isSemaphoreProgrammingRequired() ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * partitionCount : 0u) +
-           computeWalkerSectionSize<GfxFamily>() +
-           (nativeCrossTileAtomicSync ? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) : 0u);
+uint64_t computeControlSectionOffset(WalkerPartitionArgs &args) {
+    uint64_t size = 0u;
+
+    size += args.synchronizeBeforeExecution ? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>() : 0;
+    size += sizeof(LOAD_REGISTER_IMM<GfxFamily>); //predication mask
+    size += sizeof(MI_ATOMIC<GfxFamily>);         //current id for partition
+    size += sizeof(LOAD_REGISTER_REG<GfxFamily>); //id into register
+    size += sizeof(MI_SET_PREDICATE<GfxFamily>) * 2 +
+            sizeof(BATCH_BUFFER_START<GfxFamily>) * 2;
+    size += (args.semaphoreProgrammingRequired ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * args.partitionCount : 0u);
+    size += computeWalkerSectionSize<GfxFamily>();
+    size += args.usePipeControlStall ? sizeof(PIPE_CONTROL<GfxFamily>) : 0u;
+    if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
+        size += computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
+    }
+    if (args.nativeCrossTileAtomicSync) {
+        size += computeNativeCrossTileSyncControlSectionSize<GfxFamily>(args.useAtomicsForNativeCleanup);
+    }
+
+    return size;
 }
 
 template <typename GfxFamily>
-uint64_t computeWalkerSectionStart(uint32_t partitionCount,
-                                   bool synchronizeBeforeExecution,
-                                   bool nativeCrossTileAtomicSync,
-                                   bool useAtomicsForNativeCleanup) {
-    return computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup) -
+uint64_t computeWalkerSectionStart(WalkerPartitionArgs &args) {
+    return computeControlSectionOffset<GfxFamily>(args) -
            computeWalkerSectionSize<GfxFamily>();
 }

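The rewritten computeControlSectionOffset is now a plain sum of optional sections gated by args fields, instead of back-computing a synchronizationCount. As a concrete reading, with only usePipeControlStall and crossTileAtomicSynchronization enabled the dynamic control-section offset reduces to the sum below; this is a sketch restating the function body for one fixed configuration, not code from the commit:

    template <typename GfxFamily>
    uint64_t exampleDynamicControlSectionOffset() {
        // computeControlSectionOffset for: usePipeControlStall = true,
        // crossTileAtomicSynchronization = true, all other toggles false.
        return sizeof(LOAD_REGISTER_IMM<GfxFamily>) +      // predication mask
               sizeof(MI_ATOMIC<GfxFamily>) +              // current id for partition
               sizeof(LOAD_REGISTER_REG<GfxFamily>) +      // id into register
               sizeof(MI_SET_PREDICATE<GfxFamily>) * 2 +
               sizeof(BATCH_BUFFER_START<GfxFamily>) * 2 +
               computeWalkerSectionSize<GfxFamily>() +
               sizeof(PIPE_CONTROL<GfxFamily>) +
               computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>();
    }
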
@@ -537,26 +532,17 @@ void constructDynamicallyPartitionedCommandBuffer(void *cpuPointer,
                                                   uint64_t gpuAddressOfAllocation,
                                                   COMPUTE_WALKER<GfxFamily> *inputWalker,
                                                   uint32_t &totalBytesProgrammed,
-                                                  uint32_t partitionCount,
-                                                  uint32_t tileCount,
-                                                  bool emitBatchBufferEnd,
-                                                  bool synchronizeBeforeExecution,
-                                                  bool secondaryBatchBuffer,
-                                                  bool nativeCrossTileAtomicSync,
-                                                  bool useAtomicsForNativeCleanup) {
+                                                  WalkerPartitionArgs &args) {
     totalBytesProgrammed = 0u;
     void *currentBatchBufferPointer = cpuPointer;
 
-    auto controlSectionOffset = computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
-    if (synchronizeBeforeExecution) {
+    auto controlSectionOffset = computeControlSectionOffset<GfxFamily>(args);
+    if (args.synchronizeBeforeExecution) {
         auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, inTileCount);
-        programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
-
-        //if all tiles hit the atomic, it means we may go further
-        programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
+        programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
     }
 
-    programWparidMask<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, partitionCount);
+    programWparidMask<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.partitionCount);
 
     programMiAtomic<GfxFamily>(currentBatchBufferPointer,
                                totalBytesProgrammed,

@@ -573,36 +559,32 @@
     programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer,
                                          totalBytesProgrammed,
                                          gpuAddressOfAllocation +
-                                             computeWalkerSectionStart<GfxFamily>(partitionCount,
-                                                                                  synchronizeBeforeExecution,
-                                                                                  nativeCrossTileAtomicSync,
-                                                                                  useAtomicsForNativeCleanup),
+                                             computeWalkerSectionStart<GfxFamily>(args),
                                          true,
-                                         secondaryBatchBuffer);
+                                         args.secondaryBatchBuffer);
 
     //disable predication to not noop subsequent commands.
     programWparidPredication<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, false);
 
-    if (nativeCrossTileAtomicSync) {
+    if (args.nativeCrossTileAtomicSync) {
         const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
-        programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, useAtomicsForNativeCleanup);
+        programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForNativeCleanup);
     }
 
-    programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);
+    if (args.usePipeControlStall) {
+        programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true);
+    }
 
-    if (isSemaphoreProgrammingRequired()) {
+    if (args.semaphoreProgrammingRequired) {
         auto postSyncAddress = inputWalker->getPostSync().getDestinationAddress() + 8llu;
-        for (uint32_t partitionId = 0u; partitionId < partitionCount; partitionId++) {
+        for (uint32_t partitionId = 0u; partitionId < args.partitionCount; partitionId++) {
             programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, postSyncAddress + partitionId * 16llu, 1u, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
         }
     }
 
-    if (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) {
+    if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
         auto tileAtomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, tileCount);
-        programMiAtomic<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, false, MI_ATOMIC<GfxFamily>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
-
-        //if all tiles hit the atomic, it means we may go further
-        programWaitForSemaphore<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, tileCount, MI_SEMAPHORE_WAIT<GfxFamily>::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD);
+        programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, tileAtomicAddress, args.tileCount);
     }
 
     //this bb start goes to the end of partitioned command buffer

@@ -611,12 +593,12 @@
                                          totalBytesProgrammed,
                                          gpuAddressOfAllocation + controlSectionOffset + sizeof(BatchBufferControlData),
                                          false,
-                                         secondaryBatchBuffer);
+                                         args.secondaryBatchBuffer);
 
     //Walker section
-    programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
+    programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
 
-    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, secondaryBatchBuffer);
+    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation, false, args.secondaryBatchBuffer);
 
     auto controlSection = reinterpret_cast<BatchBufferControlData *>(ptrOffset(cpuPointer, static_cast<size_t>(controlSectionOffset)));
     controlSection->partitionCount = 0u;

@@ -626,18 +608,18 @@
     totalBytesProgrammed += sizeof(BatchBufferControlData);
     currentBatchBufferPointer = ptrOffset(currentBatchBufferPointer, sizeof(BatchBufferControlData));
 
-    if (nativeCrossTileAtomicSync) {
+    if (args.nativeCrossTileAtomicSync) {
         const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(BatchBufferControlData, finalSyncTileCount);
         programNativeCrossTileSyncCleanup<GfxFamily>(currentBatchBufferPointer,
                                                      totalBytesProgrammed,
                                                      finalSyncTileCountAddress,
                                                      gpuAddressOfAllocation + controlSectionOffset,
                                                      dynamicPartitioningFieldsForCleanupCount,
-                                                     tileCount,
-                                                     useAtomicsForNativeCleanup);
+                                                     args.tileCount,
+                                                     args.useAtomicsForNativeCleanup);
     }
 
-    if (emitBatchBufferEnd) {
+    if (args.emitBatchBufferEnd) {
         auto batchBufferEnd = putCommand<BATCH_BUFFER_END<GfxFamily>>(currentBatchBufferPointer, totalBytesProgrammed);
         *batchBufferEnd = GfxFamily::cmdInitBatchBufferEnd;
     }

@@ -651,14 +633,28 @@ struct StaticPartitioningControlSection {
 static constexpr inline size_t staticPartitioningFieldsForCleanupCount = sizeof(StaticPartitioningControlSection) / sizeof(uint32_t) - 1;
 
 template <typename GfxFamily>
-uint64_t computeStaticPartitioningControlSectionOffset(uint32_t partitionCount, bool synchronizeBeforeExecution, bool nativeCrossTileAtomicSync, bool useAtomicsForNativeCleanup) {
-    const auto beforeExecutionSyncAtomicSize = synchronizeBeforeExecution ? (sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) + sizeof(MI_ATOMIC<GfxFamily>)) : 0u;
-    const auto afterExecutionSyncAtomicSize = (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) ? (sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) + sizeof(MI_ATOMIC<GfxFamily>)) : 0u;
-    const auto afterExecutionSyncPostSyncSize = isSemaphoreProgrammingRequired() ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * partitionCount : 0u;
-    const auto nativeCrossTileSyncSize = nativeCrossTileAtomicSync ? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(useAtomicsForNativeCleanup) : 0u;
+uint64_t computeStaticPartitioningControlSectionOffset(WalkerPartitionArgs &args) {
+    const auto beforeExecutionSyncAtomicSize = args.synchronizeBeforeExecution
+                                                   ? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
+                                                   : 0u;
+    const auto afterExecutionSyncAtomicSize = (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync)
+                                                  ? computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>()
+                                                  : 0u;
+    const auto afterExecutionSyncPostSyncSize = args.semaphoreProgrammingRequired
+                                                    ? sizeof(MI_SEMAPHORE_WAIT<GfxFamily>) * args.partitionCount
+                                                    : 0u;
+    const auto nativeCrossTileSyncSize = args.nativeCrossTileAtomicSync
+                                             ? computeNativeCrossTileSyncControlSectionSize<GfxFamily>(args.useAtomicsForNativeCleanup)
+                                             : 0u;
+    const auto wparidRegisterSize = args.initializeWparidRegister
+                                        ? sizeof(LOAD_REGISTER_MEM<GfxFamily>)
+                                        : 0u;
+    const auto pipeControlSize = args.usePipeControlStall
+                                     ? sizeof(PIPE_CONTROL<GfxFamily>)
+                                     : 0u;
     return beforeExecutionSyncAtomicSize +
-           sizeof(LOAD_REGISTER_MEM<GfxFamily>) +
-           sizeof(PIPE_CONTROL<GfxFamily>) +
+           wparidRegisterSize +
+           pipeControlSize +
            sizeof(COMPUTE_WALKER<GfxFamily>) +
            nativeCrossTileSyncSize +
            afterExecutionSyncAtomicSize +

@@ -671,49 +667,48 @@ void constructStaticallyPartitionedCommandBuffer(void *cpuPointer,
                                                  uint64_t gpuAddressOfAllocation,
                                                  COMPUTE_WALKER<GfxFamily> *inputWalker,
                                                  uint32_t &totalBytesProgrammed,
-                                                 uint32_t partitionCount,
-                                                 uint32_t tileCount,
-                                                 bool synchronizeBeforeExecution,
-                                                 bool secondaryBatchBuffer,
-                                                 bool nativeCrossTileAtomicSync,
-                                                 uint64_t workPartitionAllocationGpuVa,
-                                                 bool useAtomicsForNativeCleanup) {
+                                                 WalkerPartitionArgs &args) {
     totalBytesProgrammed = 0u;
     void *currentBatchBufferPointer = cpuPointer;
 
     // Get address of the control section
-    const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
+    const auto controlSectionOffset = computeStaticPartitioningControlSectionOffset<GfxFamily>(args);
     const auto afterControlSectionOffset = controlSectionOffset + sizeof(StaticPartitioningControlSection);
 
     // Synchronize tiles before walker
-    if (synchronizeBeforeExecution) {
+    if (args.synchronizeBeforeExecution) {
         const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
-        programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount);
+        programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
     }
 
     // Load partition ID to wparid register and execute walker
-    programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, workPartitionAllocationGpuVa, wparidCCSOffset);
-    programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
+    if (args.initializeWparidRegister) {
+        programMiLoadRegisterMem<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.workPartitionAllocationGpuVa, wparidCCSOffset);
+    }
+    programPartitionedWalker<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
 
     // Prepare for cleanup section
-    if (nativeCrossTileAtomicSync) {
+    if (args.nativeCrossTileAtomicSync) {
         const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
-        programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, useAtomicsForNativeCleanup);
+        programNativeCrossTileSyncControl<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForNativeCleanup);
     }
 
-    programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true); // flush L3 cache
+    if (args.usePipeControlStall) {
+        programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, true); // flush L3 cache
+    }
 
     // Synchronize tiles after walker
-    if (isSemaphoreProgrammingRequired()) {
-        programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, partitionCount);
+    if (args.semaphoreProgrammingRequired) {
+        programTilesSynchronizationWithPostSyncs<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, inputWalker, args.partitionCount);
     }
-    if (isCrossTileAtomicRequired() || nativeCrossTileAtomicSync) {
+
+    if (args.crossTileAtomicSynchronization || args.nativeCrossTileAtomicSync) {
         const auto atomicAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
-        programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, tileCount);
+        programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, atomicAddress, args.tileCount);
     }
 
     // Jump over the control section
-    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, secondaryBatchBuffer);
+    programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);
 
     // Control section
     DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);

@@ -724,35 +719,31 @@
     DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);
 
     // Cleanup section
-    if (nativeCrossTileAtomicSync) {
+    if (args.nativeCrossTileAtomicSync) {
         const auto finalSyncTileCountAddress = gpuAddressOfAllocation + controlSectionOffset + offsetof(StaticPartitioningControlSection, finalSyncTileCounter);
         programNativeCrossTileSyncCleanup<GfxFamily>(currentBatchBufferPointer,
                                                      totalBytesProgrammed,
                                                      finalSyncTileCountAddress,
                                                      gpuAddressOfAllocation + controlSectionOffset,
                                                      staticPartitioningFieldsForCleanupCount,
-                                                     tileCount,
-                                                     useAtomicsForNativeCleanup);
+                                                     args.tileCount,
+                                                     args.useAtomicsForNativeCleanup);
     }
 }
 
 template <typename GfxFamily>
-uint64_t estimateSpaceRequiredInCommandBuffer(bool requiresBatchBufferEnd,
-                                              uint32_t partitionCount,
-                                              bool synchronizeBeforeExecution,
-                                              bool nativeCrossTileAtomicSync,
-                                              bool staticPartitioning,
-                                              bool useAtomicsForNativeCleanup) {
+uint64_t estimateSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args) {
     uint64_t size = {};
-    if (staticPartitioning) {
-        size += computeStaticPartitioningControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
+    if (args.staticPartitioning) {
+        size += computeStaticPartitioningControlSectionOffset<GfxFamily>(args);
         size += sizeof(StaticPartitioningControlSection);
-        size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount, useAtomicsForNativeCleanup) : 0u;
+        size += args.nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(staticPartitioningFieldsForCleanupCount, args.useAtomicsForNativeCleanup) : 0u;
     } else {
-        size += computeControlSectionOffset<GfxFamily>(partitionCount, synchronizeBeforeExecution, nativeCrossTileAtomicSync, useAtomicsForNativeCleanup);
+        size += computeControlSectionOffset<GfxFamily>(args);
         size += sizeof(BatchBufferControlData);
-        size += requiresBatchBufferEnd ? sizeof(BATCH_BUFFER_END<GfxFamily>) : 0u;
-        size += nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount, useAtomicsForNativeCleanup) : 0u;
+        size += args.emitBatchBufferEnd ? sizeof(BATCH_BUFFER_END<GfxFamily>) : 0u;
+        size += args.nativeCrossTileAtomicSync ? computeNativeCrossTileSyncCleanupSectionSize<GfxFamily>(dynamicPartitioningFieldsForCleanupCount, args.useAtomicsForNativeCleanup) : 0u;
     }
     return size;
 }

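Because estimateSpaceRequiredInCommandBuffer and the construct* functions now consume the same WalkerPartitionArgs object, the size estimate stays in lockstep with what actually gets emitted, so a caller that populates args once can treat the estimate as an upper bound. A sketch of that invariant; buffer setup, GPU address, and the walker are assumed to exist and everything here is illustrative:

    WalkerPartition::WalkerPartitionArgs args = {};
    args.partitionCount = 2u;
    args.tileCount = 2u;
    args.initializeWparidRegister = true;
    args.usePipeControlStall = true;
    args.crossTileAtomicSynchronization = true;

    auto estimated = WalkerPartition::estimateSpaceRequiredInCommandBuffer<FamilyType>(args);

    uint32_t totalBytesProgrammed = 0u;
    WalkerPartition::constructDynamicallyPartitionedCommandBuffer<FamilyType>(cpuPointer, gpuVa, &walker,
                                                                              totalBytesProgrammed, args);

    // Same args in both calls, so the estimate covers every emitted section.
    EXPECT_GE(estimated, totalBytesProgrammed);
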
@@ -123,15 +123,11 @@ DECLARE_DEBUG_VARIABLE(int32_t, CFENumberOfWalkers, -1, "Set Number of Walkers i
 DECLARE_DEBUG_VARIABLE(int32_t, CFEMaximumNumberOfThreads, -1, "Set Maximum Number of Threads in CFE_STATE on XEHP, -1 - do not set")
 DECLARE_DEBUG_VARIABLE(int32_t, CFEOverDispatchControl, -1, "Set Over Dispatch Control in CFE_STATE on XEHP, -1 - do not set")
 DECLARE_DEBUG_VARIABLE(int32_t, CFELargeGRFThreadAdjustDisable, -1, "Set Large GRF thread adjust Disable field in CFE_STATE, -1 - do not set")
-DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWalkerInWparidMode, -1, "-1: default, 0: do not synchronize 1: synchronize all tiles prior to doing work distrubution")
-DECLARE_DEBUG_VARIABLE(int32_t, EnableWalkerPartition, -1, "-1: default, 0: disable, 1: enable, Enables Walker Partitioning via WPARID.")
 DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumComputeUnitsForScratch, -1, "Override number of compute units used for scratch size calculation")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceWorkgroupSize1x1x1, -1, "-1: default, 0: disable, 1: enable, force workgroup size 1x1x1 in builtins")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceThreadGroupDispatchSize, -1, "Set ThreadGroupDispatchSize in INTERFACE_DESCRIPTOR_DATA, -1 - default, 0 - TG size 8, 1 - TG size 4, 2 - TG size 2, 3 - Reserved")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessL1CachingPolicy, -1, "-1: default, >=0 : program value for stateless L1 caching")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceMemoryBankIndexOverride, -1, "-1: default, 0: disable, 1:enable, Force index=1 of memory bank for XEHP")
-DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSynchronizeWithSemaphores, -1, "Experimental implementation: 1: Emit Semaphores waiting after Walker completion in WPARID mode 0: do not emit semaphores after Walker")
-DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalForceCrossAtomicSynchronization, -1, "Experimental implementation: 1: Cross Tile Atomic Synchronization present 0: Cross tile atomic synchronization disabled")
 DECLARE_DEBUG_VARIABLE(int32_t, EnablePrivateScratchSlot1, -1, "-1: default, 0: disable, 1: enable Allows using private scratch space")
 DECLARE_DEBUG_VARIABLE(int32_t, DisablePipeControlPrecedingPostSyncCommand, -1, "-1 default - disabled adding PIPE_CONTROL, 0 - disabled adding PIPE_CONTROL, 1 - enabled adding PIPE_CONTROL")
 DECLARE_DEBUG_VARIABLE(int32_t, UseCachingPolicyForIndirectObjectHeap, -1, "Use selected caching policy for IOH, -1 - default, 0 - Uncached, 1 - L3 Caching, 2 - L1 Caching")

@@ -142,13 +138,11 @@ DECLARE_DEBUG_VARIABLE(int32_t, ForceMultiGpuPartialWrites, -1, "-1: default - 0
 DECLARE_DEBUG_VARIABLE(int32_t, ForceMultiGpuAtomicsInComputeMode, -1, "-1: default - 0 for multiOsContext capable, 0: program value 0 in MultiGpuAtomics bit in STATE_COMPUTE_MODE, 1: program value 1 in MultiGpuAtomics bit in STATE_COMPUTE_MODE")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceMultiGpuAtomics, -1, "-1: default - 0 for multiOsContext capable, 0: program value 0 in MultiGpuAtomics controls 1: program value 1 in MultiGpuAtomics controls")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceBufferCompressionFormat, -1, "-1: default, >0: Format value")
-DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionCount, 0, "Experimental implementation: Set number of COMPUTE_WALKERs for a given Partition Type, 0 - do not set the feature.")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableHwGenerationLocalIds, -1, "-1: default, 0: disable, 1: enable : Enables generation of local ids on HW")
 DECLARE_DEBUG_VARIABLE(int32_t, WalkerPartitionPreferHighestDimension, -1, "-1: default, 0: prefer biggest dimension, 1: prefer Z over Y over X if they divide partition count evenly")
 DECLARE_DEBUG_VARIABLE(int32_t, SetMinimalPartitionSize, -1, "-1 default value set to 512 workgroups, 0 - disabled, >0 - minimal partition size in workgroups (should be power of 2)")
 DECLARE_DEBUG_VARIABLE(int32_t, OverrideBlitterTargetMemory, -1, "-1:default 0: overwrites to System 1: overwrites to Local")
 DECLARE_DEBUG_VARIABLE(int32_t, OverrideBlitterMocs, -1, "-1: default, >=0 SetGivenMocsInBlitterTransfers")
-DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionType, -1, "Experimental implementation: Set COMPUTE_WALKER Partition Type. Valid values for types from 1 to 3")
 DECLARE_DEBUG_VARIABLE(int32_t, OverridePostSyncMocs, -1, "-1: default, >=0 Override post sync mocs with value")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableImmediateVmBindExt, -1, "Use immediate bind extension to a new residency model on Linux (requires kernel support), -1: default (enabled whith direct submission), 0: disabled, 1: enabled")
 DECLARE_DEBUG_VARIABLE(int32_t, ForceExecutionTile, -1, "-1: default, 0+: given tile is choosen as submission, must be used with EnableWalkerPartition = 0.")

@@ -237,11 +231,21 @@ DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionOverrideComputeSupport, -1, "Ove
 DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableCacheFlush, -1, "-1: driver default, 0: additional cache flush is present 1: disable dispatching cache flush commands")
 DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionNewResourceTlbFlush, -1, "-1: driver default - flush when new resource is bound, 0: disabled, 1: enabled")
 DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableMonitorFence, -1, "Disable dispatching monitor fence commands")
-DECLARE_DEBUG_VARIABLE(bool, USMEvictAfterMigration, true, "Evict USM allocation after implicit migration to GPU")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmissionController, -1, "Enable direct submission terminating after given timeout, -1: default, 0: disabled, 1: enabled")
 DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionControllerTimeout, -1, "Set direct submission controller timeout, -1: default 5 ms, >=0: timeout in ms")
 
+/* IMPLICIT SCALING */
+DECLARE_DEBUG_VARIABLE(int32_t, EnableWalkerPartition, -1, "-1: default, 0: disable, 1: enable, Enables Walker Partitioning via WPARID.")
+DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWalkerInWparidMode, -1, "-1: default, 0: do not synchronize 1: synchronize all tiles prior to doing work distrubution")
+DECLARE_DEBUG_VARIABLE(int32_t, SynchronizeWithSemaphores, -1, "-1: default (disabled), 1: Emit Semaphores waiting after Walker completion in WPARID mode 0: do not emit semaphores after Walker")
+DECLARE_DEBUG_VARIABLE(int32_t, UseCrossAtomicSynchronization, -1, "-1: default (enabled), 1: Cross Tile Atomic Synchronization present 0: Cross tile atomic synchronization disabled")
+DECLARE_DEBUG_VARIABLE(int32_t, UseAtomicsForNativeSectionCleanup, -1, "-1: default (disabled), 0: use store data op, 1: use atomic op")
+DECLARE_DEBUG_VARIABLE(int32_t, ProgramNativeCleanup, -1, "-1: default (API dependent), 0: Do not program native cleanup, 1: program native cleanup")
+DECLARE_DEBUG_VARIABLE(int32_t, WparidRegisterProgramming, -1, "-1: default (enabled), 0: do not program wparid register, 1: programing wparid register")
+DECLARE_DEBUG_VARIABLE(int32_t, UsePipeControlAfterPartitionedWalker, -1, "-1: default (enabled), 0: do not add PipeControl, 1: add PipeControl")
+
 /*FEATURE FLAGS*/
+DECLARE_DEBUG_VARIABLE(bool, USMEvictAfterMigration, true, "Evict USM allocation after implicit migration to GPU")
 DECLARE_DEBUG_VARIABLE(bool, EnableNV12, true, "Enables NV12 extension")
 DECLARE_DEBUG_VARIABLE(bool, EnablePackedYuv, true, "Enables cl_packed_yuv extension")
 DECLARE_DEBUG_VARIABLE(bool, EnableDeferredDeleter, true, "Enables async deleter")

@@ -305,8 +309,9 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicPipelineSelect, -1, "set SYSTOLI
 DECLARE_DEBUG_VARIABLE(int32_t, OverrideSystolicInComputeWalker, -1, "set SYSTOLIC MODE ENABLE in COMPUTE_WALKER cmd, -1:default, 0:disable, 1:enable")
 
 /*EXPERIMENTAL TOGGLES*/
+DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionCount, 0, "Experimental implementation: Set number of COMPUTE_WALKERs for a given Partition Type, 0 - do not set the feature.")
+DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalSetWalkerPartitionType, -1, "Experimental implementation: Set COMPUTE_WALKER Partition Type. Valid values for types from 1 to 3")
 DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalEnableCustomLocalMemoryAlignment, 0, "Align local memory allocations to a given value. Works only with allocations at least as big as the value. 0: no effect, 2097152: 2 megabytes, 1073741824: 1 gigabyte")
-DECLARE_DEBUG_VARIABLE(int32_t, ExperimentalUseAtomicsForNativeSectionCleanup, -1, "-1: default (disabled), 0: use store data op, 1: use atomic op")
 
 /*DRIVER TOGGLES*/
 DECLARE_DEBUG_VARIABLE(int32_t, ForceOCLVersion, 0, "Force specific OpenCL API version")

@@ -1026,7 +1026,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
uint32_t expectedPartitionSize = (dims[0] + partitionCount - 1u) / partitionCount;
EXPECT_EQ(expectedPartitionSize, partitionWalkerCmd->getPartitionSize());

auto cleanupSectionOffset = WalkerPartition::computeControlSectionOffset<FamilyType>(partitionCount, false, true, false);
WalkerPartition::WalkerPartitionArgs args = {};
args.initializeWparidRegister = true;
args.usePipeControlStall = true;
args.partitionCount = partitionCount;
args.nativeCrossTileAtomicSync = true;

auto cleanupSectionOffset = WalkerPartition::computeControlSectionOffset<FamilyType>(args);
uint64_t expectedCleanupGpuVa = cmdContainer->getCommandStream()->getGraphicsAllocation()->getGpuAddress() +
                                cleanupSectionOffset;
constexpr uint32_t expectedData = 0ull;
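
This hunk shows the refactor at the heart of the commit: the positional boolean parameters of computeControlSectionOffset are folded into a single value-initialized WalkerPartitionArgs struct. A sketch of the pattern (the struct below is a simplified stand-in, not the full NEO definition):

#include <cstdint>

// Before: computeControlSectionOffset(partitionCount, false, true, false)
// forces every reader to remember what each positional bool means.
// After: each knob is named at the call site, and new fields can be added
// without touching every existing caller.
struct WalkerPartitionArgsSketch {
    std::uint32_t partitionCount = 0;
    std::uint32_t tileCount = 0;
    bool initializeWparidRegister = false;
    bool usePipeControlStall = false;
    bool nativeCrossTileAtomicSync = false;
};
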
@@ -51,11 +51,85 @@ TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingAtomicsForNativeCle
}

TEST_F(ImplicitScalingTests, givenForceNotUseAtomicsWhenCheckingAtomicsForNativeCleanupThenExpectFalse) {
DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.set(0);
DebugManager.flags.UseAtomicsForNativeSectionCleanup.set(0);
EXPECT_FALSE(ImplicitScalingHelper::useAtomicsForNativeCleanup());
}

TEST_F(ImplicitScalingTests, givenForceUseAtomicsWhenCheckingAtomicsForNativeCleanupThenExpectTrue) {
DebugManager.flags.ExperimentalUseAtomicsForNativeSectionCleanup.set(1);
DebugManager.flags.UseAtomicsForNativeSectionCleanup.set(1);
EXPECT_TRUE(ImplicitScalingHelper::useAtomicsForNativeCleanup());
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsIsFalseWhenCheckingProgramNativeCleanupThenExpectFalse) {
EXPECT_FALSE(ImplicitScalingHelper::programNativeCleanup(false));
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsIsTrueWhenCheckingProgramNativeCleanupThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::programNativeCleanup(true));
}

TEST_F(ImplicitScalingTests, givenForceNotProgramNativeCleanupWhenDefaultNativeCleanupIsTrueThenExpectFalse) {
DebugManager.flags.ProgramNativeCleanup.set(0);
EXPECT_FALSE(ImplicitScalingHelper::programNativeCleanup(true));
}

TEST_F(ImplicitScalingTests, givenForceProgramNativeCleanupWhenDefaultNativeCleanupIsFalseThenExpectTrue) {
DebugManager.flags.ProgramNativeCleanup.set(1);
EXPECT_TRUE(ImplicitScalingHelper::programNativeCleanup(false));
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingToProgramWparidRegisterThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::initWparidRegister());
}

TEST_F(ImplicitScalingTests, givenForceNotProgramWparidRegisterWhenCheckingRegisterProgramThenExpectFalse) {
DebugManager.flags.WparidRegisterProgramming.set(0);
EXPECT_FALSE(ImplicitScalingHelper::initWparidRegister());
}

TEST_F(ImplicitScalingTests, givenForceProgramWparidRegisterWhenCheckingRegisterProgramThenExpectTrue) {
DebugManager.flags.WparidRegisterProgramming.set(1);
EXPECT_TRUE(ImplicitScalingHelper::initWparidRegister());
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingToUsePipeControlThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::usePipeControl());
}

TEST_F(ImplicitScalingTests, givenForceNotUsePipeControlWhenCheckingPipeControlUseThenExpectFalse) {
DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);
EXPECT_FALSE(ImplicitScalingHelper::usePipeControl());
}

TEST_F(ImplicitScalingTests, givenForceUsePipeControlWhenCheckingPipeControlUseThenExpectTrue) {
DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(1);
EXPECT_TRUE(ImplicitScalingHelper::usePipeControl());
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingSemaphoreUseThenExpectFalse) {
EXPECT_FALSE(ImplicitScalingHelper::isSemaphoreProgrammingRequired());
}

TEST_F(ImplicitScalingTests, givenForceSemaphoreNotUseWhenCheckingSemaphoreUseThenExpectFalse) {
DebugManager.flags.SynchronizeWithSemaphores.set(0);
EXPECT_FALSE(ImplicitScalingHelper::isSemaphoreProgrammingRequired());
}

TEST_F(ImplicitScalingTests, givenForceSemaphoreUseWhenCheckingSemaphoreUseThenExpectTrue) {
DebugManager.flags.SynchronizeWithSemaphores.set(1);
EXPECT_TRUE(ImplicitScalingHelper::isSemaphoreProgrammingRequired());
}

TEST_F(ImplicitScalingTests, givenDefaultSettingsWhenCheckingCrossTileAtomicSyncThenExpectTrue) {
EXPECT_TRUE(ImplicitScalingHelper::isCrossTileAtomicRequired());
}

TEST_F(ImplicitScalingTests, givenForceDisableWhenCheckingCrossTileAtomicSyncThenExpectFalse) {
DebugManager.flags.UseCrossAtomicSynchronization.set(0);
EXPECT_FALSE(ImplicitScalingHelper::isCrossTileAtomicRequired());
}

TEST_F(ImplicitScalingTests, givenForceEnableWhenCheckingCrossTileAtomicSyncThenExpectTrue) {
DebugManager.flags.UseCrossAtomicSynchronization.set(1);
EXPECT_TRUE(ImplicitScalingHelper::isCrossTileAtomicRequired());
}
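
Taken together, these helpers are the single point where the debug flags above are reconciled with the per-API defaults, so dispatch code can fill the args struct from them. A plausible wiring (illustrative only; the exact WalkerPartitionArgs field set may differ from this sketch):

// Sketch: populate the args struct from the helper queries exercised above.
WalkerPartition::WalkerPartitionArgs args = {};
args.initializeWparidRegister = ImplicitScalingHelper::initWparidRegister();
args.usePipeControlStall = ImplicitScalingHelper::usePipeControl();
args.semaphoreProgrammingRequired = ImplicitScalingHelper::isSemaphoreProgrammingRequired(); // assumed field name
args.crossTileAtomicSynchronization = ImplicitScalingHelper::isCrossTileAtomicRequired();
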
@@ -232,3 +232,111 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPrefer
auto itorLrm = find<MI_LOAD_REGISTER_MEM *>(loadRegisterMemList.begin(), loadRegisterMemList.end());
ASSERT_NE(itorLrm, loadRegisterMemList.end());
}

HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPreferredWhenForceDisabledWparidRegisterThenExpectNoCommandFound) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;

DebugManager.flags.WparidRegisterProgramming.set(0);

uint64_t workPartitionAllocationAddress = 0x987654;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);

WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
walker.setThreadGroupIdXDimension(1);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);

size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;

expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));

uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);

HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);

GenCmdList loadRegisterMemList = hwParser.getCommandsList<MI_LOAD_REGISTER_MEM>();
auto itorLrm = find<MI_LOAD_REGISTER_MEM *>(loadRegisterMemList.begin(), loadRegisterMemList.end());
EXPECT_EQ(itorLrm, loadRegisterMemList.end());
}

HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenStaticPartitioningPreferredWhenForceDisabledPipeControlThenExpectNoCommandFound) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);

uint64_t workPartitionAllocationAddress = 0x987654;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);

WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
walker.setThreadGroupIdXDimension(1);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);

size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;

expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, true, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));

uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);

HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);

GenCmdList pipeControlList = hwParser.getCommandsList<PIPE_CONTROL>();
auto itorPipeControl = find<PIPE_CONTROL *>(pipeControlList.begin(), pipeControlList.end());
EXPECT_EQ(itorPipeControl, pipeControlList.end());
}

HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, GivenDynamicPartitioningPreferredWhenForceDisabledPipeControlThenExpectNoCommandFound) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
using POSTSYNC_DATA = typename FamilyType::POSTSYNC_DATA;
using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

DebugManager.flags.UsePipeControlAfterPartitionedWalker.set(0);

uint64_t workPartitionAllocationAddress = 0x0;
uint64_t postSyncAddress = (1ull << 48) | (1ull << 24);

WALKER_TYPE walker = FamilyType::cmdInitGpgpuWalker;
walker.setThreadGroupIdXDimension(32);
auto &postSync = walker.getPostSync();
postSync.setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP);
postSync.setDestinationAddress(postSyncAddress);

size_t expectedSize = 0;
size_t totalBytesProgrammed = 0;

expectedSize = ImplicitScalingDispatch<FamilyType>::getSize(false, false, twoTile, Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1));

uint32_t partitionCount = 0;
ImplicitScalingDispatch<FamilyType>::dispatchCommands(commandStream, walker, twoTile, partitionCount, true, false, false, workPartitionAllocationAddress);
totalBytesProgrammed = commandStream.getUsed();
EXPECT_EQ(expectedSize, totalBytesProgrammed);
EXPECT_EQ(twoTile.count(), partitionCount);

HardwareParse hwParser;
hwParser.parseCommands<FamilyType>(commandStream, 0);

GenCmdList pipeControlList = hwParser.getCommandsList<PIPE_CONTROL>();
auto itorPipeControl = find<PIPE_CONTROL *>(pipeControlList.begin(), pipeControlList.end());
EXPECT_EQ(itorPipeControl, pipeControlList.end());
}