Add implicit scaling capability to L0 barriers

Related-To: NEO-6262

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2021-11-04 16:54:21 +00:00
committed by Compute-Runtime-Automation
parent 870b324d72
commit 49d4b8f1d8
7 changed files with 244 additions and 54 deletions

View File

@@ -13,6 +13,7 @@
#include "level_zero/core/source/cmdlist/cmdlist_imp.h" #include "level_zero/core/source/cmdlist/cmdlist_imp.h"
#include "igfxfmid.h" #include "igfxfmid.h"
#include "pipe_control_args.h"
namespace NEO { namespace NEO {
enum class ImageType; enum class ImageType;
@@ -240,6 +241,8 @@ struct CommandListCoreFamily : CommandListImp {
void appendSignalEventPostWalker(ze_event_handle_t hEvent); void appendSignalEventPostWalker(ze_event_handle_t hEvent);
void programStateBaseAddress(NEO::CommandContainer &container, bool genericMediaStateClearRequired); void programStateBaseAddress(NEO::CommandContainer &container, bool genericMediaStateClearRequired);
void programThreadArbitrationPolicy(Device *device); void programThreadArbitrationPolicy(Device *device);
// Emits the barrier command(s) for the non-copy path of appendBarrier when no signal event is given.
void appendComputeBarrierCommand();
// Builds the PipeControlArgs that appendComputeBarrierCommand passes to the barrier encoding helpers.
NEO::PipeControlArgs createBarrierFlags();
uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region); uint64_t getInputBufferSize(NEO::ImageType imageType, uint64_t bytesPerPixel, const ze_image_region_t *region);
MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize); MOCKABLE_VIRTUAL AlignedAllocationData getAlignedAllocation(Device *device, const void *buffer, uint64_t bufferSize);

View File

@@ -2288,4 +2288,32 @@ void CommandListCoreFamily<gfxCoreFamily>::programStateBaseAddress(NEO::CommandC
template <GFXCORE_FAMILY gfxCoreFamily> template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, bool maskLsb, uint32_t mask) {} void CommandListCoreFamily<gfxCoreFamily>::adjustWriteKernelTimestamp(uint64_t globalAddress, uint64_t contextAddress, bool maskLsb, uint32_t mask) {}
// Appends a barrier to this command list.
//
// Wait events are added first; a failing result from that step is returned
// unchanged. The profiling hook for hSignalEvent is then invoked. With a signal
// event the barrier is completed through appendSignalEventPostWalker. Without
// one, a copy-only list receives a MiFlushDw command and any other list
// receives the compute barrier command.
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_t hSignalEvent,
                                                                uint32_t numWaitEvents,
                                                                ze_event_handle_t *phWaitEvents) {
    const ze_result_t waitStatus = addEventsToCmdList(numWaitEvents, phWaitEvents);
    if (waitStatus != ZE_RESULT_SUCCESS) {
        return waitStatus;
    }

    appendEventForProfiling(hSignalEvent, true);

    // Signal event supplied: the event signalling path handles the barrier.
    if (hSignalEvent) {
        appendSignalEventPostWalker(hSignalEvent);
        return ZE_RESULT_SUCCESS;
    }

    // Compute-capable list: delegate to the family-specific barrier command.
    if (!isCopyOnly()) {
        appendComputeBarrierCommand();
        return ZE_RESULT_SUCCESS;
    }

    // Copy-only list: reserve stream space, then encode the flush.
    const size_t flushCmdSize = NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite();
    increaseCommandStreamSpace(flushCmdSize);
    NEO::MiFlushArgs flushArgs;
    NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, flushArgs);
    return ZE_RESULT_SUCCESS;
}
} // namespace L0 } // namespace L0

View File

@@ -33,32 +33,6 @@ size_t CommandListCoreFamily<gfxCoreFamily>::getReserveSshSize() {
return helper.getRenderSurfaceStateSize(); return helper.getRenderSurfaceStateSize();
} }
// Appends a barrier to this command list: adds the wait events, invokes the
// profiling hook for hSignalEvent, then either signals the event or encodes a
// flush (copy-only list) / pipe control (otherwise).
// Returns the result of addEventsToCmdList when it is non-zero, else ZE_RESULT_SUCCESS.
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents) {
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
// Any non-zero result from adding the wait events is propagated unchanged.
if (ret) {
return ret;
}
appendEventForProfiling(hSignalEvent, true);
if (!hSignalEvent) {
// NOTE(review): no command stream space is reserved in this block before encoding — confirm callers guarantee room.
if (isCopyOnly()) {
// Copy-only list: encode MiFlushDw with default MiFlushArgs.
NEO::MiFlushArgs args;
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args);
} else {
// Otherwise: encode a pipe control with default PipeControlArgs.
NEO::PipeControlArgs args;
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
}
} else {
// A signal event was supplied: the event signalling path is used instead of a bare barrier command.
appendSignalEventPostWalker(hSignalEvent);
}
return ZE_RESULT_SUCCESS;
}
template <GFXCORE_FAMILY gfxCoreFamily> template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(ze_kernel_handle_t hKernel, ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(ze_kernel_handle_t hKernel,
const ze_group_count_t *pThreadGroupDimensions, const ze_group_count_t *pThreadGroupDimensions,
@@ -203,4 +177,19 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
template <GFXCORE_FAMILY gfxCoreFamily> template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionPrologue(uint32_t partitionDataSize) {} void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionPrologue(uint32_t partitionDataSize) {}
// Encodes the compute barrier as a single pipe control, reserving command
// stream space for it before encoding.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendComputeBarrierCommand() {
    NEO::PipeControlArgs barrierArgs = createBarrierFlags();

    const size_t pipeControlSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl();
    increaseCommandStreamSpace(pipeControlSize);

    // Fetch the stream only after the space reservation above.
    auto *cmdStream = commandContainer.getCommandStream();
    NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*cmdStream, barrierArgs);
}
// Returns the pipe control arguments used for a barrier. This variant sets no
// fields and hands back a default-constructed PipeControlArgs.
template <GFXCORE_FAMILY gfxCoreFamily>
NEO::PipeControlArgs CommandListCoreFamily<gfxCoreFamily>::createBarrierFlags() {
    NEO::PipeControlArgs defaultBarrierArgs;
    return defaultBarrierArgs;
}
} // namespace L0 } // namespace L0

View File

@@ -36,33 +36,6 @@ size_t CommandListCoreFamily<gfxCoreFamily>::getReserveSshSize() {
return 4 * MemoryConstants::pageSize; return 4 * MemoryConstants::pageSize;
} }
// Appends a barrier to this command list: adds the wait events, invokes the
// profiling hook for hSignalEvent, then either signals the event or encodes a
// flush (copy-only list) / pipe control with hdcPipelineFlush set (otherwise).
// Returns the result of addEventsToCmdList when it is non-zero, else ZE_RESULT_SUCCESS.
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_t hSignalEvent,
uint32_t numWaitEvents,
ze_event_handle_t *phWaitEvents) {
ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
// Any non-zero result from adding the wait events is propagated unchanged.
if (ret) {
return ret;
}
appendEventForProfiling(hSignalEvent, true);
if (!hSignalEvent) {
// NOTE(review): no command stream space is reserved in this block before encoding — confirm callers guarantee room.
if (isCopyOnly()) {
// Copy-only list: encode MiFlushDw with default MiFlushArgs.
NEO::MiFlushArgs args;
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args);
} else {
// Otherwise: encode a pipe control that requests an HDC pipeline flush.
NEO::PipeControlArgs args;
args.hdcPipelineFlush = true;
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
}
} else {
// A signal event was supplied: the event signalling path is used instead of a bare barrier command.
appendSignalEventPostWalker(hSignalEvent);
}
return ZE_RESULT_SUCCESS;
}
template <GFXCORE_FAMILY gfxCoreFamily> template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::applyMemoryRangesBarrier(uint32_t numRanges, void CommandListCoreFamily<gfxCoreFamily>::applyMemoryRangesBarrier(uint32_t numRanges,
const size_t *pRangeSizes, const size_t *pRangeSizes,
@@ -345,4 +318,30 @@ void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionPrologue(uint32_t
true); true);
} }
// Encodes the compute barrier. When partitionCount is at most 1 a single pipe
// control is emitted; when it is greater than 1 the barrier is dispatched
// through ImplicitScalingDispatch. Command stream space is reserved before
// encoding in both cases.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendComputeBarrierCommand() {
    NEO::PipeControlArgs barrierArgs = createBarrierFlags();
    const bool multiPartition = this->partitionCount > 1;

    if (!multiPartition) {
        const size_t pipeControlSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl();
        increaseCommandStreamSpace(pipeControlSize);
        NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), barrierArgs);
        return;
    }

    // Multi-partition path: size query and dispatch both use the same `true` flag values as before.
    const size_t scaledBarrierSize = NEO::ImplicitScalingDispatch<GfxFamily>::getBarrierSize(true);
    increaseCommandStreamSpace(scaledBarrierSize);
    NEO::ImplicitScalingDispatch<GfxFamily>::dispatchBarrierCommands(*commandContainer.getCommandStream(),
                                                                     device->getNEODevice()->getDeviceBitfield(),
                                                                     barrierArgs,
                                                                     true,
                                                                     true);
}
// Returns the pipe control arguments used for a barrier. This variant requests
// an HDC pipeline flush and leaves every other field at its default.
template <GFXCORE_FAMILY gfxCoreFamily>
NEO::PipeControlArgs CommandListCoreFamily<gfxCoreFamily>::createBarrierFlags() {
    NEO::PipeControlArgs hdcFlushArgs;
    hdcFlushArgs.hdcPipelineFlush = true;
    return hdcFlushArgs;
}
} // namespace L0 } // namespace L0

View File

@@ -44,5 +44,35 @@ class CommandListFixture : public DeviceFixture {
std::unique_ptr<Event> event; std::unique_ptr<Event> event;
}; };
struct MultiTileCommandListFixture : public SingleRootMultiSubDeviceFixture {
void SetUp() {
SingleRootMultiSubDeviceFixture::SetUp();
ze_result_t returnValue;
commandList.reset(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)));
commandList->partitionCount = 2;
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
eventPoolDesc.count = 2;
ze_event_desc_t eventDesc = {};
eventDesc.index = 0;
eventDesc.wait = 0;
eventDesc.signal = 0;
eventPool = std::unique_ptr<EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc));
event = std::unique_ptr<Event>(Event::create<uint32_t>(eventPool.get(), &eventDesc, device));
}
void TearDown() {
SingleRootMultiSubDeviceFixture::TearDown();
}
std::unique_ptr<L0::ult::CommandList> commandList;
std::unique_ptr<EventPool> eventPool;
std::unique_ptr<Event> event;
};
} // namespace ult } // namespace ult
} // namespace L0 } // namespace L0

View File

@@ -51,6 +51,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
using BaseClass::hostPtrMap; using BaseClass::hostPtrMap;
using BaseClass::indirectAllocationsAllowed; using BaseClass::indirectAllocationsAllowed;
using BaseClass::initialize; using BaseClass::initialize;
using BaseClass::partitionCount;
using BaseClass::patternAllocations; using BaseClass::patternAllocations;
using BaseClass::requiredStreamState; using BaseClass::requiredStreamState;
using BaseClass::unifiedMemoryControls; using BaseClass::unifiedMemoryControls;
@@ -70,6 +71,7 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
using BaseClass::clearCommandsToPatch; using BaseClass::clearCommandsToPatch;
using BaseClass::commandsToPatch; using BaseClass::commandsToPatch;
using BaseClass::finalStreamState; using BaseClass::finalStreamState;
using BaseClass::partitionCount;
using BaseClass::requiredStreamState; using BaseClass::requiredStreamState;
WhiteBox() : BaseClass(BaseClass::defaultNumIddsPerBlock) {} WhiteBox() : BaseClass(BaseClass::defaultNumIddsPerBlock) {}
@@ -82,6 +84,7 @@ struct WhiteBox<::L0::CommandList> : public ::L0::CommandListImp {
using BaseClass::commandContainer; using BaseClass::commandContainer;
using BaseClass::commandListPreemptionMode; using BaseClass::commandListPreemptionMode;
using BaseClass::initialize; using BaseClass::initialize;
using BaseClass::partitionCount;
WhiteBox(Device *device); WhiteBox(Device *device);
~WhiteBox() override; ~WhiteBox() override;

View File

@@ -7,6 +7,7 @@
#include "shared/source/command_container/command_encoder.h" #include "shared/source/command_container/command_encoder.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "test.h" #include "test.h"
@@ -79,5 +80,142 @@ HWTEST_F(CommandListAppendBarrier, GivenEventVsNoEventWhenAppendingBarrierThenCo
ASSERT_LE(sizeWithoutEvent, sizeWithEvent); ASSERT_LE(sizeWithoutEvent, sizeWithEvent);
} }
// Test suite alias: runs the tests below on top of MultiTileCommandListFixture.
using MultiTileCommandListAppendBarrier = Test<MultiTileCommandListFixture>;
// Checks the exact command sequence written by appendBarrier(nullptr, 0, nullptr)
// on the fixture's command list. The expected layout, in stream order, is:
//   MI_STORE_DATA_IMM, PIPE_CONTROL, MI_ATOMIC, MI_SEMAPHORE_WAIT, MI_BATCH_BUFFER_START,
//   two uint32_t control fields (cross-tile sync, final sync),
//   MI_ATOMIC, MI_SEMAPHORE_WAIT, MI_STORE_DATA_IMM, MI_ATOMIC, MI_SEMAPHORE_WAIT.
HWTEST2_F(MultiTileCommandListAppendBarrier, WhenAppendingBarrierThenPipeControlIsGenerated, IsWithinXeGfxFamily) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
// Byte offset of the two control fields: everything emitted before them.
size_t beforeControlSectionOffset = sizeof(MI_STORE_DATA_IMM) +
sizeof(PIPE_CONTROL) +
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
sizeof(MI_BATCH_BUFFER_START);
// Byte offset of the first command after the two uint32_t control fields.
size_t startOffset = beforeControlSectionOffset +
(2 * sizeof(uint32_t));
// Total bytes the barrier is expected to consume in the command stream.
size_t expectedUseBuffer = startOffset +
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
sizeof(MI_STORE_DATA_IMM) +
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT);
// GPU addresses are derived from the stream's allocation address plus what was already used.
auto usedSpaceBefore = commandList->commandContainer.getCommandStream()->getUsed();
auto gpuBaseAddress = commandList->commandContainer.getCommandStream()->getGraphicsAllocation()->getGpuAddress() +
usedSpaceBefore;
auto gpuCrossTileSyncAddress = gpuBaseAddress +
beforeControlSectionOffset;
auto gpuFinalSyncAddress = gpuCrossTileSyncAddress +
sizeof(uint32_t);
auto gpuStartAddress = gpuBaseAddress +
startOffset;
auto result = commandList->appendBarrier(nullptr, 0, nullptr);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
auto usedSpaceAfter = commandList->commandContainer.getCommandStream()->getUsed();
// Guard the unsigned subtraction below.
ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
size_t usedBuffer = usedSpaceAfter - usedSpaceBefore;
EXPECT_EQ(expectedUseBuffer, usedBuffer);
// Walk the emitted bytes command by command; parsedOffset tracks the cursor.
void *cmdBuffer = ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), usedSpaceBefore);
size_t parsedOffset = 0;
{
// 1) Store of 0 to the final sync field.
auto storeDataImm = genCmdCast<MI_STORE_DATA_IMM *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, storeDataImm);
EXPECT_EQ(gpuFinalSyncAddress, storeDataImm->getAddress());
EXPECT_EQ(0u, storeDataImm->getDataDword0());
parsedOffset += sizeof(MI_STORE_DATA_IMM);
}
{
// 2) The barrier pipe control: CS stall enabled, DC flush disabled.
auto pipeControl = genCmdCast<PIPE_CONTROL *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, pipeControl);
EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
EXPECT_FALSE(pipeControl->getDcFlushEnable());
parsedOffset += sizeof(PIPE_CONTROL);
}
{
// 3) Atomic 4-byte increment of the cross-tile sync field.
auto miAtomic = genCmdCast<MI_ATOMIC *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miAtomic);
auto miAtomicProgrammedAddress = NEO::UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
EXPECT_EQ(gpuCrossTileSyncAddress, miAtomicProgrammedAddress);
EXPECT_FALSE(miAtomic->getReturnDataControl());
EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
parsedOffset += sizeof(MI_ATOMIC);
}
{
// 4) Semaphore wait on the cross-tile sync field until it is >= 2
//    (presumably the fixture's partitionCount of 2 — confirm).
auto miSemaphore = genCmdCast<MI_SEMAPHORE_WAIT *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miSemaphore);
EXPECT_EQ(gpuCrossTileSyncAddress, miSemaphore->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphore->getCompareOperation());
EXPECT_EQ(2u, miSemaphore->getSemaphoreDataDword());
parsedOffset += sizeof(MI_SEMAPHORE_WAIT);
}
{
// 5) Second-level batch buffer start that targets the first command after the control fields.
auto bbStart = genCmdCast<MI_BATCH_BUFFER_START *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, bbStart);
EXPECT_EQ(gpuStartAddress, bbStart->getBatchBufferStartAddress());
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
parsedOffset += sizeof(MI_BATCH_BUFFER_START);
}
{
// 6) The two in-stream uint32_t control fields, both expected to be zero as written by the CPU.
auto crossField = reinterpret_cast<uint32_t *>(ptrOffset(cmdBuffer, parsedOffset));
EXPECT_EQ(0u, *crossField);
parsedOffset += sizeof(uint32_t);
auto finalField = reinterpret_cast<uint32_t *>(ptrOffset(cmdBuffer, parsedOffset));
EXPECT_EQ(0u, *finalField);
parsedOffset += sizeof(uint32_t);
}
{
// 7) Atomic 4-byte increment of the final sync field.
auto miAtomic = genCmdCast<MI_ATOMIC *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miAtomic);
auto miAtomicProgrammedAddress = NEO::UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
EXPECT_EQ(gpuFinalSyncAddress, miAtomicProgrammedAddress);
EXPECT_FALSE(miAtomic->getReturnDataControl());
EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
parsedOffset += sizeof(MI_ATOMIC);
}
{
// 8) Semaphore wait on the final sync field until it is >= 2.
auto miSemaphore = genCmdCast<MI_SEMAPHORE_WAIT *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miSemaphore);
EXPECT_EQ(gpuFinalSyncAddress, miSemaphore->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphore->getCompareOperation());
EXPECT_EQ(2u, miSemaphore->getSemaphoreDataDword());
parsedOffset += sizeof(MI_SEMAPHORE_WAIT);
}
{
// 9) Store of 0 to the cross-tile sync field.
auto storeDataImm = genCmdCast<MI_STORE_DATA_IMM *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, storeDataImm);
EXPECT_EQ(gpuCrossTileSyncAddress, storeDataImm->getAddress());
EXPECT_EQ(0u, storeDataImm->getDataDword0());
parsedOffset += sizeof(MI_STORE_DATA_IMM);
}
{
// 10) Second atomic 4-byte increment of the final sync field.
auto miAtomic = genCmdCast<MI_ATOMIC *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miAtomic);
auto miAtomicProgrammedAddress = NEO::UnitTestHelper<FamilyType>::getAtomicMemoryAddress(*miAtomic);
EXPECT_EQ(gpuFinalSyncAddress, miAtomicProgrammedAddress);
EXPECT_FALSE(miAtomic->getReturnDataControl());
EXPECT_EQ(MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT, miAtomic->getAtomicOpcode());
parsedOffset += sizeof(MI_ATOMIC);
}
{
// 11) Semaphore wait on the final sync field until it is >= 4
//     (presumably two increments per tile across two tiles — confirm).
auto miSemaphore = genCmdCast<MI_SEMAPHORE_WAIT *>(ptrOffset(cmdBuffer, parsedOffset));
ASSERT_NE(nullptr, miSemaphore);
EXPECT_EQ(gpuFinalSyncAddress, miSemaphore->getSemaphoreGraphicsAddress());
EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, miSemaphore->getCompareOperation());
EXPECT_EQ(4u, miSemaphore->getSemaphoreDataDword());
parsedOffset += sizeof(MI_SEMAPHORE_WAIT);
}
// Every expected byte has been parsed: nothing was skipped and nothing extra was emitted.
EXPECT_EQ(expectedUseBuffer, parsedOffset);
}
} // namespace ult } // namespace ult
} // namespace L0 } // namespace L0