Command container appends BB_END on cmd buffer allocation end

When the linear stream created for a command container does not have enough
space for a command plus BB_END, it programs BB_END and allocates a new command
buffer allocation. In that case, the pointer returned from getSpace refers to
storage in the new command buffer allocation.

Related-To: NEO-5707

Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
This commit is contained in:
Maciej Plewka
2022-01-12 16:57:42 +00:00
committed by Compute-Runtime-Automation
parent 92316c48f2
commit 9d8ce7aace
31 changed files with 262 additions and 306 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2021 Intel Corporation
* Copyright (C) 2020-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -159,7 +159,6 @@ struct CommandListCoreFamily : CommandListImp {
ze_result_t reset() override;
ze_result_t executeCommandListImmediate(bool performMigration) override;
size_t getReserveSshSize();
void increaseCommandStreamSpace(size_t commandSize);
protected:
MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc,

View File

@@ -332,7 +332,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendEventReset(ze_event_hand
if (this->partitionCount > 1) {
estimateSize += estimateBufferSizeMultiTileBarrier(hwInfo);
}
increaseCommandStreamSpace(estimateSize);
for (uint32_t i = 0u; i < packetsToReset; i++) {
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
@@ -896,13 +895,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlit(uintptr_t
commandContainer.addToResidencyContainer(clearColorAllocation);
NEO::BlitPropertiesContainer blitPropertiesContainer{blitProperties};
bool blitterDirectSubmission = true; // assume direct submission enabled, since usually MI_BATCH_BUFFER_START is bigger than MI_BATCH_BUFFER_END
size_t estimatedSize = NEO::BlitCommandsHelper<GfxFamily>::template BlitCommandsHelper<GfxFamily>::estimateBlitCommandsSize(blitPropertiesContainer,
false,
false,
blitterDirectSubmission,
*device->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()]);
increaseCommandStreamSpace(estimatedSize);
NEO::BlitCommandsHelper<GfxFamily>::dispatchBlitCommandsForBufferPerRow(blitProperties, *commandContainer.getCommandStream(), *device->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()]);
@@ -946,13 +938,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(NEO
}
NEO::BlitPropertiesContainer blitPropertiesContainer{blitProperties};
bool blitterDirectSubmission = true; // assume direct submission enabled, since usually MI_BATCH_BUFFER_START is bigger than MI_BATCH_BUFFER_END
size_t estimatedSize = NEO::BlitCommandsHelper<GfxFamily>::template BlitCommandsHelper<GfxFamily>::estimateBlitCommandsSize(blitPropertiesContainer,
false,
false,
blitterDirectSubmission,
*device->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()]);
increaseCommandStreamSpace(estimatedSize);
appendEventForProfiling(hSignalEvent, true);
bool copyRegionPreferred = NEO::BlitCommandsHelper<GfxFamily>::isCopyRegionPreferred(copySizeModified, *device->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()]);
@@ -1684,11 +1669,9 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSignalEventPostWalker(ze_event_
if (isCopyOnly()) {
NEO::MiFlushArgs args;
args.commandWithPostSync = true;
increaseCommandStreamSpace(NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite());
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), baseAddr, Event::STATE_SIGNALED,
args, hwInfo);
} else {
increaseCommandStreamSpace(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo));
NEO::PipeControlArgs args;
args.dcFlushEnable = NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(event->signalScope, hwInfo);
if (this->partitionCount > 1) {
@@ -1839,7 +1822,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
if (isCopyOnly()) {
NEO::MiFlushArgs args;
args.commandWithPostSync = true;
increaseCommandStreamSpace(NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite());
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), ptrOffset(baseAddr, eventSignalOffset),
Event::STATE_SIGNALED, args, hwInfo);
} else {
@@ -1851,7 +1833,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
event->setPacketsInUse(this->partitionCount);
}
if (applyScope || event->isEventTimestampFlagSet()) {
increaseCommandStreamSpace(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo));
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
*commandContainer.getCommandStream(),
POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
@@ -1860,7 +1841,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
hwInfo,
args);
} else {
increaseCommandStreamSpace(NEO::EncodeStoreMemory<GfxFamily>::getStoreDataImmSize());
NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(
*commandContainer.getCommandStream(),
ptrOffset(baseAddr, eventSignalOffset),
@@ -1928,7 +1908,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
estimatedBufferSize += NEO::EncodeSempahore<GfxFamily>::getSizeMiSemaphoreWait();
}
}
increaseCommandStreamSpace(estimatedBufferSize);
if (dcFlushRequired) {
if (isCopyOnly()) {
@@ -2204,17 +2183,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reserveSpace(size_t size, void
return ZE_RESULT_SUCCESS;
}
// Ensures the current command stream can hold `commandSize` bytes plus a trailing
// MI_BATCH_BUFFER_END. When it cannot, the current buffer is terminated with
// MI_BATCH_BUFFER_END and the container switches to the next command buffer.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::increaseCommandStreamSpace(size_t commandSize) {
    using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;

    const size_t requiredSpace = commandSize + sizeof(MI_BATCH_BUFFER_END);
    if (commandContainer.getCommandStream()->getAvailableSpace() >= requiredSpace) {
        // Enough room left in the current buffer - nothing to do.
        return;
    }

    // Close the exhausted buffer, then move on to a fresh allocation.
    auto terminator = commandContainer.getCommandStream()->template getSpaceForCmd<MI_BATCH_BUFFER_END>();
    *terminator = GfxFamily::cmdInitBatchBufferEnd;
    commandContainer.allocateNextCommandBuffer();
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::prepareIndirectParams(const ze_group_count_t *pThreadGroupDimensions) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
@@ -2353,9 +2321,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
const auto &hwInfo = this->device->getHwInfo();
if (!hSignalEvent) {
if (isCopyOnly()) {
size_t estimatedSizeRequired = NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite();
increaseCommandStreamSpace(estimatedSizeRequired);
NEO::MiFlushArgs args;
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args, hwInfo);
} else {

View File

@@ -183,9 +183,6 @@ void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionEpilogue() {}
// Programs a barrier as a single pipe control built from createBarrierFlags().
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendComputeBarrierCommand() {
    // Reserve room for one pipe control before it is programmed.
    increaseCommandStreamSpace(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl());

    NEO::PipeControlArgs barrierArgs = createBarrierFlags();
    NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), barrierArgs);
}

View File

@@ -134,8 +134,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
const auto &hwInfo = this->device->getHwInfo();
if (NEO::DebugManager.flags.ForcePipeControlPriorToWalker.get()) {
increaseCommandStreamSpace(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl());
NEO::PipeControlArgs args;
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
}
@@ -245,8 +243,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
event->setPacketsInUse(partitionCount);
}
if (L3FlushEnable) {
size_t estimatedSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
increaseCommandStreamSpace(estimatedSize);
programEventL3Flush<gfxCoreFamily>(hEvent, this->device, partitionCount, commandContainer);
}
}
@@ -302,16 +298,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
// Dispatches the implicit-scaling offset register programmed with `partitionDataSize`.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionPrologue(uint32_t partitionDataSize) {
    // Reserve room for the offset register command first; this may switch command buffers.
    increaseCommandStreamSpace(NEO::ImplicitScalingDispatch<GfxFamily>::getOffsetRegisterSize());

    auto &stream = *commandContainer.getCommandStream();
    NEO::ImplicitScalingDispatch<GfxFamily>::dispatchOffsetRegister(stream, partitionDataSize);
}
// Dispatches the implicit-scaling offset register programmed with the post-sync offset.
template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionEpilogue() {
    // Reserve room for the offset register command first; this may switch command buffers.
    increaseCommandStreamSpace(NEO::ImplicitScalingDispatch<GfxFamily>::getOffsetRegisterSize());

    auto &stream = *commandContainer.getCommandStream();
    NEO::ImplicitScalingDispatch<GfxFamily>::dispatchOffsetRegister(stream, NEO::ImplicitScalingDispatch<GfxFamily>::getPostSyncOffset());
}
@@ -320,14 +312,9 @@ template <GFXCORE_FAMILY gfxCoreFamily>
void CommandListCoreFamily<gfxCoreFamily>::appendComputeBarrierCommand() {
    // With more than one partition the barrier is emitted through the multi-tile path.
    if (this->partitionCount > 1) {
        auto neoDevice = device->getNEODevice();
        increaseCommandStreamSpace(estimateBufferSizeMultiTileBarrier(neoDevice->getHardwareInfo()));
        appendMultiTileBarrier(*neoDevice);
        return;
    }

    // Otherwise a single pipe control built from the barrier flags is sufficient.
    NEO::PipeControlArgs barrierArgs = createBarrierFlags();
    increaseCommandStreamSpace(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl());
    NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), barrierArgs);
}

View File

@@ -44,9 +44,6 @@ ze_result_t CommandListCoreFamily<IGFX_XE_HPC_CORE>::appendMemoryPrefetch(const
NEO::LinearStream &cmdStream = *commandContainer.getCommandStream();
size_t estimatedSizeRequired = NEO::EncodeMemoryPrefetch<GfxFamily>::getSizeForMemoryPrefetch(size);
increaseCommandStreamSpace(estimatedSizeRequired);
NEO::EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(cmdStream, *gpuAlloc, static_cast<uint32_t>(size), offset, hwInfo);
return ZE_RESULT_SUCCESS;
@@ -56,9 +53,6 @@ template <>
void CommandListCoreFamily<IGFX_XE_HPC_CORE>::applyMemoryRangesBarrier(uint32_t numRanges,
const size_t *pRangeSizes,
const void **pRanges) {
increaseCommandStreamSpace(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl());
NEO::PipeControlArgs args;
args.hdcPipelineFlush = true;
args.unTypedDataPortCacheFlush = true;

View File

@@ -106,10 +106,9 @@ HWTEST_F(CommandListAppendLaunchKernel, givenNotEnoughSpaceInCommandStreamWhenAp
const auto streamCpu = stream->getCpuBase();
Vec3<size_t> groupCount{1, 1, 1};
auto requiredSizeEstimate = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
device->getNEODevice(), {0, 0, 0}, groupCount, false, false, false, kernel.get(), false);
auto sizeLeftInStream = sizeof(MI_BATCH_BUFFER_END);
auto available = stream->getAvailableSpace();
stream->getSpace(available - requiredSizeEstimate + 1);
stream->getSpace(available - sizeLeftInStream);
auto bbEndPosition = stream->getSpace(0);
const uint32_t threadGroupDimensions[3] = {1, 1, 1};
@@ -236,38 +235,6 @@ HWTEST_F(CommandListAppendLaunchKernel, WhenAppendingMultipleTimesThenSshIsNotDe
EXPECT_NE(initialAllocation, reallocatedAllocation);
}
// Checks that the command-stream bytes consumed by appendLaunchKernelWithParams never exceed
// the value reported by EncodeDispatchKernel::estimateEncodeDispatchKernelCmdsSize.
HWTEST2_F(CommandListAppendLaunchKernel, WhenAppendingFunctionThenUsedCmdBufferSizeDoesNotExceedEstimate, IsAtLeastSkl) {
    createKernel();
    ze_group_count_t groupCount{1, 1, 1};
    auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
    ze_result_t ret = commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    ASSERT_EQ(ZE_RESULT_SUCCESS, ret);

    // First launch: all optional flags disabled.
    auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
    auto result = commandList->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, false, false, false);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
    auto estimate = NEO::EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
        device->getNEODevice(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1), false, false, false, kernel.get(), false);
    EXPECT_LE(sizeAfter - sizeBefore, estimate);

    // Second launch: fourth argument of appendLaunchKernelWithParams set to true.
    // NOTE(review): the estimate below is still computed with the same all-false flags as the
    // first launch - confirm this is intended rather than a copy/paste leftover.
    sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
    result = commandList->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, true, false, false);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
    estimate = NEO::EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
        device->getNEODevice(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1), false, false, false, kernel.get(), false);
    // The original repeated this assertion twice verbatim; one check is sufficient.
    EXPECT_LE(sizeAfter - sizeBefore, estimate);
}
HWCMDTEST_F(IGFX_GEN8_CORE, CommandListAppendLaunchKernel, givenEventsWhenAppendingKernelThenPostSyncToEventIsGenerated) {
using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
@@ -689,19 +656,10 @@ HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWithImplicitArgsWhe
auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &alloc);
ASSERT_EQ(result, ZE_RESULT_SUCCESS);
auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
static_cast<ze_group_count_t *>(alloc),
nullptr, 0, nullptr);
EXPECT_EQ(result, ZE_RESULT_SUCCESS);
auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
auto estimate = NEO::EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
device->getNEODevice(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1), false, false, true, &kernel, false);
EXPECT_LE(sizeAfter - sizeBefore, estimate);
auto heap = commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT);
uint64_t pImplicitArgsGPUVA = heap->getGraphicsAllocation()->getGpuAddress() + kernel.getSizeForImplicitArgsPatching() - sizeof(ImplicitArgs);
auto workDimStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
@@ -869,19 +827,11 @@ HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWhenAppendingThenWo
auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &alloc);
ASSERT_EQ(result, ZE_RESULT_SUCCESS);
auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
static_cast<ze_group_count_t *>(alloc),
nullptr, 0, nullptr);
EXPECT_EQ(result, ZE_RESULT_SUCCESS);
auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
auto estimate = NEO::EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
device->getNEODevice(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1), false, false, true, &kernel, false);
EXPECT_LE(sizeAfter - sizeBefore, estimate);
kernel.groupSize[2] = 2;
result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
static_cast<ze_group_count_t *>(alloc),

View File

@@ -1209,20 +1209,19 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, MultiTileCommandListAppendLaunchFunctionXeHpCoreTes
// Verifies, by inspecting the walker command each command list programs, that a regular kernel
// is dispatched with workload partitioning enabled while a cooperative kernel is not.
HWTEST2_F(MultiTileCommandListAppendLaunchFunctionXeHpCoreTest, givenCooperativeKernelWhenAppendingKernelsThenDoNotUseImplicitScaling, IsAtLeastXeHpCore) {
ze_group_count_t groupCount{1, 1, 1};
auto estimateWithNonCooperativeKernel = NEO::EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
device->getNEODevice(), Vec3<size_t>{0, 0, 0}, Vec3<size_t>{1, 1, 1}, false, false, false, kernel.get(), true);
auto estimateWithCooperativeKernel = NEO::EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
device->getNEODevice(), Vec3<size_t>{0, 0, 0}, Vec3<size_t>{1, 1, 1}, false, true, false, kernel.get(), true);
EXPECT_GT(estimateWithNonCooperativeKernel, estimateWithCooperativeKernel);
auto commandListWithNonCooperativeKernel = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
auto result = commandListWithNonCooperativeKernel->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
auto sizeBefore = commandListWithNonCooperativeKernel->commandContainer.getCommandStream()->getUsed();
result = commandListWithNonCooperativeKernel->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, false, false, false);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
auto sizeUsedWithNonCooperativeKernel = commandListWithNonCooperativeKernel->commandContainer.getCommandStream()->getUsed() - sizeBefore;
EXPECT_LE(sizeUsedWithNonCooperativeKernel, estimateWithNonCooperativeKernel);
auto sizeAfter = commandListWithNonCooperativeKernel->commandContainer.getCommandStream()->getUsed();
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList, ptrOffset(commandListWithNonCooperativeKernel->commandContainer.getCommandStream()->getCpuBase(), sizeBefore), sizeAfter - sizeBefore));
auto itorWalker = find<typename FamilyType::WALKER_TYPE *>(cmdList.begin(), cmdList.end());
// Guard the dereference below: the walker must actually be present in the parsed commands.
ASSERT_NE(cmdList.end(), itorWalker);
auto cmd = genCmdCast<typename FamilyType::WALKER_TYPE *>(*itorWalker);
EXPECT_TRUE(cmd->getWorkloadPartitionEnable());
auto commandListWithCooperativeKernel = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
result = commandListWithCooperativeKernel->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -1230,8 +1229,14 @@ HWTEST2_F(MultiTileCommandListAppendLaunchFunctionXeHpCoreTest, givenCooperative
sizeBefore = commandListWithCooperativeKernel->commandContainer.getCommandStream()->getUsed();
result = commandListWithCooperativeKernel->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, false, false, true);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
auto sizeUsedWithCooperativeKernel = commandListWithCooperativeKernel->commandContainer.getCommandStream()->getUsed() - sizeBefore;
EXPECT_LE(sizeUsedWithCooperativeKernel, estimateWithCooperativeKernel);
sizeAfter = commandListWithCooperativeKernel->commandContainer.getCommandStream()->getUsed();
cmdList.clear();
// Parse the COOPERATIVE list's own buffer; the original re-parsed the non-cooperative list here,
// so the cooperative dispatch was never actually inspected.
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList, ptrOffset(commandListWithCooperativeKernel->commandContainer.getCommandStream()->getCpuBase(), sizeBefore), sizeAfter - sizeBefore));
itorWalker = find<typename FamilyType::WALKER_TYPE *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), itorWalker);
cmd = genCmdCast<typename FamilyType::WALKER_TYPE *>(*itorWalker);
// A cooperative kernel must not use implicit scaling, so partitioning is expected to be off.
EXPECT_FALSE(cmd->getWorkloadPartitionEnable());
}
} // namespace ult

View File

@@ -1449,3 +1449,13 @@ HWTEST2_F(HwHelperTest, givenHwInfoConfigWhenCheckingForceNonGpuCoherencyWAThenF
EXPECT_FALSE(hwHelper.forceNonGpuCoherencyWA(true));
EXPECT_FALSE(hwHelper.forceNonGpuCoherencyWA(false));
}
// The helper must report exactly the size of this family's MI_BATCH_BUFFER_END command.
HWTEST_F(HwHelperTest, GivenHwInfoWhenGetBatchBufferEndSizeCalledThenCorrectSizeReturned) {
    const auto &helper = HwHelper::get(renderCoreFamily);
    const auto reportedSize = helper.getBatchBufferEndSize();

    EXPECT_EQ(reportedSize, sizeof(typename FamilyType::MI_BATCH_BUFFER_END));
}
// The helper must hand back the address of this family's cmdInitBatchBufferEnd template command.
HWTEST_F(HwHelperTest, GivenHwInfoWhenGetBatchBufferEndReferenceCalledThenCorrectPtrReturned) {
    const auto &helper = HwHelper::get(renderCoreFamily);
    const auto expectedReference = reinterpret_cast<const void *>(&FamilyType::cmdInitBatchBufferEnd);

    EXPECT_EQ(helper.getBatchBufferEndReference(), expectedReference);
}

View File

@@ -69,8 +69,10 @@ ErrorCode CommandContainer::initialize(Device *device, AllocationsList *reusable
cmdBufferAllocations.push_back(cmdBufferAllocation);
commandStream = std::unique_ptr<LinearStream>(new LinearStream(cmdBufferAllocation->getUnderlyingBuffer(),
defaultListCmdBufferSize));
const auto &hardwareInfo = device->getHardwareInfo();
auto &hwHelper = NEO::HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
commandStream = std::make_unique<LinearStream>(cmdBufferAllocation->getUnderlyingBuffer(),
alignedSize - cmdBufferReservedSize, this, hwHelper.getBatchBufferEndSize());
commandStream->replaceGraphicsAllocation(cmdBufferAllocation);
@@ -264,7 +266,8 @@ void CommandContainer::allocateNextCommandBuffer() {
cmdBufferAllocations.push_back(cmdBufferAllocation);
commandStream->replaceBuffer(cmdBufferAllocation->getUnderlyingBuffer(), defaultListCmdBufferSize);
size_t alignedSize = alignUp<size_t>(totalCmdBufferSize, MemoryConstants::pageSize64k);
commandStream->replaceBuffer(cmdBufferAllocation->getUnderlyingBuffer(), alignedSize - cmdBufferReservedSize);
commandStream->replaceGraphicsAllocation(cmdBufferAllocation);
if (!getFlushTaskUsedForImmediate()) {
@@ -272,6 +275,14 @@ void CommandContainer::allocateNextCommandBuffer() {
}
}
// Writes a batch-buffer-end command at the stream's current position, then switches the
// container to the next command buffer.
void CommandContainer::closeAndAllocateNextCommandBuffer() {
    auto &helper = NEO::HwHelper::get(device->getHardwareInfo().platform.eRenderCoreFamily);
    const auto terminatorSize = helper.getBatchBufferEndSize();

    // getSpace(0u) yields the current write position without consuming stream space.
    auto writePosition = commandStream->getSpace(0u);
    memcpy_s(writePosition, terminatorSize, helper.getBatchBufferEndReference(), terminatorSize);

    allocateNextCommandBuffer();
}
void CommandContainer::prepareBindfulSsh() {
if (ApiSpecificConfig::getBindlessConfiguration()) {
if (allocationIndirectHeaps[IndirectHeap::Type::SURFACE_STATE] == nullptr) {

View File

@@ -36,10 +36,9 @@ enum class ErrorCode {
class CommandContainer : public NonCopyableOrMovableClass {
public:
static constexpr size_t defaultListCmdBufferSize = MemoryConstants::kiloByte * 256;
static constexpr size_t totalCmdBufferSize =
defaultListCmdBufferSize +
MemoryConstants::cacheLineSize +
CSRequirements::csOverfetchSize;
static constexpr size_t cmdBufferReservedSize = MemoryConstants::cacheLineSize +
CSRequirements::csOverfetchSize;
static constexpr size_t totalCmdBufferSize = defaultListCmdBufferSize + cmdBufferReservedSize;
CommandContainer();
@@ -86,6 +85,7 @@ class CommandContainer : public NonCopyableOrMovableClass {
IndirectHeap *getHeapWithRequiredSizeAndAlignment(HeapType heapType, size_t sizeRequired, size_t alignment);
void allocateNextCommandBuffer();
void closeAndAllocateNextCommandBuffer();
void handleCmdBufferAllocations(size_t startIndex);
GraphicsAllocation *obtainNextCommandBufferAllocation();

View File

@@ -7,7 +7,6 @@
#pragma once
#include "shared/source/command_container/cmdcontainer.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/debugger/debugger.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/helpers/definitions/mi_flush_args.h"
@@ -64,10 +63,6 @@ struct EncodeDispatchKernel {
static void *getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset);
static size_t estimateEncodeDispatchKernelCmdsSize(Device *device, const Vec3<size_t> &groupStart, const Vec3<size_t> &groupCount,
bool isInternal, bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface,
bool isPartitioned);
static bool isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
size_t *lws,
std::array<uint8_t, 3> walkOrder,
@@ -116,8 +111,6 @@ struct EncodeStates {
const void *fnDynamicStateHeap,
BindlessHeapsHelper *bindlessHeapHelper,
const HardwareInfo &hwInfo);
static size_t getAdjustStateComputeModeSize();
};
template <typename GfxFamily>
@@ -186,9 +179,6 @@ struct EncodeIndirectParams {
static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, uint64_t crossThreadAddress, const uint32_t *groupSize);
static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws);
static size_t getCmdsSizeForIndirectParams();
static size_t getCmdsSizeForSetGroupSizeIndirect();
static size_t getCmdsSizeForSetGroupCountIndirect();
static size_t getCmdsSizeForSetWorkDimIndirect(const uint32_t *groupSize, bool misalignedPtr);
};

View File

@@ -92,11 +92,6 @@ uint32_t EncodeStates<Family>::copySamplerState(IndirectHeap *dsh,
return samplerStateOffsetInDsh;
} // namespace NEO
// Size, in bytes, accounted for adjusting state compute mode.
// This generic implementation always reports zero.
template <typename Family>
inline size_t EncodeStates<Family>::getAdjustStateComputeModeSize() {
return 0;
}
template <typename Family>
void EncodeMathMMIO<Family>::encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress) {
int logLws = 0;
@@ -665,22 +660,6 @@ void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &c
}
}
// Size, in bytes, of three MI_LOAD_REGISTER_MEM commands.
template <typename Family>
inline size_t EncodeIndirectParams<Family>::getCmdsSizeForIndirectParams() {
    constexpr size_t loadCommandCount = 3;
    return loadCommandCount * sizeof(typename Family::MI_LOAD_REGISTER_MEM);
}
// Size, in bytes, of three MI_STORE_REGISTER_MEM commands.
template <typename Family>
inline size_t EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect() {
    constexpr size_t storeCommandCount = 3;
    return storeCommandCount * sizeof(MI_STORE_REGISTER_MEM);
}
// Size, in bytes, of three repetitions of the sequence: register-to-register load,
// immediate load, an MI_MATH block with its inline ALU instructions, and a register store.
template <typename Family>
inline size_t EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect() {
    constexpr uint32_t mathBlockSize = sizeof(MI_MATH) + sizeof(MI_MATH_ALU_INST_INLINE) * NUM_ALU_INST_FOR_READ_MODIFY_WRITE;
    constexpr size_t singleSequenceSize = sizeof(MI_LOAD_REGISTER_REG) + sizeof(MI_LOAD_REGISTER_IMM) + mathBlockSize + sizeof(MI_STORE_REGISTER_MEM);
    return 3 * singleSequenceSize;
}
template <typename Family>
inline size_t EncodeIndirectParams<Family>::getCmdsSizeForSetWorkDimIndirect(const uint32_t *groupSize, bool misaligedPtr) {
constexpr uint32_t aluCmdSize = sizeof(MI_MATH) + sizeof(MI_MATH_ALU_INST_INLINE) * NUM_ALU_INST_FOR_READ_MODIFY_WRITE;

View File

@@ -66,15 +66,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
if (!args.isIndirect) {
threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
}
size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(args.device, threadStartVec, threadDimsVec,
args.isInternal, args.isCooperative, args.isIndirect,
args.dispatchInterface, false);
if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
*bbEnd = Family::cmdInitBatchBufferEnd;
container.allocateNextCommandBuffer();
}
WALKER_TYPE cmd = Family::cmdInitGpgpuWalker;
auto idd = Family::cmdInitInterfaceDescriptorData;
@@ -343,40 +334,6 @@ inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const Har
template <typename Family>
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}
// Upper-bound estimate, in bytes, of the command-stream space needed to encode one kernel
// dispatch, including the trailing MI_BATCH_BUFFER_END. Indirect dispatches add the work-dim
// commands, and a second set of indirect commands when the kernel has implicit args.
template <typename Family>
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, const Vec3<size_t> &groupStart,
                                                                          const Vec3<size_t> &groupCount, bool isInternal,
                                                                          bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface,
                                                                          bool isPartitioned) {
    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;
    using IndirectParams = EncodeIndirectParams<Family>;

    // Fixed-size part: walker, one media state flush, the flush + descriptor-load pair,
    // and the closing batch buffer end.
    const size_t descriptorLoadSize = sizeof(MEDIA_STATE_FLUSH) + sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD);
    size_t estimate = sizeof(WALKER_TYPE) + sizeof(MEDIA_STATE_FLUSH) + descriptorLoadSize + sizeof(MI_BATCH_BUFFER_END);

    // Device/helper dependent part, always accounted for.
    estimate += PreemptionHelper::getPreemptionWaCsSize<Family>(*device);
    estimate += EncodeStates<Family>::getAdjustStateComputeModeSize();
    estimate += EncodeWA<Family>::getAdditionalPipelineSelectSize(*device);
    estimate += IndirectParams::getCmdsSizeForIndirectParams();
    estimate += IndirectParams::getCmdsSizeForSetGroupCountIndirect();
    estimate += IndirectParams::getCmdsSizeForSetGroupSizeIndirect();

    if (!isIndirect) {
        return estimate;
    }

    // Indirect dispatch needs the dispatch interface to query group size and implicit args.
    UNRECOVERABLE_IF(dispatchInterface == nullptr);
    estimate += IndirectParams::getCmdsSizeForSetWorkDimIndirect(dispatchInterface->getGroupSize(), false);
    if (dispatchInterface->getImplicitArgs()) {
        estimate += IndirectParams::getCmdsSizeForSetGroupCountIndirect();
        estimate += IndirectParams::getCmdsSizeForSetGroupSizeIndirect();
        estimate += IndirectParams::getCmdsSizeForSetWorkDimIndirect(dispatchInterface->getGroupSize(), true);
    }
    return estimate;
}
template <typename Family>
inline void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, StateComputeModeProperties &properties, const HardwareInfo &hwInfo) {
}

View File

@@ -63,15 +63,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
if (!args.isIndirect) {
threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
}
size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(args.device, threadStartVec, threadDimsVec,
args.isInternal, args.isCooperative, args.isIndirect, args.dispatchInterface,
args.partitionCount > 1);
if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
*bbEnd = Family::cmdInitBatchBufferEnd;
container.allocateNextCommandBuffer();
}
bool specialModeRequired = kernelDescriptor.kernelAttributes.flags.usesSpecialPipelineSelectMode;
if (PreambleHelper<Family>::isSpecialPipelineSelectModeChanged(container.lastPipelineSelectModeRequired, specialModeRequired, hwInfo)) {
@@ -448,36 +439,6 @@ void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
}
}
// Upper-bound estimate, in bytes, of the command-stream space needed to encode one kernel
// dispatch. Indirect dispatches add the work-dim commands (and a second set of indirect
// commands when the kernel has implicit args); partitioned, non-cooperative, non-internal
// dispatches add the implicit-scaling commands.
template <typename Family>
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, const Vec3<size_t> &groupStart,
                                                                          const Vec3<size_t> &groupCount, bool isInternal,
                                                                          bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface,
                                                                          bool isPartitioned) {
    using IndirectParams = EncodeIndirectParams<Family>;

    // Part that is always accounted for.
    size_t estimate = sizeof(WALKER_TYPE);
    estimate += PreemptionHelper::getPreemptionWaCsSize<Family>(*device);
    estimate += EncodeStates<Family>::getAdjustStateComputeModeSize();
    estimate += IndirectParams::getCmdsSizeForIndirectParams();
    estimate += IndirectParams::getCmdsSizeForSetGroupCountIndirect();
    estimate += IndirectParams::getCmdsSizeForSetGroupSizeIndirect();

    if (isIndirect) {
        // Indirect dispatch needs the dispatch interface to query group size and implicit args.
        UNRECOVERABLE_IF(dispatchInterface == nullptr);
        estimate += IndirectParams::getCmdsSizeForSetWorkDimIndirect(dispatchInterface->getGroupSize(), false);
        if (dispatchInterface->getImplicitArgs()) {
            estimate += IndirectParams::getCmdsSizeForSetGroupCountIndirect();
            estimate += IndirectParams::getCmdsSizeForSetGroupSizeIndirect();
            estimate += IndirectParams::getCmdsSizeForSetWorkDimIndirect(dispatchInterface->getGroupSize(), true);
        }
    }

    const bool usesImplicitScaling = isPartitioned && !isCooperative && !isInternal;
    if (usesImplicitScaling) {
        const bool staticPartitioning = device->getDefaultEngine().commandStreamReceiver->isStaticWorkPartitioningEnabled();
        estimate += ImplicitScalingDispatch<Family>::getSize(true, staticPartitioning, device->getDeviceBitfield(), groupStart, groupCount);
    }
    return estimate;
}
template <typename Family>
void EncodeStateBaseAddress<Family>::setIohAddressForDebugger(NEO::Debugger::SbaAddresses &sbaAddress, const STATE_BASE_ADDRESS &sbaCmd) {
}

View File

@@ -100,8 +100,10 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
staticPartitioning,
useSecondaryBatchBuffer);
uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed();
void *commandBuffer = commandStream.getSpace(0u);
auto dispatchCommandsSize = getSize(apiSelfCleanup, preferStaticPartitioning, devices, {walkerCmd.getThreadGroupIdStartingX(), walkerCmd.getThreadGroupIdStartingY(), walkerCmd.getThreadGroupIdStartingZ()}, {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()});
void *commandBuffer = commandStream.getSpace(dispatchCommandsSize);
uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed() - dispatchCommandsSize;
if (staticPartitioning) {
UNRECOVERABLE_IF(tileCount != partitionCount);
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandBuffer,
@@ -126,7 +128,7 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
args,
hwInfo);
}
commandStream.getSpace(totalProgrammedSize);
UNRECOVERABLE_IF(totalProgrammedSize != dispatchCommandsSize);
}
template <typename GfxFamily>
@@ -166,8 +168,9 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchBarrierCommands(LinearStream &c
args.postSyncGpuAddress = gpuAddress;
args.postSyncImmediateValue = immediateData;
uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed();
void *commandBuffer = commandStream.getSpace(0u);
auto barrierCommandsSize = getBarrierSize(hwInfo, apiSelfCleanup, args.usePostSync);
void *commandBuffer = commandStream.getSpace(barrierCommandsSize);
uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed() - barrierCommandsSize;
WalkerPartition::constructBarrierCommandBuffer<GfxFamily>(commandBuffer,
cmdBufferGpuAddress,
@@ -175,7 +178,7 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchBarrierCommands(LinearStream &c
args,
flushArgs,
hwInfo);
commandStream.getSpace(totalProgrammedSize);
UNRECOVERABLE_IF(totalProgrammedSize != barrierCommandsSize);
}
template <typename GfxFamily>

View File

@@ -273,7 +273,7 @@ class CommandStreamReceiver {
uint64_t getWorkPartitionAllocationGpuAddress() const;
bool isRcs() const;
MOCKABLE_VIRTUAL bool isRcs() const;
virtual void initializeDefaultsForInternalEngine(){};

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -33,4 +33,10 @@ LinearStream::LinearStream(GraphicsAllocation *gfxAllocation)
LinearStream::LinearStream()
: LinearStream(nullptr) {
}
// Builds a stream over a raw buffer by delegating to LinearStream(buffer, bufferSize),
// then records the command container and the batch-buffer-end size.
// getSpace() consults both values before handing out space.
// The members are assigned in the body because a delegating constructor
// cannot carry additional member initializers.
LinearStream::LinearStream(void *buffer, size_t bufferSize, CommandContainer *cmdContainer, size_t batchBufferEndSize)
: LinearStream(buffer, bufferSize) {
this->cmdContainer = cmdContainer;
this->batchBufferEndSize = batchBufferEndSize;
}
} // namespace NEO

View File

@@ -1,13 +1,16 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/command_container/cmdcontainer.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/helpers/string.h"
#include <atomic>
#include <cstddef>
@@ -23,6 +26,7 @@ class LinearStream {
LinearStream(void *buffer, size_t bufferSize);
LinearStream(GraphicsAllocation *buffer);
LinearStream(GraphicsAllocation *gfxAllocation, void *buffer, size_t bufferSize);
LinearStream(void *buffer, size_t bufferSize, CommandContainer *cmdContainer, size_t batchBufferEndSize);
void *getCpuBase() const;
void *getSpace(size_t size);
size_t getMaxAvailableSpace() const;
@@ -44,6 +48,8 @@ class LinearStream {
size_t maxAvailableSpace;
void *buffer;
GraphicsAllocation *graphicsAllocation;
CommandContainer *cmdContainer = nullptr;
size_t batchBufferEndSize = 0;
};
inline void *LinearStream::getCpuBase() const {
@@ -51,6 +57,10 @@ inline void *LinearStream::getCpuBase() const {
}
inline void *LinearStream::getSpace(size_t size) {
if (cmdContainer != nullptr && getAvailableSpace() < batchBufferEndSize + size) {
UNRECOVERABLE_IF(sizeUsed + batchBufferEndSize > maxAvailableSpace);
cmdContainer->closeAndAllocateNextCommandBuffer();
}
UNRECOVERABLE_IF(sizeUsed + size > maxAvailableSpace);
auto memory = ptrOffset(buffer, sizeUsed);
sizeUsed += size;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2021 Intel Corporation
* Copyright (C) 2020-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -32,11 +32,6 @@ size_t EncodeWA<Family>::getAdditionalPipelineSelectSize(Device &device) {
return size;
}
// Returns the size in bytes of a single STATE_COMPUTE_MODE command for this Family.
template <>
size_t EncodeStates<Family>::getAdjustStateComputeModeSize() {
    using STATE_COMPUTE_MODE = typename Family::STATE_COMPUTE_MODE;
    return sizeof(STATE_COMPUTE_MODE);
}
template <>
void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, StateComputeModeProperties &properties, const HardwareInfo &hwInfo) {
using STATE_COMPUTE_MODE = typename Family::STATE_COMPUTE_MODE;

View File

@@ -9,7 +9,6 @@
#include "shared/source/aub_mem_dump/aub_mem_dump.h"
#include "shared/source/built_ins/sip.h"
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/commands/bxml_generator_glue.h"
#include "shared/source/helpers/aux_translation.h"
#include "shared/source/helpers/definitions/engine_group_types.h"
@@ -28,6 +27,7 @@ namespace NEO {
// Forward declarations used by the declarations that follow.
// The stream class is spelled LinearStream (declared in
// shared/source/command_stream/linear_stream.h); the previous spelling
// "LinearSteram" introduced an unrelated, never-defined type.
class GmmHelper;
class GraphicsAllocation;
class TagAllocatorBase;
class LinearStream;
class Gmm;
struct AllocationData;
struct AllocationProperties;
@@ -155,6 +155,8 @@ class HwHelper {
virtual bool forceNonGpuCoherencyWA(bool requiresCoherency) const = 0;
virtual bool platformSupportsImplicitScaling(const NEO::HardwareInfo &hwInfo) const = 0;
virtual bool isLinuxCompletionFenceSupported() const = 0;
virtual size_t getBatchBufferEndSize() const = 0;
virtual const void *getBatchBufferEndReference() const = 0;
protected:
HwHelper() = default;
@@ -391,6 +393,8 @@ class HwHelperHw : public HwHelper {
bool forceNonGpuCoherencyWA(bool requiresCoherency) const override;
bool platformSupportsImplicitScaling(const NEO::HardwareInfo &hwInfo) const override;
bool isLinuxCompletionFenceSupported() const override;
size_t getBatchBufferEndSize() const override;
const void *getBatchBufferEndReference() const override;
protected:
static const AuxTranslationMode defaultAuxTranslationMode;

View File

@@ -710,4 +710,12 @@ template <typename GfxFamily>
bool HwHelperHw<GfxFamily>::forceNonGpuCoherencyWA(bool requiresCoherency) const {
return requiresCoherency;
}
// Returns the size in bytes of one MI_BATCH_BUFFER_END command of GfxFamily.
template <typename GfxFamily>
size_t HwHelperHw<GfxFamily>::getBatchBufferEndSize() const {
    using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;
    return sizeof(MI_BATCH_BUFFER_END);
}
// Returns an untyped pointer to GfxFamily::cmdInitBatchBufferEnd, so callers
// that do not know the family type can copy or compare it.
template <typename GfxFamily>
const void *HwHelperHw<GfxFamily>::getBatchBufferEndReference() const {
    const void *batchBufferEndTemplate = reinterpret_cast<const void *>(&GfxFamily::cmdInitBatchBufferEnd);
    return batchBufferEndTemplate;
}
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2021 Intel Corporation
* Copyright (C) 2019-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -7,7 +7,6 @@
#include "shared/source/command_stream/stream_properties.h"
#include "shared/source/helpers/flat_batch_buffer_helper.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/preamble_base.inl"
#include "shared/source/kernel/kernel_execution_type.h"

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021 Intel Corporation
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -7,6 +7,7 @@
#pragma once
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/utilities/software_tags.h"

View File

@@ -1,11 +1,14 @@
/*
* Copyright (C) 2021 Intel Corporation
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/helpers/preamble.h"
#include "shared/test/common/mocks/mock_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/test_macros/test.h"
using namespace NEO;
@@ -14,3 +17,30 @@ using Gen12LpCommandEncodeTest = testing::Test;
// Checks that EncodeSurfaceState::doBindingTablePrefetch() reports false for this family.
// NOTE(review): the test name ends with "ThenReturnsTrue" while the assertion is
// EXPECT_FALSE — the name and the check disagree; confirm which one is intended.
GEN12LPTEST_F(Gen12LpCommandEncodeTest, givenGen12LpPlatformWhenDoBindingTablePrefetchIsCalledThenReturnsTrue) {
EXPECT_FALSE(EncodeSurfaceState<FamilyType>::doBindingTablePrefetch());
}
// Test double whose isRcs() answer is fixed at compile time by the `rcs`
// template argument; everything else comes from MockCommandStreamReceiver.
template <bool rcs>
class MyCommandStreamReceiverMock : public MockCommandStreamReceiver {
public:
// Forwards all arguments unchanged to the MockCommandStreamReceiver constructor.
MyCommandStreamReceiverMock(ExecutionEnvironment &executionEnvironment, uint32_t rootDeviceIndex, const DeviceBitfield deviceBitfield) : MockCommandStreamReceiver(executionEnvironment, rootDeviceIndex, deviceBitfield) {}
// Always returns the template argument, regardless of the engine actually in use.
bool isRcs() const override {
return rcs;
}
};
// When the default engine's CSR reports RCS, the additional pipeline-select size
// must equal the size of two PIPELINE_SELECT programmings.
GEN12LPTEST_F(Gen12LpCommandEncodeTest, givenGen12LpPlatformWhenDefaultEngineIsRcsThenAdditionalPipelineSelectSizeEqualTwoPipelineSelectSize) {
    MockDevice device;
    auto rcsCsr = std::make_unique<MyCommandStreamReceiverMock<true>>(*device.getExecutionEnvironment(), 0, device.getDeviceBitfield());

    // Temporarily swap the default engine's CSR for the mock; restored below.
    auto &defaultEngine = device.getDefaultEngine();
    auto originalCsr = defaultEngine.commandStreamReceiver;
    defaultEngine.commandStreamReceiver = rcsCsr.get();

    const auto expectedSize = 2 * PreambleHelper<FamilyType>::getCmdSizeForPipelineSelect(device.getHardwareInfo());
    const auto actualSize = EncodeWA<FamilyType>::getAdditionalPipelineSelectSize(device);
    EXPECT_EQ(expectedSize, actualSize);

    defaultEngine.commandStreamReceiver = originalCsr;
}
// When the default engine's CSR does not report RCS, no additional
// pipeline-select space is expected.
GEN12LPTEST_F(Gen12LpCommandEncodeTest, givenGen12LpPlatformWhenDefaultEngineIsNotRcsThenAdditionalPipelineSelectSizeEqualZero) {
    MockDevice device;
    auto nonRcsCsr = std::make_unique<MyCommandStreamReceiverMock<false>>(*device.getExecutionEnvironment(), 0, device.getDeviceBitfield());

    // Temporarily swap the default engine's CSR for the mock; restored below.
    auto &defaultEngine = device.getDefaultEngine();
    auto originalCsr = defaultEngine.commandStreamReceiver;
    defaultEngine.commandStreamReceiver = nonRcsCsr.get();

    const auto actualSize = EncodeWA<FamilyType>::getAdditionalPipelineSelectSize(device);
    EXPECT_EQ(0u, actualSize);

    defaultEngine.commandStreamReceiver = originalCsr;
}

View File

@@ -103,22 +103,6 @@ GEN12LPTEST_F(CommandEncoderTest, givenVariousEngineTypesWhenEncodeSBAThenAdditi
}
}
// Compares two dispatch-kernel size estimates for the same device: one taken
// before and one after the default engine's OS context is switched to CCS.
// The difference must equal two PIPELINE_SELECT programmings.
GEN12LPTEST_F(CommandEncoderTest, givenVariousEngineTypesWhenEstimateCommandBufferSizeThenRcsHasAdditionalPipelineSelectWASize) {
// NOTE(review): both aliases below are unused in this test body.
using PIPELINE_SELECT = typename FamilyType::PIPELINE_SELECT;
using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE;
// First estimate: taken with the engine type the fixture provides
// (presumably RCS, given the test name — confirm against the fixture).
auto sizeWA = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(pDevice, Vec3<size_t>(0, 0, 0),
Vec3<size_t>(1, 1, 1), false, false, false, nullptr, false);
// Order matters: the engine type must change only after the first estimate.
static_cast<MockOsContext *>(pDevice->getDefaultEngine().osContext)->engineType = aub_stream::ENGINE_CCS;
auto size = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(pDevice, Vec3<size_t>(0, 0, 0),
Vec3<size_t>(1, 1, 1), false, false, false, nullptr, false);
auto expectedDiff = 2 * PreambleHelper<FamilyType>::getCmdSizeForPipelineSelect(pDevice->getHardwareInfo());
auto diff = sizeWA - size;
EXPECT_EQ(expectedDiff, diff);
}
GEN12LPTEST_F(CommandEncoderTest, GivenGen12LpWhenProgrammingL3StateOnThenExpectNoCommandsDispatched) {
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;

View File

@@ -1,11 +1,12 @@
/*
* Copyright (C) 2021 Intel Corporation
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/stream_properties.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/test_macros/test.h"

View File

@@ -1,11 +1,12 @@
/*
* Copyright (C) 2021 Intel Corporation
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/stream_properties.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/test_macros/test.h"

View File

@@ -6,6 +6,7 @@
*/
#include "shared/source/command_container/cmdcontainer.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/memory_manager/allocations_list.h"
#include "shared/test/common/fixtures/device_fixture.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
@@ -475,7 +476,8 @@ TEST_F(CommandContainerTest, whenAllocateNextCmdBufferIsCalledThenNewAllocationI
EXPECT_NE(nullptr, nextBuffer);
EXPECT_EQ(0u, sizeUsed);
EXPECT_NE(initialBuffer, nextBuffer);
const size_t cmdBufSize = CommandContainer::defaultListCmdBufferSize;
size_t alignedSize = alignUp<size_t>(CommandContainer::totalCmdBufferSize, MemoryConstants::pageSize64k);
const size_t cmdBufSize = alignedSize - CommandContainer::cmdBufferReservedSize;
EXPECT_EQ(cmdBufSize, availableSize);
ASSERT_EQ(2u, cmdContainer->getCmdBufferAllocations().size());
@@ -682,3 +684,48 @@ TEST_F(CommandContainerTest, givenContainerAllocatesNextCommandBufferWhenResetin
}
EXPECT_TRUE(firstAllocationFound);
}
// Test-only view of LinearStream that re-exports its cmdContainer member as
// public so tests can read it (presumably it is not public in LinearStream).
// NOTE(review): the linear stream tests define a different class with this same
// name (exposing sizeUsed); if both end up in one binary that is an ODR hazard — confirm.
class MyLinearStreamMock : public LinearStream {
public:
using LinearStream::cmdContainer;
};
// After initialize(), the container's command stream must point back at the container.
TEST_F(CommandContainerTest, givenCmdContainerWhenContainerIsInitializedThenStreamContainsContainerPtr) {
    CommandContainer cmdContainer;
    cmdContainer.initialize(pDevice, nullptr);

    auto exposedStream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());
    EXPECT_EQ(exposedStream->cmdContainer, &cmdContainer);
}
// The initial stream capacity is the 64KB-aligned total command buffer size
// minus the reserved tail.
TEST_F(CommandContainerTest, givenCmdContainerWhenContainerIsInitializedThenStreamSizeEqualAlignedTotalCmdBuffSizeDecreasedOfReservedSize) {
    CommandContainer cmdContainer;
    cmdContainer.initialize(pDevice, nullptr);

    const size_t alignedTotalSize = alignUp<size_t>(CommandContainer::totalCmdBufferSize, MemoryConstants::pageSize64k);
    auto commandStream = cmdContainer.getCommandStream();
    EXPECT_EQ(commandStream->getMaxAvailableSpace(), alignedTotalSize - CommandContainer::cmdBufferReservedSize);
}
// After switching to a newly allocated command buffer, the stream capacity is
// again the 64KB-aligned total size minus the reserved tail.
TEST_F(CommandContainerTest, givenCmdContainerWhenAlocatingNextCmdBufferThenStreamSizeEqualAlignedTotalCmdBuffSizeDecreasedOfReservedSize) {
    CommandContainer cmdContainer;
    cmdContainer.initialize(pDevice, nullptr);
    cmdContainer.allocateNextCommandBuffer();

    const size_t alignedTotalSize = alignUp<size_t>(CommandContainer::totalCmdBufferSize, MemoryConstants::pageSize64k);
    auto commandStream = cmdContainer.getCommandStream();
    EXPECT_EQ(commandStream->getMaxAvailableSpace(), alignedTotalSize - CommandContainer::cmdBufferReservedSize);
}
// closeAndAllocateNextCommandBuffer() must write a batch-buffer-end command at
// the position the old stream had reached.
TEST_F(CommandContainerTest, givenCmdContainerWhenCloseAndAllocateNextCommandBufferCalledThenBBEndPlacedAtEndOfLinearStream) {
    CommandContainer cmdContainer;
    cmdContainer.initialize(pDevice, nullptr);

    auto &hwHelper = HwHelper::get(pDevice->getHardwareInfo().platform.eRenderCoreFamily);

    // getSpace(0) consumes nothing; it only yields the current write position.
    auto writePosition = cmdContainer.getCommandStream()->getSpace(0u);
    cmdContainer.closeAndAllocateNextCommandBuffer();

    const auto comparison = memcmp(writePosition, hwHelper.getBatchBufferEndReference(), hwHelper.getBatchBufferEndSize());
    EXPECT_EQ(comparison, 0);
}
// closeAndAllocateNextCommandBuffer() must add exactly one command buffer
// allocation to the container.
TEST_F(CommandContainerTest, givenCmdContainerWhenCloseAndAllocateNextCommandBufferCalledThenNewCmdBufferAllocationCreated) {
    CommandContainer cmdContainer;
    cmdContainer.initialize(pDevice, nullptr);

    const auto allocationsBefore = cmdContainer.getCmdBufferAllocations().size();
    EXPECT_EQ(allocationsBefore, 1u);

    cmdContainer.closeAndAllocateNextCommandBuffer();

    const auto allocationsAfter = cmdContainer.getCmdBufferAllocations().size();
    EXPECT_EQ(allocationsAfter, 2u);
}

View File

@@ -6,6 +6,7 @@
*/
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/memory_manager/graphics_allocation.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/helpers/unit_test_helper.h"

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -7,6 +7,7 @@
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/memory_manager/graphics_allocation.h"
#include "shared/test/common/fixtures/device_fixture.h"
#include "shared/test/common/fixtures/linear_stream_fixture.h"
#include "shared/test/common/mocks/mock_graphics_allocation.h"
@@ -109,3 +110,87 @@ TEST_F(LinearStreamTest, givenNewGraphicsAllocationWhenReplaceIsCalledThenLinear
linearStream.replaceGraphicsAllocation(&newGraphicsAllocation);
EXPECT_EQ(&newGraphicsAllocation, linearStream.getGraphicsAllocation());
}
// Test-only view of LinearStream that re-exports its sizeUsed member as public
// so tests can set the fill level directly (presumably it is not public in LinearStream).
// NOTE(review): the command container tests define a different class with this
// same name (exposing cmdContainer); if both end up in one binary that is an ODR hazard — confirm.
class MyLinearStreamMock : public LinearStream {
public:
using LinearStream::sizeUsed;
};
// A stream with no command container attached reserves nothing for a
// batch-buffer-end, so the last free byte can still be handed out.
TEST_F(LinearStreamTest, givenLinearStreamWithoutCmdContainerWhenOneByteLeftInStreamThenGetSpaceDontThrowAbort) {
    auto exposedStream = reinterpret_cast<MyLinearStreamMock *>(&linearStream);
    exposedStream->sizeUsed = linearStream.getMaxAvailableSpace() - 1;

    EXPECT_NO_THROW(linearStream.getSpace(1));
}
using CommandContainerLinearStreamTest = Test<DeviceFixture>;
// With a command container attached, one free byte is not enough to place the
// batch-buffer-end, so getSpace() must abort.
TEST_F(CommandContainerLinearStreamTest, givenLinearStreamWithCmdContainerWhenOneByteLeftInStreamThenGetSpaceThrowAbort) {
    CommandContainer cmdContainer;
    cmdContainer.initialize(pDevice, nullptr);

    auto exposedStream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());
    const auto capacity = exposedStream->getMaxAvailableSpace();
    exposedStream->sizeUsed = capacity - 1;

    EXPECT_THROW(exposedStream->getSpace(1), std::exception);
}
// If the command plus a batch-buffer-end do not fit, getSpace() must make the
// container allocate a second command buffer.
TEST_F(CommandContainerLinearStreamTest, givenLinearStreamWithCmdContainerWhenThereIsNoSpaceForCommandAndBBEndThenNewCmdBufferAllocated) {
    CommandContainer cmdContainer;
    cmdContainer.initialize(pDevice, nullptr);

    auto &hwHelper = HwHelper::get(pDevice->getHardwareInfo().platform.eRenderCoreFamily);
    auto exposedStream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());

    // Leave one byte less than what the command and the batch-buffer-end need.
    const size_t dummyCommandSize = 2;
    const size_t bytesLeft = hwHelper.getBatchBufferEndSize() + (dummyCommandSize - 1);
    exposedStream->sizeUsed = exposedStream->getMaxAvailableSpace() - bytesLeft;

    EXPECT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u);
    exposedStream->getSpace(dummyCommandSize);
    EXPECT_EQ(cmdContainer.getCmdBufferAllocations().size(), 2u);
}
// If the command plus a batch-buffer-end do not fit, the stream's CPU base
// must change after getSpace().
TEST_F(CommandContainerLinearStreamTest, givenLinearStreamWithCmdContainerWhenThereIsNoSpaceForCommandAndBBEndThenLinearStreamHasNewAllocation) {
    CommandContainer cmdContainer;
    cmdContainer.initialize(pDevice, nullptr);

    auto &hwHelper = HwHelper::get(pDevice->getHardwareInfo().platform.eRenderCoreFamily);
    auto exposedStream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());

    // Leave one byte less than what the command and the batch-buffer-end need.
    const size_t dummyCommandSize = 2;
    const size_t bytesLeft = hwHelper.getBatchBufferEndSize() + (dummyCommandSize - 1);
    exposedStream->sizeUsed = exposedStream->getMaxAvailableSpace() - bytesLeft;

    auto baseBefore = exposedStream->getCpuBase();
    exposedStream->getSpace(dummyCommandSize);
    auto baseAfter = exposedStream->getCpuBase();

    EXPECT_NE(baseAfter, baseBefore);
}
// If the command plus a batch-buffer-end do not fit, the pointer returned by
// getSpace() must be the start of the stream's new buffer.
TEST_F(CommandContainerLinearStreamTest, givenLinearStreamWithCmdContainerWhenThereIsNoSpaceForCommandAndBBEndThenGetSpaceReturnPtrFromNewAllocation) {
    CommandContainer cmdContainer;
    cmdContainer.initialize(pDevice, nullptr);

    auto &hwHelper = HwHelper::get(pDevice->getHardwareInfo().platform.eRenderCoreFamily);
    auto exposedStream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());

    // Leave one byte less than what the command and the batch-buffer-end need.
    const size_t dummyCommandSize = 2;
    const size_t bytesLeft = hwHelper.getBatchBufferEndSize() + (dummyCommandSize - 1);
    exposedStream->sizeUsed = exposedStream->getMaxAvailableSpace() - bytesLeft;

    auto returnedSpace = exposedStream->getSpace(dummyCommandSize);
    auto currentBase = exposedStream->getCpuBase();

    EXPECT_EQ(currentBase, returnedSpace);
}
// If exactly the command plus a batch-buffer-end fit, getSpace() must not
// trigger a new command buffer allocation.
TEST_F(CommandContainerLinearStreamTest, givenLinearStreamWithCmdContainerWhenThereIsSpaceForCommandAndBBEndThenNewCmdBufferIsNotAllocated) {
    CommandContainer cmdContainer;
    cmdContainer.initialize(pDevice, nullptr);

    auto &hwHelper = HwHelper::get(pDevice->getHardwareInfo().platform.eRenderCoreFamily);
    auto exposedStream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());

    // Leave exactly enough room for the command and the batch-buffer-end.
    const size_t dummyCommandSize = 2;
    const size_t bytesLeft = hwHelper.getBatchBufferEndSize() + dummyCommandSize;
    exposedStream->sizeUsed = exposedStream->getMaxAvailableSpace() - bytesLeft;

    EXPECT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u);
    exposedStream->getSpace(dummyCommandSize);
    EXPECT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u);
}
// If the command plus a batch-buffer-end do not fit, the old buffer must end
// with a batch-buffer-end written at the position the stream had reached.
TEST_F(CommandContainerLinearStreamTest, givenLinearStreamWithCmdContainerWhenThereIsNoSpaceForCommandAndBBEndThenBBEndAddedAtEndOfStream) {
    CommandContainer cmdContainer;
    cmdContainer.initialize(pDevice, nullptr);

    auto &hwHelper = HwHelper::get(pDevice->getHardwareInfo().platform.eRenderCoreFamily);
    auto exposedStream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());

    // Leave one byte less than what the command and the batch-buffer-end need.
    const size_t dummyCommandSize = 2;
    const size_t bytesLeft = hwHelper.getBatchBufferEndSize() + (dummyCommandSize - 1);
    exposedStream->sizeUsed = exposedStream->getMaxAvailableSpace() - bytesLeft;

    // getSpace(0) consumes nothing; it only yields the current write position in the old buffer.
    auto oldWritePosition = exposedStream->getSpace(0u);
    exposedStream->getSpace(dummyCommandSize);

    const auto comparison = memcmp(oldWritePosition, hwHelper.getBatchBufferEndReference(), hwHelper.getBatchBufferEndSize());
    EXPECT_EQ(comparison, 0);
}

View File

@@ -955,8 +955,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
bool requiresUncachedMocs = false;
bool isInternal = false;
size_t regularEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, nullptr, false);
EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
dispatchArgs.isInternal = isInternal;
@@ -972,8 +970,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
EXPECT_EQ(WALKER_TYPE::PARTITION_TYPE::PARTITION_TYPE_DISABLED, baseWalkerCmd->getPartitionType());
EXPECT_EQ(16u, baseWalkerCmd->getThreadGroupIdXDimension());
size_t partitionEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, nullptr, true);
dispatchArgs.partitionCount = 2;
EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dispatchArgs);
@@ -982,7 +978,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::getSize(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize);
EXPECT_EQ(partitionEstimateSize, regularEstimateSize + expectedPartitionedWalkerSize);
GenCmdList partitionedWalkerList;
CmdParse<FamilyType>::parseCommandBuffer(
@@ -1020,23 +1015,18 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
bool isInternal = false;
size_t baseEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, dispatchInterface.get(), false);
bool requiresUncachedMocs = false;
EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
dispatchArgs.isInternal = isInternal;
dispatchArgs.partitionCount = 2;
size_t partitionEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, dispatchInterface.get(), true);
EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dispatchArgs);
EXPECT_EQ(2u, dispatchArgs.partitionCount);
size_t partitionedWalkerSize = cmdContainer->getCommandStream()->getUsed();
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::getSize(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
EXPECT_EQ(partitionEstimateSize, baseEstimateSize + expectedPartitionedWalkerSize);
EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize);
GenCmdList partitionedWalkerList;
@@ -1124,23 +1114,17 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling,
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
bool isInternal = false;
size_t baseEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, dispatchInterface.get(), false);
bool requiresUncachedMocs = false;
EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
dispatchArgs.isInternal = isInternal;
dispatchArgs.partitionCount = 2;
size_t partitionEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, dispatchInterface.get(), true);
EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dispatchArgs);
EXPECT_EQ(2u, dispatchArgs.partitionCount);
size_t partitionedWalkerSize = cmdContainer->getCommandStream()->getUsed();
size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::getSize(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
EXPECT_EQ(partitionEstimateSize, baseEstimateSize + expectedPartitionedWalkerSize);
EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize);
GenCmdList partitionedWalkerList;
@@ -1187,20 +1171,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
uint32_t dims[] = {16, 1, 1};
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
bool isInternal = false;
size_t baseEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, dispatchInterface.get(), false);
isInternal = true;
bool isInternal = true;
bool requiresUncachedMocs = false;
EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
dispatchArgs.isInternal = isInternal;
dispatchArgs.partitionCount = 2;
size_t internalEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, dispatchInterface.get(), true);
EXPECT_EQ(baseEstimateSize, internalEstimateSize);
EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dispatchArgs);
size_t internalWalkerSize = cmdContainer->getCommandStream()->getUsed();