Make partitioned post sync operations for partitioned workloads

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Zbigniew Zdanowicz 2021-09-03 11:42:31 +00:00 committed by Compute-Runtime-Automation
parent 86f8150dc7
commit 6b299a3ab0
22 changed files with 359 additions and 56 deletions


@@ -216,12 +216,6 @@ struct CommandList : _ze_command_list_handle_t {
TYPE_IMMEDIATE = 1u
};
CommandQueue *cmdQImmediate = nullptr;
NEO::CommandStreamReceiver *csr = nullptr;
uint32_t cmdListType = CommandListType::TYPE_REGULAR;
Device *device = nullptr;
std::vector<Kernel *> printfFunctionContainer;
virtual ze_result_t executeCommandListImmediate(bool performMigration) = 0;
virtual ze_result_t initialize(Device *device, NEO::EngineGroupType engineGroupType, ze_command_list_flags_t flags) = 0;
virtual ~CommandList();
@@ -241,33 +235,41 @@ struct CommandList : _ze_command_list_handle_t {
return commandsToPatch;
}
bool isSyncModeQueue = false;
bool commandListSLMEnabled = false;
uint32_t commandListPerThreadScratchSize = 0u;
NEO::PreemptionMode commandListPreemptionMode = NEO::PreemptionMode::Initial;
uint32_t threadArbitrationPolicy = NEO::ThreadArbitrationPolicy::RoundRobin;
bool isFlushTaskSubmissionEnabled = false;
void makeResidentAndMigrate(bool);
void migrateSharedAllocations();
std::vector<Kernel *> printfFunctionContainer;
CommandQueue *cmdQImmediate = nullptr;
NEO::CommandStreamReceiver *csr = nullptr;
Device *device = nullptr;
NEO::PreemptionMode commandListPreemptionMode = NEO::PreemptionMode::Initial;
uint32_t cmdListType = CommandListType::TYPE_REGULAR;
uint32_t commandListPerThreadScratchSize = 0u;
uint32_t threadArbitrationPolicy = NEO::ThreadArbitrationPolicy::RoundRobin;
uint32_t partitionCount = 1;
bool isFlushTaskSubmissionEnabled = false;
bool isSyncModeQueue = false;
bool commandListSLMEnabled = false;
protected:
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
NEO::EngineGroupType engineGroupType;
ze_command_list_flags_t flags = 0u;
UnifiedMemoryControls unifiedMemoryControls;
bool indirectAllocationsAllowed = false;
bool internalUsage = false;
bool containsCooperativeKernelsFlag = false;
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);
NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize);
bool containsStatelessUncachedResource = false;
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
NEO::StreamProperties requiredStreamState{};
NEO::StreamProperties finalStreamState{};
CommandsToPatch commandsToPatch{};
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
ze_command_list_flags_t flags = 0u;
UnifiedMemoryControls unifiedMemoryControls;
NEO::EngineGroupType engineGroupType;
bool indirectAllocationsAllowed = false;
bool internalUsage = false;
bool containsCooperativeKernelsFlag = false;
bool containsStatelessUncachedResource = false;
};
using CommandListAllocatorFn = CommandList *(*)(uint32_t);


@@ -108,7 +108,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
}
this->ownedPrivateAllocations.clear();
partitionCount = 1;
return ZE_RESULT_SUCCESS;
}


@@ -228,6 +228,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
partitionCount,
internalUsage,
isCooperative);
this->partitionCount = std::max(partitionCount, this->partitionCount);
if (hEvent) {
auto event = Event::fromHandle(hEvent);
if (partitionCount > 1) {

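Taken together with the reset() hunk above, the per-command-list bookkeeping is simple: appending a kernel widens partitionCount to the largest value any walker used, and reset() reverts it to 1. A minimal standalone model of that rule (illustrative only, not driver code):

#include <algorithm>
#include <cstdint>

// Illustrative model of the partition-count bookkeeping added in this commit:
// a command list keeps the maximum partition count over all appended kernels,
// and reset() reverts it to the default of 1.
struct CommandListPartitionState {
    uint32_t partitionCount = 1;

    void onKernelAppended(uint32_t kernelPartitionCount) {
        partitionCount = std::max(kernelPartitionCount, partitionCount);
    }

    void onReset() {
        partitionCount = 1;
    }
};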

@@ -106,10 +106,17 @@ ze_result_t CommandQueueImp::synchronizeByPollingForTaskCount(uint64_t timeout)
timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
}
csr->waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, this->taskCount);
if (*csr->getTagAddress() < taskCountToWait) {
return ZE_RESULT_NOT_READY;
if (partitionCount > 1) {
volatile uint32_t *pollAddress = csr->getTagAddress();
for (uint32_t i = 0; i < partitionCount; i++) {
csr->waitForCompletionWithTimeout(pollAddress, enableTimeout, timeoutMicroseconds, this->taskCount);
pollAddress += addressOffsetDwords;
}
} else {
csr->waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, this->taskCount);
if (*csr->getTagAddress() < taskCountToWait) {
return ZE_RESULT_NOT_READY;
}
}
postSyncOperations();
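When a workload is split across partitions, each partition writes its own completion value, so host-side synchronization has to poll one slot per partition instead of the single tag. A sketch of the completed-check under the 8-byte stride used here (addressOffsetDwords = 2, defined on CommandQueueImp below):

#include <cstdint>

// Sketch: partition i publishes its task count at tagAddress + i * 2 dwords.
// Synchronization is complete only once every slot reaches taskCountToWait.
bool allPartitionsComplete(volatile uint32_t *tagAddress, uint32_t partitionCount,
                           uint32_t taskCountToWait) {
    constexpr uint32_t addressOffsetDwords = 2u;
    for (uint32_t i = 0; i < partitionCount; i++) {
        if (*tagAddress < taskCountToWait) {
            return false;
        }
        tagAddress += addressOffsetDwords;
    }
    return true;
}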


@@ -57,6 +57,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
protected:
NEO::PreemptionMode commandQueuePreemptionMode = NEO::PreemptionMode::Initial;
uint32_t partitionCount = 1;
bool preemptionCmdSyncProgramming = true;
bool commandQueueDebugCmdsProgrammed = false;
bool isCopyOnlyCommandQueue = false;


@@ -10,6 +10,7 @@
#include "shared/source/built_ins/built_ins.h"
#include "shared/source/built_ins/sip.h"
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/command_stream_receiver_hw.h"
#include "shared/source/command_stream/linear_stream.h"
#include "shared/source/command_stream/preemption.h"
@@ -72,6 +73,9 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
using MI_LOAD_REGISTER_MEM = typename GfxFamily::MI_LOAD_REGISTER_MEM;
using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
auto lockCSR = csr->obtainUniqueOwnership();
auto anyCommandListWithCooperativeKernels = false;
@@ -177,6 +181,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
heapContainer.push_back(element);
}
}
partitionCount = std::max(partitionCount, commandList->partitionCount);
}
size_t linearStreamSizeEstimate = totalCmdBuffers * sizeof(MI_BATCH_BUFFER_START);
@@ -240,6 +246,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
}
linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
if (partitionCount > 1) {
linearStreamSizeEstimate += sizeof(MI_LOAD_REGISTER_MEM) + sizeof(MI_LOAD_REGISTER_IMM);
}
size_t alignedSize = alignUp<size_t>(linearStreamSizeEstimate, minCmdBufferPtrAlign);
size_t padding = alignedSize - linearStreamSizeEstimate;
reserveLinearStreamSize(alignedSize);
@@ -399,6 +409,17 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
commandQueuePreemptionMode = statePreemption;
if (partitionCount > 1) {
uint64_t workPartitionAddress = csr->getWorkPartitionAllocationGpuAddress();
NEO::EncodeSetMMIO<GfxFamily>::encodeMEM(child,
NEO::PartitionRegisters<GfxFamily>::wparidCCSOffset,
workPartitionAddress);
NEO::EncodeSetMMIO<GfxFamily>::encodeIMM(child,
NEO::PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
addressOffset,
true);
}
if (hFence) {
csr->makeResident(fence->getAllocation());
if (isCopyOnlyCommandQueue) {
@@ -407,6 +428,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(child, fence->getGpuAddress(), Fence::STATE_SIGNALED, args);
} else {
NEO::PipeControlArgs args(true);
if (partitionCount > 1) {
args.workloadPartitionOffset = true;
fence->setPartitionCount(partitionCount);
}
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
child, POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
fence->getGpuAddress(),
@@ -539,6 +564,9 @@ void CommandQueueHw<gfxCoreFamily>::dispatchTaskCountWrite(NEO::LinearStream &co
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(commandStream, gpuAddress, taskCountToWrite, args);
} else {
NEO::PipeControlArgs args(true);
if (partitionCount > 1) {
args.workloadPartitionOffset = true;
}
args.notifyEnable = csr->isUsedNotifyEnableForPostSync();
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
commandStream,


@@ -62,6 +62,9 @@ struct CommandQueueImp : public CommandQueue {
MemoryConstants::cacheLineSize +
NEO::CSRequirements::csOverfetchSize;
static constexpr uint32_t addressOffsetDwords = 2u;
static constexpr uint32_t addressOffset = sizeof(uint32_t) * addressOffsetDwords;
CommandQueueImp() = delete;
CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc);
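These two constants fix the post-sync layout: two dwords, i.e. 8 bytes, between consecutive partition slots. A worked example of the resulting addresses (values illustrative):

#include <cstdint>

// With addressOffset = sizeof(uint32_t) * 2 = 8 bytes, partition i writes its
// post sync value at base + i * 8; for base 0x1000 and four partitions the
// slots are 0x1000, 0x1008, 0x1010 and 0x1018.
constexpr uint32_t addressOffsetDwords = 2u;
constexpr uint32_t addressOffset = sizeof(uint32_t) * addressOffsetDwords;

constexpr uint64_t partitionPostSyncAddress(uint64_t base, uint32_t partitionId) {
    return base + static_cast<uint64_t>(partitionId) * addressOffset;
}
static_assert(partitionPostSyncAddress(0x1000u, 3u) == 0x1018u, "8-byte stride");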


@@ -37,9 +37,15 @@ ze_result_t FenceImp::queryStatus() {
csr->downloadAllocations();
}
uint64_t *hostAddr = static_cast<uint64_t *>(allocation->getUnderlyingBuffer());
void *hostAddr = static_cast<uint64_t *>(allocation->getUnderlyingBuffer());
uint32_t queryVal = Fence::STATE_CLEARED;
memcpy_s(static_cast<void *>(&queryVal), sizeof(uint32_t), static_cast<void *>(hostAddr), sizeof(uint32_t));
for (uint32_t i = 0; i < partitionCount; i++) {
memcpy_s(static_cast<void *>(&queryVal), sizeof(uint32_t), hostAddr, sizeof(uint32_t));
if (queryVal == Fence::STATE_CLEARED) {
break;
}
hostAddr = ptrOffset(hostAddr, CommandQueueImp::addressOffset);
}
return queryVal == Fence::STATE_CLEARED ? ZE_RESULT_NOT_READY : ZE_RESULT_SUCCESS;
}
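The status query now succeeds only when every partition slot has left STATE_CLEARED; the loop bails out on the first slot that has not signaled yet. A compact model of that rule (illustrative, not the driver code):

#include <cstdint>

// Model: the fence counts as signaled only if none of the partition slots
// still holds STATE_CLEARED; slots are addressOffset (8) bytes apart.
bool fenceSignaled(const uint32_t *hostAddr, uint32_t partitionCount,
                   uint32_t stateCleared) {
    constexpr uint32_t strideDwords = 2u; // addressOffset / sizeof(uint32_t)
    for (uint32_t i = 0; i < partitionCount; i++) {
        if (hostAddr[i * strideDwords] == stateCleared) {
            return false;
        }
    }
    return true;
}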


@@ -47,8 +47,13 @@ struct Fence : _ze_fence_handle_t {
return allocation->getGpuAddress();
}
void setPartitionCount(uint32_t newPartitionCount) {
partitionCount = newPartitionCount;
}
protected:
NEO::GraphicsAllocation *allocation = nullptr;
uint32_t partitionCount = 1;
};
struct FenceImp : public Fence {


@@ -27,8 +27,10 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp {
using BaseClass::printfFunctionContainer;
using BaseClass::submitBatchBuffer;
using BaseClass::synchronizeByPollingForTaskCount;
using BaseClass::taskCount;
using CommandQueue::commandQueuePreemptionMode;
using CommandQueue::internalUsage;
using CommandQueue::partitionCount;
WhiteBox(Device *device, NEO::CommandStreamReceiver *csr,
const ze_command_queue_desc_t *desc);
@@ -85,6 +87,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
using BaseClass::commandStream;
using BaseClass::printfFunctionContainer;
using L0::CommandQueue::internalUsage;
using L0::CommandQueue::partitionCount;
using L0::CommandQueue::preemptionCmdSyncProgramming;
using L0::CommandQueueImp::csr;


@@ -845,5 +845,22 @@ HWTEST2_F(CommandListCreate, whenContainsCooperativeKernelsIsCalledThenCorrectVa
}
}
HWTEST_F(CommandListCreate, whenCommandListIsResetThenPartitionCountIsRevertedToOne) {
ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList(CommandList::create(productFamily,
device,
NEO::EngineGroupType::Compute,
0u,
returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
commandList->partitionCount = 2;
returnValue = commandList->reset();
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
EXPECT_EQ(1u, commandList->partitionCount);
}
} // namespace ult
} // namespace L0


@@ -1529,6 +1529,12 @@ struct SynchronizeCsr : public NEO::UltCommandStreamReceiver<GfxFamily> {
tagAddress = new uint32_t;
}
bool waitForCompletionWithTimeout(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override {
enableTimeoutSet = enableTimeout;
waitForComplitionCalledTimes++;
return true;
}
bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override {
enableTimeoutSet = enableTimeout;
waitForComplitionCalledTimes++;
@@ -1623,6 +1629,32 @@ HWTEST_F(CommandQueueSynchronizeTest, givenDebugOverrideEnabledWhenCallToSynchro
L0::CommandQueue::fromHandle(commandQueue)->destroy();
}
HWTEST_F(CommandQueueSynchronizeTest, givenMultiplePartitionCountWhenCallingSynchronizeThenExpectTheSameNumberCsrSynchronizeCalls) {
auto csr = std::unique_ptr<SynchronizeCsr<FamilyType>>(new SynchronizeCsr<FamilyType>(*device->getNEODevice()->getExecutionEnvironment(),
device->getNEODevice()->getDeviceBitfield()));
csr->setupContext(*device->getNEODevice()->getDefaultEngine().osContext);
const ze_command_queue_desc_t desc{};
ze_result_t returnValue;
auto commandQueue = whitebox_cast(CommandQueue::create(productFamily,
device,
csr.get(),
&desc,
false,
false,
returnValue));
EXPECT_EQ(returnValue, ZE_RESULT_SUCCESS);
ASSERT_NE(nullptr, commandQueue);
commandQueue->partitionCount = 2;
uint64_t timeout = std::numeric_limits<uint64_t>::max();
commandQueue->synchronize(timeout);
EXPECT_EQ(2u, csr->waitForComplitionCalledTimes);
L0::CommandQueue::fromHandle(commandQueue)->destroy();
}
struct MemoryManagerCommandQueueCreateNegativeTest : public NEO::MockMemoryManager {
MemoryManagerCommandQueueCreateNegativeTest(NEO::ExecutionEnvironment &executionEnvironment) : NEO::MockMemoryManager(const_cast<NEO::ExecutionEnvironment &>(executionEnvironment)) {}
NEO::GraphicsAllocation *allocateGraphicsMemoryWithProperties(const NEO::AllocationProperties &properties) override {


@@ -53,6 +53,42 @@ struct CommandQueueExecuteCommandLists : public Test<DeviceFixture> {
ze_command_list_handle_t commandLists[numCommandLists];
};
struct MultiDeviceCommandQueueExecuteCommandLists : public Test<MultiDeviceFixture> {
void SetUp() override {
DebugManager.flags.EnableWalkerPartition.set(1);
numRootDevices = 1u;
MultiDeviceFixture::SetUp();
uint32_t deviceCount = 1;
ze_device_handle_t deviceHandle;
driverHandle->getDevice(&deviceCount, &deviceHandle);
device = Device::fromHandle(deviceHandle);
ASSERT_NE(nullptr, device);
ze_result_t returnValue;
commandLists[0] = CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)->toHandle();
ASSERT_NE(nullptr, commandLists[0]);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
commandLists[1] = CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue)->toHandle();
ASSERT_NE(nullptr, commandLists[1]);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
}
void TearDown() override {
for (auto i = 0u; i < numCommandLists; i++) {
auto commandList = CommandList::fromHandle(commandLists[i]);
commandList->destroy();
}
MultiDeviceFixture::TearDown();
}
L0::Device *device = nullptr;
const static uint32_t numCommandLists = 2;
ze_command_list_handle_t commandLists[numCommandLists];
};
HWTEST_F(CommandQueueExecuteCommandLists, whenASecondLevelBatchBufferPerCommandListAddedThenProperSizeExpected) {
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;
@@ -763,5 +799,95 @@ HWTEST_F(CommandQueueExecuteCommandListSWTagsTests, givenEnableSWTagsAndCommandL
EXPECT_TRUE(tagFound);
}
HWTEST2_F(MultiDeviceCommandQueueExecuteCommandLists, givenMultiplePartitionCountWhenExecutingCmdListThenExpectMmioProgrammingAndCorrectEstimation, IsAtLeastXeHpCore) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION;
using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
using PARSE = typename FamilyType::PARSE;
ze_command_queue_desc_t desc{};
desc.ordinal = 0u;
desc.index = 0u;
desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
ze_result_t returnValue;
auto commandQueue = whitebox_cast(CommandQueue::create(productFamily,
device,
device->getNEODevice()->getDefaultEngine().commandStreamReceiver,
&desc,
false,
false,
returnValue));
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
ze_fence_desc_t fenceDesc{};
auto fence = whitebox_cast(Fence::create(commandQueue, &fenceDesc));
ASSERT_NE(nullptr, fence);
ze_fence_handle_t fenceHandle = fence->toHandle();
ASSERT_NE(nullptr, commandQueue->commandStream);
// 1st execute call initializes the pipeline
auto result = commandQueue->executeCommandLists(numCommandLists, commandLists, fenceHandle, true);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
auto usedSpaceBefore = commandQueue->commandStream->getUsed();
result = commandQueue->executeCommandLists(numCommandLists, commandLists, fenceHandle, true);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
auto usedSpaceAfter = commandQueue->commandStream->getUsed();
ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
size_t cmdBufferSizeWithoutMmioProgramming = usedSpaceAfter - usedSpaceBefore;
auto workPartitionAddress = device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
for (auto i = 0u; i < numCommandLists; i++) {
auto commandList = CommandList::fromHandle(commandLists[i]);
commandList->partitionCount = 2;
}
usedSpaceBefore = commandQueue->commandStream->getUsed();
result = commandQueue->executeCommandLists(numCommandLists, commandLists, fenceHandle, true);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
usedSpaceAfter = commandQueue->commandStream->getUsed();
ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
size_t cmdBufferSizeWithMmioProgramming = usedSpaceAfter - usedSpaceBefore;
size_t expectedSizeWithMmioProgramming = cmdBufferSizeWithoutMmioProgramming + sizeof(MI_LOAD_REGISTER_IMM) + sizeof(MI_LOAD_REGISTER_MEM);
EXPECT_GE(expectedSizeWithMmioProgramming, cmdBufferSizeWithMmioProgramming);
GenCmdList cmdList;
ASSERT_TRUE(PARSE::parseCommandBuffer(cmdList, ptrOffset(commandQueue->commandStream->getCpuBase(), usedSpaceBefore), usedSpaceAfter));
auto itorLri = find<MI_LOAD_REGISTER_IMM *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), itorLri);
auto itorLrm = find<MI_LOAD_REGISTER_MEM *>(cmdList.begin(), cmdList.end());
ASSERT_NE(cmdList.end(), itorLrm);
auto loadRegisterImm = static_cast<MI_LOAD_REGISTER_IMM *>(*itorLri);
EXPECT_EQ(0x23B4u, loadRegisterImm->getRegisterOffset());
EXPECT_EQ(8u, loadRegisterImm->getDataDword());
auto loadRegisterMem = static_cast<MI_LOAD_REGISTER_MEM *>(*itorLrm);
EXPECT_EQ(0x221Cu, loadRegisterMem->getRegisterAddress());
EXPECT_EQ(workPartitionAddress, loadRegisterMem->getMemoryAddress());
auto pipeControlList = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
uint32_t foundPostSyncPipeControl = 0u;
for (size_t i = 0; i < pipeControlList.size(); i++) {
auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(*pipeControlList[i]);
if (pipeControl->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
EXPECT_TRUE(pipeControl->getWorkloadPartitionIdOffsetEnable());
foundPostSyncPipeControl++;
}
}
EXPECT_EQ(2u, foundPostSyncPipeControl);
fence->destroy();
commandQueue->destroy();
}
} // namespace ult
} // namespace L0


@@ -405,3 +405,42 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HwHelperTestXeHPAndLater, givenHwHelperWhenGettingB
EXPECT_EQ(messageExtDescriptor.getBindlessSurfaceOffsetToPatch(), value);
EXPECT_EQ(0x200u, value);
}
HWCMDTEST_F(IGFX_XE_HP_CORE, PipeControlHelperTestsXeHPAndLater, givenPostSyncPipeControlWhenSettingWorkloadPartitionFlagThenExpectPipeControlFlagSet) {
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION;
uint8_t buffer[128] = {};
LinearStream stream(buffer, sizeof(buffer));
HardwareInfo hardwareInfo = *defaultHwInfo;
uint64_t gpuAddress = 0xBADA550;
uint64_t data = 0xABCDEF;
PipeControlArgs args;
args.workloadPartitionOffset = true;
MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
stream,
POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
gpuAddress,
data,
hardwareInfo,
args);
GenCmdList cmdList;
FamilyType::PARSE::parseCommandBuffer(cmdList, stream.getCpuBase(), stream.getUsed());
auto pipeControls = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
bool foundPostSyncPipeControl = false;
for (size_t i = 0; i < pipeControls.size(); i++) {
auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(*pipeControls[i]);
if (pipeControl->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
EXPECT_EQ(static_cast<uint32_t>(gpuAddress), pipeControl->getAddress());
EXPECT_EQ(data, pipeControl->getImmediateData());
EXPECT_TRUE(pipeControl->getWorkloadPartitionIdOffsetEnable());
foundPostSyncPipeControl = true;
break;
}
}
EXPECT_TRUE(foundPostSyncPipeControl);
}


@@ -195,11 +195,13 @@ struct EncodeSetMMIO {
static const size_t sizeREG = sizeof(MI_LOAD_REGISTER_REG);
static void encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap);
static void encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address);
static void encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset);
static void encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap);
static void encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address);
static void encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset);
static bool isRemapApplicable(uint32_t offset);
static void remapOffset(MI_LOAD_REGISTER_MEM *pMiLoadReg);
static void remapOffset(MI_LOAD_REGISTER_REG *pMiLoadReg);


@@ -297,30 +297,45 @@ void EncodeMath<Family>::bitwiseOr(CommandContainer &container,
template <typename Family>
inline void EncodeSetMMIO<Family>::encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap) {
LriHelper<Family>::program(container.getCommandStream(),
EncodeSetMMIO<Family>::encodeIMM(*container.getCommandStream(), offset, data, remap);
}
template <typename Family>
inline void EncodeSetMMIO<Family>::encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address) {
EncodeSetMMIO<Family>::encodeMEM(*container.getCommandStream(), offset, address);
}
template <typename Family>
inline void EncodeSetMMIO<Family>::encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset) {
EncodeSetMMIO<Family>::encodeREG(*container.getCommandStream(), dstOffset, srcOffset);
}
template <typename Family>
inline void EncodeSetMMIO<Family>::encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap) {
LriHelper<Family>::program(&cmdStream,
offset,
data,
remap);
}
template <typename Family>
void EncodeSetMMIO<Family>::encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address) {
void EncodeSetMMIO<Family>::encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address) {
MI_LOAD_REGISTER_MEM cmd = Family::cmdInitLoadRegisterMem;
cmd.setRegisterAddress(offset);
cmd.setMemoryAddress(address);
remapOffset(&cmd);
auto buffer = container.getCommandStream()->getSpaceForCmd<MI_LOAD_REGISTER_MEM>();
auto buffer = cmdStream.getSpaceForCmd<MI_LOAD_REGISTER_MEM>();
*buffer = cmd;
}
template <typename Family>
void EncodeSetMMIO<Family>::encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset) {
void EncodeSetMMIO<Family>::encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset) {
MI_LOAD_REGISTER_REG cmd = Family::cmdInitLoadRegisterReg;
cmd.setSourceRegisterAddress(srcOffset);
cmd.setDestinationRegisterAddress(dstOffset);
remapOffset(&cmd);
auto buffer = container.getCommandStream()->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
auto buffer = cmdStream.getSpaceForCmd<MI_LOAD_REGISTER_REG>();
*buffer = cmd;
}
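The CommandContainer entry points now simply forward to LinearStream overloads, so code that only holds a raw stream (such as the command queue above) can encode register loads directly. A hedged usage sketch; `stream`, `gpuAddress`, and the register offsets are illustrative assumptions, not taken from the commit:

// Usage sketch for the new overloads, given some NEO::LinearStream &stream:
NEO::EncodeSetMMIO<FamilyType>::encodeIMM(stream, 0x2600u, 0x1u, false); // MI_LOAD_REGISTER_IMM
NEO::EncodeSetMMIO<FamilyType>::encodeMEM(stream, 0x2600u, gpuAddress);  // MI_LOAD_REGISTER_MEM
NEO::EncodeSetMMIO<FamilyType>::encodeREG(stream, 0x2600u, 0x2604u);     // MI_LOAD_REGISTER_REG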
@@ -508,8 +523,8 @@ bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDes
return false;
}
template <typename GfxFamily>
void EncodeDispatchKernel<GfxFamily>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {}
template <typename Family>
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {}
template <typename Family>
void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) {
@@ -706,12 +721,12 @@ void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferEnd(CommandContainer
*buffer = cmd;
}
template <typename GfxFamily>
void EncodeMiFlushDW<GfxFamily>::programMiFlushDw(LinearStream &commandStream, uint64_t immediateDataGpuAddress, uint64_t immediateData, MiFlushArgs &args) {
template <typename Family>
void EncodeMiFlushDW<Family>::programMiFlushDw(LinearStream &commandStream, uint64_t immediateDataGpuAddress, uint64_t immediateData, MiFlushArgs &args) {
programMiFlushDwWA(commandStream);
auto miFlushDwCmd = commandStream.getSpaceForCmd<MI_FLUSH_DW>();
MI_FLUSH_DW miFlush = GfxFamily::cmdInitMiFlushDw;
MI_FLUSH_DW miFlush = Family::cmdInitMiFlushDw;
if (args.commandWithPostSync) {
auto postSyncType = args.timeStampOperation ? MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_TIMESTAMP_REGISTER : MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD;
miFlush.setPostSyncOperation(postSyncType);
@@ -724,16 +739,16 @@ void EncodeMiFlushDW<GfxFamily>::programMiFlushDw(LinearStream &commandStream, u
*miFlushDwCmd = miFlush;
}
template <typename GfxFamily>
size_t EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() {
return sizeof(typename GfxFamily::MI_FLUSH_DW) + EncodeMiFlushDW<GfxFamily>::getMiFlushDwWaSize();
template <typename Family>
size_t EncodeMiFlushDW<Family>::getMiFlushDwCmdSizeForDataWrite() {
return sizeof(typename Family::MI_FLUSH_DW) + EncodeMiFlushDW<Family>::getMiFlushDwWaSize();
}
template <typename GfxFamily>
inline void EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(LinearStream &commandStream, const GraphicsAllocation &graphicsAllocation, uint32_t size, size_t offset, const HardwareInfo &hwInfo) {}
template <typename Family>
inline void EncodeMemoryPrefetch<Family>::programMemoryPrefetch(LinearStream &commandStream, const GraphicsAllocation &graphicsAllocation, uint32_t size, size_t offset, const HardwareInfo &hwInfo) {}
template <typename GfxFamily>
inline size_t EncodeMemoryPrefetch<GfxFamily>::getSizeForMemoryPrefetch(size_t size) { return 0u; }
template <typename Family>
inline size_t EncodeMemoryPrefetch<Family>::getSizeForMemoryPrefetch(size_t size) { return 0u; }
template <typename Family>
void EncodeMiArbCheck<Family>::program(LinearStream &commandStream) {


@@ -586,9 +586,9 @@ bool EncodeSurfaceState<Family>::doBindingTablePrefetch() {
return false;
}
template <typename GfxFamily>
void EncodeSurfaceState<GfxFamily>::encodeExtraBufferParams(R_SURFACE_STATE *surfaceState, GraphicsAllocation *allocation, GmmHelper *gmmHelper,
bool isReadOnly, uint32_t numAvailableDevices, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) {
template <typename Family>
void EncodeSurfaceState<Family>::encodeExtraBufferParams(R_SURFACE_STATE *surfaceState, GraphicsAllocation *allocation, GmmHelper *gmmHelper,
bool isReadOnly, uint32_t numAvailableDevices, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) {
Gmm *gmm = allocation ? allocation->getDefaultGmm() : nullptr;
uint32_t compressionFormat = 0;
@@ -627,7 +627,7 @@ void EncodeSurfaceState<GfxFamily>::encodeExtraBufferParams(R_SURFACE_STATE *sur
surfaceState->setDisableSupportForMultiGpuPartialWrites(!!DebugManager.flags.ForceMultiGpuPartialWrites.get());
}
if (EncodeSurfaceState<GfxFamily>::isAuxModeEnabled(surfaceState, gmm)) {
if (EncodeSurfaceState<Family>::isAuxModeEnabled(surfaceState, gmm)) {
auto resourceFormat = gmm->gmmResourceInfo->getResourceFormat();
compressionFormat = gmmHelper->getClientContext()->getSurfaceStateCompressionFormat(resourceFormat);


@@ -42,4 +42,12 @@ struct ImplicitScalingDispatch {
uint64_t workPartitionAllocationGpuVa);
};
template <typename GfxFamily>
struct PartitionRegisters {
enum {
wparidCCSOffset = 0x221C,
addressOffsetCCSOffset = 0x23B4
};
};
} // namespace NEO
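These are the CCS register offsets that executeCommandLists programs when partitionCount > 1: the workload partition ID register at 0x221C is loaded from the work partition allocation via MI_LOAD_REGISTER_MEM, and the address offset register at 0x23B4 receives the 8-byte post sync stride via MI_LOAD_REGISTER_IMM, which is exactly what the queue test above asserts. For reference, the call pattern from the queue hunk:

uint64_t workPartitionAddress = csr->getWorkPartitionAllocationGpuAddress();
NEO::EncodeSetMMIO<GfxFamily>::encodeMEM(child,
                                         NEO::PartitionRegisters<GfxFamily>::wparidCCSOffset,
                                         workPartitionAddress);
NEO::EncodeSetMMIO<GfxFamily>::encodeIMM(child,
                                         NEO::PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
                                         addressOffset, // 8 bytes, CommandQueueImp::addressOffset
                                         true);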


@@ -257,6 +257,10 @@ void CommandStreamReceiver::cleanupResources() {
}
bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) {
return waitForCompletionWithTimeout(getTagAddress(), enableTimeout, timeoutMicroseconds, taskCountToWait);
}
bool CommandStreamReceiver::waitForCompletionWithTimeout(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) {
std::chrono::high_resolution_clock::time_point time1, time2;
int64_t timeDiff = 0;
@@ -272,8 +276,8 @@ bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int
}
time1 = std::chrono::high_resolution_clock::now();
while (*getTagAddress() < taskCountToWait && timeDiff <= timeoutMicroseconds) {
if (WaitUtils::waitFunction(getTagAddress(), taskCountToWait)) {
while (*pollAddress < taskCountToWait && timeDiff <= timeoutMicroseconds) {
if (WaitUtils::waitFunction(pollAddress, taskCountToWait)) {
break;
}
@@ -282,7 +286,8 @@ bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int
timeDiff = std::chrono::duration_cast<std::chrono::microseconds>(time2 - time1).count();
}
}
if (*getTagAddress() >= taskCountToWait) {
if (*pollAddress >= taskCountToWait) {
return true;
}
return false;
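The original tag-address entry point now delegates to this poll-address overload, so the same timeout machinery can wait on any partition slot. A sketch of waiting on partition i, with csr and the timeout values assumed from the surrounding code:

// Sketch: wait on partition i's completion slot via the new overload
// (2 dwords per slot, matching CommandQueueImp::addressOffsetDwords).
volatile uint32_t *pollAddress = csr->getTagAddress() + i * 2u;
csr->waitForCompletionWithTimeout(pollAddress, enableTimeout, timeoutMicroseconds, taskCountToWait);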


@@ -158,6 +158,7 @@ class CommandStreamReceiver {
virtual void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0;
virtual bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
MOCKABLE_VIRTUAL bool waitForCompletionWithTimeout(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
virtual void downloadAllocations(){};
void setSamplerCacheFlushRequired(SamplerCacheFlushState value) { this->samplerCacheFlushRequired = value; }
@@ -292,7 +293,7 @@ class CommandStreamReceiver {
LinearStream commandStream;
// offset for debug pause state must stay beyond the tag writes; partitioned post sync writes span multiple dwords, so keep it a full cache line away to avoid overwrites
const uint64_t debugPauseStateAddressOffset = 8;
const uint64_t debugPauseStateAddressOffset = MemoryConstants::cacheLineSize;
uint64_t totalMemoryUsed = 0u;
volatile uint32_t *tagAddress = nullptr;


@@ -21,6 +21,7 @@ struct PipeControlArgsBase {
bool tlbInvalidation = false;
bool compressionControlSurfaceCcsFlush = false;
bool notifyEnable = false;
bool workloadPartitionOffset = false;
protected:
PipeControlArgsBase() = default;


@@ -186,6 +186,7 @@ template <>
void MemorySynchronizationCommands<Family>::setPipeControlExtraProperties(PIPE_CONTROL &pipeControl, PipeControlArgs &args) {
pipeControl.setHdcPipelineFlush(args.hdcPipelineFlush);
pipeControl.setCompressionControlSurfaceCcsFlush(args.compressionControlSurfaceCcsFlush);
pipeControl.setWorkloadPartitionIdOffsetEnable(args.workloadPartitionOffset);
if (DebugManager.flags.FlushAllCaches.get()) {
pipeControl.setHdcPipelineFlush(true);
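setPipeControlExtraProperties() is where the new flag reaches hardware state: it sets the PIPE_CONTROL's workload partition ID offset enable bit for the gfx family this specialization covers (the test above exercises XE_HP_CORE and later). The caller-side pattern, mirroring the queue's task count write earlier in this commit:

NEO::PipeControlArgs args(true);
if (partitionCount > 1) {
    args.workloadPartitionOffset = true; // post sync address gains the per-partition offset
}
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
    commandStream,
    POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
    gpuAddress, taskCountToWrite, hwInfo, args);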