Make partitioned post sync operations for partitioned workloads
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
parent
86f8150dc7
commit
6b299a3ab0
|
@ -216,12 +216,6 @@ struct CommandList : _ze_command_list_handle_t {
|
|||
TYPE_IMMEDIATE = 1u
|
||||
};
|
||||
|
||||
CommandQueue *cmdQImmediate = nullptr;
|
||||
NEO::CommandStreamReceiver *csr = nullptr;
|
||||
uint32_t cmdListType = CommandListType::TYPE_REGULAR;
|
||||
Device *device = nullptr;
|
||||
std::vector<Kernel *> printfFunctionContainer;
|
||||
|
||||
virtual ze_result_t executeCommandListImmediate(bool performMigration) = 0;
|
||||
virtual ze_result_t initialize(Device *device, NEO::EngineGroupType engineGroupType, ze_command_list_flags_t flags) = 0;
|
||||
virtual ~CommandList();
|
||||
|
@ -241,33 +235,41 @@ struct CommandList : _ze_command_list_handle_t {
|
|||
return commandsToPatch;
|
||||
}
|
||||
|
||||
bool isSyncModeQueue = false;
|
||||
bool commandListSLMEnabled = false;
|
||||
uint32_t commandListPerThreadScratchSize = 0u;
|
||||
NEO::PreemptionMode commandListPreemptionMode = NEO::PreemptionMode::Initial;
|
||||
uint32_t threadArbitrationPolicy = NEO::ThreadArbitrationPolicy::RoundRobin;
|
||||
bool isFlushTaskSubmissionEnabled = false;
|
||||
|
||||
void makeResidentAndMigrate(bool);
|
||||
void migrateSharedAllocations();
|
||||
|
||||
std::vector<Kernel *> printfFunctionContainer;
|
||||
CommandQueue *cmdQImmediate = nullptr;
|
||||
NEO::CommandStreamReceiver *csr = nullptr;
|
||||
Device *device = nullptr;
|
||||
NEO::PreemptionMode commandListPreemptionMode = NEO::PreemptionMode::Initial;
|
||||
uint32_t cmdListType = CommandListType::TYPE_REGULAR;
|
||||
uint32_t commandListPerThreadScratchSize = 0u;
|
||||
uint32_t threadArbitrationPolicy = NEO::ThreadArbitrationPolicy::RoundRobin;
|
||||
uint32_t partitionCount = 1;
|
||||
bool isFlushTaskSubmissionEnabled = false;
|
||||
bool isSyncModeQueue = false;
|
||||
bool commandListSLMEnabled = false;
|
||||
|
||||
protected:
|
||||
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
|
||||
NEO::EngineGroupType engineGroupType;
|
||||
ze_command_list_flags_t flags = 0u;
|
||||
UnifiedMemoryControls unifiedMemoryControls;
|
||||
bool indirectAllocationsAllowed = false;
|
||||
bool internalUsage = false;
|
||||
bool containsCooperativeKernelsFlag = false;
|
||||
NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize);
|
||||
NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize);
|
||||
bool containsStatelessUncachedResource = false;
|
||||
|
||||
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
|
||||
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
|
||||
|
||||
NEO::StreamProperties requiredStreamState{};
|
||||
NEO::StreamProperties finalStreamState{};
|
||||
CommandsToPatch commandsToPatch{};
|
||||
|
||||
std::vector<NEO::GraphicsAllocation *> ownedPrivateAllocations;
|
||||
ze_command_list_flags_t flags = 0u;
|
||||
UnifiedMemoryControls unifiedMemoryControls;
|
||||
|
||||
NEO::EngineGroupType engineGroupType;
|
||||
bool indirectAllocationsAllowed = false;
|
||||
bool internalUsage = false;
|
||||
bool containsCooperativeKernelsFlag = false;
|
||||
bool containsStatelessUncachedResource = false;
|
||||
};
|
||||
|
||||
using CommandListAllocatorFn = CommandList *(*)(uint32_t);
|
||||
|
|
|
@ -108,7 +108,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reset() {
|
|||
device->getNEODevice()->getMemoryManager()->freeGraphicsMemory(alloc);
|
||||
}
|
||||
this->ownedPrivateAllocations.clear();
|
||||
|
||||
partitionCount = 1;
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
|
|
|
@ -228,6 +228,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
|
|||
partitionCount,
|
||||
internalUsage,
|
||||
isCooperative);
|
||||
this->partitionCount = std::max(partitionCount, this->partitionCount);
|
||||
if (hEvent) {
|
||||
auto event = Event::fromHandle(hEvent);
|
||||
if (partitionCount > 1) {
|
||||
|
|
|
@ -106,10 +106,17 @@ ze_result_t CommandQueueImp::synchronizeByPollingForTaskCount(uint64_t timeout)
|
|||
timeoutMicroseconds = NEO::TimeoutControls::maxTimeout;
|
||||
}
|
||||
|
||||
csr->waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, this->taskCount);
|
||||
|
||||
if (*csr->getTagAddress() < taskCountToWait) {
|
||||
return ZE_RESULT_NOT_READY;
|
||||
if (partitionCount > 1) {
|
||||
volatile uint32_t *pollAddress = csr->getTagAddress();
|
||||
for (uint32_t i = 0; i < partitionCount; i++) {
|
||||
csr->waitForCompletionWithTimeout(pollAddress, enableTimeout, timeoutMicroseconds, this->taskCount);
|
||||
pollAddress += addressOffsetDwords;
|
||||
}
|
||||
} else {
|
||||
csr->waitForCompletionWithTimeout(enableTimeout, timeoutMicroseconds, this->taskCount);
|
||||
if (*csr->getTagAddress() < taskCountToWait) {
|
||||
return ZE_RESULT_NOT_READY;
|
||||
}
|
||||
}
|
||||
|
||||
postSyncOperations();
|
||||
|
|
|
@ -57,6 +57,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
|
|||
|
||||
protected:
|
||||
NEO::PreemptionMode commandQueuePreemptionMode = NEO::PreemptionMode::Initial;
|
||||
uint32_t partitionCount = 1;
|
||||
bool preemptionCmdSyncProgramming = true;
|
||||
bool commandQueueDebugCmdsProgrammed = false;
|
||||
bool isCopyOnlyCommandQueue = false;
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
#include "shared/source/built_ins/built_ins.h"
|
||||
#include "shared/source/built_ins/sip.h"
|
||||
#include "shared/source/command_container/command_encoder.h"
|
||||
#include "shared/source/command_container/implicit_scaling.h"
|
||||
#include "shared/source/command_stream/command_stream_receiver_hw.h"
|
||||
#include "shared/source/command_stream/linear_stream.h"
|
||||
#include "shared/source/command_stream/preemption.h"
|
||||
|
@ -72,6 +73,9 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
|||
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
|
||||
using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
|
||||
|
||||
using MI_LOAD_REGISTER_MEM = typename GfxFamily::MI_LOAD_REGISTER_MEM;
|
||||
using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
|
||||
|
||||
auto lockCSR = csr->obtainUniqueOwnership();
|
||||
|
||||
auto anyCommandListWithCooperativeKernels = false;
|
||||
|
@ -177,6 +181,8 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
|||
heapContainer.push_back(element);
|
||||
}
|
||||
}
|
||||
|
||||
partitionCount = std::max(partitionCount, commandList->partitionCount);
|
||||
}
|
||||
|
||||
size_t linearStreamSizeEstimate = totalCmdBuffers * sizeof(MI_BATCH_BUFFER_START);
|
||||
|
@ -240,6 +246,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
|||
}
|
||||
|
||||
linearStreamSizeEstimate += isCopyOnlyCommandQueue ? NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() : NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
|
||||
if (partitionCount > 1) {
|
||||
linearStreamSizeEstimate += sizeof(MI_LOAD_REGISTER_MEM) + sizeof(MI_LOAD_REGISTER_IMM);
|
||||
}
|
||||
|
||||
size_t alignedSize = alignUp<size_t>(linearStreamSizeEstimate, minCmdBufferPtrAlign);
|
||||
size_t padding = alignedSize - linearStreamSizeEstimate;
|
||||
reserveLinearStreamSize(alignedSize);
|
||||
|
@ -399,6 +409,17 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
|||
|
||||
commandQueuePreemptionMode = statePreemption;
|
||||
|
||||
if (partitionCount > 1) {
|
||||
uint64_t workPartitionAddress = csr->getWorkPartitionAllocationGpuAddress();
|
||||
NEO::EncodeSetMMIO<GfxFamily>::encodeMEM(child,
|
||||
NEO::PartitionRegisters<GfxFamily>::wparidCCSOffset,
|
||||
workPartitionAddress);
|
||||
NEO::EncodeSetMMIO<GfxFamily>::encodeIMM(child,
|
||||
NEO::PartitionRegisters<GfxFamily>::addressOffsetCCSOffset,
|
||||
addressOffset,
|
||||
true);
|
||||
}
|
||||
|
||||
if (hFence) {
|
||||
csr->makeResident(fence->getAllocation());
|
||||
if (isCopyOnlyCommandQueue) {
|
||||
|
@ -407,6 +428,10 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandLists(
|
|||
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(child, fence->getGpuAddress(), Fence::STATE_SIGNALED, args);
|
||||
} else {
|
||||
NEO::PipeControlArgs args(true);
|
||||
if (partitionCount > 1) {
|
||||
args.workloadPartitionOffset = true;
|
||||
fence->setPartitionCount(partitionCount);
|
||||
}
|
||||
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
|
||||
child, POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
|
||||
fence->getGpuAddress(),
|
||||
|
@ -539,6 +564,9 @@ void CommandQueueHw<gfxCoreFamily>::dispatchTaskCountWrite(NEO::LinearStream &co
|
|||
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(commandStream, gpuAddress, taskCountToWrite, args);
|
||||
} else {
|
||||
NEO::PipeControlArgs args(true);
|
||||
if (partitionCount > 1) {
|
||||
args.workloadPartitionOffset = true;
|
||||
}
|
||||
args.notifyEnable = csr->isUsedNotifyEnableForPostSync();
|
||||
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
|
||||
commandStream,
|
||||
|
|
|
@ -62,6 +62,9 @@ struct CommandQueueImp : public CommandQueue {
|
|||
MemoryConstants::cacheLineSize +
|
||||
NEO::CSRequirements::csOverfetchSize;
|
||||
|
||||
static constexpr uint32_t addressOffsetDwords = 2u;
|
||||
static constexpr uint32_t addressOffset = sizeof(uint32_t) * addressOffsetDwords;
|
||||
|
||||
CommandQueueImp() = delete;
|
||||
CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc);
|
||||
|
||||
|
|
|
@ -37,9 +37,15 @@ ze_result_t FenceImp::queryStatus() {
|
|||
csr->downloadAllocations();
|
||||
}
|
||||
|
||||
uint64_t *hostAddr = static_cast<uint64_t *>(allocation->getUnderlyingBuffer());
|
||||
void *hostAddr = static_cast<uint64_t *>(allocation->getUnderlyingBuffer());
|
||||
uint32_t queryVal = Fence::STATE_CLEARED;
|
||||
memcpy_s(static_cast<void *>(&queryVal), sizeof(uint32_t), static_cast<void *>(hostAddr), sizeof(uint32_t));
|
||||
for (uint32_t i = 0; i < partitionCount; i++) {
|
||||
memcpy_s(static_cast<void *>(&queryVal), sizeof(uint32_t), hostAddr, sizeof(uint32_t));
|
||||
if (queryVal == Fence::STATE_CLEARED) {
|
||||
break;
|
||||
}
|
||||
hostAddr = ptrOffset(hostAddr, CommandQueueImp::addressOffset);
|
||||
}
|
||||
return queryVal == Fence::STATE_CLEARED ? ZE_RESULT_NOT_READY : ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
|
|
|
@ -47,8 +47,13 @@ struct Fence : _ze_fence_handle_t {
|
|||
return allocation->getGpuAddress();
|
||||
}
|
||||
|
||||
void setPartitionCount(uint32_t newPartitionCount) {
|
||||
partitionCount = newPartitionCount;
|
||||
}
|
||||
|
||||
protected:
|
||||
NEO::GraphicsAllocation *allocation = nullptr;
|
||||
uint32_t partitionCount = 1;
|
||||
};
|
||||
|
||||
struct FenceImp : public Fence {
|
||||
|
|
|
@ -27,8 +27,10 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp {
|
|||
using BaseClass::printfFunctionContainer;
|
||||
using BaseClass::submitBatchBuffer;
|
||||
using BaseClass::synchronizeByPollingForTaskCount;
|
||||
using BaseClass::taskCount;
|
||||
using CommandQueue::commandQueuePreemptionMode;
|
||||
using CommandQueue::internalUsage;
|
||||
using CommandQueue::partitionCount;
|
||||
|
||||
WhiteBox(Device *device, NEO::CommandStreamReceiver *csr,
|
||||
const ze_command_queue_desc_t *desc);
|
||||
|
@ -85,6 +87,7 @@ struct MockCommandQueueHw : public L0::CommandQueueHw<gfxCoreFamily> {
|
|||
using BaseClass::commandStream;
|
||||
using BaseClass::printfFunctionContainer;
|
||||
using L0::CommandQueue::internalUsage;
|
||||
using L0::CommandQueue::partitionCount;
|
||||
using L0::CommandQueue::preemptionCmdSyncProgramming;
|
||||
using L0::CommandQueueImp::csr;
|
||||
|
||||
|
|
|
@ -845,5 +845,22 @@ HWTEST2_F(CommandListCreate, whenContainsCooperativeKernelsIsCalledThenCorrectVa
|
|||
}
|
||||
}
|
||||
|
||||
// Verifies that CommandList::reset() puts partitionCount back to 1 after the
// count has been raised on the command list.
HWTEST_F(CommandListCreate, whenCommandListIsResetThenPartitionCountIsReversedToOne) {
    ze_result_t returnValue;
    std::unique_ptr<L0::CommandList> commandList(CommandList::create(productFamily,
                                                                     device,
                                                                     NEO::EngineGroupType::Compute,
                                                                     0u,
                                                                     returnValue));
    // Creation failures must be fatal: the statements below dereference commandList.
    ASSERT_EQ(ZE_RESULT_SUCCESS, returnValue);
    ASSERT_NE(nullptr, commandList.get());

    commandList->partitionCount = 2;

    returnValue = commandList->reset();
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);

    EXPECT_EQ(1u, commandList->partitionCount);
}
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
||||
|
|
|
@ -1529,6 +1529,12 @@ struct SynchronizeCsr : public NEO::UltCommandStreamReceiver<GfxFamily> {
|
|||
tagAddress = new uint32_t;
|
||||
}
|
||||
|
||||
// Test double for the poll-address overload: counts the call, records the
// requested timeout flag and always reports completion without polling.
bool waitForCompletionWithTimeout(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override {
    ++waitForComplitionCalledTimes;
    enableTimeoutSet = enableTimeout;
    return true;
}
|
||||
|
||||
bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override {
|
||||
enableTimeoutSet = enableTimeout;
|
||||
waitForComplitionCalledTimes++;
|
||||
|
@ -1623,6 +1629,32 @@ HWTEST_F(CommandQueueSynchronizeTest, givenDebugOverrideEnabledWhenCallToSynchro
|
|||
L0::CommandQueue::fromHandle(commandQueue)->destroy();
|
||||
}
|
||||
|
||||
// With partitionCount raised to 2 on the queue, synchronize() is expected to
// call the CSR wait routine twice (the mock CSR counts every call).
HWTEST_F(CommandQueueSynchronizeTest, givenMultiplePartitionCountWhenCallingSynchronizeThenExpectTheSameNumberCsrSynchronizeCalls) {
    auto *neoDevice = device->getNEODevice();
    std::unique_ptr<SynchronizeCsr<FamilyType>> csr(new SynchronizeCsr<FamilyType>(*neoDevice->getExecutionEnvironment(),
                                                                                   neoDevice->getDeviceBitfield()));
    csr->setupContext(*neoDevice->getDefaultEngine().osContext);

    const ze_command_queue_desc_t desc{};
    ze_result_t returnValue;
    auto commandQueue = whitebox_cast(CommandQueue::create(productFamily,
                                                           device,
                                                           csr.get(),
                                                           &desc,
                                                           false,
                                                           false,
                                                           returnValue));
    EXPECT_EQ(returnValue, ZE_RESULT_SUCCESS);
    ASSERT_NE(nullptr, commandQueue);

    // Pretend a partitioned workload was submitted, then wait without a time limit.
    commandQueue->partitionCount = 2;
    const uint64_t timeout = std::numeric_limits<uint64_t>::max();
    commandQueue->synchronize(timeout);

    EXPECT_EQ(2u, csr->waitForComplitionCalledTimes);

    L0::CommandQueue::fromHandle(commandQueue)->destroy();
}
|
||||
|
||||
struct MemoryManagerCommandQueueCreateNegativeTest : public NEO::MockMemoryManager {
|
||||
MemoryManagerCommandQueueCreateNegativeTest(NEO::ExecutionEnvironment &executionEnvironment) : NEO::MockMemoryManager(const_cast<NEO::ExecutionEnvironment &>(executionEnvironment)) {}
|
||||
NEO::GraphicsAllocation *allocateGraphicsMemoryWithProperties(const NEO::AllocationProperties &properties) override {
|
||||
|
|
|
@ -53,6 +53,42 @@ struct CommandQueueExecuteCommandLists : public Test<DeviceFixture> {
|
|||
ze_command_list_handle_t commandLists[numCommandLists];
|
||||
};
|
||||
|
||||
// Fixture that enables walker partitioning, brings up a single root device through
// MultiDeviceFixture and creates numCommandLists render/compute command lists.
struct MultiDeviceCommandQueueExecuteCommandLists : public Test<MultiDeviceFixture> {
    void SetUp() override {
        // NOTE(review): the flag is not restored by this fixture - confirm that the
        // base fixture or the test environment resets debug flags between tests.
        DebugManager.flags.EnableWalkerPartition.set(1);
        numRootDevices = 1u;
        MultiDeviceFixture::SetUp();

        uint32_t deviceCount = 1;
        // Initialized so that a failed getDevice() is caught by the null assertion below.
        ze_device_handle_t deviceHandle = nullptr;
        driverHandle->getDevice(&deviceCount, &deviceHandle);
        device = Device::fromHandle(deviceHandle);
        ASSERT_NE(nullptr, device);

        for (auto &handle : commandLists) {
            ze_result_t returnValue;
            // Check the pointer before calling toHandle() on it.
            auto commandList = CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue);
            ASSERT_NE(nullptr, commandList);
            EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
            handle = commandList->toHandle();
        }
    }

    void TearDown() override {
        // Handles stay null when SetUp() bailed out early; skip those.
        for (auto handle : commandLists) {
            if (handle != nullptr) {
                CommandList::fromHandle(handle)->destroy();
            }
        }

        MultiDeviceFixture::TearDown();
    }

    L0::Device *device = nullptr;
    const static uint32_t numCommandLists = 2;
    ze_command_list_handle_t commandLists[numCommandLists] = {};
};
|
||||
|
||||
HWTEST_F(CommandQueueExecuteCommandLists, whenASecondLevelBatchBufferPerCommandListAddedThenProperSizeExpected) {
|
||||
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
|
||||
using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;
|
||||
|
@ -763,5 +799,95 @@ HWTEST_F(CommandQueueExecuteCommandListSWTagsTests, givenEnableSWTagsAndCommandL
|
|||
EXPECT_TRUE(tagFound);
|
||||
}
|
||||
|
||||
// Executes the fixture's command lists first with the default partition count and
// then with partitionCount == 2, and checks that the partitioned submission:
//  - grows the queue's command stream by at most one MI_LOAD_REGISTER_IMM plus one
//    MI_LOAD_REGISTER_MEM compared to the non-partitioned submission,
//  - programs both MMIO commands with the expected register offsets and data,
//  - sets the workload-partition offset flag on both post-sync PIPE_CONTROLs.
HWTEST2_F(MultiDeviceCommandQueueExecuteCommandLists, givenMultiplePartitionCountWhenExecutingCmdListThenExpectMmioProgrammingAndCorrectEstimation, IsAtLeastXeHpCore) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION;
    using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM;
    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
    using PARSE = typename FamilyType::PARSE;

    ze_command_queue_desc_t desc{};
    desc.ordinal = 0u;
    desc.index = 0u;
    desc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL;
    desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;

    ze_result_t returnValue;
    auto commandQueue = whitebox_cast(CommandQueue::create(productFamily,
                                                           device,
                                                           device->getNEODevice()->getDefaultEngine().commandStreamReceiver,
                                                           &desc,
                                                           false,
                                                           false,
                                                           returnValue));
    EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
    // The queue is dereferenced below, so a failed creation must stop the test.
    ASSERT_NE(nullptr, commandQueue);

    ze_fence_desc_t fenceDesc{};
    auto fence = whitebox_cast(Fence::create(commandQueue, &fenceDesc));
    ASSERT_NE(nullptr, fence);
    ze_fence_handle_t fenceHandle = fence->toHandle();

    ASSERT_NE(nullptr, commandQueue->commandStream);

    // 1st execute call initializes the pipeline
    auto result = commandQueue->executeCommandLists(numCommandLists, commandLists, fenceHandle, true);
    EXPECT_EQ(ZE_RESULT_SUCCESS, result);

    // 2nd execute call: baseline size without partition MMIO programming
    auto usedSpaceBefore = commandQueue->commandStream->getUsed();
    result = commandQueue->executeCommandLists(numCommandLists, commandLists, fenceHandle, true);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    auto usedSpaceAfter = commandQueue->commandStream->getUsed();
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
    size_t cmdBufferSizeWithoutMmioProgramming = usedSpaceAfter - usedSpaceBefore;

    auto workPartitionAddress = device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();

    for (auto i = 0u; i < numCommandLists; i++) {
        auto commandList = CommandList::fromHandle(commandLists[i]);
        commandList->partitionCount = 2;
    }

    // 3rd execute call: partitioned submission
    usedSpaceBefore = commandQueue->commandStream->getUsed();
    result = commandQueue->executeCommandLists(numCommandLists, commandLists, fenceHandle, true);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    usedSpaceAfter = commandQueue->commandStream->getUsed();
    ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
    size_t cmdBufferSizeWithMmioProgramming = usedSpaceAfter - usedSpaceBefore;

    size_t expectedSizeWithMmioProgramming = cmdBufferSizeWithoutMmioProgramming + sizeof(MI_LOAD_REGISTER_IMM) + sizeof(MI_LOAD_REGISTER_MEM);
    EXPECT_GE(expectedSizeWithMmioProgramming, cmdBufferSizeWithMmioProgramming);

    // Parse only the bytes written by the partitioned submission: the region starts
    // at usedSpaceBefore, so its length is the difference, not the absolute end offset.
    GenCmdList cmdList;
    ASSERT_TRUE(PARSE::parseCommandBuffer(cmdList,
                                          ptrOffset(commandQueue->commandStream->getCpuBase(), usedSpaceBefore),
                                          cmdBufferSizeWithMmioProgramming));

    auto itorLri = find<MI_LOAD_REGISTER_IMM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorLri);
    auto itorLrm = find<MI_LOAD_REGISTER_MEM *>(cmdList.begin(), cmdList.end());
    ASSERT_NE(cmdList.end(), itorLrm);

    // Expected register offset and the 8-byte address offset written by the queue
    auto loadRegisterImm = static_cast<MI_LOAD_REGISTER_IMM *>(*itorLri);
    EXPECT_EQ(0x23B4u, loadRegisterImm->getRegisterOffset());
    EXPECT_EQ(8u, loadRegisterImm->getDataDword());

    // Expected register address, loaded from the work partition allocation
    auto loadRegisterMem = static_cast<MI_LOAD_REGISTER_MEM *>(*itorLrm);
    EXPECT_EQ(0x221Cu, loadRegisterMem->getRegisterAddress());
    EXPECT_EQ(workPartitionAddress, loadRegisterMem->getMemoryAddress());

    auto pipeControlList = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());

    uint32_t foundPostSyncPipeControl = 0u;
    for (size_t i = 0; i < pipeControlList.size(); i++) {
        auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(*pipeControlList[i]);
        if (pipeControl->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
            EXPECT_TRUE(pipeControl->getWorkloadPartitionIdOffsetEnable());
            foundPostSyncPipeControl++;
        }
    }
    EXPECT_EQ(2u, foundPostSyncPipeControl);

    fence->destroy();
    commandQueue->destroy();
}
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
||||
|
|
|
@ -405,3 +405,42 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HwHelperTestXeHPAndLater, givenHwHelperWhenGettingB
|
|||
EXPECT_EQ(messageExtDescriptor.getBindlessSurfaceOffsetToPatch(), value);
|
||||
EXPECT_EQ(0x200u, value);
|
||||
}
|
||||
|
||||
// Programs a post-sync PIPE_CONTROL with args.workloadPartitionOffset set and
// checks that the first immediate-data post-sync PIPE_CONTROL found in the stream
// carries the address, the data and the workload-partition offset flag.
HWCMDTEST_F(IGFX_XE_HP_CORE, PipeControlHelperTestsXeHPAndLater, givenPostSyncPipeControlWhenSettingWorkloadPartitionFlagThenExpectPipeControlFlagSet) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using POST_SYNC_OPERATION = typename FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION;

    uint8_t buffer[128] = {};
    LinearStream stream(buffer, sizeof(buffer));
    HardwareInfo hardwareInfo = *defaultHwInfo;
    uint64_t gpuAddress = 0xBADA550;
    uint64_t data = 0xABCDEF;

    PipeControlArgs args;
    args.workloadPartitionOffset = true;

    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
        stream,
        POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
        gpuAddress,
        data,
        hardwareInfo,
        args);

    GenCmdList cmdList;
    FamilyType::PARSE::parseCommandBuffer(cmdList, stream.getCpuBase(), stream.getUsed());
    auto pipeControls = findAll<PIPE_CONTROL *>(cmdList.begin(), cmdList.end());

    bool foundPostSyncPipeControl = false;
    for (size_t index = 0; index < pipeControls.size(); index++) {
        auto candidate = reinterpret_cast<PIPE_CONTROL *>(*pipeControls[index]);
        // Skip PIPE_CONTROLs that do not write immediate data.
        if (candidate->getPostSyncOperation() != POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
            continue;
        }
        EXPECT_EQ(static_cast<uint32_t>(gpuAddress), candidate->getAddress());
        EXPECT_EQ(data, candidate->getImmediateData());
        EXPECT_TRUE(candidate->getWorkloadPartitionIdOffsetEnable());
        foundPostSyncPipeControl = true;
        break;
    }
    EXPECT_TRUE(foundPostSyncPipeControl);
}
|
||||
|
|
|
@ -195,11 +195,13 @@ struct EncodeSetMMIO {
|
|||
static const size_t sizeREG = sizeof(MI_LOAD_REGISTER_REG);
|
||||
|
||||
static void encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap);
|
||||
|
||||
static void encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address);
|
||||
|
||||
static void encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset);
|
||||
|
||||
static void encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap);
|
||||
static void encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address);
|
||||
static void encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset);
|
||||
|
||||
static bool isRemapApplicable(uint32_t offset);
|
||||
static void remapOffset(MI_LOAD_REGISTER_MEM *pMiLoadReg);
|
||||
static void remapOffset(MI_LOAD_REGISTER_REG *pMiLoadReg);
|
||||
|
|
|
@ -297,30 +297,45 @@ void EncodeMath<Family>::bitwiseOr(CommandContainer &container,
|
|||
|
||||
template <typename Family>
|
||||
inline void EncodeSetMMIO<Family>::encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap) {
|
||||
LriHelper<Family>::program(container.getCommandStream(),
|
||||
EncodeSetMMIO<Family>::encodeIMM(*container.getCommandStream(), offset, data, remap);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline void EncodeSetMMIO<Family>::encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address) {
|
||||
EncodeSetMMIO<Family>::encodeMEM(*container.getCommandStream(), offset, address);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline void EncodeSetMMIO<Family>::encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset) {
|
||||
EncodeSetMMIO<Family>::encodeREG(*container.getCommandStream(), dstOffset, srcOffset);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
inline void EncodeSetMMIO<Family>::encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap) {
|
||||
LriHelper<Family>::program(&cmdStream,
|
||||
offset,
|
||||
data,
|
||||
remap);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeSetMMIO<Family>::encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address) {
|
||||
void EncodeSetMMIO<Family>::encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address) {
|
||||
MI_LOAD_REGISTER_MEM cmd = Family::cmdInitLoadRegisterMem;
|
||||
cmd.setRegisterAddress(offset);
|
||||
cmd.setMemoryAddress(address);
|
||||
remapOffset(&cmd);
|
||||
|
||||
auto buffer = container.getCommandStream()->getSpaceForCmd<MI_LOAD_REGISTER_MEM>();
|
||||
auto buffer = cmdStream.getSpaceForCmd<MI_LOAD_REGISTER_MEM>();
|
||||
*buffer = cmd;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeSetMMIO<Family>::encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset) {
|
||||
void EncodeSetMMIO<Family>::encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset) {
|
||||
MI_LOAD_REGISTER_REG cmd = Family::cmdInitLoadRegisterReg;
|
||||
cmd.setSourceRegisterAddress(srcOffset);
|
||||
cmd.setDestinationRegisterAddress(dstOffset);
|
||||
remapOffset(&cmd);
|
||||
auto buffer = container.getCommandStream()->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
|
||||
auto buffer = cmdStream.getSpaceForCmd<MI_LOAD_REGISTER_REG>();
|
||||
*buffer = cmd;
|
||||
}
|
||||
|
||||
|
@ -508,8 +523,8 @@ bool EncodeDispatchKernel<Family>::inlineDataProgrammingRequired(const KernelDes
|
|||
return false;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void EncodeDispatchKernel<GfxFamily>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {}
|
||||
template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::adjustTimestampPacket(WALKER_TYPE &walkerCmd, const HardwareInfo &hwInfo) {}
|
||||
|
||||
template <typename Family>
|
||||
void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) {
|
||||
|
@ -706,12 +721,12 @@ void EncodeBatchBufferStartOrEnd<Family>::programBatchBufferEnd(CommandContainer
|
|||
*buffer = cmd;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void EncodeMiFlushDW<GfxFamily>::programMiFlushDw(LinearStream &commandStream, uint64_t immediateDataGpuAddress, uint64_t immediateData, MiFlushArgs &args) {
|
||||
template <typename Family>
|
||||
void EncodeMiFlushDW<Family>::programMiFlushDw(LinearStream &commandStream, uint64_t immediateDataGpuAddress, uint64_t immediateData, MiFlushArgs &args) {
|
||||
programMiFlushDwWA(commandStream);
|
||||
|
||||
auto miFlushDwCmd = commandStream.getSpaceForCmd<MI_FLUSH_DW>();
|
||||
MI_FLUSH_DW miFlush = GfxFamily::cmdInitMiFlushDw;
|
||||
MI_FLUSH_DW miFlush = Family::cmdInitMiFlushDw;
|
||||
if (args.commandWithPostSync) {
|
||||
auto postSyncType = args.timeStampOperation ? MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_TIMESTAMP_REGISTER : MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD;
|
||||
miFlush.setPostSyncOperation(postSyncType);
|
||||
|
@ -724,16 +739,16 @@ void EncodeMiFlushDW<GfxFamily>::programMiFlushDw(LinearStream &commandStream, u
|
|||
*miFlushDwCmd = miFlush;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite() {
|
||||
return sizeof(typename GfxFamily::MI_FLUSH_DW) + EncodeMiFlushDW<GfxFamily>::getMiFlushDwWaSize();
|
||||
template <typename Family>
|
||||
size_t EncodeMiFlushDW<Family>::getMiFlushDwCmdSizeForDataWrite() {
|
||||
return sizeof(typename Family::MI_FLUSH_DW) + EncodeMiFlushDW<Family>::getMiFlushDwWaSize();
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(LinearStream &commandStream, const GraphicsAllocation &graphicsAllocation, uint32_t size, size_t offset, const HardwareInfo &hwInfo) {}
|
||||
template <typename Family>
|
||||
inline void EncodeMemoryPrefetch<Family>::programMemoryPrefetch(LinearStream &commandStream, const GraphicsAllocation &graphicsAllocation, uint32_t size, size_t offset, const HardwareInfo &hwInfo) {}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline size_t EncodeMemoryPrefetch<GfxFamily>::getSizeForMemoryPrefetch(size_t size) { return 0u; }
|
||||
template <typename Family>
|
||||
inline size_t EncodeMemoryPrefetch<Family>::getSizeForMemoryPrefetch(size_t size) { return 0u; }
|
||||
|
||||
template <typename Family>
|
||||
void EncodeMiArbCheck<Family>::program(LinearStream &commandStream) {
|
||||
|
|
|
@ -586,9 +586,9 @@ bool EncodeSurfaceState<Family>::doBindingTablePrefetch() {
|
|||
return false;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void EncodeSurfaceState<GfxFamily>::encodeExtraBufferParams(R_SURFACE_STATE *surfaceState, GraphicsAllocation *allocation, GmmHelper *gmmHelper,
|
||||
bool isReadOnly, uint32_t numAvailableDevices, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) {
|
||||
template <typename Family>
|
||||
void EncodeSurfaceState<Family>::encodeExtraBufferParams(R_SURFACE_STATE *surfaceState, GraphicsAllocation *allocation, GmmHelper *gmmHelper,
|
||||
bool isReadOnly, uint32_t numAvailableDevices, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) {
|
||||
Gmm *gmm = allocation ? allocation->getDefaultGmm() : nullptr;
|
||||
uint32_t compressionFormat = 0;
|
||||
|
||||
|
@ -627,7 +627,7 @@ void EncodeSurfaceState<GfxFamily>::encodeExtraBufferParams(R_SURFACE_STATE *sur
|
|||
surfaceState->setDisableSupportForMultiGpuPartialWrites(!!DebugManager.flags.ForceMultiGpuPartialWrites.get());
|
||||
}
|
||||
|
||||
if (EncodeSurfaceState<GfxFamily>::isAuxModeEnabled(surfaceState, gmm)) {
|
||||
if (EncodeSurfaceState<Family>::isAuxModeEnabled(surfaceState, gmm)) {
|
||||
auto resourceFormat = gmm->gmmResourceInfo->getResourceFormat();
|
||||
compressionFormat = gmmHelper->getClientContext()->getSurfaceStateCompressionFormat(resourceFormat);
|
||||
|
||||
|
|
|
@ -42,4 +42,12 @@ struct ImplicitScalingDispatch {
|
|||
uint64_t workPartitionAllocationGpuVa);
|
||||
};
|
||||
|
||||
// Register offsets used when programming partitioned workloads. The command
// queue loads wparidCCSOffset from the work partition allocation and writes the
// per-partition address offset to addressOffsetCCSOffset.
template <typename Family>
struct PartitionRegisters {
    enum {
        wparidCCSOffset = 0x221C,       // target of MI_LOAD_REGISTER_MEM
        addressOffsetCCSOffset = 0x23B4 // target of MI_LOAD_REGISTER_IMM
    };
};
|
||||
|
||||
} // namespace NEO
|
||||
|
|
|
@ -257,6 +257,10 @@ void CommandStreamReceiver::cleanupResources() {
|
|||
}
|
||||
|
||||
bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) {
|
||||
return waitForCompletionWithTimeout(getTagAddress(), enableTimeout, timeoutMicroseconds, taskCountToWait);
|
||||
}
|
||||
|
||||
bool CommandStreamReceiver::waitForCompletionWithTimeout(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait) {
|
||||
std::chrono::high_resolution_clock::time_point time1, time2;
|
||||
int64_t timeDiff = 0;
|
||||
|
||||
|
@ -272,8 +276,8 @@ bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int
|
|||
}
|
||||
|
||||
time1 = std::chrono::high_resolution_clock::now();
|
||||
while (*getTagAddress() < taskCountToWait && timeDiff <= timeoutMicroseconds) {
|
||||
if (WaitUtils::waitFunction(getTagAddress(), taskCountToWait)) {
|
||||
while (*pollAddress < taskCountToWait && timeDiff <= timeoutMicroseconds) {
|
||||
if (WaitUtils::waitFunction(pollAddress, taskCountToWait)) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -282,7 +286,8 @@ bool CommandStreamReceiver::waitForCompletionWithTimeout(bool enableTimeout, int
|
|||
timeDiff = std::chrono::duration_cast<std::chrono::microseconds>(time2 - time1).count();
|
||||
}
|
||||
}
|
||||
if (*getTagAddress() >= taskCountToWait) {
|
||||
|
||||
if (*pollAddress >= taskCountToWait) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
|
|
@ -158,6 +158,7 @@ class CommandStreamReceiver {
|
|||
|
||||
virtual void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, bool forcePowerSavingMode) = 0;
|
||||
virtual bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
|
||||
MOCKABLE_VIRTUAL bool waitForCompletionWithTimeout(volatile uint32_t *pollAddress, bool enableTimeout, int64_t timeoutMicroseconds, uint32_t taskCountToWait);
|
||||
virtual void downloadAllocations(){};
|
||||
|
||||
void setSamplerCacheFlushRequired(SamplerCacheFlushState value) { this->samplerCacheFlushRequired = value; }
|
||||
|
@ -292,7 +293,7 @@ class CommandStreamReceiver {
|
|||
LinearStream commandStream;
|
||||
|
||||
// offset for debug state must be at least 8 bytes; if only 4 bytes are used, tag writes overwrite it
|
||||
const uint64_t debugPauseStateAddressOffset = 8;
|
||||
const uint64_t debugPauseStateAddressOffset = MemoryConstants::cacheLineSize;
|
||||
uint64_t totalMemoryUsed = 0u;
|
||||
|
||||
volatile uint32_t *tagAddress = nullptr;
|
||||
|
|
|
@ -21,6 +21,7 @@ struct PipeControlArgsBase {
|
|||
bool tlbInvalidation = false;
|
||||
bool compressionControlSurfaceCcsFlush = false;
|
||||
bool notifyEnable = false;
|
||||
bool workloadPartitionOffset = false;
|
||||
|
||||
protected:
|
||||
PipeControlArgsBase() = default;
|
||||
|
|
|
@ -186,6 +186,7 @@ template <>
|
|||
void MemorySynchronizationCommands<Family>::setPipeControlExtraProperties(PIPE_CONTROL &pipeControl, PipeControlArgs &args) {
|
||||
pipeControl.setHdcPipelineFlush(args.hdcPipelineFlush);
|
||||
pipeControl.setCompressionControlSurfaceCcsFlush(args.compressionControlSurfaceCcsFlush);
|
||||
pipeControl.setWorkloadPartitionIdOffsetEnable(args.workloadPartitionOffset);
|
||||
|
||||
if (DebugManager.flags.FlushAllCaches.get()) {
|
||||
pipeControl.setHdcPipelineFlush(true);
|
||||
|
|
Loading…
Reference in New Issue