Add multi tile event signal capability

Related-To: NEO-6262

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2021-12-08 11:12:14 +00:00 committed by Compute-Runtime-Automation
parent a27c7af2da
commit bac79244d5
7 changed files with 261 additions and 40 deletions

View File

@ -1675,14 +1675,23 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSignalEventPostWalker(ze_event_
if (isCopyOnly()) {
NEO::MiFlushArgs args;
args.commandWithPostSync = true;
increaseCommandStreamSpace(NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite());
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), baseAddr, Event::STATE_SIGNALED, args);
} else {
auto &hwInfo = commandContainer.getDevice()->getHardwareInfo();
increaseCommandStreamSpace(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo));
NEO::PipeControlArgs args;
args.dcFlushEnable = (!event->signalScope) ? false : true;
args.dcFlushEnable = !!event->signalScope;
if (this->partitionCount > 1) {
args.workloadPartitionOffset = true;
event->setPacketsInUse(this->partitionCount);
}
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
*commandContainer.getCommandStream(), POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
baseAddr, Event::STATE_SIGNALED,
commandContainer.getDevice()->getHardwareInfo(),
*commandContainer.getCommandStream(),
POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
baseAddr,
Event::STATE_SIGNALED,
hwInfo,
args);
}
}
@ -1821,26 +1830,37 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
if (isCopyOnly()) {
NEO::MiFlushArgs args;
args.commandWithPostSync = true;
increaseCommandStreamSpace(NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite());
NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), ptrOffset(baseAddr, eventSignalOffset), Event::STATE_SIGNALED, args);
} else {
NEO::PipeControlArgs args;
applyScope = (!event->signalScope) ? false : true;
applyScope = !!event->signalScope;
if (NEO::MemorySynchronizationCommands<GfxFamily>::isDcFlushAllowed()) {
args.dcFlushEnable = applyScope;
}
if (this->partitionCount > 1) {
args.workloadPartitionOffset = true;
event->setPacketsInUse(this->partitionCount);
}
if (applyScope || event->isEventTimestampFlagSet()) {
auto &hwInfo = commandContainer.getDevice()->getHardwareInfo();
increaseCommandStreamSpace(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo));
NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
*commandContainer.getCommandStream(), POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
ptrOffset(baseAddr, eventSignalOffset), Event::STATE_SIGNALED,
commandContainer.getDevice()->getHardwareInfo(),
*commandContainer.getCommandStream(),
POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
ptrOffset(baseAddr, eventSignalOffset),
Event::STATE_SIGNALED,
hwInfo,
args);
} else {
NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(*commandContainer.getCommandStream(),
ptrOffset(baseAddr, eventSignalOffset),
Event::STATE_SIGNALED,
0u,
false,
false);
increaseCommandStreamSpace(NEO::EncodeStoreMemory<GfxFamily>::getStoreDataImmSize());
NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(
*commandContainer.getCommandStream(),
ptrOffset(baseAddr, eventSignalOffset),
Event::STATE_SIGNALED,
0u,
false,
args.workloadPartitionOffset);
}
}

View File

@ -87,8 +87,6 @@ struct Event : _ze_event_handle_t {
bool isEventTimestampFlagSet() { return isTimestampEvent; }
virtual ze_result_t hostEventSetValue(uint32_t eventValue) = 0;
uint64_t globalStartTS;
uint64_t globalEndTS;
uint64_t contextStartTS;
@ -158,7 +156,6 @@ struct EventImp : public Event {
uint64_t getPacketAddress(Device *device) override;
uint32_t getPacketsInUse() override;
void setPacketsInUse(uint32_t value) override;
ze_result_t hostEventSetValue(uint32_t eventValue) override;
std::unique_ptr<KernelEventCompletionData<TagSizeT>[]> kernelEventCompletionData;
@ -170,6 +167,7 @@ struct EventImp : public Event {
ze_result_t calculateProfilingData();
ze_result_t queryStatusKernelTimestamp();
ze_result_t queryStatusNonTimestamp();
ze_result_t hostEventSetValue(TagSizeT eventValue);
ze_result_t hostEventSetValueTimestamps(TagSizeT eventVal);
void assignKernelEventCompletionData(void *address);
};

View File

@ -122,7 +122,7 @@ ze_result_t EventImp<TagSizeT>::queryStatusNonTimestamp() {
template <typename TagSizeT>
ze_result_t EventImp<TagSizeT>::queryStatus() {
uint64_t *hostAddr = static_cast<uint64_t *>(hostAddress);
TagSizeT *hostAddr = static_cast<TagSizeT *>(hostAddress);
if (metricStreamer != nullptr) {
*hostAddr = metricStreamer->getNotificationState();
@ -138,8 +138,8 @@ ze_result_t EventImp<TagSizeT>::queryStatus() {
template <typename TagSizeT>
ze_result_t EventImp<TagSizeT>::hostEventSetValueTimestamps(TagSizeT eventVal) {
auto baseAddr = reinterpret_cast<uint64_t>(hostAddress);
auto signalScopeFlag = this->signalScope;
auto baseAddr = castToUint64(hostAddress);
auto signalScopeFlag = !!this->signalScope;
auto eventTsSetFunc = [&eventVal, &signalScopeFlag](auto tsAddr) {
auto tsptr = reinterpret_cast<void *>(tsAddr);
@ -165,16 +165,22 @@ ze_result_t EventImp<TagSizeT>::hostEventSetValueTimestamps(TagSizeT eventVal) {
}
template <typename TagSizeT>
ze_result_t EventImp<TagSizeT>::hostEventSetValue(uint32_t eventVal) {
ze_result_t EventImp<TagSizeT>::hostEventSetValue(TagSizeT eventVal) {
if (isEventTimestampFlagSet()) {
return hostEventSetValueTimestamps(static_cast<TagSizeT>(eventVal));
return hostEventSetValueTimestamps(eventVal);
}
auto hostAddr = static_cast<uint64_t *>(hostAddress);
UNRECOVERABLE_IF(hostAddr == nullptr);
memcpy_s(static_cast<void *>(hostAddr), sizeof(uint32_t), static_cast<void *>(&eventVal), sizeof(uint32_t));
auto packetHostAddr = hostAddress;
UNRECOVERABLE_IF(packetHostAddr == nullptr);
NEO::CpuIntrinsics::clFlush(hostAddr);
for (uint32_t i = 0; i < kernelCount; i++) {
uint32_t packetsToSet = kernelEventCompletionData[i].getPacketsUsed();
for (uint32_t j = 0; j < packetsToSet; j++) {
memcpy_s(packetHostAddr, sizeof(TagSizeT), static_cast<void *>(&eventVal), sizeof(TagSizeT));
NEO::CpuIntrinsics::clFlush(packetHostAddr);
packetHostAddr = ptrOffset(packetHostAddr, singlePacketSize);
}
}
return ZE_RESULT_SUCCESS;
}

View File

@ -49,33 +49,28 @@ class MockEvent : public ::L0::Event {
ze_result_t destroy() override {
return ZE_RESULT_SUCCESS;
};
}
ze_result_t hostSignal() override {
return ZE_RESULT_SUCCESS;
};
}
ze_result_t hostSynchronize(uint64_t timeout) override {
return ZE_RESULT_SUCCESS;
};
}
ze_result_t queryStatus() override {
return ZE_RESULT_SUCCESS;
};
}
ze_result_t reset() override {
return ZE_RESULT_SUCCESS;
};
}
ze_result_t queryKernelTimestamp(ze_kernel_timestamp_result_t *dstptr) override {
return ZE_RESULT_SUCCESS;
};
}
ze_result_t queryTimestampsExp(L0::Device *device, uint32_t *pCount, ze_kernel_timestamp_result_t *pTimestamps) override {
return ZE_RESULT_SUCCESS;
};
ze_result_t hostEventSetValue(uint32_t eventValue) override {
return ZE_RESULT_SUCCESS;
}
uint32_t getPacketsInUse() override { return 1; }
void resetPackets() override{};
void setPacketsInUse(uint32_t value) override{};
void resetPackets() override {}
void setPacketsInUse(uint32_t value) override {}
uint64_t getPacketAddress(L0::Device *) override { return 0; }
std::unique_ptr<NEO::GraphicsAllocation> mockAllocation;

View File

@ -186,5 +186,168 @@ HWTEST2_F(CommandListAppendSignalEvent, givenTimestampEventUsedInSignalThenPipeC
ASSERT_TRUE(postSyncFound);
}
// Signals a host-scoped, non-timestamp event on a command list configured with two partitions and
// checks that exactly one PIPE_CONTROL with an immediate-data post-sync write is emitted, targeting
// the event's GPU address with the workload partition offset enabled, and that the event reports
// two packets in use afterwards.
HWTEST2_F(CommandListAppendSignalEvent,
          givenMultiTileCommandListWhenAppendingScopeEventSignalThenExpectPartitionedPipeControl, IsAtLeastXeHpCore) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
    using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;

    // Take everything but sizeof(MI_BATCH_BUFFER_END) from the stream before appending.
    // NOTE(review): presumably this forces the append path to reserve fresh space, which is why the
    // used size checked below equals the size of the single command - confirm against the container.
    auto stream = commandList->commandContainer.getCommandStream();
    const size_t fillSize = stream->getAvailableSpace() - sizeof(MI_BATCH_BUFFER_END);
    stream->getSpace(fillSize);

    constexpr uint32_t tileCount = 2u;

    event->setEventTimestampFlag(false);
    event->signalScope = ZE_EVENT_SCOPE_FLAG_HOST;
    commandList->partitionCount = tileCount;

    EXPECT_EQ(ZE_RESULT_SUCCESS, commandList->appendSignalEvent(event->toHandle()));
    EXPECT_EQ(tileCount, event->getPacketsInUse());

    auto expectedAddress = event->getGpuAddress(device);

    auto &hardwareInfo = device->getNEODevice()->getHardwareInfo();
    const size_t pipeControlSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hardwareInfo);
    const size_t streamUsed = stream->getUsed();
    EXPECT_EQ(pipeControlSize, streamUsed);

    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(parsedCommands, stream->getCpuBase(), streamUsed));

    auto pipeControls = findAll<PIPE_CONTROL *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_NE(0u, pipeControls.size());

    uint32_t postSyncCount = 0;
    for (auto &pipeControlIt : pipeControls) {
        auto pipeControl = genCmdCast<PIPE_CONTROL *>(*pipeControlIt);
        // Only pipe controls carrying the immediate-data post-sync write are of interest.
        if (cmdIsNotImmediateWrite: false) {
        }
        if (pipeControl->getPostSyncOperation() != POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
            continue;
        }
        EXPECT_EQ(Event::STATE_SIGNALED, pipeControl->getImmediateData());
        EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
        EXPECT_EQ(expectedAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
        EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::isDcFlushAllowed(), pipeControl->getDcFlushEnable());
        EXPECT_TRUE(pipeControl->getWorkloadPartitionIdOffsetEnable());
        postSyncCount++;
        // Any further post-sync write would be expected one packet further on.
        expectedAddress += event->getSinglePacketSize();
    }
    EXPECT_EQ(1u, postSyncCount);
}
// Signals a non-timestamp event with no signal scope on a command list configured with two
// partitions and checks that exactly one dword MI_STORE_DATA_IMM is emitted, writing STATE_SIGNALED
// to the event's GPU address with the workload partition offset enabled, and that the event reports
// two packets in use afterwards.
HWTEST2_F(CommandListAppendSignalEvent,
          givenMultiTileCommandListWhenAppendingNonScopeEventSignalThenExpectPartitionedStoreDataImm, IsAtLeastXeHpCore) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
    using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;

    // Take everything but sizeof(MI_BATCH_BUFFER_END) from the stream before appending.
    // NOTE(review): presumably this forces the append path to reserve fresh space, which is why the
    // used size checked below equals the size of the single command - confirm against the container.
    auto stream = commandList->commandContainer.getCommandStream();
    const size_t fillSize = stream->getAvailableSpace() - sizeof(MI_BATCH_BUFFER_END);
    stream->getSpace(fillSize);

    constexpr uint32_t tileCount = 2u;

    event->setEventTimestampFlag(false);
    event->signalScope = 0;
    commandList->partitionCount = tileCount;

    EXPECT_EQ(ZE_RESULT_SUCCESS, commandList->appendSignalEvent(event->toHandle()));
    EXPECT_EQ(tileCount, event->getPacketsInUse());

    auto expectedAddress = event->getGpuAddress(device);

    const size_t storeDataImmSize = NEO::EncodeStoreMemory<GfxFamily>::getStoreDataImmSize();
    const size_t streamUsed = stream->getUsed();
    EXPECT_EQ(storeDataImmSize, streamUsed);

    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(parsedCommands, stream->getCpuBase(), streamUsed));

    auto storeCommands = findAll<MI_STORE_DATA_IMM *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_NE(0u, storeCommands.size());

    uint32_t storeCount = 0;
    for (auto &storeIt : storeCommands) {
        auto storeDataImm = genCmdCast<MI_STORE_DATA_IMM *>(*storeIt);
        EXPECT_EQ(expectedAddress, storeDataImm->getAddress());
        EXPECT_FALSE(storeDataImm->getStoreQword());
        EXPECT_EQ(Event::STATE_SIGNALED, storeDataImm->getDataDword0());
        EXPECT_EQ(0u, storeDataImm->getDataDword1());
        EXPECT_EQ(MI_STORE_DATA_IMM::DWORD_LENGTH::DWORD_LENGTH_STORE_DWORD, storeDataImm->getDwordLength());
        EXPECT_TRUE(storeDataImm->getWorkloadPartitionIdOffsetEnable());
        storeCount++;
        // Any further store would be expected one packet further on.
        expectedAddress += event->getSinglePacketSize();
    }
    EXPECT_EQ(1u, storeCount);
}
// Uses a locally created compute command list configured with two partitions and calls
// appendSignalEventPostWalker directly for a host-scoped, non-timestamp event. Checks that exactly
// one PIPE_CONTROL with an immediate-data post-sync write is emitted, targeting the event's GPU
// address with the workload partition offset enabled, and that the event reports two packets in use.
HWTEST2_F(CommandListAppendSignalEvent,
          givenMultiTileCommandListWhenAppendingScopeEventSignalAfterWalkerThenExpectPartitionedPipeControl, IsAtLeastXeHpCore) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
    using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;

    // A dedicated list is created here instead of using the fixture's command list.
    auto localCmdList = std::make_unique<::L0::ult::CommandListCoreFamily<gfxCoreFamily>>();
    ASSERT_NE(nullptr, localCmdList);
    EXPECT_EQ(ZE_RESULT_SUCCESS, localCmdList->initialize(device, NEO::EngineGroupType::Compute, 0u));

    // Take everything but sizeof(MI_BATCH_BUFFER_END) from the stream before appending.
    // NOTE(review): presumably this forces the append path to reserve fresh space, which is why the
    // used size checked below equals the size of the single command - confirm against the container.
    auto stream = localCmdList->commandContainer.getCommandStream();
    const size_t fillSize = stream->getAvailableSpace() - sizeof(MI_BATCH_BUFFER_END);
    stream->getSpace(fillSize);

    constexpr uint32_t tileCount = 2u;

    event->setEventTimestampFlag(false);
    event->signalScope = ZE_EVENT_SCOPE_FLAG_HOST;
    localCmdList->partitionCount = tileCount;

    localCmdList->appendSignalEventPostWalker(event->toHandle());
    EXPECT_EQ(tileCount, event->getPacketsInUse());

    auto expectedAddress = event->getGpuAddress(device);

    auto &hardwareInfo = device->getNEODevice()->getHardwareInfo();
    const size_t pipeControlSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hardwareInfo);
    const size_t streamUsed = stream->getUsed();
    EXPECT_EQ(pipeControlSize, streamUsed);

    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(parsedCommands, stream->getCpuBase(), streamUsed));

    auto pipeControls = findAll<PIPE_CONTROL *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_NE(0u, pipeControls.size());

    uint32_t postSyncCount = 0;
    for (auto &pipeControlIt : pipeControls) {
        auto pipeControl = genCmdCast<PIPE_CONTROL *>(*pipeControlIt);
        // Only pipe controls carrying the immediate-data post-sync write are of interest.
        if (pipeControl->getPostSyncOperation() != POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) {
            continue;
        }
        EXPECT_EQ(Event::STATE_SIGNALED, pipeControl->getImmediateData());
        EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
        EXPECT_EQ(expectedAddress, NEO::UnitTestHelper<FamilyType>::getPipeControlPostSyncAddress(*pipeControl));
        EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::isDcFlushAllowed(), pipeControl->getDcFlushEnable());
        EXPECT_TRUE(pipeControl->getWorkloadPartitionIdOffsetEnable());
        postSyncCount++;
        // Any further post-sync write would be expected one packet further on.
        expectedAddress += event->getSinglePacketSize();
    }
    EXPECT_EQ(1u, postSyncCount);
}
} // namespace ult
} // namespace L0

View File

@ -21,6 +21,13 @@
#include "level_zero/core/test/unit_tests/mocks/mock_device.h"
#include "level_zero/core/test/unit_tests/mocks/mock_event.h"
#include <atomic>
// Bookkeeping variables of the test build's CPU-intrinsics stub, declared here so the event tests
// can reset them before a call and inspect them afterwards.
namespace CpuIntrinsicsTests {
// Address passed to the most recent clFlush call, stored as an integer.
extern std::atomic<uintptr_t> lastClFlushedPtr;
// Number of clFlush calls made so far; tests zero it before the action under test.
extern std::atomic<uint32_t> clFlushCounter;
} // namespace CpuIntrinsicsTests
namespace L0 {
namespace ult {
using EventPoolCreate = Test<DeviceFixture>;
@ -646,7 +653,7 @@ TEST_F(EventPoolIPCEventResetTests, whenOpeningIpcHandleForEventPoolCreateWithIp
EXPECT_EQ(*hostAddr, Event::STATE_INITIAL);
// change state
event0->hostEventSetValue(Event::STATE_SIGNALED);
event0->hostSignal();
hostAddr = static_cast<uint32_t *>(event0->getHostAddress());
EXPECT_EQ(*hostAddr, Event::STATE_SIGNALED);
@ -1324,6 +1331,36 @@ TEST_F(EventTests, givenTwoEventsCreatedThenTheyHaveDifferentAddresses) {
event1->destroy();
}
// Host-signals a non-timestamp event that was told it uses four packets and verifies that each of
// the four packets reads STATE_SIGNALED, that clFlush was called once per packet, and that the last
// flushed address is the address of the final packet.
TEST_F(EventTests, givenRegularEventUseMultiplePacketsWhenHostSignalThenExpectAllPacketsAreSignaled) {
    using EventType = L0::EventImp<uint32_t>;

    eventDesc.index = 0;
    eventDesc.signal = 0;
    eventDesc.wait = 0;

    std::unique_ptr<EventType> multiPacketEvent(
        static_cast<EventType *>(L0::Event::create<uint32_t>(eventPool, &eventDesc, device)));
    ASSERT_NE(multiPacketEvent, nullptr);

    // A freshly created event starts in the initial state with a single packet in use.
    uint32_t *packetPtr = static_cast<uint32_t *>(multiPacketEvent->getHostAddress());
    EXPECT_EQ(*packetPtr, Event::STATE_INITIAL);
    EXPECT_EQ(1u, multiPacketEvent->getPacketsInUse());

    constexpr uint32_t packetsUsed = 4u;
    multiPacketEvent->setPacketsInUse(packetsUsed);
    multiPacketEvent->setEventTimestampFlag(false);

    // Reset the flush bookkeeping so only the flushes issued by hostSignal are counted.
    CpuIntrinsicsTests::lastClFlushedPtr = 0u;
    CpuIntrinsicsTests::clFlushCounter = 0u;

    multiPacketEvent->hostSignal();

    uint32_t packetsChecked = 0u;
    while (packetsChecked < packetsUsed) {
        EXPECT_EQ(Event::STATE_SIGNALED, *packetPtr);
        packetPtr = ptrOffset(packetPtr, multiPacketEvent->getSinglePacketSize());
        ++packetsChecked;
    }

    // packetPtr now sits one packet past the last one checked; step back to get the final packet.
    uintptr_t lastPacketAddress = reinterpret_cast<uintptr_t>(packetPtr) - multiPacketEvent->getSinglePacketSize();
    EXPECT_EQ(lastPacketAddress, CpuIntrinsicsTests::lastClFlushedPtr);
    EXPECT_EQ(packetsUsed, CpuIntrinsicsTests::clFlushCounter);
}
struct EventSizeFixture : public DeviceFixture {
void SetUp() {
DeviceFixture::SetUp();

View File

@ -16,6 +16,7 @@
namespace CpuIntrinsicsTests {
//std::atomic is used for sake of sanitation in MT tests
std::atomic<uintptr_t> lastClFlushedPtr(0u);
std::atomic<uint32_t> clFlushCounter(0u);
std::atomic<uint32_t> pauseCounter(0u);
volatile uint32_t *pauseAddress = nullptr;
@ -29,6 +30,7 @@ namespace NEO {
namespace CpuIntrinsics {
// Test replacement for the cache-line flush: performs no flush, only counts the call and records
// the address it was given so tests can check both.
void clFlush(void const *ptr) {
    ++CpuIntrinsicsTests::clFlushCounter;
    CpuIntrinsicsTests::lastClFlushedPtr.store(reinterpret_cast<uintptr_t>(ptr));
}