Use Semaphore to wait for dependencies on the same device

Change-Id: Ib04c960c50183c080d02753815ece80b58d1980e
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2018-09-07 09:09:24 +02:00
committed by sys_ocldev
parent 393ce116e7
commit d04614dce3
8 changed files with 158 additions and 5 deletions

View File

@ -206,7 +206,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
auto taskLevel = 0u;
obtainTaskLevelAndBlockedStatus(taskLevel, numEventsInWaitList, eventWaitList, blockQueue, commandType);
auto &commandStream = getCommandStream<GfxFamily, commandType>(*this, profilingRequired, perfCountersRequired, multiDispatchInfo);
auto &commandStream = getCommandStream<GfxFamily, commandType>(*this, numEventsInWaitList, profilingRequired, perfCountersRequired, multiDispatchInfo);
auto commandStreamStart = commandStream.getUsed();
DBG_LOG(EventsDebugEnable, "blockQueue", blockQueue, "virtualEvent", virtualEvent, "taskLevel", taskLevel);

View File

@ -261,6 +261,9 @@ class GpgpuWalkerHelper {
SchedulerKernel &scheduler,
IndirectHeap *ssh,
IndirectHeap *dsh);
// Programs one MI_SEMAPHORE_WAIT into commandStream for each waitlist event that
// is not a user event and whose command queue runs on currentDevice; all other
// waitlist entries are skipped.
static void dispatchOnDeviceWaitlistSemaphores(LinearStream *commandStream, Device &currentDevice,
cl_uint numEventsInWaitList, const cl_event *eventWaitList);
};
template <typename GfxFamily>
@ -282,7 +285,7 @@ LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfiling
}
template <typename GfxFamily, uint32_t eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
LinearStream &getCommandStream(CommandQueue &commandQueue, cl_uint numEventsInWaitList, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
size_t expectedSizeCS = 0;
Kernel *parentKernel = multiDispatchInfo.peekParentKernel();
for (auto &dispatchInfo : multiDispatchInfo) {
@ -294,6 +297,7 @@ LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfiling
}
if (commandQueue.getDevice().peekCommandStreamReceiver()->peekTimestampPacketWriteEnabled()) {
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite();
expectedSizeCS += numEventsInWaitList * sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT);
}
return commandQueue.getCS(expectedSizeCS);
}

View File

@ -494,6 +494,11 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
}
if (commandQueue.getDevice().peekCommandStreamReceiver()->peekTimestampPacketWriteEnabled()) {
GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(commandStream, commandQueue.getDevice(),
numEventsInWaitList, eventWaitList);
}
dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);
uint32_t interfaceDescriptorIndex = 0;
@ -645,6 +650,28 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
}
// Emits an MI_SEMAPHORE_WAIT into commandStream for every waitlist event that is
// not a user event and whose command queue belongs to currentDevice.
//
// Each semaphore is initialized from MI_SEMAPHORE_WAIT::sInit() and then set up with
// COMPARE_OPERATION_SAD_NOT_EQUAL_SDD, a data dword of 1 and the ContextEnd
// data-write address of the event's timestamp packet.
//
// NOTE(review): the timestamp packet returned by the event is dereferenced without
// a null check - presumably every non-user event on this device carries one; confirm.
template <typename GfxFamily>
inline void GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(LinearStream *commandStream, Device &currentDevice,
                                                                             cl_uint numEventsInWaitList, const cl_event *eventWaitList) {
    using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;

    for (cl_uint eventIndex = 0; eventIndex < numEventsInWaitList; eventIndex++) {
        auto waitlistEvent = castToObjectOrAbort<Event>(eventWaitList[eventIndex]);

        // The user-event test must come first: the command queue is only queried
        // for events that are not user events.
        const bool needsSemaphore = !waitlistEvent->isUserEvent() &&
                                    (&waitlistEvent->getCommandQueue()->getDevice() == &currentDevice);
        if (needsSemaphore) {
            auto packet = waitlistEvent->getTimestampPacket();
            auto contextEndAddress = packet->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd);

            auto semaphore = commandStream->getSpaceForCmd<MI_SEMAPHORE_WAIT>();
            *semaphore = MI_SEMAPHORE_WAIT::sInit();
            semaphore->setCompareOperation(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
            semaphore->setSemaphoreDataDword(1);
            semaphore->setSemaphoreGraphicsAddress(contextEndAddress);
        }
    }
}
template <typename GfxFamily>
inline void GpgpuWalkerHelper<GfxFamily>::getDefaultDshSpace(
const size_t &offsetInterfaceDescriptorTable,

View File

@ -719,4 +719,5 @@ void Event::setTimestampPacketNode(TagNode<TimestampPacket> *node) {
timestampPacketNode = node;
}
// Returns the tag held by the event's timestamp packet node.
// The node pointer is dereferenced unconditionally.
TimestampPacket *Event::getTimestampPacket() const {
    return timestampPacketNode->tag;
}
} // namespace OCLRT

View File

@ -127,6 +127,7 @@ class Event : public BaseObject<_cl_event>, public IDNode<Event> {
GraphicsAllocation *getHwTimeStampAllocation();
void setTimestampPacketNode(TagNode<TimestampPacket> *node);
// Returns the tag of the timestamp packet node assigned to this event.
TimestampPacket *getTimestampPacket() const;
bool isPerfCountersEnabled() {
return perfCountersEnabled;

View File

@ -241,7 +241,7 @@ HWTEST_F(ParentKernelCommandStreamFixture, GivenDispatchInfoWithParentKernelWhen
size_t totalKernelSize = alignUp(numOfKernels * size, MemoryConstants::pageSize);
LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, multiDispatchInfo);
LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, 0, false, false, multiDispatchInfo);
EXPECT_LT(totalKernelSize, commandStream.getMaxAvailableSpace());

View File

@ -134,16 +134,39 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWriteEnabl
MockMultiDispatchInfo multiDispatchInfo(std::vector<Kernel *>({kernel1.mockKernel, kernel2.mockKernel}));
device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = false;
getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, false, false, multiDispatchInfo);
getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, 0, false, false, multiDispatchInfo);
auto sizeWithDisabled = cmdQ.requestedCmdStreamSize;
device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, false, false, multiDispatchInfo);
getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, 0, false, false, multiDispatchInfo);
auto sizeWithEnabled = cmdQ.requestedCmdStreamSize;
EXPECT_EQ(sizeWithEnabled, sizeWithDisabled + (2 * sizeof(typename FamilyType::PIPE_CONTROL)));
}
// Verifies that, with timestamp packet writes enabled, the requested command stream
// size grows by two PIPE_CONTROLs plus one MI_SEMAPHORE_WAIT per waitlist event
// compared to the size requested with the feature disabled.
HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenEstimatingStreamSizeWithWaitlistThenAddSizeForSemaphores) {
    auto device = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
    MockCommandQueue cmdQ(nullptr, device.get(), nullptr);
    MockKernelWithInternals firstKernel(*device);
    MockKernelWithInternals secondKernel(*device);
    MockMultiDispatchInfo multiDispatchInfo(std::vector<Kernel *>({firstKernel.mockKernel, secondKernel.mockKernel}));
    cl_uint numEventsOnWaitlist = 5;

    // Toggles the feature, asks for a command stream and reports the size the queue was asked for.
    auto requestedSizeFor = [&](bool timestampPacketWriteEnabled) {
        device->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = timestampPacketWriteEnabled;
        getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(cmdQ, numEventsOnWaitlist, false, false, multiDispatchInfo);
        return cmdQ.requestedCmdStreamSize;
    };

    auto sizeWithDisabled = requestedSizeFor(false);
    auto sizeWithEnabled = requestedSizeFor(true);

    size_t pipeControlsSize = 2 * sizeof(typename FamilyType::PIPE_CONTROL);
    size_t semaphoresSize = numEventsOnWaitlist * sizeof(typename FamilyType::MI_SEMAPHORE_WAIT);
    size_t extendedSize = sizeWithDisabled + pipeControlsSize + semaphoresSize;

    EXPECT_EQ(sizeWithEnabled, extendedSize);
}
HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWhenDispatchingGpuWalkerThenAddTwoPcForLastWalker) {
using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
@ -295,3 +318,85 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TimestampPacketTests, givenTimestampPacketWriteEnabl
}
EXPECT_TRUE(walkerFound);
}
// Dispatches a walker on a queue of device1 with a six-entry waitlist: two user
// events, two events created on a device1 queue and two created on a device2 queue.
// Expects exactly two MI_SEMAPHORE_WAIT commands - one per device1 event, in
// waitlist order - and expects both to be programmed before the single walker.
HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteEnabledWhenDispatchingThenProgramSemaphoresForWaitlist) {
    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
    using WALKER = WALKER_TYPE<FamilyType>;

    ExecutionEnvironment executionEnvironment;
    executionEnvironment.incRefInternal();
    auto device1 = std::unique_ptr<MockDevice>(Device::create<MockDevice>(nullptr, &executionEnvironment));
    auto device2 = std::unique_ptr<MockDevice>(Device::create<MockDevice>(nullptr, &executionEnvironment));

    MockKernelWithInternals kernel(*device1);
    device1->getUltCommandStreamReceiver<FamilyType>().timestampPacketWriteEnabled = true;
    MockContext context1(device1.get());
    MockContext context2(device2.get());

    MockMultiDispatchInfo multiDispatchInfo(std::vector<Kernel *>({kernel.mockKernel}));

    MockCommandQueue cmdQ1(&context1, device1.get(), nullptr);
    MockCommandQueue cmdQ2(&context2, device2.get(), nullptr);
    auto &cmdStream = cmdQ1.getCS(0);

    const cl_uint eventsOnWaitlist = 6;
    TagNode<TimestampPacket> *tagNodes[eventsOnWaitlist];
    for (auto &node : tagNodes) {
        node = executionEnvironment.memoryManager->getTimestampPacketAllocator()->getTag();
    }

    // Waitlist slots 0 and 1: user events (no timestamp packet node is attached to them here).
    UserEvent userEventA;
    UserEvent userEventB;
    // Slots 2 and 4 come from a device1 queue, slots 3 and 5 from a device2 queue.
    Event sameDeviceEventA(&cmdQ1, 0, 0, 0);
    sameDeviceEventA.setTimestampPacketNode(tagNodes[2]);
    Event otherDeviceEventA(&cmdQ2, 0, 0, 0);
    otherDeviceEventA.setTimestampPacketNode(tagNodes[3]);
    Event sameDeviceEventB(&cmdQ1, 0, 0, 0);
    sameDeviceEventB.setTimestampPacketNode(tagNodes[4]);
    Event otherDeviceEventB(&cmdQ2, 0, 0, 0);
    otherDeviceEventB.setTimestampPacketNode(tagNodes[5]);

    cl_event waitlist[] = {&userEventA, &userEventB, &sameDeviceEventA, &otherDeviceEventA, &sameDeviceEventB, &otherDeviceEventB};

    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
        cmdQ1,
        multiDispatchInfo,
        eventsOnWaitlist,
        waitlist,
        nullptr,
        nullptr,
        nullptr,
        nullptr,
        device1->getPreemptionMode(),
        false);

    HardwareParse hwParser;
    hwParser.parseCommands<FamilyType>(cmdStream, 0);

    // Checks a parsed semaphore against the timestamp packet of the event it should wait on.
    auto verifySemaphore = [](MI_SEMAPHORE_WAIT *semaphoreCmd, Event *compareEvent) {
        EXPECT_EQ(semaphoreCmd->getCompareOperation(), MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
        EXPECT_EQ(1u, semaphoreCmd->getSemaphoreDataDword());
        EXPECT_EQ(compareEvent->getTimestampPacket()->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd),
                  semaphoreCmd->getSemaphoreGraphicsAddress());
    };

    // Events the first and second semaphore are expected to reference, in order.
    Event *expectedEvents[] = {&sameDeviceEventA, &sameDeviceEventB};
    const uint32_t expectedSemaphores = 2;

    uint32_t semaphoresFound = 0;
    uint32_t walkersFound = 0;

    for (auto &&cmd : hwParser.cmdList) {
        if (auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(cmd)) {
            semaphoresFound++;
            if (semaphoresFound <= expectedSemaphores) {
                verifySemaphore(semaphoreCmd, expectedEvents[semaphoresFound - 1]);
            }
        }
        if (genCmdCast<WALKER *>(cmd)) {
            walkersFound++;
            EXPECT_EQ(2u, semaphoresFound); // semaphores from events programmed before walker
        }
    }

    EXPECT_EQ(1u, walkersFound);
    EXPECT_EQ(2u, semaphoresFound); // total number of semaphores found in cmdList
}

View File

@ -343,6 +343,21 @@ struct GENX {
}
} STATE_SIP;
// Minimal stand-in for the MI_SEMAPHORE_WAIT command: it exposes the single compare
// operation value used by callers, an sInit() factory, and setters that ignore
// their argument.
typedef struct tagMI_SEMAPHORE_WAIT {
    typedef enum tagCOMPARE_OPERATION {
        COMPARE_OPERATION_SAD_NOT_EQUAL_SDD = 0x5,
    } COMPARE_OPERATION;

    // Returns a default-initialized command; the struct carries no data members.
    static tagMI_SEMAPHORE_WAIT sInit() {
        tagMI_SEMAPHORE_WAIT initialState;
        return initialState;
    }

    // No-op setters: the values are accepted and discarded.
    void setCompareOperation(COMPARE_OPERATION) {}
    void setSemaphoreDataDword(uint32_t) {}
    void setSemaphoreGraphicsAddress(uint64_t) {}
} MI_SEMAPHORE_WAIT;
typedef GPGPU_WALKER WALKER_TYPE;
static GPGPU_WALKER cmdInitGpgpuWalker;
static INTERFACE_DESCRIPTOR_DATA cmdInitInterfaceDescriptorData;