fix: submit marker for cpu transfer with event on IOQ queue

Related-To: NEO-8081

Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
This commit is contained in:
Maciej Plewka
2023-08-04 12:10:12 +00:00
committed by Compute-Runtime-Automation
parent f3b2458a9c
commit 2e249d819e
8 changed files with 197 additions and 18 deletions

View File

@@ -27,6 +27,7 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
EventBuilder eventBuilder; EventBuilder eventBuilder;
bool eventCompleted = false; bool eventCompleted = false;
bool mapOperation = transferProperties.cmdType == CL_COMMAND_MAP_BUFFER || transferProperties.cmdType == CL_COMMAND_MAP_IMAGE; bool mapOperation = transferProperties.cmdType == CL_COMMAND_MAP_BUFFER || transferProperties.cmdType == CL_COMMAND_MAP_IMAGE;
ErrorCodeHelper err(&retVal, CL_SUCCESS); ErrorCodeHelper err(&retVal, CL_SUCCESS);
if (mapOperation) { if (mapOperation) {
@@ -45,8 +46,18 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
} }
transferProperties.memObj->removeMappedPtr(unmapInfo.ptr); transferProperties.memObj->removeMappedPtr(unmapInfo.ptr);
} }
auto blockQueue = false;
TaskCountType taskLevel = 0u;
TakeOwnershipWrapper<CommandQueue> queueOwnership(*this);
auto commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
obtainTaskLevelAndBlockedStatus(taskLevel, eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, blockQueue, transferProperties.cmdType);
bool isMarkerRequiredForEventSignal = !blockQueue &&
!transferProperties.blocking &&
!transferProperties.finishRequired &&
!isOOQEnabled() &&
eventsRequest.outEvent != nullptr;
if (eventsRequest.outEvent) { if (eventsRequest.outEvent && !isMarkerRequiredForEventSignal) {
eventBuilder.create<Event>(this, transferProperties.cmdType, CompletionStamp::notReady, CompletionStamp::notReady); eventBuilder.create<Event>(this, transferProperties.cmdType, CompletionStamp::notReady, CompletionStamp::notReady);
outEventObj = eventBuilder.getEvent(); outEventObj = eventBuilder.getEvent();
outEventObj->setQueueTimeStamp(); outEventObj->setQueueTimeStamp();
@@ -54,13 +65,6 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
*eventsRequest.outEvent = outEventObj; *eventsRequest.outEvent = outEventObj;
} }
TakeOwnershipWrapper<CommandQueue> queueOwnership(*this);
auto commandStreamReceiverOwnership = getGpgpuCommandStreamReceiver().obtainUniqueOwnership();
auto blockQueue = false;
TaskCountType taskLevel = 0u;
obtainTaskLevelAndBlockedStatus(taskLevel, eventsRequest.numEventsInWaitList, eventsRequest.eventWaitList, blockQueue, transferProperties.cmdType);
DBG_LOG(LogTaskCounts, __FUNCTION__, "taskLevel", taskLevel); DBG_LOG(LogTaskCounts, __FUNCTION__, "taskLevel", taskLevel);
if (outEventObj) { if (outEventObj) {
@@ -81,9 +85,10 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
mapOperation ? transferProperties.mapFlags == CL_MAP_READ : unmapInfo.readOnly, mapOperation ? transferProperties.mapFlags == CL_MAP_READ : unmapInfo.readOnly,
eventBuilder); eventBuilder);
} }
if (!isMarkerRequiredForEventSignal) {
commandStreamReceiverOwnership.unlock(); commandStreamReceiverOwnership.unlock();
queueOwnership.unlock(); queueOwnership.unlock();
}
// read/write buffers are always blocking // read/write buffers are always blocking
if (!blockQueue || transferProperties.blocking) { if (!blockQueue || transferProperties.blocking) {
@@ -94,11 +99,9 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
outEventObj->setSubmitTimeStamp(); outEventObj->setSubmitTimeStamp();
} }
// wait for the completeness of previous commands // wait for the completeness of previous commands
if (transferProperties.cmdType != CL_COMMAND_UNMAP_MEM_OBJECT) { if (transferProperties.finishRequired) {
if (!transferProperties.memObj->isMemObjZeroCopy() || transferProperties.blocking) { finish();
finish(); eventCompleted = true;
eventCompleted = true;
}
} }
if (outEventObj) { if (outEventObj) {
@@ -158,6 +161,12 @@ void *CommandQueue::cpuDataTransferHandler(TransferProperties &transferPropertie
} else { } else {
outEventObj->updateExecutionStatus(); outEventObj->updateExecutionStatus();
} }
} else if (isMarkerRequiredForEventSignal) {
enqueueMarkerWithWaitList(0, nullptr, eventsRequest.outEvent);
commandStreamReceiverOwnership.unlock();
queueOwnership.unlock();
outEventObj = castToObject<Event>(*eventsRequest.outEvent);
outEventObj->setCmdType(transferProperties.cmdType);
} }
if (modifySimulationFlags) { if (modifySimulationFlags) {
auto graphicsAllocation = transferProperties.memObj->getGraphicsAllocation(getDevice().getRootDeviceIndex()); auto graphicsAllocation = transferProperties.memObj->getGraphicsAllocation(getDevice().getRootDeviceIndex());

View File

@@ -103,6 +103,7 @@ TransferProperties::TransferProperties(MemObj *memObj, cl_command_type cmdType,
: memObj(memObj), ptr(ptr), cmdType(cmdType), mapFlags(mapFlags), blocking(blocking), doTransferOnCpu(doTransferOnCpu) { : memObj(memObj), ptr(ptr), cmdType(cmdType), mapFlags(mapFlags), blocking(blocking), doTransferOnCpu(doTransferOnCpu) {
// no size or offset passed for unmap operation // no size or offset passed for unmap operation
if (cmdType != CL_COMMAND_UNMAP_MEM_OBJECT) { if (cmdType != CL_COMMAND_UNMAP_MEM_OBJECT) {
finishRequired = !memObj->isMemObjZeroCopy() || blocking;
if (memObj->peekClMemObjType() == CL_MEM_OBJECT_BUFFER) { if (memObj->peekClMemObjType() == CL_MEM_OBJECT_BUFFER) {
size[0] = *sizePtr; size[0] = *sizePtr;
offset[0] = *offsetPtr; offset[0] = *offsetPtr;

View File

@@ -53,6 +53,7 @@ struct TransferProperties {
uint32_t mipPtrOffset = 0; uint32_t mipPtrOffset = 0;
bool blocking = false; bool blocking = false;
bool doTransferOnCpu = false; bool doTransferOnCpu = false;
bool finishRequired = false;
void *getCpuPtrForReadWrite(); void *getCpuPtrForReadWrite();
}; };

View File

@@ -17,6 +17,7 @@
#include "shared/test/common/mocks/mock_timestamp_container.h" #include "shared/test/common/mocks/mock_timestamp_container.h"
#include "shared/test/common/utilities/base_object_utils.h" #include "shared/test/common/utilities/base_object_utils.h"
#include "opencl/source/helpers/task_information.h"
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h" #include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
#include "opencl/test/unit_test/fixtures/buffer_fixture.h" #include "opencl/test/unit_test/fixtures/buffer_fixture.h"
#include "opencl/test/unit_test/fixtures/image_fixture.h" #include "opencl/test/unit_test/fixtures/image_fixture.h"
@@ -1270,3 +1271,158 @@ HWTEST_F(CommandQueueHwTest, givenRelaxedOrderingEnabledWhenCheckingIfAllowedByC
EXPECT_FALSE(mockCmdQueueHw.relaxedOrderingForGpgpuAllowed(0)); EXPECT_FALSE(mockCmdQueueHw.relaxedOrderingForGpgpuAllowed(0));
EXPECT_FALSE(mockCmdQueueHw.relaxedOrderingForGpgpuAllowed(1)); EXPECT_FALSE(mockCmdQueueHw.relaxedOrderingForGpgpuAllowed(1));
} }
// Runs a non-blocking CPU map on a queue whose task level is CompletionStamp::notReady and
// verifies that cpuDataTransferHandler does not call enqueueMarkerWithWaitList.
// NOTE(review): presumably the notReady task level is what makes the queue count as blocked - confirm
// against obtainTaskLevelAndBlockedStatus.
HWTEST_F(CommandQueueHwTest, givenBlockedCommandQueueWhenTransferOnCpuThenEnqueueMarkerIsNotCalled) {
    auto commandQueue = std::make_unique<MockCommandQueueHw<FamilyType>>(context, pClDevice, nullptr);
    MockGraphicsAllocation alloc{};
    auto buffer = std::make_unique<MockBuffer>(context, alloc);
    cl_event returnEvent = nullptr;
    auto retVal = CL_SUCCESS;
    commandQueue->taskLevel = CompletionStamp::notReady;
    size_t offset = 0;
    size_t size = 4096u;

    TransferProperties transferProperties(buffer.get(), CL_COMMAND_MAP_BUFFER, 0, false, &offset, &size, nullptr, false, pDevice->getRootDeviceIndex());
    EventsRequest eventsRequest(0, nullptr, &returnEvent);
    commandQueue->cpuDataTransferHandler(transferProperties, eventsRequest, retVal);
    EXPECT_EQ(0, commandQueue->enqueueMarkerWithWaitListCalledCount);

    // The event is dereferenced below; abort the test with a readable failure instead of
    // crashing the test binary when no event was returned.
    ASSERT_NE(nullptr, returnEvent);
    auto retEvent = reinterpret_cast<MockEvent<Event> *>(castToObject<Event>(returnEvent));
    ASSERT_NE(nullptr, retEvent);

    // Detach the command stored in the event; the unique_ptr deletes it when the test scope ends.
    [[maybe_unused]] auto cmd = std::unique_ptr<Command>(retEvent->cmdToSubmit.exchange(nullptr));
    clReleaseEvent(returnEvent);
}
// Issues a CPU map with `true` as the fourth TransferProperties argument (the blocking flag, judging
// by the test name) on a queue created without properties and with task level 0, and verifies that
// no marker is enqueued.
HWTEST_F(CommandQueueHwTest, givenCommandQueueWhenCpuTransferIsBlockedThenEnqueueMarkerIsNotCalled) {
    auto mockQueue = std::make_unique<MockCommandQueueHw<FamilyType>>(context, pClDevice, nullptr);
    mockQueue->taskLevel = 0u;

    MockGraphicsAllocation graphicsAllocation{};
    auto mockBuffer = std::make_unique<MockBuffer>(context, graphicsAllocation);

    cl_event outEvent = nullptr;
    cl_int status = CL_SUCCESS;
    size_t mapOffset = 0;
    size_t mapSize = 4096u;

    // Give the buffer real host memory to serve as both host pointer and storage.
    auto hostStorage = std::make_unique<uint8_t[]>(mapSize);
    mockBuffer->hostPtr = hostStorage.get();
    mockBuffer->memoryStorage = hostStorage.get();

    TransferProperties mapProperties(mockBuffer.get(), CL_COMMAND_MAP_BUFFER, 0, true, &mapOffset, &mapSize, nullptr, false, pDevice->getRootDeviceIndex());
    EventsRequest request(0, nullptr, &outEvent);
    mockQueue->cpuDataTransferHandler(mapProperties, request, status);

    EXPECT_EQ(0, mockQueue->enqueueMarkerWithWaitListCalledCount);
    clReleaseEvent(outEvent);
}
// Maps a buffer whose isZeroCopy flag is cleared, requesting an out event, and verifies that
// cpuDataTransferHandler does not enqueue a marker for it.
HWTEST_F(CommandQueueHwTest, givenCommandQueueWhenCpuTransferOperationIsOtherThanUnmapAndMemoryIsNotZeroCopyCommandThenEnqueueMarkerIsNotCalled) {
    auto mockQueue = std::make_unique<MockCommandQueueHw<FamilyType>>(context, pClDevice, nullptr);
    mockQueue->taskLevel = 0u;

    MockGraphicsAllocation graphicsAllocation{};
    auto mockBuffer = std::make_unique<MockBuffer>(context, graphicsAllocation);
    // Must be cleared before TransferProperties is constructed from the buffer.
    mockBuffer->isZeroCopy = false;

    cl_event outEvent = nullptr;
    cl_int status = CL_SUCCESS;
    size_t mapOffset = 0;
    size_t mapSize = 4096u;

    // Give the buffer real host memory to serve as both host pointer and storage.
    auto hostStorage = std::make_unique<uint8_t[]>(mapSize);
    mockBuffer->hostPtr = hostStorage.get();
    mockBuffer->memoryStorage = hostStorage.get();

    TransferProperties mapProperties(mockBuffer.get(), CL_COMMAND_MAP_BUFFER, 0, false, &mapOffset, &mapSize, nullptr, false, pDevice->getRootDeviceIndex());
    EventsRequest request(0, nullptr, &outEvent);
    mockQueue->cpuDataTransferHandler(mapProperties, request, status);

    EXPECT_EQ(0, mockQueue->enqueueMarkerWithWaitListCalledCount);
    clReleaseEvent(outEvent);
}
// Creates the queue with CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, runs a CPU map with an out event
// and verifies that no marker is enqueued.
HWTEST_F(CommandQueueHwTest, givenOOQWhenCpuTransferIsCalledThenEnqueueMarkerIsNotCalled) {
    cl_queue_properties outOfOrderProperties[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0};
    auto mockQueue = std::make_unique<MockCommandQueueHw<FamilyType>>(context, pClDevice, outOfOrderProperties);
    mockQueue->taskLevel = 0u;

    MockGraphicsAllocation graphicsAllocation{};
    auto mockBuffer = std::make_unique<MockBuffer>(context, graphicsAllocation);

    cl_event outEvent = nullptr;
    cl_int status = CL_SUCCESS;
    size_t mapOffset = 0;
    size_t mapSize = 4096u;

    // Give the buffer real host memory to serve as both host pointer and storage.
    auto hostStorage = std::make_unique<uint8_t[]>(mapSize);
    mockBuffer->hostPtr = hostStorage.get();
    mockBuffer->memoryStorage = hostStorage.get();

    TransferProperties mapProperties(mockBuffer.get(), CL_COMMAND_MAP_BUFFER, 0, false, &mapOffset, &mapSize, nullptr, false, pDevice->getRootDeviceIndex());
    EventsRequest request(0, nullptr, &outEvent);
    mockQueue->cpuDataTransferHandler(mapProperties, request, status);

    EXPECT_EQ(0, mockQueue->enqueueMarkerWithWaitListCalledCount);
    clReleaseEvent(outEvent);
}
// Runs a CPU map without requesting an out event (EventsRequest gets a null event pointer) and
// verifies that no marker is enqueued.
HWTEST_F(CommandQueueHwTest, givenCommandQueueWhenOutEventIsNotPassedToCpuTransferThenEnqueueMarkerIsNotCalled) {
    auto mockQueue = std::make_unique<MockCommandQueueHw<FamilyType>>(context, pClDevice, nullptr);
    mockQueue->taskLevel = 0u;

    MockGraphicsAllocation graphicsAllocation{};
    auto mockBuffer = std::make_unique<MockBuffer>(context, graphicsAllocation);

    cl_int status = CL_SUCCESS;
    size_t mapOffset = 0;
    size_t mapSize = 4096u;

    // Give the buffer real host memory to serve as both host pointer and storage.
    auto hostStorage = std::make_unique<uint8_t[]>(mapSize);
    mockBuffer->hostPtr = hostStorage.get();
    mockBuffer->memoryStorage = hostStorage.get();

    TransferProperties mapProperties(mockBuffer.get(), CL_COMMAND_MAP_BUFFER, 0, false, &mapOffset, &mapSize, nullptr, false, pDevice->getRootDeviceIndex());
    EventsRequest request(0, nullptr, nullptr);
    mockQueue->cpuDataTransferHandler(mapProperties, request, status);

    EXPECT_EQ(0, mockQueue->enqueueMarkerWithWaitListCalledCount);
}
// Registers a mapped pointer on the buffer, then unmaps it through cpuDataTransferHandler with an
// out event on a queue created without properties, and verifies that exactly one marker is enqueued.
HWTEST_F(CommandQueueHwTest, givenNotBlockedIOQWhenCpuTransferIsNotBlockedOutEventPassedCommandTypeIsUnmapAndMemoryIsZeroCopyThenEnqueueMarkerIsCalled) {
    auto mockQueue = std::make_unique<MockCommandQueueHw<FamilyType>>(context, pClDevice, nullptr);
    mockQueue->taskLevel = 0u;

    MockGraphicsAllocation graphicsAllocation{};
    auto mockBuffer = std::make_unique<MockBuffer>(context, graphicsAllocation);

    cl_event outEvent = nullptr;
    cl_int status = CL_SUCCESS;
    size_t mapOffset = 0;
    size_t mapSize = 4096u;

    TransferProperties unmapProperties(mockBuffer.get(), CL_COMMAND_UNMAP_MEM_OBJECT, 0, false, &mapOffset, &mapSize, nullptr, false, pDevice->getRootDeviceIndex());

    // Record a mapping on the mem object so that the unmap below has something to remove.
    auto *mappedObj = unmapProperties.memObj;
    auto mappedPtr = ptrOffset(mappedObj->getCpuAddressForMapping(),
                               mappedObj->calculateOffsetForMapping(unmapProperties.offset) + unmapProperties.mipPtrOffset);
    mappedObj->addMappedPtr(mappedPtr, mappedObj->calculateMappedPtrLength(unmapProperties.size),
                            unmapProperties.mapFlags, unmapProperties.size, unmapProperties.offset, unmapProperties.mipLevel, nullptr);

    EventsRequest request(0, nullptr, &outEvent);
    mockQueue->cpuDataTransferHandler(unmapProperties, request, status);

    EXPECT_EQ(1, mockQueue->enqueueMarkerWithWaitListCalledCount);
    clReleaseEvent(outEvent);
}
// Same unmap scenario as the zero-copy variant, but with the buffer's isZeroCopy flag cleared;
// exactly one marker is still expected.
// NOTE(review): the test name says "IsBlocked", yet the fourth TransferProperties argument
// (presumably the blocking flag) is false here - confirm whether the name should read "IsNotBlocked".
HWTEST_F(CommandQueueHwTest, givenNotBlockedIOQWhenCpuTransferIsBlockedOutEventPassedCommandTypeIsUnmapAndMemoryIsNotZeroCopyThenEnqueueMarkerIsCalled) {
    auto mockQueue = std::make_unique<MockCommandQueueHw<FamilyType>>(context, pClDevice, nullptr);
    mockQueue->taskLevel = 0u;

    MockGraphicsAllocation graphicsAllocation{};
    auto mockBuffer = std::make_unique<MockBuffer>(context, graphicsAllocation);
    mockBuffer->isZeroCopy = false;

    cl_event outEvent = nullptr;
    cl_int status = CL_SUCCESS;
    size_t mapOffset = 0;
    size_t mapSize = 4096u;

    TransferProperties unmapProperties(mockBuffer.get(), CL_COMMAND_UNMAP_MEM_OBJECT, 0, false, &mapOffset, &mapSize, nullptr, false, pDevice->getRootDeviceIndex());

    // Record a mapping on the mem object so that the unmap below has something to remove.
    auto *mappedObj = unmapProperties.memObj;
    auto mappedPtr = ptrOffset(mappedObj->getCpuAddressForMapping(),
                               mappedObj->calculateOffsetForMapping(unmapProperties.offset) + unmapProperties.mipPtrOffset);
    mappedObj->addMappedPtr(mappedPtr, mappedObj->calculateMappedPtrLength(unmapProperties.size),
                            unmapProperties.mapFlags, unmapProperties.size, unmapProperties.offset, unmapProperties.mipLevel, nullptr);

    EventsRequest request(0, nullptr, &outEvent);
    mockQueue->cpuDataTransferHandler(unmapProperties, request, status);

    EXPECT_EQ(1, mockQueue->enqueueMarkerWithWaitListCalledCount);
    clReleaseEvent(outEvent);
}
// Maps a default MockBuffer with an out event on a queue created without properties and with
// task level 0, and verifies that exactly one marker is enqueued.
// NOTE(review): the test name says "IsBlocked", yet the fourth TransferProperties argument
// (presumably the blocking flag) is false here - confirm whether the name should read "IsNotBlocked".
HWTEST_F(CommandQueueHwTest, givenNotBlockedIOQWhenCpuTransferIsBlockedOutEventPassedCommandTypeIsOtherThanUnmapAndMemoryIsZeroCopyThenEnqueueMarkerIsCalled) {
    auto mockQueue = std::make_unique<MockCommandQueueHw<FamilyType>>(context, pClDevice, nullptr);
    mockQueue->taskLevel = 0u;

    MockGraphicsAllocation graphicsAllocation{};
    auto mockBuffer = std::make_unique<MockBuffer>(context, graphicsAllocation);

    cl_event outEvent = nullptr;
    cl_int status = CL_SUCCESS;
    size_t mapOffset = 0;
    size_t mapSize = 4096u;

    TransferProperties mapProperties(mockBuffer.get(), CL_COMMAND_MAP_BUFFER, 0, false, &mapOffset, &mapSize, nullptr, false, pDevice->getRootDeviceIndex());
    EventsRequest request(0, nullptr, &outEvent);
    mockQueue->cpuDataTransferHandler(mapProperties, request, status);

    EXPECT_EQ(1, mockQueue->enqueueMarkerWithWaitListCalledCount);
    clReleaseEvent(outEvent);
}

View File

@@ -169,6 +169,9 @@ TEST_F(EnqueueUnmapMemObjTest, WhenUnmappingMemoryObjectThenWaitEventIsUpdated)
EXPECT_NE(nullptr, ptrResult); EXPECT_NE(nullptr, ptrResult);
EXPECT_NE(nullptr, waitEvent); EXPECT_NE(nullptr, waitEvent);
Event *wEvent = castToObject<Event>(waitEvent);
EXPECT_EQ(CL_QUEUED, wEvent->peekExecutionStatus());
retVal = clEnqueueUnmapMemObject( retVal = clEnqueueUnmapMemObject(
pCmdQ, pCmdQ,
buffer, buffer,
@@ -179,15 +182,16 @@ TEST_F(EnqueueUnmapMemObjTest, WhenUnmappingMemoryObjectThenWaitEventIsUpdated)
EXPECT_EQ(CL_SUCCESS, retVal); EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_NE(nullptr, retEvent); EXPECT_NE(nullptr, retEvent);
Event *wEvent = castToObject<Event>(waitEvent);
EXPECT_EQ(CL_COMPLETE, wEvent->peekExecutionStatus()); EXPECT_EQ(CL_COMPLETE, wEvent->peekExecutionStatus());
Event *rEvent = castToObject<Event>(retEvent); Event *rEvent = castToObject<Event>(retEvent);
EXPECT_EQ(CL_COMPLETE, rEvent->peekExecutionStatus()); EXPECT_EQ(CL_QUEUED, rEvent->peekExecutionStatus());
retVal = clWaitForEvents(1, &retEvent); retVal = clWaitForEvents(1, &retEvent);
EXPECT_EQ(CL_SUCCESS, retVal); EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_EQ(CL_COMPLETE, rEvent->peekExecutionStatus());
retVal = clReleaseMemObject(buffer); retVal = clReleaseMemObject(buffer);
EXPECT_EQ(CL_SUCCESS, retVal); EXPECT_EQ(CL_SUCCESS, retVal);

View File

@@ -41,6 +41,7 @@ class MockBuffer : public MockBufferStorage, public Buffer {
using Buffer::size; using Buffer::size;
using MemObj::associatedMemObject; using MemObj::associatedMemObject;
using MemObj::context; using MemObj::context;
using MemObj::hostPtr;
using MemObj::isZeroCopy; using MemObj::isZeroCopy;
using MemObj::memObjectType; using MemObj::memObjectType;
using MemObj::memoryStorage; using MemObj::memoryStorage;

View File

@@ -441,6 +441,11 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
return CommandQueue::isCompleted(gpgpuTaskCount, bcsStates); return CommandQueue::isCompleted(gpgpuTaskCount, bcsStates);
} }
// Records the call in enqueueMarkerWithWaitListCalledCount, then forwards the unchanged
// arguments to the BaseClass implementation and returns its result.
cl_int enqueueMarkerWithWaitList(cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) override {
    ++enqueueMarkerWithWaitListCalledCount;
    const cl_int baseResult = BaseClass::enqueueMarkerWithWaitList(numEventsInWaitList, eventWaitList, event);
    return baseResult;
}
unsigned int lastCommandType; unsigned int lastCommandType;
std::vector<Kernel *> lastEnqueuedKernels; std::vector<Kernel *> lastEnqueuedKernels;
MultiDispatchInfo storedMultiDispatchInfo; MultiDispatchInfo storedMultiDispatchInfo;
@@ -470,6 +475,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
std::optional<WaitStatus> waitForAllEnginesReturnValue{}; std::optional<WaitStatus> waitForAllEnginesReturnValue{};
std::optional<WaitStatus> waitUntilCompleteReturnValue{}; std::optional<WaitStatus> waitUntilCompleteReturnValue{};
int waitForAllEnginesCalledCount{0}; int waitForAllEnginesCalledCount{0};
int enqueueMarkerWithWaitListCalledCount{0};
LinearStream *peekCommandStream() { LinearStream *peekCommandStream() {
return this->commandStream; return this->commandStream;

View File

@@ -37,6 +37,7 @@ struct MockEvent : public BaseEventType {
using BaseEventType::timeStampNode; using BaseEventType::timeStampNode;
using Event::calcProfilingData; using Event::calcProfilingData;
using Event::calculateSubmitTimestampData; using Event::calculateSubmitTimestampData;
using Event::cmdToSubmit;
using Event::isWaitForTimestampsEnabled; using Event::isWaitForTimestampsEnabled;
using Event::magic; using Event::magic;
using Event::multiRootDeviceTimestampPacketContainer; using Event::multiRootDeviceTimestampPacketContainer;