fix: Improved event profiling setup for bcs split enqueue

Fixes failures in the OpenCL Khronos profiling tests when BCS split is used

Related-To: NEO-8927

Signed-off-by: Milczarek, Slawomir <slawomir.milczarek@intel.com>
This commit is contained in:
Milczarek, Slawomir
2023-12-07 11:13:18 +00:00
committed by Compute-Runtime-Automation
parent 44c23cb8be
commit bec9874487
5 changed files with 97 additions and 19 deletions

View File

@@ -386,6 +386,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
bool getHeaplessModeEnabled() const { return this->heaplessModeEnabled; }
bool isBcsSplitInitialized() const { return this->bcsSplitInitialized; }
protected:
void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet);
cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest);

View File

@@ -364,7 +364,7 @@ class CommandQueueHw : public CommandQueue {
cl_int dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo &dispatchInfo, Surface *(&surfaces)[surfaceCount], EBuiltInOps::Type builtInOperation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &csr);
template <uint32_t cmdType>
cl_int enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr);
cl_int enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr, EventBuilder *pExternalEventBuilder);
bool isSplitEnqueueBlitNeeded(TransferDirection transferDirection, size_t transferSize, CommandStreamReceiver &csr);
size_t getTotalSizeFromRectRegion(const size_t *region);

View File

@@ -1108,7 +1108,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
csrDeps.makeResident(getGpgpuCommandStreamReceiver());
}
if (eventBuilder.getEvent() && isProfilingEnabled()) {
if (eventBuilder.getEvent() && isProfilingEnabled() && !isBcsSplitInitialized()) {
eventBuilder.getEvent()->setSubmitTimeStamp();
eventBuilder.getEvent()->setStartTimeStamp();
}
@@ -1309,6 +1309,16 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlitSplit(MultiDispatchInfo &dispatchIn
auto size = dispatchInfo.peekBuiltinOpParams().size.x;
auto remainingSize = size;
EventBuilder externalEventBuilder;
EventBuilder *pEventBuilder = nullptr;
DEBUG_BREAK_IF(!this->isBcsSplitInitialized());
if (event && this->isProfilingEnabled()) {
pEventBuilder = &externalEventBuilder;
setupEvent(*pEventBuilder, event, cmdType);
castToObjectOrAbort<Event>(*event)->setSubmitTimeStamp();
castToObjectOrAbort<Event>(*event)->setStartTimeStamp();
}
for (size_t i = 0; i < copyEngines.size(); i++) {
auto localSize = remainingSize / (copyEngines.size() - i);
auto localParams = dispatchInfo.peekBuiltinOpParams();
@@ -1321,7 +1331,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlitSplit(MultiDispatchInfo &dispatchIn
this->timestampPacketContainer->assignAndIncrementNodesRefCounts(previousEnqueueNode);
ret = enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, remainingSize == 0 ? event : nullptr, false, *copyEngines[i]);
ret = enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, remainingSize == 0 ? event : nullptr, false, *copyEngines[i], pEventBuilder);
DEBUG_BREAK_IF(ret != CL_SUCCESS);
this->timestampPacketContainer->moveNodesToNewContainer(splitNodes);
@@ -1349,16 +1359,23 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlitSplit(MultiDispatchInfo &dispatchIn
template <typename GfxFamily>
template <uint32_t cmdType>
cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr) {
cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr, EventBuilder *pExternalEventBuilder) {
auto bcsCommandStreamReceiverOwnership = bcsCsr.obtainUniqueOwnership();
std::unique_lock<NEO::CommandStreamReceiver::MutexType> commandStreamReceiverOwnership;
registerBcsCsrClient(bcsCsr);
EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
EventBuilder eventBuilder;
EventBuilder internalEventBuilder;
EventBuilder *pEventBuilder = nullptr;
setupEvent(eventBuilder, eventsRequest.outEvent, cmdType);
if (pExternalEventBuilder) {
DEBUG_BREAK_IF(!this->isBcsSplitInitialized() || !this->isProfilingEnabled());
pEventBuilder = pExternalEventBuilder;
} else {
pEventBuilder = &internalEventBuilder;
setupEvent(*pEventBuilder, eventsRequest.outEvent, cmdType);
}
eventsRequest.setupBcsCsrForOutputEvent(bcsCsr);
std::unique_ptr<KernelOperation> blockedCommandsData;
@@ -1412,11 +1429,11 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
if (eventBuilder.getEvent()) {
eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
if (pEventBuilder->getEvent()) {
pEventBuilder->getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
}
if (eventBuilder.getEvent() && eventBuilder.getEvent()->getContext()->getRootDeviceIndices().size() > 1) {
multiRootEventSyncStamp = eventBuilder.getEvent()->getMultiRootTimestampSyncNode();
if (pEventBuilder->getEvent() && pEventBuilder->getEvent()->getContext()->getRootDeviceIndices().size() > 1) {
multiRootEventSyncStamp = pEventBuilder->getEvent()->getMultiRootTimestampSyncNode();
bcsCsr.makeResident(*multiRootEventSyncStamp->getBaseGraphicsAllocation());
}
@@ -1442,7 +1459,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
if (!blockQueue) {
completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking,
enqueueProperties, timestampPacketDependencies, eventsRequest,
eventBuilder, taskLevel, csrDeps, &bcsCsr, false);
*pEventBuilder, taskLevel, csrDeps, &bcsCsr, false);
if (completionStamp.taskCount > CompletionStamp::notReady) {
return CommandQueue::getErrorCodeFromTaskCount(completionStamp.taskCount);
}
@@ -1453,18 +1470,18 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
}
}
if (eventBuilder.getEvent()) {
eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
if (pEventBuilder->getEvent()) {
pEventBuilder->getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
}
this->latestSentEnqueueType = enqueueProperties.operation;
setLastBcsPacket(bcsCsr.getOsContext().getEngineType());
}
updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());
updateFromCompletionStamp(completionStamp, pEventBuilder->getEvent());
if (blockQueue) {
enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp);
enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, *pEventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp);
if (gpgpuSubmission) {
if (debugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) {
@@ -1510,7 +1527,7 @@ cl_int CommandQueueHw<GfxFamily>::dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo &d
if (dispatchInfo.peekBuiltinOpParams().bcsSplit) {
ret = enqueueBlitSplit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr);
} else {
ret = enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr);
ret = enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr, nullptr);
}
return ret;

View File

@@ -13,6 +13,8 @@
#include "shared/test/common/test_macros/hw_test.h"
#include "shared/test/common/utilities/base_object_utils.h"
#include "opencl/source/command_queue/enqueue_common.h"
#include "opencl/source/event/event.h"
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
#include "opencl/test/unit_test/fixtures/buffer_fixture.h"
#include "opencl/test/unit_test/mocks/mock_buffer.h"
@@ -105,19 +107,19 @@ HWTEST2_F(CommandQueuePvcAndLaterTests, givenMultipleBcsEnginesWhenDispatchingCo
baseNumClientsBcs3 = bcsCsr0->getNumClients();
baseNumClientsBcs7 = bcsCsr0->getNumClients();
auto retVal = queue.template enqueueBlit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, nullptr, false, *bcsCsr0);
auto retVal = queue.template enqueueBlit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, nullptr, false, *bcsCsr0, nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_EQ(baseNumClientsBcs0 + 1, bcsCsr0->getNumClients());
EXPECT_EQ(baseNumClientsBcs3, bcsCsr3->getNumClients());
EXPECT_EQ(baseNumClientsBcs7, bcsCsr7->getNumClients());
retVal = queue.template enqueueBlit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, nullptr, false, *bcsCsr3);
retVal = queue.template enqueueBlit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, nullptr, false, *bcsCsr3, nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_EQ(baseNumClientsBcs0 + 1, bcsCsr0->getNumClients());
EXPECT_EQ(baseNumClientsBcs3 + 1, bcsCsr3->getNumClients());
EXPECT_EQ(baseNumClientsBcs7, bcsCsr7->getNumClients());
retVal = queue.template enqueueBlit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, nullptr, false, *bcsCsr7);
retVal = queue.template enqueueBlit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, nullptr, false, *bcsCsr7, nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_EQ(baseNumClientsBcs0 + 1, bcsCsr0->getNumClients());
EXPECT_EQ(baseNumClientsBcs3 + 1, bcsCsr3->getNumClients());
@@ -129,6 +131,62 @@ HWTEST2_F(CommandQueuePvcAndLaterTests, givenMultipleBcsEnginesWhenDispatchingCo
EXPECT_EQ(baseNumClientsBcs7, bcsCsr7->getNumClients());
}
HWTEST2_F(CommandQueuePvcAndLaterTests, givenMultipleBcsEnginesWhenEnqueueBlitIsCalledWithProfilingEnabledThenSetupEventProfilingInfoCorrectly, IsAtLeastXeHpcCore) {
DebugManagerStateRestore restorer;
debugManager.flags.EnableCopyEngineSelector.set(1);
HardwareInfo hwInfo = *defaultHwInfo;
hwInfo.featureTable.ftrBcsInfo = maxNBitValue(9);
hwInfo.capabilityTable.blitterOperationsSupported = true;
MockDevice *device = MockDevice::createWithNewExecutionEnvironment<MockDevice>(&hwInfo, 0);
MockClDevice clDevice{device};
MockContext context{&clDevice};
MockCommandQueueHw<FamilyType> queue(&context, &clDevice, nullptr);
queue.setProfilingEnabled();
queue.bcsSplitInitialized = true;
queue.clearBcsEngines();
queue.insertBcsEngine(aub_stream::EngineType::ENGINE_BCS);
queue.insertBcsEngine(aub_stream::EngineType::ENGINE_BCS3);
queue.insertBcsEngine(aub_stream::EngineType::ENGINE_BCS7);
auto bcsCsr0 = queue.getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS);
MockGraphicsAllocation mockGraphicsAllocation;
MockBuffer mockMemObj(mockGraphicsAllocation);
BuiltinOpParams params;
params.dstPtr = reinterpret_cast<void *>(0x12300);
params.dstOffset = {0, 0, 0};
params.srcMemObj = &mockMemObj;
params.srcOffset = {0, 0, 0};
params.size = {1, 0, 0};
params.transferAllocation = &mockGraphicsAllocation;
MultiDispatchInfo dispatchInfo(params);
cl_event clEvent;
auto retVal = queue.template enqueueBlitSplit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, &clEvent, true, *bcsCsr0);
EXPECT_EQ(CL_SUCCESS, retVal);
EXPECT_TRUE(queue.isBcsSplitInitialized());
EXPECT_TRUE(queue.isProfilingEnabled());
uint64_t queuedTime = 0;
uint64_t submitTime = 0;
uint64_t startTime = 0;
auto event = castToObject<Event>(clEvent);
event->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(queuedTime), &queuedTime, nullptr);
event->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(submitTime), &submitTime, nullptr);
event->getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, nullptr);
EXPECT_GE(queuedTime, 0u);
EXPECT_GE(submitTime, queuedTime);
EXPECT_GE(startTime, submitTime);
clReleaseEvent(clEvent);
}
HWTEST2_F(CommandQueuePvcAndLaterTests, givenAdditionalBcsWhenCreatingCommandQueueThenUseCorrectEngine, IsAtLeastXeHpcCore) {
DebugManagerStateRestore restorer;
debugManager.flags.EnableCopyEngineSelector.set(1);

View File

@@ -263,6 +263,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
using BaseClass::bcsEngineCount;
using BaseClass::bcsEngines;
using BaseClass::bcsQueueEngineType;
using BaseClass::bcsSplitInitialized;
using BaseClass::bcsStates;
using BaseClass::bcsTimestampPacketContainers;
using BaseClass::blitEnqueueAllowed;