mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-18 22:08:53 +08:00
fix: Improved event profiling setup for bcs split enqueue
Fixes OpenCL Khronos failures in test profiling with bcs split.
Related-To: NEO-8927
Signed-off-by: Milczarek, Slawomir <slawomir.milczarek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
44c23cb8be
commit
bec9874487
@@ -386,6 +386,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
|
||||
|
||||
// Reports the current value of the heaplessModeEnabled member.
bool getHeaplessModeEnabled() const {
    return this->heaplessModeEnabled;
}
|
||||
|
||||
// Reports the current value of the bcsSplitInitialized member.
bool isBcsSplitInitialized() const {
    return this->bcsSplitInitialized;
}
|
||||
|
||||
protected:
|
||||
void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet);
|
||||
cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest);
|
||||
|
||||
@@ -364,7 +364,7 @@ class CommandQueueHw : public CommandQueue {
|
||||
cl_int dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo &dispatchInfo, Surface *(&surfaces)[surfaceCount], EBuiltInOps::Type builtInOperation, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &csr);
|
||||
|
||||
template <uint32_t cmdType>
|
||||
cl_int enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr);
|
||||
cl_int enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr, EventBuilder *pExternalEventBuilder);
|
||||
|
||||
bool isSplitEnqueueBlitNeeded(TransferDirection transferDirection, size_t transferSize, CommandStreamReceiver &csr);
|
||||
size_t getTotalSizeFromRectRegion(const size_t *region);
|
||||
|
||||
@@ -1108,7 +1108,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
|
||||
csrDeps.makeResident(getGpgpuCommandStreamReceiver());
|
||||
}
|
||||
|
||||
if (eventBuilder.getEvent() && isProfilingEnabled()) {
|
||||
if (eventBuilder.getEvent() && isProfilingEnabled() && !isBcsSplitInitialized()) {
|
||||
eventBuilder.getEvent()->setSubmitTimeStamp();
|
||||
eventBuilder.getEvent()->setStartTimeStamp();
|
||||
}
|
||||
@@ -1309,6 +1309,16 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlitSplit(MultiDispatchInfo &dispatchIn
|
||||
auto size = dispatchInfo.peekBuiltinOpParams().size.x;
|
||||
auto remainingSize = size;
|
||||
|
||||
EventBuilder externalEventBuilder;
|
||||
EventBuilder *pEventBuilder = nullptr;
|
||||
DEBUG_BREAK_IF(!this->isBcsSplitInitialized());
|
||||
if (event && this->isProfilingEnabled()) {
|
||||
pEventBuilder = &externalEventBuilder;
|
||||
setupEvent(*pEventBuilder, event, cmdType);
|
||||
castToObjectOrAbort<Event>(*event)->setSubmitTimeStamp();
|
||||
castToObjectOrAbort<Event>(*event)->setStartTimeStamp();
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < copyEngines.size(); i++) {
|
||||
auto localSize = remainingSize / (copyEngines.size() - i);
|
||||
auto localParams = dispatchInfo.peekBuiltinOpParams();
|
||||
@@ -1321,7 +1331,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlitSplit(MultiDispatchInfo &dispatchIn
|
||||
|
||||
this->timestampPacketContainer->assignAndIncrementNodesRefCounts(previousEnqueueNode);
|
||||
|
||||
ret = enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, remainingSize == 0 ? event : nullptr, false, *copyEngines[i]);
|
||||
ret = enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, remainingSize == 0 ? event : nullptr, false, *copyEngines[i], pEventBuilder);
|
||||
DEBUG_BREAK_IF(ret != CL_SUCCESS);
|
||||
|
||||
this->timestampPacketContainer->moveNodesToNewContainer(splitNodes);
|
||||
@@ -1349,16 +1359,23 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlitSplit(MultiDispatchInfo &dispatchIn
|
||||
|
||||
template <typename GfxFamily>
|
||||
template <uint32_t cmdType>
|
||||
cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr) {
|
||||
cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDispatchInfo, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, bool blocking, CommandStreamReceiver &bcsCsr, EventBuilder *pExternalEventBuilder) {
|
||||
auto bcsCommandStreamReceiverOwnership = bcsCsr.obtainUniqueOwnership();
|
||||
std::unique_lock<NEO::CommandStreamReceiver::MutexType> commandStreamReceiverOwnership;
|
||||
|
||||
registerBcsCsrClient(bcsCsr);
|
||||
|
||||
EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event);
|
||||
EventBuilder eventBuilder;
|
||||
EventBuilder internalEventBuilder;
|
||||
EventBuilder *pEventBuilder = nullptr;
|
||||
|
||||
setupEvent(eventBuilder, eventsRequest.outEvent, cmdType);
|
||||
if (pExternalEventBuilder) {
|
||||
DEBUG_BREAK_IF(!this->isBcsSplitInitialized() || !this->isProfilingEnabled());
|
||||
pEventBuilder = pExternalEventBuilder;
|
||||
} else {
|
||||
pEventBuilder = &internalEventBuilder;
|
||||
setupEvent(*pEventBuilder, eventsRequest.outEvent, cmdType);
|
||||
}
|
||||
eventsRequest.setupBcsCsrForOutputEvent(bcsCsr);
|
||||
|
||||
std::unique_ptr<KernelOperation> blockedCommandsData;
|
||||
@@ -1412,11 +1429,11 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
|
||||
obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr);
|
||||
csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes);
|
||||
|
||||
if (eventBuilder.getEvent()) {
|
||||
eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
|
||||
if (pEventBuilder->getEvent()) {
|
||||
pEventBuilder->getEvent()->addTimestampPacketNodes(*timestampPacketContainer);
|
||||
}
|
||||
if (eventBuilder.getEvent() && eventBuilder.getEvent()->getContext()->getRootDeviceIndices().size() > 1) {
|
||||
multiRootEventSyncStamp = eventBuilder.getEvent()->getMultiRootTimestampSyncNode();
|
||||
if (pEventBuilder->getEvent() && pEventBuilder->getEvent()->getContext()->getRootDeviceIndices().size() > 1) {
|
||||
multiRootEventSyncStamp = pEventBuilder->getEvent()->getMultiRootTimestampSyncNode();
|
||||
bcsCsr.makeResident(*multiRootEventSyncStamp->getBaseGraphicsAllocation());
|
||||
}
|
||||
|
||||
@@ -1442,7 +1459,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
|
||||
if (!blockQueue) {
|
||||
completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking,
|
||||
enqueueProperties, timestampPacketDependencies, eventsRequest,
|
||||
eventBuilder, taskLevel, csrDeps, &bcsCsr, false);
|
||||
*pEventBuilder, taskLevel, csrDeps, &bcsCsr, false);
|
||||
if (completionStamp.taskCount > CompletionStamp::notReady) {
|
||||
return CommandQueue::getErrorCodeFromTaskCount(completionStamp.taskCount);
|
||||
}
|
||||
@@ -1453,18 +1470,18 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
|
||||
}
|
||||
}
|
||||
|
||||
if (eventBuilder.getEvent()) {
|
||||
eventBuilder.getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
|
||||
if (pEventBuilder->getEvent()) {
|
||||
pEventBuilder->getEvent()->flushStamp->replaceStampObject(this->flushStamp->getStampReference());
|
||||
}
|
||||
|
||||
this->latestSentEnqueueType = enqueueProperties.operation;
|
||||
|
||||
setLastBcsPacket(bcsCsr.getOsContext().getEngineType());
|
||||
}
|
||||
updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());
|
||||
updateFromCompletionStamp(completionStamp, pEventBuilder->getEvent());
|
||||
|
||||
if (blockQueue) {
|
||||
enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, eventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp);
|
||||
enqueueBlocked(cmdType, nullptr, 0, multiDispatchInfo, timestampPacketDependencies, blockedCommandsData, enqueueProperties, eventsRequest, *pEventBuilder, nullptr, &bcsCsr, multiRootEventSyncStamp);
|
||||
|
||||
if (gpgpuSubmission) {
|
||||
if (debugManager.flags.ForceCsrLockInBcsEnqueueOnlyForGpgpuSubmission.get() == 1) {
|
||||
@@ -1510,7 +1527,7 @@ cl_int CommandQueueHw<GfxFamily>::dispatchBcsOrGpgpuEnqueue(MultiDispatchInfo &d
|
||||
if (dispatchInfo.peekBuiltinOpParams().bcsSplit) {
|
||||
ret = enqueueBlitSplit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr);
|
||||
} else {
|
||||
ret = enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr);
|
||||
ret = enqueueBlit<cmdType>(dispatchInfo, numEventsInWaitList, eventWaitList, event, blocking, csr, nullptr);
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
||||
@@ -13,6 +13,8 @@
|
||||
#include "shared/test/common/test_macros/hw_test.h"
|
||||
#include "shared/test/common/utilities/base_object_utils.h"
|
||||
|
||||
#include "opencl/source/command_queue/enqueue_common.h"
|
||||
#include "opencl/source/event/event.h"
|
||||
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
|
||||
#include "opencl/test/unit_test/fixtures/buffer_fixture.h"
|
||||
#include "opencl/test/unit_test/mocks/mock_buffer.h"
|
||||
@@ -105,19 +107,19 @@ HWTEST2_F(CommandQueuePvcAndLaterTests, givenMultipleBcsEnginesWhenDispatchingCo
|
||||
baseNumClientsBcs3 = bcsCsr0->getNumClients();
|
||||
baseNumClientsBcs7 = bcsCsr0->getNumClients();
|
||||
|
||||
auto retVal = queue.template enqueueBlit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, nullptr, false, *bcsCsr0);
|
||||
auto retVal = queue.template enqueueBlit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, nullptr, false, *bcsCsr0, nullptr);
|
||||
EXPECT_EQ(CL_SUCCESS, retVal);
|
||||
EXPECT_EQ(baseNumClientsBcs0 + 1, bcsCsr0->getNumClients());
|
||||
EXPECT_EQ(baseNumClientsBcs3, bcsCsr3->getNumClients());
|
||||
EXPECT_EQ(baseNumClientsBcs7, bcsCsr7->getNumClients());
|
||||
|
||||
retVal = queue.template enqueueBlit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, nullptr, false, *bcsCsr3);
|
||||
retVal = queue.template enqueueBlit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, nullptr, false, *bcsCsr3, nullptr);
|
||||
EXPECT_EQ(CL_SUCCESS, retVal);
|
||||
EXPECT_EQ(baseNumClientsBcs0 + 1, bcsCsr0->getNumClients());
|
||||
EXPECT_EQ(baseNumClientsBcs3 + 1, bcsCsr3->getNumClients());
|
||||
EXPECT_EQ(baseNumClientsBcs7, bcsCsr7->getNumClients());
|
||||
|
||||
retVal = queue.template enqueueBlit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, nullptr, false, *bcsCsr7);
|
||||
retVal = queue.template enqueueBlit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, nullptr, false, *bcsCsr7, nullptr);
|
||||
EXPECT_EQ(CL_SUCCESS, retVal);
|
||||
EXPECT_EQ(baseNumClientsBcs0 + 1, bcsCsr0->getNumClients());
|
||||
EXPECT_EQ(baseNumClientsBcs3 + 1, bcsCsr3->getNumClients());
|
||||
@@ -129,6 +131,62 @@ HWTEST2_F(CommandQueuePvcAndLaterTests, givenMultipleBcsEnginesWhenDispatchingCo
|
||||
EXPECT_EQ(baseNumClientsBcs7, bcsCsr7->getNumClients());
|
||||
}
|
||||
|
||||
// Runs enqueueBlitSplit on a profiling-enabled queue that has bcsSplitInitialized set and
// three BCS engines registered, then checks that the output event's queued, submit and
// start profiling values are non-decreasing in that order.
HWTEST2_F(CommandQueuePvcAndLaterTests, givenMultipleBcsEnginesWhenEnqueueBlitIsCalledWithProfilingEnabledThenSetupEventProfilingInfoCorrectly, IsAtLeastXeHpcCore) {
    DebugManagerStateRestore restorer;
    debugManager.flags.EnableCopyEngineSelector.set(1);
    HardwareInfo hwInfo = *defaultHwInfo;
    hwInfo.featureTable.ftrBcsInfo = maxNBitValue(9);
    hwInfo.capabilityTable.blitterOperationsSupported = true;
    MockDevice *device = MockDevice::createWithNewExecutionEnvironment<MockDevice>(&hwInfo, 0);
    MockClDevice clDevice{device};
    MockContext context{&clDevice};

    MockCommandQueueHw<FamilyType> queue(&context, &clDevice, nullptr);
    queue.setProfilingEnabled();
    queue.bcsSplitInitialized = true;
    queue.clearBcsEngines();

    queue.insertBcsEngine(aub_stream::EngineType::ENGINE_BCS);
    queue.insertBcsEngine(aub_stream::EngineType::ENGINE_BCS3);
    queue.insertBcsEngine(aub_stream::EngineType::ENGINE_BCS7);

    auto bcsCsr0 = queue.getBcsCommandStreamReceiver(aub_stream::EngineType::ENGINE_BCS);

    MockGraphicsAllocation mockGraphicsAllocation;
    MockBuffer mockMemObj(mockGraphicsAllocation);

    BuiltinOpParams params;
    params.dstPtr = reinterpret_cast<void *>(0x12300);
    params.dstOffset = {0, 0, 0};
    params.srcMemObj = &mockMemObj;
    params.srcOffset = {0, 0, 0};
    params.size = {1, 0, 0};
    params.transferAllocation = &mockGraphicsAllocation;

    MultiDispatchInfo dispatchInfo(params);

    // Start from a null handle so that a failed enqueue cannot leave an indeterminate
    // value to be cast and released below.
    cl_event clEvent = nullptr;
    auto retVal = queue.template enqueueBlitSplit<CL_COMMAND_READ_BUFFER>(dispatchInfo, 0, nullptr, &clEvent, true, *bcsCsr0);
    EXPECT_EQ(CL_SUCCESS, retVal);

    EXPECT_TRUE(queue.isBcsSplitInitialized());
    EXPECT_TRUE(queue.isProfilingEnabled());

    uint64_t queuedTime = 0;
    uint64_t submitTime = 0;
    uint64_t startTime = 0;
    auto event = castToObject<Event>(clEvent);
    // Stop here if no event object came back; the calls below dereference it.
    ASSERT_NE(nullptr, event);
    event->getEventProfilingInfo(CL_PROFILING_COMMAND_QUEUED, sizeof(queuedTime), &queuedTime, nullptr);
    event->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(submitTime), &submitTime, nullptr);
    event->getEventProfilingInfo(CL_PROFILING_COMMAND_START, sizeof(startTime), &startTime, nullptr);

    EXPECT_GE(queuedTime, 0u);
    EXPECT_GE(submitTime, queuedTime);
    EXPECT_GE(startTime, submitTime);

    clReleaseEvent(clEvent);
}
|
||||
|
||||
HWTEST2_F(CommandQueuePvcAndLaterTests, givenAdditionalBcsWhenCreatingCommandQueueThenUseCorrectEngine, IsAtLeastXeHpcCore) {
|
||||
DebugManagerStateRestore restorer;
|
||||
debugManager.flags.EnableCopyEngineSelector.set(1);
|
||||
|
||||
@@ -263,6 +263,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
|
||||
using BaseClass::bcsEngineCount;
|
||||
using BaseClass::bcsEngines;
|
||||
using BaseClass::bcsQueueEngineType;
|
||||
using BaseClass::bcsSplitInitialized;
|
||||
using BaseClass::bcsStates;
|
||||
using BaseClass::bcsTimestampPacketContainers;
|
||||
using BaseClass::blitEnqueueAllowed;
|
||||
|
||||
Reference in New Issue
Block a user