fix: unregister CSR client on OCL sync points

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-08-11 10:22:19 +00:00
committed by Compute-Runtime-Automation
parent 998e0a5833
commit 3cf1f5c462
9 changed files with 111 additions and 41 deletions

View File

@@ -1293,7 +1293,7 @@ WaitStatus CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *pri
waitStatus = waitUntilComplete(taskCount, activeBcsStates, flushStamp->peekStamp(), false, cleanTemporaryAllocationsList, waitedOnTimestamps);
tryReleaseDeferredNodes(false);
handlePostCompletionOperations(false);
if (printfHandler) {
if (!printfHandler->printEnqueueOutput()) {
@@ -1381,13 +1381,15 @@ bool CommandQueue::migrateMultiGraphicsAllocationsIfRequired(const BuiltinOpPara
return migrationHandled;
}
void CommandQueue::tryReleaseDeferredNodes(bool checkEventsState) {
void CommandQueue::handlePostCompletionOperations(bool checkQueueCompletion) {
TakeOwnershipWrapper<CommandQueue> queueOwnership(*this);
if (checkEventsState && !isCompleted(this->taskCount, this->bcsStates)) {
if (checkQueueCompletion && !isCompleted(this->taskCount, this->bcsStates)) {
return;
}
unregisterGpgpuAndBcsCsrClients();
TimestampPacketContainer nodesToRelease;
if (deferredTimestampPackets) {
deferredTimestampPackets->swapNodes(nodesToRelease);
@@ -1398,4 +1400,51 @@ void CommandQueue::tryReleaseDeferredNodes(bool checkEventsState) {
}
}
// Registers this queue as a client of the GPGPU command stream receiver.
// The gpgpuCsrClientRegistered flag ensures registerClient() is called only
// while the queue is not already registered.
void CommandQueue::registerGpgpuCsrClient() {
    if (gpgpuCsrClientRegistered) {
        return; // already registered, nothing to do
    }

    // Flag is set before the CSR call, same ordering as before.
    gpgpuCsrClientRegistered = true;
    getGpgpuCommandStreamReceiver().registerClient();
}
// Registers this queue as a client of the given BCS command stream receiver.
// Registration is tracked per copy engine in bcsStates, indexed by the engine
// type taken from the CSR's OS context.
void CommandQueue::registerBcsCsrClient(CommandStreamReceiver &bcsCsr) {
    const auto bcsIndex = EngineHelpers::getBcsIndex(bcsCsr.getOsContext().getEngineType());
    auto &state = bcsStates[bcsIndex];

    if (state.csrClientRegistered) {
        return; // this copy engine is already registered
    }

    state.csrClientRegistered = true;
    bcsCsr.registerClient();
}
// Unregisters this queue from the GPGPU command stream receiver if it is
// currently registered, then clears the registration flag.
void CommandQueue::unregisterGpgpuCsrClient() {
    if (!gpgpuCsrClientRegistered) {
        return; // never registered (or already unregistered)
    }

    // Goes through gpgpuEngine directly, not getGpgpuCommandStreamReceiver().
    gpgpuEngine->commandStreamReceiver->unregisterClient();
    gpgpuCsrClientRegistered = false;
}
// Unregisters this queue from the given BCS command stream receiver.
// Only acts when the matching bcsStates entry is valid and marked as
// registered; the flag is cleared afterwards.
void CommandQueue::unregisterBcsCsrClient(CommandStreamReceiver &bcsCsr) {
    const auto bcsIndex = EngineHelpers::getBcsIndex(bcsCsr.getOsContext().getEngineType());
    auto &state = bcsStates[bcsIndex];

    if (!state.isValid() || !state.csrClientRegistered) {
        return; // nothing registered for this copy engine
    }

    bcsCsr.unregisterClient();
    state.csrClientRegistered = false;
}
// Drops every CSR client registration held by this queue: first the GPGPU
// CSR, then each populated entry of bcsEngines.
void CommandQueue::unregisterGpgpuAndBcsCsrClients() {
    unregisterGpgpuCsrClient();

    for (auto &bcsEngine : this->bcsEngines) {
        if (!bcsEngine) {
            continue; // empty slot, no engine assigned
        }
        unregisterBcsCsrClient(*bcsEngine->commandStreamReceiver);
    }
}
} // namespace NEO

View File

@@ -377,7 +377,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
const std::array<CopyEngineState, bcsInfoMaskSize> &peekActiveBcsStates() const { return bcsStates; }
void tryReleaseDeferredNodes(bool checkEventsState);
void handlePostCompletionOperations(bool checkQueueCompletion);
protected:
void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet);
@@ -414,6 +414,14 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
virtual bool obtainTimestampPacketForCacheFlush(bool isCacheFlushRequired) const = 0;
void assignDataToOverwrittenBcsNode(TagNodeBase *node);
void registerGpgpuCsrClient();
void registerBcsCsrClient(CommandStreamReceiver &bcsCsr);
void unregisterGpgpuCsrClient();
void unregisterBcsCsrClient(CommandStreamReceiver &bcsCsr);
void unregisterGpgpuAndBcsCsrClients();
Context *context = nullptr;
ClDevice *device = nullptr;
mutable EngineControl *gpgpuEngine = nullptr;
@@ -463,6 +471,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
bool stallingCommandsOnNextFlushRequired = false;
bool dcFlushRequiredOnStallingCommandsOnNextFlush = false;
bool splitBarrierRequired = false;
bool gpgpuCsrClientRegistered = false;
};
template <typename PtrType>

View File

@@ -521,10 +521,5 @@ class CommandQueueHw : public CommandQueue {
bool isBlitAuxTranslationRequired(const MultiDispatchInfo &multiDispatchInfo);
bool relaxedOrderingForGpgpuAllowed(uint32_t numWaitEvents) const;
void registerGpgpuCsrClient();
void registerBcsCsrClient(CommandStreamReceiver &bcsCsr);
bool gpgpuCsrClientRegistered = false;
};
} // namespace NEO

View File

@@ -245,37 +245,8 @@ void CommandQueueHw<Family>::setupEvent(EventBuilder &eventBuilder, cl_event *ou
}
}
// Registers this queue as a client of the GPGPU command stream receiver,
// unless gpgpuCsrClientRegistered says it is already registered.
template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::registerGpgpuCsrClient() {
    if (gpgpuCsrClientRegistered) {
        return; // already registered
    }

    gpgpuCsrClientRegistered = true;
    getGpgpuCommandStreamReceiver().registerClient();
}
// Registers this queue as a client of the given BCS command stream receiver,
// tracking the registration in the bcsStates entry for the CSR's engine type.
template <typename GfxFamily>
void CommandQueueHw<GfxFamily>::registerBcsCsrClient(CommandStreamReceiver &bcsCsr) {
    const auto bcsIndex = EngineHelpers::getBcsIndex(bcsCsr.getOsContext().getEngineType());
    auto &state = bcsStates[bcsIndex];

    if (state.csrClientRegistered) {
        return; // this copy engine is already registered
    }

    state.csrClientRegistered = true;
    bcsCsr.registerClient();
}
// Releases every CSR client registration still held by the queue.
//
// Unregistration is delegated entirely to unregisterGpgpuAndBcsCsrClients(),
// which checks and clears the per-engine "registered" flags. Unregistering by
// hand here as well (without clearing those flags) would make the helper
// unregister the same clients a second time.
template <typename Family>
CommandQueueHw<Family>::~CommandQueueHw() {
    unregisterGpgpuAndBcsCsrClients();
}
} // namespace NEO

View File

@@ -454,7 +454,7 @@ inline WaitStatus Event::wait(bool blocking, bool useQuickKmdSleep) {
DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);
cmdQueue->tryReleaseDeferredNodes(true);
cmdQueue->handlePostCompletionOperations(true);
auto *allocationStorage = cmdQueue->getGpgpuCommandStreamReceiver().getInternalAllocationStorage();
allocationStorage->cleanAllocationList(this->taskCount, TEMPORARY_ALLOCATION);

View File

@@ -1099,6 +1099,47 @@ HWTEST_F(CommandQueueHwTest, givenCommandQueueWhenDispatchingWorkThenRegisterCsr
EXPECT_EQ(baseNumClients, csr.getNumClients());
}
// Checks that the queue's CSR client registration is dropped at sync points:
// after finish(), and after clWaitForEvents() once the CSR tag has reached the
// queue's task count. The client count is compared against the baseline
// captured before any work is enqueued.
HWTEST_F(CommandQueueHwTest, givenCsrClientWhenCallingSyncPointsThenUnregister) {
MockKernelWithInternals mockKernelWithInternals(*pClDevice);
auto mockKernel = mockKernelWithInternals.mockKernel;
auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
size_t gws = 1;
// Baseline client count before this queue enqueues anything.
auto baseNumClients = csr.getNumClients();
MockCommandQueueHw<FamilyType> mockCmdQueueHw{context, pClDevice, nullptr};
// First enqueue is expected to add exactly one client.
EXPECT_EQ(CL_SUCCESS, mockCmdQueueHw.enqueueKernel(mockKernel, 1, nullptr, &gws, nullptr, 0, nullptr, nullptr));
EXPECT_EQ(baseNumClients + 1, csr.getNumClients());
mockCmdQueueHw.finish();
EXPECT_EQ(baseNumClients, csr.getNumClients()); // queue synchronized
cl_event e0, e1;
// Enqueueing again after the sync point is expected to re-register the client.
EXPECT_EQ(CL_SUCCESS, mockCmdQueueHw.enqueueKernel(mockKernel, 1, nullptr, &gws, nullptr, 0, nullptr, &e0));
EXPECT_EQ(baseNumClients + 1, csr.getNumClients());
// Write the queue's current task count (after the e0 enqueue) to the CSR tag;
// presumably this marks e0's work as completed - confirm against CSR semantics.
*csr.tagAddress = mockCmdQueueHw.taskCount;
// A second enqueue advances the queue past the tag value written above; the
// client count must stay at one (no double registration).
EXPECT_EQ(CL_SUCCESS, mockCmdQueueHw.enqueueKernel(mockKernel, 1, nullptr, &gws, nullptr, 0, nullptr, &e1));
EXPECT_EQ(baseNumClients + 1, csr.getNumClients());
clWaitForEvents(1, &e0);
EXPECT_EQ(baseNumClients + 1, csr.getNumClients()); // CSR task count < queue task count
// Bring the CSR tag up to the queue's latest task count, then wait again.
*csr.tagAddress = mockCmdQueueHw.taskCount;
clWaitForEvents(1, &e0);
EXPECT_EQ(baseNumClients, csr.getNumClients()); // queue ready
clReleaseEvent(e0);
clReleaseEvent(e1);
}
HWTEST_F(CommandQueueHwTest, givenKernelSplitEnqueueReadBufferWhenBlockedThenEnqueueSurfacesMakeResidentIsCalledOnce) {
UserEvent userEvent(context);
auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();

View File

@@ -39,7 +39,7 @@ HWTEST_F(ClTbxCommandStreamTests, givenTbxCsrWhenDispatchBlitEnqueueThenProcessC
EngineControl engineControl0{&tbxCsr0, &osContext0};
MockOsContext osContext1(1, EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular}, pDevice->getDeviceBitfield()));
tbxCsr1.setupContext(osContext0);
tbxCsr1.setupContext(osContext1);
EngineControl engineControl1{&tbxCsr1, &osContext1};
MockCommandQueueHw<FamilyType> cmdQ(&context, pClDevice, nullptr);
@@ -47,6 +47,8 @@ HWTEST_F(ClTbxCommandStreamTests, givenTbxCsrWhenDispatchBlitEnqueueThenProcessC
cmdQ.clearBcsEngines();
cmdQ.bcsEngines[0] = &engineControl1;
cmdQ.bcsStates[0] = {aub_stream::ENGINE_BCS, 0, false};
cl_int error = CL_SUCCESS;
std::unique_ptr<Buffer> buffer(Buffer::create(&context, 0, 1, nullptr, error));

View File

@@ -609,6 +609,8 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenAllBcsEnginesReadyWhenWaitingForEventThe
}
clReleaseEvent(event);
commandQueue.reset();
}
HWTEST_TEMPLATED_F(BcsBufferTests, givenMapAllocationWhenEnqueueingReadOrWriteBufferThenStoreMapAllocationInDispatchParameters) {

View File

@@ -272,6 +272,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
using BaseClass::relaxedOrderingForGpgpuAllowed;
using BaseClass::requiresCacheFlushAfterWalker;
using BaseClass::splitBarrierRequired;
using BaseClass::taskCount;
using BaseClass::throttle;
using BaseClass::timestampPacketContainer;