mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-04 15:53:45 +08:00
OCL: Optimize IOQ barriers handling
Related-To: NEO-7458
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
6aadf63725
commit
b3c2fa41c5
@@ -187,6 +187,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
const auto &hwInfo = this->getDevice().getHardwareInfo();
|
||||
auto &productHelper = getDevice().getProductHelper();
|
||||
bool canUsePipeControlInsteadOfSemaphoresForOnCsrDependencies = false;
|
||||
bool isNonStallingIoqBarrier = (CL_COMMAND_BARRIER == commandType) && !isOOQEnabled() && (DebugManager.flags.OptimizeIoqBarriersHandling.get() != 0);
|
||||
bool isNonStallingIoqBarrierWithDependencies = isNonStallingIoqBarrier && (eventsRequest.numEventsInWaitList > 0);
|
||||
|
||||
if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
|
||||
canUsePipeControlInsteadOfSemaphoresForOnCsrDependencies = this->peekLatestSentEnqueueOperation() == EnqueueProperties::Operation::GpuKernel &&
|
||||
@@ -200,7 +202,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
auto allocator = computeCommandStreamReceiver.getTimestampPacketAllocator();
|
||||
|
||||
size_t nodesCount = 0u;
|
||||
if (isCacheFlushCommand(commandType) || isMarkerWithPostSyncWrite) {
|
||||
if (isCacheFlushCommand(commandType) || isMarkerWithPostSyncWrite || isNonStallingIoqBarrierWithDependencies) {
|
||||
nodesCount = 1;
|
||||
} else if (!multiDispatchInfo.empty()) {
|
||||
nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo);
|
||||
@@ -257,7 +259,7 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
} else if (isCacheFlushCommand(commandType)) {
|
||||
processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
|
||||
} else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) {
|
||||
if (CL_COMMAND_BARRIER == commandType) {
|
||||
if (CL_COMMAND_BARRIER == commandType && !isNonStallingIoqBarrier) {
|
||||
setStallingCommandsOnNextFlush(true);
|
||||
this->splitBarrierRequired = true;
|
||||
}
|
||||
@@ -280,6 +282,10 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
|
||||
TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(commandStream, csrDeps, false);
|
||||
}
|
||||
|
||||
if (isNonStallingIoqBarrierWithDependencies) {
|
||||
TimestampPacketHelper::nonStallingContextEndNodeSignal<GfxFamily>(commandStream, *this->timestampPacketContainer->peekNodes()[0], getGpgpuCommandStreamReceiver().isMultiTileOperationEnabled());
|
||||
}
|
||||
|
||||
if (isMarkerWithPostSyncWrite) {
|
||||
if (numEventsInWaitList == 0) {
|
||||
computeCommandStreamReceiver.programComputeBarrierCommand(commandStream);
|
||||
|
||||
@@ -237,6 +237,10 @@ size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, c
|
||||
}
|
||||
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false);
|
||||
|
||||
if ((CL_COMMAND_BARRIER == eventType) && !commandQueue.isOOQEnabled() && eventsInWaitlist) {
|
||||
expectedSizeCS += EncodeStoreMemory<GfxFamily>::getStoreDataImmSize();
|
||||
}
|
||||
|
||||
return expectedSizeCS;
|
||||
}
|
||||
|
||||
@@ -255,6 +259,7 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(bool reserveProfi
|
||||
if (reserveProfilingCmdsSpace) {
|
||||
size += 2 * MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier(false) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2022 Intel Corporation
|
||||
* Copyright (C) 2022-2023 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -246,7 +246,34 @@ HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenImmediateDi
|
||||
EXPECT_EQ(1u, gpgpuCsr->peekTaskCount());
|
||||
}
|
||||
|
||||
HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenCacheFlushNotRequiredWhenDoingBcsCopyAfterBarrierThenSubmitToGpgpu) {
|
||||
// Checks that a blit write enqueued after a kernel and a barrier (empty wait list)
// does not bump the GPGPU CSR task count when the cache-flush-for-BCS override returns false.
HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenCacheFlushNotRequiredWhenDoingBcsCopyAfterBarrierThenDontSubmitToGpgpu) {
|
||||
auto mockCommandQueue = static_cast<MockCommandQueueHw<FamilyType> *>(commandQueue.get());
|
||||
// Nothing has been sent through this queue yet.
EXPECT_EQ(EnqueueProperties::Operation::None, mockCommandQueue->latestSentEnqueueType);
|
||||
|
||||
DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1);
|
||||
|
||||
// Make the mock queue report that no cache flush is required for BCS.
mockCommandQueue->overrideIsCacheFlushForBcsRequired.enabled = true;
|
||||
mockCommandQueue->overrideIsCacheFlushForBcsRequired.returnValue = false;
|
||||
|
||||
auto buffer = createBuffer(1, false);
|
||||
buffer->forceDisallowCPUCopy = true;
|
||||
int hostPtr = 0;
|
||||
|
||||
// The kernel enqueue is the only step expected to raise the GPGPU task count (0 -> 1).
EXPECT_EQ(0u, gpgpuCsr->peekTaskCount());
|
||||
commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
|
||||
EXPECT_EQ(1u, gpgpuCsr->peekTaskCount());
|
||||
|
||||
// Barrier with an empty wait list: task count stays at 1.
commandQueue->enqueueBarrierWithWaitList(0, nullptr, nullptr);
|
||||
EXPECT_EQ(1u, gpgpuCsr->peekTaskCount());
|
||||
|
||||
// The write is recorded as a Blit operation and the GPGPU task count is still 1.
commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr);
|
||||
EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType);
|
||||
EXPECT_EQ(1u, gpgpuCsr->peekTaskCount());
|
||||
}
|
||||
|
||||
HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenCacheFlushNotRequiredAndDebugFlagSetWhenDoingBcsCopyAfterBarrierThenSubmitToGpgpu) {
|
||||
DebugManager.flags.OptimizeIoqBarriersHandling.set(0);
|
||||
|
||||
auto mockCommandQueue = static_cast<MockCommandQueueHw<FamilyType> *>(commandQueue.get());
|
||||
EXPECT_EQ(EnqueueProperties::Operation::None, mockCommandQueue->latestSentEnqueueType);
|
||||
|
||||
|
||||
@@ -5,9 +5,11 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include "shared/source/command_container/command_encoder.h"
|
||||
#include "shared/source/command_stream/command_stream_receiver.h"
|
||||
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
|
||||
#include "shared/test/common/helpers/debug_manager_state_restore.h"
|
||||
#include "shared/test/common/libult/ult_command_stream_receiver.h"
|
||||
#include "shared/test/common/test_macros/test.h"
|
||||
|
||||
#include "opencl/source/command_queue/command_queue_hw.h"
|
||||
@@ -292,6 +294,10 @@ HWTEST_F(BarrierTest, givenEmptyCommandStreamAndBlockedBarrierCommandWhenUserEve
|
||||
|
||||
// Consume all memory except what is needed for this enqueue
|
||||
size_t barrierCmdStreamSize = NEO::EnqueueOperation<FamilyType>::getSizeRequiredCS(CL_COMMAND_BARRIER, false, false, *pCmdQ, nullptr, {});
|
||||
if (pDevice->getUltCommandStreamReceiver<FamilyType>().peekTimestampPacketWriteEnabled()) {
|
||||
barrierCmdStreamSize += EncodeStoreMemory<FamilyType>::getStoreDataImmSize();
|
||||
}
|
||||
|
||||
commandStream.getSpace(commandStream.getMaxAvailableSpace() - barrierCmdStreamSize);
|
||||
|
||||
// now trigger event
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
*/
|
||||
|
||||
#include "shared/source/built_ins/built_ins.h"
|
||||
#include "shared/source/command_container/command_encoder.h"
|
||||
#include "shared/test/common/test_macros/test.h"
|
||||
|
||||
#include "opencl/source/command_queue/command_queue_hw.h"
|
||||
@@ -99,9 +100,10 @@ HWTEST_F(GetSizeRequiredTest, WhenEnqueuingBarrierThenHeapsAndCommandBufferAreNo
|
||||
|
||||
size_t expectedStreamSize = 0;
|
||||
if (pCmdQ->getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
expectedStreamSize = alignUp(MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(
|
||||
pDevice->getRootDeviceEnvironment(), false),
|
||||
MemoryConstants::cacheLineSize);
|
||||
auto unalignedSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pDevice->getRootDeviceEnvironment(), false) +
|
||||
EncodeStoreMemory<FamilyType>::getStoreDataImmSize() +
|
||||
sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
|
||||
expectedStreamSize = alignUp(unalignedSize, MemoryConstants::cacheLineSize);
|
||||
}
|
||||
|
||||
EXPECT_EQ(expectedStreamSize, commandStream.getUsed() - usedBeforeCS);
|
||||
|
||||
@@ -178,7 +178,78 @@ HWTEST_F(TimestampPacketTests, givenWithWaitlistAndEventWhenMarkerProfilingEnabl
|
||||
clReleaseEvent(event);
|
||||
}
|
||||
|
||||
HWTEST_F(TimestampPacketTests, whenEnqueueingBarrierThenRequestPipeControlOnCsrFlush) {
|
||||
// With timestamp packet writes enabled, a barrier with an empty wait list enqueued after a kernel
// must keep the queue's existing timestamp packet node and must not request stalling commands
// on the next flush.
HWTEST_F(TimestampPacketTests, whenEnqueueingBarrierThenDontRequestPipeControlOnCsrFlush) {
|
||||
auto &csr = device->getUltCommandStreamReceiver<FamilyType>();
|
||||
csr.timestampPacketWriteEnabled = true;
|
||||
|
||||
MockCommandQueueHw<FamilyType> cmdQ(context, device.get(), nullptr);
|
||||
EXPECT_FALSE(cmdQ.isStallingCommandsOnNextFlushRequired());
|
||||
|
||||
MockKernelWithInternals mockKernel(*device, context);
|
||||
cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); // obtain first TimestampPackets<uint32_t>
|
||||
|
||||
// Snapshot the queue's nodes before the barrier so they can be compared afterwards.
TimestampPacketContainer cmdQNodes;
|
||||
cmdQNodes.assignAndIncrementNodesRefCounts(*cmdQ.timestampPacketContainer);
|
||||
|
||||
cmdQ.enqueueBarrierWithWaitList(0, nullptr, nullptr);
|
||||
|
||||
EXPECT_EQ(cmdQ.timestampPacketContainer->peekNodes().at(0), cmdQNodes.peekNodes().at(0)); // don't obtain new node
|
||||
EXPECT_EQ(1u, cmdQ.timestampPacketContainer->peekNodes().size());
|
||||
|
||||
EXPECT_FALSE(cmdQ.isStallingCommandsOnNextFlushRequired());
|
||||
}
|
||||
|
||||
// Enqueues a barrier that waits on a kernel's output event and checks the commands programmed
// after the kernel: dependency resolution first (PIPE_CONTROL or two MI_SEMAPHORE_WAITs), then
// an MI_STORE_DATA_IMM targeting the context-end address of the queue's newly obtained node.
HWTEST_F(TimestampPacketTests, givenWaitlistWhenEnqueueingBarrierThenProgramNonStallingBarrier) {
|
||||
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
|
||||
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
|
||||
auto &csr = device->getUltCommandStreamReceiver<FamilyType>();
|
||||
csr.timestampPacketWriteEnabled = true;
|
||||
|
||||
MockKernelWithInternals mockKernel(*device, context);
|
||||
|
||||
MockCommandQueueHw<FamilyType> cmdQ(context, device.get(), nullptr);
|
||||
|
||||
cl_event outEvent;
|
||||
cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &outEvent);
|
||||
auto &cmdStream = cmdQ.getCS(0);
|
||||
// Remember where the kernel's commands end so only the barrier's commands are parsed below.
size_t offset = cmdStream.getUsed();
|
||||
|
||||
// Snapshot the queue's nodes before the barrier so they can be compared afterwards.
TimestampPacketContainer cmdQNodes;
|
||||
cmdQNodes.assignAndIncrementNodesRefCounts(*cmdQ.timestampPacketContainer);
|
||||
|
||||
cmdQ.enqueueBarrierWithWaitList(1, &outEvent, nullptr);
|
||||
|
||||
EXPECT_NE(cmdQ.timestampPacketContainer->peekNodes().at(0), cmdQNodes.peekNodes().at(0)); // obtain new node
|
||||
|
||||
HardwareParse hwParser;
|
||||
hwParser.parseCommands<FamilyType>(cmdStream, offset);
|
||||
|
||||
auto it = hwParser.cmdList.begin();
|
||||
|
||||
// The leading command(s) depend on what the product helper reports for pipe-control dependency resolution.
if (device->getProductHelper().isResolveDependenciesByPipeControlsSupported(device->getHardwareInfo(), false)) {
|
||||
EXPECT_NE(nullptr, genCmdCast<PIPE_CONTROL *>(*it));
|
||||
} else {
|
||||
EXPECT_NE(nullptr, genCmdCast<MI_SEMAPHORE_WAIT *>(*it));
|
||||
EXPECT_NE(nullptr, genCmdCast<MI_SEMAPHORE_WAIT *>(*(++it)));
|
||||
}
|
||||
|
||||
// The very next command must be the store-data-imm; ASSERT so the dereferences below are safe.
auto sdiCmd = genCmdCast<MI_STORE_DATA_IMM *>(*(++it));
|
||||
ASSERT_NE(nullptr, sdiCmd);
|
||||
|
||||
auto expectedGpuVa = TimestampPacketHelper::getContextEndGpuAddress(*cmdQ.timestampPacketContainer->peekNodes()[0]);
|
||||
|
||||
// Expected store: the node's context-end address, StoreQword reported as 0, first data dword 0.
EXPECT_EQ(expectedGpuVa, sdiCmd->getAddress());
|
||||
EXPECT_EQ(0u, sdiCmd->getStoreQword());
|
||||
EXPECT_EQ(0u, sdiCmd->getDataDword0());
|
||||
|
||||
clReleaseEvent(outEvent);
|
||||
}
|
||||
|
||||
HWTEST_F(TimestampPacketTests, givenDebugFlagSetWhenEnqueueingBarrierThenRequestPipeControlOnCsrFlush) {
|
||||
DebugManager.flags.OptimizeIoqBarriersHandling.set(0);
|
||||
|
||||
auto &csr = device->getUltCommandStreamReceiver<FamilyType>();
|
||||
csr.timestampPacketWriteEnabled = true;
|
||||
|
||||
@@ -211,7 +282,7 @@ HWTEST_F(TimestampPacketTests, givenTimestampPacketWriteDisabledWhenEnqueueingBa
|
||||
EXPECT_FALSE(cmdQ.isStallingCommandsOnNextFlushRequired());
|
||||
}
|
||||
|
||||
HWTEST_F(TimestampPacketTests, givenBlockedQueueWhenEnqueueingBarrierThenRequestPipeControlOnCsrFlush) {
|
||||
HWTEST_F(TimestampPacketTests, givenBlockedQueueWhenEnqueueingBarrierThenDontRequestPipeControlOnCsrFlush) {
|
||||
auto &csr = device->getUltCommandStreamReceiver<FamilyType>();
|
||||
csr.timestampPacketWriteEnabled = true;
|
||||
|
||||
@@ -220,7 +291,7 @@ HWTEST_F(TimestampPacketTests, givenBlockedQueueWhenEnqueueingBarrierThenRequest
|
||||
auto userEvent = makeReleaseable<UserEvent>();
|
||||
cl_event waitlist[] = {userEvent.get()};
|
||||
cmdQ.enqueueBarrierWithWaitList(1, waitlist, nullptr);
|
||||
EXPECT_TRUE(cmdQ.isStallingCommandsOnNextFlushRequired());
|
||||
EXPECT_FALSE(cmdQ.isStallingCommandsOnNextFlushRequired());
|
||||
userEvent->setStatus(CL_COMPLETE);
|
||||
}
|
||||
|
||||
|
||||
@@ -675,6 +675,8 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlitEnq
|
||||
}
|
||||
|
||||
HWTEST_TEMPLATED_F(BcsBufferTests, givenBarrierWhenReleasingMultipleBlockedEnqueuesThenProgramBarrierOnce) {
|
||||
DebugManager.flags.OptimizeIoqBarriersHandling.set(0);
|
||||
|
||||
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
|
||||
|
||||
auto cmdQ = clUniquePtr(new MockCommandQueueHw<FamilyType>(bcsMockContext.get(), device.get(), nullptr));
|
||||
|
||||
Reference in New Issue
Block a user