Fix optimize timestamp packet dependencies

-program barrier after global fence allocation is programmed
-do not double barrier timestamp in blit enqueue
-flush GPGPU while submitting to BCS when barrier requested

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2022-02-02 16:30:03 +00:00
committed by Compute-Runtime-Automation
parent 45d23868de
commit 9ff1307b4b
7 changed files with 112 additions and 56 deletions

View File

@ -1,12 +1,17 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/test_macros/test_checks_shared.h"
#include "opencl/source/command_queue/command_queue.h"
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
#include "opencl/test/unit_test/fixtures/context_fixture.h"
#include "opencl/test/unit_test/mocks/mock_context.h"
#include "CL/cl.h"
@ -85,4 +90,37 @@ static const cl_command_queue_properties DefaultCommandQueueProperties[] = {
0,
CL_QUEUE_PROFILING_ENABLE,
};
// Fixture for command-queue tests that need blitter operations enabled.
// The template argument selects the queue flavour: `true` creates the queue with
// CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, `false` creates it with no properties.
template <bool ooq>
struct CommandQueueHwBlitTest : ClDeviceFixture, ContextFixture, CommandQueueHwFixture, ::testing::Test {
    using ContextFixture::SetUp;

    // Builds device, context and queue (in that order) on a private copy of the
    // default hardware info with blitter support forced on.
    void SetUp() override {
        hwInfo = *::defaultHwInfo;
        hwInfo.capabilityTable.blitterOperationsSupported = true;
        // Presumably skips the test when the platform lacks full blitter support
        // (inferred from the macro name) - TODO confirm against its definition.
        REQUIRE_FULL_BLITTER_OR_SKIP(&hwInfo);

        // Debug flags are written before any fixture object is created.
        auto &debugVariables = DebugManager.flags;
        debugVariables.EnableBlitterOperationsSupport.set(1);
        debugVariables.EnableTimestampPacket.set(1);
        debugVariables.PreferCopyEngineForCopyBufferToBuffer.set(1);

        ClDeviceFixture::SetUpImpl(&hwInfo);

        cl_device_id clDeviceId = pClDevice;
        ContextFixture::SetUp(1, &clDeviceId);

        const cl_command_queue_properties requestedQueueProperties = ooq ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0;
        CommandQueueHwFixture::SetUp(pClDevice, requestedQueueProperties);
    }

    // Releases the fixtures in the reverse order of their creation in SetUp.
    void TearDown() override {
        CommandQueueHwFixture::TearDown();
        ContextFixture::TearDown();
        ClDeviceFixture::TearDown();
    }

    // Hardware info handed to the device fixture; filled in by SetUp.
    HardwareInfo hwInfo{};
    // Lives as long as the fixture object; presumably restores debug flags on
    // destruction (inferred from the type name).
    DebugManagerStateRestore state{};
};
using IoqCommandQueueHwBlitTest = CommandQueueHwBlitTest<false>;
using OoqCommandQueueHwBlitTest = CommandQueueHwBlitTest<true>;
} // namespace NEO

View File

@ -15,16 +15,12 @@
#include "shared/test/common/mocks/mock_os_library.h"
#include "shared/test/common/mocks/mock_source_level_debugger.h"
#include "shared/test/common/test_macros/matchers.h"
#include "shared/test/common/test_macros/test.h"
#include "shared/test/common/test_macros/test_checks_shared.h"
#include "shared/test/unit_test/utilities/base_object_utils.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/helpers/dispatch_info_builder.h"
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
#include "opencl/test/unit_test/fixtures/buffer_fixture.h"
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
#include "opencl/test/unit_test/fixtures/context_fixture.h"
#include "opencl/test/unit_test/fixtures/image_fixture.h"
#include "opencl/test/unit_test/mocks/mock_buffer.h"
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
@ -1501,38 +1497,6 @@ HWTEST_F(CommandQueueHwTest, givenFinishWhenFlushBatchedSubmissionsFailsThenErro
EXPECT_EQ(CL_OUT_OF_RESOURCES, errorCode);
}
// Fixture for command-queue tests that need blitter operations enabled.
// The template argument selects the queue flavour: `true` creates the queue with
// CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, `false` creates it with no properties.
template <bool ooq>
struct CommandQueueHwBlitTest : ClDeviceFixture, ContextFixture, CommandQueueHwFixture, ::testing::Test {
    using ContextFixture::SetUp;

    // Builds device, context and queue (in that order) on a private copy of the
    // default hardware info with blitter support forced on.
    void SetUp() override {
        hwInfo = *::defaultHwInfo;
        hwInfo.capabilityTable.blitterOperationsSupported = true;
        // Presumably skips the test when the platform lacks full blitter support
        // (inferred from the macro name) - TODO confirm against its definition.
        REQUIRE_FULL_BLITTER_OR_SKIP(&hwInfo);

        // Debug flags are written before any fixture object is created.
        auto &debugVariables = DebugManager.flags;
        debugVariables.EnableBlitterOperationsSupport.set(1);
        debugVariables.EnableTimestampPacket.set(1);
        debugVariables.PreferCopyEngineForCopyBufferToBuffer.set(1);

        ClDeviceFixture::SetUpImpl(&hwInfo);

        cl_device_id clDeviceId = pClDevice;
        ContextFixture::SetUp(1, &clDeviceId);

        const cl_command_queue_properties requestedQueueProperties = ooq ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0;
        CommandQueueHwFixture::SetUp(pClDevice, requestedQueueProperties);
    }

    // Releases the fixtures in the reverse order of their creation in SetUp.
    void TearDown() override {
        CommandQueueHwFixture::TearDown();
        ContextFixture::TearDown();
        ClDeviceFixture::TearDown();
    }

    // Hardware info handed to the device fixture; filled in by SetUp.
    HardwareInfo hwInfo{};
    // Lives as long as the fixture object; presumably restores debug flags on
    // destruction (inferred from the type name).
    DebugManagerStateRestore state{};
};
using IoqCommandQueueHwBlitTest = CommandQueueHwBlitTest<false>;
using OoqCommandQueueHwBlitTest = CommandQueueHwBlitTest<true>;
HWTEST_F(IoqCommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingSubsequentBlitsThenGpgpuCommandStreamIsNotObtained) {
auto &gpgpuCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
auto srcBuffer = std::unique_ptr<Buffer>{BufferHelper<>::create(pContext)};
@ -1648,6 +1612,7 @@ HWTEST_F(OoqCommandQueueHwBlitTest, givenBlitAfterBarrierWhenEnqueueingCommandTh
HWTEST_F(OoqCommandQueueHwBlitTest, givenBlitBeforeBarrierWhenEnqueueingCommandThenWaitForBlitBeforeBarrier) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT;
if (pCmdQ->getTimestampPacketContainer() == nullptr) {
GTEST_SKIP();
@ -1702,6 +1667,10 @@ HWTEST_F(OoqCommandQueueHwBlitTest, givenBlitBeforeBarrierWhenEnqueueingCommandT
const auto semaphore = genCmdCast<MI_SEMAPHORE_WAIT *>(*semaphoreItor);
EXPECT_EQ(barrierNodeAddress, semaphore->getSemaphoreGraphicsAddress());
EXPECT_EQ(bcsHwParser.cmdList.end(), find<PIPE_CONTROL *>(semaphoreItor, bcsHwParser.cmdList.end()));
// Only one barrier semaphore from first BCS enqueue
const auto blitItor = find<XY_COPY_BLT *>(bcsHwParser.cmdList.begin(), bcsHwParser.cmdList.end());
EXPECT_EQ(1u, findAll<MI_SEMAPHORE_WAIT *>(bcsHwParser.cmdList.begin(), blitItor).size());
}
EXPECT_EQ(CL_SUCCESS, pCmdQ->finish());

View File

@ -1,19 +1,24 @@
/*
* Copyright (C) 2021 Intel Corporation
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/mocks/mock_graphics_allocation.h"
#include "shared/test/common/test_macros/test.h"
#include "shared/test/unit_test/utilities/base_object_utils.h"
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
#include "opencl/test/unit_test/fixtures/buffer_fixture.h"
#include "opencl/test/unit_test/mocks/mock_buffer.h"
#include "opencl/test/unit_test/mocks/mock_cl_device.h"
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
#include "opencl/test/unit_test/mocks/mock_context.h"
#include "opencl/test/unit_test/mocks/mock_kernel.h"
using namespace NEO;
@ -460,3 +465,42 @@ HWTEST2_F(BcsCsrSelectionCommandQueueTests, givenMultipleEnginesInQueueWhenSelec
EXPECT_EQ(queue->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS2), &queue->selectCsrForBuiltinOperation(args));
}
}
// Scenario: on an out-of-order queue, two non-blocking buffer reads and a barrier are
// enqueued, followed by the first kernel. The test parses only what the kernel enqueue
// appended to the GPGPU command stream and expects to find a
// STATE_SYSTEM_MEM_FENCE_ADDRESS command with an MI_MEM_FENCE command after it.
HWTEST2_F(OoqCommandQueueHwBlitTest, givenBarrierBeforeFirstKernelWhenEnqueueNDRangeThenProgramBarrierBeforeGlobalAllocation, IsPVC) {
    using STATE_SYSTEM_MEM_FENCE_ADDRESS = typename FamilyType::STATE_SYSTEM_MEM_FENCE_ADDRESS;
    using MI_MEM_FENCE = typename FamilyType::MI_MEM_FENCE;

    // Nothing to verify when the queue has no timestamp packet container.
    if (pCmdQ->getTimestampPacketContainer() == nullptr) {
        GTEST_SKIP();
    }

    DebugManagerStateRestore restore{};
    DebugManager.flags.DoCpuCopyOnReadBuffer.set(0);
    DebugManager.flags.ForceCacheFlushForBcs.set(0);
    DebugManager.flags.UpdateTaskCountFromWait.set(1);
    DebugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(1);

    MockKernelWithInternals mockKernelWithInternals(*pClDevice);
    MockKernel *kernel = mockKernelWithInternals.mockKernel;
    size_t offset = 0;
    size_t gws = 1;

    BufferDefaults::context = context;
    auto buffer = clUniquePtr(BufferHelper<>::create());
    char ptr[1] = {};

    // Two non-blocking reads followed by a barrier; presumably the reads are serviced
    // by the copy engine given the fixture's flags - TODO confirm.
    EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr));
    EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr));
    EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueBarrierWithWaitList(0, nullptr, nullptr));

    // Remember the stream offset so that only the kernel enqueue's commands are parsed.
    auto ccsStart = pCmdQ->getGpgpuCommandStreamReceiver().getCS().getUsed();

    EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueKernel(kernel, 1, &offset, &gws, nullptr, 0, nullptr, nullptr));

    HardwareParse ccsHwParser;
    ccsHwParser.parseCommands<FamilyType>(pCmdQ->getGpgpuCommandStreamReceiver().getCS(0), ccsStart);

    // The MI_MEM_FENCE search starts at the fence-address command, so a hit proves ordering.
    const auto memFenceStateItor = find<STATE_SYSTEM_MEM_FENCE_ADDRESS *>(ccsHwParser.cmdList.begin(), ccsHwParser.cmdList.end());
    const auto memFenceItor = find<MI_MEM_FENCE *>(memFenceStateItor, ccsHwParser.cmdList.end());

    // Check the anchor command first so a missing fence-address is reported as the root cause.
    EXPECT_NE(ccsHwParser.cmdList.end(), memFenceStateItor);
    EXPECT_NE(ccsHwParser.cmdList.end(), memFenceItor);
}