/* * Copyright (C) 2019-2021 Intel Corporation * * SPDX-License-Identifier: MIT * */ #include "shared/source/helpers/pause_on_gpu_properties.h" #include "shared/source/helpers/vec.h" #include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/unit_test_helper.h" #include "shared/test/common/helpers/variable_backup.h" #include "shared/test/common/mocks/mock_device.h" #include "shared/test/unit_test/compiler_interface/linker_mock.h" #include "shared/test/unit_test/utilities/base_object_utils.h" #include "opencl/source/event/user_event.h" #include "opencl/source/mem_obj/buffer.h" #include "opencl/test/unit_test/mocks/mock_cl_device.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" #include "opencl/test/unit_test/mocks/mock_context.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" #include "opencl/test/unit_test/mocks/mock_program.h" #include "opencl/test/unit_test/mocks/mock_timestamp_container.h" #include "opencl/test/unit_test/test_macros/test_checks_ocl.h" #include "test.h" namespace NEO { extern CommandStreamReceiverCreateFunc commandStreamReceiverFactory[2 * IGFX_MAX_CORE]; template struct BlitEnqueueTests : public ::testing::Test { class BcsMockContext : public MockContext { public: BcsMockContext(ClDevice *device) : MockContext(device) { bcsOsContext.reset(OsContext::create(nullptr, 0, 0, EngineTypeUsage{aub_stream::ENGINE_BCS, EngineUsage::Regular}, PreemptionMode::Disabled, false)); bcsCsr.reset(createCommandStream(*device->getExecutionEnvironment(), device->getRootDeviceIndex(), device->getDeviceBitfield())); bcsCsr->setupContext(*bcsOsContext); bcsCsr->initializeTagAllocation(); auto mockBlitMemoryToAllocation = [this](const Device &device, GraphicsAllocation *memory, size_t offset, const void *hostPtr, Vec3 size) -> BlitOperationResult { if 
(!device.getRootDeviceEnvironment().getMutableHardwareInfo()->capabilityTable.blitterOperationsSupported) { return BlitOperationResult::Unsupported; } auto blitProperties = BlitProperties::constructPropertiesForReadWriteBuffer(BlitterConstants::BlitDirection::HostPtrToBuffer, *bcsCsr, memory, nullptr, hostPtr, memory->getGpuAddress(), 0, 0, 0, size, 0, 0, 0, 0); BlitPropertiesContainer container; container.push_back(blitProperties); bcsCsr->blitBuffer(container, true, false); return BlitOperationResult::Success; }; blitMemoryToAllocationFuncBackup = mockBlitMemoryToAllocation; } std::unique_ptr bcsOsContext; std::unique_ptr bcsCsr; VariableBackup blitMemoryToAllocationFuncBackup{ &BlitHelperFunctions::blitMemoryToAllocation}; }; template void SetUpT() { if (is32bit) { GTEST_SKIP(); } REQUIRE_AUX_RESOLVES(); DebugManager.flags.EnableTimestampPacket.set(timestampPacketEnabled); DebugManager.flags.EnableBlitterForEnqueueOperations.set(1); DebugManager.flags.ForceAuxTranslationMode.set(static_cast(AuxTranslationMode::Blit)); DebugManager.flags.RenderCompressedBuffersEnabled.set(1); DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(1); DebugManager.flags.CsrDispatchMode.set(static_cast(DispatchMode::ImmediateDispatch)); DebugManager.flags.EnableLocalMemory.set(1); device = std::make_unique(MockDevice::createWithNewExecutionEnvironment(nullptr)); auto &capabilityTable = device->getRootDeviceEnvironment().getMutableHardwareInfo()->capabilityTable; bool createBcsEngine = !capabilityTable.blitterOperationsSupported; capabilityTable.blitterOperationsSupported = true; if (createBcsEngine) { auto &engine = device->getEngine(getChosenEngineType(device->getHardwareInfo()), EngineUsage::LowPriority); bcsOsContext.reset(OsContext::create(nullptr, 1, device->getDeviceBitfield(), EngineTypeUsage{aub_stream::ENGINE_BCS, EngineUsage::Regular}, PreemptionMode::Disabled, false)); engine.osContext = bcsOsContext.get(); engine.commandStreamReceiver->setupContext(*bcsOsContext); } 
bcsMockContext = std::make_unique(device.get()); auto mockCmdQueue = new MockCommandQueueHw(bcsMockContext.get(), device.get(), nullptr); commandQueue.reset(mockCmdQueue); mockKernel = std::make_unique(*device, bcsMockContext.get()); auto mockProgram = mockKernel->mockProgram; mockProgram->setAllowNonUniform(true); gpgpuCsr = mockCmdQueue->gpgpuEngine->commandStreamReceiver; bcsCsr = mockCmdQueue->bcsEngine->commandStreamReceiver; } template void TearDownT() {} template void setMockKernelArgs(std::array buffers) { for (uint32_t i = 0; i < buffers.size(); i++) { mockKernel->kernelInfo.addArgBuffer(i, 0); } mockKernel->mockKernel->initialize(); EXPECT_TRUE(mockKernel->mockKernel->auxTranslationRequired); for (uint32_t i = 0; i < buffers.size(); i++) { cl_mem clMem = buffers[i]; mockKernel->mockKernel->setArgBuffer(i, sizeof(cl_mem *), &clMem); } } template void setMockKernelArgs(std::array allocs) { for (uint32_t i = 0; i < allocs.size(); i++) { mockKernel->kernelInfo.addArgBuffer(i, 0); } mockKernel->mockKernel->initialize(); EXPECT_TRUE(mockKernel->mockKernel->auxTranslationRequired); for (uint32_t i = 0; i < allocs.size(); i++) { auto alloc = allocs[i]; auto ptr = reinterpret_cast(alloc->getGpuAddressToPatch()); mockKernel->mockKernel->setArgSvmAlloc(i, ptr, alloc); } } ReleaseableObjectPtr createBuffer(size_t size, bool compressed) { auto buffer = clUniquePtr(Buffer::create(bcsMockContext.get(), CL_MEM_READ_WRITE, size, nullptr, retVal)); if (compressed) { buffer->getGraphicsAllocation(device->getRootDeviceIndex())->setAllocationType(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED); } else { buffer->getGraphicsAllocation(device->getRootDeviceIndex())->setAllocationType(GraphicsAllocation::AllocationType::BUFFER); } return buffer; } std::unique_ptr createGfxAllocation(size_t size, bool compressed) { auto alloc = std::unique_ptr(new MockGraphicsAllocation(nullptr, size)); if (compressed) { 
alloc->setAllocationType(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED); } else { alloc->setAllocationType(GraphicsAllocation::AllocationType::BUFFER); } return alloc; } template GenCmdList getCmdList(LinearStream &linearStream, size_t offset) { HardwareParse hwParser; hwParser.parseCommands(linearStream, offset); return hwParser.cmdList; } template GenCmdList::iterator expectPipeControl(GenCmdList::iterator itorStart, GenCmdList::iterator itorEnd) { using PIPE_CONTROL = typename Family::PIPE_CONTROL; PIPE_CONTROL *pipeControlCmd = nullptr; GenCmdList::iterator commandItor = itorStart; bool stallingWrite = false; do { commandItor = find(commandItor, itorEnd); if (itorEnd == commandItor) { EXPECT_TRUE(false); return itorEnd; } pipeControlCmd = genCmdCast(*commandItor); stallingWrite = pipeControlCmd->getPostSyncOperation() == PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA && pipeControlCmd->getCommandStreamerStallEnable(); ++commandItor; } while (!stallingWrite); return --commandItor; } template GenCmdList::iterator expectMiFlush(GenCmdList::iterator itorStart, GenCmdList::iterator itorEnd) { Family *miFlushCmd = nullptr; GenCmdList::iterator commandItor = itorStart; bool miFlushWithMemoryWrite = false; do { commandItor = find(commandItor, itorEnd); if (itorEnd == commandItor) { EXPECT_TRUE(false); return itorEnd; } miFlushCmd = genCmdCast(*commandItor); miFlushWithMemoryWrite = miFlushCmd->getDestinationAddress() != 0; ++commandItor; } while (!miFlushWithMemoryWrite); return --commandItor; } template GenCmdList::iterator expectCommand(GenCmdList::iterator itorStart, GenCmdList::iterator itorEnd) { auto commandItor = find(itorStart, itorEnd); EXPECT_TRUE(commandItor != itorEnd); return commandItor; } template void verifySemaphore(GenCmdList::iterator &semaphoreItor, uint64_t expectedAddress) { using MI_SEMAPHORE_WAIT = typename Family::MI_SEMAPHORE_WAIT; auto semaphoreCmd = genCmdCast(*semaphoreItor); EXPECT_EQ(expectedAddress, 
semaphoreCmd->getSemaphoreGraphicsAddress()); } DebugManagerStateRestore restore; std::unique_ptr bcsOsContext; std::unique_ptr device; std::unique_ptr bcsMockContext; std::unique_ptr commandQueue; std::unique_ptr mockKernel; CommandStreamReceiver *bcsCsr = nullptr; CommandStreamReceiver *gpgpuCsr = nullptr; size_t gws[3] = {63, 0, 0}; size_t lws[3] = {16, 0, 0}; uint32_t hostPtr = 0; cl_int retVal = CL_SUCCESS; }; using BlitAuxTranslationTests = BlitEnqueueTests<1>; HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenConstructingCommandBufferThenEnsureCorrectOrder) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using WALKER_TYPE = typename FamilyType::WALKER_TYPE; using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; auto buffer0 = createBuffer(1, true); auto buffer1 = createBuffer(1, false); auto buffer2 = createBuffer(1, true); setMockKernelArgs(std::array{{buffer0.get(), buffer1.get(), buffer2.get()}}); auto mockCmdQ = static_cast *>(commandQueue.get()); auto initialBcsTaskCount = mockCmdQ->bcsTaskCount; mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); EXPECT_EQ(mockCmdQ->bcsTaskCount, initialBcsTaskCount + 1); // Gpgpu command buffer { auto cmdListCsr = getCmdList(gpgpuCsr->getCS(0), 0); auto cmdListQueue = getCmdList(commandQueue->getCS(0), 0); // Barrier expectPipeControl(cmdListCsr.begin(), cmdListCsr.end()); // Aux to NonAux auto cmdFound = expectCommand(cmdListQueue.begin(), cmdListQueue.end()); cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); // Walker cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); // NonAux to Aux cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); // task count expectPipeControl(++cmdFound, 
cmdListQueue.end()); } // BCS command buffer { auto cmdList = getCmdList(bcsCsr->getCS(0), 0); // Barrier auto cmdFound = expectCommand(cmdList.begin(), cmdList.end()); // Aux to NonAux cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); // wait for NDR (walker split) cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); // NonAux to Aux cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); // taskCount expectCommand(++cmdFound, cmdList.end()); } } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenConstructingBlockedCommandBufferThenEnsureCorrectOrder) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using WALKER_TYPE = typename FamilyType::WALKER_TYPE; using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; auto buffer0 = createBuffer(1, true); auto buffer1 = createBuffer(1, false); auto buffer2 = createBuffer(1, true); setMockKernelArgs(std::array{{buffer0.get(), buffer1.get(), buffer2.get()}}); auto mockCmdQ = static_cast *>(commandQueue.get()); auto initialBcsTaskCount = mockCmdQ->bcsTaskCount; UserEvent userEvent; cl_event waitlist[] = {&userEvent}; mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, lws, 1, waitlist, nullptr); userEvent.setStatus(CL_COMPLETE); EXPECT_EQ(mockCmdQ->bcsTaskCount, initialBcsTaskCount + 1); // Gpgpu command buffer { auto cmdListCsr = getCmdList(gpgpuCsr->getCS(0), 0); auto ultCsr = static_cast *>(gpgpuCsr); auto cmdListQueue = getCmdList(*ultCsr->lastFlushedCommandStream, 0); // Barrier 
expectPipeControl(cmdListCsr.begin(), cmdListCsr.end()); // Aux to NonAux auto cmdFound = expectCommand(cmdListQueue.begin(), cmdListQueue.end()); cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); // Walker cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); // NonAux to Aux cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); // task count expectPipeControl(++cmdFound, cmdListQueue.end()); } // BCS command buffer { auto cmdList = getCmdList(bcsCsr->getCS(0), 0); // Barrier auto cmdFound = expectCommand(cmdList.begin(), cmdList.end()); // Aux to NonAux cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); // wait for NDR (walker split) cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); // NonAux to Aux cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); cmdFound = expectCommand(++cmdFound, cmdList.end()); // taskCount expectCommand(++cmdFound, cmdList.end()); } EXPECT_FALSE(mockCmdQ->isQueueBlocked()); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingCommandBufferThenSynchronizeBarrier) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; auto buffer = createBuffer(1, true); setMockKernelArgs(std::array{{buffer.get()}}); commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); auto cmdListCsr = getCmdList(gpgpuCsr->getCS(0), 0); auto pipeControl = expectPipeControl(cmdListCsr.begin(), cmdListCsr.end()); auto pipeControlCmd = genCmdCast(*pipeControl); uint64_t low = 
pipeControlCmd->getAddress(); uint64_t high = pipeControlCmd->getAddressHigh(); uint64_t barrierGpuAddress = (high << 32) | low; auto cmdList = getCmdList(bcsCsr->getCS(0), 0); auto semaphore = expectCommand(cmdList.begin(), cmdList.end()); verifySemaphore(semaphore, barrierGpuAddress); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, whenFlushTagUpdateThenMiFlushDwIsFlushed) { using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; bcsCsr->flushTagUpdate(); auto cmdListBcs = getCmdList(bcsCsr->getCS(0), 0); auto cmdFound = expectCommand(cmdListBcs.begin(), cmdListBcs.end()); EXPECT_NE(cmdFound, cmdListBcs.end()); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingCommandBufferThenSynchronizeBcsOutput) { using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using WALKER_TYPE = typename FamilyType::WALKER_TYPE; auto buffer0 = createBuffer(1, true); auto buffer1 = createBuffer(1, true); setMockKernelArgs(std::array{{buffer0.get(), buffer1.get()}}); commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); uint64_t auxToNonAuxOutputAddress[2] = {}; uint64_t nonAuxToAuxOutputAddress[2] = {}; { auto cmdListBcs = getCmdList(bcsCsr->getCS(0), 0); auto cmdFound = expectCommand(cmdListBcs.begin(), cmdListBcs.end()); cmdFound = expectMiFlush(++cmdFound, cmdListBcs.end()); auto miflushDwCmd = genCmdCast(*cmdFound); auxToNonAuxOutputAddress[0] = miflushDwCmd->getDestinationAddress(); cmdFound = expectMiFlush(++cmdFound, cmdListBcs.end()); miflushDwCmd = genCmdCast(*cmdFound); auxToNonAuxOutputAddress[1] = miflushDwCmd->getDestinationAddress(); cmdFound = expectCommand(++cmdFound, cmdListBcs.end()); cmdFound = expectMiFlush(++cmdFound, cmdListBcs.end()); miflushDwCmd = genCmdCast(*cmdFound); nonAuxToAuxOutputAddress[0] = miflushDwCmd->getDestinationAddress(); cmdFound = 
expectMiFlush(++cmdFound, cmdListBcs.end()); miflushDwCmd = genCmdCast(*cmdFound); nonAuxToAuxOutputAddress[1] = miflushDwCmd->getDestinationAddress(); } { auto cmdListQueue = getCmdList(commandQueue->getCS(0), 0); // Aux to NonAux auto cmdFound = expectCommand(cmdListQueue.begin(), cmdListQueue.end()); verifySemaphore(cmdFound, auxToNonAuxOutputAddress[0]); cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); verifySemaphore(cmdFound, auxToNonAuxOutputAddress[1]); // Walker cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); // NonAux to Aux cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); verifySemaphore(cmdFound, nonAuxToAuxOutputAddress[0]); cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); verifySemaphore(cmdFound, nonAuxToAuxOutputAddress[1]); } } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingCommandBufferThenSynchronizeKernel) { using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; auto buffer = createBuffer(1, true); setMockKernelArgs(std::array{{buffer.get()}}); auto mockCmdQ = static_cast *>(commandQueue.get()); mockCmdQ->overrideIsCacheFlushForBcsRequired.enabled = true; mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = false; mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); auto kernelNode = mockCmdQ->timestampPacketContainer->peekNodes()[0]; auto kernelNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*kernelNode); auto cmdList = getCmdList(bcsCsr->getCS(0), 0); // Aux to nonAux auto cmdFound = expectCommand(cmdList.begin(), cmdList.end()); // semaphore before NonAux to Aux auto semaphore = expectCommand(++cmdFound, cmdList.end()); verifySemaphore(semaphore, kernelNodeAddress); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingCommandBufferThenSynchronizeCacheFlush) { using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; using 
MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using WALKER_TYPE = typename FamilyType::WALKER_TYPE; auto buffer = createBuffer(1, true); setMockKernelArgs(std::array{{buffer.get()}}); auto mockCmdQ = static_cast *>(commandQueue.get()); mockCmdQ->overrideIsCacheFlushForBcsRequired.enabled = true; mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = true; mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); auto cmdListBcs = getCmdList(bcsCsr->getCS(0), 0); auto cmdListQueue = getCmdList(mockCmdQ->getCS(0), 0); uint64_t cacheFlushWriteAddress = 0; { auto cmdFound = expectCommand(cmdListQueue.begin(), cmdListQueue.end()); cmdFound = expectPipeControl(++cmdFound, cmdListQueue.end()); auto pipeControlCmd = genCmdCast(*cmdFound); if (!pipeControlCmd->getDcFlushEnable()) { // skip pipe control with TimestampPacket write cmdFound = expectPipeControl(++cmdFound, cmdListQueue.end()); pipeControlCmd = genCmdCast(*cmdFound); } EXPECT_TRUE(pipeControlCmd->getDcFlushEnable()); EXPECT_TRUE(pipeControlCmd->getCommandStreamerStallEnable()); uint64_t low = pipeControlCmd->getAddress(); uint64_t high = pipeControlCmd->getAddressHigh(); cacheFlushWriteAddress = (high << 32) | low; EXPECT_NE(0u, cacheFlushWriteAddress); } { // Aux to nonAux auto cmdFound = expectCommand(cmdListBcs.begin(), cmdListBcs.end()); // semaphore before NonAux to Aux cmdFound = expectCommand(++cmdFound, cmdListBcs.end()); verifySemaphore(cmdFound, cacheFlushWriteAddress); } } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingCommandBufferThenSynchronizeEvents) { using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; auto buffer = createBuffer(1, true); setMockKernelArgs(std::array{{buffer.get()}}); auto event = make_releaseable(commandQueue.get(), CL_COMMAND_READ_BUFFER, 0, 0); 
MockTimestampPacketContainer eventDependencyContainer(*bcsCsr->getTimestampPacketAllocator(), 1); auto eventDependency = eventDependencyContainer.getNode(0); event->addTimestampPacketNodes(eventDependencyContainer); cl_event clEvent[] = {event.get()}; commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 1, clEvent, nullptr); auto eventDependencyAddress = TimestampPacketHelper::getContextEndGpuAddress(*eventDependency); auto cmdList = getCmdList(bcsCsr->getCS(0), 0); // Barrier auto cmdFound = expectCommand(cmdList.begin(), cmdList.end()); // Event auto semaphore = expectCommand(++cmdFound, cmdList.end()); verifySemaphore(semaphore, eventDependencyAddress); cmdFound = expectCommand(++semaphore, cmdList.end()); expectCommand(++cmdFound, cmdList.end()); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenOutEventWhenDispatchingThenAssignNonAuxNodes) { using WALKER_TYPE = typename FamilyType::WALKER_TYPE; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; auto buffer0 = createBuffer(1, true); auto buffer1 = createBuffer(1, false); auto buffer2 = createBuffer(1, true); setMockKernelArgs(std::array{{buffer0.get(), buffer1.get(), buffer2.get()}}); cl_event clEvent; commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, &clEvent); auto event = castToObject(clEvent); auto &eventNodes = event->getTimestampPacketNodes()->peekNodes(); EXPECT_EQ(5u, eventNodes.size()); auto cmdListQueue = getCmdList(commandQueue->getCS(0), 0); auto cmdFound = expectCommand(cmdListQueue.begin(), cmdListQueue.end()); // NonAux to Aux cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); auto eventNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*eventNodes[1]); verifySemaphore(cmdFound, eventNodeAddress); cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); eventNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*eventNodes[2]); verifySemaphore(cmdFound, eventNodeAddress); 
clReleaseEvent(clEvent); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenDispatchingThenEstimateCmdBufferSize) { using WALKER_TYPE = typename FamilyType::WALKER_TYPE; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; auto &hwInfo = device->getHardwareInfo(); auto mockCmdQ = static_cast *>(commandQueue.get()); mockCmdQ->overrideIsCacheFlushForBcsRequired.enabled = true; mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = false; auto buffer0 = createBuffer(1, true); auto buffer1 = createBuffer(1, false); auto buffer2 = createBuffer(1, true); KernelObjsForAuxTranslation kernelObjects; kernelObjects.insert({KernelObjForAuxTranslation::Type::MEM_OBJ, buffer0.get()}); kernelObjects.insert({KernelObjForAuxTranslation::Type::MEM_OBJ, buffer2.get()}); size_t numBuffersToEstimate = 2; size_t dependencySize = numBuffersToEstimate * TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); setMockKernelArgs(std::array{{buffer0.get(), buffer1.get(), buffer2.get()}}); mockCmdQ->storeMultiDispatchInfo = true; mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); MultiDispatchInfo &multiDispatchInfo = mockCmdQ->storedMultiDispatchInfo; DispatchInfo *firstDispatchInfo = multiDispatchInfo.begin(); DispatchInfo *lastDispatchInfo = &(*multiDispatchInfo.rbegin()); EXPECT_NE(firstDispatchInfo, lastDispatchInfo); // walker split EXPECT_EQ(dependencySize, firstDispatchInfo->dispatchInitCommands.estimateCommandsSize(kernelObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); EXPECT_EQ(0u, firstDispatchInfo->dispatchEpilogueCommands.estimateCommandsSize(kernelObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); EXPECT_EQ(0u, lastDispatchInfo->dispatchInitCommands.estimateCommandsSize(kernelObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); EXPECT_EQ(dependencySize, 
lastDispatchInfo->dispatchEpilogueCommands.estimateCommandsSize(kernelObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWithRequiredCacheFlushWhenDispatchingThenEstimateCmdBufferSize) { using WALKER_TYPE = typename FamilyType::WALKER_TYPE; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; auto &hwInfo = device->getHardwareInfo(); auto mockCmdQ = static_cast *>(commandQueue.get()); mockCmdQ->overrideIsCacheFlushForBcsRequired.enabled = true; mockCmdQ->overrideIsCacheFlushForBcsRequired.returnValue = true; auto buffer0 = createBuffer(1, true); auto buffer1 = createBuffer(1, false); auto buffer2 = createBuffer(1, true); KernelObjsForAuxTranslation kernelObjects; kernelObjects.insert({KernelObjForAuxTranslation::Type::MEM_OBJ, buffer0.get()}); kernelObjects.insert({KernelObjForAuxTranslation::Type::MEM_OBJ, buffer2.get()}); size_t numBuffersToEstimate = 2; size_t dependencySize = numBuffersToEstimate * TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); size_t cacheFlushSize = MemorySynchronizationCommands::getSizeForPipeControlWithPostSyncOperation(hwInfo); setMockKernelArgs(std::array{{buffer0.get(), buffer1.get(), buffer2.get()}}); mockCmdQ->storeMultiDispatchInfo = true; mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, lws, 0, nullptr, nullptr); MultiDispatchInfo &multiDispatchInfo = mockCmdQ->storedMultiDispatchInfo; DispatchInfo *firstDispatchInfo = multiDispatchInfo.begin(); DispatchInfo *lastDispatchInfo = &(*multiDispatchInfo.rbegin()); EXPECT_NE(firstDispatchInfo, lastDispatchInfo); // walker split EXPECT_EQ(dependencySize, firstDispatchInfo->dispatchInitCommands.estimateCommandsSize(kernelObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); EXPECT_EQ(0u, firstDispatchInfo->dispatchEpilogueCommands.estimateCommandsSize(kernelObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); 
EXPECT_EQ(0u, lastDispatchInfo->dispatchInitCommands.estimateCommandsSize(kernelObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); EXPECT_EQ(dependencySize + cacheFlushSize, lastDispatchInfo->dispatchEpilogueCommands.estimateCommandsSize(kernelObjects.size(), hwInfo, mockCmdQ->isCacheFlushForBcsRequired())); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingBlockedCommandBufferThenSynchronizeBarrier) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; auto buffer = createBuffer(1, true); setMockKernelArgs(std::array{{buffer.get()}}); UserEvent userEvent; cl_event waitlist[] = {&userEvent}; commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 1, waitlist, nullptr); userEvent.setStatus(CL_COMPLETE); auto cmdListCsr = getCmdList(gpgpuCsr->getCS(0), 0); auto pipeControl = expectPipeControl(cmdListCsr.begin(), cmdListCsr.end()); auto pipeControlCmd = genCmdCast(*pipeControl); uint64_t low = pipeControlCmd->getAddress(); uint64_t high = pipeControlCmd->getAddressHigh(); uint64_t barrierGpuAddress = (high << 32) | low; auto cmdList = getCmdList(bcsCsr->getCS(0), 0); auto semaphore = expectCommand(cmdList.begin(), cmdList.end()); verifySemaphore(semaphore, barrierGpuAddress); EXPECT_FALSE(commandQueue->isQueueBlocked()); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingBlockedCommandBufferThenSynchronizeEvents) { using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; auto buffer = createBuffer(1, true); setMockKernelArgs(std::array{{buffer.get()}}); auto event = make_releaseable(commandQueue.get(), CL_COMMAND_READ_BUFFER, 0, 0); MockTimestampPacketContainer eventDependencyContainer(*bcsCsr->getTimestampPacketAllocator(), 1); auto eventDependency = eventDependencyContainer.getNode(0); 
event->addTimestampPacketNodes(eventDependencyContainer); UserEvent userEvent; cl_event waitlist[] = {&userEvent, event.get()}; commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 2, waitlist, nullptr); userEvent.setStatus(CL_COMPLETE); auto eventDependencyAddress = TimestampPacketHelper::getContextEndGpuAddress(*eventDependency); auto cmdList = getCmdList(bcsCsr->getCS(0), 0); // Barrier auto cmdFound = expectCommand(cmdList.begin(), cmdList.end()); // Event auto semaphore = expectCommand(++cmdFound, cmdList.end()); verifySemaphore(semaphore, eventDependencyAddress); cmdFound = expectCommand(++semaphore, cmdList.end()); expectCommand(++cmdFound, cmdList.end()); EXPECT_FALSE(commandQueue->isQueueBlocked()); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingBlockedCommandBufferThenSynchronizeKernel) { using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; auto buffer = createBuffer(1, true); setMockKernelArgs(std::array{{buffer.get()}}); auto mockCmdQ = static_cast *>(commandQueue.get()); UserEvent userEvent; cl_event waitlist[] = {&userEvent}; mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 1, waitlist, nullptr); userEvent.setStatus(CL_COMPLETE); auto kernelNode = mockCmdQ->timestampPacketContainer->peekNodes()[0]; auto kernelNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*kernelNode); auto cmdList = getCmdList(bcsCsr->getCS(0), 0); // Aux to nonAux auto cmdFound = expectCommand(cmdList.begin(), cmdList.end()); // semaphore before NonAux to Aux auto semaphore = expectCommand(++cmdFound, cmdList.end()); if (mockCmdQ->isCacheFlushForBcsRequired()) { semaphore = expectCommand(++semaphore, cmdList.end()); } verifySemaphore(semaphore, kernelNodeAddress); EXPECT_FALSE(commandQueue->isQueueBlocked()); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, 
givenBlitTranslationWhenConstructingBlockedCommandBufferThenSynchronizeBcsOutput) { using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using WALKER_TYPE = typename FamilyType::WALKER_TYPE; auto buffer0 = createBuffer(1, true); auto buffer1 = createBuffer(1, true); setMockKernelArgs(std::array{{buffer0.get(), buffer1.get()}}); UserEvent userEvent; cl_event waitlist[] = {&userEvent}; commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 1, waitlist, nullptr); userEvent.setStatus(CL_COMPLETE); uint64_t auxToNonAuxOutputAddress[2] = {}; uint64_t nonAuxToAuxOutputAddress[2] = {}; { auto cmdListBcs = getCmdList(bcsCsr->getCS(0), 0); auto cmdFound = expectCommand(cmdListBcs.begin(), cmdListBcs.end()); cmdFound = expectMiFlush(++cmdFound, cmdListBcs.end()); auto miflushDwCmd = genCmdCast(*cmdFound); auxToNonAuxOutputAddress[0] = miflushDwCmd->getDestinationAddress(); cmdFound = expectMiFlush(++cmdFound, cmdListBcs.end()); miflushDwCmd = genCmdCast(*cmdFound); auxToNonAuxOutputAddress[1] = miflushDwCmd->getDestinationAddress(); cmdFound = expectCommand(++cmdFound, cmdListBcs.end()); cmdFound = expectMiFlush(++cmdFound, cmdListBcs.end()); miflushDwCmd = genCmdCast(*cmdFound); nonAuxToAuxOutputAddress[0] = miflushDwCmd->getDestinationAddress(); cmdFound = expectMiFlush(++cmdFound, cmdListBcs.end()); miflushDwCmd = genCmdCast(*cmdFound); nonAuxToAuxOutputAddress[1] = miflushDwCmd->getDestinationAddress(); } { auto ultCsr = static_cast *>(gpgpuCsr); auto cmdListQueue = getCmdList(*ultCsr->lastFlushedCommandStream, 0); // Aux to NonAux auto cmdFound = expectCommand(cmdListQueue.begin(), cmdListQueue.end()); verifySemaphore(cmdFound, auxToNonAuxOutputAddress[0]); cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); verifySemaphore(cmdFound, auxToNonAuxOutputAddress[1]); // Walker cmdFound = expectCommand(++cmdFound, 
cmdListQueue.end()); // NonAux to Aux cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); verifySemaphore(cmdFound, nonAuxToAuxOutputAddress[0]); cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); verifySemaphore(cmdFound, nonAuxToAuxOutputAddress[1]); } EXPECT_FALSE(commandQueue->isQueueBlocked()); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenEnqueueIsCalledThenDoImplicitFlushOnGpgpuCsr) { auto buffer = createBuffer(1, true); setMockKernelArgs(std::array{{buffer.get()}}); auto ultCsr = static_cast *>(gpgpuCsr); EXPECT_EQ(0u, ultCsr->taskCount); commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); EXPECT_EQ(1u, ultCsr->taskCount); EXPECT_TRUE(ultCsr->recordedDispatchFlags.implicitFlush); } HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationOnGfxAllocationWhenEnqueueIsCalledThenDoImplicitFlushOnGpgpuCsr) { auto gfxAllocation = createGfxAllocation(1, true); setMockKernelArgs(std::array{{gfxAllocation.get()}}); auto ultCsr = static_cast *>(gpgpuCsr); EXPECT_EQ(0u, ultCsr->taskCount); commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); EXPECT_EQ(1u, ultCsr->taskCount); EXPECT_TRUE(ultCsr->recordedDispatchFlags.implicitFlush); } using BlitEnqueueWithNoTimestampPacketTests = BlitEnqueueTests<0>; HWTEST_TEMPLATED_F(BlitEnqueueWithNoTimestampPacketTests, givenNoTimestampPacketsWritewhenEnqueueingBlitOperationThenEnginesAreSynchronized) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; using WALKER_TYPE = typename FamilyType::WALKER_TYPE; const size_t bufferSize = 1u; auto buffer = createBuffer(bufferSize, false); auto ultCsr = static_cast *>(gpgpuCsr); ASSERT_EQ(0u, ultCsr->taskCount); setMockKernelArgs(std::array{{buffer.get()}}); commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); char cpuBuffer[bufferSize]{}; 
// ---------------------------------------------------------------------------------------------
// Second half of givenNoTimestampPacketsWritewhenEnqueueingBlitOperationThenEnginesAreSynchronized:
//   a non-blocking enqueueReadBuffer followed by finish(); the BCS CSR stream and the queue's own
//   stream are parsed; the destination address of the flush command found in the BCS stream is
//   captured as bcsSignalAddress, and verifySemaphore is applied with that address to the command
//   located in the queue stream.
//   NOTE(review): both `cmdFound = expect...(cmdFound++, ...)` calls pass the pre-increment
//   iterator, and the increment itself is overwritten by the assignment. The aux-translation
//   tests in this file use `++cmdFound`; confirm which start position is intended.
//
// struct BlitEnqueueWithDebugCapabilityTests (fixture derived from BlitEnqueueTests<0>):
//   findSemaphores(cmdList)
//     Visits every command returned by find(). A command whose data dword equals
//     DebugPauseState::hasUserStartConfirmation (resp. hasUserEndConfirmation) and whose graphics
//     address equals debugPauseStateAddress must use COMPARE_OPERATION_SAD_EQUAL_SDD and
//     WAIT_MODE_POLLING_MODE; it increments semaphoreBeforeCopyFound (resp. semaphoreAfterCopyFound).
//   findMiFlushes(cmdList)
//     Same walking pattern for flush commands; only the loop header is on this line.
//
// NOTE(review): `template void ...`, `find(...)`, `genCmdCast(...)` and `static_cast(...)` carry no
// template arguments here - they look stripped from this copy of the file.
// ---------------------------------------------------------------------------------------------
commandQueue->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, bufferSize, cpuBuffer, nullptr, 0, nullptr, nullptr); commandQueue->finish(); auto bcsCommands = getCmdList(bcsCsr->getCS(0), 0); auto ccsCommands = getCmdList(commandQueue->getCS(0), 0); auto cmdFound = expectCommand(bcsCommands.begin(), bcsCommands.end()); cmdFound = expectMiFlush(cmdFound++, bcsCommands.end()); auto miflushDwCmd = genCmdCast(*cmdFound); const auto bcsSignalAddress = miflushDwCmd->getDestinationAddress(); cmdFound = expectCommand(ccsCommands.begin(), ccsCommands.end()); cmdFound = expectCommand(cmdFound++, ccsCommands.end()); verifySemaphore(cmdFound, bcsSignalAddress); } struct BlitEnqueueWithDebugCapabilityTests : public BlitEnqueueTests<0> { template void findSemaphores(GenCmdList &cmdList) { auto semaphore = find(cmdList.begin(), cmdList.end()); while (semaphore != cmdList.end()) { auto semaphoreCmd = genCmdCast(*semaphore); if (static_cast(DebugPauseState::hasUserStartConfirmation) == semaphoreCmd->getSemaphoreDataDword() && debugPauseStateAddress == semaphoreCmd->getSemaphoreGraphicsAddress()) { EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD, semaphoreCmd->getCompareOperation()); EXPECT_EQ(MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE, semaphoreCmd->getWaitMode()); semaphoreBeforeCopyFound++; } if (static_cast(DebugPauseState::hasUserEndConfirmation) == semaphoreCmd->getSemaphoreDataDword() && debugPauseStateAddress == semaphoreCmd->getSemaphoreGraphicsAddress()) { EXPECT_EQ(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD, semaphoreCmd->getCompareOperation()); EXPECT_EQ(MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE, semaphoreCmd->getWaitMode()); semaphoreAfterCopyFound++; } semaphore = find(++semaphore, cmdList.end()); } } template void findMiFlushes(GenCmdList &cmdList) { auto miFlush = find(cmdList.begin(), cmdList.end()); while (miFlush != cmdList.end()) { auto miFlushCmd = genCmdCast(*miFlush); if 
// ---------------------------------------------------------------------------------------------
// Remainder of BlitEnqueueWithDebugCapabilityTests::findMiFlushes:
//   a flush command whose immediate data equals DebugPauseState::waitingForUserStartConfirmation
//   (resp. waitingForUserEndConfirmation) and whose destination address equals
//   debugPauseStateAddress must report POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD; it
//   increments miFlushBeforeCopyFound (resp. miFlushAfterCopyFound).
// Fixture members: the four counters (all start at 0), the `buffer` under test,
//   `debugPauseStateAddress` (filled in by each test) and `hostPtr`, the host-side source of
//   the 1-byte writes.
//
// BlitEnqueueWithDebugCapabilityTests.givenDebugFlagSetWhenDispatchingBlitEnqueueThenAddPausingCommands
//   With PauseOnBlitCopy = 1 and CPU copy disallowed on the buffer, two blocking
//   enqueueWriteBuffer calls are issued. Parsing the BCS CSR command stream must yield exactly
//   one "before" and one "after" semaphore and exactly one "before" and one "after" flush.
// BlitEnqueueWithDebugCapabilityTests.givenDebugFlagSetToMinusTwoWhenDispatchingBlitEnqueueThenAddPausingCommandsForEachEnqueue
//   Only the test header is on this line; its body follows.
// ---------------------------------------------------------------------------------------------
(static_cast(DebugPauseState::waitingForUserStartConfirmation) == miFlushCmd->getImmediateData() && debugPauseStateAddress == miFlushCmd->getDestinationAddress()) { EXPECT_EQ(MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD, miFlushCmd->getPostSyncOperation()); miFlushBeforeCopyFound++; } if (static_cast(DebugPauseState::waitingForUserEndConfirmation) == miFlushCmd->getImmediateData() && debugPauseStateAddress == miFlushCmd->getDestinationAddress()) { EXPECT_EQ(MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD, miFlushCmd->getPostSyncOperation()); miFlushAfterCopyFound++; } miFlush = find(++miFlush, cmdList.end()); } } uint32_t semaphoreBeforeCopyFound = 0; uint32_t semaphoreAfterCopyFound = 0; uint32_t miFlushBeforeCopyFound = 0; uint32_t miFlushAfterCopyFound = 0; ReleaseableObjectPtr buffer; uint64_t debugPauseStateAddress = 0; int hostPtr = 0; }; HWTEST_TEMPLATED_F(BlitEnqueueWithDebugCapabilityTests, givenDebugFlagSetWhenDispatchingBlitEnqueueThenAddPausingCommands) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; auto ultBcsCsr = static_cast *>(bcsCsr); debugPauseStateAddress = ultBcsCsr->getDebugPauseStateGPUAddress(); buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; DebugManager.flags.PauseOnBlitCopy.set(1); commandQueue->enqueueWriteBuffer(buffer.get(), true, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); commandQueue->enqueueWriteBuffer(buffer.get(), true, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); HardwareParse hwParser; hwParser.parseCommands(ultBcsCsr->commandStream); findSemaphores(hwParser.cmdList); EXPECT_EQ(1u, semaphoreBeforeCopyFound); EXPECT_EQ(1u, semaphoreAfterCopyFound); findMiFlushes(hwParser.cmdList); EXPECT_EQ(1u, miFlushBeforeCopyFound); EXPECT_EQ(1u, miFlushAfterCopyFound); } HWTEST_TEMPLATED_F(BlitEnqueueWithDebugCapabilityTests, givenDebugFlagSetToMinusTwoWhenDispatchingBlitEnqueueThenAddPausingCommandsForEachEnqueue) 
// ---------------------------------------------------------------------------------------------
// Body of givenDebugFlagSetToMinusTwoWhenDispatchingBlitEnqueueThenAddPausingCommandsForEachEnqueue:
//   PauseOnBlitCopy = -2, two blocking 1-byte enqueueWriteBuffer calls; the BCS CSR stream must
//   contain two "before" and two "after" semaphores and two "before" and two "after" flushes
//   (i.e. one pause pair per enqueue, unlike the PauseOnBlitCopy = 1 case which expects one).
//
// BlitEnqueueWithDebugCapabilityTests.givenPauseModeSetToBeforeOnlyWhenDispatchingBlitEnqueueThenAddPauseCommandsOnlyBeforeEnqueue
//   PauseOnBlitCopy = 0 and PauseOnGpuMode = PauseMode::BeforeWorkload, one blocking write;
//   expects 1 "before" / 0 "after" semaphores and 1 "before" / 0 "after" flushes.
// BlitEnqueueWithDebugCapabilityTests.givenPauseModeSetToAfterOnlyWhenDispatchingBlitEnqueueThenAddPauseCommandsOnlyAfterEnqueue
//   Only the header and opening brace are on this line.
//
// In every test the debug-pause GPU address is read from the BCS CSR before enqueueing, so the
// finder helpers can match commands against it.
// ---------------------------------------------------------------------------------------------
{ using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; auto ultBcsCsr = static_cast *>(bcsCsr); debugPauseStateAddress = ultBcsCsr->getDebugPauseStateGPUAddress(); buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; DebugManager.flags.PauseOnBlitCopy.set(-2); commandQueue->enqueueWriteBuffer(buffer.get(), true, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); commandQueue->enqueueWriteBuffer(buffer.get(), true, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); HardwareParse hwParser; hwParser.parseCommands(ultBcsCsr->commandStream); findSemaphores(hwParser.cmdList); EXPECT_EQ(2u, semaphoreBeforeCopyFound); EXPECT_EQ(2u, semaphoreAfterCopyFound); findMiFlushes(hwParser.cmdList); EXPECT_EQ(2u, miFlushBeforeCopyFound); EXPECT_EQ(2u, miFlushAfterCopyFound); } HWTEST_TEMPLATED_F(BlitEnqueueWithDebugCapabilityTests, givenPauseModeSetToBeforeOnlyWhenDispatchingBlitEnqueueThenAddPauseCommandsOnlyBeforeEnqueue) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; auto ultBcsCsr = static_cast *>(bcsCsr); debugPauseStateAddress = ultBcsCsr->getDebugPauseStateGPUAddress(); buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; DebugManager.flags.PauseOnBlitCopy.set(0); DebugManager.flags.PauseOnGpuMode.set(PauseOnGpuProperties::PauseMode::BeforeWorkload); commandQueue->enqueueWriteBuffer(buffer.get(), true, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); HardwareParse hwParser; hwParser.parseCommands(ultBcsCsr->commandStream); findSemaphores(hwParser.cmdList); EXPECT_EQ(1u, semaphoreBeforeCopyFound); EXPECT_EQ(0u, semaphoreAfterCopyFound); findMiFlushes(hwParser.cmdList); EXPECT_EQ(1u, miFlushBeforeCopyFound); EXPECT_EQ(0u, miFlushAfterCopyFound); } HWTEST_TEMPLATED_F(BlitEnqueueWithDebugCapabilityTests, givenPauseModeSetToAfterOnlyWhenDispatchingBlitEnqueueThenAddPauseCommandsOnlyAfterEnqueue) { using 
// ---------------------------------------------------------------------------------------------
// Body of givenPauseModeSetToAfterOnlyWhenDispatchingBlitEnqueueThenAddPauseCommandsOnlyAfterEnqueue:
//   PauseOnBlitCopy = 0 and PauseOnGpuMode = PauseMode::AfterWorkload, one blocking 1-byte write;
//   expects 0 "before" / 1 "after" semaphores and 0 "before" / 1 "after" flushes in the BCS stream.
//
// BlitEnqueueWithDebugCapabilityTests.givenPauseModeSetToBeforeAndAfterWorkloadWhenDispatchingBlitEnqueueThenAddPauseCommandsAroundEnqueue
//   Same setup with PauseMode::BeforeAndAfterWorkload; expects one of each of the four commands.
// BlitEnqueueWithDebugCapabilityTests.givenDebugFlagSetWhenCreatingCsrThenCreateDebugThread
//   Starts here: sets PauseOnBlitCopy = 1 before a fresh device is constructed (construction and
//   the assertion follow on the next physical line).
// ---------------------------------------------------------------------------------------------
MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; auto ultBcsCsr = static_cast *>(bcsCsr); debugPauseStateAddress = ultBcsCsr->getDebugPauseStateGPUAddress(); buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; DebugManager.flags.PauseOnBlitCopy.set(0); DebugManager.flags.PauseOnGpuMode.set(PauseOnGpuProperties::PauseMode::AfterWorkload); commandQueue->enqueueWriteBuffer(buffer.get(), true, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); HardwareParse hwParser; hwParser.parseCommands(ultBcsCsr->commandStream); findSemaphores(hwParser.cmdList); EXPECT_EQ(0u, semaphoreBeforeCopyFound); EXPECT_EQ(1u, semaphoreAfterCopyFound); findMiFlushes(hwParser.cmdList); EXPECT_EQ(0u, miFlushBeforeCopyFound); EXPECT_EQ(1u, miFlushAfterCopyFound); } HWTEST_TEMPLATED_F(BlitEnqueueWithDebugCapabilityTests, givenPauseModeSetToBeforeAndAfterWorkloadWhenDispatchingBlitEnqueueThenAddPauseCommandsAroundEnqueue) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; auto ultBcsCsr = static_cast *>(bcsCsr); debugPauseStateAddress = ultBcsCsr->getDebugPauseStateGPUAddress(); buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; DebugManager.flags.PauseOnBlitCopy.set(0); DebugManager.flags.PauseOnGpuMode.set(PauseOnGpuProperties::PauseMode::BeforeAndAfterWorkload); commandQueue->enqueueWriteBuffer(buffer.get(), true, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); HardwareParse hwParser; hwParser.parseCommands(ultBcsCsr->commandStream); findSemaphores(hwParser.cmdList); EXPECT_EQ(1u, semaphoreBeforeCopyFound); EXPECT_EQ(1u, semaphoreAfterCopyFound); findMiFlushes(hwParser.cmdList); EXPECT_EQ(1u, miFlushBeforeCopyFound); EXPECT_EQ(1u, miFlushAfterCopyFound); } HWTEST_TEMPLATED_F(BlitEnqueueWithDebugCapabilityTests, givenDebugFlagSetWhenCreatingCsrThenCreateDebugThread) { DebugManager.flags.PauseOnBlitCopy.set(1); auto localDevice = 
// ---------------------------------------------------------------------------------------------
// Remainder of givenDebugFlagSetWhenCreatingCsrThenCreateDebugThread:
//   builds a new device and expects the default engine's CSR to own a non-null
//   userPauseConfirmation object.
//
// struct BlitEnqueueFlushTests (fixture derived from BlitEnqueueTests<1>):
//   MyUltCsr          - CSR subclass whose flush() pre-increments the shared counter behind
//                       `flushCounter`, stores the new value in latestFlushedCounter and then
//                       forwards to UltCommandStreamReceiver::flush. This lets a test read off the
//                       global order in which different CSRs were flushed.
//                       NOTE(review): flush() dereferences flushCounter unconditionally and the
//                       member defaults to nullptr, so every CSR that may flush must be wired to
//                       a counter first - the tests below do this before enqueueing.
//   MyUltCsr::create  - factory with the CSR-create signature; `withAubDump` is not used.
//   SetUpT            - backs up the factory slot at
//                       [IGFX_MAX_CORE + defaultHwInfo->platform.eRenderCoreFamily], installs
//                       MyUltCsr::create there, then runs BlitEnqueueTests<1>::SetUpT().
//   variableBackup    - keeps the VariableBackup object for the overwritten factory slot alive
//                       for the lifetime of the fixture.
//
// BlitEnqueueFlushTests.givenNonBlockedQueueWhenBlitEnqueuedThenFlushGpgpuCsrFirst
//   Both CSRs share one local counter; after a blocking 1-byte write the GPGPU CSR must have
//   recorded 1 and the BCS CSR 2, i.e. GPGPU was flushed first.
// BlitEnqueueFlushTests.givenBlockedQueueWhenBlitEnqueuedThenFlushGpgpuCsrFirst
//   Starts on this line (buffer creation only).
// ---------------------------------------------------------------------------------------------
std::make_unique(MockDevice::createWithNewExecutionEnvironment(nullptr)); auto ultCsr = static_cast *>(localDevice->getDefaultEngine().commandStreamReceiver); EXPECT_NE(nullptr, ultCsr->userPauseConfirmation.get()); } struct BlitEnqueueFlushTests : public BlitEnqueueTests<1> { template class MyUltCsr : public UltCommandStreamReceiver { public: using UltCommandStreamReceiver::UltCommandStreamReceiver; bool flush(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) override { latestFlushedCounter = ++(*flushCounter); return UltCommandStreamReceiver::flush(batchBuffer, allocationsForResidency); } static CommandStreamReceiver *create(bool withAubDump, ExecutionEnvironment &executionEnvironment, uint32_t rootDeviceIndex, const DeviceBitfield deviceBitfield) { return new MyUltCsr(executionEnvironment, rootDeviceIndex, deviceBitfield); } uint32_t *flushCounter = nullptr; uint32_t latestFlushedCounter = 0; }; template void SetUpT() { auto csrCreateFcn = &commandStreamReceiverFactory[IGFX_MAX_CORE + defaultHwInfo->platform.eRenderCoreFamily]; variableBackup = std::make_unique>(csrCreateFcn); *csrCreateFcn = MyUltCsr::create; BlitEnqueueTests<1>::SetUpT(); } std::unique_ptr> variableBackup; }; HWTEST_TEMPLATED_F(BlitEnqueueFlushTests, givenNonBlockedQueueWhenBlitEnqueuedThenFlushGpgpuCsrFirst) { auto buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; int hostPtr = 0; uint32_t flushCounter = 0; auto myUltGpgpuCsr = static_cast *>(gpgpuCsr); myUltGpgpuCsr->flushCounter = &flushCounter; auto myUltBcsCsr = static_cast *>(bcsCsr); myUltBcsCsr->flushCounter = &flushCounter; commandQueue->enqueueWriteBuffer(buffer.get(), true, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_EQ(1u, myUltGpgpuCsr->latestFlushedCounter); EXPECT_EQ(2u, myUltBcsCsr->latestFlushedCounter); } HWTEST_TEMPLATED_F(BlitEnqueueFlushTests, givenBlockedQueueWhenBlitEnqueuedThenFlushGpgpuCsrFirst) { auto buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = 
// ---------------------------------------------------------------------------------------------
// Remainder of givenBlockedQueueWhenBlitEnqueuedThenFlushGpgpuCsrFirst:
//   both CSRs share one flush counter; a non-blocking write is enqueued behind a UserEvent and the
//   event is then set to CL_COMPLETE. Expected flush order is GPGPU (1) then BCS (2), and the
//   queue must report it is no longer blocked.
//
// BlitEnqueueFlushTests.givenDebugFlagSetWhenCheckingBcsCacheFlushRequirementThenReturnCorrectValue
//   ForceCacheFlushForBcs = 0 makes isCacheFlushForBcsRequired() return false, = 1 returns true.
// BlitEnqueueTaskCountTests = BlitEnqueueTests<1>
// BlitEnqueueTaskCountTests.whenWaitUntilCompletionCalledThenWaitForSpecificBcsTaskCount
//   Calls waitUntilComplete(gpgpuTaskCount, bcsTaskCount, 0, false) and checks the value each CSR
//   recorded in latestWaitForCompletionWithTimeoutTaskCount.
//   NOTE(review): both task counts are 123, so this would still pass if the two arguments were
//   routed to the wrong CSR - distinct values would make the check stricter.
// BlitEnqueueTaskCountTests.givenEventWithNotreadyBcsTaskCountThenDontReportCompletion
//   First half: GPGPU tag is set to 123 and the BCS tag to 122 (one behind), the queue's BCS task
//   count is updated to 123, and an Event stamped with both counts gets its status refreshed.
// ---------------------------------------------------------------------------------------------
true; int hostPtr = 0; uint32_t flushCounter = 0; auto myUltGpgpuCsr = static_cast *>(gpgpuCsr); myUltGpgpuCsr->flushCounter = &flushCounter; auto myUltBcsCsr = static_cast *>(bcsCsr); myUltBcsCsr->flushCounter = &flushCounter; UserEvent userEvent; cl_event waitlist[] = {&userEvent}; commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 1, waitlist, nullptr); userEvent.setStatus(CL_COMPLETE); EXPECT_EQ(1u, myUltGpgpuCsr->latestFlushedCounter); EXPECT_EQ(2u, myUltBcsCsr->latestFlushedCounter); EXPECT_FALSE(commandQueue->isQueueBlocked()); } HWTEST_TEMPLATED_F(BlitEnqueueFlushTests, givenDebugFlagSetWhenCheckingBcsCacheFlushRequirementThenReturnCorrectValue) { auto mockCommandQueue = static_cast *>(commandQueue.get()); DebugManager.flags.ForceCacheFlushForBcs.set(0); EXPECT_FALSE(mockCommandQueue->isCacheFlushForBcsRequired()); DebugManager.flags.ForceCacheFlushForBcs.set(1); EXPECT_TRUE(mockCommandQueue->isCacheFlushForBcsRequired()); } using BlitEnqueueTaskCountTests = BlitEnqueueTests<1>; HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, whenWaitUntilCompletionCalledThenWaitForSpecificBcsTaskCount) { uint32_t gpgpuTaskCount = 123; uint32_t bcsTaskCount = 123; commandQueue->waitUntilComplete(gpgpuTaskCount, bcsTaskCount, 0, false); EXPECT_EQ(gpgpuTaskCount, static_cast *>(gpgpuCsr)->latestWaitForCompletionWithTimeoutTaskCount.load()); EXPECT_EQ(bcsTaskCount, static_cast *>(bcsCsr)->latestWaitForCompletionWithTimeoutTaskCount.load()); } HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenEventWithNotreadyBcsTaskCountThenDontReportCompletion) { const uint32_t gpgpuTaskCount = 123; const uint32_t bcsTaskCount = 123; *gpgpuCsr->getTagAddress() = gpgpuTaskCount; *bcsCsr->getTagAddress() = bcsTaskCount - 1; commandQueue->updateBcsTaskCount(bcsTaskCount); Event event(commandQueue.get(), CL_COMMAND_WRITE_BUFFER, 1, gpgpuTaskCount); event.updateCompletionStamp(gpgpuTaskCount, bcsTaskCount, 1, 0); event.updateExecutionStatus(); 
// ---------------------------------------------------------------------------------------------
// Remainder of givenEventWithNotreadyBcsTaskCountThenDontReportCompletion:
//   while the BCS tag is still behind, the event must stay CL_SUBMITTED; once the BCS tag is
//   written with bcsTaskCount and the status is refreshed, it must become CL_COMPLETE.
//
// BlitEnqueueTaskCountTests.givenEventWhenWaitingForCompletionThenWaitForCurrentBcsTaskCount
//   Two non-blocking 1-byte writes each return an event. Waiting on the second event must make
//   both CSRs record task count 2; waiting afterwards on the first must record 1 on both, i.e.
//   each event waits for the task counts captured at its own enqueue. Both events are released.
// BlitEnqueueTaskCountTests.givenBufferDumpingEnabledWhenEnqueueingThenSetCorrectDumpOption
//   First part: with AUBDumpAllocsOnEnqueueReadOnly = true and AUBDumpBufferFormat = "BIN", a
//   blocking read through the blitter must set notifyEnqueueReadBufferCalled and
//   useBcsCsrOnNotifyEnabled on the mock queue; the first flag is then reset. The non-BCS part
//   disables EnableBlitterForEnqueueOperations and repeats the read.
//
// NOTE(review): the `// BCS enqueue` and `// Non-BCS enqueue` line comments sit mid-line in this
// collapsed layout and therefore hide the code that follows them on this physical line; the
// original line breaks need restoring.
// ---------------------------------------------------------------------------------------------
EXPECT_EQ(static_cast(CL_SUBMITTED), event.peekExecutionStatus()); *bcsCsr->getTagAddress() = bcsTaskCount; event.updateExecutionStatus(); EXPECT_EQ(static_cast(CL_COMPLETE), event.peekExecutionStatus()); } HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenEventWhenWaitingForCompletionThenWaitForCurrentBcsTaskCount) { auto buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; int hostPtr = 0; auto ultGpgpuCsr = static_cast *>(gpgpuCsr); auto ultBcsCsr = static_cast *>(bcsCsr); cl_event outEvent1, outEvent2; commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, &outEvent1); commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, &outEvent2); clWaitForEvents(1, &outEvent2); EXPECT_EQ(2u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); EXPECT_EQ(2u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); clWaitForEvents(1, &outEvent1); EXPECT_EQ(1u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); EXPECT_EQ(1u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); clReleaseEvent(outEvent1); clReleaseEvent(outEvent2); } HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenBufferDumpingEnabledWhenEnqueueingThenSetCorrectDumpOption) { auto buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; int hostPtr = 0; DebugManager.flags.AUBDumpAllocsOnEnqueueReadOnly.set(true); DebugManager.flags.AUBDumpBufferFormat.set("BIN"); auto mockCommandQueue = static_cast *>(commandQueue.get()); { // BCS enqueue commandQueue->enqueueReadBuffer(buffer.get(), true, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_TRUE(mockCommandQueue->notifyEnqueueReadBufferCalled); EXPECT_TRUE(mockCommandQueue->useBcsCsrOnNotifyEnabled); mockCommandQueue->notifyEnqueueReadBufferCalled = false; } { // Non-BCS enqueue DebugManager.flags.EnableBlitterForEnqueueOperations.set(0); commandQueue->enqueueReadBuffer(buffer.get(), true, 0, 1, &hostPtr, 
// ---------------------------------------------------------------------------------------------
// Remainder of givenBufferDumpingEnabledWhenEnqueueingThenSetCorrectDumpOption:
//   for the read issued with the blitter disabled, notifyEnqueueReadBufferCalled must be set
//   again while useBcsCsrOnNotifyEnabled must now be false.
//
// BlitEnqueueTaskCountTests.givenBlockedEventWhenWaitingForCompletionThenWaitForCurrentBcsTaskCount
//   Write #1 waits on a UserEvent, write #2 waits on write #1's output event (both non-blocking).
//   After the user event completes, waiting on event 2 must record task count 2 on both CSRs and
//   waiting on event 1 must record 1 on both. Events are released and the queue must be unblocked.
// BlitEnqueueTaskCountTests.givenBlockedEnqueueWithoutKernelWhenWaitingForCompletionThenWaitForCurrentBcsTaskCount
//   Same dependency chain built from two enqueueMarkerWithWaitList calls instead of writes.
//   On this line: after the user event completes, waiting on event 2 must record 1 on the GPGPU
//   CSR; the BCS expectation and the rest of the test follow on the next physical line.
// ---------------------------------------------------------------------------------------------
nullptr, 0, nullptr, nullptr); EXPECT_TRUE(mockCommandQueue->notifyEnqueueReadBufferCalled); EXPECT_FALSE(mockCommandQueue->useBcsCsrOnNotifyEnabled); } } HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenBlockedEventWhenWaitingForCompletionThenWaitForCurrentBcsTaskCount) { auto buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; int hostPtr = 0; auto ultGpgpuCsr = static_cast *>(gpgpuCsr); auto ultBcsCsr = static_cast *>(bcsCsr); cl_event outEvent1, outEvent2; UserEvent userEvent; cl_event waitlist1 = &userEvent; cl_event *waitlist2 = &outEvent1; commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 1, &waitlist1, &outEvent1); commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 1, waitlist2, &outEvent2); userEvent.setStatus(CL_COMPLETE); clWaitForEvents(1, &outEvent2); EXPECT_EQ(2u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); EXPECT_EQ(2u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); clWaitForEvents(1, &outEvent1); EXPECT_EQ(1u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); EXPECT_EQ(1u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); clReleaseEvent(outEvent1); clReleaseEvent(outEvent2); EXPECT_FALSE(commandQueue->isQueueBlocked()); } HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenBlockedEnqueueWithoutKernelWhenWaitingForCompletionThenWaitForCurrentBcsTaskCount) { auto ultGpgpuCsr = static_cast *>(gpgpuCsr); auto ultBcsCsr = static_cast *>(bcsCsr); cl_event outEvent1, outEvent2; UserEvent userEvent; cl_event waitlist1 = &userEvent; cl_event *waitlist2 = &outEvent1; commandQueue->enqueueMarkerWithWaitList(1, &waitlist1, &outEvent1); commandQueue->enqueueMarkerWithWaitList(1, waitlist2, &outEvent2); userEvent.setStatus(CL_COMPLETE); clWaitForEvents(1, &outEvent2); EXPECT_EQ(1u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); EXPECT_EQ(0u, 
// ---------------------------------------------------------------------------------------------
// Remainder of givenBlockedEnqueueWithoutKernelWhenWaitingForCompletionThenWaitForCurrentBcsTaskCount:
//   waiting on marker event 2 leaves the BCS CSR's recorded wait count at 0; waiting on marker
//   event 1 must record 0 on both CSRs. Events are released and the queue must be unblocked.
//
// BlitEnqueueTaskCountTests.givenEventFromCpuCopyWhenWaitingForCompletionThenWaitForCurrentBcsTaskCount
//   The buffer is created without forceDisallowCPUCopy. Task counts are pre-seeded (GPGPU CSR and
//   queue = 1, BCS CSR and queue's BCS count = 2) before two non-blocking writes. Waiting on
//   event 2 must record 3 (GPGPU) / 4 (BCS); waiting on event 1 must record 2 / 3.
// BlitEnqueueWithDisabledGpgpuSubmissionTests = BlitEnqueueTests<1>
// BlitEnqueueWithDisabledGpgpuSubmissionTests.givenCacheFlushRequiredWhenDoingBcsCopyThenSubmitToGpgpuOnlyIfPreviousEnqueueWasGpgpu
//   Setup only on this line: latestSentEnqueueType starts as Operation::None,
//   ForceGpgpuSubmissionForBcsEnqueue is set to -1 and the cache-flush override is enabled.
// ---------------------------------------------------------------------------------------------
ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); clWaitForEvents(1, &outEvent1); EXPECT_EQ(0u, ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); EXPECT_EQ(0u, ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount.load()); clReleaseEvent(outEvent1); clReleaseEvent(outEvent2); EXPECT_FALSE(commandQueue->isQueueBlocked()); } HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenEventFromCpuCopyWhenWaitingForCompletionThenWaitForCurrentBcsTaskCount) { auto buffer = createBuffer(1, false); int hostPtr = 0; auto ultGpgpuCsr = static_cast *>(gpgpuCsr); auto ultBcsCsr = static_cast *>(bcsCsr); ultGpgpuCsr->taskCount = 1; commandQueue->taskCount = 1; ultBcsCsr->taskCount = 2; commandQueue->updateBcsTaskCount(2); cl_event outEvent1, outEvent2; commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, &outEvent1); commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, &outEvent2); clWaitForEvents(1, &outEvent2); EXPECT_EQ(3u, static_cast *>(gpgpuCsr)->latestWaitForCompletionWithTimeoutTaskCount.load()); EXPECT_EQ(4u, static_cast *>(bcsCsr)->latestWaitForCompletionWithTimeoutTaskCount.load()); clWaitForEvents(1, &outEvent1); EXPECT_EQ(2u, static_cast *>(gpgpuCsr)->latestWaitForCompletionWithTimeoutTaskCount.load()); EXPECT_EQ(3u, static_cast *>(bcsCsr)->latestWaitForCompletionWithTimeoutTaskCount.load()); clReleaseEvent(outEvent1); clReleaseEvent(outEvent2); } using BlitEnqueueWithDisabledGpgpuSubmissionTests = BlitEnqueueTests<1>; HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenCacheFlushRequiredWhenDoingBcsCopyThenSubmitToGpgpuOnlyIfPreviousEnqueueWasGpgpu) { auto mockCommandQueue = static_cast *>(commandQueue.get()); EXPECT_EQ(EnqueueProperties::Operation::None, mockCommandQueue->latestSentEnqueueType); DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1); mockCommandQueue->overrideIsCacheFlushForBcsRequired.enabled = true; 
// ---------------------------------------------------------------------------------------------
// Remainder of givenCacheFlushRequiredWhenDoingBcsCopyThenSubmitToGpgpuOnlyIfPreviousEnqueueWasGpgpu:
//   with the cache-flush override returning true, the enqueue sequence and the GPGPU CSR task
//   count expected after each step are:
//     write  -> Operation::Blit,      0
//     write  -> Operation::Blit,      0
//     kernel -> Operation::GpuKernel, 1
//     write  -> Operation::Blit,      2   (first blit after a kernel also submits to GPGPU)
//     write  -> Operation::Blit,      2   (back-to-back blit does not)
//
// BlitEnqueueWithDisabledGpgpuSubmissionTests.givenProfilingEnabledWhenSubmittingWithoutFlushToGpgpuThenSetSubmitTime
//   Setup on this line: same flag/override configuration plus setProfilingEnabled() on the mock
//   queue; the write that returns `clEvent` and the profiling query follow on the next line.
// ---------------------------------------------------------------------------------------------
mockCommandQueue->overrideIsCacheFlushForBcsRequired.returnValue = true; auto buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; int hostPtr = 0; commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(0u, gpgpuCsr->peekTaskCount()); commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(0u, gpgpuCsr->peekTaskCount()); commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); EXPECT_EQ(EnqueueProperties::Operation::GpuKernel, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(1u, gpgpuCsr->peekTaskCount()); commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(2u, gpgpuCsr->peekTaskCount()); commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(2u, gpgpuCsr->peekTaskCount()); } HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenProfilingEnabledWhenSubmittingWithoutFlushToGpgpuThenSetSubmitTime) { auto mockCommandQueue = static_cast *>(commandQueue.get()); EXPECT_EQ(EnqueueProperties::Operation::None, mockCommandQueue->latestSentEnqueueType); DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1); mockCommandQueue->overrideIsCacheFlushForBcsRequired.enabled = true; mockCommandQueue->overrideIsCacheFlushForBcsRequired.returnValue = true; mockCommandQueue->setProfilingEnabled(); auto buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; int hostPtr = 0; cl_event clEvent; 
// ---------------------------------------------------------------------------------------------
// Remainder of givenProfilingEnabledWhenSubmittingWithoutFlushToGpgpuThenSetSubmitTime:
//   a non-blocking write returns `clEvent`; the enqueue is reported as Operation::Blit with the
//   GPGPU task count still 0, and CL_PROFILING_COMMAND_SUBMIT queried from the event must be
//   non-zero. The event is released at the end.
//
// BlitEnqueueWithDisabledGpgpuSubmissionTests.givenCacheFlushNotRequiredWhenDoingBcsCopyThenDontSubmitToGpgpu
//   Cache-flush override returns false. Sequence and expected GPGPU task count after each step:
//     write -> Blit, 0;  write -> Blit, 0;  kernel -> GpuKernel, 1;  write -> Blit, 2;
//     final write (arguments continue on the next physical line).
//   NOTE(review): the write directly after the kernel is still expected to raise the GPGPU task
//   count to 2, which is the same sequence as the cache-flush-required test - confirm this
//   matches the "DontSubmitToGpgpu" intent in the test name.
// ---------------------------------------------------------------------------------------------
commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, &clEvent); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(0u, gpgpuCsr->peekTaskCount()); auto event = castToObject(clEvent); uint64_t submitTime = 0; event->getEventProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, sizeof(submitTime), &submitTime, nullptr); EXPECT_NE(0u, submitTime); clReleaseEvent(clEvent); } HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenCacheFlushNotRequiredWhenDoingBcsCopyThenDontSubmitToGpgpu) { auto mockCommandQueue = static_cast *>(commandQueue.get()); EXPECT_EQ(EnqueueProperties::Operation::None, mockCommandQueue->latestSentEnqueueType); DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1); mockCommandQueue->overrideIsCacheFlushForBcsRequired.enabled = true; mockCommandQueue->overrideIsCacheFlushForBcsRequired.returnValue = false; auto buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; int hostPtr = 0; commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(0u, gpgpuCsr->peekTaskCount()); commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(0u, gpgpuCsr->peekTaskCount()); commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); EXPECT_EQ(EnqueueProperties::Operation::GpuKernel, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(1u, gpgpuCsr->peekTaskCount()); commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(2u, gpgpuCsr->peekTaskCount()); commandQueue->enqueueWriteBuffer(buffer.get(), 
// ---------------------------------------------------------------------------------------------
// Remainder of givenCacheFlushNotRequiredWhenDoingBcsCopyThenDontSubmitToGpgpu:
//   the final back-to-back write is reported as Operation::Blit and leaves the GPGPU task
//   count at 2.
//
// BlitEnqueueWithDisabledGpgpuSubmissionTests.givenCacheFlushNotRequiredWhenDoingBcsCopyAfterBarrierThenSubmitToGpgpu
//   Cache-flush override returns false. GPGPU task count: 0 initially, 1 after the kernel,
//   still 1 after enqueueBarrierWithWaitList(0, nullptr, nullptr), and 2 after the following
//   write (reported as Operation::Blit) - i.e. a blit behind a barrier still submits to GPGPU.
// BlitEnqueueWithDisabledGpgpuSubmissionTests.givenCacheFlushNotRequiredWhenDoingBcsCopyOnBlockedQueueThenSubmitToGpgpu
//   Starts here: same flag/override setup, then a non-blocking write gated on a UserEvent
//   (the call's remaining arguments and the assertions follow on the next physical line).
// ---------------------------------------------------------------------------------------------
false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(2u, gpgpuCsr->peekTaskCount()); } HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenCacheFlushNotRequiredWhenDoingBcsCopyAfterBarrierThenSubmitToGpgpu) { auto mockCommandQueue = static_cast *>(commandQueue.get()); EXPECT_EQ(EnqueueProperties::Operation::None, mockCommandQueue->latestSentEnqueueType); DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1); mockCommandQueue->overrideIsCacheFlushForBcsRequired.enabled = true; mockCommandQueue->overrideIsCacheFlushForBcsRequired.returnValue = false; auto buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; int hostPtr = 0; EXPECT_EQ(0u, gpgpuCsr->peekTaskCount()); commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); EXPECT_EQ(1u, gpgpuCsr->peekTaskCount()); commandQueue->enqueueBarrierWithWaitList(0, nullptr, nullptr); EXPECT_EQ(1u, gpgpuCsr->peekTaskCount()); commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(2u, gpgpuCsr->peekTaskCount()); } HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenCacheFlushNotRequiredWhenDoingBcsCopyOnBlockedQueueThenSubmitToGpgpu) { auto mockCommandQueue = static_cast *>(commandQueue.get()); EXPECT_EQ(EnqueueProperties::Operation::None, mockCommandQueue->latestSentEnqueueType); DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1); mockCommandQueue->overrideIsCacheFlushForBcsRequired.enabled = true; mockCommandQueue->overrideIsCacheFlushForBcsRequired.returnValue = false; auto buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; int hostPtr = 0; UserEvent userEvent; cl_event waitlist = &userEvent; commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, 
// ---------------------------------------------------------------------------------------------
// Remainder of givenCacheFlushNotRequiredWhenDoingBcsCopyOnBlockedQueueThenSubmitToGpgpu:
//   while the UserEvent is pending, latestSentEnqueueType stays Operation::None; once the event
//   is set to CL_COMPLETE the enqueue is reported as Operation::Blit, the GPGPU task count is 1
//   and the queue is no longer blocked.
//
// BlitEnqueueWithDisabledGpgpuSubmissionTests.givenCacheFlushRequiredWhenDoingBcsCopyOnBlockedQueueThenSubmitToGpgpu
//   Identical flow and expectations with the cache-flush override returning true.
// BlitEnqueueWithDisabledGpgpuSubmissionTests.givenCacheFlushRequiredWhenDoingBcsCopyThatRequiresCacheFlushThenSubmitToGpgpu
//   Setup on this line: command type aliases, ForceGpgpuSubmissionForBcsEnqueue = -1 and the
//   cache-flush override returning true; buffer creation continues on the next physical line.
// ---------------------------------------------------------------------------------------------
&hostPtr, nullptr, 1, &waitlist, nullptr); EXPECT_EQ(EnqueueProperties::Operation::None, mockCommandQueue->latestSentEnqueueType); userEvent.setStatus(CL_COMPLETE); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(1u, gpgpuCsr->peekTaskCount()); EXPECT_FALSE(commandQueue->isQueueBlocked()); } HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenCacheFlushRequiredWhenDoingBcsCopyOnBlockedQueueThenSubmitToGpgpu) { auto mockCommandQueue = static_cast *>(commandQueue.get()); EXPECT_EQ(EnqueueProperties::Operation::None, mockCommandQueue->latestSentEnqueueType); DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1); mockCommandQueue->overrideIsCacheFlushForBcsRequired.enabled = true; mockCommandQueue->overrideIsCacheFlushForBcsRequired.returnValue = true; auto buffer = createBuffer(1, false); buffer->forceDisallowCPUCopy = true; int hostPtr = 0; UserEvent userEvent; cl_event waitlist = &userEvent; commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 1, &waitlist, nullptr); EXPECT_EQ(EnqueueProperties::Operation::None, mockCommandQueue->latestSentEnqueueType); userEvent.setStatus(CL_COMPLETE); EXPECT_EQ(EnqueueProperties::Operation::Blit, mockCommandQueue->latestSentEnqueueType); EXPECT_EQ(1u, gpgpuCsr->peekTaskCount()); EXPECT_FALSE(commandQueue->isQueueBlocked()); } HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenCacheFlushRequiredWhenDoingBcsCopyThatRequiresCacheFlushThenSubmitToGpgpu) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; DebugManager.flags.ForceGpgpuSubmissionForBcsEnqueue.set(-1); auto mockCommandQueue = static_cast *>(commandQueue.get()); mockCommandQueue->overrideIsCacheFlushForBcsRequired.enabled = true; mockCommandQueue->overrideIsCacheFlushForBcsRequired.returnValue = true; auto buffer = createBuffer(1, 
// ---------------------------------------------------------------------------------------------
// Remainder of givenCacheFlushRequiredWhenDoingBcsCopyThatRequiresCacheFlushThenSubmitToGpgpu:
//   a kernel is enqueued first (GPGPU task count 1), the queue stream offset is recorded, then a
//   non-blocking write raises the GPGPU task count to 2. From the queue stream parsed at that
//   offset, the pipe control found must have DC flush and command-streamer stall enabled; its
//   64-bit post-sync address is rebuilt as (addressHigh << 32) | address and must be non-zero.
//   In the BCS stream, verifySemaphore is applied with that address, and a second expectCommand
//   search must not return end().
//   NOTE(review): the second search restarts at cmdListBcs.begin(), so the relative order of the
//   semaphore and the command it looks for is not checked - confirm whether that is intended.
//
// BlitEnqueueWithDisabledGpgpuSubmissionTests.givenSubmissionToDifferentEngineWhenRequestingForNewTimestmapPacketThenDontClearDependencies
//   Calls obtainNewTimestampPacketNodes(1, previousNodes, clearDependencies=true, <isBlit>) three
//   times. First (non-blit, initial) call yields 0 previous nodes; second call switches to a blit
//   enqueue and must hand back 1 previous node; the third starts on this line.
//
// NOTE(review): the `// enqueue kernel ...` and `// init` line comments sit mid-line in this
// collapsed layout and hide the code after them on this physical line; the original line breaks
// need restoring.
// ---------------------------------------------------------------------------------------------
false); buffer->forceDisallowCPUCopy = true; int hostPtr = 0; // enqueue kernel to force gpgpu submission on write buffer commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); EXPECT_EQ(1u, gpgpuCsr->peekTaskCount()); auto offset = mockCommandQueue->getCS(0).getUsed(); commandQueue->enqueueWriteBuffer(buffer.get(), false, 0, 1, &hostPtr, nullptr, 0, nullptr, nullptr); EXPECT_EQ(2u, gpgpuCsr->peekTaskCount()); auto cmdListBcs = getCmdList(bcsCsr->getCS(0), 0); auto cmdListQueue = getCmdList(mockCommandQueue->getCS(0), offset); uint64_t cacheFlushWriteAddress = 0; { auto cmdFound = expectPipeControl(cmdListQueue.begin(), cmdListQueue.end()); auto pipeControlCmd = genCmdCast(*cmdFound); EXPECT_TRUE(pipeControlCmd->getDcFlushEnable()); EXPECT_TRUE(pipeControlCmd->getCommandStreamerStallEnable()); uint64_t low = pipeControlCmd->getAddress(); uint64_t high = pipeControlCmd->getAddressHigh(); cacheFlushWriteAddress = (high << 32) | low; EXPECT_NE(0u, cacheFlushWriteAddress); } { auto cmdFound = expectCommand(cmdListBcs.begin(), cmdListBcs.end()); verifySemaphore(cmdFound, cacheFlushWriteAddress); cmdFound = expectCommand(cmdListBcs.begin(), cmdListBcs.end()); EXPECT_NE(cmdListBcs.end(), cmdFound); } } HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenSubmissionToDifferentEngineWhenRequestingForNewTimestmapPacketThenDontClearDependencies) { auto mockCommandQueue = static_cast *>(commandQueue.get()); const bool clearDependencies = true; const bool blitEnqueue = true; const bool nonBlitEnqueue = false; { TimestampPacketContainer previousNodes; mockCommandQueue->obtainNewTimestampPacketNodes(1, previousNodes, clearDependencies, nonBlitEnqueue); // init EXPECT_EQ(0u, previousNodes.peekNodes().size()); } { TimestampPacketContainer previousNodes; mockCommandQueue->obtainNewTimestampPacketNodes(1, previousNodes, clearDependencies, blitEnqueue); EXPECT_EQ(1u, previousNodes.peekNodes().size()); } { 
// ---------------------------------------------------------------------------------------------
// Remainder of givenSubmissionToDifferentEngineWhenRequestingForNewTimestmapPacketThenDontClearDependencies:
//   a second consecutive blit request (same engine as the previous one) must hand back 0
//   previous nodes.
//
// BlitCopyTests = BlitEnqueueTests<1>
// BlitCopyTests.givenKernelAllocationInLocalMemoryWhenCreatingWithoutAllowedCpuAccessThenUseBcsForTransfer
//   Forces LocalMemoryAccessMode::CpuAccessDisallowed and non-system placement for KERNEL_ISA,
//   then creates a kernel allocation from a 1-byte heap. If the allocation landed in the local
//   memory pool the context's BCS CSR task count must have grown by one, otherwise it must be
//   unchanged. The allocation is freed through the memory manager.
// BlitCopyTests.givenKernelAllocationInLocalMemoryWhenCreatingWithAllowedCpuAccessThenDontUseBcsForTransfer
//   Same setup with LocalMemoryAccessMode::CpuAccessAllowed; the BCS task count must not change.
// The next BlitCopyTests test begins at the end of this line (macro opened, name follows).
// ---------------------------------------------------------------------------------------------
TimestampPacketContainer previousNodes; mockCommandQueue->obtainNewTimestampPacketNodes(1, previousNodes, clearDependencies, blitEnqueue); EXPECT_EQ(0u, previousNodes.peekNodes().size()); } } using BlitCopyTests = BlitEnqueueTests<1>; HWTEST_TEMPLATED_F(BlitCopyTests, givenKernelAllocationInLocalMemoryWhenCreatingWithoutAllowedCpuAccessThenUseBcsForTransfer) { DebugManager.flags.ForceLocalMemoryAccessMode.set(static_cast(LocalMemoryAccessMode::CpuAccessDisallowed)); DebugManager.flags.ForceNonSystemMemoryPlacement.set(1 << (static_cast(GraphicsAllocation::AllocationType::KERNEL_ISA) - 1)); uint32_t kernelHeap = 0; KernelInfo kernelInfo; kernelInfo.heapInfo.KernelHeapSize = 1; kernelInfo.heapInfo.pKernelHeap = &kernelHeap; auto initialTaskCount = bcsMockContext->bcsCsr->peekTaskCount(); kernelInfo.createKernelAllocation(device->getDevice(), false); if (kernelInfo.kernelAllocation->isAllocatedInLocalMemoryPool()) { EXPECT_EQ(initialTaskCount + 1, bcsMockContext->bcsCsr->peekTaskCount()); } else { EXPECT_EQ(initialTaskCount, bcsMockContext->bcsCsr->peekTaskCount()); } device->getMemoryManager()->freeGraphicsMemory(kernelInfo.kernelAllocation); } HWTEST_TEMPLATED_F(BlitCopyTests, givenKernelAllocationInLocalMemoryWhenCreatingWithAllowedCpuAccessThenDontUseBcsForTransfer) { DebugManager.flags.ForceLocalMemoryAccessMode.set(static_cast(LocalMemoryAccessMode::CpuAccessAllowed)); DebugManager.flags.ForceNonSystemMemoryPlacement.set(1 << (static_cast(GraphicsAllocation::AllocationType::KERNEL_ISA) - 1)); uint32_t kernelHeap = 0; KernelInfo kernelInfo; kernelInfo.heapInfo.KernelHeapSize = 1; kernelInfo.heapInfo.pKernelHeap = &kernelHeap; auto initialTaskCount = bcsMockContext->bcsCsr->peekTaskCount(); kernelInfo.createKernelAllocation(device->getDevice(), false); EXPECT_EQ(initialTaskCount, bcsMockContext->bcsCsr->peekTaskCount()); device->getMemoryManager()->freeGraphicsMemory(kernelInfo.kernelAllocation); } HWTEST_TEMPLATED_F(BlitCopyTests, 
// ---------------------------------------------------------------------------------------------
// BlitCopyTests.givenKernelAllocationInLocalMemoryWhenCreatingWithDisallowedCpuAccessAndDisabledBlitterThenFallbackToCpuCopy
//   (the macro was opened at the end of the previous physical line)
//   CPU access is disallowed and KERNEL_ISA is forced to non-system memory, but
//   blitterOperationsSupported is cleared on root device environment 0. Creating the kernel
//   allocation must then leave the context's BCS CSR task count unchanged.
//
// BlitCopyTests.givenLocalMemoryAccessNotAllowedWhenGlobalConstantsAreExportedThenUseBlitter
//   Builds a ProgramInfo with 128 zeroed bytes of global constants and a linker input whose
//   traits export global constants. processProgramInfo must move the BCS CSR task count from 0
//   to 1, the program must expose a constant surface for the device's root index (ASSERT, since
//   it is dereferenced next), and that surface's GPU address must be registered with the
//   context's SVM allocations manager (that last lookup completes on the next physical line).
// ---------------------------------------------------------------------------------------------
givenKernelAllocationInLocalMemoryWhenCreatingWithDisallowedCpuAccessAndDisabledBlitterThenFallbackToCpuCopy) { DebugManager.flags.ForceLocalMemoryAccessMode.set(static_cast(LocalMemoryAccessMode::CpuAccessDisallowed)); DebugManager.flags.ForceNonSystemMemoryPlacement.set(1 << (static_cast(GraphicsAllocation::AllocationType::KERNEL_ISA) - 1)); device->getExecutionEnvironment()->rootDeviceEnvironments[0]->getMutableHardwareInfo()->capabilityTable.blitterOperationsSupported = false; uint32_t kernelHeap = 0; KernelInfo kernelInfo; kernelInfo.heapInfo.KernelHeapSize = 1; kernelInfo.heapInfo.pKernelHeap = &kernelHeap; auto initialTaskCount = bcsMockContext->bcsCsr->peekTaskCount(); kernelInfo.createKernelAllocation(device->getDevice(), false); EXPECT_EQ(initialTaskCount, bcsMockContext->bcsCsr->peekTaskCount()); device->getMemoryManager()->freeGraphicsMemory(kernelInfo.kernelAllocation); } HWTEST_TEMPLATED_F(BlitCopyTests, givenLocalMemoryAccessNotAllowedWhenGlobalConstantsAreExportedThenUseBlitter) { DebugManager.flags.EnableLocalMemory.set(1); DebugManager.flags.ForceLocalMemoryAccessMode.set(static_cast(LocalMemoryAccessMode::CpuAccessDisallowed)); char constantData[128] = {}; ProgramInfo programInfo; programInfo.globalConstants.initData = constantData; programInfo.globalConstants.size = sizeof(constantData); auto mockLinkerInput = std::make_unique>(); mockLinkerInput->traits.exportsGlobalConstants = true; programInfo.linkerInput = std::move(mockLinkerInput); MockProgram program(bcsMockContext.get(), false, toClDeviceVector(*device)); EXPECT_EQ(0u, bcsMockContext->bcsCsr->peekTaskCount()); program.processProgramInfo(programInfo, *device); EXPECT_EQ(1u, bcsMockContext->bcsCsr->peekTaskCount()); auto rootDeviceIndex = device->getRootDeviceIndex(); ASSERT_NE(nullptr, program.getConstantSurface(rootDeviceIndex)); auto gpuAddress = reinterpret_cast(program.getConstantSurface(rootDeviceIndex)->getGpuAddress()); EXPECT_NE(nullptr, 
// ---------------------------------------------------------------------------------------------
// Tail of givenLocalMemoryAccessNotAllowedWhenGlobalConstantsAreExportedThenUseBlitter:
//   the constant surface's GPU address must resolve to an SVM allocation of the context.
//
// BlitCopyTests.givenKernelAllocationInLocalMemoryWithoutCpuAccessAllowedWhenSubstituteKernelHeapIsCalledThenUseBcsForTransfer
//   CPU access is disallowed, KERNEL_ISA is forced to non-system memory and the blitter is
//   enabled on root device environment 0. A kernel allocation of 0x40 bytes is created and must
//   be in the local memory pool. Substituting a heap of the same size must bump the context's
//   BCS CSR task count by exactly one. The allocation is freed through the memory manager.
//   `newHeap` is value-initialised (`{}`) so the bytes handed to substituteKernelHeap are
//   defined instead of indeterminate stack contents; this matches how the other stack buffers
//   in this file (`cpuBuffer[bufferSize]{}`, `constantData[128] = {}`) are declared.
//
// BlitCopyTests.givenKernelAllocationInLocalMemoryWithoutCpuAccessAllowedWhenLinkerRequiresPatchingOfInstructionSegmentsThenUseBcsForTransfer
//   Setup on this line: same flags, a linker input whose traits require patching of instruction
//   segments, and an empty KernelInfo plus heap vector (filled on the next physical line).
// ---------------------------------------------------------------------------------------------
bcsMockContext->getSVMAllocsManager()->getSVMAlloc(gpuAddress)); } HWTEST_TEMPLATED_F(BlitCopyTests, givenKernelAllocationInLocalMemoryWithoutCpuAccessAllowedWhenSubstituteKernelHeapIsCalledThenUseBcsForTransfer) { DebugManager.flags.ForceLocalMemoryAccessMode.set(static_cast(LocalMemoryAccessMode::CpuAccessDisallowed)); DebugManager.flags.ForceNonSystemMemoryPlacement.set(1 << (static_cast(GraphicsAllocation::AllocationType::KERNEL_ISA) - 1)); device->getExecutionEnvironment()->rootDeviceEnvironments[0]->getMutableHardwareInfo()->capabilityTable.blitterOperationsSupported = true; MockKernelWithInternals kernel(*device); const size_t initialHeapSize = 0x40; kernel.kernelInfo.heapInfo.KernelHeapSize = initialHeapSize; kernel.kernelInfo.createKernelAllocation(device->getDevice(), false); ASSERT_NE(nullptr, kernel.kernelInfo.kernelAllocation); EXPECT_TRUE(kernel.kernelInfo.kernelAllocation->isAllocatedInLocalMemoryPool()); const size_t newHeapSize = initialHeapSize; char newHeap[newHeapSize]{}; auto initialTaskCount = bcsMockContext->bcsCsr->peekTaskCount(); kernel.mockKernel->substituteKernelHeap(newHeap, newHeapSize); EXPECT_EQ(initialTaskCount + 1, bcsMockContext->bcsCsr->peekTaskCount()); device->getMemoryManager()->freeGraphicsMemory(kernel.kernelInfo.kernelAllocation); } HWTEST_TEMPLATED_F(BlitCopyTests, givenKernelAllocationInLocalMemoryWithoutCpuAccessAllowedWhenLinkerRequiresPatchingOfInstructionSegmentsThenUseBcsForTransfer) { DebugManager.flags.ForceLocalMemoryAccessMode.set(static_cast(LocalMemoryAccessMode::CpuAccessDisallowed)); DebugManager.flags.ForceNonSystemMemoryPlacement.set(1 << (static_cast(GraphicsAllocation::AllocationType::KERNEL_ISA) - 1)); device->getExecutionEnvironment()->rootDeviceEnvironments[0]->getMutableHardwareInfo()->capabilityTable.blitterOperationsSupported = true; auto linkerInput = std::make_unique>(); linkerInput->traits.requiresPatchingOfInstructionSegments = true; KernelInfo kernelInfo = {}; std::vector kernelHeap; 
// ---------------------------------------------------------------------------------------------
// Remainder of givenKernelAllocationInLocalMemoryWithoutCpuAccessAllowedWhenLinkerRequiresPatchingOfInstructionSegmentsThenUseBcsForTransfer:
//   the kernel heap is 32 bytes filled with the value 7; the kernel allocation created from it
//   must exist (ASSERT, it is dereferenced next) and be in the local memory pool. A MockProgram
//   built without a context gets the KernelInfo pointer and the linker input for the device's
//   root index. linkBinary must return CL_SUCCESS and raise the context's BCS CSR task count by
//   exactly one.
//   Cleanup order matters: the program's kernel-info array is cleared before the test returns
//   because it only holds a pointer to the stack-allocated `kernelInfo`
//   (presumably MockProgram would otherwise try to release it - confirm), and the kernel
//   allocation is then freed through the memory manager.
// The closing brace ends namespace NEO.
// ---------------------------------------------------------------------------------------------
kernelHeap.resize(32, 7); kernelInfo.heapInfo.pKernelHeap = kernelHeap.data(); kernelInfo.heapInfo.KernelHeapSize = static_cast(kernelHeap.size()); kernelInfo.createKernelAllocation(device->getDevice(), false); ASSERT_NE(nullptr, kernelInfo.kernelAllocation); EXPECT_TRUE(kernelInfo.kernelAllocation->isAllocatedInLocalMemoryPool()); MockProgram program{nullptr, false, toClDeviceVector(*device)}; program.getKernelInfoArray(device->getRootDeviceIndex()).push_back(&kernelInfo); program.setLinkerInput(device->getRootDeviceIndex(), std::move(linkerInput)); auto initialTaskCount = bcsMockContext->bcsCsr->peekTaskCount(); auto ret = program.linkBinary(&device->getDevice(), nullptr, nullptr); EXPECT_EQ(CL_SUCCESS, ret); EXPECT_EQ(initialTaskCount + 1, bcsMockContext->bcsCsr->peekTaskCount()); program.getKernelInfoArray(device->getRootDeviceIndex()).clear(); device->getMemoryManager()->freeGraphicsMemory(kernelInfo.kernelAllocation); } } // namespace NEO