diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index 390ca74348..5b71cb0918 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -809,7 +809,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( } if (enqueueProperties.blitPropertiesContainer->size() > 0) { - this->bcsTaskCount = getBcsCommandStreamReceiver()->blitBuffer(*enqueueProperties.blitPropertiesContainer, false); + this->bcsTaskCount = getBcsCommandStreamReceiver()->blitBuffer(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled()); dispatchFlags.implicitFlush = true; } @@ -956,9 +956,16 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( surface->makeResident(getGpgpuCommandStreamReceiver()); } + TimeStampData submitTimeStamp; + if (eventBuilder.getEvent() && isProfilingEnabled() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { + this->getDevice().getOSTime()->getCpuGpuTime(&submitTimeStamp); + eventBuilder.getEvent()->setSubmitTimeStamp(&submitTimeStamp); + eventBuilder.getEvent()->getTimestampPacketNodes()->makeResident(getGpgpuCommandStreamReceiver()); + } + if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) { UNRECOVERABLE_IF(!enqueueProperties.blitPropertiesContainer); - this->bcsTaskCount = getBcsCommandStreamReceiver()->blitBuffer(*enqueueProperties.blitPropertiesContainer, false); + this->bcsTaskCount = getBcsCommandStreamReceiver()->blitBuffer(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled()); } DispatchFlags dispatchFlags( diff --git a/opencl/source/event/event.cpp b/opencl/source/event/event.cpp index 3585f694d7..b64140e899 100644 --- a/opencl/source/event/event.cpp +++ b/opencl/source/event/event.cpp @@ -165,7 +165,6 @@ cl_int Event::getEventProfilingInfo(cl_profiling_info paramName, if (DebugManager.flags.ReturnRawGpuTimestamps.get()) { src = 
&queueTimeStamp.GPUTimeStamp; } - srcSize = sizeof(cl_ulong); break; diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index ee5ebae01b..20e268c1d1 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -204,7 +204,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate *currentTimestampPacketNodes, csrDeps, commandQueue.getGpgpuCommandStreamReceiver(), bcsCsr); - auto bcsTaskCount = bcsCsr.blitBuffer(kernelOperation->blitPropertiesContainer, false); + auto bcsTaskCount = bcsCsr.blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled()); commandQueue.updateBcsTaskCount(bcsTaskCount); } @@ -290,7 +290,7 @@ void CommandWithoutKernel::dispatchBlitOperation() { blitProperties.csrDependencies.push_back(&timestampPacketDependencies->barrierNodes); blitProperties.outputTimestampPacket = currentTimestampPacketNodes->peekNodes()[0]; - auto bcsTaskCount = bcsCsr->blitBuffer(kernelOperation->blitPropertiesContainer, false); + auto bcsTaskCount = bcsCsr->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled()); commandQueue.updateBcsTaskCount(bcsTaskCount); } diff --git a/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp b/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp index 99b613a2c8..d5d3817663 100644 --- a/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp +++ b/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp @@ -43,7 +43,7 @@ struct BlitEnqueueTests : public ::testing::Test { BlitPropertiesContainer container; container.push_back(blitProperties); - bcsCsr->blitBuffer(container, true); + bcsCsr->blitBuffer(container, true, false); return BlitOperationResult::Success; } diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp
index ce02743c13..7236c889a3 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp @@ -16,6 +16,7 @@ #include "opencl/test/unit_test/fixtures/enqueue_handler_fixture.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" #include "opencl/test/unit_test/mocks/mock_csr.h" +#include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_execution_environment.h" #include "opencl/test/unit_test/mocks/mock_graphics_allocation.h" #include "opencl/test/unit_test/mocks/mock_timestamp_container.h" @@ -53,6 +54,80 @@ HWTEST_F(EnqueueHandlerTest, GivenCommandStreamWithoutKernelWhenCommandEnqueuedT EXPECT_EQ(allocation->getTaskCount(mockCmdQ->getGpgpuCommandStreamReceiver().getOsContext().getContextId()), 1u); } +template +struct EnqueueHandlerTimestampTest : public EnqueueHandlerTest { + void SetUp() override { + DebugManager.flags.EnableTimestampPacket.set(enabled); + EnqueueHandlerTest::SetUp(); + } + + void TearDown() override { + EnqueueHandlerTest::TearDown(); + } + + DebugManagerStateRestore restorer; +}; + +using EnqueueHandlerTimestampEnabledTest = EnqueueHandlerTimestampTest; + +HWTEST_F(EnqueueHandlerTimestampEnabledTest, givenProflingAndTimeStampPacketsEnabledWhenEnqueueCommandWithoutKernelThenSubmitTimeStampIsSet) { + cl_queue_properties properties[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; + std::unique_ptr> mockCmdQ(new MockCommandQueueHw(context, pClDevice, properties)); + + char buffer[64]; + std::unique_ptr allocation(new MockGraphicsAllocation(buffer, sizeof(buffer))); + std::unique_ptr surface(new GeneralSurface(allocation.get())); + EventsRequest eventsRequest(0, nullptr, nullptr); + EventBuilder eventBuilder; + eventBuilder.create>(mockCmdQ.get(), CL_COMMAND_USER, CompletionStamp::levelNotReady, CompletionStamp::levelNotReady); + auto ev = static_cast *>(eventBuilder.getEvent()); + 
Surface *surfaces[] = {surface.get()}; + auto blocking = true; + TimestampPacketDependencies timestampPacketDependencies; + EnqueueProperties enqueueProperties(false, false, false, true, nullptr); + + EXPECT_EQ(ev->submitTimeStamp.CPUTimeinNS, 0u); + EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u); + + mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies, + eventsRequest, eventBuilder, 0); + + EXPECT_NE(ev->submitTimeStamp.CPUTimeinNS, 0u); + EXPECT_NE(ev->submitTimeStamp.GPUTimeStamp, 0u); + + delete ev; +} + +using EnqueueHandlerTimestampDisabledTest = EnqueueHandlerTimestampTest; + +HWTEST_F(EnqueueHandlerTimestampDisabledTest, givenProflingEnabledTimeStampPacketsDisabledWhenEnqueueCommandWithoutKernelThenSubmitTimeStampIsNotSet) { + cl_queue_properties properties[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; + std::unique_ptr> mockCmdQ(new MockCommandQueueHw(context, pClDevice, properties)); + + char buffer[64]; + std::unique_ptr allocation(new MockGraphicsAllocation(buffer, sizeof(buffer))); + std::unique_ptr surface(new GeneralSurface(allocation.get())); + EventsRequest eventsRequest(0, nullptr, nullptr); + EventBuilder eventBuilder; + eventBuilder.create>(mockCmdQ.get(), CL_COMMAND_USER, CompletionStamp::levelNotReady, CompletionStamp::levelNotReady); + auto ev = static_cast *>(eventBuilder.getEvent()); + Surface *surfaces[] = {surface.get()}; + auto blocking = true; + TimestampPacketDependencies timestampPacketDependencies; + EnqueueProperties enqueueProperties(false, false, false, true, nullptr); + + EXPECT_EQ(ev->submitTimeStamp.CPUTimeinNS, 0u); + EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u); + + mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies, + eventsRequest, eventBuilder, 0); + + EXPECT_EQ(ev->submitTimeStamp.CPUTimeinNS, 0u); + EXPECT_EQ(ev->submitTimeStamp.GPUTimeStamp, 0u); + 
+ delete ev; +} + HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontRegisterBlitProperties) { std::unique_ptr> mockCmdQ(new MockCommandQueueHw(context, pClDevice, 0)); auto &csr = mockCmdQ->getGpgpuCommandStreamReceiver(); diff --git a/opencl/test/unit_test/command_stream/aub_command_stream_receiver_1_tests.cpp b/opencl/test/unit_test/command_stream/aub_command_stream_receiver_1_tests.cpp index 36feee0f6b..af1e383f80 100644 --- a/opencl/test/unit_test/command_stream/aub_command_stream_receiver_1_tests.cpp +++ b/opencl/test/unit_test/command_stream/aub_command_stream_receiver_1_tests.cpp @@ -1167,7 +1167,7 @@ HWTEST_F(AubCommandStreamReceiverTests, WhenBlitBufferIsCalledThenCounterIsCorre BlitProperties blitProperties = BlitProperties::constructPropertiesForCopyBuffer(&allocation, &allocation, 0, 0, 0, 0, 0, 0, 0); BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); - aubCsr->blitBuffer(blitPropertiesContainer, true); + aubCsr->blitBuffer(blitPropertiesContainer, true, false); EXPECT_EQ(1u, aubCsr->blitBufferCalled); } diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.cpp index 11bb7ba90e..c91af9d5ab 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.cpp @@ -331,7 +331,7 @@ struct BcsTests : public CommandStreamReceiverHwTest { BlitPropertiesContainer container; container.push_back(blitProperties); - return bcsCsr->blitBuffer(container, blocking); + return bcsCsr->blitBuffer(container, blocking, false); } TimestampPacketContainer timestampPacketContainer; @@ -350,8 +350,8 @@ HWTEST_F(BcsTests, givenBltSizeWhenEstimatingCommandSizeThenAddAllRequiredComman auto expectedAlignedSize = cmdsSizePerBlit * alignedNumberOfBlts; auto expectedNotAlignedSize = cmdsSizePerBlit * 
notAlignedNumberOfBlts; - auto alignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize({alignedBltSize, 1, 1}, csrDependencies, false); - auto notAlignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize({notAlignedBltSize, 1, 1}, csrDependencies, false); + auto alignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize({alignedBltSize, 1, 1}, csrDependencies, false, false); + auto notAlignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize({notAlignedBltSize, 1, 1}, csrDependencies, false, false); EXPECT_EQ(expectedAlignedSize, alignedEstimatedSize); EXPECT_EQ(expectedNotAlignedSize, notAlignedEstimatedSize); @@ -368,8 +368,8 @@ HWTEST_F(BcsTests, givenBltSizeWhenEstimatingCommandSizeForReadBufferRectThenAdd auto expectedAlignedSize = cmdsSizePerBlit * alignedNumberOfBlts; auto expectedNotAlignedSize = cmdsSizePerBlit * notAlignedNumberOfBlts; - auto alignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(alignedBltSize, csrDependencies, false); - auto notAlignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(notAlignedBltSize, csrDependencies, false); + auto alignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(alignedBltSize, csrDependencies, false, false); + auto notAlignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(notAlignedBltSize, csrDependencies, false, false); EXPECT_EQ(expectedAlignedSize, alignedEstimatedSize); EXPECT_EQ(expectedNotAlignedSize, notAlignedEstimatedSize); @@ -409,7 +409,7 @@ HWTEST_F(BcsTests, givenBlitPropertiesContainerWhenExstimatingCommandsSizeThenCa expectedAlignedSize = alignUp(expectedAlignedSize, MemoryConstants::cacheLineSize); - auto alignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(blitPropertiesContainer, pDevice->getHardwareInfo()); + auto alignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(blitPropertiesContainer, pDevice->getHardwareInfo(), false); EXPECT_EQ(expectedAlignedSize, 
alignedEstimatedSize); } @@ -437,7 +437,7 @@ HWTEST_F(BcsTests, givenBlitPropertiesContainerWhenExstimatingCommandsSizeForWri expectedAlignedSize = alignUp(expectedAlignedSize, MemoryConstants::cacheLineSize); - auto alignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(blitPropertiesContainer, pDevice->getHardwareInfo()); + auto alignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(blitPropertiesContainer, pDevice->getHardwareInfo(), false); EXPECT_EQ(expectedAlignedSize, alignedEstimatedSize); } @@ -448,8 +448,8 @@ HWTEST_F(BcsTests, givenTimestampPacketWriteRequestWhenEstimatingSizeForCommands auto expectedSizeWithTimestampPacketWrite = expectedBaseSize + EncodeMiFlushDW::getMiFlushDwCmdSizeForDataWrite(); auto expectedSizeWithoutTimestampPacketWrite = expectedBaseSize; - auto estimatedSizeWithTimestampPacketWrite = BlitCommandsHelper::estimateBlitCommandsSize({1, 1, 1}, csrDependencies, true); - auto estimatedSizeWithoutTimestampPacketWrite = BlitCommandsHelper::estimateBlitCommandsSize({1, 1, 1}, csrDependencies, false); + auto estimatedSizeWithTimestampPacketWrite = BlitCommandsHelper::estimateBlitCommandsSize({1, 1, 1}, csrDependencies, true, false); + auto estimatedSizeWithoutTimestampPacketWrite = BlitCommandsHelper::estimateBlitCommandsSize({1, 1, 1}, csrDependencies, false, false); EXPECT_EQ(expectedSizeWithTimestampPacketWrite, estimatedSizeWithTimestampPacketWrite); EXPECT_EQ(expectedSizeWithoutTimestampPacketWrite, estimatedSizeWithoutTimestampPacketWrite); @@ -469,7 +469,7 @@ HWTEST_F(BcsTests, givenBltSizeAndCsrDependenciesWhenEstimatingCommandSizeThenAd size_t expectedSize = (cmdsSizePerBlit * numberOfBlts) + TimestampPacketHelper::getRequiredCmdStreamSize(csrDependencies); - auto estimatedSize = BlitCommandsHelper::estimateBlitCommandsSize({1, 1, 1}, csrDependencies, false); + auto estimatedSize = BlitCommandsHelper::estimateBlitCommandsSize({1, 1, 1}, csrDependencies, false, false); EXPECT_EQ(expectedSize, 
estimatedSize); } @@ -972,7 +972,7 @@ HWTEST_F(BcsTests, givenMultipleBlitPropertiesWhenDispatchingThenProgramCommands blitPropertiesContainer.push_back(blitProperties1); blitPropertiesContainer.push_back(blitProperties2); - csr.blitBuffer(blitPropertiesContainer, true); + csr.blitBuffer(blitPropertiesContainer, true, false); HardwareParse hwParser; hwParser.parseCommands(csr.commandStream); @@ -1001,6 +1001,46 @@ HWTEST_F(BcsTests, givenMultipleBlitPropertiesWhenDispatchingThenProgramCommands EXPECT_EQ(2u, dependenciesFound); } +HWTEST_F(BcsTests, givenProfilingEnabledWhenBlitBufferThenCommandBufferIsConstructedProperly) { + auto bcsOsContext = std::unique_ptr(OsContext::create(nullptr, 0, 0, aub_stream::ENGINE_BCS, PreemptionMode::Disabled, + false, false, false)); + auto bcsCsr = std::make_unique>(*pDevice->getExecutionEnvironment(), pDevice->getRootDeviceIndex()); + bcsCsr->setupContext(*bcsOsContext); + bcsCsr->initializeTagAllocation(); + + cl_int retVal = CL_SUCCESS; + auto buffer = clUniquePtr(Buffer::create(context.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal)); + void *hostPtr = reinterpret_cast(0x12340000); + + auto blitProperties = BlitProperties::constructPropertiesForReadWriteBuffer(BlitterConstants::BlitDirection::HostPtrToBuffer, + *bcsCsr, buffer->getGraphicsAllocation(), nullptr, hostPtr, + buffer->getGraphicsAllocation()->getGpuAddress(), 0, + 0, 0, {1, 1, 1}, 0, 0, 0, 0); + + MockTimestampPacketContainer timestamp(*bcsCsr->getTimestampPacketAllocator(), 1u); + blitProperties.outputTimestampPacket = timestamp.getNode(0); + + BlitPropertiesContainer blitPropertiesContainer; + blitPropertiesContainer.push_back(blitProperties); + + bcsCsr->blitBuffer(blitPropertiesContainer, false, true); + + HardwareParse hwParser; + hwParser.parseCommands(bcsCsr->commandStream); + auto &cmdList = hwParser.cmdList; + + auto cmdIterator = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), cmdIterator); + cmdIterator = find(++cmdIterator, 
cmdList.end()); + ASSERT_NE(cmdList.end(), cmdIterator); + cmdIterator = find(++cmdIterator, cmdList.end()); + ASSERT_NE(cmdList.end(), cmdIterator); + cmdIterator = find(++cmdIterator, cmdList.end()); + ASSERT_NE(cmdList.end(), cmdIterator); + cmdIterator = find(++cmdIterator, cmdList.end()); + ASSERT_NE(cmdList.end(), cmdIterator); +} + HWTEST_F(BcsTests, givenInputAllocationsWhenBlitDispatchedThenMakeAllAllocationsResident) { auto &csr = pDevice->getUltCommandStreamReceiver(); csr.storeMakeResidentAllocations = true; @@ -1027,7 +1067,7 @@ HWTEST_F(BcsTests, givenInputAllocationsWhenBlitDispatchedThenMakeAllAllocations blitPropertiesContainer.push_back(blitProperties1); blitPropertiesContainer.push_back(blitProperties2); - csr.blitBuffer(blitPropertiesContainer, false); + csr.blitBuffer(blitPropertiesContainer, false, false); EXPECT_TRUE(csr.isMadeResident(buffer1->getGraphicsAllocation())); EXPECT_TRUE(csr.isMadeResident(buffer2->getGraphicsAllocation())); @@ -1070,7 +1110,7 @@ HWTEST_F(BcsTests, givenFenceAllocationIsRequiredWhenBlitDispatchedThenMakeAllAl blitPropertiesContainer.push_back(blitProperties1); blitPropertiesContainer.push_back(blitProperties2); - bcsCsr->blitBuffer(blitPropertiesContainer, false); + bcsCsr->blitBuffer(blitPropertiesContainer, false, false); EXPECT_TRUE(bcsCsr->isMadeResident(buffer1->getGraphicsAllocation())); EXPECT_TRUE(bcsCsr->isMadeResident(buffer2->getGraphicsAllocation())); diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index a2adaf7985..92d7376ed1 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -509,7 +509,7 @@ class CommandStreamReceiverMock : public CommandStreamReceiver { void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override { } - uint32_t blitBuffer(const BlitPropertiesContainer 
&blitPropertiesContainer, bool blocking) override { return taskCount; }; + uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled) override { return taskCount; }; CompletionStamp flushTask( LinearStream &commandStream, diff --git a/opencl/test/unit_test/libult/ult_aub_command_stream_receiver.h b/opencl/test/unit_test/libult/ult_aub_command_stream_receiver.h index f46fc70449..92c4fbffb0 100644 --- a/opencl/test/unit_test/libult/ult_aub_command_stream_receiver.h +++ b/opencl/test/unit_test/libult/ult_aub_command_stream_receiver.h @@ -39,9 +39,9 @@ class UltAubCommandStreamReceiver : public AUBCommandStreamReceiverHw return csr; } - uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking) override { + uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled) override { blitBufferCalled++; - return BaseClass::blitBuffer(blitPropertiesContainer, blocking); + return BaseClass::blitBuffer(blitPropertiesContainer, blocking, profilingEnabled); } uint32_t blitBufferCalled = 0; diff --git a/opencl/test/unit_test/libult/ult_command_stream_receiver.h b/opencl/test/unit_test/libult/ult_command_stream_receiver.h index 0cb0c7c87d..b218d8f2e6 100644 --- a/opencl/test/unit_test/libult/ult_command_stream_receiver.h +++ b/opencl/test/unit_test/libult/ult_command_stream_receiver.h @@ -191,9 +191,9 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ return CommandStreamReceiverHw::obtainUniqueOwnership(); } - uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking) override { + uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled) override { blitBufferCalled++; - return CommandStreamReceiverHw::blitBuffer(blitPropertiesContainer, blocking); + return CommandStreamReceiverHw::blitBuffer(blitPropertiesContainer, blocking, 
profilingEnabled); } bool createPerDssBackedBuffer(Device &device) override { diff --git a/opencl/test/unit_test/mem_obj/buffer_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_tests.cpp index 378e538f8b..42b250aea2 100644 --- a/opencl/test/unit_test/mem_obj/buffer_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_tests.cpp @@ -689,7 +689,7 @@ struct BcsBufferTests : public ::testing::Test { BlitPropertiesContainer container; container.push_back(blitProperties); - bcsCsr->blitBuffer(container, true); + bcsCsr->blitBuffer(container, true, false); return BlitOperationResult::Success; } diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index ede3ed2625..b2e5b454ce 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -148,6 +148,7 @@ struct EncodeStoreMMIO { static const size_t size = sizeof(MI_STORE_REGISTER_MEM); static void encode(LinearStream &csr, uint32_t offset, uint64_t address); + static void remapOffset(MI_STORE_REGISTER_MEM *pStoreRegMem); }; template diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 822ebd1571..46fabb5e41 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -259,6 +259,7 @@ void EncodeStoreMMIO::encode(LinearStream &csr, uint32_t offset, uint64_ MI_STORE_REGISTER_MEM cmd = Family::cmdInitStoreRegisterMem; cmd.setRegisterAddress(offset); cmd.setMemoryAddress(address); + remapOffset(&cmd); auto buffer = csr.getSpaceForCmd(); *buffer = cmd; } diff --git a/shared/source/command_container/encode_compute_mode_bdw_plus.inl b/shared/source/command_container/encode_compute_mode_bdw_plus.inl index 9b5fc3daaf..edb8af76a5 100644 --- a/shared/source/command_container/encode_compute_mode_bdw_plus.inl +++ b/shared/source/command_container/encode_compute_mode_bdw_plus.inl @@ 
-13,4 +13,8 @@ namespace NEO { template void EncodeStates::adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency) { } + +template +void EncodeStoreMMIO::remapOffset(MI_STORE_REGISTER_MEM *pStoreRegMem) { +} } // namespace NEO \ No newline at end of file diff --git a/shared/source/command_container/encode_compute_mode_tgllp_plus.inl b/shared/source/command_container/encode_compute_mode_tgllp_plus.inl index 0e1550fe7b..4d630754d5 100644 --- a/shared/source/command_container/encode_compute_mode_tgllp_plus.inl +++ b/shared/source/command_container/encode_compute_mode_tgllp_plus.inl @@ -22,4 +22,9 @@ void EncodeStates::adjustStateComputeMode(LinearStream &csr, uint32_t nu EncodeComputeMode::adjustComputeMode(csr, numGrfRequired, &stateComputeMode, isMultiOsContextCapable); } + +template +void EncodeStoreMMIO::remapOffset(MI_STORE_REGISTER_MEM *pStoreRegMem) { + pStoreRegMem->setMmioRemapEnable(true); +} } // namespace NEO \ No newline at end of file diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 00e181c32f..d7a8e17a50 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -194,7 +194,7 @@ class CommandStreamReceiver { this->latestSentTaskCount = latestSentTaskCount; } - virtual uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking) = 0; + virtual uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled) = 0; ScratchSpaceController *getScratchSpaceController() const { return scratchSpaceController.get(); diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index d073ff940e..880390c221 100644 --- 
a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -83,7 +83,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { return CommandStreamReceiverType::CSR_HW; } - uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking) override; + uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled) override; bool isMultiOsContextCapable() const override; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 3d260c0229..130fd247d2 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -854,13 +854,13 @@ bool CommandStreamReceiverHw::detectInitProgrammingFlagsRequired(cons } template -uint32_t CommandStreamReceiverHw::blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking) { +uint32_t CommandStreamReceiverHw::blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled) { using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END; using MI_FLUSH_DW = typename GfxFamily::MI_FLUSH_DW; auto lock = obtainUniqueOwnership(); - auto &commandStream = getCS(BlitCommandsHelper::estimateBlitCommandsSize(blitPropertiesContainer, peekHwInfo())); + auto &commandStream = getCS(BlitCommandsHelper::estimateBlitCommandsSize(blitPropertiesContainer, peekHwInfo(), profilingEnabled)); auto commandStreamStart = commandStream.getUsed(); auto newTaskCount = taskCount + 1; latestSentTaskCount = newTaskCount; @@ -870,11 +870,27 @@ uint32_t CommandStreamReceiverHw::blitBuffer(const BlitPropertiesCont for (auto &blitProperties : blitPropertiesContainer) { TimestampPacketHelper::programCsrDependencies(commandStream, blitProperties.csrDependencies, 
getOsContext().getNumSupportedDevices()); + if (blitProperties.outputTimestampPacket && profilingEnabled) { + auto timestampContextStartGpuAddress = blitProperties.outputTimestampPacket->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextStart); + auto timestampGlobalStartAddress = blitProperties.outputTimestampPacket->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].globalStart); + + EncodeStoreMMIO::encode(commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextStartGpuAddress); + EncodeStoreMMIO::encode(commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalStartAddress); + } + BlitCommandsHelper::dispatchBlitCommandsForBuffer(blitProperties, commandStream, *this->executionEnvironment.rootDeviceEnvironments[this->rootDeviceIndex]); if (blitProperties.outputTimestampPacket) { - auto timestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*blitProperties.outputTimestampPacket); - EncodeMiFlushDW::programMiFlushDw(commandStream, timestampPacketGpuAddress, 0, true, true); + if (profilingEnabled) { + auto timestampContextEndGpuAddress = blitProperties.outputTimestampPacket->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextEnd); + auto timestampGlobalEndAddress = blitProperties.outputTimestampPacket->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].globalEnd); + + EncodeStoreMMIO::encode(commandStream, GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, timestampContextEndGpuAddress); + EncodeStoreMMIO::encode(commandStream, REG_GLOBAL_TIMESTAMP_LDW, timestampGlobalEndAddress); + } else { + auto timestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*blitProperties.outputTimestampPacket); + EncodeMiFlushDW::programMiFlushDw(commandStream, timestampPacketGpuAddress, 0, true, true); + } makeResident(*blitProperties.outputTimestampPacket->getBaseGraphicsAllocation()); } diff --git a/shared/source/helpers/blit_commands_helper.h 
b/shared/source/helpers/blit_commands_helper.h index 5f67956b2d..7e271b435b 100644 --- a/shared/source/helpers/blit_commands_helper.h +++ b/shared/source/helpers/blit_commands_helper.h @@ -83,8 +83,8 @@ struct BlitCommandsHelper { static uint64_t getMaxBlitHeight(); static void dispatchPostBlitCommand(LinearStream &linearStream); static size_t estimatePostBlitCommandSize(); - static size_t estimateBlitCommandsSize(Vec3 copySize, const CsrDependencies &csrDependencies, bool updateTimestampPacket); - static size_t estimateBlitCommandsSize(const BlitPropertiesContainer &blitPropertiesContainer, const HardwareInfo &hwInfo); + static size_t estimateBlitCommandsSize(Vec3 copySize, const CsrDependencies &csrDependencies, bool updateTimestampPacket, bool profilingEnabled); + static size_t estimateBlitCommandsSize(const BlitPropertiesContainer &blitPropertiesContainer, const HardwareInfo &hwInfo, bool profilingEnabled); static uint64_t calculateBlitCommandDestinationBaseAddress(const BlitProperties &blitProperties, uint64_t offset, uint64_t row, uint64_t slice); static uint64_t calculateBlitCommandSourceBaseAddress(const BlitProperties &blitProperties, uint64_t offset, uint64_t row, uint64_t slice); static void dispatchBlitCommandsForBuffer(const BlitProperties &blitProperties, LinearStream &linearStream, const RootDeviceEnvironment &rootDeviceEnvironment); diff --git a/shared/source/helpers/blit_commands_helper_base.inl b/shared/source/helpers/blit_commands_helper_base.inl index 45456acb45..8d17ed4a9f 100644 --- a/shared/source/helpers/blit_commands_helper_base.inl +++ b/shared/source/helpers/blit_commands_helper_base.inl @@ -60,7 +60,7 @@ size_t BlitCommandsHelper::estimatePostBlitCommandSize() { } template -size_t BlitCommandsHelper::estimateBlitCommandsSize(Vec3 copySize, const CsrDependencies &csrDependencies, bool updateTimestampPacket) { +size_t BlitCommandsHelper::estimateBlitCommandsSize(Vec3 copySize, const CsrDependencies &csrDependencies, bool 
updateTimestampPacket, bool profilingEnabled) { size_t numberOfBlits = 0; uint64_t width = 1; uint64_t height = 1; @@ -87,17 +87,25 @@ size_t BlitCommandsHelper::estimateBlitCommandsSize(Vec3 copy const size_t cmdsSizePerBlit = (sizeof(typename GfxFamily::XY_COPY_BLT) + estimatePostBlitCommandSize()); + size_t timestampCmdSize = 0; + if (updateTimestampPacket) { + if (profilingEnabled) { + timestampCmdSize = 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + } else { + timestampCmdSize = EncodeMiFlushDW::getMiFlushDwCmdSizeForDataWrite(); + } + } + return TimestampPacketHelper::getRequiredCmdStreamSize(csrDependencies) + - (cmdsSizePerBlit * numberOfBlits) + - (EncodeMiFlushDW::getMiFlushDwCmdSizeForDataWrite() * static_cast(updateTimestampPacket)); + (cmdsSizePerBlit * numberOfBlits) + timestampCmdSize; } template -size_t BlitCommandsHelper::estimateBlitCommandsSize(const BlitPropertiesContainer &blitPropertiesContainer, const HardwareInfo &hwInfo) { +size_t BlitCommandsHelper::estimateBlitCommandsSize(const BlitPropertiesContainer &blitPropertiesContainer, const HardwareInfo &hwInfo, bool profilingEnabled) { size_t size = 0; for (auto &blitProperties : blitPropertiesContainer) { size += BlitCommandsHelper::estimateBlitCommandsSize(blitProperties.copySize, blitProperties.csrDependencies, - blitProperties.outputTimestampPacket != nullptr); + blitProperties.outputTimestampPacket != nullptr, profilingEnabled); } size += MemorySynchronizationCommands::getSizeForAdditonalSynchronization(hwInfo); size += EncodeMiFlushDW::getMiFlushDwCmdSizeForDataWrite() + sizeof(typename GfxFamily::MI_BATCH_BUFFER_END); diff --git a/shared/test/unit_test/mocks/mock_command_stream_receiver.h b/shared/test/unit_test/mocks/mock_command_stream_receiver.h index 82bb7f98dd..87b0fba058 100644 --- a/shared/test/unit_test/mocks/mock_command_stream_receiver.h +++ b/shared/test/unit_test/mocks/mock_command_stream_receiver.h @@ -68,7 +68,7 @@ class MockCommandStreamReceiver : public 
CommandStreamReceiver { void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override { } - uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking) override { return taskCount; }; + uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled) override { return taskCount; }; CommandStreamReceiverType getType() override { return CommandStreamReceiverType::CSR_HW; @@ -156,9 +156,9 @@ class MockCsrHw2 : public CommandStreamReceiverHw { return completionStamp; } - uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking) override { + uint32_t blitBuffer(const BlitPropertiesContainer &blitPropertiesContainer, bool blocking, bool profilingEnabled) override { if (!skipBlitCalls) { - return CommandStreamReceiverHw::blitBuffer(blitPropertiesContainer, blocking); + return CommandStreamReceiverHw::blitBuffer(blitPropertiesContainer, blocking, profilingEnabled); } return taskCount; }