diff --git a/level_zero/core/source/printf_handler/printf_handler.cpp b/level_zero/core/source/printf_handler/printf_handler.cpp index f259d3c5f9..00849476bf 100644 --- a/level_zero/core/source/printf_handler/printf_handler.cpp +++ b/level_zero/core/source/printf_handler/printf_handler.cpp @@ -29,11 +29,13 @@ NEO::GraphicsAllocation *PrintfHandler::createPrintfBuffer(Device *device) { void PrintfHandler::printOutput(const KernelImmutableData *kernelData, NEO::GraphicsAllocation *printfBuffer, Device *device) { bool using32BitGpuPointers = kernelData->getDescriptor().kernelAttributes.gpuPointerSize == 4u; + + auto usesStringMap = kernelData->getDescriptor().kernelAttributes.flags.usesStringMapForPrintf || kernelData->getDescriptor().kernelAttributes.flags.requiresImplicitArgs; NEO::PrintFormatter printfFormatter{ static_cast(printfBuffer->getUnderlyingBuffer()), static_cast(printfBuffer->getUnderlyingBufferSize()), using32BitGpuPointers, - kernelData->getDescriptor().kernelAttributes.flags.usesStringMapForPrintf ? &kernelData->getDescriptor().kernelMetadata.printfStringsMap : nullptr}; + usesStringMap ? &kernelData->getDescriptor().kernelMetadata.printfStringsMap : nullptr}; printfFormatter.printKernelOutput(); *reinterpret_cast(printfBuffer->getUnderlyingBuffer()) = diff --git a/level_zero/core/test/unit_tests/fixtures/module_fixture.h b/level_zero/core/test/unit_tests/fixtures/module_fixture.h index e587dc5641..2667c1c514 100644 --- a/level_zero/core/test/unit_tests/fixtures/module_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/module_fixture.h @@ -42,6 +42,8 @@ struct ModuleImmutableDataFixture : public DeviceFixture { }; struct MockImmutableData : KernelImmutableData { + using KernelImmutableData::crossThreadDataSize; + using KernelImmutableData::crossThreadDataTemplate; using KernelImmutableData::kernelDescriptor; using KernelImmutableData::kernelInfo; MockImmutableData(uint32_t perHwThreadPrivateMemorySize) { @@ -113,6 +115,7 @@ struct ModuleImmutableDataFixture : public DeviceFixture { class MockKernel : public WhiteBox { public: + using KernelImp::crossThreadData; using KernelImp::kernelArgHandlers; using KernelImp::kernelHasIndirectAccess; using KernelImp::privateMemoryGraphicsAllocation; diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index 68253583ac..d7cfc0f6bc 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -32,7 +32,11 @@ #include "level_zero/core/test/unit_tests/mocks/mock_kernel.h" #include "level_zero/core/test/unit_tests/mocks/mock_module.h" -void NEO::populateKernelDescriptor(KernelDescriptor &dst, const PatchTokenBinary::KernelFromPatchtokens &src, uint32_t gpuPointerSizeInBytes); +namespace NEO { +void populatePointerKernelArg(ArgDescPointer &dst, + CrossThreadDataOffset stateless, uint8_t pointerSize, SurfaceStateHeapOffset bindful, CrossThreadDataOffset bindless, + KernelDescriptor::AddressingMode addressingMode); +} namespace L0 { namespace ult { @@ -2123,6 +2127,36 @@ TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsWhenSettingKernelParam EXPECT_EQ(0, memcmp(pImplicitArgs, &expectedImplicitArgs, sizeof(ImplicitArgs))); } +TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsAndPrintfStringsMapWhenPrintOutputThenProperStringIsPrinted) { + std::unique_ptr mockKernelImmData = std::make_unique(0u); + + auto kernelDescriptor = mockKernelImmData->kernelDescriptor; + kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = true; + kernelDescriptor->kernelAttributes.flags.usesPrintf = false; + kernelDescriptor->kernelAttributes.flags.usesStringMapForPrintf = false; + std::string expectedString("test123"); + kernelDescriptor->kernelMetadata.printfStringsMap.insert(std::make_pair(0u, expectedString)); + + createModuleFromBinary(0u, false, mockKernelImmData.get()); + + auto kernel = std::make_unique(module.get()); + + ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC}; + kernel->initialize(&kernelDesc); + + auto printfAllocation = reinterpret_cast(kernel->getPrintfBufferAllocation()->getUnderlyingBuffer()); + printfAllocation[0] = 8; + printfAllocation[1] = 0; + + EXPECT_TRUE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs); + ASSERT_NE(nullptr, kernel->getImplicitArgs()); + + testing::internal::CaptureStdout(); + kernel->printPrintfOutput(); + std::string output = testing::internal::GetCapturedStdout(); + EXPECT_STREQ(expectedString.c_str(), output.c_str()); +} + TEST_F(KernelImplicitArgTests, givenKernelWithoutImplicitArgsWhenPatchingImplicitArgsThenNothingHappens) { std::unique_ptr mockKernelImmData = std::make_unique(0u); mockKernelImmData->kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = false; diff --git a/opencl/source/helpers/task_information.h b/opencl/source/helpers/task_information.h index c7b9068cba..dd6bfb8a30 100644 --- a/opencl/source/helpers/task_information.h +++ b/opencl/source/helpers/task_information.h @@ -139,6 +139,7 @@ class CommandComputeKernel : public Command { LinearStream *getCommandStream() override { return kernelOperation->commandStream.get(); } Kernel *peekKernel() const { return kernel; } + PrintfHandler *peekPrintfHandler() const { return printfHandler.get(); } protected: std::vector surfaces; diff --git a/opencl/source/program/printf_handler.cpp b/opencl/source/program/printf_handler.cpp index 767272583e..d45f1e4ca1 100644 --- a/opencl/source/program/printf_handler.cpp +++ b/opencl/source/program/printf_handler.cpp @@ -61,16 +61,19 @@ void PrintfHandler::prepareDispatch(const MultiDispatchInfo &multiDispatchInfo) device.getDevice(), printfSurface, 0, printfSurfaceInitialDataSizePtr.get(), sizeof(*printfSurfaceInitialDataSizePtr.get())); - const auto &printfSurfaceArg = kernel->getKernelInfo().kernelDescriptor.payloadMappings.implicitArgs.printfSurfaceAddress; - auto printfPatchAddress = ptrOffset(reinterpret_cast(kernel->getCrossThreadData()), printfSurfaceArg.stateless); - patchWithRequiredSize(printfPatchAddress, printfSurfaceArg.pointerSize, (uintptr_t)printfSurface->getGpuAddressToPatch()); - if (isValidOffset(printfSurfaceArg.bindful)) { - auto surfaceState = ptrOffset(reinterpret_cast(kernel->getSurfaceStateHeap()), printfSurfaceArg.bindful); - void *addressToPatch = printfSurface->getUnderlyingBuffer(); - size_t sizeToPatch = printfSurface->getUnderlyingBufferSize(); - Buffer::setSurfaceState(&device.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, printfSurface, 0, 0, - kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, - kernel->areMultipleSubDevicesInContext()); + if (kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.usesPrintf) { + + const auto &printfSurfaceArg = kernel->getKernelInfo().kernelDescriptor.payloadMappings.implicitArgs.printfSurfaceAddress; + auto printfPatchAddress = ptrOffset(reinterpret_cast(kernel->getCrossThreadData()), printfSurfaceArg.stateless); + patchWithRequiredSize(printfPatchAddress, printfSurfaceArg.pointerSize, (uintptr_t)printfSurface->getGpuAddressToPatch()); + if (isValidOffset(printfSurfaceArg.bindful)) { + auto surfaceState = ptrOffset(reinterpret_cast(kernel->getSurfaceStateHeap()), printfSurfaceArg.bindful); + void *addressToPatch = printfSurface->getUnderlyingBuffer(); + size_t sizeToPatch = printfSurface->getUnderlyingBufferSize(); + Buffer::setSurfaceState(&device.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, printfSurface, 0, 0, + kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, + kernel->areMultipleSubDevicesInContext()); + } } auto pImplicitArgs = kernel->getImplicitArgs(); if (pImplicitArgs) { @@ -83,31 +86,28 @@ void PrintfHandler::makeResident(CommandStreamReceiver &commandStreamReceiver) { } void PrintfHandler::printEnqueueOutput() { + auto usesStringMap = kernel->getDescriptor().kernelAttributes.flags.usesStringMapForPrintf || nullptr != kernel->getImplicitArgs(); const auto &hwInfoConfig = *HwInfoConfig::get(device.getHardwareInfo().platform.eProductFamily); + auto printfOutputBuffer = reinterpret_cast(printfSurface->getUnderlyingBuffer()); + auto printfOutputSize = static_cast(printfSurface->getUnderlyingBufferSize()); + std::unique_ptr printfOutputDecompressed; if (hwInfoConfig.allowStatelessCompression(device.getHardwareInfo())) { - auto printOutputSize = static_cast(printfSurface->getUnderlyingBufferSize()); - auto printOutputDecompressed = std::make_unique(printOutputSize); + printfOutputDecompressed = std::make_unique(printfOutputSize); + printfOutputBuffer = printfOutputDecompressed.get(); auto &bcsEngine = device.getEngine(EngineHelpers::getBcsEngineType(device.getHardwareInfo(), device.getDeviceBitfield(), device.getSelectorCopyEngine(), true), EngineUsage::Regular); BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back( BlitProperties::constructPropertiesForReadWrite(BlitterConstants::BlitDirection::BufferToHostPtr, *bcsEngine.commandStreamReceiver, printfSurface, nullptr, - printOutputDecompressed.get(), + printfOutputDecompressed.get(), printfSurface->getGpuAddress(), - 0, 0, 0, Vec3(printOutputSize, 0, 0), 0, 0, 0, 0)); + 0, 0, 0, Vec3(printfOutputSize, 0, 0), 0, 0, 0, 0)); bcsEngine.commandStreamReceiver->blitBuffer(blitPropertiesContainer, true, false, device.getDevice()); - - PrintFormatter printFormatter(printOutputDecompressed.get(), printOutputSize, - kernel->is32Bit(), - kernel->getDescriptor().kernelAttributes.flags.usesStringMapForPrintf ? &kernel->getDescriptor().kernelMetadata.printfStringsMap : nullptr); - printFormatter.printKernelOutput(); - return; } - PrintFormatter printFormatter(reinterpret_cast(printfSurface->getUnderlyingBuffer()), static_cast(printfSurface->getUnderlyingBufferSize()), - kernel->is32Bit(), - kernel->getDescriptor().kernelAttributes.flags.usesStringMapForPrintf ? &kernel->getDescriptor().kernelMetadata.printfStringsMap : nullptr); + PrintFormatter printFormatter(printfOutputBuffer, printfOutputSize, kernel->is32Bit(), + usesStringMap ? &kernel->getDescriptor().kernelMetadata.printfStringsMap : nullptr); printFormatter.printKernelOutput(); } } // namespace NEO diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp index 347c8fd010..2732b49a09 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp @@ -623,49 +623,91 @@ HWCMDTEST_P(IGFX_GEN8_CORE, EnqueueKernelPrintfTest, GivenKernelWithPrintfBlocke } HWTEST_P(EnqueueKernelPrintfTest, GivenKernelWithPrintfBlockedByEventWhenEventUnblockedThenOutputPrinted) { - typedef typename FamilyType::PARSE PARSE; + testing::internal::CaptureStdout(); - // In scenarios with 32bit allocator and 64 bit tests this code won't work - // due to inability to retrieve original buffer pointer as it is done in this test. - auto memoryManager = pDevice->getMemoryManager(); - if (!memoryManager->peekForce32BitAllocations() && !memoryManager->isLimitedRange(0)) { - testing::internal::CaptureStdout(); + auto userEvent = make_releaseable(context); - auto userEvent = make_releaseable(context); + MockKernelWithInternals mockKernel(*pClDevice); + mockKernel.kernelInfo.setPrintfSurface(sizeof(uintptr_t), 0); + std::string testString = "test"; + mockKernel.kernelInfo.addToPrintfStringsMap(0, testString); - MockKernelWithInternals mockKernel(*pClDevice); - mockKernel.kernelInfo.setPrintfSurface(sizeof(uintptr_t), 0); - std::string testString = "test"; - mockKernel.kernelInfo.addToPrintfStringsMap(0, testString); + cl_uint workDim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; - cl_uint workDim = 1; - size_t globalWorkOffset[3] = {0, 0, 0}; + FillValues(); - FillValues(); + cl_event blockedEvent = userEvent.get(); + cl_event outEvent{}; + auto retVal = pCmdQ->enqueueKernel( + mockKernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + 1, + &blockedEvent, + &outEvent); - cl_event blockedEvent = userEvent.get(); - auto retVal = pCmdQ->enqueueKernel( - mockKernel, - workDim, - globalWorkOffset, - globalWorkSize, - localWorkSize, - 1, - &blockedEvent, - nullptr); + ASSERT_EQ(CL_SUCCESS, retVal); - ASSERT_EQ(CL_SUCCESS, retVal); + auto pOutEvent = castToObject(outEvent); - auto crossThreadData = reinterpret_cast(mockKernel.mockKernel->getCrossThreadData()); - auto printfAllocation = reinterpret_cast(*crossThreadData); - printfAllocation[0] = 8; - printfAllocation[1] = 0; + auto printfAllocation = reinterpret_cast(static_cast(pOutEvent->peekCommand())->peekPrintfHandler()->getSurface()->getUnderlyingBuffer()); + printfAllocation[0] = 8; + printfAllocation[1] = 0; - userEvent->setStatus(CL_COMPLETE); + pOutEvent->release(); - std::string output = testing::internal::GetCapturedStdout(); - EXPECT_STREQ("test", output.c_str()); - } + userEvent->setStatus(CL_COMPLETE); + + std::string output = testing::internal::GetCapturedStdout(); + EXPECT_STREQ("test", output.c_str()); +} + +HWTEST_P(EnqueueKernelPrintfTest, GivenKernelWithImplicitArgsWithoutPrintfInParentKernelBlockedByEventWhenEventUnblockedThenOutputPrinted) { + auto userEvent = make_releaseable(context); + + MockKernelWithInternals mockKernel(*pClDevice); + std::string testString = "test"; + mockKernel.kernelInfo.addToPrintfStringsMap(0, testString); + mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.flags.usesPrintf = false; + mockKernel.kernelInfo.kernelDescriptor.kernelAttributes.flags.usesStringMapForPrintf = false; + mockKernel.mockKernel->pImplicitArgs = std::make_unique(); + *mockKernel.mockKernel->pImplicitArgs = {}; + + cl_uint workDim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; + + FillValues(); + + cl_event blockedEvent = userEvent.get(); + cl_event outEvent{}; + auto retVal = pCmdQ->enqueueKernel( + mockKernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + 1, + &blockedEvent, + &outEvent); + + ASSERT_EQ(CL_SUCCESS, retVal); + + auto pOutEvent = castToObject(outEvent); + + auto printfAllocation = reinterpret_cast(static_cast(pOutEvent->peekCommand())->peekPrintfHandler()->getSurface()->getUnderlyingBuffer()); + printfAllocation[0] = 8; + printfAllocation[1] = 0; + + pOutEvent->release(); + + testing::internal::CaptureStdout(); + userEvent->setStatus(CL_COMPLETE); + std::string output = testing::internal::GetCapturedStdout(); + + EXPECT_STREQ("test", output.c_str()); } INSTANTIATE_TEST_CASE_P(EnqueueKernel, diff --git a/opencl/test/unit_test/mocks/mock_kernel.h b/opencl/test/unit_test/mocks/mock_kernel.h index 865026a9a5..90db241a01 100644 --- a/opencl/test/unit_test/mocks/mock_kernel.h +++ b/opencl/test/unit_test/mocks/mock_kernel.h @@ -124,6 +124,7 @@ class MockKernel : public Kernel { using Kernel::parentEventOffset; using Kernel::patchBufferOffset; using Kernel::patchWithImplicitSurface; + using Kernel::pImplicitArgs; using Kernel::preferredWkgMultipleOffset; using Kernel::privateSurface; using Kernel::singleSubdevicePreferredInCurrentEnqueue;