diff --git a/level_zero/core/test/aub_tests/bindless/bindless_kernel_aub_tests.cpp b/level_zero/core/test/aub_tests/bindless/bindless_kernel_aub_tests.cpp
index 6f1b3dbdfc..4e43373baf 100644
--- a/level_zero/core/test/aub_tests/bindless/bindless_kernel_aub_tests.cpp
+++ b/level_zero/core/test/aub_tests/bindless/bindless_kernel_aub_tests.cpp
@@ -59,11 +59,13 @@ struct L0BindlessAub : Test<AUBFixtureL0> {
     ModuleImp *module = nullptr;
 };
 
-HWTEST_F(L0BindlessAub, DISABLED_GivenBindlessKernelWhenExecutedThenOutputIsCorrect) {
+HWTEST2_F(L0BindlessAub, GivenBindlessKernelWhenExecutedThenOutputIsCorrect, IsAtMostXeHpgCore) {
     constexpr size_t bufferSize = MemoryConstants::pageSize;
     const uint32_t groupSize[] = {32, 1, 1};
     const uint32_t groupCount[] = {bufferSize / 32, 1, 1};
 
+    NEO::DebugManager.flags.UpdateCrossThreadDataSize.set(true); // NOTE(review): no DebugManagerStateRestore is visible in this hunk - confirm the fixture restores debug flags
+
     NEO::SVMAllocsManager::UnifiedMemoryProperties unifiedMemoryProperties(InternalMemoryType::HOST_UNIFIED_MEMORY,
                                                                            context->rootDeviceIndices,
                                                                            context->deviceBitfields);
diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config
index bf87b42d3a..ffd4693d80 100644
--- a/opencl/test/unit_test/test_files/igdrcl.config
+++ b/opencl/test/unit_test/test_files/igdrcl.config
@@ -357,4 +357,5 @@ ProgramGlobalFenceAsPostSyncOperationInComputeWalker = -1
 ProgramGlobalFenceAsKernelInstructionInEUKernel = -1
 DoNotReportTile1BscWaActive = -1
 ForceTile0PlacementForTile1ResourcesWaActive = -1
-ClosEnabled = -1
\ No newline at end of file
+ClosEnabled = -1
+UpdateCrossThreadDataSize = 0
\ No newline at end of file
diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl
index 2fcb3555bd..33b8dfb585 100644
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -74,6 +74,7 @@ DECLARE_DEBUG_VARIABLE(bool, DoNotFreeResources, false, "true: driver stops free
 DECLARE_DEBUG_VARIABLE(bool, AllowMixingRegularAndCooperativeKernels, false, "true: driver allows mixing regular and cooperative kernels in a single command list and in a single execute")
 DECLARE_DEBUG_VARIABLE(bool, AllowPatchingVfeStateInCommandLists, false, "true: MEDIA_VFE_STATE may be programmed in a command list")
 DECLARE_DEBUG_VARIABLE(bool, PrintMemoryRegionSizes, false, "print memory bank type, instance and it's size")
+DECLARE_DEBUG_VARIABLE(bool, UpdateCrossThreadDataSize, false, "Turn on cross thread data size calculation for PATCH TOKEN binary")
 DECLARE_DEBUG_VARIABLE(std::string, ForceDeviceId, std::string("unk"), "DeviceId selected for testing")
 DECLARE_DEBUG_VARIABLE(std::string, LoadBinarySipFromFile, std::string("unk"), "Select binary file to load SIP kernel raw binary. When file named *_header.* exists, it is used as header")
 DECLARE_DEBUG_VARIABLE(std::string, InjectInternalBuildOptions, std::string("unk"), "Appends internal build options string to user modules")
diff --git a/shared/source/kernel/kernel_descriptor.h b/shared/source/kernel/kernel_descriptor.h
index 7efe63159a..9ea14c213e 100644
--- a/shared/source/kernel/kernel_descriptor.h
+++ b/shared/source/kernel/kernel_descriptor.h
@@ -7,6 +7,7 @@
 
 #pragma once
 
+#include "shared/source/helpers/aligned_memory.h"
 #include "shared/source/helpers/debug_helpers.h"
 #include "shared/source/kernel/debug_data.h"
 #include "shared/source/kernel/kernel_arg_descriptor.h"
@@ -44,6 +45,107 @@ struct KernelDescriptor {
     virtual ~KernelDescriptor() = default;
     virtual bool hasRTCalls() const;
 
+    // Recomputes the cross-thread data size from the payload mappings: finds the highest
+    // end offset (offset + size) among dispatch traits, implicit args and explicit args,
+    // aligns it up to 32 and stores it in kernelAttributes.crossThreadDataSize when it is
+    // larger than the value already there. The stored size is never reduced.
+    void updateCrossThreadDataSize() {
+        uint32_t crossThreadDataSize = 0;
+        for (uint32_t i = 0; i < 3; i++) { // one slot per work dimension
+            if (isValidOffset(payloadMappings.dispatchTraits.globalWorkOffset[i])) {
+                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.globalWorkOffset[i] + sizeof(uint32_t));
+            }
+            if (isValidOffset(payloadMappings.dispatchTraits.globalWorkSize[i])) {
+                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.globalWorkSize[i] + sizeof(uint32_t));
+            }
+            if (isValidOffset(payloadMappings.dispatchTraits.localWorkSize[i])) {
+                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.localWorkSize[i] + sizeof(uint32_t));
+            }
+            if (isValidOffset(payloadMappings.dispatchTraits.localWorkSize2[i])) {
+                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.localWorkSize2[i] + sizeof(uint32_t));
+            }
+            if (isValidOffset(payloadMappings.dispatchTraits.enqueuedLocalWorkSize[i])) {
+                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.enqueuedLocalWorkSize[i] + sizeof(uint32_t));
+            }
+            if (isValidOffset(payloadMappings.dispatchTraits.numWorkGroups[i])) {
+                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.numWorkGroups[i] + sizeof(uint32_t));
+            }
+        }
+
+        if (isValidOffset(payloadMappings.dispatchTraits.workDim)) {
+            crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.workDim + sizeof(uint32_t));
+        }
+
+        StackVec<ArgDescPointer *, 8> implicitArgsVec({&payloadMappings.implicitArgs.printfSurfaceAddress, // implicit args that carry bindless/stateless offsets and a pointer size
+                                                       &payloadMappings.implicitArgs.globalVariablesSurfaceAddress,
+                                                       &payloadMappings.implicitArgs.globalConstantsSurfaceAddress,
+                                                       &payloadMappings.implicitArgs.privateMemoryAddress,
+                                                       &payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress,
+                                                       &payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress,
+                                                       &payloadMappings.implicitArgs.systemThreadSurfaceAddress,
+                                                       &payloadMappings.implicitArgs.syncBufferAddress});
+
+        for (size_t i = 0; i < implicitArgsVec.size(); i++) {
+            if (isValidOffset(implicitArgsVec[i]->bindless)) {
+                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, implicitArgsVec[i]->bindless + sizeof(uint32_t));
+            }
+
+            if (isValidOffset(implicitArgsVec[i]->stateless)) {
+                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, implicitArgsVec[i]->stateless + implicitArgsVec[i]->pointerSize);
+            }
+        }
+
+        StackVec<CrossThreadDataOffset *, 7> implicitArgsVec2({&payloadMappings.implicitArgs.privateMemorySize, // implicit args stored as a bare offset, each counted as 4 bytes
+                                                               &payloadMappings.implicitArgs.maxWorkGroupSize,
+                                                               &payloadMappings.implicitArgs.simdSize,
+                                                               &payloadMappings.implicitArgs.deviceSideEnqueueParentEvent,
+                                                               &payloadMappings.implicitArgs.preferredWkgMultiple,
+                                                               &payloadMappings.implicitArgs.localMemoryStatelessWindowSize,
+                                                               &payloadMappings.implicitArgs.localMemoryStatelessWindowStartAddres});
+
+        for (size_t i = 0; i < implicitArgsVec2.size(); i++) {
+            if (isValidOffset(*implicitArgsVec2[i])) {
+                crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, *implicitArgsVec2[i] + sizeof(uint32_t));
+            }
+        }
+
+        for (size_t i = 0; i < payloadMappings.explicitArgs.size(); i++) {
+
+            switch (payloadMappings.explicitArgs[i].type) {
+            case ArgDescriptor::ArgType::ArgTImage: {
+                auto &argImage = payloadMappings.explicitArgs[i].as<ArgDescImage>(false);
+                if (isValidOffset(argImage.bindless)) {
+                    crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argImage.bindless + sizeof(uint32_t));
+                }
+            } break;
+            case ArgDescriptor::ArgType::ArgTPointer: {
+                auto &argPtr = payloadMappings.explicitArgs[i].as<ArgDescPointer>(false);
+                if (isValidOffset(argPtr.bindless)) {
+                    crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argPtr.bindless + sizeof(uint32_t));
+                }
+                if (isValidOffset(argPtr.stateless)) {
+                    crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argPtr.stateless + argPtr.pointerSize);
+                }
+            } break;
+            case ArgDescriptor::ArgType::ArgTSampler: {
+                auto &argSampler = payloadMappings.explicitArgs[i].as<ArgDescSampler>(false);
+                UNRECOVERABLE_IF(isValidOffset(argSampler.bindless)); // a sampler with a valid bindless offset is treated as unrecoverable here
+            } break;
+            case ArgDescriptor::ArgType::ArgTValue: {
+                auto &argVal = payloadMappings.explicitArgs[i].as<ArgDescValue>(false);
+                for (size_t elementIdx = 0; elementIdx < argVal.elements.size(); elementIdx++) { // distinct index name: does not shadow the explicit-arg index
+                    UNRECOVERABLE_IF(!isValidOffset(argVal.elements[elementIdx].offset));
+                    crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argVal.elements[elementIdx].offset + argVal.elements[elementIdx].size);
+                }
+            } break;
+            default:
+                break;
+            }
+        }
+
+        this->kernelAttributes.crossThreadDataSize = std::max(this->kernelAttributes.crossThreadDataSize, static_cast<uint16_t>(alignUp(crossThreadDataSize, 32))); // NOTE(review): cast narrows the 32-bit result - confirm it cannot exceed the attribute's range
+    }
+
     struct KernelAttributes {
         KernelAttributes() { flags.packed = 0U; }
 
diff --git a/shared/source/kernel/kernel_descriptor_from_patchtokens.cpp b/shared/source/kernel/kernel_descriptor_from_patchtokens.cpp
index 993545ff3e..c3730ff02a 100644
--- a/shared/source/kernel/kernel_descriptor_from_patchtokens.cpp
+++ b/shared/source/kernel/kernel_descriptor_from_patchtokens.cpp
@@ -525,6 +525,10 @@ void populateKernelDescriptor(KernelDescriptor &dst, const PatchTokenBinary::Ker
     }
 
     dst.kernelAttributes.gpuPointerSize = gpuPointerSizeInBytes;
+
+    if (DebugManager.flags.UpdateCrossThreadDataSize.get()) { // opt-in: only recompute when the debug flag is set
+        dst.updateCrossThreadDataSize();
+    }
 }
 
 } // namespace NEO
diff --git a/shared/test/unit_test/kernel/kernel_descriptor_from_patchtokens_tests.cpp b/shared/test/unit_test/kernel/kernel_descriptor_from_patchtokens_tests.cpp
index 31d07e6a02..355f17d51b 100644
--- a/shared/test/unit_test/kernel/kernel_descriptor_from_patchtokens_tests.cpp
+++ b/shared/test/unit_test/kernel/kernel_descriptor_from_patchtokens_tests.cpp
@@ -10,6 +10,7 @@
 #include "shared/source/kernel/kernel_arg_descriptor_extended_vme.h"
 #include "shared/source/kernel/kernel_descriptor.h"
 #include "shared/source/kernel/kernel_descriptor_from_patchtokens.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
 #include "shared/test/unit_test/device_binary_format/patchtokens_tests.h"
 
 #include "test.h"
@@ -1311,3 +1312,194 @@ TEST(KernelDescriptorFromPatchtokens, GivenKernelWithChildBlocksMetadataImplicit
     EXPECT_EQ(childBlocks[0].Offset, dst.kernelMetadata.deviceSideEnqueueChildrenKernelsIdOffset[0].second);
     EXPECT_EQ(childBlocks[1].Offset, dst.kernelMetadata.deviceSideEnqueueChildrenKernelsIdOffset[1].second);
 }
+
+TEST(KernelDescriptorFromPatchtokens, GivenDispatchTraitsImplicitArgsAndExplicitArgsWhenPopulatingKernelDescriptorThenCrossThreadDataSizeIsSetToMaxOffsetAndAligned) {
+    NEO::PatchTokenBinary::KernelFromPatchtokens kernelTokens;
+    iOpenCL::SKernelBinaryHeaderCommon kernelHeader;
+    kernelTokens.header = &kernelHeader;
+
+    DebugManagerStateRestore dbgRestorer;
+    NEO::DebugManager.flags.UpdateCrossThreadDataSize.set(true);
+
+    iOpenCL::SPatchDataParameterBuffer localWorkSize[3] = {};
+    iOpenCL::SPatchDataParameterBuffer localWorkSize2[3] = {};
+    iOpenCL::SPatchDataParameterBuffer enqueuedLocalWorkSize[3] = {};
+    iOpenCL::SPatchDataParameterBuffer numWorkGroups[3] = {};
+    iOpenCL::SPatchDataParameterBuffer globalWorkOffset[3] = {};
+    iOpenCL::SPatchDataParameterBuffer globalWorkSize[3] = {};
+    iOpenCL::SPatchDataParameterBuffer maxWorkGroupSize = {};
+    iOpenCL::SPatchDataParameterBuffer workDimensions = {};
+    iOpenCL::SPatchDataParameterBuffer simdSize = {};
+    iOpenCL::SPatchDataParameterBuffer parentEvent = {};
+    iOpenCL::SPatchDataParameterBuffer privateMemoryStatelessSize = {};
+    iOpenCL::SPatchDataParameterBuffer localMemoryStatelessWindowSize = {};
+    iOpenCL::SPatchDataParameterBuffer localMemoryStatelessWindowStartAddress = {};
+    iOpenCL::SPatchDataParameterBuffer preferredWorkgroupMultiple = {};
+
+    uint32_t expectedMaxSizes[5] = {96 + 4, 108 + 4, 128 + 2, 140 + 8, 176 + 8}; // expected size per iteration, before the 32-byte alignment applied in EXPECT_EQ
+    for (int i = 0; i < 5; i++) { // each iteration enables a larger set of tokens
+        NEO::KernelDescriptor kernelDescriptor;
+
+        iOpenCL::SPatchExecutionEnvironment execEnv = {};
+        execEnv.UseBindlessMode = 1;
+        kernelTokens.tokens.executionEnvironment = &execEnv;
+
+        iOpenCL::SPatchDataParameterStream dataParameterStream = {};
+        dataParameterStream.DataParameterStreamSize = 32;
+        kernelTokens.tokens.dataParameterStream = &dataParameterStream;
+
+        iOpenCL::SPatchAllocateStatelessPrivateSurface privateSurface = {};
+        privateSurface.DataParamOffset = 0;
+        kernelTokens.tokens.allocateStatelessPrivateSurface = &privateSurface;
+
+        iOpenCL::SPatchAllocateStatelessConstantMemorySurfaceWithInitialization constantSurface = {};
+        constantSurface.DataParamOffset = 8;
+        kernelTokens.tokens.allocateStatelessConstantMemorySurfaceWithInitialization = &constantSurface;
+
+        iOpenCL::SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization globalsSurface = {};
+        globalsSurface.DataParamOffset = 16;
+        kernelTokens.tokens.allocateStatelessGlobalMemorySurfaceWithInitialization = &globalsSurface;
+
+        iOpenCL::SPatchAllocateStatelessPrintfSurface printfSurface = {};
+        printfSurface.DataParamOffset = 24;
+        kernelTokens.tokens.allocateStatelessPrintfSurface = &printfSurface;
+
+        localWorkSize[0].Offset = 28;
+        localWorkSize[1].Offset = 32;
+        localWorkSize[2].Offset = 36;
+        localWorkSize2[0].Offset = 40;
+        localWorkSize2[1].Offset = 44;
+        localWorkSize2[2].Offset = 48;
+        enqueuedLocalWorkSize[0].Offset = 52;
+        enqueuedLocalWorkSize[1].Offset = 56;
+        enqueuedLocalWorkSize[2].Offset = 60;
+        numWorkGroups[0].Offset = 64;
+        numWorkGroups[1].Offset = 68;
+        numWorkGroups[2].Offset = 72;
+        globalWorkOffset[0].Offset = 76;
+        globalWorkOffset[1].Offset = 80;
+        globalWorkOffset[2].Offset = 84;
+        globalWorkSize[0].Offset = 88;
+        globalWorkSize[1].Offset = 92;
+        globalWorkSize[2].Offset = 96;
+        globalWorkSize[2].DataSize = 4;
+        maxWorkGroupSize.Offset = 100;
+        workDimensions.Offset = 104;
+        simdSize.Offset = 108;
+        parentEvent.Offset = 112;
+        privateMemoryStatelessSize.Offset = 116;
+        localMemoryStatelessWindowSize.Offset = 120;
+        localMemoryStatelessWindowStartAddress.Offset = 124;
+        preferredWorkgroupMultiple.Offset = 128;
+        preferredWorkgroupMultiple.DataSize = 2;
+
+        for (uint32_t dim = 0; dim < 3U; ++dim) { // distinct index name: does not shadow the scenario index i
+            kernelTokens.tokens.crossThreadPayloadArgs.localWorkSize[dim] = &localWorkSize[dim];
+            kernelTokens.tokens.crossThreadPayloadArgs.localWorkSize2[dim] = &localWorkSize2[dim];
+            kernelTokens.tokens.crossThreadPayloadArgs.globalWorkOffset[dim] = &globalWorkOffset[dim];
+            kernelTokens.tokens.crossThreadPayloadArgs.enqueuedLocalWorkSize[dim] = &enqueuedLocalWorkSize[dim];
+            kernelTokens.tokens.crossThreadPayloadArgs.globalWorkSize[dim] = &globalWorkSize[dim];
+            kernelTokens.tokens.crossThreadPayloadArgs.numWorkGroups[dim] = &numWorkGroups[dim];
+        }
+
+        iOpenCL::SPatchSamplerKernelArgument paramArg0 = {};
+        iOpenCL::SPatchDataParameterBuffer paramArg1 = {};
+        iOpenCL::SPatchDataParameterBuffer paramArg2 = {};
+        iOpenCL::SPatchGlobalMemoryObjectKernelArgument globalMemArg = {};
+
+        if (i > 0) {
+            kernelTokens.tokens.crossThreadPayloadArgs.workDimensions = &workDimensions;
+            kernelTokens.tokens.crossThreadPayloadArgs.maxWorkGroupSize = &maxWorkGroupSize;
+            kernelTokens.tokens.crossThreadPayloadArgs.simdSize = &simdSize;
+            if (i > 1) {
+                kernelTokens.tokens.crossThreadPayloadArgs.parentEvent = &parentEvent;
+                kernelTokens.tokens.crossThreadPayloadArgs.preferredWorkgroupMultiple = &preferredWorkgroupMultiple;
+                kernelTokens.tokens.crossThreadPayloadArgs.privateMemoryStatelessSize = &privateMemoryStatelessSize;
+
+                if (i > 2) {
+                    kernelTokens.tokens.crossThreadPayloadArgs.localMemoryStatelessWindowSize = &localMemoryStatelessWindowSize;
+                    kernelTokens.tokens.crossThreadPayloadArgs.localMemoryStatelessWindowStartAddress = &localMemoryStatelessWindowStartAddress;
+
+                    kernelTokens.tokens.kernelArgs.resize(3);
+
+                    paramArg0.Token = iOpenCL::PATCH_TOKEN_SAMPLER_KERNEL_ARGUMENT;
+                    paramArg0.ArgumentNumber = 0;
+                    paramArg0.Offset = 132;
+                    paramArg0.Type = iOpenCL::SAMPLER_OBJECT_TEXTURE;
+
+                    paramArg1.Token = iOpenCL::PATCH_TOKEN_DATA_PARAMETER_BUFFER;
+                    paramArg1.ArgumentNumber = 1;
+                    paramArg1.Type = iOpenCL::DATA_PARAMETER_KERNEL_ARGUMENT;
+                    paramArg1.Offset = 136;
+                    paramArg1.DataSize = 8;
+                    paramArg1.SourceOffset = 5;
+
+                    paramArg2.Token = iOpenCL::PATCH_TOKEN_DATA_PARAMETER_BUFFER;
+                    paramArg2.ArgumentNumber = 2;
+                    paramArg2.Type = iOpenCL::DATA_PARAMETER_KERNEL_ARGUMENT;
+                    paramArg2.Offset = 140;
+                    paramArg2.DataSize = 8;
+                    paramArg2.SourceOffset = 13;
+
+                    kernelTokens.tokens.kernelArgs[0].objectArg = &paramArg0;
+                    kernelTokens.tokens.kernelArgs[1].byValMap.push_back(&paramArg1); // NOTE(review): kernelTokens outlives this iteration, so byValMap appears to keep pointers pushed in earlier iterations - confirm intended
+                    kernelTokens.tokens.kernelArgs[2].byValMap.push_back(&paramArg2);
+
+                    if (i > 3) {
+                        globalMemArg.Token = iOpenCL::PATCH_TOKEN_GLOBAL_MEMORY_OBJECT_KERNEL_ARGUMENT;
+                        globalMemArg.ArgumentNumber = 3;
+                        globalMemArg.Offset = 176;
+                        globalMemArg.Size = 8;
+
+                        kernelTokens.tokens.kernelArgs.resize(4);
+                        kernelTokens.tokens.kernelArgs[3].objectArg = &globalMemArg;
+                    }
+                }
+            }
+        }
+
+        NEO::populateKernelDescriptor(kernelDescriptor, kernelTokens, 8);
+
+        EXPECT_EQ(alignUp(expectedMaxSizes[i], 32), kernelDescriptor.kernelAttributes.crossThreadDataSize);
+    }
+}
+
+TEST(KernelDescriptorFromPatchtokens, GivenUpdateCrossThreadDataSizeAndNoCrossThreadPayloadWhenPopulatingKernelDescriptorThenCrossThreadDataSizeRemainsZero) {
+    NEO::PatchTokenBinary::KernelFromPatchtokens kernelTokens;
+    iOpenCL::SKernelBinaryHeaderCommon kernelHeader;
+    kernelTokens.header = &kernelHeader;
+
+    DebugManagerStateRestore dbgRestorer;
+    NEO::DebugManager.flags.UpdateCrossThreadDataSize.set(true);
+    NEO::KernelDescriptor kernelDescriptor;
+
+    NEO::populateKernelDescriptor(kernelDescriptor, kernelTokens, 8); // no payload tokens are set on kernelTokens
+
+    EXPECT_EQ(0u, kernelDescriptor.kernelAttributes.crossThreadDataSize);
+}
+
+TEST(KernelDescriptorFromPatchtokens, GivenUpdateCrossThreadDataSizeFalseWhenPopulatingKernelDescriptorThenCrossThreadDataSizeIsNotUpdated) {
+    NEO::PatchTokenBinary::KernelFromPatchtokens kernelTokens;
+    iOpenCL::SKernelBinaryHeaderCommon kernelHeader;
+    kernelTokens.header = &kernelHeader;
+
+    DebugManagerStateRestore dbgRestorer;
+    NEO::DebugManager.flags.UpdateCrossThreadDataSize.set(false);
+
+    NEO::KernelDescriptor kernelDescriptor;
+
+    iOpenCL::SPatchExecutionEnvironment execEnv = {};
+    kernelTokens.tokens.executionEnvironment = &execEnv;
+
+    iOpenCL::SPatchDataParameterStream dataParameterStream = {};
+    dataParameterStream.DataParameterStreamSize = 32;
+    kernelTokens.tokens.dataParameterStream = &dataParameterStream;
+
+    iOpenCL::SPatchAllocateStatelessPrivateSurface privateSurface = {};
+    privateSurface.DataParamOffset = 64; // lies beyond the 32-byte stream size; with the flag off the size must stay 32
+    kernelTokens.tokens.allocateStatelessPrivateSurface = &privateSurface;
+
+    NEO::populateKernelDescriptor(kernelDescriptor, kernelTokens, 8);
+
+    EXPECT_EQ(32u, kernelDescriptor.kernelAttributes.crossThreadDataSize);
+}