diff --git a/level_zero/core/source/kernel/kernel_hw.h b/level_zero/core/source/kernel/kernel_hw.h index 2acd7251d4..690ceedccc 100644 --- a/level_zero/core/source/kernel/kernel_hw.h +++ b/level_zero/core/source/kernel/kernel_hw.h @@ -32,10 +32,9 @@ struct KernelHw : public KernelImp { void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override { uint64_t baseAddress = alloc->getGpuAddressToPatch(); - auto sshAlignmentMask = NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignmentMask(); // Remove misaligned bytes, accounted for in bufferOffset patch token - baseAddress &= sshAlignmentMask; + baseAddress &= this->surfaceStateAlignmentMask; auto misalignedSize = ptrDiff(alloc->getGpuAddressToPatch(), baseAddress); auto offset = ptrDiff(address, reinterpret_cast(baseAddress)); size_t bufferSizeForSsh = alloc->getUnderlyingBufferSize(); @@ -52,7 +51,7 @@ struct KernelHw : public KernelImp { offsetedAddress = baseAddress != reinterpret_cast(address); baseAddress = reinterpret_cast(address); bufferSizeForSsh -= offset; - DEBUG_BREAK_IF(baseAddress != (baseAddress & sshAlignmentMask)); + DEBUG_BREAK_IF(baseAddress != (baseAddress & this->surfaceStateAlignmentMask)); offset = 0; } @@ -76,9 +75,8 @@ struct KernelHw : public KernelImp { } uint64_t bufferAddressForSsh = baseAddress; - auto alignment = NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignment(); bufferSizeForSsh += misalignedSize; - bufferSizeForSsh = alignUp(bufferSizeForSsh, alignment); + bufferSizeForSsh = alignUp(bufferSizeForSsh, this->surfaceStateAlignment); bool l3Enabled = true; // Allocation MUST be cacheline (64 byte) aligned in order to enable L3 caching otherwise Heap corruption will occur coming from the KMD. @@ -114,38 +112,6 @@ struct KernelHw : public KernelImp { UNRECOVERABLE_IF(surfaceStateAddress == nullptr); *reinterpret_cast(surfaceStateAddress) = surfaceState; } - - void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override { - size_t localWorkSizes[3]; - localWorkSizes[0] = this->groupSize[0]; - localWorkSizes[1] = this->groupSize[1]; - localWorkSizes[2] = this->groupSize[2]; - - kernelRequiresGenerationOfLocalIdsByRuntime = NEO::EncodeDispatchKernel::isRuntimeLocalIdsGenerationRequired( - kernelDescriptor.kernelAttributes.numLocalIdChannels, - localWorkSizes, - std::array{ - {kernelDescriptor.kernelAttributes.workgroupWalkOrder[0], - kernelDescriptor.kernelAttributes.workgroupWalkOrder[1], - kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}}, - kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder, - requiredWorkgroupOrder, - kernelDescriptor.kernelAttributes.simdSize); - } - - uint32_t getIndirectSize() const override { - uint32_t totalPayloadSize = getCrossThreadDataSize() + getPerThreadDataSizeForWholeThreadGroup(); - - if (getKernelDescriptor().kernelAttributes.flags.passInlineData) { - if (totalPayloadSize > GfxFamily::DefaultWalkerType::getInlineDataSize()) { - totalPayloadSize -= GfxFamily::DefaultWalkerType::getInlineDataSize(); - } else { - totalPayloadSize = 0; - } - } - - return totalPayloadSize; - } }; } // namespace L0 diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 41bfe82b42..6cd8e015cb 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -1078,6 +1078,9 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { this->rcsAvailable = gfxHelper.isRcsAvailable(hwInfo); this->cooperativeSupport = productHelper.isCooperativeEngineSupported(hwInfo); + this->walkerInlineDataSize = gfxHelper.getDefaultWalkerInlineDataSize(); + this->surfaceStateAlignmentMask = gfxHelper.getSurfaceBaseAddressAlignmentMask(); + this->surfaceStateAlignment = gfxHelper.getSurfaceBaseAddressAlignment(); if (isaAllocation->getAllocationType() == NEO::AllocationType::kernelIsaInternal && this->kernelImmData->getIsaParentAllocation() == nullptr) { isaAllocation->setTbxWritable(true, std::numeric_limits::max()); @@ -1621,4 +1624,39 @@ KernelExt *KernelImp::getExtension(uint32_t extensionType) { return nullptr; } +uint32_t KernelImp::getIndirectSize() const { + uint32_t totalPayloadSize = getCrossThreadDataSize() + getPerThreadDataSizeForWholeThreadGroup(); + + if (getKernelDescriptor().kernelAttributes.flags.passInlineData) { + if (totalPayloadSize > this->walkerInlineDataSize) { + totalPayloadSize -= this->walkerInlineDataSize; + } else { + totalPayloadSize = 0; + } + } + + return totalPayloadSize; +} + +void KernelImp::evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) { + auto &gfxHelper = module->getDevice()->getNEODevice()->getRootDeviceEnvironment().getHelper(); + + size_t localWorkSizes[3]; + localWorkSizes[0] = this->groupSize[0]; + localWorkSizes[1] = this->groupSize[1]; + localWorkSizes[2] = this->groupSize[2]; + + std::array kernelWalkOrder{ + kernelDescriptor.kernelAttributes.workgroupWalkOrder[0], + kernelDescriptor.kernelAttributes.workgroupWalkOrder[1], + kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}; + + kernelRequiresGenerationOfLocalIdsByRuntime = gfxHelper.isRuntimeLocalIdsGenerationRequired(kernelDescriptor.kernelAttributes.numLocalIdChannels, + localWorkSizes, + kernelWalkOrder, + kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder, + requiredWorkgroupOrder, + kernelDescriptor.kernelAttributes.simdSize); +} + } // namespace L0 diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h index 5762ebc6d2..796f01ae6c 100644 --- a/level_zero/core/source/kernel/kernel_imp.h +++ b/level_zero/core/source/kernel/kernel_imp.h @@ -239,6 +239,8 @@ struct KernelImp : Kernel { return kernelArgInfos; } + uint32_t getIndirectSize() const override; + protected: KernelImp() = default; @@ -248,7 +250,7 @@ struct KernelImp : Kernel { void createPrintfBuffer(); void setAssertBuffer(); - virtual void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) = 0; + MOCKABLE_VIRTUAL void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor); void *patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless); uint32_t getSurfaceStateIndexForBindlessOffset(NEO::CrossThreadDataOffset bindlessOffset) const; ze_result_t validateWorkgroupSize() const; @@ -268,6 +270,8 @@ struct KernelImp : Kernel { NEO::GraphicsAllocation *printfBuffer = nullptr; size_t syncBufferIndex = std::numeric_limits::max(); size_t regionGroupBarrierIndex = std::numeric_limits::max(); + uintptr_t surfaceStateAlignmentMask = 0; + uintptr_t surfaceStateAlignment = 0; uint32_t groupSize[3] = {0u, 0u, 0u}; uint32_t numThreadsPerThreadGroup = 1u; @@ -286,6 +290,7 @@ struct KernelImp : Kernel { uint32_t perThreadDataSizeForWholeThreadGroupAllocated = 0; uint32_t perThreadDataSizeForWholeThreadGroup = 0u; uint32_t perThreadDataSize = 0u; + uint32_t walkerInlineDataSize = 0; UnifiedMemoryControls unifiedMemoryControls; std::vector slmArgSizes; diff --git a/level_zero/core/test/unit_tests/fixtures/module_fixture.h b/level_zero/core/test/unit_tests/fixtures/module_fixture.h index 826ac33411..f946526c30 100644 --- a/level_zero/core/test/unit_tests/fixtures/module_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/module_fixture.h @@ -108,9 +108,12 @@ struct ModuleImmutableDataFixture : public DeviceFixture { using KernelImp::requiredWorkgroupOrder; using KernelImp::slmArgOffsetValues; using KernelImp::slmArgSizes; + using KernelImp::surfaceStateAlignment; + using KernelImp::surfaceStateAlignmentMask; using KernelImp::surfaceStateHeapData; using KernelImp::surfaceStateHeapDataSize; using KernelImp::unifiedMemoryControls; + using KernelImp::walkerInlineDataSize; MockKernel(MockModule *mockModule) : WhiteBox(mockModule) { implicitArgsVersion = 0; diff --git a/level_zero/core/test/unit_tests/mocks/mock_kernel.h b/level_zero/core/test/unit_tests/mocks/mock_kernel.h index dc4c10704e..f9170081f2 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_kernel.h +++ b/level_zero/core/test/unit_tests/mocks/mock_kernel.h @@ -74,11 +74,14 @@ struct WhiteBox<::L0::KernelImp> : public ::L0::KernelImp { using ::L0::KernelImp::setAssertBuffer; using ::L0::KernelImp::slmArgsTotalSize; using ::L0::KernelImp::suggestGroupSizeCache; + using ::L0::KernelImp::surfaceStateAlignment; + using ::L0::KernelImp::surfaceStateAlignmentMask; using ::L0::KernelImp::surfaceStateHeapData; using ::L0::KernelImp::surfaceStateHeapDataSize; using ::L0::KernelImp::syncBufferIndex; using ::L0::KernelImp::unifiedMemoryControls; using ::L0::KernelImp::usingSurfaceStateHeap; + using ::L0::KernelImp::walkerInlineDataSize; void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {} diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index a090a4c37f..77800bb99e 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -81,10 +81,13 @@ struct WhiteBoxKernelHw : public KernelHw { using ::L0::KernelImp::perThreadDataSizeForWholeThreadGroup; using ::L0::KernelImp::printfBuffer; using ::L0::KernelImp::requiredWorkgroupOrder; + using ::L0::KernelImp::surfaceStateAlignment; + using ::L0::KernelImp::surfaceStateAlignmentMask; using ::L0::KernelImp::surfaceStateHeapData; using ::L0::KernelImp::surfaceStateHeapDataSize; using ::L0::KernelImp::unifiedMemoryControls; using ::L0::KernelImp::usingSurfaceStateHeap; + using ::L0::KernelImp::walkerInlineDataSize; void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {} @@ -108,6 +111,12 @@ TEST_F(KernelInitTest, givenKernelToInitWhenItHasUnknownArgThenUnknowKernelArgHa kernel->initialize(&desc); EXPECT_EQ(kernel->kernelArgHandlers[0], &KernelImp::setArgUnknown); EXPECT_EQ(mockKernelImmData->getDescriptor().payloadMappings.explicitArgs[0].type, NEO::ArgDescriptor::argTUnknown); + + auto &gfxCoreHelper = device->getGfxCoreHelper(); + + EXPECT_EQ(gfxCoreHelper.getSurfaceBaseAddressAlignment(), kernel->surfaceStateAlignment); + EXPECT_EQ(gfxCoreHelper.getSurfaceBaseAddressAlignmentMask(), kernel->surfaceStateAlignmentMask); + EXPECT_EQ(gfxCoreHelper.getDefaultWalkerInlineDataSize(), kernel->walkerInlineDataSize); } TEST_F(KernelInitTest, givenKernelToInitAndPreemptionEnabledWhenItHasUnknownArgThenUnknowKernelArgHandlerAssigned) { diff --git a/shared/source/helpers/gfx_core_helper.h b/shared/source/helpers/gfx_core_helper.h index 72b4a0efe1..9ce5118b75 100644 --- a/shared/source/helpers/gfx_core_helper.h +++ b/shared/source/helpers/gfx_core_helper.h @@ -216,6 +216,9 @@ class GfxCoreHelper { virtual bool isCacheFlushPriorImageReadRequired() const = 0; virtual uint32_t getQueuePriorityLevels() const = 0; + virtual uint32_t getDefaultWalkerInlineDataSize() const = 0; + virtual uintptr_t getSurfaceBaseAddressAlignmentMask() const = 0; + virtual uintptr_t getSurfaceBaseAddressAlignment() const = 0; virtual ~GfxCoreHelper() = default; @@ -270,6 +273,13 @@ class GfxCoreHelperHw : public GfxCoreHelper { return GfxFamily::template getInitInterfaceDescriptor().KERNELSTARTPOINTER_ALIGN_SIZE; } + uint32_t getDefaultWalkerInlineDataSize() const override { + using DefaultWalkerType = typename GfxFamily::DefaultWalkerType; + return DefaultWalkerType::getInlineDataSize(); + } + uintptr_t getSurfaceBaseAddressAlignmentMask() const override; + uintptr_t getSurfaceBaseAddressAlignment() const override; + uint32_t getComputeUnitsUsedForScratch(const RootDeviceEnvironment &rootDeviceEnvironment) const override; uint32_t getPitchAlignmentForImage(const RootDeviceEnvironment &rootDeviceEnvironment) const override; diff --git a/shared/source/helpers/gfx_core_helper_base.inl b/shared/source/helpers/gfx_core_helper_base.inl index 15945e5fff..57c28fdf64 100644 --- a/shared/source/helpers/gfx_core_helper_base.inl +++ b/shared/source/helpers/gfx_core_helper_base.inl @@ -928,4 +928,14 @@ uint32_t GfxCoreHelperHw::getQueuePriorityLevels() const { return 2; } +template +uintptr_t GfxCoreHelperHw::getSurfaceBaseAddressAlignmentMask() const { + return EncodeSurfaceState::getSurfaceBaseAddressAlignmentMask(); +} + +template +uintptr_t GfxCoreHelperHw::getSurfaceBaseAddressAlignment() const { + return EncodeSurfaceState::getSurfaceBaseAddressAlignment(); +} + } // namespace NEO diff --git a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp index 5edd7a7e50..8ce0df8213 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp @@ -1959,3 +1959,19 @@ TEST_F(GfxCoreHelperTest, whenGetQueuePriorityLevelsQueriedThen2IsReturned) { auto &gfxCoreHelper = getHelper(); EXPECT_EQ(2u, gfxCoreHelper.getQueuePriorityLevels()); } + +HWTEST_F(GfxCoreHelperTest, whenGettingWalkerInlineDataSizeThenCorrectValueReturned) { + using DefaultWalkerType = typename FamilyType::DefaultWalkerType; + auto &gfxCoreHelper = getHelper(); + EXPECT_EQ(DefaultWalkerType::getInlineDataSize(), gfxCoreHelper.getDefaultWalkerInlineDataSize()); +} + +HWTEST_F(GfxCoreHelperTest, whenGettingSurfaceBaseAddressAlignmentMaskThenCorrectValueReturned) { + auto &gfxCoreHelper = getHelper(); + EXPECT_EQ(EncodeSurfaceState::getSurfaceBaseAddressAlignmentMask(), gfxCoreHelper.getSurfaceBaseAddressAlignmentMask()); +} + +HWTEST_F(GfxCoreHelperTest, whenGettingSurfaceBaseAddressAlignmentThenCorrectValueReturned) { + auto &gfxCoreHelper = getHelper(); + EXPECT_EQ(EncodeSurfaceState::getSurfaceBaseAddressAlignment(), gfxCoreHelper.getSurfaceBaseAddressAlignment()); +}