From f1c2ea0b699124447a1ca5e6d5561460e8fb0078 Mon Sep 17 00:00:00 2001 From: John Falkowski Date: Thu, 25 Apr 2024 06:14:39 +0000 Subject: [PATCH] fix: kernel access to multiple stateful virtual regions Related-to: NEO-8350 Signed-off-by: John Falkowski --- level_zero/core/source/kernel/kernel_hw.h | 20 ++++------------ .../unit_tests/sources/module/test_module.cpp | 24 +++++++------------ .../command_container/command_encoder.inl | 4 ++-- shared/source/helpers/constants.h | 1 + 4 files changed, 16 insertions(+), 33 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_hw.h b/level_zero/core/source/kernel/kernel_hw.h index 95de2beb1c..aa60acc892 100644 --- a/level_zero/core/source/kernel/kernel_hw.h +++ b/level_zero/core/source/kernel/kernel_hw.h @@ -39,23 +39,10 @@ struct KernelHw : public KernelImp { auto misalignedSize = ptrDiff(alloc->getGpuAddressToPatch(), baseAddress); auto offset = ptrDiff(address, reinterpret_cast(baseAddress)); size_t bufferSizeForSsh = alloc->getUnderlyingBufferSize(); - // If the allocation is part of a mapped virtual range, then check to see if the buffer size needs to be extended to include more physical buffers. + // If the allocation is part of a mapped virtual range, then set size to maximum to allow for access across multiple virtual ranges. Device *device = module->getDevice(); auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast(alloc->getGpuAddress())); - if (allocData && allocData->virtualReservationData) { - size_t calcBufferSizeForSsh = bufferSizeForSsh; - for (const auto &mappedAllocationData : allocData->virtualReservationData->mappedAllocations) { - // Add additional allocations buffer size to be programmed to allow full usage of the memory range if the allocation is after this starting address. - if (address != mappedAllocationData.second->ptr && mappedAllocationData.second->ptr > address) { - calcBufferSizeForSsh += mappedAllocationData.second->mappedAllocation->allocation->getUnderlyingBufferSize(); - // Only allow for the surface state to be extended up to 4GB in size. - bufferSizeForSsh = std::min(calcBufferSizeForSsh, MemoryConstants::gigaByte * 4); - if (bufferSizeForSsh == MemoryConstants::gigaByte * 4) { - break; - } - } - } - } + auto argInfo = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as(); bool offsetWasPatched = NEO::patchNonPointer(ArrayRef(this->crossThreadData.get(), this->crossThreadDataSize), argInfo.bufferOffset, static_cast(offset)); @@ -111,6 +98,9 @@ struct KernelHw : public KernelImp { NEO::EncodeSurfaceStateArgs args; args.outMemory = &surfaceState; args.graphicsAddress = bufferAddressForSsh; + if (allocData && allocData->virtualReservationData) { + bufferSizeForSsh = MemoryConstants::fullStatefulRegion; + } args.size = bufferSizeForSsh; args.mocs = device->getMOCS(l3Enabled, false); args.numAvailableDevices = neoDevice->getNumGenericSubDevices(); diff --git a/level_zero/core/test/unit_tests/sources/module/test_module.cpp b/level_zero/core/test/unit_tests/sources/module/test_module.cpp index 7b8b053148..59d16a628a 100644 --- a/level_zero/core/test/unit_tests/sources/module/test_module.cpp +++ b/level_zero/core/test/unit_tests/sources/module/test_module.cpp @@ -2510,24 +2510,20 @@ HWTEST_F(MultiDeviceModuleSetArgBufferTest, bool phys1Resident = false; bool phys2Resident = false; - NEO::GraphicsAllocation *baseAlloc = nullptr; - NEO::GraphicsAllocation *offsetAlloc = nullptr; for (auto alloc : kernel->getResidencyContainer()) { if (alloc && alloc->getGpuAddress() == reinterpret_cast(ptr)) { phys1Resident = true; - baseAlloc = alloc; } if (alloc && alloc->getGpuAddress() == reinterpret_cast(offsetAddress)) { phys2Resident = true; - offsetAlloc = alloc; } } auto argInfo = kernel->getImmutableData()->getDescriptor().payloadMappings.explicitArgs[0].as(); auto surfaceStateAddressRaw = ptrOffset(kernel->getSurfaceStateHeapData(), argInfo.bindful); auto surfaceStateAddress = reinterpret_cast(const_cast(surfaceStateAddressRaw)); SurfaceStateBufferLength length = {0}; - length.length = static_cast((baseAlloc->getUnderlyingBufferSize() + offsetAlloc->getUnderlyingBufferSize()) - 1); - EXPECT_EQ(surfaceStateAddress->getWidth(), static_cast(length.surfaceState.width + 1)); + length.length = static_cast((MemoryConstants::fullStatefulRegion)-1); + EXPECT_EQ(surfaceStateAddress->getWidth(), static_cast(length.surfaceState.width)); EXPECT_EQ(surfaceStateAddress->getHeight(), static_cast(length.surfaceState.height + 1)); EXPECT_EQ(surfaceStateAddress->getDepth(), static_cast(length.surfaceState.depth + 1)); EXPECT_TRUE(phys1Resident); @@ -2584,22 +2580,20 @@ HWTEST_F(MultiDeviceModuleSetArgBufferTest, bool phys1Resident = false; bool phys2Resident = false; - NEO::GraphicsAllocation *offsetAlloc = nullptr; for (auto alloc : kernel->getResidencyContainer()) { if (alloc && alloc->getGpuAddress() == reinterpret_cast(ptr)) { phys1Resident = true; } if (alloc && alloc->getGpuAddress() == reinterpret_cast(offsetAddress)) { phys2Resident = true; - offsetAlloc = alloc; } } auto argInfo = kernel->getImmutableData()->getDescriptor().payloadMappings.explicitArgs[0].as(); auto surfaceStateAddressRaw = ptrOffset(kernel->getSurfaceStateHeapData(), argInfo.bindful); auto surfaceStateAddress = reinterpret_cast(const_cast(surfaceStateAddressRaw)); SurfaceStateBufferLength length = {0}; - length.length = static_cast(offsetAlloc->getUnderlyingBufferSize() - 1); - EXPECT_EQ(surfaceStateAddress->getWidth(), static_cast(length.surfaceState.width + 1)); + length.length = static_cast((MemoryConstants::fullStatefulRegion)-1); + EXPECT_EQ(surfaceStateAddress->getWidth(), static_cast(length.surfaceState.width)); EXPECT_EQ(surfaceStateAddress->getHeight(), static_cast(length.surfaceState.height + 1)); EXPECT_EQ(surfaceStateAddress->getDepth(), static_cast(length.surfaceState.depth + 1)); EXPECT_TRUE(phys1Resident); @@ -2649,19 +2643,17 @@ HWTEST_F(MultiDeviceModuleSetArgBufferTest, kernel->setArgBuffer(0, sizeof(ptr), &ptr); bool phys1Resident = false; - NEO::GraphicsAllocation *baseAlloc = nullptr; for (auto alloc : kernel->getResidencyContainer()) { if (alloc && alloc->getGpuAddress() == reinterpret_cast(ptr)) { phys1Resident = true; - baseAlloc = alloc; } } auto argInfo = kernel->getImmutableData()->getDescriptor().payloadMappings.explicitArgs[0].as(); auto surfaceStateAddressRaw = ptrOffset(kernel->getSurfaceStateHeapData(), argInfo.bindful); auto surfaceStateAddress = reinterpret_cast(const_cast(surfaceStateAddressRaw)); SurfaceStateBufferLength length = {0}; - length.length = static_cast(baseAlloc->getUnderlyingBufferSize() - 1); - EXPECT_EQ(surfaceStateAddress->getWidth(), static_cast(length.surfaceState.width + 1)); + length.length = static_cast((MemoryConstants::fullStatefulRegion)-1); + EXPECT_EQ(surfaceStateAddress->getWidth(), static_cast(length.surfaceState.width)); EXPECT_EQ(surfaceStateAddress->getHeight(), static_cast(length.surfaceState.height + 1)); EXPECT_EQ(surfaceStateAddress->getDepth(), static_cast(length.surfaceState.depth + 1)); EXPECT_TRUE(phys1Resident); @@ -2737,8 +2729,8 @@ HWTEST_F(MultiDeviceModuleSetArgBufferTest, auto surfaceStateAddressRaw = ptrOffset(kernel->getSurfaceStateHeapData(), argInfo.bindful); auto surfaceStateAddress = reinterpret_cast(const_cast(surfaceStateAddressRaw)); SurfaceStateBufferLength length = {0}; - length.length = static_cast((MemoryConstants::gigaByte * 4) - 1); - EXPECT_EQ(surfaceStateAddress->getWidth(), static_cast(length.surfaceState.width + 1)); + length.length = static_cast((MemoryConstants::fullStatefulRegion)-1); + EXPECT_EQ(surfaceStateAddress->getWidth(), static_cast(length.surfaceState.width)); EXPECT_EQ(surfaceStateAddress->getHeight(), static_cast(length.surfaceState.height + 1)); EXPECT_EQ(surfaceStateAddress->getDepth(), static_cast(length.surfaceState.depth + 1)); EXPECT_TRUE(phys1Resident); diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 28775c2c7c..cabc58dd8b 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -414,8 +414,8 @@ inline void EncodeStoreMMIO::encode(MI_STORE_REGISTER_MEM *cmdBuffer, ui template void EncodeSurfaceState::encodeBuffer(EncodeSurfaceStateArgs &args) { auto surfaceState = reinterpret_cast(args.outMemory); - auto bufferSize = alignUp(args.size, getSurfaceBaseAddressAlignment()); - + uint64_t bufferSize = alignUp(args.size, getSurfaceBaseAddressAlignment()); + bufferSize = std::min(bufferSize, static_cast(MemoryConstants::fullStatefulRegion - 1)); SurfaceStateBufferLength length = {0}; length.length = static_cast(bufferSize - 1); diff --git a/shared/source/helpers/constants.h b/shared/source/helpers/constants.h index 080a7aa196..fdf5849052 100644 --- a/shared/source/helpers/constants.h +++ b/shared/source/helpers/constants.h @@ -33,6 +33,7 @@ inline constexpr uint64_t kiloByteShiftSize = 10; inline constexpr uint64_t megaByte = 1024 * kiloByte; inline constexpr uint64_t gigaByte = 1024 * megaByte; inline constexpr uint64_t teraByte = 1024 * gigaByte; +inline constexpr uint64_t fullStatefulRegion = 4 * gigaByte; inline constexpr size_t minBufferAlignment = 4; inline constexpr size_t cacheLineSize = 64; inline constexpr size_t pageSize = 4 * kiloByte;