fix: Calculate size of buffer surface state given mapped allocations

Related-To: NEO-8350

- Given a virtual address that is part of a mapping spanning multiple
physical allocations, the buffer surface state size is increased to
include the allocations that follow the current allocation, which
allows users to access the remainder of the mapped buffer.

Signed-off-by: Spruit, Neil R <neil.r.spruit@intel.com>
This commit is contained in:
Spruit, Neil R
2023-10-12 01:44:07 +00:00
committed by Compute-Runtime-Automation
parent ff1a8770fe
commit 58fa968273
7 changed files with 195 additions and 39 deletions

View File

@@ -2231,9 +2231,6 @@ inline AlignedAllocationData CommandListCoreFamily<gfxCoreFamily>::getAlignedAll
// Add additional allocations to the residency container if the virtual reservation spans multiple allocations.
if (buffer != mappedAllocationData.second->ptr) {
commandContainer.addToResidencyContainer(mappedAllocationData.second->mappedAllocation->allocation);
} else if (mappedAllocationData.second->mappedAllocation->allocation->getUnderlyingBufferSize() < allocData->virtualReservationData->virtualAddressRange.size) {
// If the target buffer is the same as the virtual reservation, but the allocation is less than the full reserved size, then extend the size to the full reserved size.
mappedAllocationData.second->mappedAllocation->allocation->setExtendedSize(allocData->virtualReservationData->virtualAddressRange.size);
}
}
}

View File

@@ -12,6 +12,7 @@
#include "shared/source/device/device.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/cache_policy.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "level_zero/core/source/device/device.h"
@@ -38,10 +39,22 @@ struct KernelHw : public KernelImp {
auto misalignedSize = ptrDiff(alloc->getGpuAddressToPatch(), baseAddress);
auto offset = ptrDiff(address, reinterpret_cast<void *>(baseAddress));
size_t bufferSizeForSsh = alloc->getUnderlyingBufferSize();
// If the allocation has been set with an extended size to span other resident allocations, then program that size for the surface state & reset the extended size to 0 in the allocation structure.
if (alloc->getExtendedBufferSize() != 0) {
bufferSizeForSsh = alloc->getExtendedBufferSize();
alloc->setExtendedSize(0);
// If the allocation is part of a mapped virtual range, then check to see if the buffer size needs to be extended to include more physical buffers.
Device *device = module->getDevice();
auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
if (allocData && allocData->virtualReservationData) {
size_t calcBufferSizeForSsh = bufferSizeForSsh;
for (const auto &mappedAllocationData : allocData->virtualReservationData->mappedAllocations) {
// Add additional allocations buffer size to be programmed to allow full usage of the memory range if the allocation is after this starting address.
if (address != mappedAllocationData.second->ptr && mappedAllocationData.second->ptr > address) {
calcBufferSizeForSsh += mappedAllocationData.second->mappedAllocation->allocation->getUnderlyingBufferSize();
// Only allow for the surface state to be extended up to 4GB in size.
bufferSizeForSsh = std::min(calcBufferSizeForSsh, MemoryConstants::gigaByte * 4);
if (bufferSizeForSsh == MemoryConstants::gigaByte * 4) {
break;
}
}
}
}
auto argInfo = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescPointer>();
bool offsetWasPatched = NEO::patchNonPointer<uint32_t, uint32_t>(ArrayRef<uint8_t>(this->crossThreadData.get(), this->crossThreadDataSize),
@@ -85,10 +98,8 @@ struct KernelHw : public KernelImp {
// Most commonly this issue will occur with Host Point Allocations from customers.
l3Enabled = isL3Capable(*alloc);
Device *device = module->getDevice();
NEO::Device *neoDevice = device->getNEODevice();
auto allocData = device->getDriverHandle()->getSvmAllocsManager()->getSVMAlloc(reinterpret_cast<void *>(alloc->getGpuAddress()));
if (allocData && allocData->allocationFlagsProperty.flags.locallyUncachedResource) {
l3Enabled = false;
}

View File

@@ -741,9 +741,6 @@ ze_result_t KernelImp::setArgBuffer(uint32_t argIndex, size_t argSize, const voi
// Add additional allocations to the residency container if the virtual reservation spans multiple allocations.
if (requestedAddress != mappedAllocationData.second->ptr) {
this->residencyContainer.push_back(mappedAllocationData.second->mappedAllocation->allocation);
} else if (mappedAllocationData.second->mappedAllocation->allocation->getUnderlyingBufferSize() < allocData->virtualReservationData->virtualAddressRange.size) {
// If the target buffer is the same as the virtual reservation, but the allocation is less than the full reserved size, then extend the size to the full reserved size.
mappedAllocationData.second->mappedAllocation->allocation->setExtendedSize(allocData->virtualReservationData->virtualAddressRange.size);
}
}
}

View File

@@ -858,8 +858,6 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryCopyWithReservedDevi
phys2Resident = true;
}
}
NEO::GraphicsAllocation *baseAlloc = reinterpret_cast<NEO::GraphicsAllocation *>(phPhysicalMemory);
EXPECT_EQ(reservationSize, baseAlloc->getExtendedBufferSize());
EXPECT_TRUE(phys2Resident);
res = context->unMapVirtualMem(dstBuffer, size);
@@ -899,9 +897,6 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryCopyWithOneReservedD
commandList->appendMemoryCopy(dstBuffer, srcPtr, reservationSize, nullptr, 0, nullptr, false, false);
NEO::GraphicsAllocation *baseAlloc = reinterpret_cast<NEO::GraphicsAllocation *>(phPhysicalMemory);
EXPECT_EQ(0u, baseAlloc->getExtendedBufferSize());
res = context->unMapVirtualMem(dstBuffer, reservationSize);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->freeVirtualMem(dstBuffer, reservationSize);

View File

@@ -2409,7 +2409,7 @@ class MultiDeviceModuleSetArgBufferTest : public MultiDeviceModuleFixture, publi
HWTEST_F(MultiDeviceModuleSetArgBufferTest,
givenCallsToSetArgBufferWithReservedMemoryThenResidencyContainerHasAllMappedAllocations) {
using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
for (uint32_t rootDeviceIndex = 0; rootDeviceIndex < numRootDevices; rootDeviceIndex++) {
createModuleFromMockBinary(rootDeviceIndex);
auto device = driverHandle->devices[rootDeviceIndex];
@@ -2446,6 +2446,7 @@ HWTEST_F(MultiDeviceModuleSetArgBufferTest,
bool phys1Resident = false;
bool phys2Resident = false;
NEO::GraphicsAllocation *baseAlloc = nullptr;
NEO::GraphicsAllocation *offsetAlloc = nullptr;
for (auto alloc : kernel->getResidencyContainer()) {
if (alloc && alloc->getGpuAddress() == reinterpret_cast<uint64_t>(ptr)) {
phys1Resident = true;
@@ -2453,13 +2454,21 @@ HWTEST_F(MultiDeviceModuleSetArgBufferTest,
}
if (alloc && alloc->getGpuAddress() == reinterpret_cast<uint64_t>(offsetAddress)) {
phys2Resident = true;
offsetAlloc = alloc;
}
}
auto argInfo = kernel->getImmutableData()->getDescriptor().payloadMappings.explicitArgs[0].as<NEO::ArgDescPointer>();
auto surfaceStateAddressRaw = ptrOffset(kernel->getSurfaceStateHeapData(), argInfo.bindful);
auto surfaceStateAddress = reinterpret_cast<RENDER_SURFACE_STATE *>(const_cast<unsigned char *>(surfaceStateAddressRaw));
SURFACE_STATE_BUFFER_LENGTH length = {0};
length.length = static_cast<uint32_t>((baseAlloc->getUnderlyingBufferSize() + offsetAlloc->getUnderlyingBufferSize()) - 1);
EXPECT_EQ(surfaceStateAddress->getWidth(), static_cast<uint32_t>(length.surfaceState.width + 1));
EXPECT_EQ(surfaceStateAddress->getHeight(), static_cast<uint32_t>(length.surfaceState.height + 1));
EXPECT_EQ(surfaceStateAddress->getDepth(), static_cast<uint32_t>(length.surfaceState.depth + 1));
EXPECT_TRUE(phys1Resident);
EXPECT_TRUE(phys2Resident);
res = context->unMapVirtualMem(ptr, size);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
EXPECT_EQ(0u, baseAlloc->getExtendedBufferSize());
res = context->unMapVirtualMem(offsetAddress, size);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->freeVirtualMem(ptr, reservationSize);
@@ -2473,8 +2482,80 @@ HWTEST_F(MultiDeviceModuleSetArgBufferTest,
}
HWTEST_F(MultiDeviceModuleSetArgBufferTest,
givenCallsToSetArgBufferWithReservedMemoryWithMappingToFullReservedSizeThenExtendedBufferSizeIsZero) {
givenCallsToSetArgBufferWithOffsetReservedMemoryThenResidencyContainerHasAllMappedAllocations) {
using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
for (uint32_t rootDeviceIndex = 0; rootDeviceIndex < numRootDevices; rootDeviceIndex++) {
createModuleFromMockBinary(rootDeviceIndex);
auto device = driverHandle->devices[rootDeviceIndex];
driverHandle->devices[rootDeviceIndex]->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[rootDeviceIndex]->memoryOperationsInterface =
std::make_unique<NEO::MockMemoryOperations>();
ze_kernel_handle_t kernelHandle;
void *ptr = nullptr;
size_t size = MemoryConstants::pageSize64k;
size_t reservationSize = size * 2;
ze_kernel_desc_t kernelDesc = {};
kernelDesc.pKernelName = kernelName.c_str();
ze_result_t res = modules[rootDeviceIndex]->createKernel(&kernelDesc, &kernelHandle);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->reserveVirtualMem(nullptr, reservationSize, &ptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
ze_physical_mem_desc_t desc = {};
desc.size = size;
ze_physical_mem_handle_t phPhysicalMemory;
res = context->createPhysicalMem(device->toHandle(), &desc, &phPhysicalMemory);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
ze_physical_mem_handle_t phPhysicalMemory2;
res = context->createPhysicalMem(device->toHandle(), &desc, &phPhysicalMemory2);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->mapVirtualMem(ptr, size, phPhysicalMemory, 0, ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
void *offsetAddress = reinterpret_cast<void *>(reinterpret_cast<uint64_t>(ptr) + size);
res = context->mapVirtualMem(offsetAddress, size, phPhysicalMemory2, 0, ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
L0::KernelImp *kernel = reinterpret_cast<L0::KernelImp *>(Kernel::fromHandle(kernelHandle));
kernel->setArgBuffer(0, sizeof(offsetAddress), &offsetAddress);
bool phys1Resident = false;
bool phys2Resident = false;
NEO::GraphicsAllocation *offsetAlloc = nullptr;
for (auto alloc : kernel->getResidencyContainer()) {
if (alloc && alloc->getGpuAddress() == reinterpret_cast<uint64_t>(ptr)) {
phys1Resident = true;
}
if (alloc && alloc->getGpuAddress() == reinterpret_cast<uint64_t>(offsetAddress)) {
phys2Resident = true;
offsetAlloc = alloc;
}
}
auto argInfo = kernel->getImmutableData()->getDescriptor().payloadMappings.explicitArgs[0].as<NEO::ArgDescPointer>();
auto surfaceStateAddressRaw = ptrOffset(kernel->getSurfaceStateHeapData(), argInfo.bindful);
auto surfaceStateAddress = reinterpret_cast<RENDER_SURFACE_STATE *>(const_cast<unsigned char *>(surfaceStateAddressRaw));
SURFACE_STATE_BUFFER_LENGTH length = {0};
length.length = static_cast<uint32_t>(offsetAlloc->getUnderlyingBufferSize() - 1);
EXPECT_EQ(surfaceStateAddress->getWidth(), static_cast<uint32_t>(length.surfaceState.width + 1));
EXPECT_EQ(surfaceStateAddress->getHeight(), static_cast<uint32_t>(length.surfaceState.height + 1));
EXPECT_EQ(surfaceStateAddress->getDepth(), static_cast<uint32_t>(length.surfaceState.depth + 1));
EXPECT_TRUE(phys1Resident);
EXPECT_TRUE(phys2Resident);
res = context->unMapVirtualMem(ptr, size);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->unMapVirtualMem(offsetAddress, size);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->freeVirtualMem(ptr, reservationSize);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->destroyPhysicalMem(phPhysicalMemory);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->destroyPhysicalMem(phPhysicalMemory2);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
Kernel::fromHandle(kernelHandle)->destroy();
}
}
HWTEST_F(MultiDeviceModuleSetArgBufferTest,
givenCallsToSetArgBufferWithReservedMemoryWithMappingToFullReservedSizeThenSurfaceStateSizeisUnchanged) {
using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
for (uint32_t rootDeviceIndex = 0; rootDeviceIndex < numRootDevices; rootDeviceIndex++) {
createModuleFromMockBinary(rootDeviceIndex);
auto device = driverHandle->devices[rootDeviceIndex];
@@ -2510,8 +2591,15 @@ HWTEST_F(MultiDeviceModuleSetArgBufferTest,
baseAlloc = alloc;
}
}
auto argInfo = kernel->getImmutableData()->getDescriptor().payloadMappings.explicitArgs[0].as<NEO::ArgDescPointer>();
auto surfaceStateAddressRaw = ptrOffset(kernel->getSurfaceStateHeapData(), argInfo.bindful);
auto surfaceStateAddress = reinterpret_cast<RENDER_SURFACE_STATE *>(const_cast<unsigned char *>(surfaceStateAddressRaw));
SURFACE_STATE_BUFFER_LENGTH length = {0};
length.length = static_cast<uint32_t>(baseAlloc->getUnderlyingBufferSize() - 1);
EXPECT_EQ(surfaceStateAddress->getWidth(), static_cast<uint32_t>(length.surfaceState.width + 1));
EXPECT_EQ(surfaceStateAddress->getHeight(), static_cast<uint32_t>(length.surfaceState.height + 1));
EXPECT_EQ(surfaceStateAddress->getDepth(), static_cast<uint32_t>(length.surfaceState.depth + 1));
EXPECT_TRUE(phys1Resident);
EXPECT_EQ(0u, baseAlloc->getExtendedBufferSize());
res = context->unMapVirtualMem(ptr, reservationSize);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->freeVirtualMem(ptr, reservationSize);
@@ -2522,6 +2610,92 @@ HWTEST_F(MultiDeviceModuleSetArgBufferTest,
}
}
// Verifies that when a virtual reservation spans mapped physical allocations whose
// combined size exceeds 4GB, the buffer surface state length programmed for the
// kernel argument is clamped to exactly 4GB (the maximum the surface state supports)
// rather than the full span of the mapped allocations.
HWTEST_F(MultiDeviceModuleSetArgBufferTest,
givenCallsToSetArgBufferWithReservedMemoryWithMappingLargerThan4GBThenSurfaceStateSizeProgrammedDoesNotExceed4GB) {
using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
// NOTE(review): numRootDevices, modules, driverHandle, context and kernelName come
// from the MultiDeviceModuleSetArgBufferTest fixture (not visible here) — the test
// repeats the scenario once per root device.
for (uint32_t rootDeviceIndex = 0; rootDeviceIndex < numRootDevices; rootDeviceIndex++) {
createModuleFromMockBinary(rootDeviceIndex);
auto device = driverHandle->devices[rootDeviceIndex];
// Mock memory operations so residency calls succeed without a real driver backend.
driverHandle->devices[rootDeviceIndex]->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[rootDeviceIndex]->memoryOperationsInterface =
std::make_unique<NEO::MockMemoryOperations>();
ze_kernel_handle_t kernelHandle;
void *ptr = nullptr;
size_t size = MemoryConstants::pageSize64k;
// Reserve a virtual range large enough for four 64k pages; three will be mapped.
size_t reservationSize = size * 4;
ze_kernel_desc_t kernelDesc = {};
kernelDesc.pKernelName = kernelName.c_str();
ze_result_t res = modules[rootDeviceIndex]->createKernel(&kernelDesc, &kernelHandle);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->reserveVirtualMem(nullptr, reservationSize, &ptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
ze_physical_mem_desc_t desc = {};
desc.size = size;
// Create three separate physical allocations to map at consecutive offsets
// within the reservation.
ze_physical_mem_handle_t phPhysicalMemory;
res = context->createPhysicalMem(device->toHandle(), &desc, &phPhysicalMemory);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
ze_physical_mem_handle_t phPhysicalMemory2;
res = context->createPhysicalMem(device->toHandle(), &desc, &phPhysicalMemory2);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
ze_physical_mem_handle_t phPhysicalMemory3;
res = context->createPhysicalMem(device->toHandle(), &desc, &phPhysicalMemory3);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->mapVirtualMem(ptr, size, phPhysicalMemory, 0, ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
void *offsetAddress = reinterpret_cast<void *>(reinterpret_cast<uint64_t>(ptr) + size);
res = context->mapVirtualMem(offsetAddress, size, phPhysicalMemory2, 0, ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
void *offsetAddress2 = reinterpret_cast<void *>(reinterpret_cast<uint64_t>(ptr) + (size * 2));
res = context->mapVirtualMem(offsetAddress2, size, phPhysicalMemory3, 0, ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
auto svmAllocsManager = device->getDriverHandle()->getSvmAllocsManager();
auto virtualAlloc = svmAllocsManager->getSVMAlloc(ptr);
// Inflate the second mapped allocation's reported size to (4GB - 64k) so that
// base (64k) + second allocation reaches exactly 4GB, and the third mapped
// allocation would push the accumulated size past the 4GB surface-state cap.
virtualAlloc->virtualReservationData->mappedAllocations.at(offsetAddress)->mappedAllocation->allocation->setSize((MemoryConstants::gigaByte * 4) - MemoryConstants::pageSize64k);
L0::KernelImp *kernel = reinterpret_cast<L0::KernelImp *>(Kernel::fromHandle(kernelHandle));
kernel->setArgBuffer(0, sizeof(ptr), &ptr);
// Restore the real size so unmap/teardown below operates on accurate bookkeeping.
virtualAlloc->virtualReservationData->mappedAllocations.at(offsetAddress)->mappedAllocation->allocation->setSize(size);
bool phys1Resident = false;
bool phys2Resident = false;
// Both the base allocation and the following mapped allocation must have been
// added to the kernel's residency container by setArgBuffer.
for (auto alloc : kernel->getResidencyContainer()) {
if (alloc && alloc->getGpuAddress() == reinterpret_cast<uint64_t>(ptr)) {
phys1Resident = true;
}
if (alloc && alloc->getGpuAddress() == reinterpret_cast<uint64_t>(offsetAddress)) {
phys2Resident = true;
}
}
// Decode the RENDER_SURFACE_STATE written for argument 0 and check the encoded
// buffer length (split across width/height/depth fields) equals exactly 4GB.
auto argInfo = kernel->getImmutableData()->getDescriptor().payloadMappings.explicitArgs[0].as<NEO::ArgDescPointer>();
auto surfaceStateAddressRaw = ptrOffset(kernel->getSurfaceStateHeapData(), argInfo.bindful);
auto surfaceStateAddress = reinterpret_cast<RENDER_SURFACE_STATE *>(const_cast<unsigned char *>(surfaceStateAddressRaw));
SURFACE_STATE_BUFFER_LENGTH length = {0};
// Surface state length is programmed as (size - 1); the cap is 4GB.
length.length = static_cast<uint32_t>((MemoryConstants::gigaByte * 4) - 1);
EXPECT_EQ(surfaceStateAddress->getWidth(), static_cast<uint32_t>(length.surfaceState.width + 1));
EXPECT_EQ(surfaceStateAddress->getHeight(), static_cast<uint32_t>(length.surfaceState.height + 1));
EXPECT_EQ(surfaceStateAddress->getDepth(), static_cast<uint32_t>(length.surfaceState.depth + 1));
EXPECT_TRUE(phys1Resident);
EXPECT_TRUE(phys2Resident);
// Teardown: unmap all three ranges, free the reservation, destroy the physical
// allocations, and destroy the kernel.
res = context->unMapVirtualMem(ptr, size);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->unMapVirtualMem(offsetAddress, size);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->unMapVirtualMem(offsetAddress2, size);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->freeVirtualMem(ptr, reservationSize);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->destroyPhysicalMem(phPhysicalMemory);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->destroyPhysicalMem(phPhysicalMemory2);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
res = context->destroyPhysicalMem(phPhysicalMemory3);
EXPECT_EQ(ZE_RESULT_SUCCESS, res);
Kernel::fromHandle(kernelHandle)->destroy();
}
}
HWTEST_F(MultiDeviceModuleSetArgBufferTest,
givenCallsToSetArgBufferThenAllocationIsSetForCorrectDevice) {