From 689ceacfe6376bed5d65f457098cda2d1a8a2c5f Mon Sep 17 00:00:00 2001 From: Maciej Plewka Date: Tue, 1 Jun 2021 13:19:02 +0000 Subject: [PATCH] Fix set allocation address in SS when offset is patched Signed-off-by: Maciej Plewka --- level_zero/core/source/kernel/kernel_hw.h | 6 +- .../sources/cmdlist/test_cmdlist_2.cpp | 20 ++++- .../sources/cmdlist/test_cmdlist_3.cpp | 4 +- .../unit_tests/sources/module/test_module.cpp | 83 +++++++++++++++++++ .../command_container/command_encoder.h | 4 +- .../command_container/command_encoder.inl | 2 +- 6 files changed, 105 insertions(+), 14 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_hw.h b/level_zero/core/source/kernel/kernel_hw.h index a48209661a..0f594739f4 100644 --- a/level_zero/core/source/kernel/kernel_hw.h +++ b/level_zero/core/source/kernel/kernel_hw.h @@ -29,14 +29,13 @@ struct KernelHw : public KernelImp { using GfxFamily = typename NEO::GfxFamilyMapper::GfxFamily; void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override { - uint64_t baseAddress = castToUint64(address); + uint64_t baseAddress = alloc->getGpuAddressToPatch(); auto sshAlignmentMask = NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignmentMask(); // Remove misalligned bytes, accounted for in in bufferOffset patch token baseAddress &= sshAlignmentMask; auto offset = ptrDiff(address, reinterpret_cast(baseAddress)); - size_t sizeTillEndOfSurface = alloc->getUnderlyingBufferSize() - offset; auto argInfo = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as(); bool offsetWasPatched = NEO::patchNonPointer(ArrayRef(this->crossThreadData.get(), this->crossThreadDataSize), argInfo.bufferOffset, static_cast(offset)); @@ -54,8 +53,7 @@ struct KernelHw : public KernelImp { } uint64_t bufferAddressForSsh = baseAddress; auto alignment = NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignment(); - size_t bufferSizeForSsh = ptrDiff(alloc->getGpuAddress(), bufferAddressForSsh); - 
bufferSizeForSsh += sizeTillEndOfSurface; // take address alignment offset into account + size_t bufferSizeForSsh = alloc->getUnderlyingBufferSize(); bufferSizeForSsh = alignUp(bufferSizeForSsh, alignment); bool l3Enabled = true; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index 7ab04939d3..1753397896 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -256,10 +256,10 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenMemoryCo EXPECT_TRUE(cmdList.dstAlignedPtr == (cmdList.dstAlignedPtr & sshAlignmentMask)); } -HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemoryCopyRegion2DCalledThenSrcDstPointersArePageAligned, Platforms) { +HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemoryCopyRegion2DCalledThenSrcDstNotZeroOffsetsArePassed, Platforms) { MockAppendMemoryCopy cmdList; cmdList.initialize(device, NEO::EngineGroupType::Copy); - void *srcPtr = reinterpret_cast(0x1234); + void *srcPtr = reinterpret_cast(0x1233); void *dstPtr = reinterpret_cast(0x2345); ze_copy_region_t dstRegion = {4, 4, 0, 2, 2, 0}; ze_copy_region_t srcRegion = {4, 4, 0, 2, 2, 0}; @@ -268,10 +268,10 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemo EXPECT_GT(cmdList.dstBlitCopyRegionOffset, 0u); } -HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemoryCopyRegion3DCalledThenSrcDstPointersArePageAligned, Platforms) { +HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemoryCopyRegion3DCalledThenSrcDstNotZeroOffsetsArePassed, Platforms) { MockAppendMemoryCopy cmdList; cmdList.initialize(device, NEO::EngineGroupType::Copy); - void *srcPtr = reinterpret_cast(0x1234); + void *srcPtr = reinterpret_cast(0x1233); void *dstPtr = 
reinterpret_cast(0x2345); ze_copy_region_t dstRegion = {4, 4, 4, 2, 2, 2}; ze_copy_region_t srcRegion = {4, 4, 4, 2, 2, 2}; @@ -280,6 +280,18 @@ HWTEST2_F(AppendMemoryCopy, givenCommandListAndUnalignedHostPointersWhenBlitMemo EXPECT_GT(cmdList.dstBlitCopyRegionOffset, 0u); } +HWTEST2_F(AppendMemoryCopy, givenCommandListAndAlignedHostPointersWhenBlitMemoryCopyRegion3DCalledThenSrcDstZeroOffsetsArePassed, Platforms) { + MockAppendMemoryCopy cmdList; + cmdList.initialize(device, NEO::EngineGroupType::Copy); + void *srcPtr = alignDown(reinterpret_cast(0x1233), NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignment()); + void *dstPtr = alignDown(reinterpret_cast(0x2345), NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignment()); + ze_copy_region_t dstRegion = {4, 4, 4, 2, 2, 2}; + ze_copy_region_t srcRegion = {4, 4, 4, 2, 2, 2}; + cmdList.appendMemoryCopyRegion(dstPtr, &dstRegion, 0, 0, srcPtr, &srcRegion, 0, 0, nullptr, 0, nullptr); + EXPECT_EQ(cmdList.srcBlitCopyRegionOffset, 0u); + EXPECT_EQ(cmdList.dstBlitCopyRegionOffset, 0u); +} + HWTEST2_F(AppendMemoryCopy, givenCommandListAndHostPointersWhenMemoryCopyRegionCalledThenPipeControlWithDcFlushAdded, Platforms) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp index 4cfbb71e92..8189a3ebe9 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_3.cpp @@ -221,7 +221,7 @@ HWTEST2_F(CommandListCreate, EXPECT_EQ(expectedGpuAddress, outData.alignedAllocationPtr); EXPECT_EQ(expectedOffset, outData.offset); - size_t offset = 0x20u; + size_t offset = 0x21u; void *offsetMemory = ptrOffset(startMemory, offset); expectedOffset = ptrDiff(offsetMemory, baseAddress); EXPECT_EQ(outData.offset + offset, expectedOffset); @@ -231,7 +231,7 @@ HWTEST2_F(CommandListCreate, EXPECT_EQ(firstAlloc, 
outData.alloc); EXPECT_EQ(startMemory, outData.alloc->getUnderlyingBuffer()); EXPECT_EQ(expectedGpuAddress, outData.alignedAllocationPtr); - EXPECT_EQ(expectedOffset, outData.offset); + EXPECT_EQ((expectedOffset & (EncodeSurfaceState::getSurfaceBaseAddressAlignment() - 1)), outData.offset); commandList->removeHostPtrAllocations(); device->getNEODevice()->getMemoryManager()->freeSystemMemory(cmdListHostBuffer); diff --git a/level_zero/core/test/unit_tests/sources/module/test_module.cpp b/level_zero/core/test/unit_tests/sources/module/test_module.cpp index 70d4d87496..2ca53e2a4d 100644 --- a/level_zero/core/test/unit_tests/sources/module/test_module.cpp +++ b/level_zero/core/test/unit_tests/sources/module/test_module.cpp @@ -139,6 +139,89 @@ HWTEST2_F(ModuleTest, givenNonPatchedTokenThenSurfaceBaseAddressIsCorrectlySet, context->freeMem(devicePtr); } +HWTEST_F(ModuleTest, givenStatefulBufferWhenOffsetIsPatchedThenAllocBaseAddressIsSet) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + ze_kernel_handle_t kernelHandle; + + ze_kernel_desc_t kernelDesc = {}; + kernelDesc.pKernelName = kernelName.c_str(); + + ze_result_t res = module->createKernel(&kernelDesc, &kernelHandle); + + EXPECT_EQ(ZE_RESULT_SUCCESS, res); + + auto kernelImp = reinterpret_cast(L0::Kernel::fromHandle(kernelHandle)); + + void *devicePtr = nullptr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + res = context->allocDeviceMem(device->toHandle(), + &deviceDesc, + 16384u, + 0u, + &devicePtr); + ASSERT_EQ(ZE_RESULT_SUCCESS, res); + + auto gpuAlloc = device->getDriverHandle()->getSvmAllocsManager()->getSVMAllocs()->get(devicePtr)->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex()); + ASSERT_NE(nullptr, gpuAlloc); + + uint32_t argIndex = 0u; + uint32_t offset = 0x1234; + const_cast(&(kernelImp->getImmutableData()->getDescriptor()))->payloadMappings.explicitArgs[argIndex].as().bufferOffset = 0; + 
const_cast(&(kernelImp->getImmutableData()->getDescriptor()))->payloadMappings.explicitArgs[argIndex].as().bindful = 0x80; + kernelImp->setBufferSurfaceState(argIndex, ptrOffset(devicePtr, offset), gpuAlloc); + + auto argInfo = kernelImp->getImmutableData()->getDescriptor().payloadMappings.explicitArgs[argIndex].as(); + auto surfaceStateAddressRaw = ptrOffset(kernelImp->getSurfaceStateHeapData(), argInfo.bindful); + auto surfaceStateAddress = reinterpret_cast(const_cast(surfaceStateAddressRaw)); + EXPECT_EQ(devicePtr, reinterpret_cast(surfaceStateAddress->getSurfaceBaseAddress())); + + Kernel::fromHandle(kernelHandle)->destroy(); + + context->freeMem(devicePtr); +} + +HWTEST_F(ModuleTest, givenBufferWhenOffsetIsNotPatchedThenPassedPtrIsSetAsBaseAddress) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + ze_kernel_handle_t kernelHandle; + + ze_kernel_desc_t kernelDesc = {}; + kernelDesc.pKernelName = kernelName.c_str(); + + ze_result_t res = module->createKernel(&kernelDesc, &kernelHandle); + + EXPECT_EQ(ZE_RESULT_SUCCESS, res); + + auto kernelImp = reinterpret_cast(L0::Kernel::fromHandle(kernelHandle)); + + void *devicePtr = nullptr; + ze_device_mem_alloc_desc_t deviceDesc = {}; + res = context->allocDeviceMem(device->toHandle(), + &deviceDesc, + 16384u, + 0u, + &devicePtr); + ASSERT_EQ(ZE_RESULT_SUCCESS, res); + + auto gpuAlloc = device->getDriverHandle()->getSvmAllocsManager()->getSVMAllocs()->get(devicePtr)->gpuAllocations.getGraphicsAllocation(device->getRootDeviceIndex()); + ASSERT_NE(nullptr, gpuAlloc); + + uint32_t argIndex = 0u; + uint32_t offset = 0x1234; + const_cast(&(kernelImp->getImmutableData()->getDescriptor()))->payloadMappings.explicitArgs[argIndex].as().bufferOffset = undefined; + const_cast(&(kernelImp->getImmutableData()->getDescriptor()))->payloadMappings.explicitArgs[argIndex].as().bindful = 0x80; + + kernelImp->setBufferSurfaceState(argIndex, ptrOffset(devicePtr, offset), gpuAlloc); + + auto argInfo = 
kernelImp->getImmutableData()->getDescriptor().payloadMappings.explicitArgs[argIndex].as(); + auto surfaceStateAddressRaw = ptrOffset(kernelImp->getSurfaceStateHeapData(), argInfo.bindful); + auto surfaceStateAddress = reinterpret_cast(const_cast(surfaceStateAddressRaw)); + EXPECT_EQ(ptrOffset(devicePtr, offset), reinterpret_cast(surfaceStateAddress->getSurfaceBaseAddress())); + + Kernel::fromHandle(kernelHandle)->destroy(); + + context->freeMem(devicePtr); +} + using ModuleUncachedBufferTest = Test; struct KernelImpUncachedTest : public KernelImp { diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index ba2982623c..b8c704fd5c 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -247,9 +247,7 @@ struct EncodeSurfaceState { return ~(getSurfaceBaseAddressAlignment() - 1); } - static constexpr uintptr_t getSurfaceBaseAddressMinimumAlignment() { return 4; } - - static constexpr uintptr_t getSurfaceBaseAddressAlignment() { return MemoryConstants::pageSize; } + static constexpr uintptr_t getSurfaceBaseAddressAlignment() { return 4; } static void getSshAlignedPointer(uintptr_t &ptr, size_t &offset); static bool doBindingTablePrefetch(); diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index e688a65c7d..bc91222fdc 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -320,7 +320,7 @@ void EncodeSurfaceState::encodeBuffer(void *dst, uint64_t address, size_ bool cpuCoherent, bool forceNonAuxMode, bool isReadOnly, uint32_t numAvailableDevices, GraphicsAllocation *allocation, GmmHelper *gmmHelper, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) { auto surfaceState = reinterpret_cast(dst); - UNRECOVERABLE_IF(!isAligned(size)); + UNRECOVERABLE_IF(!isAligned(size)); SURFACE_STATE_BUFFER_LENGTH 
Length = {0}; Length.Length = static_cast(size - 1);