From 3e5f3fe055ab3d17b4f54ff9d3bbf491b96c2464 Mon Sep 17 00:00:00 2001 From: "Spruit, Neil R" Date: Thu, 29 Oct 2020 17:57:14 +0000 Subject: [PATCH] Fixed AppendMemoryFill for alignment, multi kernel execution, memory overrun - Fixed Memory Fill to use aligned allocations in Destination and Source buffer kernel arguments to ensure proper alignment of memory in the surface state. - Fixed patternSize == 1 case where simd size is > the size being written resulting in the inital kernel append with group count of 0 and the remainder kernel being executed incorrectly. - Fixed offset calculation in the remainder kernel to be based off of the aligned allocation offset. - Fixed memory overrun case where size < patternsize with the minimum size being used in the kernel. Change-Id: I39bd83b3a83ceb6df05d374238f85f7fdf0bd09a Signed-off-by: Spruit, Neil R --- level_zero/core/source/cmdlist/cmdlist_hw.inl | 24 +++---- .../sources/cmdlist/test_cmdlist_blit.cpp | 71 +++++++++++++++++++ 2 files changed, 83 insertions(+), 12 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 19fd8f0ec0..3db0869418 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -1110,8 +1110,8 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, } uintptr_t dstPtr = reinterpret_cast(ptr); - size_t dstOffset = 0; - NEO::EncodeSurfaceState::getSshAlignedPointer(dstPtr, dstOffset); + + auto dstAllocation = this->getAlignedAllocation(this->device, reinterpret_cast(dstPtr), size); uintptr_t srcPtr = reinterpret_cast(const_cast(pattern)); size_t srcOffset = 0; @@ -1126,14 +1126,17 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediate); groupSizeX = builtinFunction->getImmutableData()->getDescriptor().kernelAttributes.simdSize; + if (groupSizeX > static_cast(size)) { + groupSizeX = static_cast(size); + } if (builtinFunction->setGroupSize(groupSizeX, 1u, 1u)) { DEBUG_BREAK_IF(true); return ZE_RESULT_ERROR_UNKNOWN; } uint32_t value = *(reinterpret_cast(const_cast(pattern))); - builtinFunction->setArgumentValue(0, sizeof(dstPtr), &dstPtr); - builtinFunction->setArgumentValue(1, sizeof(dstOffset), &dstOffset); + builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc); + builtinFunction->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset); builtinFunction->setArgumentValue(2, sizeof(value), &value); } else { @@ -1146,14 +1149,14 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, } srcOffset += patternAlloc.offset; - groupSizeX = static_cast(patternSize); + groupSizeX = static_cast(std::min(patternSize, size)); if (builtinFunction->setGroupSize(groupSizeX, 1u, 1u)) { DEBUG_BREAK_IF(true); return ZE_RESULT_ERROR_UNKNOWN; } - builtinFunction->setArgumentValue(0, sizeof(dstPtr), &dstPtr); - builtinFunction->setArgumentValue(1, sizeof(dstOffset), &dstOffset); + builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc); + builtinFunction->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset); builtinFunction->setArgBufferWithAlloc(2, patternAlloc.alignedAllocationPtr, patternAlloc.alloc); builtinFunction->setArgumentValue(3, sizeof(srcOffset), &srcOffset); @@ -1178,11 +1181,8 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, } ze_group_count_t dispatchFuncArgs{1u, 1u, 1u}; - dstPtr = dstPtr + (size - groupRemainderSizeX); - dstOffset = 0; - NEO::EncodeSurfaceState::getSshAlignedPointer(dstPtr, dstOffset); - - builtinFunction->setArgumentValue(0, sizeof(dstPtr), &dstPtr); + size_t dstOffset = dstAllocation.offset + (size - groupRemainderSizeX); + builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc); builtinFunction->setArgumentValue(1, sizeof(dstOffset), &dstOffset); res = CommandListCoreFamily::appendLaunchKernel(builtinFunction->toHandle(), diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp index d2f64ffc6c..bd341d810f 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_blit.cpp @@ -69,6 +69,7 @@ class MockCommandListForMemFillHostPtr : public WhiteBox<::L0::CommandListCoreFa } }; +uint32_t memoryFillMockGroupSizeX = 0, memoryFillMockGroupSizeY = 0, memoryFillMockGroupSizeZ = 0; struct AppendMemoryFillFixture { class MockDriverHandleHostPtr : public L0::DriverHandleImp { public: @@ -91,10 +92,28 @@ struct AppendMemoryFillFixture { std::unique_ptr mockAllocation; NEO::SvmAllocationData data{rootDeviceIndex}; }; + struct MockKernelImmutableDataForMemFill : KernelImmutableData { + MockKernelImmutableDataForMemFill(L0::Device *l0device = nullptr) { + mockKernelDescriptor = new NEO::KernelDescriptor; + mockKernelDescriptor->kernelAttributes.simdSize = 32; + kernelDescriptor = mockKernelDescriptor; + return; + } + ~MockKernelImmutableDataForMemFill() override { + delete mockKernelDescriptor; + } + NEO::KernelDescriptor *mockKernelDescriptor = nullptr; + }; class MockKernelForMemFill : public L0::KernelImp { public: + MockKernelForMemFill() { + mockKernelImmData = new MockKernelImmutableDataForMemFill(); + } ze_result_t setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY, uint32_t groupSizeZ) override { + memoryFillMockGroupSizeX = groupSizeX; + memoryFillMockGroupSizeY = groupSizeY; + memoryFillMockGroupSizeZ = groupSizeZ; return ZE_RESULT_ERROR_UNKNOWN; } void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override { @@ -103,7 +122,14 @@ struct AppendMemoryFillFixture { void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override { return; } + const MockKernelImmutableDataForMemFill *getImmutableData() const override { + return mockKernelImmData; + } + ~MockKernelForMemFill() override { + delete mockKernelImmData; + } std::unique_ptr clone() const override { return nullptr; } + MockKernelImmutableDataForMemFill *mockKernelImmData = nullptr; }; struct MockBuiltinFunctionsForMemFill : BuiltinFunctionsLibImpl { @@ -132,6 +158,9 @@ struct AppendMemoryFillFixture { MockBuiltinFunctionsForMemFill *tmpMockBultinLib = nullptr; }; virtual void SetUp() { // NOLINT(readability-identifier-naming) + memoryFillMockGroupSizeX = 0; + memoryFillMockGroupSizeY = 0; + memoryFillMockGroupSizeZ = 0; neoDevice = NEO::MockDevice::createWithNewExecutionEnvironment(NEO::defaultHwInfo.get()); neoMockDevice = NEO::MockDevice::createWithNewExecutionEnvironment(NEO::defaultHwInfo.get()); NEO::DeviceVector devices; @@ -427,5 +456,47 @@ HWTEST2_F(AppendMemoryfillHostPtr, givenCommandListAndHostPointerWhenMemoryfillC deviceMock.get()->setDriverHandle(driverHandle.get()); } +HWTEST2_F(AppendMemoryfillHostPtr, givenCommandListAndHostPointerWithPatternSizeGreaterThanSizeWhenMemoryfillCalledThenGroupSizeXEqualsSize, Platforms) { + MockCommandListForMemFillHostPtr cmdList; + MockDriverHandleHostPtr driverHandleMock; + size_t patternSize = 0x1001; + size_t dstSize = 0x1000; + deviceMock.get()->setDriverHandle(&driverHandleMock); + cmdList.initialize(deviceMock.get(), NEO::EngineGroupType::RenderCompute); + uint64_t pattern[4] = {1, 2, 3, 4}; + void *ptr = reinterpret_cast(registeredGraphicsAllocationAddress); + cmdList.appendMemoryFill(ptr, reinterpret_cast(&pattern), patternSize, dstSize, nullptr); + EXPECT_EQ(memoryFillMockGroupSizeX, 0x1000u); + deviceMock.get()->setDriverHandle(driverHandle.get()); +} + +HWTEST2_F(AppendMemoryfillHostPtr, givenCommandListAndHostPointerWithSizeLessThanSimdWhenMemoryfillCalledThenGroupSizeXEqualsSize, Platforms) { + MockCommandListForMemFillHostPtr cmdList; + MockDriverHandleHostPtr driverHandleMock; + size_t patternSize = 1; + size_t dstSize = 16; + deviceMock.get()->setDriverHandle(&driverHandleMock); + cmdList.initialize(deviceMock.get(), NEO::EngineGroupType::RenderCompute); + uint64_t pattern[4] = {1, 2, 3, 4}; + void *ptr = reinterpret_cast(registeredGraphicsAllocationAddress); + cmdList.appendMemoryFill(ptr, reinterpret_cast(&pattern), patternSize, dstSize, nullptr); + EXPECT_EQ(memoryFillMockGroupSizeX, 16u); + deviceMock.get()->setDriverHandle(driverHandle.get()); +} + +HWTEST2_F(AppendMemoryfillHostPtr, givenCommandListAndHostPointerWithSizeLargerThanSimdWhenMemoryfillCalledThenGroupSizeIsSimd, Platforms) { + MockCommandListForMemFillHostPtr cmdList; + MockDriverHandleHostPtr driverHandleMock; + size_t patternSize = 1; + size_t dstSize = 64; + deviceMock.get()->setDriverHandle(&driverHandleMock); + cmdList.initialize(deviceMock.get(), NEO::EngineGroupType::RenderCompute); + uint64_t pattern[4] = {1, 2, 3, 4}; + void *ptr = reinterpret_cast(registeredGraphicsAllocationAddress); + cmdList.appendMemoryFill(ptr, reinterpret_cast(&pattern), patternSize, dstSize, nullptr); + EXPECT_EQ(memoryFillMockGroupSizeX, 32u); + deviceMock.get()->setDriverHandle(driverHandle.get()); +} + } // namespace ult } // namespace L0