From 549b73510cfc27943c8223c84dd934df86977be6 Mon Sep 17 00:00:00 2001 From: "Woloszyn, Wojciech" Date: Mon, 5 Nov 2018 05:26:45 -0800 Subject: [PATCH] Flush L3 for reduced address space platforms Change-Id: I5a73e72f8e309137328930920ab174ba6f1378dc --- Notes: CommandStreamReceiver::createAllocationForHostSurface() and MemoryManager::allocateGraphicsMemoryForHostPtr() gain a requiresL3Flush argument. The enqueueReadBuffer, enqueueReadBufferRect and enqueueReadImage paths pass true; the enqueueWriteBuffer, enqueueWriteBufferRect and enqueueWriteImage paths pass false. The value is stored in GraphicsAllocation::flushL3Required only when fullRangeSvm is false. enqueueNonBlocked() then sets dispatchFlags.dcFlush when shouldFlushDC() requests it or when the device is not full-range SVM and any allocation in the residency list has flushL3Required set. runtime/command_queue/enqueue_common.h | 10 ++- runtime/command_queue/enqueue_read_buffer.h | 2 +- .../command_queue/enqueue_read_buffer_rect.h | 2 +- runtime/command_queue/enqueue_read_image.h | 2 +- runtime/command_queue/enqueue_write_buffer.h | 2 +- .../command_queue/enqueue_write_buffer_rect.h | 2 +- runtime/command_queue/enqueue_write_image.h | 2 +- .../command_stream_receiver.cpp | 4 +- .../command_stream/command_stream_receiver.h | 2 +- runtime/memory_manager/memory_manager.h | 8 +- .../command_queue/enqueue_kernel_tests.cpp | 75 +++++++++++++++++++ .../command_stream_receiver_tests.cpp | 6 +- .../memory_manager/memory_manager_tests.cpp | 18 ++++- 13 files changed, 116 insertions(+), 19 deletions(-) diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index 82cc3d4665..4fe2d8a649 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -31,6 +31,7 @@ #include "runtime/program/block_kernel_manager.h" #include "runtime/utilities/range.h" #include "runtime/utilities/tag_allocator.h" +#include #include namespace OCLRT { @@ -568,9 +569,16 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( commandStreamReceiver.requestThreadArbitrationPolicy(multiDispatchInfo.peekMainKernel()->getThreadArbitrationPolicy()); + auto allocNeedsFlushDC = false; + if (!device->isFullRangeSvm()) { + if (std::any_of(commandStreamReceiver.getResidencyAllocations().begin(), commandStreamReceiver.getResidencyAllocations().end(), [](const auto allocation) { return allocation->flushL3Required; })) { + allocNeedsFlushDC = true; + } + } + DispatchFlags dispatchFlags; dispatchFlags.blocking = blocking; - dispatchFlags.dcFlush = shouldFlushDC(commandType, 
printfHandler); + dispatchFlags.dcFlush = shouldFlushDC(commandType, printfHandler) || allocNeedsFlushDC; dispatchFlags.useSLM = slmUsed; dispatchFlags.guardCommandBufferWithPipeControl = true; dispatchFlags.GSBA32BitRequired = commandType == CL_COMMAND_NDRANGE_KERNEL; diff --git a/runtime/command_queue/enqueue_read_buffer.h b/runtime/command_queue/enqueue_read_buffer.h index 3ea1dfb5fc..7d072744d0 100644 --- a/runtime/command_queue/enqueue_read_buffer.h +++ b/runtime/command_queue/enqueue_read_buffer.h @@ -90,7 +90,7 @@ cl_int CommandQueueHw::enqueueReadBuffer( Surface *surfaces[] = {&bufferSurf, &hostPtrSurf}; if (size != 0) { - bool status = getDevice().getCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, getDevice()); + bool status = getDevice().getCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, getDevice(), true); if (!status) { return CL_OUT_OF_RESOURCES; } diff --git a/runtime/command_queue/enqueue_read_buffer_rect.h b/runtime/command_queue/enqueue_read_buffer_rect.h index efb1882199..e3a879ff49 100644 --- a/runtime/command_queue/enqueue_read_buffer_rect.h +++ b/runtime/command_queue/enqueue_read_buffer_rect.h @@ -76,7 +76,7 @@ cl_int CommandQueueHw::enqueueReadBufferRect( if (region[0] != 0 && region[1] != 0 && region[2] != 0) { - bool status = getDevice().getCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, getDevice()); + bool status = getDevice().getCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, getDevice(), true); if (!status) { return CL_OUT_OF_RESOURCES; } diff --git a/runtime/command_queue/enqueue_read_image.h b/runtime/command_queue/enqueue_read_image.h index 13383ea6e2..45e585e6e5 100644 --- a/runtime/command_queue/enqueue_read_image.h +++ b/runtime/command_queue/enqueue_read_image.h @@ -82,7 +82,7 @@ cl_int CommandQueueHw::enqueueReadImage( if (region[0] != 0 && region[1] != 0 && region[2] != 0) { - bool status = 
getDevice().getCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, getDevice()); + bool status = getDevice().getCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, getDevice(), true); if (!status) { return CL_OUT_OF_RESOURCES; } diff --git a/runtime/command_queue/enqueue_write_buffer.h b/runtime/command_queue/enqueue_write_buffer.h index 3b4f3d4a56..dc3cfcc6bd 100644 --- a/runtime/command_queue/enqueue_write_buffer.h +++ b/runtime/command_queue/enqueue_write_buffer.h @@ -89,7 +89,7 @@ cl_int CommandQueueHw::enqueueWriteBuffer( Surface *surfaces[] = {&bufferSurf, &hostPtrSurf}; if (size != 0) { - bool status = getDevice().getCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, getDevice()); + bool status = getDevice().getCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, getDevice(), false); if (!status) { return CL_OUT_OF_RESOURCES; } diff --git a/runtime/command_queue/enqueue_write_buffer_rect.h b/runtime/command_queue/enqueue_write_buffer_rect.h index ee40dfc393..b8c07b16e7 100644 --- a/runtime/command_queue/enqueue_write_buffer_rect.h +++ b/runtime/command_queue/enqueue_write_buffer_rect.h @@ -75,7 +75,7 @@ cl_int CommandQueueHw::enqueueWriteBufferRect( if (region[0] != 0 && region[1] != 0 && region[2] != 0) { - bool status = getDevice().getCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, getDevice()); + bool status = getDevice().getCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, getDevice(), false); if (!status) { return CL_OUT_OF_RESOURCES; } diff --git a/runtime/command_queue/enqueue_write_image.h b/runtime/command_queue/enqueue_write_image.h index 6a9eefc3ef..0b72f381a5 100644 --- a/runtime/command_queue/enqueue_write_image.h +++ b/runtime/command_queue/enqueue_write_image.h @@ -76,7 +76,7 @@ cl_int CommandQueueHw::enqueueWriteImage( if (region[0] != 0 && region[1] != 0 && region[2] != 0) { - bool status = 
getDevice().getCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, getDevice()); + bool status = getDevice().getCommandStreamReceiver().createAllocationForHostSurface(hostPtrSurf, getDevice(), false); if (!status) { return CL_OUT_OF_RESOURCES; } diff --git a/runtime/command_stream/command_stream_receiver.cpp b/runtime/command_stream/command_stream_receiver.cpp index e32e5cb71b..fb3846feff 100644 --- a/runtime/command_stream/command_stream_receiver.cpp +++ b/runtime/command_stream/command_stream_receiver.cpp @@ -344,10 +344,10 @@ std::unique_lock CommandStreamReceiver::obtain AllocationsList &CommandStreamReceiver::getTemporaryAllocations() { return internalAllocationStorage->getTemporaryAllocations(); } AllocationsList &CommandStreamReceiver::getAllocationsForReuse() { return internalAllocationStorage->getAllocationsForReuse(); } -bool CommandStreamReceiver::createAllocationForHostSurface(HostPtrSurface &surface, Device &device) { +bool CommandStreamReceiver::createAllocationForHostSurface(HostPtrSurface &surface, Device &device, bool requiresL3Flush) { auto memoryManager = getMemoryManager(); GraphicsAllocation *allocation = nullptr; - allocation = memoryManager->allocateGraphicsMemoryForHostPtr(surface.getSurfaceSize(), surface.getMemoryPointer(), device.isFullRangeSvm()); + allocation = memoryManager->allocateGraphicsMemoryForHostPtr(surface.getSurfaceSize(), surface.getMemoryPointer(), device.isFullRangeSvm(), requiresL3Flush); if (allocation == nullptr && surface.peekIsPtrCopyAllowed()) { // Try with no host pointer allocation and copy allocation = memoryManager->allocateGraphicsMemory(surface.getSurfaceSize(), MemoryConstants::pageSize, false, false); diff --git a/runtime/command_stream/command_stream_receiver.h b/runtime/command_stream/command_stream_receiver.h index 0c54480aef..1d43d3aa86 100644 --- a/runtime/command_stream/command_stream_receiver.h +++ b/runtime/command_stream/command_stream_receiver.h @@ -153,7 +153,7 @@ class 
CommandStreamReceiver { AllocationsList &getTemporaryAllocations(); AllocationsList &getAllocationsForReuse(); InternalAllocationStorage *getInternalAllocationStorage() const { return internalAllocationStorage.get(); } - bool createAllocationForHostSurface(HostPtrSurface &surface, Device &device); + bool createAllocationForHostSurface(HostPtrSurface &surface, Device &device, bool requiresL3Flush); protected: void cleanupResources(); diff --git a/runtime/memory_manager/memory_manager.h b/runtime/memory_manager/memory_manager.h index 93578e1dd2..3974a68c6b 100644 --- a/runtime/memory_manager/memory_manager.h +++ b/runtime/memory_manager/memory_manager.h @@ -139,11 +139,15 @@ class MemoryManager { } virtual GraphicsAllocation *allocateGraphicsMemory(size_t size, const void *ptr, bool forcePin); - GraphicsAllocation *allocateGraphicsMemoryForHostPtr(size_t size, void *ptr, bool fullRangeSvm) { + GraphicsAllocation *allocateGraphicsMemoryForHostPtr(size_t size, void *ptr, bool fullRangeSvm, bool requiresL3Flush) { if (fullRangeSvm) { return allocateGraphicsMemory(size, ptr); } else { - return allocateGraphicsMemoryForNonSvmHostPtr(size, ptr); + auto allocation = allocateGraphicsMemoryForNonSvmHostPtr(size, ptr); + if (allocation) { + allocation->flushL3Required = requiresL3Flush; + } + return allocation; } } diff --git a/unit_tests/command_queue/enqueue_kernel_tests.cpp b/unit_tests/command_queue/enqueue_kernel_tests.cpp index 8b7e848e36..1804be1f06 100644 --- a/unit_tests/command_queue/enqueue_kernel_tests.cpp +++ b/unit_tests/command_queue/enqueue_kernel_tests.cpp @@ -1009,6 +1009,81 @@ HWTEST_F(EnqueueKernelTest, givenCommandStreamReceiverInBatchingModeWhenEnqueueK EXPECT_EQ(5u + csrSurfaceCount, cmdBuffer->surfaces.size()); } +HWTEST_F(EnqueueKernelTest, givenReducedAddressSpaceGraphicsAllocationForHostPtrWithL3FlushRequiredWhenEnqueueKernelIsCalledThenFlushIsCalledForReducedAddressSpacePlatforms) { + std::unique_ptr device; + std::unique_ptr cmdQ; + auto 
hwInfoToModify = *platformDevices[0]; + hwInfoToModify.capabilityTable.gpuAddressSpace = MemoryConstants::max32BitAddress; + device.reset(MockDevice::createWithNewExecutionEnvironment(&hwInfoToModify)); + auto mockCsr = new MockCsrHw2(device->getHardwareInfo(), *device->executionEnvironment); + device->resetCommandStreamReceiver(mockCsr); + auto memoryManager = mockCsr->getMemoryManager(); + uint32_t hostPtr[10]{}; + + auto allocation = memoryManager->allocateGraphicsMemoryForHostPtr(1, hostPtr, device->isFullRangeSvm(), true); + MockKernelWithInternals mockKernel(*device, context); + size_t gws[3] = {1, 0, 0}; + mockCsr->makeResident(*allocation); + cmdQ.reset(createCommandQueue(device.get(), 0)); + auto ret = cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, ret); + EXPECT_TRUE(mockCsr->passedDispatchFlags.dcFlush); + memoryManager->freeGraphicsMemory(allocation); +} + +HWTEST_F(EnqueueKernelTest, givenReducedAddressSpaceGraphicsAllocationForHostPtrWithL3FlushUnrequiredWhenEnqueueKernelIsCalledThenFlushIsNotForcedByGraphicsAllocation) { + std::unique_ptr device; + std::unique_ptr cmdQ; + auto hwInfoToModify = *platformDevices[0]; + hwInfoToModify.capabilityTable.gpuAddressSpace = MemoryConstants::max32BitAddress; + device.reset(MockDevice::createWithNewExecutionEnvironment(&hwInfoToModify)); + auto mockCsr = new MockCsrHw2(device->getHardwareInfo(), *device->executionEnvironment); + device->resetCommandStreamReceiver(mockCsr); + auto memoryManager = mockCsr->getMemoryManager(); + uint32_t hostPtr[10]{}; + + auto allocation = memoryManager->allocateGraphicsMemoryForHostPtr(1, hostPtr, device->isFullRangeSvm(), false); + MockKernelWithInternals mockKernel(*device, context); + size_t gws[3] = {1, 0, 0}; + mockCsr->makeResident(*allocation); + cmdQ.reset(createCommandQueue(device.get(), 0)); + auto ret = cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + 
EXPECT_EQ(CL_SUCCESS, ret); + EXPECT_FALSE(mockCsr->passedDispatchFlags.dcFlush); + memoryManager->freeGraphicsMemory(allocation); +} + +HWTEST_F(EnqueueKernelTest, givenFullAddressSpaceGraphicsAllocationWhenEnqueueKernelIsCalledThenFlushIsNotForcedByGraphicsAllocation) { + HardwareInfo hwInfoToModify; + std::unique_ptr device; + std::unique_ptr cmdQ; + hwInfoToModify = *platformDevices[0]; + hwInfoToModify.capabilityTable.gpuAddressSpace = MemoryConstants::max48BitAddress; + device.reset(MockDevice::createWithNewExecutionEnvironment(&hwInfoToModify)); + auto mockCsr = new MockCsrHw2(device->getHardwareInfo(), *device->executionEnvironment); + device->resetCommandStreamReceiver(mockCsr); + auto memoryManager = mockCsr->getMemoryManager(); + uint32_t hostPtr[10]{}; + + auto allocation = memoryManager->allocateGraphicsMemoryForHostPtr(1, hostPtr, device->isFullRangeSvm(), false); + MockKernelWithInternals mockKernel(*device, context); + size_t gws[3] = {1, 0, 0}; + mockCsr->makeResident(*allocation); + cmdQ.reset(createCommandQueue(device.get(), 0)); + auto ret = cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, ret); + EXPECT_FALSE(mockCsr->passedDispatchFlags.dcFlush); + memoryManager->freeGraphicsMemory(allocation); + + allocation = (memoryManager->allocateGraphicsMemoryForHostPtr(1, hostPtr, device->isFullRangeSvm(), true)); + mockCsr->makeResident(*allocation); + cmdQ.reset(createCommandQueue(device.get(), 0)); + ret = cmdQ->enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + EXPECT_EQ(CL_SUCCESS, ret); + EXPECT_FALSE(mockCsr->passedDispatchFlags.dcFlush); + memoryManager->freeGraphicsMemory(allocation); +} + HWTEST_F(EnqueueKernelTest, givenDefaultCommandStreamReceiverWhenClFlushIsCalledThenSuccessIsReturned) { MockKernelWithInternals mockKernel(*pDevice); size_t gws[3] = {1, 0, 0}; diff --git a/unit_tests/command_stream/command_stream_receiver_tests.cpp 
b/unit_tests/command_stream/command_stream_receiver_tests.cpp index 31c227cd8c..61b2f1a29f 100644 --- a/unit_tests/command_stream/command_stream_receiver_tests.cpp +++ b/unit_tests/command_stream/command_stream_receiver_tests.cpp @@ -408,7 +408,7 @@ TEST_F(CreateAllocationForHostSurfaceTest, givenReadOnlyHostPointerWhenAllocatio .WillOnce(::testing::Return(nullptr)); } - bool result = commandStreamReceiver->createAllocationForHostSurface(surface, *device); + bool result = commandStreamReceiver->createAllocationForHostSurface(surface, *device, false); EXPECT_TRUE(result); auto allocation = surface.getAllocation(); @@ -435,7 +435,7 @@ TEST_F(CreateAllocationForHostSurfaceTest, givenReadOnlyHostPointerWhenAllocatio .WillOnce(::testing::Return(nullptr)); } - bool result = commandStreamReceiver->createAllocationForHostSurface(surface, *device); + bool result = commandStreamReceiver->createAllocationForHostSurface(surface, *device, false); EXPECT_FALSE(result); auto allocation = surface.getAllocation(); @@ -459,6 +459,6 @@ TEST_F(ReducedAddrSpaceCommandStreamReceiverTest, .Times(1) .WillOnce(::testing::Return(nullptr)); - bool result = commandStreamReceiver->createAllocationForHostSurface(surface, *device); + bool result = commandStreamReceiver->createAllocationForHostSurface(surface, *device, false); EXPECT_FALSE(result); } diff --git a/unit_tests/memory_manager/memory_manager_tests.cpp b/unit_tests/memory_manager/memory_manager_tests.cpp index 1ee252fd30..5973105835 100644 --- a/unit_tests/memory_manager/memory_manager_tests.cpp +++ b/unit_tests/memory_manager/memory_manager_tests.cpp @@ -1033,30 +1033,40 @@ TEST(OsAgnosticMemoryManager, givenOsAgnosticMemoryManagerWhenAllocateGraphicsMe memoryManager.freeGraphicsMemory(allocation); } -TEST(OsAgnosticMemoryManager, givenReducedGpuAddressSpaceWhenAllocateGraphicsMemoryForHostPtrIsCalledThenAllocationWithoutFragmentsIsCreated) { +using OsAgnosticMemoryManagerWithParams = ::testing::TestWithParam; + 
+TEST_P(OsAgnosticMemoryManagerWithParams, givenReducedGpuAddressSpaceWhenAllocateGraphicsMemoryForHostPtrIsCalledThenAllocationWithoutFragmentsIsCreated) { + bool requiresL3Flush = GetParam(); ExecutionEnvironment executionEnvironment; OsAgnosticMemoryManager memoryManager(false, false, executionEnvironment); auto hostPtr = reinterpret_cast(0x5001); - auto allocation = memoryManager.allocateGraphicsMemoryForHostPtr(13, hostPtr, false); + auto allocation = memoryManager.allocateGraphicsMemoryForHostPtr(13, hostPtr, false, requiresL3Flush); EXPECT_NE(nullptr, allocation); EXPECT_EQ(0u, allocation->fragmentsStorage.fragmentCount); + EXPECT_EQ(requiresL3Flush, allocation->flushL3Required); memoryManager.freeGraphicsMemory(allocation); } -TEST(OsAgnosticMemoryManager, givenFullGpuAddressSpaceWhenAllocateGraphicsMemoryForHostPtrIsCalledThenAllocationWithFragmentsIsCreated) { +TEST_P(OsAgnosticMemoryManagerWithParams, givenFullGpuAddressSpaceWhenAllocateGraphicsMemoryForHostPtrIsCalledThenAllocationWithFragmentsIsCreated) { + bool requiresL3Flush = GetParam(); ExecutionEnvironment executionEnvironment; OsAgnosticMemoryManager memoryManager(false, false, executionEnvironment); auto hostPtr = reinterpret_cast(0x5001); - auto allocation = memoryManager.allocateGraphicsMemoryForHostPtr(13, hostPtr, true); + auto allocation = memoryManager.allocateGraphicsMemoryForHostPtr(13, hostPtr, true, requiresL3Flush); EXPECT_NE(nullptr, allocation); EXPECT_EQ(1u, allocation->fragmentsStorage.fragmentCount); + EXPECT_FALSE(allocation->flushL3Required); memoryManager.freeGraphicsMemory(allocation); } +INSTANTIATE_TEST_CASE_P(OsAgnosticMemoryManagerWithParams, + OsAgnosticMemoryManagerWithParams, + ::testing::Values(false, true)); + TEST(OsAgnosticMemoryManager, givenLocalMemoryNotSupportedWhenMemoryManagerIsCreatedThenAllocator32BitHasCorrectBaseAddress) { ExecutionEnvironment executionEnvironment; MockMemoryManager memoryManager(false, false, false, executionEnvironment);