From a822503b41b55a9defe9642f8c765f77c9bad407 Mon Sep 17 00:00:00 2001 From: Maciej Plewka Date: Tue, 23 Jun 2020 12:03:43 +0200 Subject: [PATCH] Use encoder to program buffer surface state Change-Id: Ibe66bd9906743b021a04f1d9aad1aae4127a4f71 Signed-off-by: Maciej Plewka --- level_zero/core/source/kernel/kernel_hw.h | 4 +- opencl/source/mem_obj/buffer.cpp | 17 ++++++ opencl/source/mem_obj/buffer.h | 2 + opencl/source/mem_obj/buffer_base.inl | 59 +------------------ .../test/unit_test/mem_obj/buffer_tests.cpp | 49 +++++++++++++++ .../command_container/command_encoder.h | 5 +- .../command_container/command_encoder.inl | 27 +++++++-- .../unit_test/encoders/test_encode_states.cpp | 6 +- 8 files changed, 103 insertions(+), 66 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_hw.h b/level_zero/core/source/kernel/kernel_hw.h index 3e6f872b16..9a173d001e 100644 --- a/level_zero/core/source/kernel/kernel_hw.h +++ b/level_zero/core/source/kernel/kernel_hw.h @@ -44,9 +44,9 @@ struct KernelHw : public KernelImp { } auto surfaceStateAddress = ptrOffset(surfaceStateHeapData.get(), argInfo.bindful); - void *bufferAddressForSsh = reinterpret_cast(baseAddress); + uint64_t bufferAddressForSsh = baseAddress; auto alignment = NEO::EncodeSurfaceState::getSurfaceBaseAddressAlignment(); - size_t bufferSizeForSsh = ptrDiff(reinterpret_cast(alloc->getGpuAddress()), bufferAddressForSsh); + size_t bufferSizeForSsh = ptrDiff(alloc->getGpuAddress(), bufferAddressForSsh); bufferSizeForSsh += sizeTillEndOfSurface; // take address alignment offset into account bufferSizeForSsh = alignUp(bufferSizeForSsh, alignment); diff --git a/opencl/source/mem_obj/buffer.cpp b/opencl/source/mem_obj/buffer.cpp index 15ac6a46c4..a94aa39cf9 100644 --- a/opencl/source/mem_obj/buffer.cpp +++ b/opencl/source/mem_obj/buffer.cpp @@ -656,6 +656,23 @@ uint32_t Buffer::getMocsValue(bool disableL3Cache, bool isReadOnlyArgument) cons } } +uint32_t Buffer::getSurfaceSize(bool alignSizeForAuxTranslation) const { + auto bufferAddress = getBufferAddress(); + auto bufferAddressAligned = alignDown(bufferAddress, 4); + auto bufferOffset = ptrDiff(bufferAddress, bufferAddressAligned); + + uint32_t surfaceSize = static_cast(alignUp(getSize() + bufferOffset, alignSizeForAuxTranslation ? 512 : 4)); + return surfaceSize; +} + +uint64_t Buffer::getBufferAddress() const { + auto graphicsAllocation = multiGraphicsAllocation.getDefaultGraphicsAllocation(); + // The graphics allocation for Host Ptr surface will be created in makeResident call and GPU address is expected to be the same as CPU address + auto bufferAddress = (graphicsAllocation != nullptr) ? graphicsAllocation->getGpuAddress() : castToUint64(getHostPtr()); + bufferAddress += this->offset; + return bufferAddress; +} + bool Buffer::isCompressed(uint32_t rootDeviceIndex) const { auto graphicsAllocation = multiGraphicsAllocation.getGraphicsAllocation(rootDeviceIndex); if (graphicsAllocation->getDefaultGmm()) { diff --git a/opencl/source/mem_obj/buffer.h b/opencl/source/mem_obj/buffer.h index 311d73caef..00477fe51f 100644 --- a/opencl/source/mem_obj/buffer.h +++ b/opencl/source/mem_obj/buffer.h @@ -152,6 +152,8 @@ class Buffer : public MemObj { bool isReadWriteOnCpuPreffered(void *ptr, size_t size); uint32_t getMocsValue(bool disableL3Cache, bool isReadOnlyArgument) const; + uint32_t getSurfaceSize(bool alignSizeForAuxTranslation) const; + uint64_t getBufferAddress() const; bool isCompressed(uint32_t rootDeviceIndex) const; diff --git a/opencl/source/mem_obj/buffer_base.inl b/opencl/source/mem_obj/buffer_base.inl index 2b44e4173f..1e1a488a3e 100644 --- a/opencl/source/mem_obj/buffer_base.inl +++ b/opencl/source/mem_obj/buffer_base.inl @@ -5,6 +5,7 @@ * */ +#include "shared/source/command_container/command_encoder.h" #include "shared/source/execution_environment/execution_environment.h" #include "shared/source/execution_environment/root_device_environment.h" #include "shared/source/gmm_helper/gmm.h" @@ -32,64 +33,10 @@ union SURFACE_STATE_BUFFER_LENGTH { template void BufferHw::setArgStateful(void *memory, bool forceNonAuxMode, bool disableL3, bool alignSizeForAuxTranslation, bool isReadOnlyArgument) { - using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE; - using SURFACE_FORMAT = typename RENDER_SURFACE_STATE::SURFACE_FORMAT; - using AUXILIARY_SURFACE_MODE = typename RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE; - - auto surfaceState = reinterpret_cast(memory); - - auto graphicsAllocation = multiGraphicsAllocation.getDefaultGraphicsAllocation(); - // The graphics allocation for Host Ptr surface will be created in makeResident call and GPU address is expected to be the same as CPU address - auto bufferAddress = (graphicsAllocation != nullptr) ? graphicsAllocation->getGpuAddress() : castToUint64(getHostPtr()); - bufferAddress += this->offset; - - auto bufferAddressAligned = alignDown(bufferAddress, 4); - auto bufferOffset = ptrDiff(bufferAddress, bufferAddressAligned); - - auto surfaceSize = alignUp(getSize() + bufferOffset, alignSizeForAuxTranslation ? 512 : 4); - - SURFACE_STATE_BUFFER_LENGTH Length = {0}; - Length.Length = static_cast(surfaceSize - 1); - - surfaceState->setWidth(Length.SurfaceState.Width + 1); - surfaceState->setHeight(Length.SurfaceState.Height + 1); - surfaceState->setDepth(Length.SurfaceState.Depth + 1); - - if (bufferAddress != 0) { - surfaceState->setSurfaceType(RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER); - } else { - surfaceState->setSurfaceType(RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_NULL); - } - surfaceState->setSurfaceFormat(SURFACE_FORMAT::SURFACE_FORMAT_RAW); - surfaceState->setSurfaceVerticalAlignment(RENDER_SURFACE_STATE::SURFACE_VERTICAL_ALIGNMENT_VALIGN_4); - surfaceState->setSurfaceHorizontalAlignment(RENDER_SURFACE_STATE::SURFACE_HORIZONTAL_ALIGNMENT_HALIGN_4); - - surfaceState->setTileMode(RENDER_SURFACE_STATE::TILE_MODE_LINEAR); - surfaceState->setVerticalLineStride(0); - surfaceState->setVerticalLineStrideOffset(0); - - surfaceState->setMemoryObjectControlState(getMocsValue(disableL3, isReadOnlyArgument)); - surfaceState->setSurfaceBaseAddress(bufferAddressAligned); - - Gmm *gmm = graphicsAllocation ? graphicsAllocation->getDefaultGmm() : nullptr; - - if (gmm && gmm->isRenderCompressed && !forceNonAuxMode && - GraphicsAllocation::AllocationType::BUFFER_COMPRESSED == graphicsAllocation->getAllocationType()) { - // Its expected to not program pitch/qpitch/baseAddress for Aux surface in CCS scenarios - surfaceState->setCoherencyType(RENDER_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT); - surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E); - } else { - surfaceState->setCoherencyType(RENDER_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT); - surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE); - } + EncodeSurfaceState::encodeBuffer(memory, getBufferAddress(), getSurfaceSize(alignSizeForAuxTranslation), getMocsValue(disableL3, isReadOnlyArgument), true); + EncodeSurfaceState::encodeExtraBufferParams(multiGraphicsAllocation.getDefaultGraphicsAllocation(), rootDeviceEnvironment->getGmmHelper(), memory, forceNonAuxMode, isReadOnlyArgument); appendBufferState(memory, context, graphicsAllocation, isReadOnlyArgument); appendSurfaceStateExt(memory); - - auto gmmHelper = rootDeviceEnvironment->getGmmHelper(); - if (DebugManager.flags.DisableCachingForStatefulBufferAccess.get()) { - surfaceState->setMemoryObjectControlState(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED)); - } } - } // namespace NEO diff --git a/opencl/test/unit_test/mem_obj/buffer_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_tests.cpp index 3b4167a8ea..f259e28b14 100644 --- a/opencl/test/unit_test/mem_obj/buffer_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_tests.cpp @@ -3086,3 +3086,52 @@ TEST_F(MultiRootDeviceBufferTest, bufferGraphicsAllocationHasCorrectRootDeviceIn ASSERT_NE(nullptr, graphicsAllocation); EXPECT_EQ(expectedRootDeviceIndex, graphicsAllocation->getRootDeviceIndex()); } + +TEST_F(MultiRootDeviceBufferTest, givenBufferWhenGetSurfaceSizeCalledWithoutAlignSizeForAuxTranslationThenCorrectValueReturned) { + cl_int retVal = 0; + cl_mem_flags flags = CL_MEM_READ_WRITE; + uint32_t size = 0x131; + std::unique_ptr buffer(Buffer::create(context.get(), flags, size, nullptr, retVal)); + + auto surfaceSize = buffer->getSurfaceSize(false); + EXPECT_EQ(surfaceSize, alignUp(size, 4)); +} + +TEST_F(MultiRootDeviceBufferTest, givenBufferWhenGetSurfaceSizeCalledWithAlignSizeForAuxTranslationThenCorrectValueReturned) { + cl_int retVal = 0; + cl_mem_flags flags = CL_MEM_READ_WRITE; + uint32_t size = 0x131; + std::unique_ptr buffer(Buffer::create(context.get(), flags, size, nullptr, retVal)); + + auto surfaceSize = buffer->getSurfaceSize(true); + EXPECT_EQ(surfaceSize, alignUp(size, 512)); +} + +TEST_F(MultiRootDeviceBufferTest, givenHostPtrBufferWhenGetBufferAddressCalledThenHostPtrReturned) { + class MockBuffer : public Buffer { + public: + using Buffer::multiGraphicsAllocation; + MockBuffer(void *hostPtr) { + this->hostPtr = hostPtr; + } + void setArgStateful(void *memory, bool forceNonAuxMode, bool disableL3, bool alignSizeForAuxTranslation, bool isReadOnly) override { + } + }; + void *hostPtr = reinterpret_cast(0x3000); + + std::unique_ptr buffer(new MockBuffer(hostPtr)); + + auto address = buffer->getBufferAddress(); + ASSERT_EQ(hostPtr, reinterpret_cast(address)); +} + +TEST_F(MultiRootDeviceBufferTest, givenBufferWithoutMultiGAWhenGetBufferAddressCalledThenCorrectAddressReturned) { + cl_int retVal = 0; + cl_mem_flags flags = CL_MEM_READ_WRITE; + + std::unique_ptr buffer(Buffer::create(context.get(), flags, MemoryConstants::pageSize, nullptr, retVal)); + + auto address = buffer->getBufferAddress(); + auto graphicsAllocation = buffer->getGraphicsAllocation(expectedRootDeviceIndex); + ASSERT_EQ(graphicsAllocation->getGpuAddress(), address); +} \ No newline at end of file diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index 3883c6a80a..c99c1be1d6 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -18,6 +18,8 @@ namespace NEO { +class GmmHelper; + template struct EncodeDispatchKernel { using WALKER_TYPE = typename GfxFamily::WALKER_TYPE; @@ -165,8 +167,9 @@ struct EncodeSurfaceState { using SURFACE_FORMAT = typename R_SURFACE_STATE::SURFACE_FORMAT; using AUXILIARY_SURFACE_MODE = typename R_SURFACE_STATE::AUXILIARY_SURFACE_MODE; - static void encodeBuffer(void *dst, void *address, size_t size, uint32_t mocs, + static void encodeBuffer(void *dst, uint64_t address, size_t size, uint32_t mocs, bool cpuCoherent); + static void encodeExtraBufferParams(GraphicsAllocation *allocation, GmmHelper *gmmHelper, void *memory, bool forceNonAuxMode, bool isReadOnlyArgument); static constexpr uintptr_t getSurfaceBaseAddressAlignmentMask() { return ~(getSurfaceBaseAddressAlignment() - 1); diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 46fabb5e41..ac0b450694 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -10,6 +10,7 @@ #include "shared/source/command_stream/linear_stream.h" #include "shared/source/device/device.h" #include "shared/source/execution_environment/execution_environment.h" +#include "shared/source/gmm_helper/gmm_helper.h" #include "shared/source/helpers/hw_helper.h" #include "shared/source/helpers/preamble.h" #include "shared/source/helpers/register_offsets.h" @@ -265,7 +266,7 @@ void EncodeStoreMMIO::encode(LinearStream &csr, uint32_t offset, uint64_ } template -void EncodeSurfaceState::encodeBuffer(void *dst, void *address, size_t size, uint32_t mocs, +void EncodeSurfaceState::encodeBuffer(void *dst, uint64_t address, size_t size, uint32_t mocs, bool cpuCoherent) { auto ss = reinterpret_cast(dst); UNRECOVERABLE_IF(!isAligned(size)); @@ -277,8 +278,8 @@ void EncodeSurfaceState::encodeBuffer(void *dst, void *address, size_t s ss->setHeight(Length.SurfaceState.Height + 1); ss->setDepth(Length.SurfaceState.Depth + 1); - ss->setSurfaceType((address != nullptr) ? R_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER - : R_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_NULL); + ss->setSurfaceType((address != 0) ? R_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_BUFFER + : R_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_NULL); ss->setSurfaceFormat(SURFACE_FORMAT::SURFACE_FORMAT_RAW); ss->setSurfaceVerticalAlignment(R_SURFACE_STATE::SURFACE_VERTICAL_ALIGNMENT_VALIGN_4); ss->setSurfaceHorizontalAlignment(R_SURFACE_STATE::SURFACE_HORIZONTAL_ALIGNMENT_HALIGN_4); @@ -287,13 +288,31 @@ void EncodeSurfaceState::encodeBuffer(void *dst, void *address, size_t s ss->setVerticalLineStride(0); ss->setVerticalLineStrideOffset(0); ss->setMemoryObjectControlState(mocs); - ss->setSurfaceBaseAddress(reinterpret_cast(address)); + ss->setSurfaceBaseAddress(address); ss->setCoherencyType(cpuCoherent ? R_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT : R_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT); ss->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE); } +template +void EncodeSurfaceState::encodeExtraBufferParams(GraphicsAllocation *allocation, GmmHelper *gmmHelper, void *memory, bool forceNonAuxMode, bool isReadOnlyArgument) { + using RENDER_SURFACE_STATE = typename Family::RENDER_SURFACE_STATE; + using AUXILIARY_SURFACE_MODE = typename RENDER_SURFACE_STATE::AUXILIARY_SURFACE_MODE; + auto surfaceState = reinterpret_cast(memory); + Gmm *gmm = allocation ? allocation->getDefaultGmm() : nullptr; + + if (gmm && gmm->isRenderCompressed && !forceNonAuxMode && + GraphicsAllocation::AllocationType::BUFFER_COMPRESSED == allocation->getAllocationType()) { + // Its expected to not program pitch/qpitch/baseAddress for Aux surface in CCS scenarios + surfaceState->setCoherencyType(RENDER_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT); + surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E); + } + + if (DebugManager.flags.DisableCachingForStatefulBufferAccess.get()) { + surfaceState->setMemoryObjectControlState(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED)); + } +} template void *EncodeDispatchKernel::getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset) { diff --git a/shared/test/unit_test/encoders/test_encode_states.cpp b/shared/test/unit_test/encoders/test_encode_states.cpp index 44f5882a02..88b560d89c 100644 --- a/shared/test/unit_test/encoders/test_encode_states.cpp +++ b/shared/test/unit_test/encoders/test_encode_states.cpp @@ -46,7 +46,7 @@ HWTEST_F(CommandEncodeStatesTest, givenCreatedSurfaceStateBufferWhenAllocationPr size_t allocSize = size; length.Length = static_cast(allocSize - 1); GraphicsAllocation allocation(0, GraphicsAllocation::AllocationType::UNKNOWN, cpuAddr, gpuAddr, 0u, allocSize, MemoryPool::MemoryNull); - EncodeSurfaceState::encodeBuffer(stateBuffer, reinterpret_cast(gpuAddr), allocSize, 1, + EncodeSurfaceState::encodeBuffer(stateBuffer, gpuAddr, allocSize, 1, RENDER_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT); EXPECT_EQ(length.SurfaceState.Depth + 1u, state->getDepth()); EXPECT_EQ(length.SurfaceState.Width + 1u, state->getWidth()); @@ -72,7 +72,7 @@ HWTEST_F(CommandEncodeStatesTest, givenCreatedSurfaceStateBufferWhenAllocationNo size_t allocSize = size; length.Length = static_cast(allocSize - 1); - EncodeSurfaceState::encodeBuffer(stateBuffer, reinterpret_cast(gpuAddr), allocSize, 1, + EncodeSurfaceState::encodeBuffer(stateBuffer, gpuAddr, allocSize, 1, RENDER_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT); EXPECT_EQ(RENDER_SURFACE_STATE::SURFACE_TYPE_SURFTYPE_NULL, state->getSurfaceType()); @@ -96,7 +96,7 @@ HWTEST_F(CommandEncodeStatesTest, givenCreatedSurfaceStateBufferWhenGpuCoherency size_t allocSize = size; length.Length = static_cast(allocSize - 1); - EncodeSurfaceState::encodeBuffer(stateBuffer, reinterpret_cast(gpuAddr), allocSize, 1, + EncodeSurfaceState::encodeBuffer(stateBuffer, gpuAddr, allocSize, 1, RENDER_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT); EXPECT_EQ(RENDER_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT, state->getCoherencyType());