From bd32518d3119466fe91a2a694f3a0d66f7715802 Mon Sep 17 00:00:00 2001 From: Igor Venevtsev Date: Mon, 29 Mar 2021 17:47:53 +0000 Subject: [PATCH] Add extra parameters to EncodeComputeMode::adjustComputeMode() method Signed-off-by: Igor Venevtsev --- level_zero/core/source/kernel/kernel_imp.cpp | 15 +++++++++------ shared/source/command_container/cmdcontainer.cpp | 1 + shared/source/command_container/cmdcontainer.h | 1 + shared/source/command_container/command_encoder.h | 5 +++-- .../command_encoder_bdw_plus.inl | 3 ++- .../encode_compute_mode_bdw_plus.inl | 6 +++--- .../encode_compute_mode_tgllp_plus.inl | 9 +++++---- .../command_stream_receiver_hw_tgllp_plus.inl | 5 +++-- shared/source/gen12lp/command_encoder_gen12lp.cpp | 3 ++- .../kernel/dispatch_kernel_encoder_interface.h | 3 ++- .../gen12lp/test_command_encoder_gen12lp.cpp | 2 +- .../mock_dispatch_kernel_encoder_interface.h | 1 - 12 files changed, 32 insertions(+), 22 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 760a9e52a5..e4b18eab42 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -78,7 +78,7 @@ KernelImmutableData::~KernelImmutableData() { inline void patchWithImplicitSurface(ArrayRef crossThreadData, ArrayRef surfaceStateHeap, uintptr_t ptrToPatchInCrossThreadData, NEO::GraphicsAllocation &allocation, - const NEO::ArgDescPointer &ptr, const NEO::Device &device) { + const NEO::ArgDescPointer &ptr, const NEO::Device &device, bool useGlobalAtomics) { if (false == crossThreadData.empty()) { NEO::patchPointer(crossThreadData, ptr, ptrToPatchInCrossThreadData); } @@ -88,7 +88,7 @@ inline void patchWithImplicitSurface(ArrayRef crossThreadData, ArrayRef void *addressToPatch = reinterpret_cast(allocation.getUnderlyingBuffer()); size_t sizeToPatch = allocation.getUnderlyingBufferSize(); NEO::Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0, - &allocation, 0, 0, false, device.getNumAvailableDevices() > 1); + &allocation, 0, 0, useGlobalAtomics, device.getNumAvailableDevices() > 1); } } @@ -171,7 +171,8 @@ void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef, static_cast(globalConstBuffer->getGpuAddressToPatch()), - *globalConstBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress, *neoDevice); + *globalConstBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress, + *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics); this->residencyContainer.push_back(globalConstBuffer); } else if (nullptr != globalConstBuffer) { this->residencyContainer.push_back(globalConstBuffer); @@ -182,7 +183,8 @@ void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef, static_cast(globalVarBuffer->getGpuAddressToPatch()), - *globalVarBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress, *neoDevice); + *globalVarBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress, + *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics); this->residencyContainer.push_back(globalVarBuffer); } else if (nullptr != globalVarBuffer) { this->residencyContainer.push_back(globalVarBuffer); @@ -741,7 +743,8 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef, static_cast(privateMemoryGraphicsAllocation->getGpuAddressToPatch()), - *privateMemoryGraphicsAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress, *neoDevice); + *privateMemoryGraphicsAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress, + *neoDevice, kernelAttributes.flags.useGlobalAtomics); this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation); } @@ -794,7 +797,7 @@ void KernelImp::setDebugSurface() { patchWithImplicitSurface(ArrayRef(), surfaceStateHeapRef, 0, *device->getDebugSurface(), this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.systemThreadSurfaceAddress, - *device->getNEODevice()); + *device->getNEODevice(), getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics); } } void *KernelImp::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) { diff --git a/shared/source/command_container/cmdcontainer.cpp b/shared/source/command_container/cmdcontainer.cpp index 52f5d64daa..47c7129b3a 100644 --- a/shared/source/command_container/cmdcontainer.cpp +++ b/shared/source/command_container/cmdcontainer.cpp @@ -148,6 +148,7 @@ void CommandContainer::reset() { nextIddInBlock = this->getNumIddPerBlock(); lastSentNumGrfRequired = 0; lastPipelineSelectModeRequired = false; + lastSentUseGlobalAtomics = false; } void *CommandContainer::getHeapSpaceAllowGrow(HeapType heapType, diff --git a/shared/source/command_container/cmdcontainer.h b/shared/source/command_container/cmdcontainer.h index 73700985a0..b3458a2dcf 100644 --- a/shared/source/command_container/cmdcontainer.h +++ b/shared/source/command_container/cmdcontainer.h @@ -87,6 +87,7 @@ class CommandContainer : public NonCopyableOrMovableClass { uint32_t nextIddInBlock = 0; uint32_t lastSentNumGrfRequired = 0; bool lastPipelineSelectModeRequired = false; + bool lastSentUseGlobalAtomics = false; Device *getDevice() const { return device; } diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index 9829069f80..11029bb250 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -98,7 +98,7 @@ struct EncodeStates { const void *fnDynamicStateHeap, BindlessHeapsHelper *bindlessHeapHelper); - static void adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency); + static void adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency, bool useGlobalAtomics, bool areMultipleSubDevicesInContext); static size_t getAdjustStateComputeModeSize(); }; @@ -265,7 +265,8 @@ struct EncodeSurfaceState { template struct EncodeComputeMode { using STATE_COMPUTE_MODE = typename GfxFamily::STATE_COMPUTE_MODE; - static void adjustComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable); + static void adjustComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, + bool isMultiOsContextCapable, bool useGlobalAtomics, bool areMultipleSubDevicesInContext); static void adjustPipelineSelect(CommandContainer &container, const NEO::KernelDescriptor &kernelDescriptor); }; diff --git a/shared/source/command_container/command_encoder_bdw_plus.inl b/shared/source/command_container/command_encoder_bdw_plus.inl index 8c960d4b10..02603cd0c8 100644 --- a/shared/source/command_container/command_encoder_bdw_plus.inl +++ b/shared/source/command_container/command_encoder_bdw_plus.inl @@ -69,7 +69,8 @@ void EncodeDispatchKernel::encode(CommandContainer &container, } EncodeWA::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), true); - EncodeStates::adjustStateComputeMode(*container.getCommandStream(), container.lastSentNumGrfRequired, nullptr, false, false); + EncodeStates::adjustStateComputeMode(*container.getCommandStream(), container.lastSentNumGrfRequired, nullptr, false, false, + kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, device->getNumAvailableDevices() > 1); EncodeWA::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), false); auto numThreadsPerThreadGroup = dispatchInterface->getNumThreadsPerThreadGroup(); diff --git a/shared/source/command_container/encode_compute_mode_bdw_plus.inl b/shared/source/command_container/encode_compute_mode_bdw_plus.inl index da620ba284..4fa7108aff 100644 --- a/shared/source/command_container/encode_compute_mode_bdw_plus.inl +++ b/shared/source/command_container/encode_compute_mode_bdw_plus.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Intel Corporation + * Copyright (C) 2020-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -11,7 +11,7 @@ namespace NEO { template -void EncodeStates::adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency) { +void EncodeStates::adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) { } template @@ -31,4 +31,4 @@ bool EncodeSetMMIO::isRemapApplicable(uint32_t offset) { return false; } -} // namespace NEO \ No newline at end of file +} // namespace NEO diff --git a/shared/source/command_container/encode_compute_mode_tgllp_plus.inl b/shared/source/command_container/encode_compute_mode_tgllp_plus.inl index b33d58c3c7..a243de344e 100644 --- a/shared/source/command_container/encode_compute_mode_tgllp_plus.inl +++ b/shared/source/command_container/encode_compute_mode_tgllp_plus.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020 Intel Corporation + * Copyright (C) 2020-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -11,7 +11,8 @@ namespace NEO { template -void EncodeStates::adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency) { +void EncodeStates::adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, + bool isMultiOsContextCapable, bool requiresCoherency, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) { using STATE_COMPUTE_MODE = typename Family::STATE_COMPUTE_MODE; using FORCE_NON_COHERENT = typename STATE_COMPUTE_MODE::FORCE_NON_COHERENT; STATE_COMPUTE_MODE stateComputeMode = (stateComputeModePtr != nullptr) ? *(static_cast(stateComputeModePtr)) : Family::cmdInitStateComputeMode; @@ -20,7 +21,7 @@ void EncodeStates::adjustStateComputeMode(LinearStream &csr, uint32_t nu stateComputeMode.setMaskBits(stateComputeMode.getMaskBits() | Family::stateComputeModeForceNonCoherentMask); - EncodeComputeMode::adjustComputeMode(csr, numGrfRequired, &stateComputeMode, isMultiOsContextCapable); + EncodeComputeMode::adjustComputeMode(csr, numGrfRequired, &stateComputeMode, isMultiOsContextCapable, useGlobalAtomics, areMultipleSubDevicesInContext); } template @@ -52,4 +53,4 @@ bool EncodeSetMMIO::isRemapApplicable(uint32_t offset) { (0x4400 <= offset && offset <= 0x441f); } -} // namespace NEO \ No newline at end of file +} // namespace NEO diff --git a/shared/source/command_stream/command_stream_receiver_hw_tgllp_plus.inl b/shared/source/command_stream/command_stream_receiver_hw_tgllp_plus.inl index 5682ed6dfa..fd07d049f3 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_tgllp_plus.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_tgllp_plus.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -21,7 +21,8 @@ void CommandStreamReceiverHw::programComputeMode(LinearStream &stream auto stateComputeMode = GfxFamily::cmdInitStateComputeMode; adjustThreadArbitionPolicy(&stateComputeMode); - EncodeStates::adjustStateComputeMode(stream, dispatchFlags.numGrfRequired, &stateComputeMode, isMultiOsContextCapable(), dispatchFlags.requiresCoherency); + EncodeStates::adjustStateComputeMode(stream, dispatchFlags.numGrfRequired, &stateComputeMode, isMultiOsContextCapable(), dispatchFlags.requiresCoherency, + dispatchFlags.useGlobalAtomics, dispatchFlags.areMultipleSubDevicesInContext); if (csrSizeRequestFlags.hasSharedHandles) { auto pc = stream.getSpaceForCmd(); diff --git a/shared/source/gen12lp/command_encoder_gen12lp.cpp b/shared/source/gen12lp/command_encoder_gen12lp.cpp index f2c0f25117..2227339704 100644 --- a/shared/source/gen12lp/command_encoder_gen12lp.cpp +++ b/shared/source/gen12lp/command_encoder_gen12lp.cpp @@ -35,7 +35,8 @@ size_t EncodeStates::getAdjustStateComputeModeSize() { } template <> -void EncodeComputeMode::adjustComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable) { +void EncodeComputeMode::adjustComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, + bool isMultiOsContextCapable, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) { STATE_COMPUTE_MODE *stateComputeMode = static_cast(stateComputeModePtr); auto buffer = csr.getSpace(sizeof(STATE_COMPUTE_MODE)); *reinterpret_cast(buffer) = *stateComputeMode; diff --git a/shared/source/kernel/dispatch_kernel_encoder_interface.h b/shared/source/kernel/dispatch_kernel_encoder_interface.h index 69012ba393..d988217b5f 100644 --- a/shared/source/kernel/dispatch_kernel_encoder_interface.h +++ b/shared/source/kernel/dispatch_kernel_encoder_interface.h @@ -1,11 +1,12 @@ /* - * Copyright (C) 2020 Intel Corporation + * Copyright (C) 2020-2021 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once +#include #include namespace NEO { diff --git a/shared/test/common/gen12lp/test_command_encoder_gen12lp.cpp b/shared/test/common/gen12lp/test_command_encoder_gen12lp.cpp index 90e37d3366..83605b3e6c 100644 --- a/shared/test/common/gen12lp/test_command_encoder_gen12lp.cpp +++ b/shared/test/common/gen12lp/test_command_encoder_gen12lp.cpp @@ -32,7 +32,7 @@ GEN12LPTEST_F(CommandEncoderTest, givenAdjustStateComputeModeStateComputeModeSho auto usedSpaceBefore = cmdContainer.getCommandStream()->getUsed(); // Adjust the State Compute Mode which sets FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT - EncodeStates::adjustStateComputeMode(*cmdContainer.getCommandStream(), cmdContainer.lastSentNumGrfRequired, nullptr, false, false); + EncodeStates::adjustStateComputeMode(*cmdContainer.getCommandStream(), cmdContainer.lastSentNumGrfRequired, nullptr, false, false, false, false); auto usedSpaceAfter = cmdContainer.getCommandStream()->getUsed(); ASSERT_GT(usedSpaceAfter, usedSpaceBefore); diff --git a/shared/test/common/mocks/mock_dispatch_kernel_encoder_interface.h b/shared/test/common/mocks/mock_dispatch_kernel_encoder_interface.h index 6d8c3c8fb0..76beeee3a0 100644 --- a/shared/test/common/mocks/mock_dispatch_kernel_encoder_interface.h +++ b/shared/test/common/mocks/mock_dispatch_kernel_encoder_interface.h @@ -52,7 +52,6 @@ struct MockDispatchKernelEncoder : public DispatchKernelEncoderI { uint32_t getNumThreadsPerThreadGroup() const override { return 1; } - void expectAnyMockFunctionCall(); ::testing::NiceMock mockAllocation;