From bd32518d3119466fe91a2a694f3a0d66f7715802 Mon Sep 17 00:00:00 2001
From: Igor Venevtsev <igor.venevtsev@intel.com>
Date: Mon, 29 Mar 2021 17:47:53 +0000
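Subject: [PATCH] Add extra parameters to
 EncodeComputeMode::adjustComputeMode() method

Extend EncodeStates::adjustStateComputeMode() and
EncodeComputeMode::adjustComputeMode() with two new boolean parameters,
useGlobalAtomics and areMultipleSubDevicesInContext, and update the
callers touched by this patch to supply them.

Related changes:
 * patchWithImplicitSurface() takes a useGlobalAtomics argument and
   forwards it to NEO::Buffer::setSurfaceState() in place of the
   previously hard-coded false.
 * CommandContainer gains a lastSentUseGlobalAtomics member that is
   cleared in reset().

The bdw_plus adjustStateComputeMode() body stays empty, and for the
gen12lp adjustComputeMode() specialization only the signature changes
in this patch.

Signed-off-by: Igor Venevtsev <igor.venevtsev@intel.com>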
---
 level_zero/core/source/kernel/kernel_imp.cpp      | 15 +++++++++------
 shared/source/command_container/cmdcontainer.cpp  |  1 +
 shared/source/command_container/cmdcontainer.h    |  1 +
 shared/source/command_container/command_encoder.h |  5 +++--
 .../command_encoder_bdw_plus.inl                  |  3 ++-
 .../encode_compute_mode_bdw_plus.inl              |  6 +++---
 .../encode_compute_mode_tgllp_plus.inl            |  9 +++++----
 .../command_stream_receiver_hw_tgllp_plus.inl     |  5 +++--
 shared/source/gen12lp/command_encoder_gen12lp.cpp |  3 ++-
 .../kernel/dispatch_kernel_encoder_interface.h    |  3 ++-
 .../gen12lp/test_command_encoder_gen12lp.cpp      |  2 +-
 .../mock_dispatch_kernel_encoder_interface.h      |  1 -
 12 files changed, 32 insertions(+), 22 deletions(-)
diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp
index 760a9e52a5..e4b18eab42 100644
--- a/level_zero/core/source/kernel/kernel_imp.cpp
+++ b/level_zero/core/source/kernel/kernel_imp.cpp
@@ -78,7 +78,7 @@ KernelImmutableData::~KernelImmutableData() {
 
 inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef<uint8_t> surfaceStateHeap,
                                      uintptr_t ptrToPatchInCrossThreadData, NEO::GraphicsAllocation &allocation,
-                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device) {
+                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device, bool useGlobalAtomics) {
     if (false == crossThreadData.empty()) {
         NEO::patchPointer(crossThreadData, ptr, ptrToPatchInCrossThreadData);
     }
@@ -88,7 +88,7 @@ inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef
         void *addressToPatch = reinterpret_cast<void *>(allocation.getUnderlyingBuffer());
         size_t sizeToPatch = allocation.getUnderlyingBufferSize();
         NEO::Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0,
-                                     &allocation, 0, 0, false, device.getNumAvailableDevices() > 1);
+                                     &allocation, 0, 0, useGlobalAtomics, device.getNumAvailableDevices() > 1);
     }
 }
 
@@ -171,7 +171,8 @@ void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device
 
         patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                                  static_cast<uintptr_t>(globalConstBuffer->getGpuAddressToPatch()),
-                                 *globalConstBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress, *neoDevice);
+                                 *globalConstBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress,
+                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
         this->residencyContainer.push_back(globalConstBuffer);
     } else if (nullptr != globalConstBuffer) {
         this->residencyContainer.push_back(globalConstBuffer);
@@ -182,7 +183,8 @@ void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device *device
 
         patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                                  static_cast<uintptr_t>(globalVarBuffer->getGpuAddressToPatch()),
-                                 *globalVarBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress, *neoDevice);
+                                 *globalVarBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress,
+                                 *neoDevice, kernelDescriptor->kernelAttributes.flags.useGlobalAtomics);
         this->residencyContainer.push_back(globalVarBuffer);
     } else if (nullptr != globalVarBuffer) {
         this->residencyContainer.push_back(globalVarBuffer);
@@ -741,7 +743,8 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
 
         patchWithImplicitSurface(crossThredDataArrayRef, surfaceStateHeapArrayRef,
                                  static_cast<uintptr_t>(privateMemoryGraphicsAllocation->getGpuAddressToPatch()),
-                                 *privateMemoryGraphicsAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress, *neoDevice);
+                                 *privateMemoryGraphicsAllocation, kernelImmData->getDescriptor().payloadMappings.implicitArgs.privateMemoryAddress,
+                                 *neoDevice, kernelAttributes.flags.useGlobalAtomics);
 
         this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation);
     }
@@ -794,7 +797,7 @@ void KernelImp::setDebugSurface() {
         patchWithImplicitSurface(ArrayRef<uint8_t>(), surfaceStateHeapRef,
                                  0,
                                  *device->getDebugSurface(), this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.systemThreadSurfaceAddress,
-                                 *device->getNEODevice());
+                                 *device->getNEODevice(), getKernelDescriptor().kernelAttributes.flags.useGlobalAtomics);
     }
 }
 void *KernelImp::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) {
diff --git a/shared/source/command_container/cmdcontainer.cpp b/shared/source/command_container/cmdcontainer.cpp
index 52f5d64daa..47c7129b3a 100644
--- a/shared/source/command_container/cmdcontainer.cpp
+++ b/shared/source/command_container/cmdcontainer.cpp
@@ -148,6 +148,7 @@ void CommandContainer::reset() {
     nextIddInBlock = this->getNumIddPerBlock();
     lastSentNumGrfRequired = 0;
     lastPipelineSelectModeRequired = false;
+    lastSentUseGlobalAtomics = false;
 }
 
 void *CommandContainer::getHeapSpaceAllowGrow(HeapType heapType,
diff --git a/shared/source/command_container/cmdcontainer.h b/shared/source/command_container/cmdcontainer.h
index 73700985a0..b3458a2dcf 100644
--- a/shared/source/command_container/cmdcontainer.h
+++ b/shared/source/command_container/cmdcontainer.h
@@ -87,6 +87,7 @@ class CommandContainer : public NonCopyableOrMovableClass {
     uint32_t nextIddInBlock = 0;
     uint32_t lastSentNumGrfRequired = 0;
     bool lastPipelineSelectModeRequired = false;
+    bool lastSentUseGlobalAtomics = false;
 
     Device *getDevice() const { return device; }
 
diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h
index 9829069f80..11029bb250 100644
--- a/shared/source/command_container/command_encoder.h
+++ b/shared/source/command_container/command_encoder.h
@@ -98,7 +98,7 @@ struct EncodeStates {
                                      const void *fnDynamicStateHeap,
                                      BindlessHeapsHelper *bindlessHeapHelper);
 
-    static void adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency);
+    static void adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency, bool useGlobalAtomics, bool areMultipleSubDevicesInContext);
 
     static size_t getAdjustStateComputeModeSize();
 };
@@ -265,7 +265,8 @@ struct EncodeSurfaceState {
 template <typename GfxFamily>
 struct EncodeComputeMode {
     using STATE_COMPUTE_MODE = typename GfxFamily::STATE_COMPUTE_MODE;
-    static void adjustComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable);
+    static void adjustComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr,
+                                  bool isMultiOsContextCapable, bool useGlobalAtomics, bool areMultipleSubDevicesInContext);
 
     static void adjustPipelineSelect(CommandContainer &container, const NEO::KernelDescriptor &kernelDescriptor);
 };
diff --git a/shared/source/command_container/command_encoder_bdw_plus.inl b/shared/source/command_container/command_encoder_bdw_plus.inl
index 8c960d4b10..02603cd0c8 100644
--- a/shared/source/command_container/command_encoder_bdw_plus.inl
+++ b/shared/source/command_container/command_encoder_bdw_plus.inl
@@ -69,7 +69,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
     }
 
     EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), true);
-    EncodeStates<Family>::adjustStateComputeMode(*container.getCommandStream(), container.lastSentNumGrfRequired, nullptr, false, false);
+    EncodeStates<Family>::adjustStateComputeMode(*container.getCommandStream(), container.lastSentNumGrfRequired, nullptr, false, false,
+                                                 kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, device->getNumAvailableDevices() > 1);
     EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), false);
 
     auto numThreadsPerThreadGroup = dispatchInterface->getNumThreadsPerThreadGroup();
diff --git a/shared/source/command_container/encode_compute_mode_bdw_plus.inl b/shared/source/command_container/encode_compute_mode_bdw_plus.inl
index da620ba284..4fa7108aff 100644
--- a/shared/source/command_container/encode_compute_mode_bdw_plus.inl
+++ b/shared/source/command_container/encode_compute_mode_bdw_plus.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020 Intel Corporation
+ * Copyright (C) 2020-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -11,7 +11,7 @@
 
 namespace NEO {
 template <typename Family>
-void EncodeStates<Family>::adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency) {
+void EncodeStates<Family>::adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) {
 }
 
 template <typename Family>
@@ -31,4 +31,4 @@ bool EncodeSetMMIO<Family>::isRemapApplicable(uint32_t offset) {
     return false;
 }
 
-} // namespace NEO
\ No newline at end of file
+} // namespace NEO
diff --git a/shared/source/command_container/encode_compute_mode_tgllp_plus.inl b/shared/source/command_container/encode_compute_mode_tgllp_plus.inl
index b33d58c3c7..a243de344e 100644
--- a/shared/source/command_container/encode_compute_mode_tgllp_plus.inl
+++ b/shared/source/command_container/encode_compute_mode_tgllp_plus.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020 Intel Corporation
+ * Copyright (C) 2020-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -11,7 +11,8 @@
 
 namespace NEO {
 template <typename Family>
-void EncodeStates<Family>::adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable, bool requiresCoherency) {
+void EncodeStates<Family>::adjustStateComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr,
+                                                  bool isMultiOsContextCapable, bool requiresCoherency, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) {
     using STATE_COMPUTE_MODE = typename Family::STATE_COMPUTE_MODE;
     using FORCE_NON_COHERENT = typename STATE_COMPUTE_MODE::FORCE_NON_COHERENT;
     STATE_COMPUTE_MODE stateComputeMode = (stateComputeModePtr != nullptr) ? *(static_cast<STATE_COMPUTE_MODE *>(stateComputeModePtr)) : Family::cmdInitStateComputeMode;
@@ -20,7 +21,7 @@ void EncodeStates<Family>::adjustStateComputeMode(LinearStream &csr, uint32_t nu
 
     stateComputeMode.setMaskBits(stateComputeMode.getMaskBits() | Family::stateComputeModeForceNonCoherentMask);
 
-    EncodeComputeMode<Family>::adjustComputeMode(csr, numGrfRequired, &stateComputeMode, isMultiOsContextCapable);
+    EncodeComputeMode<Family>::adjustComputeMode(csr, numGrfRequired, &stateComputeMode, isMultiOsContextCapable, useGlobalAtomics, areMultipleSubDevicesInContext);
 }
 
 template <typename Family>
@@ -52,4 +53,4 @@ bool EncodeSetMMIO<Family>::isRemapApplicable(uint32_t offset) {
            (0x4400 <= offset && offset <= 0x441f);
 }
 
-} // namespace NEO
\ No newline at end of file
+} // namespace NEO
diff --git a/shared/source/command_stream/command_stream_receiver_hw_tgllp_plus.inl b/shared/source/command_stream/command_stream_receiver_hw_tgllp_plus.inl
index 5682ed6dfa..fd07d049f3 100644
--- a/shared/source/command_stream/command_stream_receiver_hw_tgllp_plus.inl
+++ b/shared/source/command_stream/command_stream_receiver_hw_tgllp_plus.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2020 Intel Corporation
+ * Copyright (C) 2018-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,8 @@ void CommandStreamReceiverHw<GfxFamily>::programComputeMode(LinearStream &stream
 
         auto stateComputeMode = GfxFamily::cmdInitStateComputeMode;
         adjustThreadArbitionPolicy(&stateComputeMode);
-        EncodeStates<GfxFamily>::adjustStateComputeMode(stream, dispatchFlags.numGrfRequired, &stateComputeMode, isMultiOsContextCapable(), dispatchFlags.requiresCoherency);
+        EncodeStates<GfxFamily>::adjustStateComputeMode(stream, dispatchFlags.numGrfRequired, &stateComputeMode, isMultiOsContextCapable(), dispatchFlags.requiresCoherency,
+                                                        dispatchFlags.useGlobalAtomics, dispatchFlags.areMultipleSubDevicesInContext);
 
         if (csrSizeRequestFlags.hasSharedHandles) {
             auto pc = stream.getSpaceForCmd<PIPE_CONTROL>();
diff --git a/shared/source/gen12lp/command_encoder_gen12lp.cpp b/shared/source/gen12lp/command_encoder_gen12lp.cpp
index f2c0f25117..2227339704 100644
--- a/shared/source/gen12lp/command_encoder_gen12lp.cpp
+++ b/shared/source/gen12lp/command_encoder_gen12lp.cpp
@@ -35,7 +35,8 @@ size_t EncodeStates<Family>::getAdjustStateComputeModeSize() {
 }
 
 template <>
-void EncodeComputeMode<Family>::adjustComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr, bool isMultiOsContextCapable) {
+void EncodeComputeMode<Family>::adjustComputeMode(LinearStream &csr, uint32_t numGrfRequired, void *const stateComputeModePtr,
+                                                  bool isMultiOsContextCapable, bool useGlobalAtomics, bool areMultipleSubDevicesInContext) {
     STATE_COMPUTE_MODE *stateComputeMode = static_cast<STATE_COMPUTE_MODE *>(stateComputeModePtr);
     auto buffer = csr.getSpace(sizeof(STATE_COMPUTE_MODE));
     *reinterpret_cast<STATE_COMPUTE_MODE *>(buffer) = *stateComputeMode;
diff --git a/shared/source/kernel/dispatch_kernel_encoder_interface.h b/shared/source/kernel/dispatch_kernel_encoder_interface.h
index 69012ba393..d988217b5f 100644
--- a/shared/source/kernel/dispatch_kernel_encoder_interface.h
+++ b/shared/source/kernel/dispatch_kernel_encoder_interface.h
@@ -1,11 +1,12 @@
 /*
- * Copyright (C) 2020 Intel Corporation
+ * Copyright (C) 2020-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
  */
 
 #pragma once
+#include <cstddef>
 #include <cstdint>
 
 namespace NEO {
diff --git a/shared/test/common/gen12lp/test_command_encoder_gen12lp.cpp b/shared/test/common/gen12lp/test_command_encoder_gen12lp.cpp
index 90e37d3366..83605b3e6c 100644
--- a/shared/test/common/gen12lp/test_command_encoder_gen12lp.cpp
+++ b/shared/test/common/gen12lp/test_command_encoder_gen12lp.cpp
@@ -32,7 +32,7 @@ GEN12LPTEST_F(CommandEncoderTest, givenAdjustStateComputeModeStateComputeModeSho
     auto usedSpaceBefore = cmdContainer.getCommandStream()->getUsed();
 
     // Adjust the State Compute Mode which sets FORCE_NON_COHERENT_FORCE_GPU_NON_COHERENT
-    EncodeStates<FamilyType>::adjustStateComputeMode(*cmdContainer.getCommandStream(), cmdContainer.lastSentNumGrfRequired, nullptr, false, false);
+    EncodeStates<FamilyType>::adjustStateComputeMode(*cmdContainer.getCommandStream(), cmdContainer.lastSentNumGrfRequired, nullptr, false, false, false, false);
 
     auto usedSpaceAfter = cmdContainer.getCommandStream()->getUsed();
     ASSERT_GT(usedSpaceAfter, usedSpaceBefore);
diff --git a/shared/test/common/mocks/mock_dispatch_kernel_encoder_interface.h b/shared/test/common/mocks/mock_dispatch_kernel_encoder_interface.h
index 6d8c3c8fb0..76beeee3a0 100644
--- a/shared/test/common/mocks/mock_dispatch_kernel_encoder_interface.h
+++ b/shared/test/common/mocks/mock_dispatch_kernel_encoder_interface.h
@@ -52,7 +52,6 @@ struct MockDispatchKernelEncoder : public DispatchKernelEncoderI {
     uint32_t getNumThreadsPerThreadGroup() const override {
         return 1;
     }
-
     void expectAnyMockFunctionCall();
 
     ::testing::NiceMock<MockGraphicsAllocation> mockAllocation;