Improve scratch allocation size calculation

Change-Id: I627bea89ce31e7110976cb88f9e9266e08af590a
2026-01-05 09:09:04 +08:00 · 2018-06-06 08:45:45 +02:00
parent e6a9d30951
commit bd16f4bf2b
7 changed files with 44 additions and 38 deletions
--- a/runtime/command_stream/command_stream_receiver_hw.inl
+++ b/runtime/command_stream/command_stream_receiver_hw.inl
@@ -221,7 +221,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
    csrSizeRequestFlags.preemptionRequestChanged = this->lastPreemptionMode != dispatchFlags.preemptionMode;
    csrSizeRequestFlags.mediaSamplerConfigChanged = this->lastMediaSamplerConfig != static_cast<int8_t>(dispatchFlags.mediaSamplerRequired);

-    size_t requiredScratchSizeInBytes = requiredScratchSize * (hwInfo.pSysInfo->MaxSubSlicesSupported * hwInfo.pSysInfo->MaxEuPerSubSlice * hwInfo.pSysInfo->ThreadCount / hwInfo.pSysInfo->EUCount);
+    size_t requiredScratchSizeInBytes = requiredScratchSize * device->getDeviceInfo().computeUnitsUsedForScratch;

    auto force32BitAllocations = getMemoryManager()->peekForce32BitAllocations();

@@ -731,5 +731,4 @@ void CommandStreamReceiverHw<GfxFamily>::resetKmdNotifyHelper(KmdNotifyHelper *n
 template <typename GfxFamily>
 void CommandStreamReceiverHw<GfxFamily>::addClearSLMWorkAround(typename GfxFamily::PIPE_CONTROL *pCmd) {
 }
-
 } // namespace OCLRT
--- a/runtime/device/device_caps.cpp
+++ b/runtime/device/device_caps.cpp
@@ -275,20 +275,16 @@ void Device::initializeCaps() {
    deviceInfo.numThreadsPerEU = 0;
    auto simdSizeUsed = DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get() ? 32 : 8;

-    if (systemInfo.EUCount > 0) {
-        deviceInfo.maxNumEUsPerSubSlice = (systemInfo.EuCountPerPoolMin == 0 || hwInfo.pSkuTable->ftrPooledEuEnabled == 0)
-                                              ? (systemInfo.EUCount / systemInfo.SubSliceCount)
-                                              : systemInfo.EuCountPerPoolMin;
-        deviceInfo.numThreadsPerEU = systemInfo.ThreadCount / systemInfo.EUCount;
-        auto maxWkgSize = DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get() ? 1024u : 256u;
-        auto maxWS = deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU * simdSizeUsed;
+    deviceInfo.maxNumEUsPerSubSlice = (systemInfo.EuCountPerPoolMin == 0 || hwInfo.pSkuTable->ftrPooledEuEnabled == 0)
+                                          ? (systemInfo.EUCount / systemInfo.SubSliceCount)
+                                          : systemInfo.EuCountPerPoolMin;
+    deviceInfo.numThreadsPerEU = systemInfo.ThreadCount / systemInfo.EUCount;
+    auto maxWkgSize = DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get() ? 1024u : 256u;
+    auto maxWS = deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU * simdSizeUsed;
+
+    maxWS = Math::prevPowerOfTwo(uint32_t(maxWS));
+    deviceInfo.maxWorkGroupSize = std::min(uint32_t(maxWS), maxWkgSize);

-        maxWS = Math::prevPowerOfTwo(uint32_t(maxWS));
-        deviceInfo.maxWorkGroupSize = std::min(uint32_t(maxWS), maxWkgSize);
-    } else {
-        //default value if systemInfo not provided
-        deviceInfo.maxWorkGroupSize = 128;
-    }
    DEBUG_BREAK_IF(!DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get() && deviceInfo.maxWorkGroupSize > 256);

    // calculate a maximum number of subgroups in a workgroup (for the required SIMD size)
@@ -310,9 +306,7 @@ void Device::initializeCaps() {
                     systemInfo.MaxSlicesSupported,
                     systemInfo.MaxSubSlicesSupported);

-    if (systemInfo.EUCount > 0) {
-        deviceInfo.computeUnitsUsedForScratch = systemInfo.MaxSubSlicesSupported * systemInfo.MaxEuPerSubSlice * systemInfo.ThreadCount / systemInfo.EUCount;
-    }
+    deviceInfo.computeUnitsUsedForScratch = hwHelper.getComputeUnitsUsedForScratch(&hwInfo);

    printDebugString(DebugManager.flags.PrintDebugMessages.get(), stderr, "computeUnitsUsedForScratch: %d\n", deviceInfo.computeUnitsUsedForScratch);

--- a/runtime/helpers/hw_helper.h
+++ b/runtime/helpers/hw_helper.h
@@ -39,6 +39,7 @@ class HwHelper {
    virtual uint32_t getBindingTableStateAlignement() const = 0;
    virtual size_t getInterfaceDescriptorDataSize() const = 0;
    virtual size_t getMaxBarrierRegisterPerSlice() const = 0;
+    virtual uint32_t getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const = 0;
    virtual void setCapabilityCoherencyFlag(const HardwareInfo *pHwInfo, bool &coherencyFlag) = 0;
    virtual bool setupPreemptionRegisters(HardwareInfo *pHwInfo, bool enable) = 0;
    virtual void adjustDefaultEngineType(HardwareInfo *pHwInfo) = 0;
@@ -81,6 +82,8 @@ class HwHelperHw : public HwHelper {

    size_t getMaxBarrierRegisterPerSlice() const override;

+    uint32_t getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const override;
+
    void setCapabilityCoherencyFlag(const HardwareInfo *pHwInfo, bool &coherencyFlag) override;

    bool setupPreemptionRegisters(HardwareInfo *pHwInfo, bool enable) override;
--- a/runtime/helpers/hw_helper.inl
+++ b/runtime/helpers/hw_helper.inl
@@ -39,6 +39,12 @@ void HwHelperHw<Family>::setupHardwareCapabilities(HardwareCapabilities *caps) {
    caps->image3DMaxWidth = 16384;
 }

+template <typename Family> // Scratch compute units: MaxSubSlicesSupported * MaxEuPerSubSlice * (ThreadCount / EUCount).
+uint32_t HwHelperHw<Family>::getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const {
+    const auto &sysInfo = *pHwInfo->pSysInfo; // EUCount is 0 when system info is not provided; never divide by it.
+    return sysInfo.EUCount == 0 ? 0 : sysInfo.MaxSubSlicesSupported * sysInfo.MaxEuPerSubSlice * sysInfo.ThreadCount / sysInfo.EUCount;
+}
+
 template <typename Family>
 SipKernelType HwHelperHw<Family>::getSipKernelType(bool debuggingActive) {
    if (!debuggingActive) {