Improve scratch allocation size calculation

Change-Id: I627bea89ce31e7110976cb88f9e9266e08af590a
This commit is contained in:
Dunajski, Bartosz
2018-06-06 08:45:45 +02:00
committed by sys_ocldev
parent e6a9d30951
commit bd16f4bf2b
7 changed files with 44 additions and 38 deletions

View File

@@ -221,7 +221,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
csrSizeRequestFlags.preemptionRequestChanged = this->lastPreemptionMode != dispatchFlags.preemptionMode;
csrSizeRequestFlags.mediaSamplerConfigChanged = this->lastMediaSamplerConfig != static_cast<int8_t>(dispatchFlags.mediaSamplerRequired);
size_t requiredScratchSizeInBytes = requiredScratchSize * (hwInfo.pSysInfo->MaxSubSlicesSupported * hwInfo.pSysInfo->MaxEuPerSubSlice * hwInfo.pSysInfo->ThreadCount / hwInfo.pSysInfo->EUCount);
size_t requiredScratchSizeInBytes = requiredScratchSize * device->getDeviceInfo().computeUnitsUsedForScratch;
auto force32BitAllocations = getMemoryManager()->peekForce32BitAllocations();
@@ -731,5 +731,4 @@ void CommandStreamReceiverHw<GfxFamily>::resetKmdNotifyHelper(KmdNotifyHelper *n
// Hook for applying a "clear SLM" workaround to the given PIPE_CONTROL command.
// This template definition has an empty body, so pCmd is left untouched.
// NOTE(review): presumably hardware families that need the workaround provide
// their own specialization elsewhere - confirm.
template <typename GfxFamily>
void CommandStreamReceiverHw<GfxFamily>::addClearSLMWorkAround(typename GfxFamily::PIPE_CONTROL *pCmd) {
}
} // namespace OCLRT

View File

@@ -275,20 +275,16 @@ void Device::initializeCaps() {
deviceInfo.numThreadsPerEU = 0;
auto simdSizeUsed = DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get() ? 32 : 8;
if (systemInfo.EUCount > 0) {
deviceInfo.maxNumEUsPerSubSlice = (systemInfo.EuCountPerPoolMin == 0 || hwInfo.pSkuTable->ftrPooledEuEnabled == 0)
? (systemInfo.EUCount / systemInfo.SubSliceCount)
: systemInfo.EuCountPerPoolMin;
deviceInfo.numThreadsPerEU = systemInfo.ThreadCount / systemInfo.EUCount;
auto maxWkgSize = DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get() ? 1024u : 256u;
auto maxWS = deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU * simdSizeUsed;
deviceInfo.maxNumEUsPerSubSlice = (systemInfo.EuCountPerPoolMin == 0 || hwInfo.pSkuTable->ftrPooledEuEnabled == 0)
? (systemInfo.EUCount / systemInfo.SubSliceCount)
: systemInfo.EuCountPerPoolMin;
deviceInfo.numThreadsPerEU = systemInfo.ThreadCount / systemInfo.EUCount;
auto maxWkgSize = DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get() ? 1024u : 256u;
auto maxWS = deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU * simdSizeUsed;
maxWS = Math::prevPowerOfTwo(uint32_t(maxWS));
deviceInfo.maxWorkGroupSize = std::min(uint32_t(maxWS), maxWkgSize);
maxWS = Math::prevPowerOfTwo(uint32_t(maxWS));
deviceInfo.maxWorkGroupSize = std::min(uint32_t(maxWS), maxWkgSize);
} else {
//default value if systemInfo not provided
deviceInfo.maxWorkGroupSize = 128;
}
DEBUG_BREAK_IF(!DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get() && deviceInfo.maxWorkGroupSize > 256);
// calculate a maximum number of subgroups in a workgroup (for the required SIMD size)
@@ -310,9 +306,7 @@ void Device::initializeCaps() {
systemInfo.MaxSlicesSupported,
systemInfo.MaxSubSlicesSupported);
if (systemInfo.EUCount > 0) {
deviceInfo.computeUnitsUsedForScratch = systemInfo.MaxSubSlicesSupported * systemInfo.MaxEuPerSubSlice * systemInfo.ThreadCount / systemInfo.EUCount;
}
deviceInfo.computeUnitsUsedForScratch = hwHelper.getComputeUnitsUsedForScratch(&hwInfo);
printDebugString(DebugManager.flags.PrintDebugMessages.get(), stderr, "computeUnitsUsedForScratch: %d\n", deviceInfo.computeUnitsUsedForScratch);

View File

@@ -39,6 +39,7 @@ class HwHelper {
virtual uint32_t getBindingTableStateAlignement() const = 0;
virtual size_t getInterfaceDescriptorDataSize() const = 0;
virtual size_t getMaxBarrierRegisterPerSlice() const = 0;
// Returns the compute-unit count derived from pHwInfo that is stored in
// deviceInfo.computeUnitsUsedForScratch and multiplied by the per-unit
// scratch size to obtain the required scratch size in bytes.
virtual uint32_t getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const = 0;
virtual void setCapabilityCoherencyFlag(const HardwareInfo *pHwInfo, bool &coherencyFlag) = 0;
virtual bool setupPreemptionRegisters(HardwareInfo *pHwInfo, bool enable) = 0;
virtual void adjustDefaultEngineType(HardwareInfo *pHwInfo) = 0;
@@ -81,6 +82,8 @@ class HwHelperHw : public HwHelper {
size_t getMaxBarrierRegisterPerSlice() const override;
// Computed from the MaxSubSlicesSupported, MaxEuPerSubSlice, ThreadCount and
// EUCount fields of pHwInfo->pSysInfo.
uint32_t getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const override;
void setCapabilityCoherencyFlag(const HardwareInfo *pHwInfo, bool &coherencyFlag) override;
bool setupPreemptionRegisters(HardwareInfo *pHwInfo, bool enable) override;

View File

@@ -39,6 +39,12 @@ void HwHelperHw<Family>::setupHardwareCapabilities(HardwareCapabilities *caps) {
caps->image3DMaxWidth = 16384;
}
// Returns the number of compute units that scratch space has to be sized for:
//   MaxSubSlicesSupported * MaxEuPerSubSlice * ThreadCount / EUCount
// evaluated left to right with integer arithmetic, using the values found in
// pHwInfo->pSysInfo.
//
// pHwInfo and pHwInfo->pSysInfo are dereferenced unconditionally and must be
// valid. When EUCount is 0 the function returns 0 instead of dividing by zero.
template <typename Family>
uint32_t HwHelperHw<Family>::getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const {
    const auto &sysInfo = *pHwInfo->pSysInfo;
    if (sysInfo.EUCount == 0) {
        // No EUs reported (system info not provided): integer division by
        // zero is undefined behavior, so report zero compute units instead.
        return 0;
    }
    return sysInfo.MaxSubSlicesSupported * sysInfo.MaxEuPerSubSlice *
           sysInfo.ThreadCount / sysInfo.EUCount;
}
template <typename Family>
SipKernelType HwHelperHw<Family>::getSipKernelType(bool debuggingActive) {
if (!debuggingActive) {