mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-05 09:09:04 +08:00
Improve scratch allocation size calculation
Change-Id: I627bea89ce31e7110976cb88f9e9266e08af590a
This commit is contained in:
committed by
sys_ocldev
parent
e6a9d30951
commit
bd16f4bf2b
@@ -221,7 +221,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
|
||||
csrSizeRequestFlags.preemptionRequestChanged = this->lastPreemptionMode != dispatchFlags.preemptionMode;
|
||||
csrSizeRequestFlags.mediaSamplerConfigChanged = this->lastMediaSamplerConfig != static_cast<int8_t>(dispatchFlags.mediaSamplerRequired);
|
||||
|
||||
size_t requiredScratchSizeInBytes = requiredScratchSize * (hwInfo.pSysInfo->MaxSubSlicesSupported * hwInfo.pSysInfo->MaxEuPerSubSlice * hwInfo.pSysInfo->ThreadCount / hwInfo.pSysInfo->EUCount);
|
||||
size_t requiredScratchSizeInBytes = requiredScratchSize * device->getDeviceInfo().computeUnitsUsedForScratch;
|
||||
|
||||
auto force32BitAllocations = getMemoryManager()->peekForce32BitAllocations();
|
||||
|
||||
@@ -731,5 +731,4 @@ void CommandStreamReceiverHw<GfxFamily>::resetKmdNotifyHelper(KmdNotifyHelper *n
|
||||
// Generic definition of the clear-SLM workaround hook.
// The body is empty, so this definition performs no work and leaves pCmd untouched.
// NOTE(review): presumably family-specific specializations supply the real workaround - confirm.
template <typename GfxFamily>
|
||||
void CommandStreamReceiverHw<GfxFamily>::addClearSLMWorkAround(typename GfxFamily::PIPE_CONTROL *pCmd) {
|
||||
}
|
||||
|
||||
} // namespace OCLRT
|
||||
|
||||
@@ -275,20 +275,16 @@ void Device::initializeCaps() {
|
||||
deviceInfo.numThreadsPerEU = 0;
|
||||
auto simdSizeUsed = DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get() ? 32 : 8;
|
||||
|
||||
if (systemInfo.EUCount > 0) {
|
||||
deviceInfo.maxNumEUsPerSubSlice = (systemInfo.EuCountPerPoolMin == 0 || hwInfo.pSkuTable->ftrPooledEuEnabled == 0)
|
||||
? (systemInfo.EUCount / systemInfo.SubSliceCount)
|
||||
: systemInfo.EuCountPerPoolMin;
|
||||
deviceInfo.numThreadsPerEU = systemInfo.ThreadCount / systemInfo.EUCount;
|
||||
auto maxWkgSize = DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get() ? 1024u : 256u;
|
||||
auto maxWS = deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU * simdSizeUsed;
|
||||
deviceInfo.maxNumEUsPerSubSlice = (systemInfo.EuCountPerPoolMin == 0 || hwInfo.pSkuTable->ftrPooledEuEnabled == 0)
|
||||
? (systemInfo.EUCount / systemInfo.SubSliceCount)
|
||||
: systemInfo.EuCountPerPoolMin;
|
||||
deviceInfo.numThreadsPerEU = systemInfo.ThreadCount / systemInfo.EUCount;
|
||||
auto maxWkgSize = DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get() ? 1024u : 256u;
|
||||
auto maxWS = deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU * simdSizeUsed;
|
||||
|
||||
maxWS = Math::prevPowerOfTwo(uint32_t(maxWS));
|
||||
deviceInfo.maxWorkGroupSize = std::min(uint32_t(maxWS), maxWkgSize);
|
||||
|
||||
maxWS = Math::prevPowerOfTwo(uint32_t(maxWS));
|
||||
deviceInfo.maxWorkGroupSize = std::min(uint32_t(maxWS), maxWkgSize);
|
||||
} else {
|
||||
//default value if systemInfo not provided
|
||||
deviceInfo.maxWorkGroupSize = 128;
|
||||
}
|
||||
DEBUG_BREAK_IF(!DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get() && deviceInfo.maxWorkGroupSize > 256);
|
||||
|
||||
// calculate a maximum number of subgroups in a workgroup (for the required SIMD size)
|
||||
@@ -310,9 +306,7 @@ void Device::initializeCaps() {
|
||||
systemInfo.MaxSlicesSupported,
|
||||
systemInfo.MaxSubSlicesSupported);
|
||||
|
||||
if (systemInfo.EUCount > 0) {
|
||||
deviceInfo.computeUnitsUsedForScratch = systemInfo.MaxSubSlicesSupported * systemInfo.MaxEuPerSubSlice * systemInfo.ThreadCount / systemInfo.EUCount;
|
||||
}
|
||||
deviceInfo.computeUnitsUsedForScratch = hwHelper.getComputeUnitsUsedForScratch(&hwInfo);
|
||||
|
||||
printDebugString(DebugManager.flags.PrintDebugMessages.get(), stderr, "computeUnitsUsedForScratch: %d\n", deviceInfo.computeUnitsUsedForScratch);
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ class HwHelper {
|
||||
virtual uint32_t getBindingTableStateAlignement() const = 0;
|
||||
virtual size_t getInterfaceDescriptorDataSize() const = 0;
|
||||
virtual size_t getMaxBarrierRegisterPerSlice() const = 0;
|
||||
virtual uint32_t getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const = 0;
|
||||
virtual void setCapabilityCoherencyFlag(const HardwareInfo *pHwInfo, bool &coherencyFlag) = 0;
|
||||
virtual bool setupPreemptionRegisters(HardwareInfo *pHwInfo, bool enable) = 0;
|
||||
virtual void adjustDefaultEngineType(HardwareInfo *pHwInfo) = 0;
|
||||
@@ -81,6 +82,8 @@ class HwHelperHw : public HwHelper {
|
||||
|
||||
size_t getMaxBarrierRegisterPerSlice() const override;
|
||||
|
||||
uint32_t getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const override;
|
||||
|
||||
void setCapabilityCoherencyFlag(const HardwareInfo *pHwInfo, bool &coherencyFlag) override;
|
||||
|
||||
bool setupPreemptionRegisters(HardwareInfo *pHwInfo, bool enable) override;
|
||||
|
||||
@@ -39,6 +39,12 @@ void HwHelperHw<Family>::setupHardwareCapabilities(HardwareCapabilities *caps) {
|
||||
caps->image3DMaxWidth = 16384;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
uint32_t HwHelperHw<Family>::getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const {
|
||||
return pHwInfo->pSysInfo->MaxSubSlicesSupported * pHwInfo->pSysInfo->MaxEuPerSubSlice *
|
||||
pHwInfo->pSysInfo->ThreadCount / pHwInfo->pSysInfo->EUCount;
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
SipKernelType HwHelperHw<Family>::getSipKernelType(bool debuggingActive) {
|
||||
if (!debuggingActive) {
|
||||
|
||||
Reference in New Issue
Block a user