From 1a80ab25897cc30cb53686524f602fd7399b1bb1 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Fri, 16 Jun 2023 13:33:32 +0000 Subject: [PATCH] performance: store values instead of calling virtual methods in command list Related-To: NEO-7828 Signed-off-by: Zbigniew Zdanowicz --- level_zero/core/source/cmdlist/cmdlist.h | 2 ++ level_zero/core/source/cmdlist/cmdlist_hw.inl | 30 ++++++++++--------- .../cmdlist/cmdlist_hw_skl_to_tgllp.inl | 4 +-- level_zero/core/source/device/device_imp.cpp | 3 +- level_zero/core/source/device/device_imp.h | 1 + 5 files changed, 23 insertions(+), 17 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 3bb60ff1c0..e15feaedd8 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -382,6 +382,7 @@ struct CommandList : _ze_command_list_handle_t { size_t minimalSizeForBcsSplit = 4 * MemoryConstants::megaByte; size_t cmdListCurrentStartOffset = 0; + size_t maxFillPaternSizeForCopyEngine = 0; unsigned long numThreads = 1u; @@ -423,6 +424,7 @@ struct CommandList : _ze_command_list_handle_t { bool kernelWithAssertAppended = false; bool dispatchCmdListBatchBufferAsPrimary = false; bool copyThroughLockedPtrEnabled = false; + bool useOnlyGlobalTimestamps = false; }; using CommandListAllocatorFn = CommandList *(*)(uint32_t); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 75c0e03cbc..db9ca2cea7 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -150,8 +150,10 @@ ze_result_t CommandListCoreFamily::initialize(Device *device, NEO this->flags = flags; auto &hwInfo = device->getHwInfo(); - auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment(); + auto neoDevice = device->getNEODevice(); + auto &rootDeviceEnvironment = neoDevice->getRootDeviceEnvironment(); auto &productHelper = 
rootDeviceEnvironment.getHelper(); + auto &gfxCoreHelper = neoDevice->getGfxCoreHelper(); auto gmmHelper = rootDeviceEnvironment.getGmmHelper(); this->dcFlushSupport = NEO::MemorySynchronizationCommands::getDcFlushEnable(true, rootDeviceEnvironment); @@ -168,8 +170,10 @@ ze_result_t CommandListCoreFamily::initialize(Device *device, NEO this->defaultMocsIndex = (gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER) >> 1); this->l1CachePolicyData.init(productHelper); this->cmdListHeapAddressModel = L0GfxCoreHelper::getHeapAddressModel(rootDeviceEnvironment); - this->dummyBlitWa.rootDeviceEnvironment = &(device->getNEODevice()->getRootDeviceEnvironmentRef()); + this->dummyBlitWa.rootDeviceEnvironment = &(neoDevice->getRootDeviceEnvironmentRef()); this->dispatchCmdListBatchBufferAsPrimary = L0GfxCoreHelper::dispatchCmdListBatchBufferAsPrimary(rootDeviceEnvironment, this->cmdListType == CommandListType::TYPE_REGULAR); + this->useOnlyGlobalTimestamps = gfxCoreHelper.useOnlyGlobalTimestamps(); + this->maxFillPaternSizeForCopyEngine = gfxCoreHelper.getMaxFillPaternSizeForCopyEngine(); this->requiredStreamState.initSupport(rootDeviceEnvironment); this->finalStreamState.initSupport(rootDeviceEnvironment); @@ -181,7 +185,7 @@ ze_result_t CommandListCoreFamily::initialize(Device *device, NEO this->commandContainer.setUsingPrimaryBuffer(this->dispatchCmdListBatchBufferAsPrimary); if (device->isImplicitScalingCapable() && !this->internalUsage && !isCopyOnly()) { - this->partitionCount = static_cast(this->device->getNEODevice()->getDeviceBitfield().count()); + this->partitionCount = static_cast(neoDevice->getDeviceBitfield().count()); } if (this->isFlushTaskSubmissionEnabled) { @@ -201,8 +205,8 @@ ze_result_t CommandListCoreFamily::initialize(Device *device, NEO !device->isImplicitScalingCapable() && this->csr && this->csr->isAnyDirectSubmissionEnabled() && - !deviceImp->getNEODevice()->getExecutionEnvironment()->areMetricsEnabled() && - 
deviceImp->getNEODevice()->getMemoryManager()->isLocalMemorySupported(deviceImp->getRootDeviceIndex()); + !neoDevice->getExecutionEnvironment()->areMetricsEnabled() && + neoDevice->getMemoryManager()->isLocalMemorySupported(neoDevice->getRootDeviceIndex()); if (NEO::DebugManager.flags.DirectSubmissionFlatRingBuffer.get() != -1) { createSecondaryCmdBufferInHostMem &= !!NEO::DebugManager.flags.DirectSubmissionFlatRingBuffer.get(); @@ -1891,9 +1895,7 @@ ze_result_t CommandListCoreFamily::appendBlitFill(void *ptr, Event *signalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) { - auto neoDevice = device->getNEODevice(); - auto &gfxCoreHelper = neoDevice->getGfxCoreHelper(); - if (gfxCoreHelper.getMaxFillPaternSizeForCopyEngine() < patternSize) { + if (this->maxFillPaternSizeForCopyEngine < patternSize) { return ZE_RESULT_ERROR_INVALID_SIZE; } else { ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents, relaxedOrderingDispatch, false); @@ -1901,6 +1903,7 @@ ze_result_t CommandListCoreFamily::appendBlitFill(void *ptr, return ret; } + auto neoDevice = device->getNEODevice(); appendEventForProfiling(signalEvent, true); NEO::GraphicsAllocation *gpuAllocation = device->getDriverHandle()->getDriverSystemMemoryAllocation(ptr, size, @@ -2460,13 +2463,12 @@ ze_result_t CommandListCoreFamily::appendQueryKernelTimestamps( UNRECOVERABLE_IF(!result); Kernel *builtinKernel = nullptr; - auto &gfxCoreHelper = device->getGfxCoreHelper(); - auto useOnlyGlobalTimestamps = gfxCoreHelper.useOnlyGlobalTimestamps() ? 1u : 0u; + auto useOnlyGlobalTimestampsValue = this->useOnlyGlobalTimestamps ? 
1u : 0u; auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership(); if (pOffsets == nullptr) { builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::QueryKernelTimestamps); - builtinKernel->setArgumentValue(2u, sizeof(uint32_t), &useOnlyGlobalTimestamps); + builtinKernel->setArgumentValue(2u, sizeof(uint32_t), &useOnlyGlobalTimestampsValue); } else { auto pOffsetAllocationStruct = getAlignedAllocationData(this->device, pOffsets, sizeof(size_t) * numEvents, false); if (pOffsetAllocationStruct.alloc == nullptr) { @@ -2476,7 +2478,7 @@ ze_result_t CommandListCoreFamily::appendQueryKernelTimestamps( commandContainer.addToResidencyContainer(pOffsetAllocationStruct.alloc); builtinKernel = device->getBuiltinFunctionsLib()->getFunction(Builtin::QueryKernelTimestampsWithOffsets); builtinKernel->setArgBufferWithAlloc(2, offsetValPtr, pOffsetAllocationStruct.alloc, nullptr); - builtinKernel->setArgumentValue(3u, sizeof(uint32_t), &useOnlyGlobalTimestamps); + builtinKernel->setArgumentValue(3u, sizeof(uint32_t), &useOnlyGlobalTimestampsValue); offsetValPtr += sizeof(size_t); } @@ -2577,8 +2579,8 @@ inline bool getFusedEuDisabled(Kernel &kernel, Device *device, const ze_group_co auto &kernelAttributes = kernel.getKernelDescriptor().kernelAttributes; bool fusedEuDisabled = kernelAttributes.flags.requiresDisabledEUFusion; - auto &productHelper = device->getProductHelper(); - if (productHelper.isCalculationForDisablingEuFusionWithDpasNeeded(device->getHwInfo())) { + if (static_cast(device)->calculationForDisablingEuFusionWithDpasNeeded) { + auto &productHelper = device->getProductHelper(); if (threadGroupDimensions) { uint32_t *groupCountPtr = nullptr; uint32_t groupCount[3] = {}; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl index 5a56f1c7a3..61a0e47ac7 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl +++ 
b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl @@ -31,8 +31,8 @@ struct DeviceImp; template size_t CommandListCoreFamily::getReserveSshSize() { - auto &gfxCoreHelper = device->getGfxCoreHelper(); - return gfxCoreHelper.getRenderSurfaceStateSize(); + using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE; + return sizeof(RENDER_SURFACE_STATE); } template diff --git a/level_zero/core/source/device/device_imp.cpp b/level_zero/core/source/device/device_imp.cpp index 153e9288e2..d9fd8983cf 100644 --- a/level_zero/core/source/device/device_imp.cpp +++ b/level_zero/core/source/device/device_imp.cpp @@ -1252,7 +1252,8 @@ Device *Device::create(DriverHandle *driverHandle, NEO::Device *neoDevice, bool device->resourcesReleased = false; device->populateSubDeviceCopyEngineGroups(); - + auto &productHelper = device->getProductHelper(); + device->calculationForDisablingEuFusionWithDpasNeeded = productHelper.isCalculationForDisablingEuFusionWithDpasNeeded(hwInfo); return device; } diff --git a/level_zero/core/source/device/device_imp.h b/level_zero/core/source/device/device_imp.h index e2b83ed0bb..46093ee2e4 100644 --- a/level_zero/core/source/device/device_imp.h +++ b/level_zero/core/source/device/device_imp.h @@ -132,6 +132,7 @@ struct DeviceImp : public Device { BcsSplit bcsSplit; bool resourcesReleased = false; + bool calculationForDisablingEuFusionWithDpasNeeded = false; void releaseResources(); NEO::SVMAllocsManager::MapBasedAllocationTracker peerAllocations;