diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index ad5998a828..594504a0e6 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2164,7 +2164,8 @@ void CommandListCoreFamily::updateStreamProperties(Kernel &kernel } finalStreamState.frontEndState.setProperties(isCooperative, disableOverdispatch, false, hwInfo); - if (finalStreamState.frontEndState.isDirty()) { + bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get(); + if (finalStreamState.frontEndState.isDirty() && isPatchingVfeStateAllowed) { auto pVfeStateAddress = NEO::PreambleHelper::getSpaceForVfeState(commandContainer.getCommandStream(), hwInfo, engineGroupType); auto pVfeState = new VFE_STATE_TYPE; NEO::PreambleHelper::programVfeState(pVfeState, hwInfo, 0, 0, device->getMaxNumHwThreads(), finalStreamState); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl index e696e9f524..c18098d723 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl @@ -120,7 +120,8 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z this->indirectAllocationsAllowed = true; } - if ((!containsAnyKernel) || NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.get()) { + bool isMixingRegularAndCooperativeKernelsAllowed = NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.get(); + if ((!containsAnyKernel) || isMixingRegularAndCooperativeKernelsAllowed) { containsCooperativeKernelsFlag = (containsCooperativeKernelsFlag || isCooperative); } else if (containsCooperativeKernelsFlag != isCooperative) { return ZE_RESULT_ERROR_INVALID_ARGUMENT; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index f02d60d90b..e0002fccd6 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -188,7 +188,8 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z kernelDescriptor.kernelMetadata.kernelName.c_str(), 0u); } - if ((!containsAnyKernel) || NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.get()) { + bool isMixingRegularAndCooperativeKernelsAllowed = NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.get(); + if ((!containsAnyKernel) || isMixingRegularAndCooperativeKernelsAllowed) { containsCooperativeKernelsFlag = (containsCooperativeKernelsFlag || isCooperative); } else if (containsCooperativeKernelsFlag != isCooperative) { return ZE_RESULT_ERROR_INVALID_ARGUMENT; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.h b/level_zero/core/source/cmdqueue/cmdqueue_hw.h index 84f3787fcb..82faec1ddd 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.h @@ -39,8 +39,8 @@ struct CommandQueueHw : public CommandQueueImp { size_t estimateStateBaseAddressCmdSize(); MOCKABLE_VIRTUAL void programFrontEnd(uint64_t scratchAddress, uint32_t perThreadScratchSpaceSize, NEO::LinearStream &commandStream); - size_t estimateFrontEndCmdSizeForMultipleCommandLists(bool isFrontEndStateDirty, uint32_t numCommandLists, - ze_command_list_handle_t *phCommandLists); + MOCKABLE_VIRTUAL size_t estimateFrontEndCmdSizeForMultipleCommandLists(bool isFrontEndStateDirty, uint32_t numCommandLists, + ze_command_list_handle_t *phCommandLists); size_t estimateFrontEndCmdSize(); size_t estimatePipelineSelect(); void programPipelineSelect(NEO::LinearStream &commandStream); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 7622a12f6c..ab7c52c5f8 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -74,7 +74,8 @@ ze_result_t CommandQueueHw::executeCommandLists( auto lockCSR = csr->obtainUniqueOwnership(); - auto commandListsContainCooperativeKernels = CommandList::fromHandle(phCommandLists[0])->containsCooperativeKernels(); + auto anyCommandListWithCooperativeKernels = false; + auto anyCommandListWithoutCooperativeKernels = false; for (auto i = 0u; i < numCommandLists; i++) { auto commandList = CommandList::fromHandle(phCommandLists[i]); @@ -82,12 +83,19 @@ ze_result_t CommandQueueHw::executeCommandLists( return ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE; } - if ((commandListsContainCooperativeKernels != commandList->containsCooperativeKernels()) && - (!NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.get())) { - return ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE; + if (commandList->containsCooperativeKernels()) { + anyCommandListWithCooperativeKernels = true; + } else { + anyCommandListWithoutCooperativeKernels = true; } } + bool isMixingRegularAndCooperativeKernelsAllowed = NEO::DebugManager.flags.AllowMixingRegularAndCooperativeKernels.get(); + if (anyCommandListWithCooperativeKernels && anyCommandListWithoutCooperativeKernels && + (!isMixingRegularAndCooperativeKernelsAllowed)) { + return ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE; + } + size_t spaceForResidency = 0; size_t preemptionSize = 0u; size_t debuggerCmdsSize = 0; @@ -199,6 +207,17 @@ ze_result_t CommandQueueHw::executeCommandLists( gsbaStateDirty, frontEndStateDirty, perThreadScratchSpaceSize); + auto &streamProperties = csr->getStreamProperties(); + auto &hwHelper = NEO::HwHelper::get(hwInfo.platform.eRenderCoreFamily); + auto disableOverdispatch = hwHelper.isDisableOverdispatchAvailable(hwInfo); + auto isEngineInstanced = csr->getOsContext().isEngineInstanced(); + bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get(); + if (!isPatchingVfeStateAllowed) { + streamProperties.frontEndState.setProperties(anyCommandListWithCooperativeKernels, disableOverdispatch, + isEngineInstanced, hwInfo); + frontEndStateDirty |= streamProperties.frontEndState.isDirty(); + } + gsbaStateDirty |= csr->getGSBAStateDirty(); frontEndStateDirty |= csr->getMediaVFEStateDirty(); if (!isCopyOnlyCommandQueue) { @@ -308,7 +327,6 @@ ze_result_t CommandQueueHw::executeCommandLists( } } - auto &streamProperties = csr->getStreamProperties(); for (auto i = 0u; i < numCommandLists; ++i) { auto commandList = CommandList::fromHandle(phCommandLists[i]); auto cmdBufferAllocations = commandList->commandContainer.getCmdBufferAllocations(); @@ -335,20 +353,24 @@ ze_result_t CommandQueueHw::executeCommandLists( } if (!isCopyOnlyCommandQueue) { - auto &requiredStreamState = commandList->getRequiredStreamState(); - streamProperties.frontEndState.setProperties(requiredStreamState.frontEndState); - streamProperties.frontEndState.singleSliceDispatchCcsMode.value = csr->getOsContext().isEngineInstanced(); - auto programVfe = streamProperties.frontEndState.isDirty(); - if (frontEndStateDirty) { - programVfe = true; - frontEndStateDirty = false; + bool programVfe = frontEndStateDirty; + if (isPatchingVfeStateAllowed) { + auto requiredStreamStateCopy = commandList->getRequiredStreamState(); + requiredStreamStateCopy.frontEndState.singleSliceDispatchCcsMode.set(isEngineInstanced); + streamProperties.frontEndState.setProperties(requiredStreamStateCopy.frontEndState); + programVfe |= streamProperties.frontEndState.isDirty(); } + if (programVfe) { programFrontEnd(scratchSpaceController->getScratchPatchAddress(), scratchSpaceController->getPerThreadScratchSpaceSize(), child); + frontEndStateDirty = false; + } + + if (isPatchingVfeStateAllowed) { + auto finalStreamStateCopy = commandList->getFinalStreamState(); + finalStreamStateCopy.frontEndState.singleSliceDispatchCcsMode.set(isEngineInstanced); + streamProperties.frontEndState.setProperties(finalStreamStateCopy.frontEndState); } - auto &finalStreamState = commandList->getFinalStreamState(); - streamProperties.frontEndState.setProperties(finalStreamState.frontEndState); - streamProperties.frontEndState.singleSliceDispatchCcsMode.value = csr->getOsContext().isEngineInstanced(); } patchCommands(*commandList, scratchSpaceController->getScratchPatchAddress()); @@ -413,7 +435,7 @@ ze_result_t CommandQueueHw::executeCommandLists( } submitBatchBuffer(ptrDiff(child.getCpuBase(), commandStream->getCpuBase()), csr->getResidencyAllocations(), endingCmd, - commandListsContainCooperativeKernels); + anyCommandListWithCooperativeKernels); this->taskCount = csr->peekTaskCount(); @@ -456,24 +478,29 @@ template size_t CommandQueueHw::estimateFrontEndCmdSizeForMultipleCommandLists( bool isFrontEndStateDirty, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists) { - auto streamPropertiesCopy = csr->getStreamProperties(); auto singleFrontEndCmdSize = estimateFrontEndCmdSize(); + bool isPatchingVfeStateAllowed = NEO::DebugManager.flags.AllowPatchingVfeStateInCommandLists.get(); + if (!isPatchingVfeStateAllowed) { + return isFrontEndStateDirty * singleFrontEndCmdSize; + } + + auto streamPropertiesCopy = csr->getStreamProperties(); + auto isEngineInstanced = csr->getOsContext().isEngineInstanced(); size_t estimatedSize = 0; for (size_t i = 0; i < numCommandLists; i++) { auto commandList = CommandList::fromHandle(phCommandLists[i]); - auto &requiredStreamState = commandList->getRequiredStreamState(); - streamPropertiesCopy.frontEndState.setProperties(requiredStreamState.frontEndState); - auto isVfeRequired = streamPropertiesCopy.frontEndState.isDirty(); - if (isFrontEndStateDirty) { - isVfeRequired = true; + auto requiredStreamStateCopy = commandList->getRequiredStreamState(); + requiredStreamStateCopy.frontEndState.singleSliceDispatchCcsMode.set(isEngineInstanced); + streamPropertiesCopy.frontEndState.setProperties(requiredStreamStateCopy.frontEndState); + + if (isFrontEndStateDirty || streamPropertiesCopy.frontEndState.isDirty()) { + estimatedSize += singleFrontEndCmdSize; isFrontEndStateDirty = false; } - if (isVfeRequired) { - estimatedSize += singleFrontEndCmdSize; - } - auto &finalStreamState = commandList->getFinalStreamState(); - streamPropertiesCopy.frontEndState.setProperties(finalStreamState.frontEndState); + auto finalStreamStateCopy = commandList->getFinalStreamState(); + finalStreamStateCopy.frontEndState.singleSliceDispatchCcsMode.set(isEngineInstanced); + streamPropertiesCopy.frontEndState.setProperties(finalStreamStateCopy.frontEndState); } return estimatedSize; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl index 6e97cc2b73..464642a525 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl @@ -144,7 +144,7 @@ void CommandQueueHw::patchCommands(CommandList &commandList, uint cfeStateCmd = reinterpret_cast(commandToPatch.pCommand); cfeStateCmd->setScratchSpaceBuffer(lowScratchAddress); - cfeStateCmd->setSingleSliceDispatchCcsMode(csr->getStreamProperties().frontEndState.singleSliceDispatchCcsMode.value); + cfeStateCmd->setSingleSliceDispatchCcsMode(csr->getOsContext().isEngineInstanced()); *reinterpret_cast(commandToPatch.pDestination) = *cfeStateCmd; break; diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config index fd4794b04f..31da28b0f4 100644 --- a/opencl/test/unit_test/test_files/igdrcl.config +++ b/opencl/test/unit_test/test_files/igdrcl.config @@ -325,4 +325,5 @@ OverrideGmmResourceUsageField = -1 LogAllocationType = 0 ProgramAdditionalPipeControlBeforeStateComputeModeCommand = 0 OverrideBufferSuitableForRenderCompression = -1 -AllowMixingRegularAndCooperativeKernels = 0 \ No newline at end of file +AllowMixingRegularAndCooperativeKernels = 0 +AllowPatchingVfeStateInCommandLists = 0 \ No newline at end of file diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index dced3fc477..1903eaa8f2 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -72,6 +72,7 @@ DECLARE_DEBUG_VARIABLE(bool, EnableResourceTags, false, "Enable resource tagging DECLARE_DEBUG_VARIABLE(bool, EnableFlushTaskSubmission, false, "true: driver uses csr flushTask for immediate submissions, false: driver uses legacy executeCommandList path") DECLARE_DEBUG_VARIABLE(bool, DoNotFreeResources, false, "true: driver stops freeing resources") DECLARE_DEBUG_VARIABLE(bool, AllowMixingRegularAndCooperativeKernels, false, "true: driver allows mixing regular and cooperative kernels in a single command list and in a single execute") +DECLARE_DEBUG_VARIABLE(bool, AllowPatchingVfeStateInCommandLists, false, "true: MEDIA_VFE_STATE may be programmed in a command list") DECLARE_DEBUG_VARIABLE(std::string, ForceDeviceId, std::string("unk"), "DeviceId selected for testing") DECLARE_DEBUG_VARIABLE(std::string, LoadBinarySipFromFile, std::string("unk"), "Select binary file to load SIP kernel raw binary") DECLARE_DEBUG_VARIABLE(int64_t, OverrideMultiStoragePlacement, -1, "-1: disable, 0+: tile mask, each bit corresponds to tile")