performance: change mutable structures for better browsing and access patterns

Related-To: NEO-13916

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2025-07-21 14:49:57 +00:00
committed by Compute-Runtime-Automation
parent 9fd91cf80e
commit 86528a10a3
10 changed files with 264 additions and 274 deletions

View File

@@ -36,8 +36,6 @@ struct MutableAppendLaunchKernelWithParams {
struct MutableAppendLaunchKernelEvents {
CommandToPatch signalCmd;
size_t currentSignalEventDescriptorIndex = std::numeric_limits<size_t>::max();
bool waitEvents = false;
bool l3FlushEventSyncCmd = false;
bool l3FlushEventTimestampSyncCmds = false;
@@ -113,7 +111,7 @@ struct MutableCommandListCoreFamily : public MutableCommandListImp, public Comma
// Creates the variables selected by mutableFlags (group count, group size, global offset,
// kernel arguments) through getVariable() and records them in the supplied descriptor container.
// mutableParams receives the created group count / group size / global offset variables;
// launchParams may be adjusted (e.g. reserveExtraPayloadSpace when group size is mutable).
void storeKernelArgumentAndDispatchVariables(MutableAppendLaunchKernelWithParams &mutableParams,
CmdListKernelLaunchParams &launchParams,
Kernel *kernel,
MutationVariables *variableDescriptors,
KernelVariableDescriptor *kernelVariables,
ze_mutable_command_exp_flags_t mutableFlags);
void storeSignalEventVariable(MutableAppendLaunchKernelEvents &mutableEventParams,
CmdListKernelLaunchParams &launchParams,
@@ -126,7 +124,7 @@ struct MutableCommandListCoreFamily : public MutableCommandListImp, public Comma
std::vector<MutableLoadRegisterImm *> &variableLoadRegisterImmList);
void captureRegularWaitEventCommands(CommandToPatchContainer::iterator &cmdsIterator,
std::vector<MutableSemaphoreWait *> &variableSemaphoreWaitList);
void captureCounterBasedTimestampSignalEventCommands(MutableVariableDescriptor &currentMutableSignalEvent,
void captureCounterBasedTimestampSignalEventCommands(SignalEventVariableDescriptor &currentMutableSignalEvent,
std::vector<MutableSemaphoreWait *> &variableSemaphoreWaitList,
std::vector<MutableStoreDataImm *> &variableStoreDataImmList);
void captureStandaloneTimestampSignalEventCommands(std::vector<MutableStoreRegisterMem *> &variableStoreRegisterMem);

View File

@@ -178,10 +178,11 @@ inline ze_result_t MutableCommandListCoreFamily<gfxCoreFamily>::appendLaunchKern
if ((currentAppend.mutationFlags & ZE_MUTABLE_COMMAND_EXP_FLAG_WAIT_EVENTS) == ZE_MUTABLE_COMMAND_EXP_FLAG_WAIT_EVENTS) {
if (numWaitEvents > 0) {
currentAppend.variables.waitEvents.reserve(numWaitEvents);
mutableEventParams.waitEvents = true;
bool omitWaitEventResidency = false;
for (uint32_t i = 0; i < numWaitEvents; i++) {
MutableVariableDescriptor mutableWaitEvent = {};
WaitEventVariableDescriptor mutableWaitEventDesc = {};
Event *event = Event::fromHandle(phWaitEvents[i]);
@@ -191,21 +192,21 @@ inline ze_result_t MutableCommandListCoreFamily<gfxCoreFamily>::appendLaunchKern
variable->setAsWaitEvent(event);
mutableWaitEvent.var = variable;
mutableWaitEvent.varType = ZE_MUTABLE_COMMAND_EXP_FLAG_WAIT_EVENTS;
mutableWaitEvent.waitEvents.event = event;
mutableWaitEvent.waitEvents.waitEventIndex = i;
mutableWaitEventDesc.event = event;
mutableWaitEventDesc.eventVariable = variable;
mutableWaitEventDesc.waitEventIndex = i;
if (CommandListImp::isInOrderExecutionEnabled() && event->isCounterBased()) {
mutableWaitEvent.waitEvents.waitEventPackets = event->getInOrderExecInfo()->getNumDevicePartitionsToWait();
mutableWaitEventDesc.waitEventPackets = event->getInOrderExecInfo()->getNumDevicePartitionsToWait();
if (!isCbEventBoundToCmdList(event)) {
omitWaitEventResidency = true;
auto deviceCounterAlloc = event->getInOrderExecInfo()->getDeviceCounterAllocation();
addToResidencyContainer(getDeviceCounterAllocForResidency(deviceCounterAlloc));
}
} else {
mutableWaitEvent.waitEvents.waitEventPackets = event->getPacketsToWait();
mutableWaitEventDesc.waitEventPackets = event->getPacketsToWait();
}
currentAppend.variables.push_back(mutableWaitEvent);
currentAppend.variables.waitEvents.push_back(mutableWaitEventDesc);
NEO::GraphicsAllocation *eventPoolAlloc = event->getAllocation(this->device);
if (eventPoolAlloc) {
@@ -228,23 +229,23 @@ inline ze_result_t MutableCommandListCoreFamily<gfxCoreFamily>::appendLaunchKern
}
if (this->nextAppendKernelMutable) {
if (mutableEventParams.currentSignalEventDescriptorIndex != std::numeric_limits<size_t>::max()) {
MutableVariableDescriptor &currentMutableSignalEvent = this->mutations[(nextCommandId - 1)].variables[mutableEventParams.currentSignalEventDescriptorIndex];
if (this->mutations[(nextCommandId - 1)].variables.signalEvent.eventVariable != nullptr) {
auto &signalEventVariableDesc = this->mutations[(nextCommandId - 1)].variables.signalEvent;
MutableComputeWalker *walker = nullptr;
MutablePipeControl *signalPipeControl = nullptr;
if (mutableEventParams.counterBasedEvent) {
// both TS and L3 flush events need additional clean Store Data Imm -> signal cmd (CW or PC or StoreRegMem) -> sync SemWait
if (mutableEventParams.counterBasedTimestampEvent || mutableEventParams.l3FlushEvent) {
auto &eventVariableSemaphoreWaitList = currentMutableSignalEvent.var->getSemWaitList();
auto &eventVariableStoreDataImmList = currentMutableSignalEvent.var->getStoreDataImmList();
auto &eventVariableSemaphoreWaitList = signalEventVariableDesc.eventVariable->getSemWaitList();
auto &eventVariableStoreDataImmList = signalEventVariableDesc.eventVariable->getStoreDataImmList();
captureCounterBasedTimestampSignalEventCommands(currentMutableSignalEvent,
captureCounterBasedTimestampSignalEventCommands(signalEventVariableDesc,
eventVariableSemaphoreWaitList, eventVariableStoreDataImmList);
walker = this->appendKernelMutableComputeWalker;
}
if (mutableEventParams.l3FlushEventTimestampSyncCmds) {
// L3 TS is signaled by StoreRegMem
auto &eventVariableStoreRegMemList = currentMutableSignalEvent.var->getStoreRegMemList();
auto &eventVariableStoreRegMemList = signalEventVariableDesc.eventVariable->getStoreRegMemList();
captureStandaloneTimestampSignalEventCommands(eventVariableStoreRegMemList);
} else if (mutableEventParams.l3FlushEventSyncCmd) {
// L3 Immediate is signaled by PC
@@ -258,14 +259,14 @@ inline ze_result_t MutableCommandListCoreFamily<gfxCoreFamily>::appendLaunchKern
}
} else {
if (mutableEventParams.eventInsideInOrder) {
auto &eventVariableSemaphoreWaitList = currentMutableSignalEvent.var->getSemWaitList();
auto &eventVariableStoreDataImmList = currentMutableSignalEvent.var->getStoreDataImmList();
auto &eventVariableSemaphoreWaitList = signalEventVariableDesc.eventVariable->getSemWaitList();
auto &eventVariableStoreDataImmList = signalEventVariableDesc.eventVariable->getStoreDataImmList();
captureCounterBasedTimestampSignalEventCommands(currentMutableSignalEvent,
captureCounterBasedTimestampSignalEventCommands(signalEventVariableDesc,
eventVariableSemaphoreWaitList, eventVariableStoreDataImmList);
}
if (mutableEventParams.l3FlushEventTimestampSyncCmds) {
auto &eventVariableStoreRegMemList = currentMutableSignalEvent.var->getStoreRegMemList();
auto &eventVariableStoreRegMemList = signalEventVariableDesc.eventVariable->getStoreRegMemList();
captureStandaloneTimestampSignalEventCommands(eventVariableStoreRegMemList);
} else if (mutableEventParams.l3FlushEventSyncCmd) {
auto signalPipeControlPtr = std::make_unique<MutablePipeControlHw<GfxFamily>>(mutableEventParams.signalCmd.pDestination);
@@ -275,33 +276,30 @@ inline ze_result_t MutableCommandListCoreFamily<gfxCoreFamily>::appendLaunchKern
walker = this->appendKernelMutableComputeWalker;
}
}
currentMutableSignalEvent.var->setAsSignalEvent(currentMutableSignalEvent.signalEvent.event,
walker,
signalPipeControl);
NEO::GraphicsAllocation *eventAlloc = currentMutableSignalEvent.signalEvent.event->getAllocation(this->device);
signalEventVariableDesc.eventVariable->setAsSignalEvent(signalEventVariableDesc.event,
walker,
signalPipeControl);
NEO::GraphicsAllocation *eventAlloc = signalEventVariableDesc.event->getAllocation(this->device);
if (eventAlloc) {
addToResidencyContainer(eventAlloc);
}
}
if (mutableEventParams.waitEvents) {
auto waitEventCmdToPatchIterator = this->appendCmdsToPatch.begin();
if (waitEventCmdToPatchIterator->type == CommandToPatch::CommandType::PrefetchKernelMemory) {
waitEventCmdToPatchIterator++;
}
AppendMutation &currentAppend = this->mutations[(nextCommandId - 1)];
for (uint32_t i = 0; i < numWaitEvents; i++) {
MutableVariableDescriptor &mutableWaitEvent = currentAppend.variables[i];
UNRECOVERABLE_IF(ZE_MUTABLE_COMMAND_EXP_FLAG_WAIT_EVENTS != mutableWaitEvent.varType);
UNRECOVERABLE_IF(i != mutableWaitEvent.waitEvents.waitEventIndex);
WaitEventVariableDescriptor &mutableWaitEvent = currentAppend.variables.waitEvents[i];
UNRECOVERABLE_IF(i != mutableWaitEvent.waitEventIndex);
auto &variableSemWaitCmdList = mutableWaitEvent.var->getSemWaitList();
auto &variableLoadRegImmCmdList = mutableWaitEvent.var->getLoadRegImmList();
auto &variableSemWaitCmdList = mutableWaitEvent.eventVariable->getSemWaitList();
auto &variableLoadRegImmCmdList = mutableWaitEvent.eventVariable->getLoadRegImmList();
for (uint32_t packet = 0; packet < mutableWaitEvent.waitEvents.waitEventPackets; packet++) {
if (CommandListImp::isInOrderExecutionEnabled() && mutableWaitEvent.waitEvents.event->isCounterBased() && (this->heaplessModeEnabled || !mutableWaitEvent.waitEvents.event->hasInOrderTimestampNode())) {
for (uint32_t packet = 0; packet < mutableWaitEvent.waitEventPackets; packet++) {
if (CommandListImp::isInOrderExecutionEnabled() && mutableWaitEvent.event->isCounterBased() && (this->heaplessModeEnabled || !mutableWaitEvent.event->hasInOrderTimestampNode())) {
captureCounterBasedWaitEventCommands(waitEventCmdToPatchIterator, variableSemWaitCmdList, variableLoadRegImmCmdList);
} else {
captureRegularWaitEventCommands(waitEventCmdToPatchIterator, variableSemWaitCmdList);
@@ -338,7 +336,7 @@ inline ze_result_t MutableCommandListCoreFamily<gfxCoreFamily>::appendLaunchKern
}
MutableAppendLaunchKernelWithParams mutableCmdlistAppendLaunchParams = {};
MutationVariables *currentVariables = nullptr;
KernelVariableDescriptor *currentKernelVariables = nullptr;
if (this->nextAppendKernelMutable) {
AppendMutation &currentAppend = this->mutations[(nextCommandId - 1)];
@@ -358,12 +356,12 @@ inline ze_result_t MutableCommandListCoreFamily<gfxCoreFamily>::appendLaunchKern
mutableCmdlistAppendLaunchParams.localRegionSizeFromApi = launchParams.localRegionSize;
mutableCmdlistAppendLaunchParams.isCooperativeFromApi = launchParams.isCooperative;
currentVariables = &mutableCmdlistAppendLaunchParams.currentMutableKernel->getKernelVariables();
currentKernelVariables = &mutableCmdlistAppendLaunchParams.currentMutableKernel->getKernelVariables();
} else {
currentVariables = &currentAppend.variables;
currentKernelVariables = &currentAppend.variables.kernelVariables;
}
storeKernelArgumentAndDispatchVariables(mutableCmdlistAppendLaunchParams, launchParams, kernel, currentVariables, currentAppend.mutationFlags);
storeKernelArgumentAndDispatchVariables(mutableCmdlistAppendLaunchParams, launchParams, kernel, currentKernelVariables, currentAppend.mutationFlags);
if (mutableCmdlistAppendLaunchParams.kernelMutation) {
launchParams.reserveExtraPayloadSpace += mutableCmdlistAppendLaunchParams.extraPayloadSpaceForKernelGroup;
@@ -678,11 +676,11 @@ void MutableCommandListCoreFamily<gfxCoreFamily>::captureRegularWaitEventCommand
}
template <GFXCORE_FAMILY gfxCoreFamily>
void MutableCommandListCoreFamily<gfxCoreFamily>::captureCounterBasedTimestampSignalEventCommands(MutableVariableDescriptor &currentMutableSignalEvent,
void MutableCommandListCoreFamily<gfxCoreFamily>::captureCounterBasedTimestampSignalEventCommands(SignalEventVariableDescriptor &currentMutableSignalEvent,
std::vector<MutableSemaphoreWait *> &variableSemaphoreWaitList,
std::vector<MutableStoreDataImm *> &variableStoreDataImmList) {
auto partitionCount = CommandListCoreFamily<gfxCoreFamily>::getPartitionCount();
uint32_t syncWaitEventPackets = currentMutableSignalEvent.signalEvent.event->getPacketsToWait();
uint32_t syncWaitEventPackets = currentMutableSignalEvent.event->getPacketsToWait();
uint32_t clearEventOps = syncWaitEventPackets / partitionCount;
variableSemaphoreWaitList.reserve(syncWaitEventPackets);
@@ -730,27 +728,21 @@ template <GFXCORE_FAMILY gfxCoreFamily>
void MutableCommandListCoreFamily<gfxCoreFamily>::storeKernelArgumentAndDispatchVariables(MutableAppendLaunchKernelWithParams &mutableParams,
CmdListKernelLaunchParams &launchParams,
Kernel *kernel,
MutationVariables *variableDescriptors,
KernelVariableDescriptor *kernelVariables,
ze_mutable_command_exp_flags_t mutableFlags) {
if ((mutableFlags & ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT) == ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT) {
InterfaceVariableDescriptor varDesc = {};
varDesc.isStageCommit = true;
getVariable(&varDesc, &mutableParams.groupCountVariable);
MutableVariableDescriptor mutableGroupCount = {};
mutableGroupCount.varType = ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT;
mutableGroupCount.var = mutableParams.groupCountVariable;
variableDescriptors->push_back(mutableGroupCount);
kernelVariables->groupCount = mutableParams.groupCountVariable;
}
if ((mutableFlags & ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE) == ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE) {
InterfaceVariableDescriptor varDesc = {};
varDesc.isStageCommit = true;
getVariable(&varDesc, &mutableParams.groupSizeVariable);
MutableVariableDescriptor mutableGroupSize = {};
mutableGroupSize.varType = ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE;
mutableGroupSize.var = mutableParams.groupSizeVariable;
variableDescriptors->push_back(mutableGroupSize);
kernelVariables->groupSize = mutableParams.groupSizeVariable;
this->enableReservePerThreadForLocalId = true;
launchParams.reserveExtraPayloadSpace = this->maxPerThreadDataSize;
@@ -759,16 +751,16 @@ void MutableCommandListCoreFamily<gfxCoreFamily>::storeKernelArgumentAndDispatch
InterfaceVariableDescriptor varDesc = {};
getVariable(&varDesc, &mutableParams.globalOffsetVariable);
MutableVariableDescriptor mutableGlobalOffset = {};
mutableGlobalOffset.varType = ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET;
mutableGlobalOffset.var = mutableParams.globalOffsetVariable;
variableDescriptors->push_back(mutableGlobalOffset);
kernelVariables->globalOffset = mutableParams.globalOffsetVariable;
}
if ((mutableFlags & ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS) == ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS) {
// intercept kernel arguments
auto &kernelArgs = kernel->getKernelDescriptor().payloadMappings.explicitArgs;
kernelVariables->kernelArguments.reserve(kernelArgs.size());
uint32_t argCount = 0;
for (const auto &arg : kernelArgs) {
KernelArgumentVariableDescriptor mutableKernelArgumentDesc = {};
mutableKernelArgumentDesc.argIndex = argCount;
bool captureArgument = false;
bool slmArgument = false;
bool immediateArgument = arg.type == NEO::ArgDescriptor::argTValue;
@@ -789,12 +781,9 @@ void MutableCommandListCoreFamily<gfxCoreFamily>::storeKernelArgumentAndDispatch
getVariable(&varDesc, &variable);
variable->setAsKernelArg(kernel->toHandle(), argCount);
MutableVariableDescriptor mutableKernelArgument = {};
mutableKernelArgument.var = variable;
mutableKernelArgument.kernelArguments.argIndex = argCount;
mutableKernelArgument.kernelArguments.argType = arg.type;
mutableKernelArgument.varType = ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS;
variableDescriptors->push_back(mutableKernelArgument);
mutableKernelArgumentDesc.kernelArgumentVariable = variable;
kernelVariables->kernelArguments.push_back(mutableKernelArgumentDesc);
mutableParams.kernelArgumentMutation = true;
@@ -804,7 +793,11 @@ void MutableCommandListCoreFamily<gfxCoreFamily>::storeKernelArgumentAndDispatch
}
mutableParams.lastSlmArgumentVariable = variable;
}
} else {
// in case kernel argument is non-mutable (not captured), push it into kernel arguments vector anyway to preserve index order
kernelVariables->kernelArguments.push_back(mutableKernelArgumentDesc);
}
argCount++;
}
}
@@ -818,14 +811,10 @@ void MutableCommandListCoreFamily<gfxCoreFamily>::storeSignalEventVariable(Mutab
ze_mutable_command_exp_flags_t mutableFlags) {
if ((mutableFlags & ZE_MUTABLE_COMMAND_EXP_FLAG_SIGNAL_EVENT) == ZE_MUTABLE_COMMAND_EXP_FLAG_SIGNAL_EVENT) {
if (event != nullptr) {
MutableVariableDescriptor mutableSignalEvent = {};
mutableSignalEvent.signalEvent.event = event;
mutableSignalEvent.varType = ZE_MUTABLE_COMMAND_EXP_FLAG_SIGNAL_EVENT;
Variable *variable = nullptr;
InterfaceVariableDescriptor varDesc = {};
getVariable(&varDesc, &variable);
mutableSignalEvent.var = variable;
launchParams.omitAddingEventResidency = event->getAllocation(this->device) != nullptr;
@@ -874,8 +863,8 @@ void MutableCommandListCoreFamily<gfxCoreFamily>::storeSignalEventVariable(Mutab
}
}
variableDescriptors->emplace_back(mutableSignalEvent);
mutableEventParams.currentSignalEventDescriptorIndex = variableDescriptors->size() - 1;
variableDescriptors->signalEvent.event = event;
variableDescriptors->signalEvent.eventVariable = variable;
}
}
}

View File

@@ -396,47 +396,43 @@ ze_result_t MutableCommandListImp::updateMutableCommandsExp(const ze_mutable_com
ze_result_t result = ZE_RESULT_SUCCESS;
const void *next = desc->pNext;
while (next != nullptr) {
MutationVariables *currentVariables = nullptr;
KernelVariableDescriptor *currentVariables = nullptr;
const ze_base_desc_t *extendedDesc = reinterpret_cast<const ze_base_desc_t *>(next);
if (extendedDesc->stype == ZE_STRUCTURE_TYPE_MUTABLE_KERNEL_ARGUMENT_EXP_DESC) {
const ze_mutable_kernel_argument_exp_desc_t *kernelArgumentDesc = reinterpret_cast<const ze_mutable_kernel_argument_exp_desc_t *>(next);
AppendMutation &selectedAppend = this->mutations[(kernelArgumentDesc->commandId - 1)];
const ze_mutable_kernel_argument_exp_desc_t *apiKernelArgumentDesc = reinterpret_cast<const ze_mutable_kernel_argument_exp_desc_t *>(next);
AppendMutation &selectedAppend = this->mutations[(apiKernelArgumentDesc->commandId - 1)];
if ((selectedAppend.mutationFlags & ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS) == 0) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
currentVariables = getVariableDescriptorContainer(selectedAppend);
MutableVariableDescriptor *mutableKernelArgumentDesc = nullptr;
for (auto &mutableTypeDescriptor : *currentVariables) {
if (mutableTypeDescriptor.varType != ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS ||
mutableTypeDescriptor.kernelArguments.argIndex != kernelArgumentDesc->argIndex) {
continue;
}
mutableKernelArgumentDesc = &mutableTypeDescriptor;
break;
}
if (mutableKernelArgumentDesc == nullptr) {
if (apiKernelArgumentDesc->argIndex + 1 > currentVariables->kernelArguments.size()) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
if (mutableKernelArgumentDesc->var->getType() == VariableType::buffer) {
auto argValue = kernelArgumentDesc->pArgValue == nullptr ? nullptr : *reinterpret_cast<void *const *>(kernelArgumentDesc->pArgValue);
if (mutableKernelArgumentDesc->var->getDesc().argValue == argValue) {
KernelArgumentVariableDescriptor &kernelArgDesc = currentVariables->kernelArguments[apiKernelArgumentDesc->argIndex];
UNRECOVERABLE_IF(kernelArgDesc.argIndex != apiKernelArgumentDesc->argIndex);
if (kernelArgDesc.kernelArgumentVariable == nullptr) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
if (kernelArgDesc.kernelArgumentVariable->getType() == VariableType::buffer) {
auto argValue = apiKernelArgumentDesc->pArgValue == nullptr ? nullptr : *reinterpret_cast<void *const *>(apiKernelArgumentDesc->pArgValue);
if (kernelArgDesc.kernelArgumentVariable->getDesc().argValue == argValue) {
PRINT_DEBUG_STRING(NEO::debugManager.flags.PrintMclData.get(), stderr, "MCL update kernel arg commandId: %" PRIu64 " argument idx: %u, buffer - same value: %p\n",
kernelArgumentDesc->commandId, kernelArgumentDesc->argIndex, argValue);
apiKernelArgumentDesc->commandId, apiKernelArgumentDesc->argIndex, argValue);
next = extendedDesc->pNext;
continue;
}
}
result = mutableKernelArgumentDesc->var->setValue(kernelArgumentDesc->argSize, 0, kernelArgumentDesc->pArgValue);
result = kernelArgDesc.kernelArgumentVariable->setValue(apiKernelArgumentDesc->argSize, 0, apiKernelArgumentDesc->pArgValue);
if (result != ZE_RESULT_SUCCESS) {
return result;
}
this->updatedCommandList = true;
if (mutableKernelArgumentDesc->var->getType() == VariableType::slmBuffer && mutableKernelArgumentDesc->var->isCooperativeVariable()) {
auto varDispatch = mutableKernelArgumentDesc->var->getInitialVariableDispatch();
if (kernelArgDesc.kernelArgumentVariable->getType() == VariableType::slmBuffer && kernelArgDesc.kernelArgumentVariable->isCooperativeVariable()) {
auto varDispatch = kernelArgDesc.kernelArgumentVariable->getInitialVariableDispatch();
cooperativeKernelVariableDispatches.insert(varDispatch);
}
PRINT_DEBUG_STRING(NEO::debugManager.flags.PrintMclData.get(), stderr, "MCL update kernel arg commandId: %" PRIu64 " argument idx: %u, size: %zu, val: %p\n",
kernelArgumentDesc->commandId, kernelArgumentDesc->argIndex, kernelArgumentDesc->argSize, mutableKernelArgumentDesc->var->getDesc().argValue);
apiKernelArgumentDesc->commandId, apiKernelArgumentDesc->argIndex, apiKernelArgumentDesc->argSize, kernelArgDesc.kernelArgumentVariable->getDesc().argValue);
}
if (extendedDesc->stype == ZE_STRUCTURE_TYPE_MUTABLE_GROUP_COUNT_EXP_DESC) {
const ze_mutable_group_count_exp_desc_t *groupCountDesc = reinterpret_cast<const ze_mutable_group_count_exp_desc_t *>(next);
@@ -445,24 +441,16 @@ ze_result_t MutableCommandListImp::updateMutableCommandsExp(const ze_mutable_com
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
currentVariables = getVariableDescriptorContainer(selectedAppend);
MutableVariableDescriptor *mutableGroupCountDesc = nullptr;
for (auto &mutableTypeDescriptor : *currentVariables) {
if (mutableTypeDescriptor.varType != ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT) {
continue;
}
mutableGroupCountDesc = &mutableTypeDescriptor;
break;
}
if (mutableGroupCountDesc == nullptr) {
if (currentVariables->groupCount == nullptr) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
result = mutableGroupCountDesc->var->setValue(sizeof(ze_group_count_t), 0, groupCountDesc->pGroupCount);
result = currentVariables->groupCount->setValue(sizeof(ze_group_count_t), 0, groupCountDesc->pGroupCount);
if (result != ZE_RESULT_SUCCESS) {
return result;
}
this->updatedCommandList = true;
if (mutableGroupCountDesc->var->isCooperativeVariable()) {
auto varDispatch = mutableGroupCountDesc->var->getInitialVariableDispatch();
if (currentVariables->groupCount->isCooperativeVariable()) {
auto varDispatch = currentVariables->groupCount->getInitialVariableDispatch();
cooperativeKernelVariableDispatches.insert(varDispatch);
}
PRINT_DEBUG_STRING(NEO::debugManager.flags.PrintMclData.get(), stderr, "MCL update group count commandId: %" PRIu64 " x: %u y: %u z: %u\n",
@@ -475,25 +463,17 @@ ze_result_t MutableCommandListImp::updateMutableCommandsExp(const ze_mutable_com
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
currentVariables = getVariableDescriptorContainer(selectedAppend);
MutableVariableDescriptor *mutableGroupSizeDesc = nullptr;
for (auto &mutableTypeDescriptor : *currentVariables) {
if (mutableTypeDescriptor.varType != ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE) {
continue;
}
mutableGroupSizeDesc = &mutableTypeDescriptor;
break;
}
if (mutableGroupSizeDesc == nullptr) {
if (currentVariables->groupSize == nullptr) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
uint32_t groupSize[3] = {groupSizeDesc->groupSizeX, groupSizeDesc->groupSizeY, groupSizeDesc->groupSizeZ};
result = mutableGroupSizeDesc->var->setValue(sizeof(groupSize), 0, groupSize);
result = currentVariables->groupSize->setValue(sizeof(groupSize), 0, groupSize);
if (result != ZE_RESULT_SUCCESS) {
return result;
}
this->updatedCommandList = true;
if (mutableGroupSizeDesc->var->isCooperativeVariable()) {
auto varDispatch = mutableGroupSizeDesc->var->getInitialVariableDispatch();
if (currentVariables->groupSize->isCooperativeVariable()) {
auto varDispatch = currentVariables->groupSize->getInitialVariableDispatch();
cooperativeKernelVariableDispatches.insert(varDispatch);
}
PRINT_DEBUG_STRING(NEO::debugManager.flags.PrintMclData.get(), stderr, "MCL update group size commandId: %" PRIu64 " x: %u y: %u z: %u\n",
@@ -506,19 +486,11 @@ ze_result_t MutableCommandListImp::updateMutableCommandsExp(const ze_mutable_com
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
currentVariables = getVariableDescriptorContainer(selectedAppend);
MutableVariableDescriptor *mutableGlobalOffsetDesc = nullptr;
for (auto &mutableTypeDescriptor : *currentVariables) {
if (mutableTypeDescriptor.varType != ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET) {
continue;
}
mutableGlobalOffsetDesc = &mutableTypeDescriptor;
break;
}
if (mutableGlobalOffsetDesc == nullptr) {
if (currentVariables->globalOffset == nullptr) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
uint32_t globalOffset[3] = {globalOffsetDesc->offsetX, globalOffsetDesc->offsetY, globalOffsetDesc->offsetZ};
result = mutableGlobalOffsetDesc->var->setValue(sizeof(globalOffset), 0, globalOffset);
result = currentVariables->globalOffset->setValue(sizeof(globalOffset), 0, globalOffset);
if (result != ZE_RESULT_SUCCESS) {
return result;
}
@@ -546,25 +518,17 @@ ze_result_t MutableCommandListImp::updateMutableCommandSignalEventExp(uint64_t c
if ((selectedAppend.mutationFlags & ZE_MUTABLE_COMMAND_EXP_FLAG_SIGNAL_EVENT) == 0) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
MutableVariableDescriptor *mutableSignalEventDesc = nullptr;
for (auto &mutableTypeDescriptor : selectedAppend.variables) {
if (mutableTypeDescriptor.varType != ZE_MUTABLE_COMMAND_EXP_FLAG_SIGNAL_EVENT) {
continue;
}
mutableSignalEventDesc = &mutableTypeDescriptor;
break;
}
if (mutableSignalEventDesc == nullptr) {
if (selectedAppend.variables.signalEvent.eventVariable == nullptr) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
auto inputEvent = Event::fromHandle(signalEvent);
if (mutableSignalEventDesc->signalEvent.event == inputEvent) {
if (selectedAppend.variables.signalEvent.event == inputEvent) {
return ZE_RESULT_SUCCESS;
}
auto ret = mutableSignalEventDesc->var->setValue(0, 0, inputEvent);
auto ret = selectedAppend.variables.signalEvent.eventVariable->setValue(0, 0, inputEvent);
if (ret == ZE_RESULT_SUCCESS) {
mutableSignalEventDesc->signalEvent.event = inputEvent;
selectedAppend.variables.signalEvent.event = inputEvent;
this->updatedCommandList = true;
}
return ret;
@@ -576,29 +540,21 @@ ze_result_t MutableCommandListImp::updateMutableCommandWaitEventsExp(uint64_t co
if ((selectedAppend.mutationFlags & ZE_MUTABLE_COMMAND_EXP_FLAG_WAIT_EVENTS) == 0) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
MutableVariableDescriptor *mutableWaitEventDesc = nullptr;
if (numWaitEvents > selectedAppend.variables.waitEvents.size()) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
for (uint32_t eventNum = 0; eventNum < numWaitEvents; eventNum++) {
for (auto &mutableTypeDescriptor : selectedAppend.variables) {
if (mutableTypeDescriptor.varType != ZE_MUTABLE_COMMAND_EXP_FLAG_WAIT_EVENTS || mutableTypeDescriptor.waitEvents.waitEventIndex != eventNum) {
continue;
}
mutableWaitEventDesc = &mutableTypeDescriptor;
break;
}
if (mutableWaitEventDesc == nullptr) {
return ZE_RESULT_ERROR_INVALID_ARGUMENT;
}
WaitEventVariableDescriptor &mutableWaitEventDesc = selectedAppend.variables.waitEvents[eventNum];
UNRECOVERABLE_IF(mutableWaitEventDesc.waitEventIndex != eventNum);
auto waitEventHandle = toInternalType(phWaitEvents[eventNum]);
auto inputEvent = Event::fromHandle(waitEventHandle);
if (mutableWaitEventDesc->waitEvents.event == inputEvent) {
if (mutableWaitEventDesc.event == inputEvent) {
continue;
}
auto retCode = mutableWaitEventDesc->var->setValue(0, 0, inputEvent);
auto retCode = mutableWaitEventDesc.eventVariable->setValue(0, 0, inputEvent);
if (retCode == ZE_RESULT_SUCCESS) {
mutableWaitEventDesc->waitEvents.event = inputEvent;
mutableWaitEventDesc.event = inputEvent;
this->updatedCommandList = true;
} else {
return retCode;
@@ -643,27 +599,29 @@ ze_result_t MutableCommandListImp::updateMutableCommandKernelsExp(uint32_t numKe
}
// remove old kernel arguments (buffers) from mutable residency and reset variables
{
for (auto &kernelVariableDescriptor : oldMutableKernel->getKernelVariables()) {
auto &varDescriptor = kernelVariableDescriptor.var->getDesc();
auto &kernelVariableDescriptors = oldMutableKernel->getKernelVariables();
for (auto &kernelArgVarDesc : kernelVariableDescriptors.kernelArguments) {
if (kernelArgVarDesc.kernelArgumentVariable == nullptr) {
continue;
}
auto &varDescriptor = kernelArgVarDesc.kernelArgumentVariable->getDesc();
if (varDescriptor.type == VariableType::buffer) {
if (varDescriptor.bufferAlloc != nullptr) {
removeFromResidencyContainer(varDescriptor.bufferAlloc);
}
kernelVariableDescriptor.var->resetBufferVariable();
kernelArgVarDesc.kernelArgumentVariable->resetBufferVariable();
}
if (varDescriptor.type == VariableType::slmBuffer) {
kernelVariableDescriptor.var->resetSlmVariable();
}
if (varDescriptor.type == VariableType::groupCount) {
kernelVariableDescriptor.var->resetGroupCountVariable();
}
if (varDescriptor.type == VariableType::groupSize) {
kernelVariableDescriptor.var->resetGroupSizeVariable();
}
if (varDescriptor.type == VariableType::globalOffset) {
kernelVariableDescriptor.var->resetGlobalOffsetVariable();
kernelArgVarDesc.kernelArgumentVariable->resetSlmVariable();
}
}
kernelVariableDescriptors.groupCount->resetGroupCountVariable();
if (kernelVariableDescriptors.groupSize != nullptr) {
kernelVariableDescriptors.groupSize->resetGroupSizeVariable();
}
if (kernelVariableDescriptors.globalOffset != nullptr) {
kernelVariableDescriptors.globalOffset->resetGlobalOffsetVariable();
}
}
// copy const data from host view into heap payload
@@ -703,9 +661,9 @@ ze_result_t MutableCommandListImp::updateMutableCommandKernelsExp(uint32_t numKe
// save new host view inline data/post sync into command buffer
newKernelComputeWalker->saveCpuBufferIntoGpuBuffer(false, true);
// update remaining variables (signal/wait events variables) with new compute walker to have correct reference for new post sync addresses
for (auto &mutableVariableDescriptor : selectedAppend.variables) {
mutableVariableDescriptor.var->updateMutableComputeWalker(newKernelComputeWalker);
// update remaining variables (signal event variable) with new compute walker to have correct reference for new post sync addresses
if (selectedAppend.variables.signalEvent.eventVariable != nullptr) {
selectedAppend.variables.signalEvent.eventVariable->updateMutableComputeWalker(newKernelComputeWalker);
}
// add new kernel to mutable residency

View File

@@ -75,11 +75,6 @@ struct MclAllocations {
};
struct AppendMutation {
AppendMutation() {
constexpr size_t estimatedKernelArgumentPerAppendCount = 40 + 2; // kernel args + 2 for group size and group count per kernel in kernel group
// reference to variables is used in append but NOT for kernel group (ISA mutation) and their descriptors
variables.reserve(estimatedKernelArgumentPerAppendCount);
}
MutationVariables variables;
MutableKernelGroup *kernelGroup = nullptr;
ze_mutable_command_exp_flags_t mutationFlags = 0;
@@ -168,11 +163,11 @@ struct MutableCommandListImp : public MutableCommandList {
void createNativeBinary(ArrayRef<const uint8_t> module);
KernelData *getKernelData(L0::Kernel *kernel);
// Selects the descriptor container that holds the kernel-related variables of an append:
// when the append owns a kernel group, the container of the group's current mutable kernel
// is returned; otherwise the container stored directly in the append is used.
MutationVariables *getVariableDescriptorContainer(AppendMutation &selectedAppend) {
KernelVariableDescriptor *getVariableDescriptorContainer(AppendMutation &selectedAppend) {
if (selectedAppend.kernelGroup != nullptr) {
return &selectedAppend.kernelGroup->getCurrentMutableKernel()->getKernelVariables();
} else {
return &selectedAppend.variables;
return &selectedAppend.variables.kernelVariables;
}
}

View File

@@ -23,18 +23,11 @@ namespace L0::MCL {
// Builds a MutableKernel for the given kernel handle: stores the inline data size and the
// maximum per-thread data size, resolves the L0::Kernel object from the handle and
// pre-reserves capacity in the kernel variable and residency snapshot containers.
MutableKernel::MutableKernel(ze_kernel_handle_t kernelHandle, uint32_t inlineDataSize, uint32_t maxPerThreadDataSize)
: inlineDataSize(inlineDataSize),
maxPerThreadDataSize(maxPerThreadDataSize) {
// group count/size
constexpr size_t estimatedDispatchVariablesCount = 2;
// kernel arguments
constexpr size_t estimatedKernelArgumentPerAppendCount = 40;
// kernel args and extra group count/size
constexpr size_t estimatedVariablesPerAppend = estimatedKernelArgumentPerAppendCount + estimatedDispatchVariablesCount;
// the kernel has to be resolved before the reserve below, which reads its explicit argument count
this->kernel = L0::Kernel::fromHandle(kernelHandle);
this->kernelVariables.kernelArguments.reserve(this->kernel->getKernelDescriptor().payloadMappings.explicitArgs.size());
// space for internal allocations like ISA, private, const, global buffers, etc.
constexpr size_t estimatedInternalResidencyCount = 10;
// reference to variables is used in append but for a given kernel for kernel group (ISA mutation) and their descriptors
this->kernelVariables.reserve(estimatedVariablesPerAppend);
this->kernelResidencySnapshotContainer.reserve(estimatedInternalResidencyCount);
this->kernel = L0::Kernel::fromHandle(kernelHandle);
}
uint32_t MutableKernel::getKernelScratchSize(uint32_t slotId) const {

View File

@@ -29,7 +29,7 @@ class MutableKernel {
MutableKernel(ze_kernel_handle_t kernelHandle, uint32_t inlineDataSize, uint32_t maxPerThreadDataSize);
// Returns a mutable reference to the variable descriptors owned by this kernel,
// so callers can both read and fill them.
MutationVariables &getKernelVariables() {
KernelVariableDescriptor &getKernelVariables() {
return kernelVariables;
}
@@ -82,7 +82,7 @@ class MutableKernel {
bool checkKernelCompatible();
protected:
MutationVariables kernelVariables;
KernelVariableDescriptor kernelVariables;
NEO::ResidencyContainer kernelResidencySnapshotContainer;
std::unique_ptr<MutableIndirectData> hostViewIndirectData;

View File

@@ -19,30 +19,33 @@ namespace L0::MCL {
struct Variable;
// Describes one explicit kernel argument slot of a mutable kernel.
struct KernelArgumentVariableDescriptor {
uint32_t argType = 0;
// variable backing this argument; stays nullptr when the argument was not captured as mutable,
// in which case an update request for it is rejected with ZE_RESULT_ERROR_INVALID_ARGUMENT
Variable *kernelArgumentVariable = nullptr;
// position of the argument in the kernel's explicit argument list; descriptors are stored
// at this same position in the kernelArguments vector
uint32_t argIndex = 0;
};
// Describes the mutable signal event of an append.
struct SignalEventVariableDescriptor {
// variable created for the signal event; nullptr means no mutable signal event was stored
Variable *eventVariable = nullptr;
// event currently assigned; replaced after a successful signal event update
Event *event = nullptr;
};
// Describes one mutable wait event of an append.
struct WaitEventVariableDescriptor {
// variable created for this wait event (set via Variable::setAsWaitEvent)
Variable *eventVariable = nullptr;
// event currently assigned; replaced after a successful wait event update
Event *event = nullptr;
// position of the event in the wait event list passed at append time;
// equals the descriptor's index in the waitEvents vector
uint32_t waitEventIndex = 0;
// number of packets to wait for: device partitions to wait for in-order counter-based events,
// Event::getPacketsToWait() otherwise
uint32_t waitEventPackets = 0;
};
// Generic descriptor of a single mutable variable: varType selects which union member is meaningful.
struct MutableVariableDescriptor {
Variable *var = nullptr;
union {
KernelArgumentVariableDescriptor kernelArguments;
SignalEventVariableDescriptor signalEvent;
WaitEventVariableDescriptor waitEvents;
};
ze_mutable_command_exp_flag_t varType;
// Kernel-related mutable variables with direct access per variable kind.
struct KernelVariableDescriptor {
// one entry per explicit kernel argument, stored at its argIndex; non-mutable arguments
// are kept as entries with a null variable to preserve index order
std::vector<KernelArgumentVariableDescriptor> kernelArguments;
// dispatch variables; each stays nullptr unless the matching mutation flag was requested
Variable *groupCount = nullptr;
Variable *groupSize = nullptr;
Variable *globalOffset = nullptr;
};
// Flat list form of the mutable variables of an append.
using MutationVariables = std::vector<MutableVariableDescriptor>;
// Structured form: all mutable variables recorded for a single append.
struct MutationVariables {
// group count / group size / global offset / kernel argument variables
KernelVariableDescriptor kernelVariables;
// signal event; its eventVariable is nullptr when no mutable signal event was stored
SignalEventVariableDescriptor signalEvent;
// wait events, stored in the order of their waitEventIndex
std::vector<WaitEventVariableDescriptor> waitEvents;
};
} // namespace L0::MCL