mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 14:55:24 +08:00
Improve zeCommandListAppendMemoryFill Performance (2)
Add the missing remainder kernel launch when the pattern size is 1.

Signed-off-by: Jaime Arteaga <jaime.a.arteaga.molina@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
a0db607083
commit
26b036ab97
@@ -1182,12 +1182,25 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
|
||||
|
||||
uint32_t groups = static_cast<uint32_t>(size) / groupSizeX;
|
||||
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
|
||||
res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(builtinFunction->toHandle(),
|
||||
&dispatchFuncArgs, nullptr,
|
||||
0, nullptr);
|
||||
res = appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent);
|
||||
if (res) {
|
||||
return res;
|
||||
}
|
||||
|
||||
uint32_t groupRemainderSizeX = static_cast<uint32_t>(size) % groupSizeX;
|
||||
if (groupRemainderSizeX) {
|
||||
builtinFunction->setGroupSize(groupRemainderSizeX, 1u, 1u);
|
||||
ze_group_count_t dispatchFuncRemainderArgs{1u, 1u, 1u};
|
||||
|
||||
size_t dstOffset = dstAllocation.offset + (size - groupRemainderSizeX);
|
||||
builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
|
||||
builtinFunction->setArgumentValue(1, sizeof(dstOffset), &dstOffset);
|
||||
|
||||
res = appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncRemainderArgs, hSignalEvent);
|
||||
if (res) {
|
||||
return res;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user