mirror of https://github.com/intel/compute-runtime.git (synced 2026-01-03 23:03:02 +08:00)
Improve zeCommandListAppendMemoryFill Performance
Improve L0 fill operations by copying the pattern using two kernels: one that copies four bytes at a time, and one that takes care of the remainder. Additionally, a new allocation is created so that the pattern fills at least a cacheline.

Signed-off-by: Jaime Arteaga <jaime.a.arteaga.molina@intel.com>
committed by Compute-Runtime-Automation
parent 444b9594af
commit 479d01c118
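The commit message describes the new fill path: a "middle" kernel stores the pattern four bytes per work item, a "right leftover" kernel finishes the trailing bytes, and the pattern is staged in an allocation padded to at least a cacheline. The host-side sketch below models that split on the CPU; it is illustrative only, not the driver code. splitFill, kCacheline, and the modulo indexing are assumptions of the sketch, which also assumes the Level Zero rule that pattern sizes are powers of two (so the padded buffer holds whole pattern repeats) and that dst is 4-byte aligned.

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

constexpr size_t kCacheline = 64; // assumed cacheline size for this sketch

// Illustrative CPU model of the split: a 4-byte "middle" pass plus a
// byte-wise "right leftover" pass, with the pattern replicated into a
// buffer padded to a cacheline (stand-in for the FILL_PATTERN allocation).
void splitFill(uint8_t *dst, size_t size, const uint8_t *pattern, size_t patternSize) {
    const size_t patternAllocationSize = ((patternSize + kCacheline - 1) / kCacheline) * kCacheline;
    std::vector<uint8_t> staged(patternAllocationSize);
    for (size_t off = 0; off < patternAllocationSize; off += patternSize) {
        std::memcpy(staged.data() + off, pattern, std::min(patternSize, patternAllocationSize - off));
    }

    const size_t elSize = sizeof(uint32_t);
    const size_t middleElems = size / elSize;        // elements covered by the "middle" pass
    const size_t patternElems = patternAllocationSize / elSize;
    auto *dst32 = reinterpret_cast<uint32_t *>(dst); // sketch assumes 4-byte aligned dst
    const auto *src32 = reinterpret_cast<const uint32_t *>(staged.data());
    for (size_t i = 0; i < middleElems; ++i) {
        dst32[i] = src32[i % patternElems];          // one 4-byte store per "work item"
    }

    for (size_t i = middleElems * elSize; i < size; ++i) {
        dst[i] = pattern[i % patternSize];           // leftover bytes, one at a time
    }
}

On the GPU the two passes become two kernel launches (the FillBufferMiddle and FillBufferRightLeftover builtins added below), so the bulk of the buffer is written with dword stores instead of byte stores.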
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@ enum class Builtin : uint32_t {
CopyBufferToBufferSide,
FillBufferImmediate,
FillBufferSSHOffset,
FillBufferMiddle,
FillBufferRightLeftover,
QueryKernelTimestamps,
QueryKernelTimestampsWithOffsets,
COUNT
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -49,6 +49,14 @@ void BuiltinFunctionsLibImpl::initFunctions() {
builtinName = "FillBufferSSHOffset";
builtin = NEO::EBuiltInOps::FillBuffer;
break;
case Builtin::FillBufferMiddle:
builtinName = "FillBufferMiddle";
builtin = NEO::EBuiltInOps::FillBuffer;
break;
case Builtin::FillBufferRightLeftover:
builtinName = "FillBufferRightLeftover";
builtin = NEO::EBuiltInOps::FillBuffer;
break;
case Builtin::QueryKernelTimestamps:
builtinName = "QueryKernelTimestamps";
builtin = NEO::EBuiltInOps::QueryKernelTimestamps;
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2020 Intel Corporation
* Copyright (C) 2019-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -190,6 +190,9 @@ struct CommandList : _ze_command_list_handle_t {
virtual ~CommandList();
NEO::CommandContainer commandContainer;
bool getContainsStatelessUncachedResource() { return containsStatelessUncachedResource; }
std::map<const void *, NEO::GraphicsAllocation *> &getHostPtrMap() {
return hostPtrMap;
};

protected:
std::map<const void *, NEO::GraphicsAllocation *> hostPtrMap;
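The getHostPtrMap()/hostPtrMap members added here let the command list remember the graphics allocation created for a given host pattern pointer and reuse it when the same pattern is filled again, instead of allocating on every append. A minimal sketch of that lookup-or-create idiom follows; Allocation and PatternCache are placeholders invented for the sketch (the driver uses NEO::GraphicsAllocation and its memory manager, and also manages allocation lifetime, which is omitted here).

#include <cstddef>
#include <map>

struct Allocation {   // placeholder for NEO::GraphicsAllocation
    size_t size;
};

struct PatternCache {
    std::map<const void *, Allocation *> hostPtrMap;

    // Reuse the allocation previously created for this host pattern pointer,
    // or create one and remember it for later fills.
    Allocation *getOrCreate(const void *pattern, size_t allocationSize) {
        auto it = hostPtrMap.find(pattern);
        if (it != hostPtrMap.end()) {
            return it->second;
        }
        auto *alloc = new Allocation{allocationSize}; // stand-in for allocateGraphicsMemoryWithProperties()
        hostPtrMap.insert(std::make_pair(pattern, alloc));
        return alloc;
    }
};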
@@ -199,11 +199,11 @@ struct CommandListCoreFamily : CommandListImp {
size_t bytesPerPixel, Vec3<size_t> copySize,
Vec3<uint32_t> srcSize, Vec3<uint32_t> dstSize, ze_event_handle_t hSignalEvent);

ze_result_t appendLaunchKernelWithParams(ze_kernel_handle_t hKernel,
const ze_group_count_t *pThreadGroupDimensions,
ze_event_handle_t hEvent,
bool isIndirect,
bool isPredicate);
MOCKABLE_VIRTUAL ze_result_t appendLaunchKernelWithParams(ze_kernel_handle_t hKernel,
const ze_group_count_t *pThreadGroupDimensions,
ze_event_handle_t hEvent,
bool isIndirect,
bool isPredicate);
ze_result_t appendLaunchKernelSplit(ze_kernel_handle_t hKernel, const ze_group_count_t *pThreadGroupDimensions, ze_event_handle_t hEvent);
ze_result_t prepareIndirectParams(const ze_group_count_t *pThreadGroupDimensions);
@@ -1135,9 +1135,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
return appendBlitFill(ptr, pattern, patternSize, size, hSignalEvent, numWaitEvents, phWaitEvents);
}

ze_result_t ret = addEventsToCmdList(numWaitEvents, phWaitEvents);
if (ret) {
return ret;
ze_result_t res = addEventsToCmdList(numWaitEvents, phWaitEvents);
if (res) {
return res;
}

using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
@@ -1159,20 +1159,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
}

auto dstAllocation = this->getAlignedAllocation(this->device, ptr, size);

uintptr_t srcPtr = reinterpret_cast<uintptr_t>(const_cast<void *>(pattern));
size_t srcOffset = 0;
NEO::EncodeSurfaceState<GfxFamily>::getSshAlignedPointer(srcPtr, srcOffset);

auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership();

Kernel *builtinFunction = nullptr;
uint32_t groupSizeX = 1u;

if (patternSize == 1) {
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediate);
auto builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferImmediate);

groupSizeX = builtinFunction->getImmutableData()->getDescriptor().kernelAttributes.simdSize;
uint32_t groupSizeX = builtinFunction->getImmutableData()->getDescriptor().kernelAttributes.simdSize;
if (groupSizeX > static_cast<uint32_t>(size)) {
groupSizeX = static_cast<uint32_t>(size);
}
@@ -1186,50 +1178,92 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
builtinFunction->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset);
builtinFunction->setArgumentValue(2, sizeof(value), &value);

appendEventForProfilingAllWalkers(hSignalEvent, true);

uint32_t groups = static_cast<uint32_t>(size) / groupSizeX;
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernel(builtinFunction->toHandle(),
&dispatchFuncArgs, nullptr,
0, nullptr);
if (res) {
return res;
}
} else {
builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferSSHOffset);
auto builtinFunction = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferMiddle);

auto patternAlloc = this->getAlignedAllocation(this->device, reinterpret_cast<void *>(srcPtr), srcOffset + patternSize);
if (patternAlloc.alloc == nullptr) {
DEBUG_BREAK_IF(true);
return ZE_RESULT_ERROR_UNKNOWN;
}
srcOffset += patternAlloc.offset;
size_t middleElSize = sizeof(uint32_t);
size_t adjustedSize = size / middleElSize;
uint32_t groupSizeX = static_cast<uint32_t>(adjustedSize);
uint32_t groupSizeY = 1, groupSizeZ = 1;
builtinFunction->suggestGroupSize(groupSizeX, groupSizeY, groupSizeZ, &groupSizeX, &groupSizeY, &groupSizeZ);
builtinFunction->setGroupSize(groupSizeX, groupSizeY, groupSizeZ);

groupSizeX = static_cast<uint32_t>(std::min(patternSize, size));
if (builtinFunction->setGroupSize(groupSizeX, 1u, 1u)) {
DEBUG_BREAK_IF(true);
return ZE_RESULT_ERROR_UNKNOWN;
uint32_t groups = static_cast<uint32_t>(adjustedSize) / groupSizeX;
uint32_t groupRemainderSizeX = static_cast<uint32_t>(size) % groupSizeX;

size_t patternAllocationSize = alignUp(patternSize, MemoryConstants::cacheLineSize);
uint32_t patternSizeInEls = static_cast<uint32_t>(patternAllocationSize / middleElSize);

auto patternGfxAlloc = getAllocationFromHostPtrMap(pattern, patternAllocationSize);
if (patternGfxAlloc == nullptr) {
patternGfxAlloc = device->getDriverHandle()->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getNEODevice()->getRootDeviceIndex(),
patternAllocationSize,
NEO::GraphicsAllocation::AllocationType::FILL_PATTERN,
device->getNEODevice()->getDeviceBitfield()});
hostPtrMap.insert(std::make_pair(pattern, patternGfxAlloc));
}
void *patternGfxAllocPtr = patternGfxAlloc->getUnderlyingBuffer();

uint64_t patternAllocPtr = reinterpret_cast<uintptr_t>(patternGfxAllocPtr);
uint64_t patternAllocOffset = 0;
uint64_t patternSizeToCopy = patternSize;
do {
memcpy_s(reinterpret_cast<void *>(patternAllocPtr + patternAllocOffset),
patternSizeToCopy, pattern, patternSizeToCopy);

if ((patternAllocOffset + patternSizeToCopy) > patternAllocationSize) {
patternSizeToCopy = patternAllocationSize - patternAllocOffset;
}

patternAllocOffset += patternSizeToCopy;
} while (patternAllocOffset < patternAllocationSize);

builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
builtinFunction->setArgumentValue(1, sizeof(dstAllocation.offset), &dstAllocation.offset);
builtinFunction->setArgBufferWithAlloc(2, patternAlloc.alignedAllocationPtr,
patternAlloc.alloc);
builtinFunction->setArgumentValue(3, sizeof(srcOffset), &srcOffset);
}
builtinFunction->setArgBufferWithAlloc(2, reinterpret_cast<uintptr_t>(patternGfxAllocPtr), patternGfxAlloc);
builtinFunction->setArgumentValue(3, sizeof(patternSizeInEls), &patternSizeInEls);

appendEventForProfilingAllWalkers(hSignalEvent, true);
appendEventForProfilingAllWalkers(hSignalEvent, true);

uint32_t groups = static_cast<uint32_t>(size) / groupSizeX;
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
ze_group_count_t dispatchFuncArgs{groups, 1u, 1u};
res = appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent);
if (res) {
return res;
}

ze_result_t res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent);
if (groupRemainderSizeX) {
uint32_t dstOffsetRemainder = groups * groupSizeX * static_cast<uint32_t>(middleElSize);
uint64_t patternOffsetRemainder = (groupSizeX * groups & (patternSizeInEls - 1)) * middleElSize;

if (res) {
return res;
}
auto builtinFunctionRemainder = device->getBuiltinFunctionsLib()->getFunction(Builtin::FillBufferRightLeftover);
builtinFunctionRemainder->setGroupSize(groupRemainderSizeX, 1u, 1u);
ze_group_count_t dispatchFuncArgs{1u, 1u, 1u};

uint32_t groupRemainderSizeX = static_cast<uint32_t>(size) % groupSizeX;
if (groupRemainderSizeX) {
builtinFunction->setGroupSize(groupRemainderSizeX, 1u, 1u);
ze_group_count_t dispatchFuncArgs{1u, 1u, 1u};

size_t dstOffset = dstAllocation.offset + (size - groupRemainderSizeX);
builtinFunction->setArgBufferWithAlloc(0, dstAllocation.alignedAllocationPtr, dstAllocation.alloc);
builtinFunction->setArgumentValue(1, sizeof(dstOffset), &dstOffset);

res = CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelSplit(builtinFunction->toHandle(), &dispatchFuncArgs, hSignalEvent);
builtinFunctionRemainder->setArgBufferWithAlloc(0,
dstAllocation.alignedAllocationPtr,
dstAllocation.alloc);
builtinFunctionRemainder->setArgumentValue(1,
sizeof(dstOffsetRemainder),
&dstOffsetRemainder);
builtinFunctionRemainder->setArgBufferWithAlloc(2,
reinterpret_cast<uintptr_t>(patternGfxAllocPtr) + patternOffsetRemainder,
patternGfxAlloc);
builtinFunctionRemainder->setArgumentValue(3, sizeof(patternSizeInEls), &patternSizeInEls);
res = appendLaunchKernelSplit(builtinFunctionRemainder->toHandle(), &dispatchFuncArgs, hSignalEvent);
if (res) {
return res;
}
}
}

appendEventForProfilingAllWalkers(hSignalEvent, false);
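One detail of the hunk above worth calling out: the remainder kernel's pattern offset is computed as (groupSizeX * groups & (patternSizeInEls - 1)) * middleElSize, i.e. a modulo done with a bitmask. That only works if patternSizeInEls is a power of two; since the pattern allocation is padded to a cacheline and Level Zero pattern sizes are powers of two, that should hold (an assumption of this sketch, not something the hunk checks explicitly). The values in the standalone check below are made up for illustration.

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t patternSizeInEls = 16;  // e.g. 64-byte pattern allocation / 4-byte elements
    // For power-of-two n, (x & (n - 1)) == (x % n).
    for (uint32_t x = 0; x < 100; ++x) {
        assert((x & (patternSizeInEls - 1)) == (x % patternSizeInEls));
    }
    // Same form as patternOffsetRemainder in the hunk, with example dispatch values.
    const uint32_t groupSizeX = 32, groups = 7;
    const uint64_t patternOffsetRemainder =
        (groupSizeX * groups & (patternSizeInEls - 1)) * sizeof(uint32_t);
    std::printf("patternOffsetRemainder = %llu bytes\n",
                static_cast<unsigned long long>(patternOffsetRemainder)); // prints 0 here
    return 0;
}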
@@ -1488,7 +1522,7 @@ void CommandListCoreFamily<gfxCoreFamily>::appendEventForProfiling(ze_event_hand
appendWriteKernelTimestamp(hEvent, beforeWalker, true);
} else {

NEO::PipeControlArgs args;
NEO::PipeControlArgs args = {};
args.dcFlushEnable = true;

NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);