Add kernel algorithm to check whether any argument is using system memory

Related-To: NEO-6959

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz
2022-07-01 18:03:54 +00:00
committed by Compute-Runtime-Automation
parent 5a3a39a281
commit e07f9f0698
19 changed files with 798 additions and 293 deletions

View File

@ -4898,9 +4898,9 @@ cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel,
const void *argValue) {
TRACING_ENTER(ClSetKernelArgSvmPointer, &kernel, &argIndex, &argValue);
MultiDeviceKernel *pMultiDeviceKernel = nullptr;
MultiDeviceKernel *multiDeviceKernel = nullptr;
auto retVal = validateObjects(withCastToInternal(kernel, &pMultiDeviceKernel));
auto retVal = validateObjects(withCastToInternal(kernel, &multiDeviceKernel));
API_ENTER(&retVal);
if (CL_SUCCESS != retVal) {
@ -4908,27 +4908,27 @@ cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel,
return retVal;
}
if (argIndex >= pMultiDeviceKernel->getKernelArgsNumber()) {
if (argIndex >= multiDeviceKernel->getKernelArgsNumber()) {
retVal = CL_INVALID_ARG_INDEX;
TRACING_EXIT(ClSetKernelArgSvmPointer, &retVal);
return retVal;
}
const auto svmManager = pMultiDeviceKernel->getContext().getSVMAllocsManager();
const auto svmManager = multiDeviceKernel->getContext().getSVMAllocsManager();
if (argValue != nullptr) {
if (pMultiDeviceKernel->getKernelArguments()[argIndex].allocId > 0 &&
pMultiDeviceKernel->getKernelArguments()[argIndex].value == argValue) {
if (multiDeviceKernel->getKernelArguments()[argIndex].allocId > 0 &&
multiDeviceKernel->getKernelArguments()[argIndex].value == argValue) {
bool reuseFromCache = false;
const auto allocationsCounter = svmManager->allocationsCounter.load();
if (allocationsCounter > 0) {
if (allocationsCounter == pMultiDeviceKernel->getKernelArguments()[argIndex].allocIdMemoryManagerCounter) {
if (allocationsCounter == multiDeviceKernel->getKernelArguments()[argIndex].allocIdMemoryManagerCounter) {
reuseFromCache = true;
} else {
const auto svmData = svmManager->getSVMAlloc(argValue);
if (svmData && pMultiDeviceKernel->getKernelArguments()[argIndex].allocId == svmData->getAllocId()) {
if (svmData && multiDeviceKernel->getKernelArguments()[argIndex].allocId == svmData->getAllocId()) {
reuseFromCache = true;
pMultiDeviceKernel->storeKernelArgAllocIdMemoryManagerCounter(argIndex, allocationsCounter);
multiDeviceKernel->storeKernelArgAllocIdMemoryManagerCounter(argIndex, allocationsCounter);
}
}
if (reuseFromCache) {
@ -4938,7 +4938,7 @@ cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel,
}
}
} else {
if (pMultiDeviceKernel->getKernelArguments()[argIndex].isSetToNullptr) {
if (multiDeviceKernel->getKernelArguments()[argIndex].isSetToNullptr) {
TRACING_EXIT(ClSetKernelArgSvmPointer, &retVal);
return CL_SUCCESS;
}
@ -4946,7 +4946,7 @@ cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel,
DBG_LOG_INPUTS("kernel", kernel, "argIndex", argIndex, "argValue", argValue);
for (const auto &pDevice : pMultiDeviceKernel->getDevices()) {
for (const auto &pDevice : multiDeviceKernel->getDevices()) {
const HardwareInfo &hwInfo = pDevice->getHardwareInfo();
if (!hwInfo.capabilityTable.ftrSvm) {
retVal = CL_INVALID_OPERATION;
@ -4955,8 +4955,8 @@ cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel,
}
}
for (const auto &pDevice : pMultiDeviceKernel->getDevices()) {
auto pKernel = pMultiDeviceKernel->getKernel(pDevice->getRootDeviceIndex());
for (const auto &pDevice : multiDeviceKernel->getDevices()) {
auto pKernel = multiDeviceKernel->getKernel(pDevice->getRootDeviceIndex());
cl_int kernelArgAddressQualifier = asClKernelArgAddressQualifier(pKernel->getKernelInfo()
.kernelDescriptor.payloadMappings.explicitArgs[argIndex]
.getTraits()
@ -4969,12 +4969,12 @@ cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel,
}
}
MultiGraphicsAllocation *pSvmAllocs = nullptr;
MultiGraphicsAllocation *svmAllocs = nullptr;
uint32_t allocId = 0u;
if (argValue != nullptr) {
auto svmData = svmManager->getSVMAlloc(argValue);
if (svmData == nullptr) {
for (const auto &pDevice : pMultiDeviceKernel->getDevices()) {
for (const auto &pDevice : multiDeviceKernel->getDevices()) {
if (!pDevice->areSharedSystemAllocationsAllowed()) {
retVal = CL_INVALID_ARG_VALUE;
TRACING_EXIT(ClSetKernelArgSvmPointer, &retVal);
@ -4982,12 +4982,12 @@ cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel,
}
}
} else {
pSvmAllocs = &svmData->gpuAllocations;
svmAllocs = &svmData->gpuAllocations;
allocId = svmData->getAllocId();
}
}
retVal = pMultiDeviceKernel->setArgSvmAlloc(argIndex, const_cast<void *>(argValue), pSvmAllocs, allocId);
retVal = multiDeviceKernel->setArgSvmAlloc(argIndex, const_cast<void *>(argValue), svmAllocs, allocId);
TRACING_EXIT(ClSetKernelArgSvmPointer, &retVal);
return retVal;
}

View File

@ -447,6 +447,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
dispatchWalkerArgs.timestampPacketDependencies = &timestampPacketDependencies;
dispatchWalkerArgs.currentTimestampPacketNodes = timestampPacketContainer.get();
dispatchWalkerArgs.commandType = commandType;
dispatchWalkerArgs.event = event;
HardwareInterface<GfxFamily>::dispatchWalker(
*this,

View File

@ -16,6 +16,7 @@ namespace NEO {
class CommandQueue;
class DispatchInfo;
class Event;
class IndirectHeap;
class Kernel;
class LinearStream;
@ -37,6 +38,7 @@ struct HardwareInterfaceWalkerArgs {
const Vec3<size_t> *numberOfWorkgroups = nullptr;
const Vec3<size_t> *startOfWorkgroups = nullptr;
KernelOperation *blockedCommandsData = nullptr;
Event *event = nullptr;
size_t currentDispatchIndex = 0;
size_t offsetInterfaceDescriptorTable = 0;
PreemptionMode preemptionMode = PreemptionMode::Initial;

View File

@ -362,7 +362,7 @@ cl_int Kernel::cloneKernel(Kernel *pSourceKernel) {
break;
case SVM_OBJ:
setArgSvm(i, pSourceKernel->getKernelArgInfo(i).size, const_cast<void *>(pSourceKernel->getKernelArgInfo(i).value),
pSourceKernel->getKernelArgInfo(i).pSvmAlloc, pSourceKernel->getKernelArgInfo(i).svmFlags);
pSourceKernel->getKernelArgInfo(i).svmAllocation, pSourceKernel->getKernelArgInfo(i).svmFlags);
break;
case SVM_ALLOC_OBJ:
setArgSvmAlloc(i, const_cast<void *>(pSourceKernel->getKernelArgInfo(i).value),
@ -881,8 +881,10 @@ cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, G
patchedArgumentsNum++;
kernelArguments[argIndex].isPatched = true;
}
if (svmPtr != nullptr) {
this->anyKernelArgumentUsingSystemMemory |= true;
}
addAllocationToCacheFlushVector(argIndex, svmAlloc);
return CL_SUCCESS;
}
@ -894,6 +896,8 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio
auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
patchWithRequiredSize(patchLocation, argAsPtr.pointerSize, reinterpret_cast<uintptr_t>(svmPtr));
auto &kernelArgInfo = kernelArguments[argIndex];
bool disableL3 = false;
bool forceNonAuxMode = false;
bool isAuxTranslationKernel = (AuxTranslationDirection::None != auxTranslationDirection);
@ -910,7 +914,7 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio
forceNonAuxMode = true;
}
bool argWasUncacheable = kernelArguments[argIndex].isStatelessUncacheable;
bool argWasUncacheable = kernelArgInfo.isStatelessUncacheable;
bool argIsUncacheable = svmAlloc ? svmAlloc->isUncacheable() : false;
statelessUncacheableArgsCount += (argIsUncacheable ? 1 : 0) - (argWasUncacheable ? 1 : 0);
@ -929,15 +933,21 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio
}
storeKernelArg(argIndex, SVM_ALLOC_OBJ, svmAlloc, svmPtr, sizeof(uintptr_t));
kernelArguments[argIndex].allocId = allocId;
kernelArguments[argIndex].allocIdMemoryManagerCounter = allocId ? this->getContext().getSVMAllocsManager()->allocationsCounter.load() : 0u;
kernelArguments[argIndex].isSetToNullptr = nullptr == svmPtr;
if (!kernelArguments[argIndex].isPatched) {
kernelArgInfo.allocId = allocId;
kernelArgInfo.allocIdMemoryManagerCounter = allocId ? this->getContext().getSVMAllocsManager()->allocationsCounter.load() : 0u;
kernelArgInfo.isSetToNullptr = nullptr == svmPtr;
if (!kernelArgInfo.isPatched) {
patchedArgumentsNum++;
kernelArguments[argIndex].isPatched = true;
kernelArgInfo.isPatched = true;
}
if (!kernelArgInfo.isSetToNullptr) {
if (svmAlloc != nullptr) {
this->anyKernelArgumentUsingSystemMemory |= graphicsAllocationTypeUseSystemMemory(svmAlloc->getAllocationType());
} else {
this->anyKernelArgumentUsingSystemMemory |= true;
}
}
addAllocationToCacheFlushVector(argIndex, svmAlloc);
return CL_SUCCESS;
}
@ -948,7 +958,7 @@ void Kernel::storeKernelArg(uint32_t argIndex, kernelArgType argType, void *argO
kernelArguments[argIndex].object = argObject;
kernelArguments[argIndex].value = argValue;
kernelArguments[argIndex].size = argSize;
kernelArguments[argIndex].pSvmAlloc = argSvmAlloc;
kernelArguments[argIndex].svmAllocation = argSvmAlloc;
kernelArguments[argIndex].svmFlags = argSvmFlags;
}
@ -1391,8 +1401,12 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
storeKernelArg(argIndex, BUFFER_OBJ, clMemObj, argVal, argSize);
auto buffer = castToObject<Buffer>(clMemObj);
if (!buffer)
if (!buffer) {
return CL_INVALID_MEM_OBJECT;
}
auto gfxAllocationType = buffer->getGraphicsAllocation(rootDeviceIndex)->getAllocationType();
this->anyKernelArgumentUsingSystemMemory |= graphicsAllocationTypeUseSystemMemory(gfxAllocationType);
if (buffer->peekSharingHandler()) {
usingSharedObjArgs = true;
@ -1449,7 +1463,6 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
}
addAllocationToCacheFlushVector(argIndex, allocationForCacheFlush);
return CL_SUCCESS;
} else {
storeKernelArg(argIndex, BUFFER_OBJ, nullptr, argVal, argSize);
@ -2237,4 +2250,11 @@ int Kernel::setKernelThreadArbitrationPolicy(uint32_t policy) {
return CL_SUCCESS;
}
// Returns true when the given allocation type is backed by system (host)
// memory rather than device-local memory; used to track whether any kernel
// argument resides in system memory.
bool Kernel::graphicsAllocationTypeUseSystemMemory(AllocationType type) {
    switch (type) {
    case AllocationType::BUFFER_HOST_MEMORY:
    case AllocationType::EXTERNAL_HOST_PTR:
    case AllocationType::SVM_CPU:
    case AllocationType::SVM_ZERO_COPY:
        return true;
    default:
        return false;
    }
}
} // namespace NEO

View File

@ -59,16 +59,16 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
};
struct SimpleKernelArgInfo {
kernelArgType type;
cl_mem_flags svmFlags;
void *object;
const void *value;
size_t size;
GraphicsAllocation *pSvmAlloc;
cl_mem_flags svmFlags;
bool isPatched = false;
bool isStatelessUncacheable = false;
GraphicsAllocation *svmAllocation;
kernelArgType type;
uint32_t allocId;
uint32_t allocIdMemoryManagerCounter;
bool isPatched = false;
bool isStatelessUncacheable = false;
bool isSetToNullptr = false;
};
@ -405,89 +405,11 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
const std::map<uint32_t, MemObj *> &getMemObjectsToMigrate() const { return migratableArgsMap; }
ImplicitArgs *getImplicitArgs() const { return pImplicitArgs.get(); }
const HardwareInfo &getHardwareInfo() const;
bool isAnyKernelArgumentUsingSystemMemory() const {
return anyKernelArgumentUsingSystemMemory;
}
protected:
void
makeArgsResident(CommandStreamReceiver &commandStreamReceiver);
void *patchBufferOffset(const ArgDescPointer &argAsPtr, void *svmPtr, GraphicsAllocation *svmAlloc);
void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg);
Kernel(Program *programArg, const KernelInfo &kernelInfo, ClDevice &clDevice);
void provideInitializationHints();
void markArgPatchedAndResolveArgs(uint32_t argIndex);
void resolveArgs();
void reconfigureKernel();
bool hasDirectStatelessAccessToSharedBuffer() const;
bool hasDirectStatelessAccessToHostMemory() const;
bool hasIndirectStatelessAccessToHostMemory() const;
void addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation);
bool allocationForCacheFlush(GraphicsAllocation *argAllocation) const;
const ClDevice &getDevice() const {
return clDevice;
}
cl_int patchPrivateSurface();
bool containsStatelessWrites = true;
const ExecutionEnvironment &executionEnvironment;
Program *program;
ClDevice &clDevice;
const KernelInfo &kernelInfo;
std::vector<SimpleKernelArgInfo> kernelArguments;
std::vector<KernelArgHandler> kernelArgHandlers;
std::vector<GraphicsAllocation *> kernelSvmGfxAllocations;
std::vector<GraphicsAllocation *> kernelUnifiedMemoryGfxAllocations;
AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None;
bool usingSharedObjArgs = false;
bool usingImages = false;
bool usingImagesOnly = false;
bool auxTranslationRequired = false;
uint32_t patchedArgumentsNum = 0;
uint32_t startOffset = 0;
uint32_t statelessUncacheableArgsCount = 0;
int32_t threadArbitrationPolicy = ThreadArbitrationPolicy::NotPresent;
KernelExecutionType executionType = KernelExecutionType::Default;
std::vector<PatchInfoData> patchInfoDataList;
std::unique_ptr<ImageTransformer> imageTransformer;
std::map<uint32_t, MemObj *> migratableArgsMap{};
bool specialPipelineSelectMode = false;
bool svmAllocationsRequireCacheFlush = false;
std::vector<GraphicsAllocation *> kernelArgRequiresCacheFlush;
UnifiedMemoryControls unifiedMemoryControls{};
bool isUnifiedMemorySyncRequired = true;
bool debugEnabled = false;
uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::DisableOverdispatch;
uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation;
uint32_t maxKernelWorkGroupSize = 0;
uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation;
uint32_t *parentEventOffset = &Kernel::dummyPatchLocation;
uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation;
size_t numberOfBindingTableStates = 0u;
size_t localBindingTableOffset = 0u;
std::vector<size_t> slmSizes;
uint32_t slmTotalSize = 0u;
std::unique_ptr<char[]> pSshLocal;
uint32_t sshLocalSize = 0u;
char *crossThreadData = nullptr;
uint32_t crossThreadDataSize = 0u;
GraphicsAllocation *privateSurface = nullptr;
uint64_t privateSurfaceSize = 0u;
struct KernelConfig {
Vec3<size_t> gws;
Vec3<size_t> lws;
@ -523,15 +445,98 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
bool singleSubdevicePreferred = false;
};
Kernel(Program *programArg, const KernelInfo &kernelInfo, ClDevice &clDevice);
void makeArgsResident(CommandStreamReceiver &commandStreamReceiver);
void *patchBufferOffset(const ArgDescPointer &argAsPtr, void *svmPtr, GraphicsAllocation *svmAlloc);
void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg);
void provideInitializationHints();
void markArgPatchedAndResolveArgs(uint32_t argIndex);
void resolveArgs();
void reconfigureKernel();
bool hasDirectStatelessAccessToSharedBuffer() const;
bool hasDirectStatelessAccessToHostMemory() const;
bool hasIndirectStatelessAccessToHostMemory() const;
void addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation);
bool allocationForCacheFlush(GraphicsAllocation *argAllocation) const;
const ClDevice &getDevice() const {
return clDevice;
}
cl_int patchPrivateSurface();
bool hasTunningFinished(KernelSubmissionData &submissionData);
bool hasRunFinished(TimestampPacketContainer *timestampContainer);
bool graphicsAllocationTypeUseSystemMemory(AllocationType type);
UnifiedMemoryControls unifiedMemoryControls{};
std::map<uint32_t, MemObj *> migratableArgsMap{};
std::unordered_map<KernelConfig, KernelSubmissionData, KernelConfigHash> kernelSubmissionMap;
bool singleSubdevicePreferredInCurrentEnqueue = false;
bool kernelHasIndirectAccess = true;
MultiDeviceKernel *pMultiDeviceKernel = nullptr;
std::vector<SimpleKernelArgInfo> kernelArguments;
std::vector<KernelArgHandler> kernelArgHandlers;
std::vector<GraphicsAllocation *> kernelSvmGfxAllocations;
std::vector<GraphicsAllocation *> kernelUnifiedMemoryGfxAllocations;
std::vector<PatchInfoData> patchInfoDataList;
std::vector<GraphicsAllocation *> kernelArgRequiresCacheFlush;
std::vector<size_t> slmSizes;
std::unique_ptr<ImageTransformer> imageTransformer;
std::unique_ptr<char[]> pSshLocal;
std::unique_ptr<ImplicitArgs> pImplicitArgs = nullptr;
uint64_t privateSurfaceSize = 0u;
size_t numberOfBindingTableStates = 0u;
size_t localBindingTableOffset = 0u;
const ExecutionEnvironment &executionEnvironment;
Program *program;
ClDevice &clDevice;
const KernelInfo &kernelInfo;
GraphicsAllocation *privateSurface = nullptr;
MultiDeviceKernel *pMultiDeviceKernel = nullptr;
uint32_t *maxWorkGroupSizeForCrossThreadData = &Kernel::dummyPatchLocation;
uint32_t *dataParameterSimdSize = &Kernel::dummyPatchLocation;
uint32_t *parentEventOffset = &Kernel::dummyPatchLocation;
uint32_t *preferredWkgMultipleOffset = &Kernel::dummyPatchLocation;
char *crossThreadData = nullptr;
AuxTranslationDirection auxTranslationDirection = AuxTranslationDirection::None;
KernelExecutionType executionType = KernelExecutionType::Default;
int32_t threadArbitrationPolicy = ThreadArbitrationPolicy::NotPresent;
uint32_t patchedArgumentsNum = 0;
uint32_t startOffset = 0;
uint32_t statelessUncacheableArgsCount = 0;
uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::DisableOverdispatch;
uint32_t maxKernelWorkGroupSize = 0;
uint32_t slmTotalSize = 0u;
uint32_t sshLocalSize = 0u;
uint32_t crossThreadDataSize = 0u;
bool containsStatelessWrites = true;
bool usingSharedObjArgs = false;
bool usingImages = false;
bool usingImagesOnly = false;
bool auxTranslationRequired = false;
bool specialPipelineSelectMode = false;
bool svmAllocationsRequireCacheFlush = false;
bool isUnifiedMemorySyncRequired = true;
bool debugEnabled = false;
bool singleSubdevicePreferredInCurrentEnqueue = false;
bool kernelHasIndirectAccess = true;
bool anyKernelArgumentUsingSystemMemory = false;
};
} // namespace NEO