Store SSH per root device in Kernel

Related-To: NEO-5001
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2020-11-23 18:01:38 +00:00
committed by Compute-Runtime-Automation
parent 52d96af5f0
commit 7ec69c33f9
40 changed files with 231 additions and 220 deletions

View File

@@ -140,7 +140,7 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic
uint32_t sshOffset = patch.SurfaceStateHeapOffset;
auto rootDeviceIndex = allocation.getRootDeviceIndex();
void *crossThreadData = getCrossThreadData(rootDeviceIndex);
void *ssh = getSurfaceStateHeap();
void *ssh = getSurfaceStateHeap(rootDeviceIndex);
if (crossThreadData != nullptr) {
auto pp = ptrOffset(crossThreadData, crossThreadDataOffset);
uintptr_t addressToPatch = reinterpret_cast<uintptr_t>(ptrToPatchInCrossThreadData);
@@ -235,13 +235,14 @@ cl_int Kernel::initialize() {
}
// allocate our own SSH, if necessary
sshLocalSize = heapInfo.SurfaceStateHeapSize;
kernelDeviceInfos[rootDeviceIndex].sshLocalSize = heapInfo.SurfaceStateHeapSize;
if (sshLocalSize) {
pSshLocal = std::make_unique<char[]>(sshLocalSize);
if (kernelDeviceInfos[rootDeviceIndex].sshLocalSize) {
kernelDeviceInfos[rootDeviceIndex].pSshLocal = std::make_unique<char[]>(kernelDeviceInfos[rootDeviceIndex].sshLocalSize);
// copy the ssh into our local copy
memcpy_s(pSshLocal.get(), sshLocalSize, heapInfo.pSsh, sshLocalSize);
memcpy_s(kernelDeviceInfos[rootDeviceIndex].pSshLocal.get(), kernelDeviceInfos[rootDeviceIndex].sshLocalSize,
heapInfo.pSsh, kernelDeviceInfos[rootDeviceIndex].sshLocalSize);
}
numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;
@@ -287,7 +288,7 @@ cl_int Kernel::initialize() {
if (patchInfo.pAllocateStatelessEventPoolSurface) {
if (requiresSshForBuffers()) {
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset);
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
}
@@ -296,7 +297,7 @@ cl_int Kernel::initialize() {
if (patchInfo.pAllocateStatelessDefaultDeviceQueueSurface) {
if (requiresSshForBuffers()) {
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset);
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
}
@@ -783,8 +784,8 @@ void Kernel::setStartOffset(uint32_t offset) {
this->startOffset = offset;
}
void *Kernel::getSurfaceStateHeap() const {
return kernelInfo.usesSsh ? pSshLocal.get() : nullptr;
void *Kernel::getSurfaceStateHeap(uint32_t rootDeviceIndex) const {
return kernelInfo.usesSsh ? kernelDeviceInfos[rootDeviceIndex].pSshLocal.get() : nullptr;
}
size_t Kernel::getDynamicStateHeapSize() const {
@@ -795,9 +796,9 @@ const void *Kernel::getDynamicStateHeap() const {
return kernelInfo.heapInfo.pDsh;
}
size_t Kernel::getSurfaceStateHeapSize() const {
size_t Kernel::getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const {
return kernelInfo.usesSsh
? sshLocalSize
? kernelDeviceInfos[rootDeviceIndex].sshLocalSize
: 0;
}
@@ -805,9 +806,9 @@ size_t Kernel::getNumberOfBindingTableStates() const {
return numberOfBindingTableStates;
}
void Kernel::resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
pSshLocal.reset(static_cast<char *>(pNewSsh));
sshLocalSize = static_cast<uint32_t>(newSshSize);
void Kernel::resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(static_cast<char *>(pNewSsh));
kernelDeviceInfos[rootDeviceIndex].sshLocalSize = static_cast<uint32_t>(newSshSize);
numberOfBindingTableStates = newBindingTableCount;
localBindingTableOffset = newBindingTableOffset;
}
@@ -882,7 +883,7 @@ cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, G
if (requiresSshForBuffers()) {
const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0);
}
if (!kernelArguments[argIndex].isPatched) {
@@ -913,7 +914,7 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio
if (requiresSshForBuffers()) {
const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
size_t allocSize = 0;
size_t offset = 0;
if (svmAlloc != nullptr) {
@@ -1317,7 +1318,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
}
if (requiresSshForBuffers()) {
auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
buffer->setArgStateful(surfaceState, forceNonAuxMode, disableL3, isAuxTranslationKernel, kernelArgInfo.isReadOnly, getDevice().getDevice());
}
@@ -1342,7 +1343,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
storeKernelArg(argIndex, BUFFER_OBJ, nullptr, argVal, argSize);
if (requiresSshForBuffers()) {
auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
}
@@ -1391,7 +1392,7 @@ cl_int Kernel::setArgPipe(uint32_t argIndex,
auto graphicsAllocation = pipe->getGraphicsAllocation(getDevice().getRootDeviceIndex());
if (requiresSshForBuffers()) {
auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState,
pipe->getSize(), pipe->getCpuAddress(), 0,
graphicsAllocation, 0, 0);
@@ -1429,7 +1430,7 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,
storeKernelArg(argIndex, IMAGE_OBJ, clMemObj, argVal, argSize);
auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
DEBUG_BREAK_IF(!kernelArgInfo.isImage);
// Sets SS structure
@@ -2250,7 +2251,7 @@ void Kernel::patchDefaultDeviceQueue(DeviceQueue *devQueue) {
static_cast<uintptr_t>(devQueue->getQueueBuffer()->getGpuAddressToPatch()));
}
if (requiresSshForBuffers()) {
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset);
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, devQueue->getQueueBuffer()->getUnderlyingBufferSize(),
(void *)devQueue->getQueueBuffer()->getGpuAddress(), 0, devQueue->getQueueBuffer(), 0, 0);
@@ -2272,7 +2273,7 @@ void Kernel::patchEventPool(DeviceQueue *devQueue) {
}
if (requiresSshForBuffers()) {
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset);
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, devQueue->getEventPoolBuffer()->getUnderlyingBufferSize(),
(void *)devQueue->getEventPoolBuffer()->getGpuAddress(), 0, devQueue->getEventPoolBuffer(), 0, 0);
@@ -2298,13 +2299,14 @@ bool Kernel::usesSyncBuffer() {
}
void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
auto rootDeviceIndex = device.getRootDeviceIndex();
auto &patchInfo = kernelInfo.patchInfo;
auto bufferPatchAddress = ptrOffset(getCrossThreadData(device.getRootDeviceIndex()), patchInfo.pAllocateSyncBuffer->DataParamOffset);
auto bufferPatchAddress = ptrOffset(getCrossThreadData(rootDeviceIndex), patchInfo.pAllocateSyncBuffer->DataParamOffset);
patchWithRequiredSize(bufferPatchAddress, patchInfo.pAllocateSyncBuffer->DataParamSize,
ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset));
if (requiresSshForBuffers()) {
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
patchInfo.pAllocateSyncBuffer->SurfaceStateHeapOffset);
auto addressToPatch = gfxAllocation->getUnderlyingBuffer();
auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize();
@@ -2353,10 +2355,11 @@ void Kernel::resolveArgs() {
}
}
}
auto rootDeviceIndex = getDevice().getRootDeviceIndex();
if (canTransformImageTo2dArray) {
imageTransformer->transformImagesTo2dArray(kernelInfo, kernelArguments, getSurfaceStateHeap());
imageTransformer->transformImagesTo2dArray(kernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex));
} else if (imageTransformer->didTransform()) {
imageTransformer->transformImagesTo3d(kernelInfo, kernelArguments, getSurfaceStateHeap());
imageTransformer->transformImagesTo3d(kernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex));
}
}

View File

@@ -153,18 +153,18 @@ class Kernel : public BaseObject<_cl_kernel> {
size_t *paramValueSizeRet) const;
const void *getKernelHeap() const;
void *getSurfaceStateHeap() const;
void *getSurfaceStateHeap(uint32_t rootDeviceIndex) const;
const void *getDynamicStateHeap() const;
size_t getKernelHeapSize() const;
size_t getSurfaceStateHeapSize() const;
size_t getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const;
size_t getDynamicStateHeapSize() const;
size_t getNumberOfBindingTableStates() const;
size_t getBindingTableOffset() const {
return localBindingTableOffset;
}
void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
void resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize);
bool isKernelHeapSubstituted() const;
@@ -524,8 +524,6 @@ class Kernel : public BaseObject<_cl_kernel> {
size_t numberOfBindingTableStates = 0u;
size_t localBindingTableOffset = 0u;
std::unique_ptr<char[]> pSshLocal;
uint32_t sshLocalSize = 0u;
GraphicsAllocation *kernelReflectionSurface = nullptr;
@@ -550,13 +548,15 @@ class Kernel : public BaseObject<_cl_kernel> {
bool debugEnabled = false;
uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::NotSet;
struct KernelDeviceInfo {
struct KernelDeviceInfo : public NonCopyableClass {
std::unique_ptr<char[]> pSshLocal;
uint32_t sshLocalSize = 0u;
char *crossThreadData = nullptr;
uint32_t crossThreadDataSize = 0u;
GraphicsAllocation *privateSurface = nullptr;
uint64_t privateSurfaceSize = 0u;
};
StackVec<KernelDeviceInfo, 1> kernelDeviceInfos;
std::vector<KernelDeviceInfo> kernelDeviceInfos;
};
} // namespace NEO