mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-25 13:33:02 +08:00
Store SSH per root device in Kernel
Related-To: NEO-5001 Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
52d96af5f0
commit
7ec69c33f9
@@ -140,7 +140,7 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic
|
||||
uint32_t sshOffset = patch.SurfaceStateHeapOffset;
|
||||
auto rootDeviceIndex = allocation.getRootDeviceIndex();
|
||||
void *crossThreadData = getCrossThreadData(rootDeviceIndex);
|
||||
void *ssh = getSurfaceStateHeap();
|
||||
void *ssh = getSurfaceStateHeap(rootDeviceIndex);
|
||||
if (crossThreadData != nullptr) {
|
||||
auto pp = ptrOffset(crossThreadData, crossThreadDataOffset);
|
||||
uintptr_t addressToPatch = reinterpret_cast<uintptr_t>(ptrToPatchInCrossThreadData);
|
||||
@@ -235,13 +235,14 @@ cl_int Kernel::initialize() {
|
||||
}
|
||||
|
||||
// allocate our own SSH, if necessary
|
||||
sshLocalSize = heapInfo.SurfaceStateHeapSize;
|
||||
kernelDeviceInfos[rootDeviceIndex].sshLocalSize = heapInfo.SurfaceStateHeapSize;
|
||||
|
||||
if (sshLocalSize) {
|
||||
pSshLocal = std::make_unique<char[]>(sshLocalSize);
|
||||
if (kernelDeviceInfos[rootDeviceIndex].sshLocalSize) {
|
||||
kernelDeviceInfos[rootDeviceIndex].pSshLocal = std::make_unique<char[]>(kernelDeviceInfos[rootDeviceIndex].sshLocalSize);
|
||||
|
||||
// copy the ssh into our local copy
|
||||
memcpy_s(pSshLocal.get(), sshLocalSize, heapInfo.pSsh, sshLocalSize);
|
||||
memcpy_s(kernelDeviceInfos[rootDeviceIndex].pSshLocal.get(), kernelDeviceInfos[rootDeviceIndex].sshLocalSize,
|
||||
heapInfo.pSsh, kernelDeviceInfos[rootDeviceIndex].sshLocalSize);
|
||||
}
|
||||
numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
|
||||
localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;
|
||||
@@ -287,7 +288,7 @@ cl_int Kernel::initialize() {
|
||||
|
||||
if (patchInfo.pAllocateStatelessEventPoolSurface) {
|
||||
if (requiresSshForBuffers()) {
|
||||
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
|
||||
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
|
||||
patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset);
|
||||
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
|
||||
}
|
||||
@@ -296,7 +297,7 @@ cl_int Kernel::initialize() {
|
||||
if (patchInfo.pAllocateStatelessDefaultDeviceQueueSurface) {
|
||||
|
||||
if (requiresSshForBuffers()) {
|
||||
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
|
||||
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
|
||||
patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset);
|
||||
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
|
||||
}
|
||||
@@ -783,8 +784,8 @@ void Kernel::setStartOffset(uint32_t offset) {
|
||||
this->startOffset = offset;
|
||||
}
|
||||
|
||||
void *Kernel::getSurfaceStateHeap() const {
|
||||
return kernelInfo.usesSsh ? pSshLocal.get() : nullptr;
|
||||
void *Kernel::getSurfaceStateHeap(uint32_t rootDeviceIndex) const {
|
||||
return kernelInfo.usesSsh ? kernelDeviceInfos[rootDeviceIndex].pSshLocal.get() : nullptr;
|
||||
}
|
||||
|
||||
size_t Kernel::getDynamicStateHeapSize() const {
|
||||
@@ -795,9 +796,9 @@ const void *Kernel::getDynamicStateHeap() const {
|
||||
return kernelInfo.heapInfo.pDsh;
|
||||
}
|
||||
|
||||
size_t Kernel::getSurfaceStateHeapSize() const {
|
||||
size_t Kernel::getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const {
|
||||
return kernelInfo.usesSsh
|
||||
? sshLocalSize
|
||||
? kernelDeviceInfos[rootDeviceIndex].sshLocalSize
|
||||
: 0;
|
||||
}
|
||||
|
||||
@@ -805,9 +806,9 @@ size_t Kernel::getNumberOfBindingTableStates() const {
|
||||
return numberOfBindingTableStates;
|
||||
}
|
||||
|
||||
void Kernel::resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
|
||||
pSshLocal.reset(static_cast<char *>(pNewSsh));
|
||||
sshLocalSize = static_cast<uint32_t>(newSshSize);
|
||||
void Kernel::resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
|
||||
kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(static_cast<char *>(pNewSsh));
|
||||
kernelDeviceInfos[rootDeviceIndex].sshLocalSize = static_cast<uint32_t>(newSshSize);
|
||||
numberOfBindingTableStates = newBindingTableCount;
|
||||
localBindingTableOffset = newBindingTableOffset;
|
||||
}
|
||||
@@ -882,7 +883,7 @@ cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, G
|
||||
|
||||
if (requiresSshForBuffers()) {
|
||||
const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
|
||||
auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
|
||||
auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
|
||||
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0);
|
||||
}
|
||||
if (!kernelArguments[argIndex].isPatched) {
|
||||
@@ -913,7 +914,7 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio
|
||||
|
||||
if (requiresSshForBuffers()) {
|
||||
const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
|
||||
auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
|
||||
auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
|
||||
size_t allocSize = 0;
|
||||
size_t offset = 0;
|
||||
if (svmAlloc != nullptr) {
|
||||
@@ -1317,7 +1318,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
|
||||
}
|
||||
|
||||
if (requiresSshForBuffers()) {
|
||||
auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
|
||||
auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
|
||||
buffer->setArgStateful(surfaceState, forceNonAuxMode, disableL3, isAuxTranslationKernel, kernelArgInfo.isReadOnly, getDevice().getDevice());
|
||||
}
|
||||
|
||||
@@ -1342,7 +1343,7 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
|
||||
storeKernelArg(argIndex, BUFFER_OBJ, nullptr, argVal, argSize);
|
||||
|
||||
if (requiresSshForBuffers()) {
|
||||
auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
|
||||
auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
|
||||
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
|
||||
}
|
||||
|
||||
@@ -1391,7 +1392,7 @@ cl_int Kernel::setArgPipe(uint32_t argIndex,
|
||||
auto graphicsAllocation = pipe->getGraphicsAllocation(getDevice().getRootDeviceIndex());
|
||||
|
||||
if (requiresSshForBuffers()) {
|
||||
auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
|
||||
auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
|
||||
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState,
|
||||
pipe->getSize(), pipe->getCpuAddress(), 0,
|
||||
graphicsAllocation, 0, 0);
|
||||
@@ -1429,7 +1430,7 @@ cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,
|
||||
|
||||
storeKernelArg(argIndex, IMAGE_OBJ, clMemObj, argVal, argSize);
|
||||
|
||||
auto surfaceState = ptrOffset(getSurfaceStateHeap(), kernelArgInfo.offsetHeap);
|
||||
auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
|
||||
DEBUG_BREAK_IF(!kernelArgInfo.isImage);
|
||||
|
||||
// Sets SS structure
|
||||
@@ -2250,7 +2251,7 @@ void Kernel::patchDefaultDeviceQueue(DeviceQueue *devQueue) {
|
||||
static_cast<uintptr_t>(devQueue->getQueueBuffer()->getGpuAddressToPatch()));
|
||||
}
|
||||
if (requiresSshForBuffers()) {
|
||||
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
|
||||
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
|
||||
patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset);
|
||||
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, devQueue->getQueueBuffer()->getUnderlyingBufferSize(),
|
||||
(void *)devQueue->getQueueBuffer()->getGpuAddress(), 0, devQueue->getQueueBuffer(), 0, 0);
|
||||
@@ -2272,7 +2273,7 @@ void Kernel::patchEventPool(DeviceQueue *devQueue) {
|
||||
}
|
||||
|
||||
if (requiresSshForBuffers()) {
|
||||
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
|
||||
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
|
||||
patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset);
|
||||
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, devQueue->getEventPoolBuffer()->getUnderlyingBufferSize(),
|
||||
(void *)devQueue->getEventPoolBuffer()->getGpuAddress(), 0, devQueue->getEventPoolBuffer(), 0, 0);
|
||||
@@ -2298,13 +2299,14 @@ bool Kernel::usesSyncBuffer() {
|
||||
}
|
||||
|
||||
void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
|
||||
auto rootDeviceIndex = device.getRootDeviceIndex();
|
||||
auto &patchInfo = kernelInfo.patchInfo;
|
||||
auto bufferPatchAddress = ptrOffset(getCrossThreadData(device.getRootDeviceIndex()), patchInfo.pAllocateSyncBuffer->DataParamOffset);
|
||||
auto bufferPatchAddress = ptrOffset(getCrossThreadData(rootDeviceIndex), patchInfo.pAllocateSyncBuffer->DataParamOffset);
|
||||
patchWithRequiredSize(bufferPatchAddress, patchInfo.pAllocateSyncBuffer->DataParamSize,
|
||||
ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset));
|
||||
|
||||
if (requiresSshForBuffers()) {
|
||||
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
|
||||
auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
|
||||
patchInfo.pAllocateSyncBuffer->SurfaceStateHeapOffset);
|
||||
auto addressToPatch = gfxAllocation->getUnderlyingBuffer();
|
||||
auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize();
|
||||
@@ -2353,10 +2355,11 @@ void Kernel::resolveArgs() {
|
||||
}
|
||||
}
|
||||
}
|
||||
auto rootDeviceIndex = getDevice().getRootDeviceIndex();
|
||||
if (canTransformImageTo2dArray) {
|
||||
imageTransformer->transformImagesTo2dArray(kernelInfo, kernelArguments, getSurfaceStateHeap());
|
||||
imageTransformer->transformImagesTo2dArray(kernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex));
|
||||
} else if (imageTransformer->didTransform()) {
|
||||
imageTransformer->transformImagesTo3d(kernelInfo, kernelArguments, getSurfaceStateHeap());
|
||||
imageTransformer->transformImagesTo3d(kernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -153,18 +153,18 @@ class Kernel : public BaseObject<_cl_kernel> {
|
||||
size_t *paramValueSizeRet) const;
|
||||
|
||||
const void *getKernelHeap() const;
|
||||
void *getSurfaceStateHeap() const;
|
||||
void *getSurfaceStateHeap(uint32_t rootDeviceIndex) const;
|
||||
const void *getDynamicStateHeap() const;
|
||||
|
||||
size_t getKernelHeapSize() const;
|
||||
size_t getSurfaceStateHeapSize() const;
|
||||
size_t getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const;
|
||||
size_t getDynamicStateHeapSize() const;
|
||||
size_t getNumberOfBindingTableStates() const;
|
||||
size_t getBindingTableOffset() const {
|
||||
return localBindingTableOffset;
|
||||
}
|
||||
|
||||
void resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
|
||||
void resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset);
|
||||
|
||||
void substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize);
|
||||
bool isKernelHeapSubstituted() const;
|
||||
@@ -524,8 +524,6 @@ class Kernel : public BaseObject<_cl_kernel> {
|
||||
|
||||
size_t numberOfBindingTableStates = 0u;
|
||||
size_t localBindingTableOffset = 0u;
|
||||
std::unique_ptr<char[]> pSshLocal;
|
||||
uint32_t sshLocalSize = 0u;
|
||||
|
||||
GraphicsAllocation *kernelReflectionSurface = nullptr;
|
||||
|
||||
@@ -550,13 +548,15 @@ class Kernel : public BaseObject<_cl_kernel> {
|
||||
bool debugEnabled = false;
|
||||
uint32_t additionalKernelExecInfo = AdditionalKernelExecInfo::NotSet;
|
||||
|
||||
struct KernelDeviceInfo {
|
||||
struct KernelDeviceInfo : public NonCopyableClass {
|
||||
std::unique_ptr<char[]> pSshLocal;
|
||||
uint32_t sshLocalSize = 0u;
|
||||
char *crossThreadData = nullptr;
|
||||
uint32_t crossThreadDataSize = 0u;
|
||||
|
||||
GraphicsAllocation *privateSurface = nullptr;
|
||||
uint64_t privateSurfaceSize = 0u;
|
||||
};
|
||||
StackVec<KernelDeviceInfo, 1> kernelDeviceInfos;
|
||||
std::vector<KernelDeviceInfo> kernelDeviceInfos;
|
||||
};
|
||||
} // namespace NEO
|
||||
|
||||
Reference in New Issue
Block a user