Correct patching private surface in cloned kernel
Related-To: NEO-5081
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
parent
caddc63eec
commit
7f920139b4
|
@ -202,25 +202,9 @@ cl_int Kernel::initialize() {
|
|||
localBindingTableOffset = kernelDescriptor.payloadMappings.bindingTable.tableOffset;
|
||||
|
||||
// patch crossthread data and ssh with inline surfaces, if necessary
|
||||
auto perHwThreadPrivateMemorySize = kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
|
||||
if (perHwThreadPrivateMemorySize) {
|
||||
privateSurfaceSize = KernelHelper::getPrivateSurfaceSize(perHwThreadPrivateMemorySize, pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch);
|
||||
DEBUG_BREAK_IF(privateSurfaceSize == 0);
|
||||
|
||||
if (privateSurfaceSize > std::numeric_limits<uint32_t>::max()) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
privateSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
|
||||
{rootDeviceIndex,
|
||||
static_cast<size_t>(privateSurfaceSize),
|
||||
GraphicsAllocation::AllocationType::PRIVATE_SURFACE,
|
||||
pClDevice->getDeviceBitfield()});
|
||||
if (privateSurface == nullptr) {
|
||||
return CL_OUT_OF_RESOURCES;
|
||||
}
|
||||
|
||||
const auto &privateMemoryAddress = kernelDescriptor.payloadMappings.implicitArgs.privateMemoryAddress;
|
||||
patchWithImplicitSurface(reinterpret_cast<void *>(privateSurface->getGpuAddressToPatch()), *privateSurface, privateMemoryAddress);
|
||||
auto status = patchPrivateSurface();
|
||||
if (CL_SUCCESS != status) {
|
||||
return status;
|
||||
}
|
||||
|
||||
if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
|
||||
|
@ -334,12 +318,44 @@ cl_int Kernel::initialize() {
|
|||
return CL_SUCCESS;
|
||||
}
|
||||
|
||||
// Makes sure the kernel has a private surface whenever the kernel descriptor
// requests per-HW-thread private memory, then patches that surface into the
// `privateMemoryAddress` implicit argument.
//
// A private surface that already exists is reused and only patched again; a
// new one is allocated only when `privateSurface` is still null.
//
// Returns CL_OUT_OF_RESOURCES when the computed surface size exceeds the
// uint32_t range or when the allocation fails, and CL_SUCCESS otherwise
// (including the case where no private memory is requested at all).
cl_int Kernel::patchPrivateSurface() {
    auto &descriptor = kernelInfo.kernelDescriptor;
    auto privateMemoryPerHwThread = descriptor.kernelAttributes.perHwThreadPrivateMemorySize;

    // Nothing to allocate or patch for kernels without private memory.
    if (!privateMemoryPerHwThread) {
        return CL_SUCCESS;
    }

    if (!privateSurface) {
        auto device = &getDevice();
        auto deviceRootIndex = device->getRootDeviceIndex();

        privateSurfaceSize = KernelHelper::getPrivateSurfaceSize(privateMemoryPerHwThread,
                                                                 device->getSharedDeviceInfo().computeUnitsUsedForScratch);
        DEBUG_BREAK_IF(privateSurfaceSize == 0);

        // Sizes that do not fit in 32 bits are rejected before any allocation is attempted.
        if (privateSurfaceSize > std::numeric_limits<uint32_t>::max()) {
            return CL_OUT_OF_RESOURCES;
        }

        privateSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
            {deviceRootIndex,
             static_cast<size_t>(privateSurfaceSize),
             GraphicsAllocation::AllocationType::PRIVATE_SURFACE,
             device->getDeviceBitfield()});

        if (privateSurface == nullptr) {
            return CL_OUT_OF_RESOURCES;
        }
    }

    // Patch this kernel's own surface address, whether it was just allocated or already present.
    const auto &privateMemoryArg = descriptor.payloadMappings.implicitArgs.privateMemoryAddress;
    patchWithImplicitSurface(reinterpret_cast<void *>(privateSurface->getGpuAddressToPatch()), *privateSurface, privateMemoryArg);

    return CL_SUCCESS;
}
|
||||
|
||||
cl_int Kernel::cloneKernel(Kernel *pSourceKernel) {
|
||||
// copy cross thread data to store arguments set to source kernel with clSetKernelArg on immediate data (non-pointer types)
|
||||
memcpy_s(crossThreadData, crossThreadDataSize,
|
||||
pSourceKernel->crossThreadData, pSourceKernel->crossThreadDataSize);
|
||||
DEBUG_BREAK_IF(pSourceKernel->crossThreadDataSize != crossThreadDataSize);
|
||||
|
||||
[[maybe_unused]] auto status = patchPrivateSurface();
|
||||
DEBUG_BREAK_IF(status != CL_SUCCESS);
|
||||
|
||||
// copy arguments set to source kernel with clSetKernelArg or clSetKernelArgSVMPointer
|
||||
for (uint32_t i = 0; i < pSourceKernel->kernelArguments.size(); i++) {
|
||||
if (0 == pSourceKernel->getKernelArgInfo(i).size) {
|
||||
|
|
|
@ -523,6 +523,7 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
|
|||
// Read-only accessor for the `clDevice` this kernel refers to.
const ClDevice &getDevice() const {
    return this->clDevice;
}
|
||||
cl_int patchPrivateSurface();
|
||||
|
||||
const ExecutionEnvironment &executionEnvironment;
|
||||
Program *program;
|
||||
|
|
|
@ -49,6 +49,8 @@ class CloneKernelTest : public MultiRootDeviceWithSubDevicesFixture {
|
|||
pKernelInfo->kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors.resize(1);
|
||||
|
||||
pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 1;
|
||||
pKernelInfo->kernelDescriptor.kernelAttributes.crossThreadDataSize = 72;
|
||||
pKernelInfo->setPrivateMemory(0x10, false, 8, 64, 64);
|
||||
pKernelInfo->heapInfo.SurfaceStateHeapSize = sizeof(surfaceStateHeap);
|
||||
pKernelInfo->heapInfo.pSsh = surfaceStateHeap;
|
||||
|
||||
|
@ -66,16 +68,11 @@ class CloneKernelTest : public MultiRootDeviceWithSubDevicesFixture {
|
|||
|
||||
pSourceKernel[rootDeviceIndex] = new MockKernel(pProgram.get(), *pKernelInfo, *deviceFactory->rootDevices[rootDeviceIndex]);
|
||||
ASSERT_EQ(CL_SUCCESS, pSourceKernel[rootDeviceIndex]->initialize());
|
||||
char pSourceCrossThreadData[64] = {};
|
||||
sourceKernels[rootDeviceIndex] = pSourceKernel[rootDeviceIndex];
|
||||
|
||||
pClonedKernel[rootDeviceIndex] = new MockKernel(pProgram.get(), *pKernelInfo, *deviceFactory->rootDevices[rootDeviceIndex]);
|
||||
ASSERT_EQ(CL_SUCCESS, pClonedKernel[rootDeviceIndex]->initialize());
|
||||
char pClonedCrossThreadData[64] = {};
|
||||
clonedKernels[rootDeviceIndex] = pClonedKernel[rootDeviceIndex];
|
||||
|
||||
pSourceKernel[rootDeviceIndex]->setCrossThreadData(pSourceCrossThreadData, sizeof(pSourceCrossThreadData));
|
||||
pClonedKernel[rootDeviceIndex]->setCrossThreadData(pClonedCrossThreadData, sizeof(pClonedCrossThreadData));
|
||||
}
|
||||
|
||||
pSourceMultiDeviceKernel = std::make_unique<MultiDeviceKernel>(sourceKernels, kernelInfos);
|
||||
|
@ -96,6 +93,33 @@ class CloneKernelTest : public MultiRootDeviceWithSubDevicesFixture {
|
|||
char surfaceStateHeap[128];
|
||||
};
|
||||
|
||||
// Checks that the source and cloned kernels each hold a distinct private surface,
// that each kernel's cross-thread data carries the GPU address of its own surface,
// and that cloneKernel() keeps the cloned kernel's surface and leaves its
// cross-thread data pointing at that surface rather than at the source kernel's.
TEST_F(CloneKernelTest, GivenKernelWithPrivateSurfaceWhenCloningKernelThenClonedKernelProgramItsOwnPrivateSurfaceAddress) {
    // Byte offset in cross-thread data where the patched private surface address is read back.
    // NOTE(review): presumably the offset passed to setPrivateMemory(...) in the fixture - confirm.
    constexpr auto privateSurfaceAddressOffset = 64;

    for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
        auto pSourcePrivateSurface = pSourceKernel[rootDeviceIndex]->privateSurface;
        auto pClonedPrivateSurface = pClonedKernel[rootDeviceIndex]->privateSurface;

        // Fatal assertions: both surfaces are dereferenced below, so a null pointer
        // must stop the test here instead of crashing the test binary.
        ASSERT_NE(nullptr, pSourcePrivateSurface);
        ASSERT_NE(nullptr, pClonedPrivateSurface);
        EXPECT_NE(pClonedPrivateSurface, pSourcePrivateSurface);

        {
            // Before cloning: each kernel is patched with its own surface address.
            auto pSourcePrivateSurfPatchedAddress = reinterpret_cast<uint64_t *>(ptrOffset(pSourceKernel[rootDeviceIndex]->getCrossThreadData(), privateSurfaceAddressOffset));
            auto pClonedPrivateSurfPatchedAddress = reinterpret_cast<uint64_t *>(ptrOffset(pClonedKernel[rootDeviceIndex]->getCrossThreadData(), privateSurfaceAddressOffset));

            EXPECT_EQ(pSourcePrivateSurface->getGpuAddressToPatch(), *pSourcePrivateSurfPatchedAddress);
            EXPECT_EQ(pClonedPrivateSurface->getGpuAddressToPatch(), *pClonedPrivateSurfPatchedAddress);
        }

        retVal = pClonedKernel[rootDeviceIndex]->cloneKernel(pSourceKernel[rootDeviceIndex]);
        EXPECT_EQ(CL_SUCCESS, retVal);

        // Cloning must not replace the cloned kernel's private surface.
        auto pClonedPrivateSurface2 = pClonedKernel[rootDeviceIndex]->privateSurface;
        EXPECT_EQ(pClonedPrivateSurface, pClonedPrivateSurface2);

        {
            // After cloning: the cloned kernel's cross-thread data still holds the
            // cloned kernel's own surface address.
            auto pClonedPrivateSurfPatchedAddress = reinterpret_cast<uint64_t *>(ptrOffset(pClonedKernel[rootDeviceIndex]->getCrossThreadData(), privateSurfaceAddressOffset));
            EXPECT_EQ(pClonedPrivateSurface->getGpuAddressToPatch(), *pClonedPrivateSurfPatchedAddress);
        }
    }
}
|
||||
|
||||
TEST_F(CloneKernelTest, GivenUnsetArgWhenCloningKernelThenKernelInfoIsCorrect) {
|
||||
pKernelInfo->addArgBuffer(0);
|
||||
for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
|
||||
|
|
Loading…
Reference in New Issue