Correct patching private surface in cloned kernel

Related-To: NEO-5081
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
2021-09-07 11:10:18 +00:00 · 2021-09-07 11:10:18 +00:00 · 7f920139b4
parent caddc63eec
commit 7f920139b4
3 changed files with 65 additions and 24 deletions
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -202,25 +202,9 @@ cl_int Kernel::initialize() {
    localBindingTableOffset = kernelDescriptor.payloadMappings.bindingTable.tableOffset;

    // patch crossthread data and ssh with inline surfaces, if necessary
-    auto perHwThreadPrivateMemorySize = kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
-    if (perHwThreadPrivateMemorySize) {
-        privateSurfaceSize = KernelHelper::getPrivateSurfaceSize(perHwThreadPrivateMemorySize, pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch);
-        DEBUG_BREAK_IF(privateSurfaceSize == 0);
-
-        if (privateSurfaceSize > std::numeric_limits<uint32_t>::max()) {
-            return CL_OUT_OF_RESOURCES;
-        }
-        privateSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
-            {rootDeviceIndex,
-             static_cast<size_t>(privateSurfaceSize),
-             GraphicsAllocation::AllocationType::PRIVATE_SURFACE,
-             pClDevice->getDeviceBitfield()});
-        if (privateSurface == nullptr) {
-            return CL_OUT_OF_RESOURCES;
-        }
-
-        const auto &privateMemoryAddress = kernelDescriptor.payloadMappings.implicitArgs.privateMemoryAddress;
-        patchWithImplicitSurface(reinterpret_cast<void *>(privateSurface->getGpuAddressToPatch()), *privateSurface, privateMemoryAddress);
+    auto status = patchPrivateSurface();
+    if (CL_SUCCESS != status) {
+        return status;
    }

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
@@ -334,12 +318,44 @@ cl_int Kernel::initialize() {
    return CL_SUCCESS;
 }

+cl_int Kernel::patchPrivateSurface() { // allocates the private surface on first use and patches its GPU address; returns CL_SUCCESS or CL_OUT_OF_RESOURCES
+    auto pClDevice = &getDevice();
+    auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
+    auto &kernelDescriptor = kernelInfo.kernelDescriptor;
+    auto perHwThreadPrivateMemorySize = kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
+    if (perHwThreadPrivateMemorySize) { // zero size: nothing is allocated or patched
+        if (!privateSurface) { // allocate only once; later calls (e.g. from cloneKernel) reuse the existing surface
+            privateSurfaceSize = KernelHelper::getPrivateSurfaceSize(perHwThreadPrivateMemorySize, pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch);
+            DEBUG_BREAK_IF(privateSurfaceSize == 0);
+
+            if (privateSurfaceSize > std::numeric_limits<uint32_t>::max()) { // sizes above the uint32_t maximum are rejected
+                return CL_OUT_OF_RESOURCES;
+            }
+            privateSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
+                {rootDeviceIndex,
+                 static_cast<size_t>(privateSurfaceSize),
+                 GraphicsAllocation::AllocationType::PRIVATE_SURFACE,
+                 pClDevice->getDeviceBitfield()});
+            if (privateSurface == nullptr) { // allocation failure is reported to the caller
+                return CL_OUT_OF_RESOURCES;
+            }
+        }
+
+        const auto &privateMemoryAddress = kernelDescriptor.payloadMappings.implicitArgs.privateMemoryAddress;
+        patchWithImplicitSurface(reinterpret_cast<void *>(privateSurface->getGpuAddressToPatch()), *privateSurface, privateMemoryAddress); // runs on every call, so this kernel's own surface address is (re)written
+    }
+    return CL_SUCCESS;
+}
+
 cl_int Kernel::cloneKernel(Kernel *pSourceKernel) {
    // copy cross thread data to store arguments set to source kernel with clSetKernelArg on immediate data (non-pointer types)
    memcpy_s(crossThreadData, crossThreadDataSize,
             pSourceKernel->crossThreadData, pSourceKernel->crossThreadDataSize);
    DEBUG_BREAK_IF(pSourceKernel->crossThreadDataSize != crossThreadDataSize);

+    [[maybe_unused]] auto status = patchPrivateSurface();
+    DEBUG_BREAK_IF(status != CL_SUCCESS);
+
    // copy arguments set to source kernel with clSetKernelArg or clSetKernelArgSVMPointer
    for (uint32_t i = 0; i < pSourceKernel->kernelArguments.size(); i++) {
        if (0 == pSourceKernel->getKernelArgInfo(i).size) {
--- a/opencl/source/kernel/kernel.h
+++ b/opencl/source/kernel/kernel.h
@@ -523,6 +523,7 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
    const ClDevice &getDevice() const {
        return clDevice;
    }
+    cl_int patchPrivateSurface();

    const ExecutionEnvironment &executionEnvironment;
    Program *program;
--- a/opencl/test/unit_test/kernel/clone_kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/clone_kernel_tests.cpp
@@ -49,6 +49,8 @@ class CloneKernelTest : public MultiRootDeviceWithSubDevicesFixture {
        pKernelInfo->kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors.resize(1);

        pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 1;
+        pKernelInfo->kernelDescriptor.kernelAttributes.crossThreadDataSize = 72;
+        pKernelInfo->setPrivateMemory(0x10, false, 8, 64, 64);
        pKernelInfo->heapInfo.SurfaceStateHeapSize = sizeof(surfaceStateHeap);
        pKernelInfo->heapInfo.pSsh = surfaceStateHeap;

@@ -66,16 +68,11 @@ class CloneKernelTest : public MultiRootDeviceWithSubDevicesFixture {

            pSourceKernel[rootDeviceIndex] = new MockKernel(pProgram.get(), *pKernelInfo, *deviceFactory->rootDevices[rootDeviceIndex]);
            ASSERT_EQ(CL_SUCCESS, pSourceKernel[rootDeviceIndex]->initialize());
-            char pSourceCrossThreadData[64] = {};
            sourceKernels[rootDeviceIndex] = pSourceKernel[rootDeviceIndex];

            pClonedKernel[rootDeviceIndex] = new MockKernel(pProgram.get(), *pKernelInfo, *deviceFactory->rootDevices[rootDeviceIndex]);
            ASSERT_EQ(CL_SUCCESS, pClonedKernel[rootDeviceIndex]->initialize());
-            char pClonedCrossThreadData[64] = {};
            clonedKernels[rootDeviceIndex] = pClonedKernel[rootDeviceIndex];
-
-            pSourceKernel[rootDeviceIndex]->setCrossThreadData(pSourceCrossThreadData, sizeof(pSourceCrossThreadData));
-            pClonedKernel[rootDeviceIndex]->setCrossThreadData(pClonedCrossThreadData, sizeof(pClonedCrossThreadData));
        }

        pSourceMultiDeviceKernel = std::make_unique<MultiDeviceKernel>(sourceKernels, kernelInfos);
@@ -96,6 +93,33 @@ class CloneKernelTest : public MultiRootDeviceWithSubDevicesFixture {
    char surfaceStateHeap[128];
 };

+TEST_F(CloneKernelTest, GivenKernelWithPrivateSurfaceWhenCloningKernelThenClonedKernelProgramItsOwnPrivateSurfaceAddress) {
+    for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {
+        auto pSourcePrivateSurface = pSourceKernel[rootDeviceIndex]->privateSurface;
+        auto pClonedPrivateSurface = pClonedKernel[rootDeviceIndex]->privateSurface;
+        EXPECT_NE(nullptr, pSourcePrivateSurface);
+        EXPECT_NE(nullptr, pClonedPrivateSurface);
+        EXPECT_NE(pClonedPrivateSurface, pSourcePrivateSurface); // source and clone each hold a distinct allocation
+        { // before cloning: each kernel's cross-thread data holds its own surface address
+            auto pSourcePrivateSurfPatchedAddress = reinterpret_cast<uint64_t *>(ptrOffset(pSourceKernel[rootDeviceIndex]->getCrossThreadData(), 64)); // NOTE(review): 64 presumably matches the offset passed to setPrivateMemory in the fixture - confirm
+            auto pClonedPrivateSurfPatchedAddress = reinterpret_cast<uint64_t *>(ptrOffset(pClonedKernel[rootDeviceIndex]->getCrossThreadData(), 64));
+
+            EXPECT_EQ(pSourcePrivateSurface->getGpuAddressToPatch(), *pSourcePrivateSurfPatchedAddress);
+            EXPECT_EQ(pClonedPrivateSurface->getGpuAddressToPatch(), *pClonedPrivateSurfPatchedAddress);
+        }
+
+        retVal = pClonedKernel[rootDeviceIndex]->cloneKernel(pSourceKernel[rootDeviceIndex]); // cloneKernel copies the source's cross-thread data into the clone, then re-patches the private surface
+        EXPECT_EQ(CL_SUCCESS, retVal);
+
+        auto pClonedPrivateSurface2 = pClonedKernel[rootDeviceIndex]->privateSurface;
+        EXPECT_EQ(pClonedPrivateSurface, pClonedPrivateSurface2); // cloning must not replace the clone's existing surface
+        {
+            auto pClonedPrivateSurfPatchedAddress = reinterpret_cast<uint64_t *>(ptrOffset(pClonedKernel[rootDeviceIndex]->getCrossThreadData(), 64));
+            EXPECT_EQ(pClonedPrivateSurface->getGpuAddressToPatch(), *pClonedPrivateSurfPatchedAddress); // patched address is still the clone's own, not the value copied from the source
+        }
+    }
+}
+
 TEST_F(CloneKernelTest, GivenUnsetArgWhenCloningKernelThenKernelInfoIsCorrect) {
    pKernelInfo->addArgBuffer(0);
    for (auto &rootDeviceIndex : this->context->getRootDeviceIndices()) {