Change interface to program cross-thread data

Change-Id: I96bf4bddf1557f588fd745efca7b19ec2f38a78e
2025-12-24 12:23:05 +08:00 · 2018-10-17 21:38:18 -07:00
parent 31bf5b9b43
commit f3a732081e
10 changed files with 115 additions and 114 deletions
--- a/runtime/command_queue/gpgpu_walker.h
+++ b/runtime/command_queue/gpgpu_walker.h
@@ -135,8 +135,8 @@ class GpgpuWalkerHelper {
        uint32_t simd,
        uint32_t workDim,
        bool localIdsGenerationByRuntime,
-        bool kernelUsesLocalIds,
-        bool inlineDataProgrammingRequired);
+        bool inlineDataProgrammingRequired,
+        const iOpenCL::SPatchThreadPayload &threadPayload);

    static void dispatchProfilingCommandsStart(
        HwTimeStamps &hwTimeStamps,
--- a/runtime/command_queue/gpgpu_walker_base.inl
+++ b/runtime/command_queue/gpgpu_walker_base.inl
@@ -20,8 +20,8 @@ inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
    uint32_t simd,
    uint32_t workDim,
    bool localIdsGenerationByRuntime,
-    bool kernelUsesLocalIds,
-    bool inlineDataProgrammingRequired) {
+    bool inlineDataProgrammingRequired,
+    const iOpenCL::SPatchThreadPayload &threadPayload) {
    auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];

    auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
@@ -155,7 +155,8 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
    size_t globalOffsets[3] = {0, 0, 0};
    size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes,
-                                                           simd, 1, localIdsGenerationByRuntime, kernelUsesLocalIds, inlineDataProgrammingRequired);
+                                                           simd, 1, localIdsGenerationByRuntime, inlineDataProgrammingRequired,
+                                                           *scheduler.getKernelInfo().patchInfo.threadPayload);

    // Implement disabling special WA DisableLSQCROPERFforOCL if needed
    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false);
--- a/runtime/command_queue/hardware_interface.inl
+++ b/runtime/command_queue/hardware_interface.inl
@@ -223,7 +223,8 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
        size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
        GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups,
                                                               numWorkGroups, localWorkSizes, simd, dim,
-                                                               localIdsGenerationByRuntime, kernelUsesLocalIds, inlineDataProgrammingRequired);
+                                                               localIdsGenerationByRuntime, inlineDataProgrammingRequired,
+                                                               *kernel.getKernelInfo().patchInfo.threadPayload);

        dispatchWorkarounds(commandStream, commandQueue, kernel, false);
        currentDispatchIndex++;
--- a/runtime/helpers/kernel_commands.h
+++ b/runtime/helpers/kernel_commands.h
@@ -75,7 +75,10 @@ struct KernelCommandsHelper : public PerThreadDataHelper {

    static size_t sendCrossThreadData(
        IndirectHeap &indirectHeap,
-        Kernel &kernel);
+        Kernel &kernel,
+        bool inlineDataProgrammingRequired,
+        WALKER_TYPE<GfxFamily> *walkerCmd,
+        uint32_t &sizeCrossThreadData);

    static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo,
                                                   const void *srcKernelSsh, size_t srcKernelSshSize,
@@ -197,18 +200,6 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
        WALKER_TYPE<GfxFamily> *walkerCmd,
        uint32_t &interfaceDescriptorIndex);

-    static void getCrossThreadData(
-        uint32_t &sizeCrossThreadData,
-        size_t &offsetCrossThreadData,
-        Kernel &kernel,
-        const bool &inlineDataProgrammingRequired,
-        IndirectHeap &ioh,
-        WALKER_TYPE<GfxFamily> *walkerCmd);
-
-    inline static size_t getCrossThreadDataSize(
-        uint32_t &sizeCrossThreadData,
-        Kernel &kernel);
-
    static void programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData);
    static MI_ATOMIC *programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize);
    static void programPipeControlDataWriteWithCsStall(LinearStream &commandStream, uint64_t writeAddress, uint64_t data);
--- a/runtime/helpers/kernel_commands.inl
+++ b/runtime/helpers/kernel_commands.inl
@@ -163,24 +163,6 @@ size_t KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
    return (size_t)offsetInterfaceDescriptor;
 }

-template <typename GfxFamily>
-size_t KernelCommandsHelper<GfxFamily>::sendCrossThreadData(
-    IndirectHeap &indirectHeap,
-    Kernel &kernel) {
-    indirectHeap.align(GfxFamily::WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
-
-    auto offsetCrossThreadData = indirectHeap.getUsed();
-    auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
-    char *pDest = static_cast<char *>(indirectHeap.getSpace(sizeCrossThreadData));
-    memcpy_s(pDest, sizeCrossThreadData, kernel.getCrossThreadData(), sizeCrossThreadData);
-
-    if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
-        FlatBatchBufferHelper::fixCrossThreadDataInfo(kernel.getPatchInfoDataList(), offsetCrossThreadData, indirectHeap.getGraphicsAllocation()->getGpuAddress());
-    }
-
-    return offsetCrossThreadData + static_cast<size_t>(indirectHeap.getHeapGpuStartOffset());
-}
-
 // Returned binding table pointer is relative to given heap (which is assumed to be the Surface state base addess)
 // as required by the INTERFACE_DESCRIPTOR_DATA.
 template <typename GfxFamily>
@@ -308,21 +290,16 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
    auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
    auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);

-    uint32_t sizeCrossThreadData = 0;
-    size_t offsetCrossThreadData = 0;
+    uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();

-    getCrossThreadData(
-        sizeCrossThreadData,
-        offsetCrossThreadData,
-        kernel,
-        inlineDataProgrammingRequired,
-        ioh,
-        walkerCmd);
+    size_t offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendCrossThreadData(
+        ioh, kernel, inlineDataProgrammingRequired,
+        walkerCmd, sizeCrossThreadData);

    size_t sizePerThreadDataTotal = 0;
    size_t sizePerThreadData = 0;

-    programPerThreadData(
+    KernelCommandsHelper<GfxFamily>::programPerThreadData(
        sizePerThreadData,
        localIdsGenerationByRuntime,
        ioh,
@@ -345,7 +322,7 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
        dsh,
        offsetInterfaceDescriptor,
        kernelStartOffset,
-        getCrossThreadDataSize(sizeCrossThreadData, kernel),
+        sizeCrossThreadData,
        sizePerThreadData,
        dstBindingTablePointer,
        samplerStateOffset,
@@ -441,8 +418,7 @@ bool KernelCommandsHelper<GfxFamily>::doBindingTablePrefetch() {
 template <typename GfxFamily>
 bool KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(const Kernel &kernel) {
    if (DebugManager.flags.EnablePassInlineData.get()) {
-        return kernel.getKernelInfo().patchInfo.threadPayload->PassInlineData &&
-               kernel.getCrossThreadDataSize() <= sizeof(GRF);
+        return kernel.getKernelInfo().patchInfo.threadPayload->PassInlineData;
    }
    return false;
 }
--- a/runtime/helpers/kernel_commands_base.inl
+++ b/runtime/helpers/kernel_commands_base.inl
@@ -117,6 +117,26 @@ void KernelCommandsHelper<GfxFamily>::programPerThreadData(
    updatePerThreadDataTotal(sizePerThreadData, simd, numChannels, sizePerThreadDataTotal, localWorkItems);
 }

+template <typename GfxFamily>
+size_t KernelCommandsHelper<GfxFamily>::sendCrossThreadData(
+    IndirectHeap &indirectHeap,
+    Kernel &kernel,
+    bool inlineDataProgrammingRequired,
+    WALKER_TYPE<GfxFamily> *walkerCmd,
+    uint32_t &sizeCrossThreadData) {
+    indirectHeap.align(WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
+
+    auto offsetCrossThreadData = indirectHeap.getUsed();
+    char *pDest = static_cast<char *>(indirectHeap.getSpace(sizeCrossThreadData));
+    memcpy_s(pDest, sizeCrossThreadData, kernel.getCrossThreadData(), sizeCrossThreadData);
+
+    if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
+        FlatBatchBufferHelper::fixCrossThreadDataInfo(kernel.getPatchInfoDataList(), offsetCrossThreadData, indirectHeap.getGraphicsAllocation()->getGpuAddress());
+    }
+
+    return offsetCrossThreadData + static_cast<size_t>(indirectHeap.getHeapGpuStartOffset());
+}
+
 template <typename GfxFamily>
 bool KernelCommandsHelper<GfxFamily>::resetBindingTablePrefetch(Kernel &kernel) {
    return kernel.isSchedulerKernel || !doBindingTablePrefetch();
@@ -130,29 +150,6 @@ void KernelCommandsHelper<GfxFamily>::setInterfaceDescriptorOffset(
    walkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++);
 }

-template <typename GfxFamily>
-void KernelCommandsHelper<GfxFamily>::getCrossThreadData(
-    uint32_t &sizeCrossThreadData,
-    size_t &offsetCrossThreadData,
-    Kernel &kernel,
-    const bool &inlineDataProgrammingRequired,
-    IndirectHeap &ioh,
-    WALKER_TYPE<GfxFamily> *walkerCmd) {
-
-    sizeCrossThreadData = kernel.getCrossThreadDataSize();
-    offsetCrossThreadData = sendCrossThreadData(
-        ioh,
-        kernel);
-}
-
-template <typename GfxFamily>
-size_t KernelCommandsHelper<GfxFamily>::getCrossThreadDataSize(
-    uint32_t &sizeCrossThreadData,
-    Kernel &kernel) {
-
-    return sizeCrossThreadData;
-}
-
 template <typename GfxFamily>
 bool KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(uint32_t workDim, size_t *gws, size_t *lws) {
    return true;
--- a/unit_tests/command_queue/work_group_size_tests.cpp
+++ b/unit_tests/command_queue/work_group_size_tests.cpp
@@ -6,6 +6,7 @@
 */

 #include "hw_cmds.h"
+#include "patch_shared.h"
 #include "runtime/command_queue/gpgpu_walker.h"
 #include "unit_tests/fixtures/device_fixture.h"
 #include "unit_tests/helpers/debug_manager_state_restore.h"
@@ -84,8 +85,9 @@ struct WorkGroupSizeBase {
            (workItems[0] + workGroupSize[0] - 1) / workGroupSize[0],
            (workItems[1] + workGroupSize[1] - 1) / workGroupSize[1],
            (workItems[2] + workGroupSize[2] - 1) / workGroupSize[2]};
+        const iOpenCL::SPatchThreadPayload threadPayload = {};
        GpgpuWalkerHelper<FamilyType>::setGpgpuWalkerThreadData(&pCmd, globalOffsets, workGroupsStart, workGroupsNum,
-                                                                workGroupSize, simdSize, dims, true, false, false);
+                                                                workGroupSize, simdSize, dims, true, false, threadPayload);

        //And check if it is programmed correctly
        auto numWorkItems = computeWalkerWorkItems<FamilyType>(pCmd);
--- a/unit_tests/helpers/kernel_commands_tests.cpp
+++ b/unit_tests/helpers/kernel_commands_tests.cpp
@@ -155,10 +155,13 @@ HWTEST_F(KernelCommandsTest, sendCrossThreadDataResourceUsage) {

    auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto usedBefore = indirectHeap.getUsed();
-
+    auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
    KernelCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap,
-        *kernel);
+        *kernel,
+        false,
+        nullptr,
+        sizeCrossThreadData);

    auto usedAfter = indirectHeap.getUsed();
    EXPECT_EQ(kernel->getCrossThreadDataSize(), usedAfter - usedBefore);
@@ -178,10 +181,13 @@ HWTEST_F(KernelCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoComment

    PatchInfoData patchInfoData = {0xaaaaaaaa, 0, PatchInfoAllocationType::KernelArg, 0xbbbbbbbb, 0, PatchInfoAllocationType::IndirectObjectHeap};
    kernel->getPatchInfoDataList().push_back(patchInfoData);
-
+    auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
    KernelCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap,
-        *kernel);
+        *kernel,
+        false,
+        nullptr,
+        sizeCrossThreadData);

    ASSERT_EQ(1u, kernel->getPatchInfoDataList().size());
    EXPECT_EQ(0xaaaaaaaa, kernel->getPatchInfoDataList()[0].sourceAllocation);
@@ -197,9 +203,13 @@ HWTEST_F(KernelCommandsTest, givenIndirectHeapNotAllocatedFromInternalPoolWhenSe
    IndirectHeap indirectHeap(nonInternalAllocation, false);

    MockKernelWithInternals mockKernelWithInternal(*pDevice);
+    auto sizeCrossThreadData = mockKernelWithInternal.mockKernel->getCrossThreadDataSize();
    auto offset = KernelCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap,
-        *mockKernelWithInternal.mockKernel);
+        *mockKernelWithInternal.mockKernel,
+        false,
+        nullptr,
+        sizeCrossThreadData);
    EXPECT_EQ(0u, offset);
    pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation);
 }
@@ -210,9 +220,13 @@ HWTEST_F(KernelCommandsTest, givenIndirectHeapAllocatedFromInternalPoolWhenSendC
    auto expectedOffset = internalAllocation->getGpuAddressToPatch();

    MockKernelWithInternals mockKernelWithInternal(*pDevice);
+    auto sizeCrossThreadData = mockKernelWithInternal.mockKernel->getCrossThreadDataSize();
    auto offset = KernelCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap,
-        *mockKernelWithInternal.mockKernel);
+        *mockKernelWithInternal.mockKernel,
+        false,
+        nullptr,
+        sizeCrossThreadData);
    EXPECT_EQ(expectedOffset, offset);

    pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation);
@@ -239,10 +253,13 @@ HWTEST_F(KernelCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoComment

    kernel->getPatchInfoDataList().push_back(patchInfoData1);
    kernel->getPatchInfoDataList().push_back(patchInfoData2);
-
+    auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
    auto offsetCrossThreadData = KernelCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap,
-        *kernel);
+        *kernel,
+        false,
+        nullptr,
+        sizeCrossThreadData);

    ASSERT_NE(0u, offsetCrossThreadData);
    EXPECT_EQ(128u, offsetCrossThreadData);
@@ -1204,7 +1221,7 @@ INSTANTIATE_TEST_CASE_P(ParentKernelCommandsFromBinaryTest,
                            ::testing::Values(binaryFile),
                            ::testing::ValuesIn(KernelNames)));

-HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelAllowsInlineAndCrossThreadSizeLesserEqualThanGrfThenReturnTrue) {
+HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelAllowsInlineThenReturnTrue) {
    DebugManagerStateRestore restore;
    DebugManager.flags.EnablePassInlineData.set(true);

@@ -1217,7 +1234,7 @@ HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelAllowsInlineAnd
    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel));
 }

-HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelDisallowsInlineAndCrossThreadSizeLesserEqualThanGrfThenReturnFalse) {
+HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelDisallowsInlineThenReturnFalse) {
    DebugManagerStateRestore restore;
    DebugManager.flags.EnablePassInlineData.set(true);

@@ -1230,19 +1247,6 @@ HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelDisallowsInline
    EXPECT_FALSE(KernelCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel));
 }

-HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelAllowsInlineAndCrossThreadSizeGreaterThanGrfThenReturnFalse) {
-    DebugManagerStateRestore restore;
-    DebugManager.flags.EnablePassInlineData.set(true);
-
-    uint32_t crossThreadData[16];
-
-    MockKernelWithInternals mockKernelWithInternal(*pDevice);
-    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->PassInlineData = 1;
-    mockKernelWithInternal.mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData));
-
-    EXPECT_FALSE(KernelCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel));
-}
-
 HWTEST_F(KernelCommandsTest, whenLocalIdxInXDimPresentThenExpectLocalIdsInUseIsTrue) {
    MockKernelWithInternals mockKernelWithInternal(*pDevice);
    const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 1;
--- a/unit_tests/kernel/kernel_tests.cpp
+++ b/unit_tests/kernel/kernel_tests.cpp
@@ -358,6 +358,42 @@ TEST_F(KernelFromBinaryTests, BuiltInIsSetToFalseForRegularKernels) {

    EXPECT_FALSE(isBuiltIn);

+    delete pKernel;
+    pKernel = nullptr;
+
+    pKernelInfo = pProgram->getKernelInfo("simple_kernel_3");
+
+    pKernel = Kernel::create(
+        pProgram,
+        *pKernelInfo,
+        &retVal);
+
+    ASSERT_EQ(CL_SUCCESS, retVal);
+    ASSERT_NE(nullptr, pKernel);
+
+    // get builtIn property
+    isBuiltIn = pKernel->isBuiltIn;
+
+    EXPECT_FALSE(isBuiltIn);
+
+    delete pKernel;
+    pKernel = nullptr;
+
+    pKernelInfo = pProgram->getKernelInfo("simple_kernel_4");
+
+    pKernel = Kernel::create(
+        pProgram,
+        *pKernelInfo,
+        &retVal);
+
+    ASSERT_EQ(CL_SUCCESS, retVal);
+    ASSERT_NE(nullptr, pKernel);
+
+    // get builtIn property
+    isBuiltIn = pKernel->isBuiltIn;
+
+    EXPECT_FALSE(isBuiltIn);
+
    delete pKernel;
 }

--- a/unit_tests/test_files/simple_kernels.cl
+++ b/unit_tests/test_files/simple_kernels.cl
@@ -1,23 +1,8 @@
 /*
- * Copyright (c) 2017, Intel Corporation
+ * Copyright (C) 2017-2018 Intel Corporation
 *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
+ * SPDX-License-Identifier: MIT
 *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
 */

 __kernel void simple_kernel_0(
@@ -49,3 +34,11 @@ __kernel void simple_kernel_2(

    dst[idx] = arg0;
 }
+
+__kernel void simple_kernel_3(
+    __global uint *dst) {
+    dst[get_local_id(0)] = 0;
+}
+
+__kernel void simple_kernel_4() {
+}