Add patchToken OffsetToSkipSetFFIDGP in device execution

Change-Id: I0634836b787fa371f0b64779732941396a6ba804
Signed-off-by: Pawel Wilma <pawel.wilma@intel.com>
Related-To: NEO-3892
2019-11-13 15:37:52 +01:00 · 2019-11-13 15:37:52 +01:00 · ae0cefc834
parent 5ecb9905c9
commit ae0cefc834
21 changed files with 117 additions and 48 deletions
--- a/core/sku_info/sku_info_base.h
+++ b/core/sku_info/sku_info_base.h
@ -118,5 +118,6 @@ struct WorkaroundTableBase {
    bool waUntypedBufferCompression = false;
    bool waAuxTable16KGranular = false;
    bool waDisableFusedThreadScheduling = false;
+    bool waUseOffsetToSkipSetFFIDGP = false;
 };
 } // namespace NEO
--- a/runtime/command_queue/enqueue_common.h
+++ b/runtime/command_queue/enqueue_common.h
@ -19,6 +19,7 @@
 #include "runtime/gtpin/gtpin_notify.h"
 #include "runtime/helpers/array_count.h"
 #include "runtime/helpers/dispatch_info_builder.h"
+#include "runtime/helpers/engine_node_helper.h"
 #include "runtime/helpers/enqueue_properties.h"
 #include "runtime/helpers/hardware_commands_helper.h"
 #include "runtime/helpers/options.h"
@ -530,6 +531,7 @@ void CommandQueueHw<GfxFamily>::processDeviceEnqueue(DeviceQueueHw<GfxFamily> *d
                                                     bool &blocking) {
    auto parentKernel = multiDispatchInfo.peekParentKernel();
    size_t minSizeSSHForEM = HardwareCommandsHelper<GfxFamily>::getSizeRequiredForExecutionModel(IndirectHeap::SURFACE_STATE, *parentKernel);
+    bool isCcsUsed = isCcs(gpgpuEngine->osContext->getEngineType());

    uint32_t taskCount = getGpgpuCommandStreamReceiver().peekTaskCount() + 1;
    devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM),
@ -538,7 +540,8 @@ void CommandQueueHw<GfxFamily>::processDeviceEnqueue(DeviceQueueHw<GfxFamily> *d
                                            (uint32_t)multiDispatchInfo.size(),
                                            getGpgpuCommandStreamReceiver().getTagAllocation()->getGpuAddress(),
                                            taskCount,
-                                            hwTimeStamps);
+                                            hwTimeStamps,
+                                            isCcsUsed);

    BuiltIns &builtIns = *getDevice().getExecutionEnvironment()->getBuiltIns();
    SchedulerKernel &scheduler = builtIns.getSchedulerKernel(this->getContext());
@ -560,7 +563,8 @@ void CommandQueueHw<GfxFamily>::processDeviceEnqueue(DeviceQueueHw<GfxFamily> *d
        preemptionMode,
        scheduler,
        &getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
-        devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
+        devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
+        isCcsUsed);

    scheduler.makeResident(getGpgpuCommandStreamReceiver());

--- a/runtime/command_queue/gpgpu_walker.h
+++ b/runtime/command_queue/gpgpu_walker.h
@ -142,7 +142,8 @@ class GpgpuWalkerHelper {
        PreemptionMode preemptionMode,
        SchedulerKernel &scheduler,
        IndirectHeap *ssh,
-        IndirectHeap *dsh);
+        IndirectHeap *dsh,
+        bool isCcsUsed);

    static void adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxFamily> *storeCmd);

--- a/runtime/command_queue/gpgpu_walker_bdw_plus.inl
+++ b/runtime/command_queue/gpgpu_walker_bdw_plus.inl
@ -8,7 +8,6 @@
 #pragma once
 #include "core/helpers/simd_helper.h"
 #include "runtime/command_queue/gpgpu_walker_base.inl"
-#include "runtime/helpers/engine_node_helper.h"

 namespace NEO {

@ -60,7 +59,8 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
    PreemptionMode preemptionMode,
    SchedulerKernel &scheduler,
    IndirectHeap *ssh,
-    IndirectHeap *dsh) {
+    IndirectHeap *dsh,
+    bool isCcsUsed) {

    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
@ -125,7 +125,6 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
    // Program the walker.  Invokes execution so all state should already be programmed
    auto pGpGpuWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
-    auto isCcsUsed = isCcs(devQueueHw.getDevice().getDefaultEngine().osContext->getEngineType());
    bool inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler);
    HardwareCommandsHelper<GfxFamily>::sendIndirectState(
        commandStream,
--- a/runtime/command_queue/hardware_interface_bdw_plus.inl
+++ b/runtime/command_queue/hardware_interface_bdw_plus.inl
@ -7,6 +7,7 @@

 #pragma once
 #include "runtime/command_queue/hardware_interface_base.inl"
+#include "runtime/helpers/engine_node_helper.h"
 #include "runtime/os_interface/os_context.h"

 namespace NEO {
--- a/runtime/device_queue/device_queue.cpp
+++ b/runtime/device_queue/device_queue.cpp
@ -147,12 +147,12 @@ void DeviceQueue::initDeviceQueue() {
 }

 void DeviceQueue::setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel,
-                                              uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNode<HwTimeStamps> *hwTimeStamp) {
-    setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentCount);
+                                              uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNode<HwTimeStamps> *hwTimeStamp, bool isCcsUsed) {
+    setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentCount, isCcsUsed);
    addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, tagAddress, taskCount);
 }

-void DeviceQueue::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) {
+void DeviceQueue::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed) {
    return;
 }

@ -164,7 +164,7 @@ void DeviceQueue::resetDeviceQueue() {
    return;
 }

-void DeviceQueue::dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) {
+void DeviceQueue::dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh, bool isCcsUsed) {
    return;
 }

--- a/runtime/device_queue/device_queue.h
+++ b/runtime/device_queue/device_queue.h
@ -68,9 +68,9 @@ class DeviceQueue : public BaseObject<_device_queue> {
                               size_t paramValueSize, void *paramValue,
                               size_t *paramValueSizeRet);

-    void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNode<HwTimeStamps> *hwTimeStamp);
+    void setupExecutionModelDispatch(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentCount, uint64_t tagAddress, uint32_t taskCount, TagNode<HwTimeStamps> *hwTimeStamp, bool isCcsUsed);

-    virtual void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount);
+    virtual void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed);
    virtual void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount);

    MOCKABLE_VIRTUAL bool isEMCriticalSectionFree() {
@ -80,7 +80,7 @@ class DeviceQueue : public BaseObject<_device_queue> {
    }

    virtual void resetDeviceQueue();
-    virtual void dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh);
+    virtual void dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh, bool isCcsUsed);
    virtual IndirectHeap *getIndirectHeap(IndirectHeap::Type type);

    void acquireEMCriticalSection() {
--- a/runtime/device_queue/device_queue_hw.h
+++ b/runtime/device_queue/device_queue_hw.h
@ -54,11 +54,11 @@ class DeviceQueueHw : public DeviceQueue {

    size_t setSchedulerCrossThreadData(SchedulerKernel &scheduler);

-    void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override;
+    void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed) override;

    void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) override;
    void resetDeviceQueue() override;
-    void dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) override;
+    void dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh, bool isCcsUsed) override;

    uint32_t getSchedulerReturnInstance() {
        return igilQueue->m_controls.m_SchedulerEarlyReturn;
@ -86,6 +86,7 @@ class DeviceQueueHw : public DeviceQueue {
    static size_t getMediaStateClearCmdsSize();

    static size_t getExecutionModelCleanupSectionSize();
+    static uint64_t getBlockKernelStartPointer(const Device &device, const KernelInfo *blockInfo, bool isCcsUsed);

    LinearStream slbCS;
    IGIL_CommandQueue *igilQueue = nullptr;
--- a/runtime/device_queue/device_queue_hw_base.inl
+++ b/runtime/device_queue/device_queue_hw_base.inl
@ -179,13 +179,14 @@ size_t DeviceQueueHw<GfxFamily>::setSchedulerCrossThreadData(SchedulerKernel &sc
 }

 template <typename GfxFamily>
-void DeviceQueueHw<GfxFamily>::dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) {
+void DeviceQueueHw<GfxFamily>::dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh, bool isCcsUsed) {
    GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(commandStream,
                                                    *this,
                                                    preemptionMode,
                                                    scheduler,
                                                    ssh,
-                                                    dsh);
+                                                    dsh,
+                                                    isCcsUsed);
    return;
 }

@ -235,4 +236,17 @@ size_t DeviceQueueHw<GfxFamily>::getProfilingEndCmdsSize() {
 template <typename GfxFamily>
 void DeviceQueueHw<GfxFamily>::addDcFlushToPipeControlWa(PIPE_CONTROL *pc) {}

+template <typename GfxFamily>
+uint64_t DeviceQueueHw<GfxFamily>::getBlockKernelStartPointer(const Device &device, const KernelInfo *blockInfo, bool isCcsUsed) {
+    auto blockAllocation = blockInfo->getGraphicsAllocation();
+    DEBUG_BREAK_IF(!blockAllocation);
+
+    auto blockKernelStartPointer = blockAllocation ? blockAllocation->getGpuAddressToPatch() : 0llu;
+
+    if (blockAllocation && isCcsUsed && device.getHardwareInfo().workaroundTable.waUseOffsetToSkipSetFFIDGP) {
+        blockKernelStartPointer += blockInfo->patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
+    }
+    return blockKernelStartPointer;
+}
+
 } // namespace NEO
--- a/runtime/device_queue/device_queue_hw_bdw_plus.inl
+++ b/runtime/device_queue/device_queue_hw_bdw_plus.inl
@ -142,7 +142,7 @@ size_t DeviceQueueHw<GfxFamily>::getMediaStateClearCmdsSize() {
 }

 template <typename GfxFamily>
-void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) {
+void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed) {
    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
    void *pDSH = dynamicStateHeap.getCpuBase();

@ -174,10 +174,7 @@ void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap
    for (uint32_t i = 0; i < blockCount; i++) {
        const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);

-        auto blockAllocation = pBlockInfo->getGraphicsAllocation();
-        DEBUG_BREAK_IF(!blockAllocation);
-
-        auto gpuAddress = blockAllocation ? blockAllocation->getGpuAddressToPatch() : 0llu;
+        auto blockKernelStartPointer = getBlockKernelStartPointer(getDevice(), pBlockInfo, isCcsUsed);

        auto bindingTableCount = pBlockInfo->patchInfo.bindingTableState->Count;
        maxBindingTableCount = std::max(maxBindingTableCount, bindingTableCount);
@ -196,8 +193,8 @@ void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap
        const INTERFACE_DESCRIPTOR_DATA *pBlockID = static_cast<const INTERFACE_DESCRIPTOR_DATA *>(ptrOffset(pBlockInfo->heapInfo.pDsh, idOffset));

        pIDDestination[blockIndex + i] = *pBlockID;
-        pIDDestination[blockIndex + i].setKernelStartPointerHigh(gpuAddress >> 32);
-        pIDDestination[blockIndex + i].setKernelStartPointer((uint32_t)gpuAddress);
+        pIDDestination[blockIndex + i].setKernelStartPointerHigh(blockKernelStartPointer >> 32);
+        pIDDestination[blockIndex + i].setKernelStartPointer(static_cast<uint32_t>(blockKernelStartPointer));
        pIDDestination[blockIndex + i].setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);
        HardwareCommandsHelper<GfxFamily>::programBarrierEnable(&pIDDestination[blockIndex + i],
                                                                pBlockInfo->patchInfo.executionEnvironment->HasBarriers,
--- a/runtime/gen12lp/hw_info_tgllp.inl
+++ b/runtime/gen12lp/hw_info_tgllp.inl
@ -100,6 +100,9 @@ void TGLLP::setupFeatureAndWorkaroundTable(HardwareInfo *hwInfo) {
    workaroundTable->wa4kAlignUVOffsetNV12LinearSurface = true;
    workaroundTable->waEnablePreemptionGranularityControlByUMD = true;
    workaroundTable->waUntypedBufferCompression = true;
+    if (hwInfo->platform.usRevId == REVISION_A0) {
+        workaroundTable->waUseOffsetToSkipSetFFIDGP = true;
+    }
 };

 const HardwareInfo TGLLP_1x6x16::hwInfo = {
--- a/runtime/helpers/hardware_commands_helper_base.inl
+++ b/runtime/helpers/hardware_commands_helper_base.inl
@ -103,9 +103,7 @@ void HardwareCommandsHelper<GfxFamily>::setKernelStartOffset(
    }
    kernelStartOffset += kernel.getStartOffset();

-    if ((kernel.getDevice().getHardwareInfo().platform.eProductFamily == IGFX_TIGERLAKE_LP) &&
-        (kernel.getDevice().getHardwareInfo().platform.usRevId == REVISION_A0) &&
-        isCssUsed) {
+    if (isCssUsed && kernel.getDevice().getHardwareInfo().workaroundTable.waUseOffsetToSkipSetFFIDGP) {
        kernelStartOffset += kernelInfo.patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
    }
 }
--- a/runtime/helpers/task_information.cpp
+++ b/runtime/helpers/task_information.cpp
@ -18,6 +18,7 @@
 #include "runtime/device_queue/device_queue.h"
 #include "runtime/gtpin/gtpin_notify.h"
 #include "runtime/helpers/csr_deps.h"
+#include "runtime/helpers/engine_node_helper.h"
 #include "runtime/helpers/enqueue_properties.h"
 #include "runtime/helpers/task_information.inl"
 #include "runtime/mem_obj/mem_obj.h"
@ -126,6 +127,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
    auto devQueue = commandQueue.getContext().getDefaultDeviceQueue();

    auto commandStreamReceiverOwnership = commandStreamReceiver.obtainUniqueOwnership();
+    bool isCcsUsed = isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());

    if (executionModelKernel) {
        while (!devQueue->isEMCriticalSectionFree())
@ -158,7 +160,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
    if (executionModelKernel) {
        uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
        devQueue->setupExecutionModelDispatch(*ssh, *dsh, kernel, kernelCount,
-                                              commandStreamReceiver.getTagAllocation()->getGpuAddress(), taskCount, timestamp);
+                                              commandStreamReceiver.getTagAllocation()->getGpuAddress(), taskCount, timestamp, isCcsUsed);

        BuiltIns &builtIns = *this->kernel->getDevice().getExecutionEnvironment()->getBuiltIns();
        SchedulerKernel &scheduler = builtIns.getSchedulerKernel(commandQueue.getContext());
@ -178,7 +180,8 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
            scheduler,
            preemptionMode,
            ssh,
-            dsh);
+            dsh,
+            isCcsUsed);

        scheduler.makeResident(commandStreamReceiver);

--- a/unit_tests/device_queue/device_queue_hw_tests.cpp
+++ b/unit_tests/device_queue/device_queue_hw_tests.cpp
@ -541,7 +541,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, setupIndirectState) {
        auto usedBeforeSSH = ssh->getUsed();
        auto usedBeforeDSH = dsh->getUsed();

-        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, 1);
+        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, 1, false);
        auto usedAfterSSH = ssh->getUsed();
        auto usedAfterDSH = dsh->getUsed();

@ -571,7 +571,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, setupIndirectStateSetsCorre

        uint32_t parentCount = 4;

-        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount);
+        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount, false);
        auto *igilQueue = reinterpret_cast<IGIL_CommandQueue *>(devQueueHw->getQueueBuffer()->getUnderlyingBuffer());

        EXPECT_EQ(parentCount, igilQueue->m_controls.m_StartBlockID);
@ -601,7 +601,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, setupIndirectStateSetsCorre

        uint32_t parentCount = 1;

-        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount);
+        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount, false);
        auto *igilQueue = reinterpret_cast<IGIL_CommandQueue *>(devQueueHw->getQueueBuffer()->getUnderlyingBuffer());

        EXPECT_EQ(igilQueue->m_controls.m_DynamicHeapStart, devQueueHw->offsetDsh + alignUp((uint32_t)pKernel->getDynamicStateHeapSize(), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE));
@ -639,7 +639,7 @@ HWCMDTEST_P(IGFX_GEN8_CORE, DeviceQueueHwWithKernel, GivenHasBarriersSetWhenCall
                                                                                 const_cast<const Kernel &>(*pKernel));
        auto ssh = std::make_unique<IndirectHeap>(alignedMalloc(surfaceStateHeapSize, MemoryConstants::pageSize), surfaceStateHeapSize);

-        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount);
+        devQueueHw->setupIndirectState(*ssh, *dsh, pKernel, parentCount, false);

        auto iddStartPtr = static_cast<INTERFACE_DESCRIPTOR_DATA *>(ptrOffset(dsh->getCpuBase(), devQueueHw->colorCalcStateSize));
        auto iddStartIndex = parentCount;
@ -792,3 +792,30 @@ HWCMDTEST_F(IGFX_GEN8_CORE, TheSimplestDeviceQueueFixture, getProfilingEndCmdsSi

    EXPECT_EQ(expectedSize, MockDeviceQueueHw<FamilyType>::getProfilingEndCmdsSize());
 }
+
+HWCMDTEST_F(IGFX_GEN8_CORE, DeviceQueueHwTest, givenDeviceQueueWhenRunningOnCCsThenFfidSkipOffsetIsAddedToBlockKernelStartPointer) {
+    std::unique_ptr<MockDevice> device(MockDevice::createWithNewExecutionEnvironment<MockDevice>(platformDevices[0]));
+    std::unique_ptr<MockParentKernel> mockParentKernel(MockParentKernel::create(*pContext));
+    KernelInfo *blockInfo = const_cast<KernelInfo *>(mockParentKernel->mockProgram->getBlockKernelInfo(0));
+    blockInfo->createKernelAllocation(device->getRootDeviceIndex(), device->getMemoryManager());
+    ASSERT_NE(nullptr, blockInfo->getGraphicsAllocation());
+    const_cast<SPatchThreadPayload *>(blockInfo->patchInfo.threadPayload)->OffsetToSkipSetFFIDGP = 0x1234;
+    const_cast<HardwareInfo &>(device->getHardwareInfo()).workaroundTable.waUseOffsetToSkipSetFFIDGP = true;
+
+    uint64_t expectedOffset = blockInfo->getGraphicsAllocation()->getGpuAddressToPatch() + blockInfo->patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
+    uint64_t offset = MockDeviceQueueHw<FamilyType>::getBlockKernelStartPointer(*device, blockInfo, true);
+    EXPECT_EQ(expectedOffset, offset);
+
+    expectedOffset = blockInfo->getGraphicsAllocation()->getGpuAddressToPatch();
+    offset = MockDeviceQueueHw<FamilyType>::getBlockKernelStartPointer(*device, blockInfo, false);
+    EXPECT_EQ(expectedOffset, offset);
+
+    const_cast<HardwareInfo &>(device->getHardwareInfo()).workaroundTable.waUseOffsetToSkipSetFFIDGP = false;
+
+    expectedOffset = blockInfo->getGraphicsAllocation()->getGpuAddressToPatch();
+    offset = MockDeviceQueueHw<FamilyType>::getBlockKernelStartPointer(*device, blockInfo, true);
+    EXPECT_EQ(expectedOffset, offset);
+
+    offset = MockDeviceQueueHw<FamilyType>::getBlockKernelStartPointer(*device, blockInfo, false);
+    EXPECT_EQ(expectedOffset, offset);
+}
--- a/unit_tests/device_queue/device_queue_tests.cpp
+++ b/unit_tests/device_queue/device_queue_tests.cpp
@ -27,7 +27,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DeviceQueueSimpleTest, setupExecutionModelDispatchDo
    size_t size = 20;
    IndirectHeap ssh(buffer, size);
    IndirectHeap dsh(buffer, size);
-    devQueue.setupExecutionModelDispatch(ssh, dsh, nullptr, 0, 0, 0x123, 0);
+    devQueue.setupExecutionModelDispatch(ssh, dsh, nullptr, 0, 0, 0x123, 0, false);

    EXPECT_EQ(0u, ssh.getUsed());

@ -325,6 +325,6 @@ HWCMDTEST_F(IGFX_GEN8_CORE, DeviceQueueTest, dispatchScheduler) {
    MockSchedulerKernel *kernel = new MockSchedulerKernel(&program, info, *device);
    LinearStream cmdStream;

-    devQueue.dispatchScheduler(cmdStream, *kernel, device->getPreemptionMode(), nullptr, nullptr);
+    devQueue.dispatchScheduler(cmdStream, *kernel, device->getPreemptionMode(), nullptr, nullptr, false);
    delete kernel;
 }
--- a/unit_tests/execution_model/scheduler_dispatch_tests.cpp
+++ b/unit_tests/execution_model/scheduler_dispatch_tests.cpp
@ -67,7 +67,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, dispatchScheduler) {
            pDevice->getPreemptionMode(),
            scheduler,
            &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
-            pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
+            pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
+            false);

        EXPECT_EQ(0u, *scheduler.globalWorkOffsetX);
        EXPECT_EQ(0u, *scheduler.globalWorkOffsetY);
@ -188,7 +189,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, dispatchSchedulerDoe
            pDevice->getPreemptionMode(),
            scheduler,
            &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
-            pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
+            pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
+            false);

        auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u);

@ -224,7 +226,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, dispatchSchedulerWi
            device->getPreemptionMode(),
            scheduler,
            &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
-            mockDevQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
+            mockDevQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
+            false);

        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(commandStream, 0);
--- a/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp
+++ b/unit_tests/execution_model/submit_blocked_parent_kernel_tests.cpp
@ -52,18 +52,18 @@ class MockDeviceQueueHwWithCriticalSectionRelease : public DeviceQueueHw<GfxFami
        return igilCmdQueue->m_controls.m_CriticalSection == DeviceQueueHw<GfxFamily>::ExecutionModelCriticalSection::Free;
    }

-    void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount) override {
+    void setupIndirectState(IndirectHeap &surfaceStateHeap, IndirectHeap &dynamicStateHeap, Kernel *parentKernel, uint32_t parentIDCount, bool isCcsUsed) override {
        indirectStateSetup = true;
-        return BaseClass::setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentIDCount);
+        return BaseClass::setupIndirectState(surfaceStateHeap, dynamicStateHeap, parentKernel, parentIDCount, isCcsUsed);
    }
    void addExecutionModelCleanUpSection(Kernel *parentKernel, TagNode<HwTimeStamps> *hwTimeStamp, uint64_t tagAddress, uint32_t taskCount) override {
        cleanupSectionAdded = true;
        timestampAddedInCleanupSection = hwTimeStamp ? hwTimeStamp->tagForCpuAccess : nullptr;
        return BaseClass::addExecutionModelCleanUpSection(parentKernel, hwTimeStamp, tagAddress, taskCount);
    }
-    void dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh) override {
+    void dispatchScheduler(LinearStream &commandStream, SchedulerKernel &scheduler, PreemptionMode preemptionMode, IndirectHeap *ssh, IndirectHeap *dsh, bool isCcsUsed) override {
        schedulerDispatched = true;
-        return BaseClass::dispatchScheduler(commandStream, scheduler, preemptionMode, ssh, dsh);
+        return BaseClass::dispatchScheduler(commandStream, scheduler, preemptionMode, ssh, dsh, isCcsUsed);
    }

    uint32_t criticalSectioncheckCounter = 0;
--- a/unit_tests/gen12lp/hardware_commands_helper_tests_gen12lp.inl
+++ b/unit_tests/gen12lp/hardware_commands_helper_tests_gen12lp.inl
@ -14,16 +14,15 @@ using namespace NEO;

 using HardwareCommandsGen12LpTests = ::testing::Test;

-TGLLPTEST_F(HardwareCommandsGen12LpTests, GivenTgllpA0WhenSettingKernelStartOffsetThenAdditionalOffsetIsSet) {
+TGLLPTEST_F(HardwareCommandsGen12LpTests, GivenUseOffsetToSkipSetFFIDGPWorkaroundActiveWhenSettingKernelStartOffsetThenAdditionalOffsetIsSet) {
    const uint64_t defaultKernelStartOffset = 0;
    const uint64_t additionalOffsetDueToFfid = 0x1234;
    SPatchThreadPayload threadPayload{};
    threadPayload.OffsetToSkipSetFFIDGP = additionalOffsetDueToFfid;
    auto hwInfo = *platformDevices[0];

-    __REVID revIds[] = {REVISION_A0, REVISION_A1};
-    for (auto revId : revIds) {
-        hwInfo.platform.usRevId = revId;
+    for (auto workaround : ::testing::Bool()) {
+        hwInfo.workaroundTable.waUseOffsetToSkipSetFFIDGP = workaround;
        auto device = std::unique_ptr<MockDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(&hwInfo));
        MockKernelWithInternals mockKernelWithInternals{*device};
        mockKernelWithInternals.kernelInfo.patchInfo.threadPayload = &threadPayload;
@ -33,7 +32,7 @@ TGLLPTEST_F(HardwareCommandsGen12LpTests, GivenTgllpA0WhenSettingKernelStartOffs
            HardwareCommandsHelper<FamilyType>::setKernelStartOffset(kernelStartOffset, false, mockKernelWithInternals.kernelInfo, false,
                                                                     false, *mockKernelWithInternals.mockKernel, isCcsUsed);

-            if ((revId == REVISION_A0) && isCcsUsed) {
+            if (workaround && isCcsUsed) {
                EXPECT_EQ(defaultKernelStartOffset + additionalOffsetDueToFfid, kernelStartOffset);
            } else {
                EXPECT_EQ(defaultKernelStartOffset, kernelStartOffset);
--- a/unit_tests/gen12lp/tgllp/test_hw_info_config_tgllp.cpp
+++ b/unit_tests/gen12lp/tgllp/test_hw_info_config_tgllp.cpp
@ -98,3 +98,19 @@ TGLLPTEST_F(TgllpHwInfo, givenHwInfoConfigStringThenAfterSetupResultingVmeIsDisa
    EXPECT_FALSE(hwInfo.capabilityTable.ftrSupportsVmeAvcPreemption);
    EXPECT_FALSE(hwInfo.capabilityTable.supportsVme);
 }
+
+TGLLPTEST_F(TgllpHwInfo, givenA0SteppingWhenWaTableIsInitializedThenWaUseOffsetToSkipSetFFIDGPIsSet) {
+    HardwareInfo hwInfo;
+    hwInfo.platform.usRevId = REVISION_A0;
+    TGLLP::setupFeatureAndWorkaroundTable(&hwInfo);
+
+    EXPECT_TRUE(hwInfo.workaroundTable.waUseOffsetToSkipSetFFIDGP);
+}
+
+TGLLPTEST_F(TgllpHwInfo, givenA1SteppingWhenWaTableIsInitializedThenWaUseOffsetToSkipSetFFIDGPIsNotSet) {
+    HardwareInfo hwInfo;
+    hwInfo.platform.usRevId = REVISION_A1;
+    TGLLP::setupFeatureAndWorkaroundTable(&hwInfo);
+
+    EXPECT_FALSE(hwInfo.workaroundTable.waUseOffsetToSkipSetFFIDGP);
+}
--- a/unit_tests/gen8/scheduler_dispatch_tests_gen8.cpp
+++ b/unit_tests/gen8/scheduler_dispatch_tests_gen8.cpp
@ -45,7 +45,8 @@ BDWTEST_F(BdwSchedulerTest, givenCallToDispatchSchedulerWhenPipeControlWithCSSta
            pDevice->getPreemptionMode(),
            scheduler,
            &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
-            pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));
+            pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
+            false);

        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(commandStream, 0);
--- a/unit_tests/mocks/mock_device_queue.h
+++ b/unit_tests/mocks/mock_device_queue.h
@ -33,6 +33,7 @@ class MockDeviceQueueHw : public DeviceQueueHw<GfxFamily> {
    using BaseClass::addPipeControlCmdWa;
    using BaseClass::addProfilingEndCmds;
    using BaseClass::buildSlbDummyCommands;
+    using BaseClass::getBlockKernelStartPointer;
    using BaseClass::getCSPrefetchSize;
    using BaseClass::getExecutionModelCleanupSectionSize;
    using BaseClass::getMediaStateClearCmdsSize;