Add support for the OffsetToSkipSetFFIDGP IGC parameter

Related-To: NEO-3829

Change-Id: I18b237bac5301f57bbb26636bec94683c3d250a7
Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
This commit is contained in:
Filip Hazubski 2019-10-14 13:33:18 +02:00
parent a59559e516
commit b6e62528b6
9 changed files with 86 additions and 16 deletions

View File

@ -7,6 +7,7 @@
#pragma once
#include "runtime/command_queue/gpgpu_walker_base.inl"
#include "runtime/helpers/engine_node_helper.h"
namespace NEO {
@ -123,7 +124,7 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
// Program the walker. Invokes execution so all state should already be programmed
auto pGpGpuWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
auto isCcsUsed = isCcs(devQueueHw.getDevice().getDefaultEngine().osContext->getEngineType());
bool inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler);
HardwareCommandsHelper<GfxFamily>::sendIndirectState(
commandStream,
@ -138,7 +139,8 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
preemptionMode,
pGpGpuWalkerCmd,
nullptr,
true);
true,
isCcsUsed);
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, true);

View File

@ -7,6 +7,7 @@
#pragma once
#include "runtime/command_queue/hardware_interface_base.inl"
#include "runtime/os_interface/os_context.h"
namespace NEO {
@ -112,6 +113,8 @@ inline void HardwareInterface<GfxFamily>::programWalker(
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, walkerCmd, timestampPacketNode, TimestampPacketStorage::WriteOperationType::AfterWalker, commandQueue.getDevice().getHardwareInfo());
}
auto isCcsUsed = isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
HardwareCommandsHelper<GfxFamily>::sendIndirectState(
commandStream,
dsh,
@ -125,7 +128,8 @@ inline void HardwareInterface<GfxFamily>::programWalker(
preemptionMode,
walkerCmd,
nullptr,
true);
true,
isCcsUsed);
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups,
numWorkGroups, localWorkSizes, simd, dim,

View File

@ -31,6 +31,8 @@ set(RUNTIME_SRCS_HELPERS_BASE
${CMAKE_CURRENT_SOURCE_DIR}/dispatch_info_builder.h
${CMAKE_CURRENT_SOURCE_DIR}/enable_product.inl
${CMAKE_CURRENT_SOURCE_DIR}/engine_control.h
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}/engine_node_helper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/engine_node_helper.h
${CMAKE_CURRENT_SOURCE_DIR}/error_mappers.h
${CMAKE_CURRENT_SOURCE_DIR}/file_io.cpp
${CMAKE_CURRENT_SOURCE_DIR}/file_io.h

View File

@ -0,0 +1,16 @@
/*
* Copyright (C) 2019 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "engine_node.h"
namespace NEO {
// Reports whether the supplied engine type is the CCS engine, i.e. whether it
// compares equal to aub_stream::ENGINE_CCS.
bool isCcs(aub_stream::EngineType engineType) {
    const bool matchesCcs = (engineType == aub_stream::ENGINE_CCS);
    return matchesCcs;
}
} // namespace NEO

View File

@ -0,0 +1,16 @@
/*
* Copyright (C) 2019 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "engine_node.h"
namespace NEO {
// Returns true when engineType compares equal to aub_stream::ENGINE_CCS,
// false otherwise.
bool isCcs(aub_stream::EngineType engineType);
} // namespace NEO

View File

@ -112,7 +112,8 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
PreemptionMode preemptionMode,
WALKER_TYPE<GfxFamily> *walkerCmd,
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
bool localIdsGenerationByRuntime);
bool localIdsGenerationByRuntime,
bool isCcsUsed);
static void programPerThreadData(
size_t &sizePerThreadData,
@ -140,7 +141,8 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
const KernelInfo &kernelInfo,
const bool &localIdsGenerationByRuntime,
const bool &kernelUsesLocalIds,
Kernel &kernel);
Kernel &kernel,
bool isCssUsed);
static size_t getSizeRequiredCS(const Kernel *kernel);
static size_t getSizeRequiredForCacheFlush(const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress);

View File

@ -239,7 +239,8 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
PreemptionMode preemptionMode,
WALKER_TYPE<GfxFamily> *walkerCmd,
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
bool localIdsGenerationByRuntime) {
bool localIdsGenerationByRuntime,
bool isCcsUsed) {
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
@ -252,7 +253,8 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
const auto &kernelInfo = kernel.getKernelInfo();
auto kernelAllocation = kernelInfo.getGraphicsAllocation();
DEBUG_BREAK_IF(!kernelAllocation);
setKernelStartOffset(kernelStartOffset, kernelAllocation, kernelInfo, localIdsGenerationByRuntime, kernelUsesLocalIds, kernel);
setKernelStartOffset(kernelStartOffset, kernelAllocation, kernelInfo, localIdsGenerationByRuntime,
kernelUsesLocalIds, kernel, isCcsUsed);
const auto &patchInfo = kernelInfo.patchInfo;

View File

@ -95,12 +95,21 @@ void HardwareCommandsHelper<GfxFamily>::setKernelStartOffset(
const KernelInfo &kernelInfo,
const bool &localIdsGenerationByRuntime,
const bool &kernelUsesLocalIds,
Kernel &kernel) {
Kernel &kernel,
bool isCssUsed) {
if (kernelAllocation) {
kernelStartOffset = kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
}
kernelStartOffset += kernel.getStartOffset();
#ifdef WIN32
if ((kernel.getDevice().getHardwareInfo().platform.eProductFamily == IGFX_TIGERLAKE_LP) &&
(kernel.getDevice().getHardwareInfo().platform.usRevId == REVISION_A0) &&
isCssUsed) {
kernelStartOffset += kernelInfo.patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
}
#endif
}
template <typename GfxFamily>

View File

@ -14,6 +14,7 @@
#include "runtime/api/api.h"
#include "runtime/built_ins/builtins_dispatch_builder.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/helpers/engine_node_helper.h"
#include "runtime/helpers/hardware_commands_helper.h"
#include "unit_tests/fixtures/execution_model_kernel_fixture.h"
#include "unit_tests/fixtures/hello_world_fixture.h"
@ -326,6 +327,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, sendIndirectStateResourceUsage
IDToffset,
sizeof(INTERFACE_DESCRIPTOR_DATA));
uint32_t interfaceDescriptorIndex = 0;
auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
HardwareCommandsHelper<FamilyType>::sendIndirectState(
commandStream,
dsh,
@ -339,7 +341,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, sendIndirectStateResourceUsage
pDevice->getPreemptionMode(),
pWalkerCmd,
nullptr,
true);
true,
isCcsUsed);
// It's okay these are EXPECT_GE as they're only going to be used for
// estimation purposes to avoid OOM.
@ -376,6 +379,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
const size_t localWorkSize = 256;
const size_t localWorkSizes[3]{localWorkSize, 1, 1};
uint32_t interfaceDescriptorIndex = 0;
auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
HardwareCommandsHelper<FamilyType>::sendIndirectState(
commandStream,
dsh,
@ -389,7 +393,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTabl
pDevice->getPreemptionMode(),
pWalkerCmd,
nullptr,
true);
true,
isCcsUsed);
auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
if (HardwareCommandsHelper<FamilyType>::doBindingTablePrefetch()) {
@ -419,6 +424,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelThatIsSchedulerWhen
const size_t localWorkSize = 256;
const size_t localWorkSizes[3]{localWorkSize, 1, 1};
uint32_t interfaceDescriptorIndex = 0;
auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
HardwareCommandsHelper<FamilyType>::sendIndirectState(
commandStream,
dsh,
@ -432,7 +438,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelThatIsSchedulerWhen
pDevice->getPreemptionMode(),
pWalkerCmd,
nullptr,
true);
true,
isCcsUsed);
auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
EXPECT_EQ(0u, interfaceDescriptor->getBindingTableEntryCount());
@ -456,6 +463,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
const size_t localWorkSize = 256;
const size_t localWorkSizes[3]{localWorkSize, 1, 1};
uint32_t interfaceDescriptorIndex = 0;
auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
HardwareCommandsHelper<FamilyType>::sendIndirectState(
commandStream,
dsh,
@ -469,7 +477,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTable
pDevice->getPreemptionMode(),
pWalkerCmd,
nullptr,
true);
true,
isCcsUsed);
auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
if (HardwareCommandsHelper<FamilyType>::doBindingTablePrefetch()) {
@ -528,6 +537,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
modifiedKernelInfo.workgroupDimensionsOrder[2] = 0;
MockKernel mockKernel{kernel->getProgram(), modifiedKernelInfo, kernel->getDevice(), false};
uint32_t interfaceDescriptorIndex = 0;
auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
HardwareCommandsHelper<FamilyType>::sendIndirectState(
commandStream,
dsh,
@ -541,7 +551,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKe
pDevice->getPreemptionMode(),
pWalkerCmd,
nullptr,
true);
true,
isCcsUsed);
size_t numThreads = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
numThreads = Math::divideAndRoundUp(numThreads, modifiedKernelInfo.getMaxSimdSize());
@ -607,6 +618,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointer)
// force stateful path for buffers
const_cast<KernelInfo &>(kernelInfo).requiresSshForBuffers = true;
uint32_t interfaceDescriptorIndex = 0;
auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
HardwareCommandsHelper<FamilyType>::sendIndirectState(
commandStream,
dsh,
@ -620,7 +632,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointer)
pDevice->getPreemptionMode(),
pWalkerCmd,
nullptr,
true);
true,
isCcsUsed);
EXPECT_EQ(0x00000000u, *(&bindingTableStatesPointers[0]));
EXPECT_EQ(0x00000040u, *(&bindingTableStatesPointers[1]));
@ -767,6 +780,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointersF
// push surfaces states and binding table to given ssh heap
uint32_t interfaceDescriptorIndex = 0;
auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
HardwareCommandsHelper<FamilyType>::sendIndirectState(
commandStream,
dsh,
@ -780,7 +794,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointersF
pDevice->getPreemptionMode(),
pWalkerCmd,
nullptr,
true);
true,
isCcsUsed);
bti = reinterpret_cast<typename FamilyType::BINDING_TABLE_STATE *>(reinterpret_cast<unsigned char *>(ssh.getCpuBase()) + localSshOffset + btiOffset);
for (uint32_t i = 0; i < numSurfaces; ++i) {
@ -1005,6 +1020,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
mockKernelWithInternal->mockKernel->setCrossThreadData(mockKernelWithInternal->crossThreadData, sizeof(mockKernelWithInternal->crossThreadData));
mockKernelWithInternal->mockKernel->setSshLocal(mockKernelWithInternal->sshLocal, sizeof(mockKernelWithInternal->sshLocal));
uint32_t interfaceDescriptorIndex = 0;
auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
HardwareCommandsHelper<FamilyType>::sendIndirectState(
commandStream,
dsh,
@ -1018,7 +1034,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
pDevice->getPreemptionMode(),
pWalkerCmd,
nullptr,
true);
true,
isCcsUsed);
bool isMemorySame = memcmp(borderColorPointer, mockDsh, borderColorSize) == 0;
EXPECT_TRUE(isMemorySame);