Grf configuration

Change-Id: I3741f53a38c6707b0c8ad82ae553ea65ae6917e4
Signed-off-by: Pawel Wilma <pawel.wilma@intel.com>
This commit is contained in:
Pawel Wilma
2018-09-21 14:06:35 +02:00
committed by sys_ocldev
parent a81b1a461f
commit e06aa17dfc
19 changed files with 73 additions and 26 deletions

View File

@ -534,6 +534,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
}
auto mediaSamplerRequired = false;
uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber;
Kernel *kernel = nullptr;
for (auto &dispatchInfo : multiDispatchInfo) {
if (kernel != dispatchInfo.getKernel()) {
@ -544,6 +545,8 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
kernel->makeResident(commandStreamReceiver);
requiresCoherency |= kernel->requiresCoherency();
mediaSamplerRequired |= kernel->isVmeKernel();
auto numGrfRequiredByKernel = kernel->getKernelInfo().patchInfo.executionEnvironment->NumGRFRequired;
numGrfRequired = std::max(numGrfRequired, numGrfRequiredByKernel);
}
if (mediaSamplerRequired) {
@ -593,7 +596,7 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) {
dispatchFlags.outOfDeviceDependencies = &eventsRequest;
}
dispatchFlags.numGrfRequired = numGrfRequired;
DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady);
if (gtpinIsGTPinInitialized()) {

View File

@ -15,6 +15,7 @@
#include "runtime/helpers/completion_stamp.h"
#include "runtime/helpers/flat_batch_buffer_helper.h"
#include "runtime/helpers/options.h"
#include "runtime/kernel/grf_config.h"
#include "runtime/indirect_heap/indirect_heap.h"
#include <cstddef>
#include <cstdint>
@ -179,6 +180,7 @@ class CommandStreamReceiver {
int8_t lastMediaSamplerConfig = -1;
PreemptionMode lastPreemptionMode = PreemptionMode::Initial;
uint32_t latestSentStatelessMocsConfig = 0;
uint32_t lastSentNumGrfRequired = GrfConfig::DefaultGrfNumber;
LinearStream commandStream;

View File

@ -50,9 +50,9 @@ class CommandStreamReceiverHw : public CommandStreamReceiver {
size_t getCmdSizeForPreemption(const DispatchFlags &dispatchFlags) const;
size_t getCmdSizeForL3Config() const;
size_t getCmdSizeForPipelineSelect() const;
size_t getCmdSizeForCoherency();
size_t getCmdSizeForComputeMode();
size_t getCmdSizeForMediaSampler(bool mediaSamplerRequired) const;
void programCoherency(LinearStream &csr, DispatchFlags &dispatchFlags);
void programComputeMode(LinearStream &csr, DispatchFlags &dispatchFlags);
void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, OsContext &osContext) override;
const HardwareInfo &peekHwInfo() const { return hwInfo; }

View File

@ -228,6 +228,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
csrSizeRequestFlags.coherencyRequestChanged = this->lastSentCoherencyRequest != static_cast<int8_t>(dispatchFlags.requiresCoherency);
csrSizeRequestFlags.preemptionRequestChanged = this->lastPreemptionMode != dispatchFlags.preemptionMode;
csrSizeRequestFlags.mediaSamplerConfigChanged = this->lastMediaSamplerConfig != static_cast<int8_t>(dispatchFlags.mediaSamplerRequired);
csrSizeRequestFlags.numGrfRequiredChanged = this->lastSentNumGrfRequired != dispatchFlags.numGrfRequired;
size_t requiredScratchSizeInBytes = requiredScratchSize * device.getDeviceInfo().computeUnitsUsedForScratch;
@ -255,7 +256,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
}
initPageTableManagerRegisters(commandStreamCSR);
programPreemption(commandStreamCSR, device, dispatchFlags);
programCoherency(commandStreamCSR, dispatchFlags);
programComputeMode(commandStreamCSR, dispatchFlags);
programL3(commandStreamCSR, dispatchFlags, newL3Config);
programPipelineSelect(commandStreamCSR, dispatchFlags);
programPreamble(commandStreamCSR, device, dispatchFlags, newL3Config);
@ -628,7 +629,7 @@ size_t CommandStreamReceiverHw<GfxFamily>::getRequiredCmdStreamSize(const Dispat
size += sizeof(typename GfxFamily::MI_BATCH_BUFFER_START);
size += getCmdSizeForL3Config();
size += getCmdSizeForCoherency();
size += getCmdSizeForComputeMode();
size += getCmdSizeForMediaSampler(dispatchFlags.mediaSamplerRequired);
size += getCmdSizeForPipelineSelect();
size += getCmdSizeForPreemption(dispatchFlags);

View File

@ -9,6 +9,7 @@
#include "runtime/memory_manager/memory_constants.h"
#include "runtime/helpers/hw_info.h"
#include "runtime/helpers/properties_helper.h"
#include "runtime/kernel/grf_config.h"
#include <limits>
namespace OCLRT {
@ -43,6 +44,7 @@ struct DispatchFlags {
FlushStampTrackingObj *flushStampReference = nullptr;
PreemptionMode preemptionMode = PreemptionMode::Disabled;
EventsRequest *outOfDeviceDependencies = nullptr;
uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber;
};
struct CsrSizeRequestFlags {
@ -51,5 +53,6 @@ struct CsrSizeRequestFlags {
bool preemptionRequestChanged = false;
bool mediaSamplerConfigChanged = false;
bool hasSharedHandles = false;
bool numGrfRequiredChanged = false;
};
} // namespace OCLRT

View File

@ -16,7 +16,7 @@ typedef CNLFamily Family;
static auto gfxCore = IGFX_GEN10_CORE;
template <>
size_t CommandStreamReceiverHw<Family>::getCmdSizeForCoherency() {
size_t CommandStreamReceiverHw<Family>::getCmdSizeForComputeMode() {
if (csrSizeRequestFlags.coherencyRequestChanged) {
return sizeof(typename Family::MI_LOAD_REGISTER_IMM);
}
@ -24,7 +24,7 @@ size_t CommandStreamReceiverHw<Family>::getCmdSizeForCoherency() {
}
template <>
void CommandStreamReceiverHw<Family>::programCoherency(LinearStream &stream, DispatchFlags &dispatchFlags) {
void CommandStreamReceiverHw<Family>::programComputeMode(LinearStream &stream, DispatchFlags &dispatchFlags) {
if (csrSizeRequestFlags.coherencyRequestChanged) {
LriHelper<Family>::program(&stream, gen10HdcModeRegisterAddresss, DwordBuilder::build(4, true, !dispatchFlags.requiresCoherency));
this->lastSentCoherencyRequest = static_cast<int8_t>(dispatchFlags.requiresCoherency);

View File

@ -15,12 +15,12 @@ typedef BDWFamily Family;
static auto gfxCore = IGFX_GEN8_CORE;
template <>
size_t CommandStreamReceiverHw<Family>::getCmdSizeForCoherency() {
size_t CommandStreamReceiverHw<Family>::getCmdSizeForComputeMode() {
return 0;
}
template <>
void CommandStreamReceiverHw<Family>::programCoherency(LinearStream &stream, DispatchFlags &dispatchFlags) {
void CommandStreamReceiverHw<Family>::programComputeMode(LinearStream &stream, DispatchFlags &dispatchFlags) {
}
template <>

View File

@ -15,12 +15,12 @@ typedef SKLFamily Family;
static auto gfxCore = IGFX_GEN9_CORE;
template <>
size_t CommandStreamReceiverHw<Family>::getCmdSizeForCoherency() {
size_t CommandStreamReceiverHw<Family>::getCmdSizeForComputeMode() {
return 0;
}
template <>
void CommandStreamReceiverHw<Family>::programCoherency(LinearStream &stream, DispatchFlags &dispatchFlags) {
void CommandStreamReceiverHw<Family>::programComputeMode(LinearStream &stream, DispatchFlags &dispatchFlags) {
}
template <>

View File

@ -7,11 +7,13 @@
# Files belonging to the runtime "kernel" directory; the list is attached to the
# static library target and also published as a global property below.
set(RUNTIME_SRCS_KERNEL
${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
${CMAKE_CURRENT_SOURCE_DIR}/dynamic_kernel_info.h
${CMAKE_CURRENT_SOURCE_DIR}/grf_config.h
${CMAKE_CURRENT_SOURCE_DIR}/image_transformer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/image_transformer.h
${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp
${CMAKE_CURRENT_SOURCE_DIR}/kernel.h
${CMAKE_CURRENT_SOURCE_DIR}/kernel.inl
# The directory of this file is suffixed with BRANCH_DIR_SUFFIX, so which
# kernel_reconfiguration.cpp gets compiled depends on that variable's value.
${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}/kernel_reconfiguration.cpp
)
# Add the files as private sources of the library named by NEO_STATIC_LIB_NAME.
target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_KERNEL})
# Store the same list in a global property of the same name.
set_property(GLOBAL PROPERTY RUNTIME_SRCS_KERNEL ${RUNTIME_SRCS_KERNEL})

View File

@ -0,0 +1,12 @@
/*
* Copyright (C) 2018 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
// uint32_t is declared in <cstdint>; include it here so this header compiles
// regardless of what the including file pulled in before it.
#include <cstdint>

// Constants related to the GRF count used when dispatching kernels.
namespace GrfConfig {
// Default GRF count. Used as the initial value of DispatchFlags::numGrfRequired
// and CommandStreamReceiver::lastSentNumGrfRequired, and as the starting point
// when the command queue computes the maximum count required by its kernels.
constexpr uint32_t DefaultGrfNumber = 128u;
} // namespace GrfConfig

View File

@ -354,6 +354,8 @@ cl_int Kernel::initialize() {
program->allocateBlockPrivateSurfaces();
}
reconfigureKernel();
retVal = CL_SUCCESS;
} while (false);

View File

@ -458,6 +458,8 @@ class Kernel : public BaseObject<_cl_kernel> {
void resolveArgs();
void reconfigureKernel();
Program *program;
Context *context;
const Device &device;

View File

@ -0,0 +1,13 @@
/*
* Copyright (C) 2018 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "runtime/kernel/kernel.h"
namespace OCLRT {
// Reconfiguration step invoked from Kernel::initialize() just before it
// reports CL_SUCCESS. This implementation performs no work.
// NOTE(review): the build lists this file under a BRANCH_DIR_SUFFIX-dependent
// directory, so other variants of this function presumably exist elsewhere -
// confirm before adding logic here.
void Kernel::reconfigureKernel() {
}
} // namespace OCLRT

View File

@ -239,5 +239,6 @@ struct KernelInfo {
bool isKernelHeapSubstituted = false;
GraphicsAllocation *kernelAllocation = nullptr;
DebugData debugData;
bool computeMode = false;
};
} // namespace OCLRT

View File

@ -535,7 +535,8 @@ cl_int Program::parsePatchList(KernelInfo &kernelInfo) {
"\n .SubgroupIndependentForwardProgressRequired", kernelInfo.patchInfo.executionEnvironment->SubgroupIndependentForwardProgressRequired,
"\n .WorkgroupWalkOrderDim0", kernelInfo.workgroupWalkOrder[0],
"\n .WorkgroupWalkOrderDim1", kernelInfo.workgroupWalkOrder[1],
"\n .WorkgroupWalkOrderDim2", kernelInfo.workgroupWalkOrder[2]);
"\n .WorkgroupWalkOrderDim2", kernelInfo.workgroupWalkOrder[2],
"\n .NumGRFRequired", kernelInfo.patchInfo.executionEnvironment->NumGRFRequired);
break;
case PATCH_TOKEN_DATA_PARAMETER_STREAM:

View File

@ -42,19 +42,19 @@ struct Gen10CoherencyRequirements : public ::testing::Test {
GEN10TEST_F(Gen10CoherencyRequirements, coherencyCmdSize) {
auto lriSize = sizeof(MI_LOAD_REGISTER_IMM);
overrideCoherencyRequest(false, false);
auto retSize = csr->getCmdSizeForCoherency();
auto retSize = csr->getCmdSizeForComputeMode();
EXPECT_EQ(0u, retSize);
overrideCoherencyRequest(false, true);
retSize = csr->getCmdSizeForCoherency();
retSize = csr->getCmdSizeForComputeMode();
EXPECT_EQ(0u, retSize);
overrideCoherencyRequest(true, true);
retSize = csr->getCmdSizeForCoherency();
retSize = csr->getCmdSizeForComputeMode();
EXPECT_EQ(lriSize, retSize);
overrideCoherencyRequest(true, false);
retSize = csr->getCmdSizeForCoherency();
retSize = csr->getCmdSizeForComputeMode();
EXPECT_EQ(lriSize, retSize);
}
@ -68,14 +68,14 @@ GEN10TEST_F(Gen10CoherencyRequirements, coherencyCmdValues) {
expectedCmd.setDataDword(DwordBuilder::build(4, true));
overrideCoherencyRequest(true, false);
csr->programCoherency(stream, flags);
csr->programComputeMode(stream, flags);
EXPECT_EQ(lriSize, stream.getUsed());
auto cmd = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(stream.getCpuBase());
EXPECT_TRUE(memcmp(&expectedCmd, cmd, lriSize) == 0);
overrideCoherencyRequest(true, true);
csr->programCoherency(stream, flags);
csr->programComputeMode(stream, flags);
EXPECT_EQ(lriSize * 2, stream.getUsed());
cmd = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(ptrOffset(stream.getCpuBase(), lriSize));

View File

@ -19,14 +19,14 @@ GEN8TEST_F(Gen8CoherencyRequirements, noCoherencyProgramming) {
LinearStream stream;
DispatchFlags flags = {};
auto retSize = csr.getCmdSizeForCoherency();
auto retSize = csr.getCmdSizeForComputeMode();
EXPECT_EQ(0u, retSize);
csr.programCoherency(stream, flags);
csr.programComputeMode(stream, flags);
EXPECT_EQ(0u, stream.getUsed());
flags.requiresCoherency = true;
retSize = csr.getCmdSizeForCoherency();
retSize = csr.getCmdSizeForComputeMode();
EXPECT_EQ(0u, retSize);
csr.programCoherency(stream, flags);
csr.programComputeMode(stream, flags);
EXPECT_EQ(0u, stream.getUsed());
}

View File

@ -19,14 +19,14 @@ GEN9TEST_F(Gen9CoherencyRequirements, noCoherencyProgramming) {
LinearStream stream;
DispatchFlags flags = {};
auto retSize = csr.getCmdSizeForCoherency();
auto retSize = csr.getCmdSizeForComputeMode();
EXPECT_EQ(0u, retSize);
csr.programCoherency(stream, flags);
csr.programComputeMode(stream, flags);
EXPECT_EQ(0u, stream.getUsed());
flags.requiresCoherency = true;
retSize = csr.getCmdSizeForCoherency();
retSize = csr.getCmdSizeForComputeMode();
EXPECT_EQ(0u, retSize);
csr.programCoherency(stream, flags);
csr.programComputeMode(stream, flags);
EXPECT_EQ(0u, stream.getUsed());
}

View File

@ -8,6 +8,7 @@
#pragma once
#include "runtime/helpers/string.h"
#include "runtime/kernel/grf_config.h"
#include "runtime/kernel/kernel.h"
#include "runtime/scheduler/scheduler_kernel.h"
#include "runtime/device/device.h"
@ -118,6 +119,7 @@ class MockKernel : public Kernel {
SPatchExecutionEnvironment *executionEnvironment = new SPatchExecutionEnvironment;
memset(executionEnvironment, 0, sizeof(SPatchExecutionEnvironment));
executionEnvironment->HasDeviceEnqueue = 0;
executionEnvironment->NumGRFRequired = GrfConfig::DefaultGrfNumber;
info->patchInfo.executionEnvironment = executionEnvironment;
info->crossThreadData = new char[crossThreadSize];
@ -243,6 +245,8 @@ class MockKernelWithInternals {
memset(&executionEnvironment, 0, sizeof(SPatchExecutionEnvironment));
memset(&executionEnvironmentBlock, 0, sizeof(SPatchExecutionEnvironment));
memset(&dataParameterStream, 0, sizeof(SPatchDataParameterStream));
executionEnvironment.NumGRFRequired = GrfConfig::DefaultGrfNumber;
executionEnvironmentBlock.NumGRFRequired = GrfConfig::DefaultGrfNumber;
kernelHeader.SurfaceStateHeapSize = sizeof(sshLocal);
threadPayload.LocalIDXPresent = 1;
threadPayload.LocalIDYPresent = 1;
@ -324,6 +328,7 @@ class MockParentKernel : public Kernel {
SPatchExecutionEnvironment *executionEnvironment = new SPatchExecutionEnvironment;
*executionEnvironment = {};
executionEnvironment->HasDeviceEnqueue = 1;
executionEnvironment->NumGRFRequired = GrfConfig::DefaultGrfNumber;
info->patchInfo.executionEnvironment = executionEnvironment;
SPatchAllocateStatelessDefaultDeviceQueueSurface *allocateDeviceQueue = new SPatchAllocateStatelessDefaultDeviceQueueSurface;