diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index 8f328b6e96..651bc9981e 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -534,6 +534,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( } auto mediaSamplerRequired = false; + uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber; Kernel *kernel = nullptr; for (auto &dispatchInfo : multiDispatchInfo) { if (kernel != dispatchInfo.getKernel()) { @@ -544,6 +545,8 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( kernel->makeResident(commandStreamReceiver); requiresCoherency |= kernel->requiresCoherency(); mediaSamplerRequired |= kernel->isVmeKernel(); + auto numGrfRequiredByKernel = kernel->getKernelInfo().patchInfo.executionEnvironment->NumGRFRequired; + numGrfRequired = std::max(numGrfRequired, numGrfRequiredByKernel); } if (mediaSamplerRequired) { @@ -593,7 +596,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) { dispatchFlags.outOfDeviceDependencies = &eventsRequest; } - + dispatchFlags.numGrfRequired = numGrfRequired; DEBUG_BREAK_IF(taskLevel >= Event::eventNotReady); if (gtpinIsGTPinInitialized()) { diff --git a/runtime/command_stream/command_stream_receiver.h b/runtime/command_stream/command_stream_receiver.h index a9c726f489..1741438c5c 100644 --- a/runtime/command_stream/command_stream_receiver.h +++ b/runtime/command_stream/command_stream_receiver.h @@ -15,6 +15,7 @@ #include "runtime/helpers/completion_stamp.h" #include "runtime/helpers/flat_batch_buffer_helper.h" #include "runtime/helpers/options.h" +#include "runtime/kernel/grf_config.h" #include "runtime/indirect_heap/indirect_heap.h" #include #include @@ -179,6 +180,7 @@ class CommandStreamReceiver { int8_t lastMediaSamplerConfig = -1; PreemptionMode lastPreemptionMode = PreemptionMode::Initial; uint32_t latestSentStatelessMocsConfig = 0; + uint32_t lastSentNumGrfRequired = GrfConfig::DefaultGrfNumber; LinearStream commandStream; diff --git a/runtime/command_stream/command_stream_receiver_hw.h b/runtime/command_stream/command_stream_receiver_hw.h index 47fa58e40c..fed3f5bd10 100644 --- a/runtime/command_stream/command_stream_receiver_hw.h +++ b/runtime/command_stream/command_stream_receiver_hw.h @@ -50,9 +50,9 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { size_t getCmdSizeForPreemption(const DispatchFlags &dispatchFlags) const; size_t getCmdSizeForL3Config() const; size_t getCmdSizeForPipelineSelect() const; - size_t getCmdSizeForCoherency(); + size_t getCmdSizeForComputeMode(); size_t getCmdSizeForMediaSampler(bool mediaSamplerRequired) const; - void programCoherency(LinearStream &csr, DispatchFlags &dispatchFlags); + void programComputeMode(LinearStream &csr, DispatchFlags &dispatchFlags); void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep, OsContext &osContext) override; const HardwareInfo &peekHwInfo() const { return hwInfo; } diff --git a/runtime/command_stream/command_stream_receiver_hw.inl b/runtime/command_stream/command_stream_receiver_hw.inl index ead8c2fb04..13ee9662a2 100644 --- a/runtime/command_stream/command_stream_receiver_hw.inl +++ b/runtime/command_stream/command_stream_receiver_hw.inl @@ -228,6 +228,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( csrSizeRequestFlags.coherencyRequestChanged = this->lastSentCoherencyRequest != static_cast(dispatchFlags.requiresCoherency); csrSizeRequestFlags.preemptionRequestChanged = this->lastPreemptionMode != dispatchFlags.preemptionMode; csrSizeRequestFlags.mediaSamplerConfigChanged = this->lastMediaSamplerConfig != static_cast(dispatchFlags.mediaSamplerRequired); + csrSizeRequestFlags.numGrfRequiredChanged = this->lastSentNumGrfRequired != dispatchFlags.numGrfRequired; size_t requiredScratchSizeInBytes = requiredScratchSize * device.getDeviceInfo().computeUnitsUsedForScratch; @@ -255,7 +256,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( } initPageTableManagerRegisters(commandStreamCSR); programPreemption(commandStreamCSR, device, dispatchFlags); - programCoherency(commandStreamCSR, dispatchFlags); + programComputeMode(commandStreamCSR, dispatchFlags); programL3(commandStreamCSR, dispatchFlags, newL3Config); programPipelineSelect(commandStreamCSR, dispatchFlags); programPreamble(commandStreamCSR, device, dispatchFlags, newL3Config); @@ -628,7 +629,7 @@ size_t CommandStreamReceiverHw::getRequiredCmdStreamSize(const Dispat size += sizeof(typename GfxFamily::MI_BATCH_BUFFER_START); size += getCmdSizeForL3Config(); - size += getCmdSizeForCoherency(); + size += getCmdSizeForComputeMode(); size += getCmdSizeForMediaSampler(dispatchFlags.mediaSamplerRequired); size += getCmdSizeForPipelineSelect(); size += getCmdSizeForPreemption(dispatchFlags); diff --git a/runtime/command_stream/csr_definitions.h b/runtime/command_stream/csr_definitions.h index ced2347515..ccb291db82 100644 --- a/runtime/command_stream/csr_definitions.h +++ b/runtime/command_stream/csr_definitions.h @@ -9,6 +9,7 @@ #include "runtime/memory_manager/memory_constants.h" #include "runtime/helpers/hw_info.h" #include "runtime/helpers/properties_helper.h" +#include "runtime/kernel/grf_config.h" #include namespace OCLRT { @@ -43,6 +44,7 @@ struct DispatchFlags { FlushStampTrackingObj *flushStampReference = nullptr; PreemptionMode preemptionMode = PreemptionMode::Disabled; EventsRequest *outOfDeviceDependencies = nullptr; + uint32_t numGrfRequired = GrfConfig::DefaultGrfNumber; }; struct CsrSizeRequestFlags { @@ -51,5 +53,6 @@ struct CsrSizeRequestFlags { bool preemptionRequestChanged = false; bool mediaSamplerConfigChanged = false; bool hasSharedHandles = false; + bool numGrfRequiredChanged = false; }; } // namespace OCLRT diff --git a/runtime/gen10/command_stream_receiver_hw_gen10.cpp b/runtime/gen10/command_stream_receiver_hw_gen10.cpp index 58011bd8e6..2c802b707c 100644 --- a/runtime/gen10/command_stream_receiver_hw_gen10.cpp +++ b/runtime/gen10/command_stream_receiver_hw_gen10.cpp @@ -16,7 +16,7 @@ typedef CNLFamily Family; static auto gfxCore = IGFX_GEN10_CORE; template <> -size_t CommandStreamReceiverHw::getCmdSizeForCoherency() { +size_t CommandStreamReceiverHw::getCmdSizeForComputeMode() { if (csrSizeRequestFlags.coherencyRequestChanged) { return sizeof(typename Family::MI_LOAD_REGISTER_IMM); } @@ -24,7 +24,7 @@ size_t CommandStreamReceiverHw::getCmdSizeForCoherency() { } template <> -void CommandStreamReceiverHw::programCoherency(LinearStream &stream, DispatchFlags &dispatchFlags) { +void CommandStreamReceiverHw::programComputeMode(LinearStream &stream, DispatchFlags &dispatchFlags) { if (csrSizeRequestFlags.coherencyRequestChanged) { LriHelper::program(&stream, gen10HdcModeRegisterAddresss, DwordBuilder::build(4, true, !dispatchFlags.requiresCoherency)); this->lastSentCoherencyRequest = static_cast(dispatchFlags.requiresCoherency); diff --git a/runtime/gen8/command_stream_receiver_hw_gen8.cpp b/runtime/gen8/command_stream_receiver_hw_gen8.cpp index c08351ffb5..fe33903ae5 100644 --- a/runtime/gen8/command_stream_receiver_hw_gen8.cpp +++ b/runtime/gen8/command_stream_receiver_hw_gen8.cpp @@ -15,12 +15,12 @@ typedef BDWFamily Family; static auto gfxCore = IGFX_GEN8_CORE; template <> -size_t CommandStreamReceiverHw::getCmdSizeForCoherency() { +size_t CommandStreamReceiverHw::getCmdSizeForComputeMode() { return 0; } template <> -void CommandStreamReceiverHw::programCoherency(LinearStream &stream, DispatchFlags &dispatchFlags) { +void CommandStreamReceiverHw::programComputeMode(LinearStream &stream, DispatchFlags &dispatchFlags) { } template <> diff --git a/runtime/gen9/command_stream_receiver_hw_gen9.cpp b/runtime/gen9/command_stream_receiver_hw_gen9.cpp index 696f76fc9b..c4669182b4 100644 --- a/runtime/gen9/command_stream_receiver_hw_gen9.cpp +++ b/runtime/gen9/command_stream_receiver_hw_gen9.cpp @@ -15,12 +15,12 @@ typedef SKLFamily Family; static auto gfxCore = IGFX_GEN9_CORE; template <> -size_t CommandStreamReceiverHw::getCmdSizeForCoherency() { +size_t CommandStreamReceiverHw::getCmdSizeForComputeMode() { return 0; } template <> -void CommandStreamReceiverHw::programCoherency(LinearStream &stream, DispatchFlags &dispatchFlags) { +void CommandStreamReceiverHw::programComputeMode(LinearStream &stream, DispatchFlags &dispatchFlags) { } template <> diff --git a/runtime/kernel/CMakeLists.txt b/runtime/kernel/CMakeLists.txt index b2326e07e7..6d42fc2f6b 100644 --- a/runtime/kernel/CMakeLists.txt +++ b/runtime/kernel/CMakeLists.txt @@ -7,11 +7,13 @@ set(RUNTIME_SRCS_KERNEL ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt ${CMAKE_CURRENT_SOURCE_DIR}/dynamic_kernel_info.h + ${CMAKE_CURRENT_SOURCE_DIR}/grf_config.h ${CMAKE_CURRENT_SOURCE_DIR}/image_transformer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/image_transformer.h ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp ${CMAKE_CURRENT_SOURCE_DIR}/kernel.h ${CMAKE_CURRENT_SOURCE_DIR}/kernel.inl + ${CMAKE_CURRENT_SOURCE_DIR}${BRANCH_DIR_SUFFIX}/kernel_reconfiguration.cpp ) target_sources(${NEO_STATIC_LIB_NAME} PRIVATE ${RUNTIME_SRCS_KERNEL}) set_property(GLOBAL PROPERTY RUNTIME_SRCS_KERNEL ${RUNTIME_SRCS_KERNEL}) diff --git a/runtime/kernel/grf_config.h b/runtime/kernel/grf_config.h new file mode 100644 index 0000000000..d1e227c51b --- /dev/null +++ b/runtime/kernel/grf_config.h @@ -0,0 +1,12 @@ +/* + * Copyright (C) 2018 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#pragma once + +namespace GrfConfig { +constexpr uint32_t DefaultGrfNumber = 128u; +} \ No newline at end of file diff --git a/runtime/kernel/kernel.cpp b/runtime/kernel/kernel.cpp index fe54f72ff5..31fa2ff8ae 100644 --- a/runtime/kernel/kernel.cpp +++ b/runtime/kernel/kernel.cpp @@ -354,6 +354,8 @@ cl_int Kernel::initialize() { program->allocateBlockPrivateSurfaces(); } + reconfigureKernel(); + retVal = CL_SUCCESS; } while (false); diff --git a/runtime/kernel/kernel.h b/runtime/kernel/kernel.h index 0c17236590..8879c5e033 100644 --- a/runtime/kernel/kernel.h +++ b/runtime/kernel/kernel.h @@ -458,6 +458,8 @@ class Kernel : public BaseObject<_cl_kernel> { void resolveArgs(); + void reconfigureKernel(); + Program *program; Context *context; const Device &device; diff --git a/runtime/kernel/kernel_reconfiguration.cpp b/runtime/kernel/kernel_reconfiguration.cpp new file mode 100644 index 0000000000..b30d675e7b --- /dev/null +++ b/runtime/kernel/kernel_reconfiguration.cpp @@ -0,0 +1,13 @@ +/* + * Copyright (C) 2018 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "runtime/kernel/kernel.h" + +namespace OCLRT { +void Kernel::reconfigureKernel() { +} +} // namespace OCLRT \ No newline at end of file diff --git a/runtime/program/kernel_info.h b/runtime/program/kernel_info.h index c08c682dba..a019966d3c 100644 --- a/runtime/program/kernel_info.h +++ b/runtime/program/kernel_info.h @@ -239,5 +239,6 @@ struct KernelInfo { bool isKernelHeapSubstituted = false; GraphicsAllocation *kernelAllocation = nullptr; DebugData debugData; + bool computeMode = false; }; } // namespace OCLRT diff --git a/runtime/program/process_gen_binary.cpp b/runtime/program/process_gen_binary.cpp index 1818f57e0d..484429acf6 100644 --- a/runtime/program/process_gen_binary.cpp +++ b/runtime/program/process_gen_binary.cpp @@ -535,7 +535,8 @@ cl_int Program::parsePatchList(KernelInfo &kernelInfo) { "\n .SubgroupIndependentForwardProgressRequired", kernelInfo.patchInfo.executionEnvironment->SubgroupIndependentForwardProgressRequired, "\n .WorkgroupWalkOrderDim0", kernelInfo.workgroupWalkOrder[0], "\n .WorkgroupWalkOrderDim1", kernelInfo.workgroupWalkOrder[1], - "\n .WorkgroupWalkOrderDim2", kernelInfo.workgroupWalkOrder[2]); + "\n .WorkgroupWalkOrderDim2", kernelInfo.workgroupWalkOrder[2], + "\n .NumGRFRequired", kernelInfo.patchInfo.executionEnvironment->NumGRFRequired); break; case PATCH_TOKEN_DATA_PARAMETER_STREAM: diff --git a/unit_tests/gen10/coherency_tests_gen10.cpp b/unit_tests/gen10/coherency_tests_gen10.cpp index f1468f2334..6dc3fef593 100644 --- a/unit_tests/gen10/coherency_tests_gen10.cpp +++ b/unit_tests/gen10/coherency_tests_gen10.cpp @@ -42,19 +42,19 @@ struct Gen10CoherencyRequirements : public ::testing::Test { GEN10TEST_F(Gen10CoherencyRequirements, coherencyCmdSize) { auto lriSize = sizeof(MI_LOAD_REGISTER_IMM); overrideCoherencyRequest(false, false); - auto retSize = csr->getCmdSizeForCoherency(); + auto retSize = csr->getCmdSizeForComputeMode(); EXPECT_EQ(0u, retSize); overrideCoherencyRequest(false, true); - retSize = csr->getCmdSizeForCoherency(); + retSize = csr->getCmdSizeForComputeMode(); EXPECT_EQ(0u, retSize); overrideCoherencyRequest(true, true); - retSize = csr->getCmdSizeForCoherency(); + retSize = csr->getCmdSizeForComputeMode(); EXPECT_EQ(lriSize, retSize); overrideCoherencyRequest(true, false); - retSize = csr->getCmdSizeForCoherency(); + retSize = csr->getCmdSizeForComputeMode(); EXPECT_EQ(lriSize, retSize); } @@ -68,14 +68,14 @@ GEN10TEST_F(Gen10CoherencyRequirements, coherencyCmdValues) { expectedCmd.setDataDword(DwordBuilder::build(4, true)); overrideCoherencyRequest(true, false); - csr->programCoherency(stream, flags); + csr->programComputeMode(stream, flags); EXPECT_EQ(lriSize, stream.getUsed()); auto cmd = reinterpret_cast(stream.getCpuBase()); EXPECT_TRUE(memcmp(&expectedCmd, cmd, lriSize) == 0); overrideCoherencyRequest(true, true); - csr->programCoherency(stream, flags); + csr->programComputeMode(stream, flags); EXPECT_EQ(lriSize * 2, stream.getUsed()); cmd = reinterpret_cast(ptrOffset(stream.getCpuBase(), lriSize)); diff --git a/unit_tests/gen8/coherency_tests_gen8.cpp b/unit_tests/gen8/coherency_tests_gen8.cpp index 6df8f18b88..e8a9246e76 100644 --- a/unit_tests/gen8/coherency_tests_gen8.cpp +++ b/unit_tests/gen8/coherency_tests_gen8.cpp @@ -19,14 +19,14 @@ GEN8TEST_F(Gen8CoherencyRequirements, noCoherencyProgramming) { LinearStream stream; DispatchFlags flags = {}; - auto retSize = csr.getCmdSizeForCoherency(); + auto retSize = csr.getCmdSizeForComputeMode(); EXPECT_EQ(0u, retSize); - csr.programCoherency(stream, flags); + csr.programComputeMode(stream, flags); EXPECT_EQ(0u, stream.getUsed()); flags.requiresCoherency = true; - retSize = csr.getCmdSizeForCoherency(); + retSize = csr.getCmdSizeForComputeMode(); EXPECT_EQ(0u, retSize); - csr.programCoherency(stream, flags); + csr.programComputeMode(stream, flags); EXPECT_EQ(0u, stream.getUsed()); } diff --git a/unit_tests/gen9/coherency_tests_gen9.cpp b/unit_tests/gen9/coherency_tests_gen9.cpp index 838341600c..e42ae370b6 100644 --- a/unit_tests/gen9/coherency_tests_gen9.cpp +++ b/unit_tests/gen9/coherency_tests_gen9.cpp @@ -19,14 +19,14 @@ GEN9TEST_F(Gen9CoherencyRequirements, noCoherencyProgramming) { LinearStream stream; DispatchFlags flags = {}; - auto retSize = csr.getCmdSizeForCoherency(); + auto retSize = csr.getCmdSizeForComputeMode(); EXPECT_EQ(0u, retSize); - csr.programCoherency(stream, flags); + csr.programComputeMode(stream, flags); EXPECT_EQ(0u, stream.getUsed()); flags.requiresCoherency = true; - retSize = csr.getCmdSizeForCoherency(); + retSize = csr.getCmdSizeForComputeMode(); EXPECT_EQ(0u, retSize); - csr.programCoherency(stream, flags); + csr.programComputeMode(stream, flags); EXPECT_EQ(0u, stream.getUsed()); } diff --git a/unit_tests/mocks/mock_kernel.h b/unit_tests/mocks/mock_kernel.h index 2ed7657d2c..3426a70fa2 100644 --- a/unit_tests/mocks/mock_kernel.h +++ b/unit_tests/mocks/mock_kernel.h @@ -8,6 +8,7 @@ #pragma once #include "runtime/helpers/string.h" +#include "runtime/kernel/grf_config.h" #include "runtime/kernel/kernel.h" #include "runtime/scheduler/scheduler_kernel.h" #include "runtime/device/device.h" @@ -118,6 +119,7 @@ class MockKernel : public Kernel { SPatchExecutionEnvironment *executionEnvironment = new SPatchExecutionEnvironment; memset(executionEnvironment, 0, sizeof(SPatchExecutionEnvironment)); executionEnvironment->HasDeviceEnqueue = 0; + executionEnvironment->NumGRFRequired = GrfConfig::DefaultGrfNumber; info->patchInfo.executionEnvironment = executionEnvironment; info->crossThreadData = new char[crossThreadSize]; @@ -243,6 +245,8 @@ class MockKernelWithInternals { memset(&executionEnvironment, 0, sizeof(SPatchExecutionEnvironment)); memset(&executionEnvironmentBlock, 0, sizeof(SPatchExecutionEnvironment)); memset(&dataParameterStream, 0, sizeof(SPatchDataParameterStream)); + executionEnvironment.NumGRFRequired = GrfConfig::DefaultGrfNumber; + executionEnvironmentBlock.NumGRFRequired = GrfConfig::DefaultGrfNumber; kernelHeader.SurfaceStateHeapSize = sizeof(sshLocal); threadPayload.LocalIDXPresent = 1; threadPayload.LocalIDYPresent = 1; @@ -324,6 +328,7 @@ class MockParentKernel : public Kernel { SPatchExecutionEnvironment *executionEnvironment = new SPatchExecutionEnvironment; *executionEnvironment = {}; executionEnvironment->HasDeviceEnqueue = 1; + executionEnvironment->NumGRFRequired = GrfConfig::DefaultGrfNumber; info->patchInfo.executionEnvironment = executionEnvironment; SPatchAllocateStatelessDefaultDeviceQueueSurface *allocateDeviceQueue = new SPatchAllocateStatelessDefaultDeviceQueueSurface;