refactor: remove unused kernel tuning
Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
parent
b048d0e557
commit
62619f9525
|
@ -143,12 +143,6 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
|
|||
DebugPauseState::hasUserStartConfirmation, hwInfo);
|
||||
}
|
||||
|
||||
mainKernel->performKernelTuning(commandQueue.getGpgpuCommandStreamReceiver(),
|
||||
multiDispatchInfo.begin()->getLocalWorkgroupSize(),
|
||||
multiDispatchInfo.begin()->getActualWorkgroupSize(),
|
||||
multiDispatchInfo.begin()->getOffset(),
|
||||
walkerArgs.currentTimestampPacketNodes);
|
||||
|
||||
walkerArgs.currentDispatchIndex = 0;
|
||||
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
|
|
|
@ -1230,88 +1230,10 @@ inline void Kernel::makeArgsResident(CommandStreamReceiver &commandStreamReceive
|
|||
}
|
||||
}
|
||||
|
||||
// Advances the per-configuration tuning state machine for one enqueue and
// updates singleSubdevicePreferredInCurrentEnqueue accordingly.
//
// Nothing happens unless the EnableKernelTunning debug flag is set (!= -1) and
// maps to TunningType::full. Configurations are keyed by {gws, lws, offsets}:
//  - first sighting: a new entry is created, the supplied timestamps are
//    recorded as the "standard" run and single-subdevice is not preferred;
//  - standardTunningInProgress: the supplied timestamps are recorded as the
//    "subdevice" run and single-subdevice is preferred for this enqueue;
//  - subdeviceTunningInProgress: once hasTunningFinished() reports true the
//    entry becomes tunningDone, both timestamp containers are released and the
//    measured preference is applied; until then the preference is false;
//  - tunningDone: the stored preference is applied.
//
// commandStreamReceiver is not used by this function.
// timestampContainer is dereferenced without a null check when timestamps are recorded.
void Kernel::performKernelTuning(CommandStreamReceiver &commandStreamReceiver, const Vec3<size_t> &lws, const Vec3<size_t> &gws, const Vec3<size_t> &offsets, TimestampPacketContainer *timestampContainer) {
    const auto tunningMode = (debugManager.flags.EnableKernelTunning.get() != -1)
                                 ? static_cast<TunningType>(debugManager.flags.EnableKernelTunning.get())
                                 : TunningType::disabled;

    if (tunningMode != TunningType::full) {
        return;
    }

    KernelConfig currentConfig{gws, lws, offsets};
    auto trackedEntry = this->kernelSubmissionMap.find(currentConfig);

    // Unknown configuration: start tracking it with this enqueue as the standard run.
    if (trackedEntry == this->kernelSubmissionMap.end()) {
        KernelSubmissionData freshData;
        freshData.kernelStandardTimestamps = std::make_unique<TimestampPacketContainer>();
        freshData.kernelSubdeviceTimestamps = std::make_unique<TimestampPacketContainer>();
        freshData.status = TunningStatus::standardTunningInProgress;
        freshData.kernelStandardTimestamps->assignAndIncrementNodesRefCounts(*timestampContainer);

        this->kernelSubmissionMap[currentConfig] = std::move(freshData);
        this->singleSubdevicePreferredInCurrentEnqueue = false;
        return;
    }

    auto &trackedData = trackedEntry->second;

    switch (trackedData.status) {
    case TunningStatus::tunningDone:
        this->singleSubdevicePreferredInCurrentEnqueue = trackedData.singleSubdevicePreferred;
        break;

    case TunningStatus::subdeviceTunningInProgress:
        if (!this->hasTunningFinished(trackedData)) {
            this->singleSubdevicePreferredInCurrentEnqueue = false;
            break;
        }
        // Both runs are measured: the timestamps are no longer needed.
        trackedData.status = TunningStatus::tunningDone;
        trackedData.kernelStandardTimestamps.reset();
        trackedData.kernelSubdeviceTimestamps.reset();
        this->singleSubdevicePreferredInCurrentEnqueue = trackedData.singleSubdevicePreferred;
        break;

    case TunningStatus::standardTunningInProgress:
        // Second sighting: this enqueue becomes the single-subdevice measurement run.
        trackedData.status = TunningStatus::subdeviceTunningInProgress;
        trackedData.kernelSubdeviceTimestamps->assignAndIncrementNodesRefCounts(*timestampContainer);
        this->singleSubdevicePreferredInCurrentEnqueue = true;
        break;
    }
}
|
||||
|
||||
// Checks whether both measurement runs stored in submissionData have finished
// and, if so, records which one was faster.
//
// Returns false (leaving submissionData untouched) while either the standard
// or the subdevice run is still reported unfinished by hasRunFinished().
// Otherwise computes end-minus-start for each run from
// Event::getBoundaryTimestampValues, sets singleSubdevicePreferred to true
// when the standard run took strictly longer than the subdevice run, and
// returns true.
bool Kernel::hasTunningFinished(KernelSubmissionData &submissionData) {
    auto *standardRun = submissionData.kernelStandardTimestamps.get();
    auto *subdeviceRun = submissionData.kernelSubdeviceTimestamps.get();

    const bool bothRunsFinished = this->hasRunFinished(standardRun) && this->hasRunFinished(subdeviceRun);
    if (!bothRunsFinished) {
        return false;
    }

    // The same pair of variables is deliberately reused for both queries.
    uint64_t startTimestamp = 0u;
    uint64_t endTimestamp = 0u;

    Event::getBoundaryTimestampValues(standardRun, startTimestamp, endTimestamp);
    const auto standardDuration = endTimestamp - startTimestamp;

    Event::getBoundaryTimestampValues(subdeviceRun, startTimestamp, endTimestamp);
    const auto subdeviceDuration = endTimestamp - startTimestamp;

    submissionData.singleSubdevicePreferred = subdeviceDuration < standardDuration;
    return true;
}
|
||||
|
||||
// Returns true when no used packet of any node in timestampContainer reports
// a context-end value of 1; returns false as soon as one such packet is found.
// NOTE(review): 1 is presumably the initial "not yet written" marker - confirm.
// timestampContainer must not be null (it is dereferenced unconditionally).
bool Kernel::hasRunFinished(TimestampPacketContainer *timestampContainer) {
    for (const auto &timestampNode : timestampContainer->peekNodes()) {
        uint32_t packetIndex = 0;
        while (packetIndex < timestampNode->getPacketsUsed()) {
            if (timestampNode->getContextEndValue(packetIndex) == 1) {
                return false;
            }
            ++packetIndex;
        }
    }
    return true;
}
|
||||
|
||||
bool Kernel::isSingleSubdevicePreferred() const {
|
||||
auto &gfxCoreHelper = this->getGfxCoreHelper();
|
||||
|
||||
return this->singleSubdevicePreferredInCurrentEnqueue || gfxCoreHelper.singleTileExecImplicitScalingRequired(this->usesSyncBuffer());
|
||||
return gfxCoreHelper.singleTileExecImplicitScalingRequired(this->usesSyncBuffer());
|
||||
}
|
||||
|
||||
void Kernel::setInlineSamplers() {
|
||||
|
|
|
@ -74,18 +74,6 @@ class Kernel : public ReferenceTrackedObject<Kernel>, NEO::NonCopyableAndNonMova
|
|||
bool isSetToNullptr = false;
|
||||
};
|
||||
|
||||
// Progress of the tuning state machine for one kernel configuration,
// as driven by Kernel::performKernelTuning.
enum class TunningStatus {
    // Set when a configuration is first recorded (standard-run timestamps stored).
    standardTunningInProgress,
    // Set on the next enqueue, when the subdevice-run timestamps are stored.
    subdeviceTunningInProgress,
    // Set once both runs have finished and the preference has been computed.
    tunningDone,
};
|
||||
|
||||
// Tuning mode selected through the EnableKernelTunning debug flag.
// Kernel::performKernelTuning produces this enum with a static_cast from the
// flag's integer value, so the numeric values are spelled out explicitly to
// keep that mapping stable if enumerators are ever reordered.
enum class TunningType {
    disabled = 0, // no tuning is performed
    simple = 1,   // not acted upon by Kernel::performKernelTuning
    full = 2      // the only mode Kernel::performKernelTuning acts on
};
|
||||
|
||||
typedef int32_t (Kernel::*KernelArgHandler)(uint32_t argIndex,
|
||||
size_t argSize,
|
||||
const void *argVal);
|
||||
|
@ -279,7 +267,6 @@ class Kernel : public ReferenceTrackedObject<Kernel>, NEO::NonCopyableAndNonMova
|
|||
bool isVmeKernel() const { return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesVme; }
|
||||
bool requiresSystolicPipelineSelectMode() const { return systolicPipelineSelectMode; }
|
||||
|
||||
void performKernelTuning(CommandStreamReceiver &commandStreamReceiver, const Vec3<size_t> &lws, const Vec3<size_t> &gws, const Vec3<size_t> &offsets, TimestampPacketContainer *timestampContainer);
|
||||
MOCKABLE_VIRTUAL bool isSingleSubdevicePreferred() const;
|
||||
void setInlineSamplers();
|
||||
|
||||
|
@ -405,41 +392,6 @@ class Kernel : public ReferenceTrackedObject<Kernel>, NEO::NonCopyableAndNonMova
|
|||
}
|
||||
|
||||
protected:
|
||||
struct KernelConfig {
|
||||
Vec3<size_t> gws;
|
||||
Vec3<size_t> lws;
|
||||
Vec3<size_t> offsets;
|
||||
bool operator==(const KernelConfig &other) const { return this->gws == other.gws && this->lws == other.lws && this->offsets == other.offsets; }
|
||||
};
|
||||
struct KernelConfigHash {
|
||||
size_t operator()(KernelConfig const &config) const {
|
||||
auto hash = std::hash<size_t>{};
|
||||
size_t gwsHashX = hash(config.gws.x);
|
||||
size_t gwsHashY = hash(config.gws.y);
|
||||
size_t gwsHashZ = hash(config.gws.z);
|
||||
size_t gwsHash = hashCombine(gwsHashX, gwsHashY, gwsHashZ);
|
||||
size_t lwsHashX = hash(config.lws.x);
|
||||
size_t lwsHashY = hash(config.lws.y);
|
||||
size_t lwsHashZ = hash(config.lws.z);
|
||||
size_t lwsHash = hashCombine(lwsHashX, lwsHashY, lwsHashZ);
|
||||
size_t offsetsHashX = hash(config.offsets.x);
|
||||
size_t offsetsHashY = hash(config.offsets.y);
|
||||
size_t offsetsHashZ = hash(config.offsets.z);
|
||||
size_t offsetsHash = hashCombine(offsetsHashX, offsetsHashY, offsetsHashZ);
|
||||
return hashCombine(gwsHash, lwsHash, offsetsHash);
|
||||
}
|
||||
|
||||
size_t hashCombine(size_t hash1, size_t hash2, size_t hash3) const {
|
||||
return (hash1 ^ (hash2 << 1u)) ^ (hash3 << 2u);
|
||||
}
|
||||
};
|
||||
struct KernelSubmissionData {
|
||||
std::unique_ptr<TimestampPacketContainer> kernelStandardTimestamps;
|
||||
std::unique_ptr<TimestampPacketContainer> kernelSubdeviceTimestamps;
|
||||
TunningStatus status;
|
||||
bool singleSubdevicePreferred = false;
|
||||
};
|
||||
|
||||
Kernel(Program *programArg, const KernelInfo &kernelInfo, ClDevice &clDevice);
|
||||
|
||||
void makeArgsResident(CommandStreamReceiver &commandStreamReceiver);
|
||||
|
@ -462,9 +414,6 @@ class Kernel : public ReferenceTrackedObject<Kernel>, NEO::NonCopyableAndNonMova
|
|||
}
|
||||
cl_int patchPrivateSurface();
|
||||
|
||||
bool hasTunningFinished(KernelSubmissionData &submissionData);
|
||||
bool hasRunFinished(TimestampPacketContainer *timestampContainer);
|
||||
|
||||
void initializeLocalIdsCache();
|
||||
std::unique_ptr<LocalIdsCache> localIdsCache;
|
||||
|
||||
|
@ -472,8 +421,6 @@ class Kernel : public ReferenceTrackedObject<Kernel>, NEO::NonCopyableAndNonMova
|
|||
|
||||
std::map<uint32_t, MemObj *> migratableArgsMap{};
|
||||
|
||||
std::unordered_map<KernelConfig, KernelSubmissionData, KernelConfigHash> kernelSubmissionMap;
|
||||
|
||||
std::vector<SimpleKernelArgInfo> kernelArguments;
|
||||
std::vector<KernelArgHandler> kernelArgHandlers;
|
||||
std::vector<GraphicsAllocation *> kernelSvmGfxAllocations;
|
||||
|
@ -522,7 +469,6 @@ class Kernel : public ReferenceTrackedObject<Kernel>, NEO::NonCopyableAndNonMova
|
|||
bool auxTranslationRequired = false;
|
||||
bool systolicPipelineSelectMode = false;
|
||||
bool isUnifiedMemorySyncRequired = true;
|
||||
bool singleSubdevicePreferredInCurrentEnqueue = false;
|
||||
bool kernelHasIndirectAccess = true;
|
||||
bool anyKernelArgumentUsingSystemMemory = false;
|
||||
bool isDestinationAllocationInSystemMemory = false;
|
||||
|
|
|
@ -423,13 +423,9 @@ TEST_F(KernelTests, WhenIsSingleSubdevicePreferredIsCalledThenCorrectValuesAreRe
|
|||
std::unique_ptr<MockKernel> kernel{MockKernel::create<MockKernel>(pClDevice->getDevice(), pProgram)};
|
||||
for (auto usesSyncBuffer : ::testing::Bool()) {
|
||||
kernel->getAllocatedKernelInfo()->kernelDescriptor.kernelAttributes.flags.usesSyncBuffer = usesSyncBuffer;
|
||||
for (auto singleSubdevicePreferredInCurrentEnqueue : ::testing::Bool()) {
|
||||
kernel->singleSubdevicePreferredInCurrentEnqueue = singleSubdevicePreferredInCurrentEnqueue;
|
||||
|
||||
EXPECT_EQ(usesSyncBuffer, kernel->usesSyncBuffer());
|
||||
auto expectedSingleSubdevicePreferredInCurrentEnqueue = singleSubdevicePreferredInCurrentEnqueue || helper.singleTileExecImplicitScalingRequired(usesSyncBuffer);
|
||||
EXPECT_EQ(expectedSingleSubdevicePreferredInCurrentEnqueue, kernel->isSingleSubdevicePreferred());
|
||||
}
|
||||
EXPECT_EQ(usesSyncBuffer, kernel->usesSyncBuffer());
|
||||
EXPECT_EQ(helper.singleTileExecImplicitScalingRequired(usesSyncBuffer), kernel->isSingleSubdevicePreferred());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2371,118 +2367,6 @@ HWTEST_F(KernelResidencyTest, givenKernelWithNoKernelArgAtomicAndImplicitArgsHas
|
|||
memoryManager->freeGraphicsMemory(pKernelInfo->kernelAllocation);
|
||||
}
|
||||
|
||||
// Verifies KernelConfig::operator==: identical configs compare equal, and a
// difference in any single member (offsets, lws or gws) makes them unequal.
TEST(KernelConfigTests, givenTwoKernelConfigsWhenCompareThenResultsAreCorrect) {
    const Vec3<size_t> ones{1, 1, 1};

    const MockKernel::KernelConfig reference{ones, ones, ones};
    MockKernel::KernelConfig candidate{ones, ones, ones};
    EXPECT_TRUE(reference == candidate);

    // Only offsets differ.
    candidate.offsets.z = 2;
    EXPECT_FALSE(reference == candidate);
    candidate.offsets.z = 1;

    // Only lws differs.
    candidate.lws.z = 2;
    EXPECT_FALSE(reference == candidate);
    candidate.lws.z = 1;

    // Only gws differs.
    candidate.gws.z = 2;
    EXPECT_FALSE(reference == candidate);
}
|
||||
|
||||
// Walks one kernel configuration through the whole tuning state machine with
// EnableKernelTunning set to 2 (full): untracked -> standardTunningInProgress ->
// subdeviceTunningInProgress -> tunningDone. The state only advances to
// tunningDone after every node of both recorded containers has been overwritten.
HWTEST_F(KernelResidencyTest, givenEnableFullKernelTuningWhenPerformTunningThenKernelConfigDataIsTracked) {
    using TimestampPacketType = typename FamilyType::TimestampPacketType;
    DebugManagerStateRestore restorer;
    debugManager.flags.EnableKernelTunning.set(2u);

    auto &commandStreamReceiver = this->pDevice->getUltCommandStreamReceiver<FamilyType>();
    MockKernelWithInternals mockKernel(*this->pClDevice);
    auto &kernel = *mockKernel.mockKernel;

    Vec3<size_t> lws{1, 1, 1};
    Vec3<size_t> gws{1, 1, 1};
    Vec3<size_t> offsets{1, 1, 1};
    MockKernel::KernelConfig config{gws, lws, offsets};

    MockTimestampPacketContainer container(*commandStreamReceiver.getTimestampPacketAllocator(), 1);
    MockTimestampPacketContainer subdeviceContainer(*commandStreamReceiver.getTimestampPacketAllocator(), 2);

    // Runs one tuning step with the given timestamps and returns the map entry for config.
    auto tuneAndLookup = [&](TimestampPacketContainer &timestamps) {
        kernel.performKernelTuning(commandStreamReceiver, lws, gws, offsets, &timestamps);
        return kernel.kernelSubmissionMap.find(config);
    };

    // Rewrites a node's timestamps: start values are kept, the last two slots become 2
    // (presumably the context-end/global-end values - confirm).
    auto markNodeFinished = [](auto *node) {
        TimestampPacketType finishedData[4] = {static_cast<TimestampPacketType>(node->getContextStartValue(0)),
                                               static_cast<TimestampPacketType>(node->getGlobalStartValue(0)),
                                               2, 2};
        node->assignDataToAllTimestamps(0, finishedData);
    };

    // Nothing is tracked before the first tuning call.
    auto result = kernel.kernelSubmissionMap.find(config);
    EXPECT_EQ(result, kernel.kernelSubmissionMap.end());

    // First call records the standard run.
    result = tuneAndLookup(container);
    EXPECT_NE(result, kernel.kernelSubmissionMap.end());
    EXPECT_EQ(result->second.status, MockKernel::TunningStatus::standardTunningInProgress);
    EXPECT_FALSE(kernel.singleSubdevicePreferredInCurrentEnqueue);

    // Second call records the subdevice run and prefers a single subdevice.
    result = tuneAndLookup(subdeviceContainer);
    EXPECT_NE(result, kernel.kernelSubmissionMap.end());
    EXPECT_EQ(result->second.status, MockKernel::TunningStatus::subdeviceTunningInProgress);
    EXPECT_TRUE(kernel.singleSubdevicePreferredInCurrentEnqueue);

    // No run has finished yet: state is unchanged and the preference is dropped.
    result = tuneAndLookup(container);
    EXPECT_NE(result, kernel.kernelSubmissionMap.end());
    EXPECT_EQ(result->second.status, MockKernel::TunningStatus::subdeviceTunningInProgress);
    EXPECT_FALSE(kernel.singleSubdevicePreferredInCurrentEnqueue);

    // Standard run finished, subdevice run still pending.
    markNodeFinished(container.getNode(0u));

    result = tuneAndLookup(container);
    EXPECT_NE(result, kernel.kernelSubmissionMap.end());
    EXPECT_EQ(result->second.status, MockKernel::TunningStatus::subdeviceTunningInProgress);
    EXPECT_FALSE(kernel.singleSubdevicePreferredInCurrentEnqueue);

    // Only the first of the two subdevice nodes finished.
    markNodeFinished(subdeviceContainer.getNode(0u));

    result = tuneAndLookup(container);
    EXPECT_NE(result, kernel.kernelSubmissionMap.end());
    EXPECT_NE(result->second.kernelStandardTimestamps.get(), nullptr);
    EXPECT_NE(result->second.kernelSubdeviceTimestamps.get(), nullptr);
    EXPECT_EQ(result->second.status, MockKernel::TunningStatus::subdeviceTunningInProgress);
    EXPECT_FALSE(kernel.singleSubdevicePreferredInCurrentEnqueue);

    // All nodes finished: tuning completes and the stored timestamps are released.
    markNodeFinished(subdeviceContainer.getNode(1u));

    result = tuneAndLookup(container);
    EXPECT_NE(result, kernel.kernelSubmissionMap.end());
    EXPECT_EQ(result->second.kernelStandardTimestamps.get(), nullptr);
    EXPECT_EQ(result->second.kernelSubdeviceTimestamps.get(), nullptr);
    EXPECT_EQ(result->second.status, MockKernel::TunningStatus::tunningDone);
    EXPECT_EQ(result->second.singleSubdevicePreferred, kernel.singleSubdevicePreferredInCurrentEnqueue);

    // Further calls keep the finished state and reuse the stored preference.
    result = tuneAndLookup(container);
    EXPECT_NE(result, kernel.kernelSubmissionMap.end());
    EXPECT_EQ(result->second.status, MockKernel::TunningStatus::tunningDone);
    EXPECT_EQ(result->second.singleSubdevicePreferred, kernel.singleSubdevicePreferredInCurrentEnqueue);
}
|
||||
|
||||
HWTEST_F(KernelResidencyTest, givenSimpleKernelWhenExecEnvDoesNotHavePageFaultManagerThenPageFaultDoesNotMoveAllocation) {
|
||||
auto mockPageFaultManager = std::make_unique<MockPageFaultManager>();
|
||||
MockKernelWithInternals mockKernel(*this->pClDevice);
|
||||
|
|
|
@ -111,9 +111,7 @@ class MockKernel : public Kernel {
|
|||
using Kernel::isUnifiedMemorySyncRequired;
|
||||
using Kernel::kernelArgHandlers;
|
||||
using Kernel::kernelArguments;
|
||||
using Kernel::KernelConfig;
|
||||
using Kernel::kernelHasIndirectAccess;
|
||||
using Kernel::kernelSubmissionMap;
|
||||
using Kernel::kernelSvmGfxAllocations;
|
||||
using Kernel::kernelUnifiedMemoryGfxAllocations;
|
||||
using Kernel::localBindingTableOffset;
|
||||
|
@ -129,7 +127,6 @@ class MockKernel : public Kernel {
|
|||
using Kernel::preferredWkgMultipleOffset;
|
||||
using Kernel::privateSurface;
|
||||
using Kernel::setInlineSamplers;
|
||||
using Kernel::singleSubdevicePreferredInCurrentEnqueue;
|
||||
using Kernel::unifiedMemoryControls;
|
||||
|
||||
using Kernel::implicitArgsVersion;
|
||||
|
|
Loading…
Reference in New Issue