Add MI_MATH MOCS support

Related-To: NEO-7458

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-02-27 16:21:40 +00:00
committed by Compute-Runtime-Automation
parent c28f0c72ea
commit c00c310cf4
12 changed files with 60 additions and 8 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022 Intel Corporation
* Copyright (C) 2022-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,12 @@ class EncodeAluHelper {
aluOps.miMath.DW0.BitField.DwordLength = AluCount - 1;
}
// Stores `mocs` in the MemoryObjectControlState field of the MI_MATH DW0 header.
// For families where GfxFamily::isUsingMiMathMocs is false this is a no-op, and
// the field access is never instantiated (the parameter is then unused, hence
// the [[maybe_unused]] attribute).
void setMocs([[maybe_unused]] uint32_t mocs) {
    if constexpr (!GfxFamily::isUsingMiMathMocs) {
        return;
    } else {
        aluOps.miMath.DW0.BitField.MemoryObjectControlState = mocs;
    }
}
// Single-argument convenience overload: forwards `opcode` to the three-argument
// setNextAlu with both trailing arguments set to AluRegisters::OPCODE_NONE.
void setNextAlu(AluRegisters opcode) {
    constexpr auto none = AluRegisters::OPCODE_NONE;
    setNextAlu(opcode, none, none);
}

View File

@@ -14,6 +14,8 @@
#include "shared/source/direct_submission/direct_submission_hw_diagnostic_mode.h"
#include "shared/source/direct_submission/relaxed_ordering_helper.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/gmm_helper/gmm_lib.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/flush_stamp.h"
#include "shared/source/helpers/gfx_core_helper.h"
@@ -102,6 +104,8 @@ void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchStaticRelaxedOrderingSch
uint64_t loopSectionStartAddress = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection<GfxFamily>::loopStartSectionStart;
const uint32_t miMathMocs = this->rootDeviceEnvironment.getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER);
// 1. Init section
{
EncodeMiPredicate<GfxFamily>::encode(schedulerCmdStream, MiPredicateType::Disable);
@@ -133,6 +137,7 @@ void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchStaticRelaxedOrderingSch
LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R8 + 4, static_cast<uint32_t>(deferredTasksListGpuVa >> 32), true);
EncodeAluHelper<GfxFamily, 10> aluHelper;
aluHelper.setMocs(miMathMocs);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_2);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_6);
aluHelper.setNextAlu(AluRegisters::OPCODE_SHL);
@@ -168,6 +173,7 @@ void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchStaticRelaxedOrderingSch
LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R8 + 4, static_cast<uint32_t>(deferredTasksListGpuVa >> 32), true);
EncodeAluHelper<GfxFamily, 14> aluHelper;
aluHelper.setMocs(miMathMocs);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_1);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_7);
aluHelper.setNextAlu(AluRegisters::OPCODE_SHL);
@@ -238,6 +244,7 @@ void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchStaticRelaxedOrderingSch
LriHelper<GfxFamily>::program(&schedulerCmdStream, CS_GPR_R10 + 4, 0, true);
EncodeAluHelper<GfxFamily, 4> aluHelper;
aluHelper.setMocs(miMathMocs);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_9);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_10);
aluHelper.setNextAlu(AluRegisters::OPCODE_ADD);
@@ -815,7 +822,10 @@ void DirectSubmissionHw<GfxFamily, Dispatcher>::preinitializeRelaxedOrderingSect
LriHelper<GfxFamily>::program(&stream, CS_GPR_R8, 8, true);
LriHelper<GfxFamily>::program(&stream, CS_GPR_R8 + 4, 0, true);
const uint32_t miMathMocs = this->rootDeviceEnvironment.getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER);
EncodeAluHelper<GfxFamily, 9> aluHelper;
aluHelper.setMocs(miMathMocs);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_1);
aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_8);
aluHelper.setNextAlu(AluRegisters::OPCODE_SHL);

View File

@@ -27,6 +27,7 @@ struct Gen11 {
static constexpr bool isUsingGenericMediaStateClear = true;
static constexpr bool isUsingMiMemFence = false;
static constexpr bool isUsingMiSetPredicate = false;
static constexpr bool isUsingMiMathMocs = false;
struct FrontEndStateSupport {
static constexpr bool scratchSize = true;

View File

@@ -26,6 +26,7 @@ struct Gen12Lp {
static constexpr uint32_t stateComputeModeForceNonCoherentMask = (0b11u << 3);
static constexpr bool isUsingMiMemFence = false;
static constexpr bool isUsingMiSetPredicate = false;
static constexpr bool isUsingMiMathMocs = false;
struct FrontEndStateSupport {
static constexpr bool scratchSize = true;

View File

@@ -28,6 +28,7 @@ struct Gen8 {
static constexpr bool isUsingGenericMediaStateClear = true;
static constexpr bool isUsingMiMemFence = false;
static constexpr bool isUsingMiSetPredicate = false;
static constexpr bool isUsingMiMathMocs = false;
struct FrontEndStateSupport {
static constexpr bool scratchSize = true;

View File

@@ -27,6 +27,7 @@ struct Gen9 {
static constexpr bool isUsingGenericMediaStateClear = true;
static constexpr bool isUsingMiMemFence = false;
static constexpr bool isUsingMiSetPredicate = false;
static constexpr bool isUsingMiMathMocs = false;
struct FrontEndStateSupport {
static constexpr bool scratchSize = true;

View File

@@ -652,7 +652,8 @@ typedef struct tagMI_MATH {
union _DW0 {
struct _BitField {
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
uint32_t Reserved : BITFIELD_RANGE(8, 22);
uint32_t MemoryObjectControlState : BITFIELD_RANGE(8, 14);
uint32_t Reserved : BITFIELD_RANGE(15, 22);
uint32_t InstructionOpcode : BITFIELD_RANGE(23, 28);
uint32_t InstructionType : BITFIELD_RANGE(29, 31);
} BitField;

View File

@@ -401,7 +401,8 @@ typedef struct tagMI_MATH {
union _DW0 {
struct _BitField {
uint32_t DwordLength : BITFIELD_RANGE(0, 7);
uint32_t Reserved : BITFIELD_RANGE(8, 22);
uint32_t MemoryObjectControlState : BITFIELD_RANGE(8, 14);
uint32_t Reserved : BITFIELD_RANGE(15, 22);
uint32_t InstructionOpcode : BITFIELD_RANGE(23, 28);
uint32_t InstructionType : BITFIELD_RANGE(29, 31);
} BitField;

View File

@@ -32,6 +32,7 @@ struct XeHpCore {
static constexpr bool isUsingGenericMediaStateClear = true;
static constexpr bool isUsingMiMemFence = false;
static constexpr bool isUsingMiSetPredicate = true;
static constexpr bool isUsingMiMathMocs = false;
struct FrontEndStateSupport {
static constexpr bool scratchSize = true;

View File

@@ -34,6 +34,7 @@ struct XeHpcCore {
static constexpr bool isUsingGenericMediaStateClear = true;
static constexpr bool isUsingMiMemFence = true;
static constexpr bool isUsingMiSetPredicate = true;
static constexpr bool isUsingMiMathMocs = true;
struct StateBaseAddressStateSupport {
static constexpr bool globalAtomics = false;

View File

@@ -34,6 +34,7 @@ struct XeHpgCore {
static constexpr bool isUsingGenericMediaStateClear = true;
static constexpr bool isUsingMiMemFence = false;
static constexpr bool isUsingMiSetPredicate = true;
static constexpr bool isUsingMiMathMocs = true;
struct FrontEndStateSupport {
static constexpr bool scratchSize = true;

View File

@@ -12,6 +12,8 @@
#include "shared/source/direct_submission/direct_submission_hw.h"
#include "shared/source/direct_submission/dispatchers/render_dispatcher.h"
#include "shared/source/direct_submission/relaxed_ordering_helper.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/gmm_helper/gmm_lib.h"
#include "shared/source/helpers/flush_stamp.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/register_offsets.h"
@@ -1054,7 +1056,7 @@ struct DirectSubmissionRelaxedOrderingTests : public DirectSubmissionDispatchBuf
bool verifyDynamicSchedulerProgramming(LinearStream &cs, uint64_t schedulerAllocationGpuVa, uint64_t semaphoreGpuVa, uint32_t semaphoreValue, size_t offset, size_t &endOffset);
template <typename FamilyType>
bool verifyStaticSchedulerProgramming(GraphicsAllocation &schedulerAllocation, uint64_t deferredTaskListVa, uint32_t expectedQueueSizeLimit);
bool verifyStaticSchedulerProgramming(GraphicsAllocation &schedulerAllocation, uint64_t deferredTaskListVa, uint32_t expectedQueueSizeLimit, uint32_t miMathMocs);
template <typename FamilyType>
bool verifyMiPredicate(void *miPredicateCmd, MiPredicateType predicateType);
@@ -1329,7 +1331,7 @@ bool DirectSubmissionRelaxedOrderingTests::verifyConditionalDataRegBbStart(void
}
template <typename FamilyType>
bool DirectSubmissionRelaxedOrderingTests::verifyStaticSchedulerProgramming(GraphicsAllocation &schedulerAllocation, uint64_t deferredTaskListVa, uint32_t expectedQueueSizeLimit) {
bool DirectSubmissionRelaxedOrderingTests::verifyStaticSchedulerProgramming(GraphicsAllocation &schedulerAllocation, uint64_t deferredTaskListVa, uint32_t expectedQueueSizeLimit, uint32_t miMathMocs) {
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
@@ -1412,6 +1414,12 @@ bool DirectSubmissionRelaxedOrderingTests::verifyStaticSchedulerProgramming(Grap
return false;
}
if constexpr (FamilyType::isUsingMiMathMocs) {
if (miMathCmd->DW0.BitField.MemoryObjectControlState != miMathMocs) {
return false;
}
}
auto miAluCmd = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(++miMathCmd);
if (!verifyAlu<FamilyType>(miAluCmd, AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_2)) {
return false;
@@ -1504,6 +1512,12 @@ bool DirectSubmissionRelaxedOrderingTests::verifyStaticSchedulerProgramming(Grap
return false;
}
if constexpr (FamilyType::isUsingMiMathMocs) {
if (miMathCmd->DW0.BitField.MemoryObjectControlState != miMathMocs) {
return false;
}
}
miAluCmd = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(++miMathCmd);
if (!verifyAlu<FamilyType>(miAluCmd, AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_1)) {
return false;
@@ -1646,6 +1660,12 @@ bool DirectSubmissionRelaxedOrderingTests::verifyStaticSchedulerProgramming(Grap
return false;
}
if constexpr (FamilyType::isUsingMiMathMocs) {
if (miMathCmd->DW0.BitField.MemoryObjectControlState != miMathMocs) {
return false;
}
}
miAluCmd = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(++miMathCmd);
if (!verifyAlu<FamilyType>(miAluCmd, AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_9)) {
return false;
@@ -1793,7 +1813,8 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenDebugFlagSetWhenDispatching
EXPECT_EQ(1u, directSubmission.dispatchStaticRelaxedOrderingSchedulerCalled);
EXPECT_TRUE(verifyStaticSchedulerProgramming<FamilyType>(*directSubmission.relaxedOrderingSchedulerAllocation,
directSubmission.deferredTasksListAllocation->getGpuAddress(), 123));
directSubmission.deferredTasksListAllocation->getGpuAddress(), 123,
pDevice->getRootDeviceEnvironment().getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER)));
}
HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenNewNumberOfClientsWhenDispatchingWorkThenIncraseQueueSize, IsAtLeastXeHpcCore) {
@@ -1806,7 +1827,8 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenNewNumberOfClientsWhenDispa
EXPECT_EQ(1u, directSubmission.dispatchStaticRelaxedOrderingSchedulerCalled);
EXPECT_EQ(RelaxedOrderingHelper::queueSizeMultiplier, directSubmission.currentRelaxedOrderingQueueSize);
EXPECT_TRUE(verifyStaticSchedulerProgramming<FamilyType>(*directSubmission.relaxedOrderingSchedulerAllocation,
directSubmission.deferredTasksListAllocation->getGpuAddress(), RelaxedOrderingHelper::queueSizeMultiplier));
directSubmission.deferredTasksListAllocation->getGpuAddress(), RelaxedOrderingHelper::queueSizeMultiplier,
pDevice->getRootDeviceEnvironment().getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER)));
const uint64_t expectedQueueSizeValueVa = directSubmission.relaxedOrderingSchedulerAllocation->getGpuAddress() +
RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection<FamilyType>::drainRequestSectionStart +
@@ -1876,7 +1898,8 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenInitializingThenDispatchStat
EXPECT_EQ(1u, directSubmission.dispatchStaticRelaxedOrderingSchedulerCalled);
EXPECT_TRUE(verifyStaticSchedulerProgramming<FamilyType>(*directSubmission.relaxedOrderingSchedulerAllocation,
directSubmission.deferredTasksListAllocation->getGpuAddress(), RelaxedOrderingHelper::queueSizeMultiplier));
directSubmission.deferredTasksListAllocation->getGpuAddress(), RelaxedOrderingHelper::queueSizeMultiplier,
pDevice->getRootDeviceEnvironment().getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER)));
}
{
@@ -2031,6 +2054,10 @@ HWTEST_F(DirectSubmissionRelaxedOrderingTests, whenDispatchingWorkThenDispatchTa
auto miMathCmd = reinterpret_cast<MI_MATH *>(++lriCmd);
EXPECT_EQ(8u, miMathCmd->DW0.BitField.DwordLength);
if constexpr (FamilyType::isUsingMiMathMocs) {
EXPECT_EQ(miMathCmd->DW0.BitField.MemoryObjectControlState, pDevice->getRootDeviceEnvironment().getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER));
}
auto miAluCmd = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(++miMathCmd);
EXPECT_TRUE(verifyAlu<FamilyType>(miAluCmd, AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_1));