From 048c90e3b177af6763fe09a7f4b62af0ae43d8d3 Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Mon, 27 Apr 2020 18:55:26 +0200 Subject: [PATCH] Remove RMW access patterns from gfx memory Related-To: NEO-4338 Change-Id: I8dcfca9a11f499fde44ca9754dec67fe5a5e3d46 Signed-off-by: Zbigniew Zdanowicz --- .../command_queue/gpgpu_walker_base.inl | 108 +++++++++++------- .../command_queue/gpgpu_walker_bdw_plus.inl | 19 +-- .../command_queue/hardware_interface_base.inl | 3 +- .../hardware_interface_bdw_plus.inl | 12 +- .../source/helpers/hardware_commands_helper.h | 2 +- .../helpers/hardware_commands_helper_base.inl | 56 ++++----- .../hardware_commands_helper_bdw_plus.inl | 34 +++--- opencl/source/mem_obj/image.inl | 42 +++---- .../hardware_commands_helper_tests.cpp | 6 +- .../command_container/command_encoder.inl | 93 ++++++++------- .../command_encoder_base.inl | 4 +- .../command_stream_receiver_hw_base.inl | 14 ++- .../command_stream_receiver_hw_bdw_plus.inl | 9 +- shared/source/gen8/preemption_gen8.cpp | 14 ++- shared/source/gen9/preemption_gen9.cpp | 14 ++- .../helpers/blit_commands_helper_base.inl | 3 +- shared/source/helpers/hw_helper_base.inl | 39 ++++--- 17 files changed, 263 insertions(+), 209 deletions(-) diff --git a/opencl/source/command_queue/gpgpu_walker_base.inl b/opencl/source/command_queue/gpgpu_walker_base.inl index 21d0a0119c..24abbf70b3 100644 --- a/opencl/source/command_queue/gpgpu_walker_base.inl +++ b/opencl/source/command_queue/gpgpu_walker_base.inl @@ -42,77 +42,94 @@ void GpgpuWalkerHelper::addAluReadModifyWriteRegister( AluRegisters operation, uint32_t mask) { // Load "Register" value into CS_GPR_R0 - typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG; - typedef typename GfxFamily::MI_MATH MI_MATH; - typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE; + using MI_LOAD_REGISTER_REG = typename GfxFamily::MI_LOAD_REGISTER_REG; + using MI_MATH = typename GfxFamily::MI_MATH; + using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE; + auto pCmd = pCommandStream->getSpaceForCmd(); - *pCmd = GfxFamily::cmdInitLoadRegisterReg; - pCmd->setSourceRegisterAddress(aluRegister); - pCmd->setDestinationRegisterAddress(CS_GPR_R0); + MI_LOAD_REGISTER_REG cmdReg = GfxFamily::cmdInitLoadRegisterReg; + cmdReg.setSourceRegisterAddress(aluRegister); + cmdReg.setDestinationRegisterAddress(CS_GPR_R0); + *pCmd = cmdReg; // Load "Mask" into CS_GPR_R1 typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; auto pCmd2 = pCommandStream->getSpaceForCmd(); - *pCmd2 = GfxFamily::cmdInitLoadRegisterImm; - pCmd2->setRegisterOffset(CS_GPR_R1); - pCmd2->setDataDword(mask); + MI_LOAD_REGISTER_IMM cmdImm = GfxFamily::cmdInitLoadRegisterImm; + cmdImm.setRegisterOffset(CS_GPR_R1); + cmdImm.setDataDword(mask); + *pCmd2 = cmdImm; // Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands auto pCmd3 = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE))); - reinterpret_cast(pCmd3)->DW0.Value = 0x0; - reinterpret_cast(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; - reinterpret_cast(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; + MI_MATH mathCmd; + mathCmd.DW0.Value = 0x0; + mathCmd.DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; + mathCmd.DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; // 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE - reinterpret_cast(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1; + mathCmd.DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1; + *reinterpret_cast(pCmd3) = mathCmd; + pCmd3++; MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast(pCmd3); + MI_MATH_ALU_INST_INLINE cmdAluParam; + cmdAluParam.DW0.Value = 0x0; // Setup first operand of MI_MATH - load CS_GPR_R0 into register A - pAluParam->DW0.BitField.ALUOpcode = + cmdAluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); - pAluParam->DW0.BitField.Operand1 = + cmdAluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); - pAluParam->DW0.BitField.Operand2 = + cmdAluParam.DW0.BitField.Operand2 = static_cast(AluRegisters::R_0); + *pAluParam = cmdAluParam; pAluParam++; + cmdAluParam.DW0.Value = 0x0; // Setup second operand of MI_MATH - load CS_GPR_R1 into register B - pAluParam->DW0.BitField.ALUOpcode = + cmdAluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); - pAluParam->DW0.BitField.Operand1 = + cmdAluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); - pAluParam->DW0.BitField.Operand2 = + cmdAluParam.DW0.BitField.Operand2 = static_cast(AluRegisters::R_1); + *pAluParam = cmdAluParam; pAluParam++; + cmdAluParam.DW0.Value = 0x0; // Setup third operand of MI_MATH - "Operation" on registers A and B - pAluParam->DW0.BitField.ALUOpcode = static_cast(operation); - pAluParam->DW0.BitField.Operand1 = 0; - pAluParam->DW0.BitField.Operand2 = 0; + cmdAluParam.DW0.BitField.ALUOpcode = static_cast(operation); + cmdAluParam.DW0.BitField.Operand1 = 0; + cmdAluParam.DW0.BitField.Operand2 = 0; + *pAluParam = cmdAluParam; pAluParam++; + cmdAluParam.DW0.Value = 0x0; // Setup fourth operand of MI_MATH - store result into CS_GPR_R0 - pAluParam->DW0.BitField.ALUOpcode = + cmdAluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); - pAluParam->DW0.BitField.Operand1 = + cmdAluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_0); - pAluParam->DW0.BitField.Operand2 = + cmdAluParam.DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + *pAluParam = cmdAluParam; // LOAD value of CS_GPR_R0 into "Register" auto pCmd4 = pCommandStream->getSpaceForCmd(); - *pCmd4 = GfxFamily::cmdInitLoadRegisterReg; - pCmd4->setSourceRegisterAddress(CS_GPR_R0); - pCmd4->setDestinationRegisterAddress(aluRegister); + cmdReg = GfxFamily::cmdInitLoadRegisterReg; + cmdReg.setSourceRegisterAddress(CS_GPR_R0); + cmdReg.setDestinationRegisterAddress(aluRegister); + *pCmd4 = cmdReg; // Add PIPE_CONTROL to flush caches auto pCmd5 = pCommandStream->getSpaceForCmd(); - *pCmd5 = GfxFamily::cmdInitPipeControl; - pCmd5->setCommandStreamerStallEnable(true); - pCmd5->setDcFlushEnable(true); - pCmd5->setTextureCacheInvalidationEnable(true); - pCmd5->setPipeControlFlushEnable(true); - pCmd5->setStateCacheInvalidationEnable(true); + PIPE_CONTROL cmdPipeControl = GfxFamily::cmdInitPipeControl; + cmdPipeControl.setCommandStreamerStallEnable(true); + cmdPipeControl.setDcFlushEnable(true); + cmdPipeControl.setTextureCacheInvalidationEnable(true); + cmdPipeControl.setPipeControlFlushEnable(true); + cmdPipeControl.setStateCacheInvalidationEnable(true); + *pCmd5 = cmdPipeControl; } template @@ -139,10 +156,11 @@ void GpgpuWalkerHelper::dispatchProfilingCommandsStart( //low part auto pMICmdLow = commandStream->getSpaceForCmd(); - *pMICmdLow = GfxFamily::cmdInitStoreRegisterMem; - adjustMiStoreRegMemMode(pMICmdLow); - pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); - pMICmdLow->setMemoryAddress(timeStampAddress); + MI_STORE_REGISTER_MEM cmd = GfxFamily::cmdInitStoreRegisterMem; + adjustMiStoreRegMemMode(&cmd); + cmd.setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); + cmd.setMemoryAddress(timeStampAddress); + *pMICmdLow = cmd; } template @@ -154,18 +172,20 @@ void GpgpuWalkerHelper::dispatchProfilingCommandsEnd( // PIPE_CONTROL for global timestamp auto pPipeControlCmd = commandStream->getSpaceForCmd(); - *pPipeControlCmd = GfxFamily::cmdInitPipeControl; - pPipeControlCmd->setCommandStreamerStallEnable(true); + PIPE_CONTROL cmdPipeControl = GfxFamily::cmdInitPipeControl; + cmdPipeControl.setCommandStreamerStallEnable(true); + *pPipeControlCmd = cmdPipeControl; //MI_STORE_REGISTER_MEM for context local timestamp uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextEndTS); //low part auto pMICmdLow = commandStream->getSpaceForCmd(); - *pMICmdLow = GfxFamily::cmdInitStoreRegisterMem; - adjustMiStoreRegMemMode(pMICmdLow); - pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); - pMICmdLow->setMemoryAddress(timeStampAddress); + MI_STORE_REGISTER_MEM cmd = GfxFamily::cmdInitStoreRegisterMem; + adjustMiStoreRegMemMode(&cmd); + cmd.setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); + cmd.setMemoryAddress(timeStampAddress); + *pMICmdLow = cmd; } template diff --git a/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl b/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl index c431265333..43599ed70e 100644 --- a/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl +++ b/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl @@ -128,8 +128,9 @@ void GpgpuWalkerHelper::dispatchScheduler( IndirectHeap *ioh = &indirectObjectHeap; // Program the walker. Invokes execution so all state should already be programmed - auto pGpGpuWalkerCmd = static_cast(commandStream.getSpace(sizeof(GPGPU_WALKER))); - *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker; + auto pGpGpuWalkerCmd = commandStream.getSpaceForCmd(); + GPGPU_WALKER cmdWalker = GfxFamily::cmdInitGpgpuWalker; + bool inlineDataProgrammingRequired = HardwareCommandsHelper::inlineDataProgrammingRequired(scheduler); auto kernelUsesLocalIds = HardwareCommandsHelper::kernelUsesLocalIds(scheduler); @@ -145,7 +146,7 @@ void GpgpuWalkerHelper::dispatchScheduler( offsetInterfaceDescriptorTable, interfaceDescriptorIndex, preemptionMode, - pGpGpuWalkerCmd, + &cmdWalker, nullptr, true); @@ -154,9 +155,10 @@ void GpgpuWalkerHelper::dispatchScheduler( size_t globalOffsets[3] = {0, 0, 0}; size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1}; - GpgpuWalkerHelper::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, + GpgpuWalkerHelper::setGpgpuWalkerThreadData(&cmdWalker, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1, true, inlineDataProgrammingRequired, *scheduler.getKernelInfo().patchInfo.threadPayload, 0u); + *pGpGpuWalkerCmd = cmdWalker; // Implement disabling special WA DisableLSQCROPERFforOCL if needed GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, false); @@ -167,11 +169,12 @@ void GpgpuWalkerHelper::dispatchScheduler( MemorySynchronizationCommands::addPipeControl(commandStream, args); // Add BB Start Cmd to the SLB in the Primary Batch Buffer - auto *bbStart = static_cast(commandStream.getSpace(sizeof(MI_BATCH_BUFFER_START))); - *bbStart = GfxFamily::cmdInitBatchBufferStart; - bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH); + auto bbStart = commandStream.getSpaceForCmd(); + MI_BATCH_BUFFER_START cmdBbStart = GfxFamily::cmdInitBatchBufferStart; + cmdBbStart.setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH); uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress(); - bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress); + cmdBbStart.setBatchBufferStartAddressGraphicsaddress472(slbAddress); + *bbStart = cmdBbStart; } } diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index 0622b91827..dfa585ae9d 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -18,8 +18,7 @@ namespace NEO { template inline WALKER_TYPE *HardwareInterface::allocateWalkerSpace(LinearStream &commandStream, const Kernel &kernel) { - auto walkerCmd = static_cast *>(commandStream.getSpace(sizeof(WALKER_TYPE))); - *walkerCmd = GfxFamily::cmdInitGpgpuWalker; + auto walkerCmd = commandStream.getSpaceForCmd>(); return walkerCmd; } diff --git a/opencl/source/command_queue/hardware_interface_bdw_plus.inl b/opencl/source/command_queue/hardware_interface_bdw_plus.inl index 6c5971aa72..536198f0d0 100644 --- a/opencl/source/command_queue/hardware_interface_bdw_plus.inl +++ b/opencl/source/command_queue/hardware_interface_bdw_plus.inl @@ -102,7 +102,8 @@ inline void HardwareInterface::programWalker( Vec3 &numberOfWorkgroups, Vec3 &startOfWorkgroups) { - auto walkerCmd = allocateWalkerSpace(commandStream, kernel); + auto walkerCmdBuf = allocateWalkerSpace(commandStream, kernel); + WALKER_TYPE walkerCmd = GfxFamily::cmdInitGpgpuWalker; uint32_t dim = dispatchInfo.getDim(); uint32_t simd = kernel.getKernelInfo().getMaxSimdSize(); @@ -112,7 +113,7 @@ inline void HardwareInterface::programWalker( if (currentTimestampPacketNodes && commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { auto timestampPacketNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex); - GpgpuWalkerHelper::setupTimestampPacket(&commandStream, walkerCmd, timestampPacketNode, TimestampPacketStorage::WriteOperationType::AfterWalker, commandQueue.getDevice().getRootDeviceEnvironment()); + GpgpuWalkerHelper::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacketNode, TimestampPacketStorage::WriteOperationType::AfterWalker, commandQueue.getDevice().getRootDeviceEnvironment()); } auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType()); @@ -130,15 +131,16 @@ inline void HardwareInterface::programWalker( offsetInterfaceDescriptorTable, interfaceDescriptorIndex, preemptionMode, - walkerCmd, + &walkerCmd, nullptr, true); - GpgpuWalkerHelper::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups, + GpgpuWalkerHelper::setGpgpuWalkerThreadData(&walkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd, dim, false, false, *kernel.getKernelInfo().patchInfo.threadPayload, 0u); - EncodeDispatchKernel::encodeAdditionalWalkerFields(commandQueue.getDevice().getHardwareInfo(), *walkerCmd); + EncodeDispatchKernel::encodeAdditionalWalkerFields(commandQueue.getDevice().getHardwareInfo(), walkerCmd); + *walkerCmdBuf = walkerCmd; } } // namespace NEO diff --git a/opencl/source/helpers/hardware_commands_helper.h b/opencl/source/helpers/hardware_commands_helper.h index 83741d6c9f..130e0dcdbf 100644 --- a/opencl/source/helpers/hardware_commands_helper.h +++ b/opencl/source/helpers/hardware_commands_helper.h @@ -155,7 +155,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper { uint32_t compareData, COMPARE_OPERATION compareMode); - static MI_ATOMIC *programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize); + static void programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize); static void programMiAtomic(MI_ATOMIC &atomic, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize); static void programCacheFlushAfterWalkerCommand(LinearStream *commandStream, const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress); static void programBarrierEnable(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo); diff --git a/opencl/source/helpers/hardware_commands_helper_base.inl b/opencl/source/helpers/hardware_commands_helper_base.inl index 73be8bf48e..ee71b33aa6 100644 --- a/opencl/source/helpers/hardware_commands_helper_base.inl +++ b/opencl/source/helpers/hardware_commands_helper_base.inl @@ -183,39 +183,40 @@ size_t HardwareCommandsHelper::sendInterfaceDescriptorData( // Allocate some memory for the interface descriptor auto pInterfaceDescriptor = getInterfaceDescriptor(indirectHeap, offsetInterfaceDescriptor, inlineInterfaceDescriptor); - *pInterfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData; + auto interfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData; // Program the kernel start pointer - pInterfaceDescriptor->setKernelStartPointerHigh(kernelStartOffset >> 32); - pInterfaceDescriptor->setKernelStartPointer((uint32_t)kernelStartOffset); + interfaceDescriptor.setKernelStartPointerHigh(kernelStartOffset >> 32); + interfaceDescriptor.setKernelStartPointer((uint32_t)kernelStartOffset); // # of threads in thread group should be based on LWS. - pInterfaceDescriptor->setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup); + interfaceDescriptor.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup); - pInterfaceDescriptor->setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL); + interfaceDescriptor.setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL); - setGrfInfo(pInterfaceDescriptor, kernel, sizeCrossThreadData, sizePerThreadData); - setAdditionalInfo(pInterfaceDescriptor, kernel, threadsPerThreadGroup); + setGrfInfo(&interfaceDescriptor, kernel, sizeCrossThreadData, sizePerThreadData); + setAdditionalInfo(&interfaceDescriptor, kernel, threadsPerThreadGroup); - pInterfaceDescriptor->setBindingTablePointer(static_cast(bindingTablePointer)); + interfaceDescriptor.setBindingTablePointer(static_cast(bindingTablePointer)); - pInterfaceDescriptor->setSamplerStatePointer(static_cast(offsetSamplerState)); + interfaceDescriptor.setSamplerStatePointer(static_cast(offsetSamplerState)); DEBUG_BREAK_IF(numSamplers > 16); auto samplerCountState = static_cast((numSamplers + 3) / 4); - pInterfaceDescriptor->setSamplerCount(samplerCountState); + interfaceDescriptor.setSamplerCount(samplerCountState); - pInterfaceDescriptor->setBindingTableEntryCount(bindingTablePrefetchSize); + interfaceDescriptor.setBindingTableEntryCount(bindingTablePrefetchSize); auto programmableIDSLMSize = static_cast(computeSlmValues(kernel.slmTotalSize)); - pInterfaceDescriptor->setSharedLocalMemorySize(programmableIDSLMSize); - programBarrierEnable(pInterfaceDescriptor, kernel.getKernelInfo().patchInfo.executionEnvironment->HasBarriers, + interfaceDescriptor.setSharedLocalMemorySize(programmableIDSLMSize); + programBarrierEnable(&interfaceDescriptor, kernel.getKernelInfo().patchInfo.executionEnvironment->HasBarriers, kernel.getDevice().getHardwareInfo()); - PreemptionHelper::programInterfaceDescriptorDataPreemption(pInterfaceDescriptor, preemptionMode); - HardwareCommandsHelper::adjustInterfaceDescriptorData(pInterfaceDescriptor, kernel.getDevice().getHardwareInfo()); + PreemptionHelper::programInterfaceDescriptorDataPreemption(&interfaceDescriptor, preemptionMode); + HardwareCommandsHelper::adjustInterfaceDescriptorData(&interfaceDescriptor, kernel.getDevice().getHardwareInfo()); + *pInterfaceDescriptor = interfaceDescriptor; return (size_t)offsetInterfaceDescriptor; } @@ -407,21 +408,24 @@ void HardwareCommandsHelper::programMiSemaphoreWait(LinearStream &com using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; auto miSemaphoreCmd = commandStream.getSpaceForCmd(); - *miSemaphoreCmd = GfxFamily::cmdInitMiSemaphoreWait; - miSemaphoreCmd->setCompareOperation(compareMode); - miSemaphoreCmd->setSemaphoreDataDword(compareData); - miSemaphoreCmd->setSemaphoreGraphicsAddress(compareAddress); - miSemaphoreCmd->setWaitMode(MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE); + MI_SEMAPHORE_WAIT cmd = GfxFamily::cmdInitMiSemaphoreWait; + + cmd.setCompareOperation(compareMode); + cmd.setSemaphoreDataDword(compareData); + cmd.setSemaphoreGraphicsAddress(compareAddress); + cmd.setWaitMode(MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE); + *miSemaphoreCmd = cmd; } template -typename GfxFamily::MI_ATOMIC *HardwareCommandsHelper::programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, - typename MI_ATOMIC::ATOMIC_OPCODES opcode, - typename MI_ATOMIC::DATA_SIZE dataSize) { +void HardwareCommandsHelper::programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, + typename MI_ATOMIC::ATOMIC_OPCODES opcode, + typename MI_ATOMIC::DATA_SIZE dataSize) { auto miAtomic = commandStream.getSpaceForCmd(); - *miAtomic = GfxFamily::cmdInitAtomic; - HardwareCommandsHelper::programMiAtomic(*miAtomic, writeAddress, opcode, dataSize); - return miAtomic; + MI_ATOMIC cmd = GfxFamily::cmdInitAtomic; + + HardwareCommandsHelper::programMiAtomic(cmd, writeAddress, opcode, dataSize); + *miAtomic = cmd; } template diff --git a/opencl/source/helpers/hardware_commands_helper_bdw_plus.inl b/opencl/source/helpers/hardware_commands_helper_bdw_plus.inl index 52c8708390..e3df52ec1f 100644 --- a/opencl/source/helpers/hardware_commands_helper_bdw_plus.inl +++ b/opencl/source/helpers/hardware_commands_helper_bdw_plus.inl @@ -66,10 +66,12 @@ void HardwareCommandsHelper::sendMediaStateFlush( LinearStream &commandStream, size_t offsetInterfaceDescriptorData) { - typedef typename GfxFamily::MEDIA_STATE_FLUSH MEDIA_STATE_FLUSH; - auto pCmd = (MEDIA_STATE_FLUSH *)commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH)); - *pCmd = GfxFamily::cmdInitMediaStateFlush; - pCmd->setInterfaceDescriptorOffset((uint32_t)offsetInterfaceDescriptorData); + using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH; + auto pCmd = commandStream.getSpaceForCmd(); + MEDIA_STATE_FLUSH cmd = GfxFamily::cmdInitMediaStateFlush; + + cmd.setInterfaceDescriptorOffset(static_cast(offsetInterfaceDescriptorData)); + *pCmd = cmd; } template @@ -78,17 +80,18 @@ void HardwareCommandsHelper::sendMediaInterfaceDescriptorLoad( size_t offsetInterfaceDescriptorData, size_t sizeInterfaceDescriptorData) { { - typedef typename GfxFamily::MEDIA_STATE_FLUSH MEDIA_STATE_FLUSH; - auto pCmd = (MEDIA_STATE_FLUSH *)commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH)); + using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH; + auto pCmd = commandStream.getSpaceForCmd(); *pCmd = GfxFamily::cmdInitMediaStateFlush; } { - typedef typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD MEDIA_INTERFACE_DESCRIPTOR_LOAD; - auto pCmd = (MEDIA_INTERFACE_DESCRIPTOR_LOAD *)commandStream.getSpace(sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD)); - *pCmd = GfxFamily::cmdInitMediaInterfaceDescriptorLoad; - pCmd->setInterfaceDescriptorDataStartAddress((uint32_t)offsetInterfaceDescriptorData); - pCmd->setInterfaceDescriptorTotalLength((uint32_t)sizeInterfaceDescriptorData); + using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD; + auto pCmd = commandStream.getSpaceForCmd(); + MEDIA_INTERFACE_DESCRIPTOR_LOAD cmd = GfxFamily::cmdInitMediaInterfaceDescriptorLoad; + cmd.setInterfaceDescriptorDataStartAddress(static_cast(offsetInterfaceDescriptorData)); + cmd.setInterfaceDescriptorTotalLength(static_cast(sizeInterfaceDescriptorData)); + *pCmd = cmd; } } @@ -154,10 +157,11 @@ void HardwareCommandsHelper::setInterfaceDescriptorOffset( template void HardwareCommandsHelper::programCacheFlushAfterWalkerCommand(LinearStream *commandStream, const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress) { using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; - auto pipeControl = reinterpret_cast(commandStream->getSpace(sizeof(PIPE_CONTROL))); - *pipeControl = GfxFamily::cmdInitPipeControl; - pipeControl->setCommandStreamerStallEnable(true); - pipeControl->setDcFlushEnable(true); + auto pipeControl = commandStream->getSpaceForCmd(); + PIPE_CONTROL cmd = GfxFamily::cmdInitPipeControl; + cmd.setCommandStreamerStallEnable(true); + cmd.setDcFlushEnable(true); + *pipeControl = cmd; } template diff --git a/opencl/source/mem_obj/image.inl b/opencl/source/mem_obj/image.inl index 8303c83adf..cf1a3b626c 100644 --- a/opencl/source/mem_obj/image.inl +++ b/opencl/source/mem_obj/image.inl @@ -135,44 +135,46 @@ void ImageHw::setMediaImageArg(void *memory) { auto gmmHelper = rootDeviceEnvironment->getGmmHelper(); auto surfaceState = reinterpret_cast(memory); - *surfaceState = GfxFamily::cmdInitMediaSurfaceState; + MEDIA_SURFACE_STATE state = GfxFamily::cmdInitMediaSurfaceState; - setMediaSurfaceRotation(reinterpret_cast(surfaceState)); + setMediaSurfaceRotation(reinterpret_cast(&state)); DEBUG_BREAK_IF(surfaceFormat == MEDIA_SURFACE_STATE::SURFACE_FORMAT_Y1_UNORM); - surfaceState->setWidth(static_cast(getImageDesc().image_width)); + state.setWidth(static_cast(getImageDesc().image_width)); - surfaceState->setHeight(static_cast(getImageDesc().image_height)); - surfaceState->setPictureStructure(MEDIA_SURFACE_STATE::PICTURE_STRUCTURE_FRAME_PICTURE); + state.setHeight(static_cast(getImageDesc().image_height)); + state.setPictureStructure(MEDIA_SURFACE_STATE::PICTURE_STRUCTURE_FRAME_PICTURE); auto gmm = getGraphicsAllocation()->getDefaultGmm(); auto tileMode = static_cast(gmm->gmmResourceInfo->getTileModeSurfaceState()); - surfaceState->setTileMode(tileMode); - surfaceState->setSurfacePitch(static_cast(getImageDesc().image_row_pitch)); + state.setTileMode(tileMode); + state.setSurfacePitch(static_cast(getImageDesc().image_row_pitch)); - surfaceState->setSurfaceFormat(surfaceFormat); + state.setSurfaceFormat(surfaceFormat); - surfaceState->setHalfPitchForChroma(false); - surfaceState->setInterleaveChroma(false); - surfaceState->setXOffsetForUCb(0); - surfaceState->setYOffsetForUCb(0); - surfaceState->setXOffsetForVCr(0); - surfaceState->setYOffsetForVCr(0); + state.setHalfPitchForChroma(false); + state.setInterleaveChroma(false); + state.setXOffsetForUCb(0); + state.setYOffsetForUCb(0); + state.setXOffsetForVCr(0); + state.setYOffsetForVCr(0); setSurfaceMemoryObjectControlStateIndexToMocsTable( - reinterpret_cast(surfaceState), + reinterpret_cast(&state), gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_IMAGE)); if (IsNV12Image(&this->getImageFormat())) { - surfaceState->setInterleaveChroma(true); - surfaceState->setYOffsetForUCb(this->surfaceOffsets.yOffsetForUVplane); + state.setInterleaveChroma(true); + state.setYOffsetForUCb(this->surfaceOffsets.yOffsetForUVplane); } - surfaceState->setVerticalLineStride(0); - surfaceState->setVerticalLineStrideOffset(0); + state.setVerticalLineStride(0); + state.setVerticalLineStrideOffset(0); - surfaceState->setSurfaceBaseAddress(getGraphicsAllocation()->getGpuAddress() + this->surfaceOffsets.offset); + state.setSurfaceBaseAddress(getGraphicsAllocation()->getGpuAddress() + this->surfaceOffsets.offset); + + *surfaceState = state; } template diff --git a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp index bb37f80abd..c26f322ddf 100644 --- a/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp +++ b/opencl/test/unit_test/helpers/hardware_commands_helper_tests.cpp @@ -1169,9 +1169,11 @@ HWTEST_F(HardwareCommandsHelperTests, whenProgrammingMiAtomicThenSetupAllFields) MI_ATOMIC referenceCommand = FamilyType::cmdInitAtomic; HardwareCommandsHelper::programMiAtomic(referenceCommand, writeAddress, opcode, dataSize); - auto miAtomic = HardwareCommandsHelper::programMiAtomic(cmdStream, writeAddress, opcode, dataSize); + HardwareCommandsHelper::programMiAtomic(cmdStream, writeAddress, opcode, dataSize); + auto miAtomic = genCmdCast(cmdStream.getCpuBase()); + ASSERT_NE(nullptr, miAtomic); + EXPECT_EQ(sizeof(MI_ATOMIC), cmdStream.getUsed()); - EXPECT_EQ(miAtomic, cmdStream.getCpuBase()); EXPECT_EQ(0, memcmp(&referenceCommand, miAtomic, sizeof(MI_ATOMIC))); } diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 81dcfc65c5..993de63508 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -43,14 +43,14 @@ uint32_t EncodeStates::copySamplerState(IndirectHeap *dsh, dsh->align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE); auto samplerStateOffsetInDsh = static_cast(dsh->getUsed()); - auto samplerState = dsh->getSpace(sizeSamplerState); + auto dstSamplerState = reinterpret_cast(dsh->getSpace(sizeSamplerState)); - memcpy_s(samplerState, sizeSamplerState, ptrOffset(fnDynamicStateHeap, samplerStateOffset), - sizeSamplerState); - - auto pSmplr = reinterpret_cast(samplerState); + auto srcSamplerState = reinterpret_cast(ptrOffset(fnDynamicStateHeap, samplerStateOffset)); + SAMPLER_STATE state = {}; for (uint32_t i = 0; i < samplerCount; i++) { - pSmplr[i].setIndirectStatePointer((uint32_t)borderColorOffsetInDsh); + state = srcSamplerState[i]; + state.setIndirectStatePointer(static_cast(borderColorOffsetInDsh)); + dstSamplerState[i] = state; } return samplerStateOffsetInDsh; @@ -114,48 +114,68 @@ void EncodeMathMMIO::encodeGreaterThanPredicate(CommandContainer &contai */ template void EncodeMathMMIO::encodeAlu(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters srcA, AluRegisters srcB, AluRegisters op, AluRegisters finalResultRegister, AluRegisters postOperationStateRegister) { - pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); - pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); - pAluParam->DW0.BitField.Operand2 = static_cast(srcA); + MI_MATH_ALU_INST_INLINE aluParam; + + aluParam.DW0.Value = 0x0; + aluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); + aluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); + aluParam.DW0.BitField.Operand2 = static_cast(srcA); + *pAluParam = aluParam; pAluParam++; - pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); - pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); - pAluParam->DW0.BitField.Operand2 = static_cast(srcB); + aluParam.DW0.Value = 0x0; + aluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); + aluParam.DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); + aluParam.DW0.BitField.Operand2 = static_cast(srcB); + *pAluParam = aluParam; pAluParam++; /* Order of operation: Operand1 Operand2 */ - pAluParam->DW0.BitField.ALUOpcode = static_cast(op); - pAluParam->DW0.BitField.Operand1 = 0; - pAluParam->DW0.BitField.Operand2 = 0; + aluParam.DW0.Value = 0x0; + aluParam.DW0.BitField.ALUOpcode = static_cast(op); + aluParam.DW0.BitField.Operand1 = 0; + aluParam.DW0.BitField.Operand2 = 0; + *pAluParam = aluParam; pAluParam++; - pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); - pAluParam->DW0.BitField.Operand1 = static_cast(finalResultRegister); - pAluParam->DW0.BitField.Operand2 = static_cast(postOperationStateRegister); + aluParam.DW0.Value = 0x0; + aluParam.DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); + aluParam.DW0.BitField.Operand1 = static_cast(finalResultRegister); + aluParam.DW0.BitField.Operand2 = static_cast(postOperationStateRegister); + *pAluParam = aluParam; pAluParam++; } -template -void EncodeMathMMIO::encodeAluSubStoreCarry(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters regA, AluRegisters regB, AluRegisters finalResultRegister) { - /* regB is subtracted from regA */ - encodeAlu(pAluParam, regA, regB, AluRegisters::OPCODE_SUB, finalResultRegister, AluRegisters::R_CF); -} - template uint32_t *EncodeMath::commandReserve(CommandContainer &container) { size_t size = sizeof(MI_MATH) + sizeof(MI_MATH_ALU_INST_INLINE) * NUM_ALU_INST_FOR_READ_MODIFY_WRITE; auto cmd = reinterpret_cast(container.getCommandStream()->getSpace(size)); - reinterpret_cast(cmd)->DW0.Value = 0x0; - reinterpret_cast(cmd)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; - reinterpret_cast(cmd)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; - reinterpret_cast(cmd)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1; + MI_MATH mathBuffer; + mathBuffer.DW0.Value = 0x0; + mathBuffer.DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; + mathBuffer.DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; + mathBuffer.DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1; + *reinterpret_cast(cmd) = mathBuffer; cmd++; return cmd; } +template +void EncodeMathMMIO::encodeAluAdd(MI_MATH_ALU_INST_INLINE *pAluParam, + AluRegisters firstOperandRegister, + AluRegisters secondOperandRegister, + AluRegisters finalResultRegister) { + encodeAlu(pAluParam, firstOperandRegister, secondOperandRegister, AluRegisters::OPCODE_ADD, finalResultRegister, AluRegisters::R_ACCU); +} + +template +void EncodeMathMMIO::encodeAluSubStoreCarry(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters regA, AluRegisters regB, AluRegisters finalResultRegister) { + /* regB is subtracted from regA */ + encodeAlu(pAluParam, regA, regB, AluRegisters::OPCODE_SUB, finalResultRegister, AluRegisters::R_CF); +} + /* * greaterThan() tests if firstOperandRegister is greater than * secondOperandRegister. @@ -166,11 +186,9 @@ void EncodeMath::greaterThan(CommandContainer &container, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) { uint32_t *cmd = EncodeMath::commandReserve(container); - MI_MATH_ALU_INST_INLINE *pAluParam = - reinterpret_cast(cmd); /* firstOperandRegister will be subtracted from secondOperandRegister */ - EncodeMathMMIO::encodeAluSubStoreCarry(pAluParam, + EncodeMathMMIO::encodeAluSubStoreCarry(reinterpret_cast(cmd), secondOperandRegister, firstOperandRegister, finalResultRegister); @@ -182,22 +200,13 @@ void EncodeMath::addition(CommandContainer &container, AluRegisters secondOperandRegister, AluRegisters finalResultRegister) { uint32_t *cmd = EncodeMath::commandReserve(container); - EncodeMath::MI_MATH_ALU_INST_INLINE *pAluParam = - reinterpret_cast(cmd); - EncodeMathMMIO::encodeAluAdd(pAluParam, firstOperandRegister, + EncodeMathMMIO::encodeAluAdd(reinterpret_cast(cmd), + firstOperandRegister, secondOperandRegister, finalResultRegister); } -template -void EncodeMathMMIO::encodeAluAdd(MI_MATH_ALU_INST_INLINE *pAluParam, - AluRegisters firstOperandRegister, - AluRegisters secondOperandRegister, - AluRegisters finalResultRegister) { - encodeAlu(pAluParam, firstOperandRegister, secondOperandRegister, AluRegisters::OPCODE_ADD, finalResultRegister, AluRegisters::R_ACCU); -} - template void EncodeIndirectParams::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) { for (int i = 0; i < 3; ++i) { diff --git a/shared/source/command_container/command_encoder_base.inl b/shared/source/command_container/command_encoder_base.inl index 3317345456..6a240ef0e4 100644 --- a/shared/source/command_container/command_encoder_base.inl +++ b/shared/source/command_container/command_encoder_base.inl @@ -214,8 +214,8 @@ void EncodeMediaInterfaceDescriptorLoad::encode(CommandContainer &contai using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD; auto heap = container.getIndirectHeap(HeapType::DYNAMIC_STATE); - auto mediaStateFlush = container.getCommandStream()->getSpace(sizeof(MEDIA_STATE_FLUSH)); - *reinterpret_cast(mediaStateFlush) = Family::cmdInitMediaStateFlush; + auto mediaStateFlush = container.getCommandStream()->getSpaceForCmd(); + *mediaStateFlush = Family::cmdInitMediaStateFlush; MEDIA_INTERFACE_DESCRIPTOR_LOAD cmd = Family::cmdInitMediaInterfaceDescriptorLoad; cmd.setInterfaceDescriptorDataStartAddress(static_cast(ptrDiff(container.getIddBlock(), heap->getCpuBase()))); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index bddd38a7d2..a32f21e7e4 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -78,7 +78,9 @@ inline void CommandStreamReceiverHw::programEndingCmd(LinearStream &c if (directSubmissionEnabled) { *patchLocation = commandStream.getSpace(sizeof(MI_BATCH_BUFFER_START)); auto bbStart = reinterpret_cast(*patchLocation); - addBatchBufferStart(bbStart, 0ull, false); + MI_BATCH_BUFFER_START cmd = {}; + addBatchBufferStart(&cmd, 0ull, false); + *bbStart = cmd; } else { this->addBatchBufferEnd(commandStream, patchLocation); } @@ -86,15 +88,17 @@ inline void CommandStreamReceiverHw::programEndingCmd(LinearStream &c template inline void CommandStreamReceiverHw::addBatchBufferStart(MI_BATCH_BUFFER_START *commandBufferMemory, uint64_t startAddress, bool secondary) { - *commandBufferMemory = GfxFamily::cmdInitBatchBufferStart; - commandBufferMemory->setBatchBufferStartAddressGraphicsaddress472(startAddress); - commandBufferMemory->setAddressSpaceIndicator(MI_BATCH_BUFFER_START::ADDRESS_SPACE_INDICATOR_PPGTT); + MI_BATCH_BUFFER_START cmd = GfxFamily::cmdInitBatchBufferStart; + + cmd.setBatchBufferStartAddressGraphicsaddress472(startAddress); + cmd.setAddressSpaceIndicator(MI_BATCH_BUFFER_START::ADDRESS_SPACE_INDICATOR_PPGTT); if (secondary) { - commandBufferMemory->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH); + cmd.setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH); } if (DebugManager.flags.FlattenBatchBufferForAUBDump.get()) { flatBatchBufferHelper->registerBatchBufferStartAddress(reinterpret_cast(commandBufferMemory), startAddress); } + *commandBufferMemory = cmd; } template diff --git a/shared/source/command_stream/command_stream_receiver_hw_bdw_plus.inl b/shared/source/command_stream/command_stream_receiver_hw_bdw_plus.inl index a39b626601..07aaa8d09c 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_bdw_plus.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_bdw_plus.inl @@ -20,10 +20,11 @@ inline void CommandStreamReceiverHw::programL3(LinearStream &csr, Dis if (csrSizeRequestFlags.l3ConfigChanged && this->isPreambleSent) { // Add a PIPE_CONTROL w/ CS_stall auto pCmd = (PIPE_CONTROL *)csr.getSpace(sizeof(PIPE_CONTROL)); - *pCmd = GfxFamily::cmdInitPipeControl; - pCmd->setCommandStreamerStallEnable(true); - pCmd->setDcFlushEnable(true); - addClearSLMWorkAround(pCmd); + PIPE_CONTROL cmd = GfxFamily::cmdInitPipeControl; + cmd.setCommandStreamerStallEnable(true); + cmd.setDcFlushEnable(true); + addClearSLMWorkAround(&cmd); + *pCmd = cmd; PreambleHelper::programL3(&csr, newL3Config); this->lastSentL3Config = newL3Config; diff --git a/shared/source/gen8/preemption_gen8.cpp b/shared/source/gen8/preemption_gen8.cpp index 049462cfe5..0bdd165673 100644 --- a/shared/source/gen8/preemption_gen8.cpp +++ b/shared/source/gen8/preemption_gen8.cpp @@ -81,9 +81,10 @@ void PreemptionHelper::applyPreemptionWaCmdsBegin(LinearStream *pComm preemptionMode == PreemptionMode::MidThread) { if (device.getHardwareInfo().workaroundTable.waModifyVFEStateAfterGPGPUPreemption) { auto pCmd = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM))); - *pCmd = GfxFamily::cmdInitLoadRegisterImm; - pCmd->setRegisterOffset(CS_GPR_R0); - pCmd->setDataDword(GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER); + MI_LOAD_REGISTER_IMM cmd = GfxFamily::cmdInitLoadRegisterImm; + cmd.setRegisterOffset(CS_GPR_R0); + cmd.setDataDword(GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER); + *pCmd = cmd; } } } @@ -96,9 +97,10 @@ void PreemptionHelper::applyPreemptionWaCmdsEnd(LinearStream *pComman preemptionMode == PreemptionMode::MidThread) { if (device.getHardwareInfo().workaroundTable.waModifyVFEStateAfterGPGPUPreemption) { auto pCmd = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM))); - *pCmd = GfxFamily::cmdInitLoadRegisterImm; - pCmd->setRegisterOffset(CS_GPR_R0); - pCmd->setDataDword(GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER); + MI_LOAD_REGISTER_IMM cmd = GfxFamily::cmdInitLoadRegisterImm; + cmd.setRegisterOffset(CS_GPR_R0); + cmd.setDataDword(GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER); + *pCmd = cmd; } } } diff --git a/shared/source/gen9/preemption_gen9.cpp b/shared/source/gen9/preemption_gen9.cpp index f8ace1a206..e52bc75543 100644 --- a/shared/source/gen9/preemption_gen9.cpp +++ b/shared/source/gen9/preemption_gen9.cpp @@ -38,9 +38,10 @@ void PreemptionHelper::applyPreemptionWaCmdsBegin(LinearStream *pComm preemptionMode == PreemptionMode::MidThread) { if (device.getHardwareInfo().workaroundTable.waModifyVFEStateAfterGPGPUPreemption) { auto pCmd = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM))); - *pCmd = GfxFamily::cmdInitLoadRegisterImm; - pCmd->setRegisterOffset(CS_GPR_R0); - pCmd->setDataDword(GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER); + MI_LOAD_REGISTER_IMM cmd = GfxFamily::cmdInitLoadRegisterImm; + cmd.setRegisterOffset(CS_GPR_R0); + cmd.setDataDword(GPGPU_WALKER_COOKIE_VALUE_BEFORE_WALKER); + *pCmd = cmd; } } } @@ -53,9 +54,10 @@ void PreemptionHelper::applyPreemptionWaCmdsEnd(LinearStream *pComman preemptionMode == PreemptionMode::MidThread) { if (device.getHardwareInfo().workaroundTable.waModifyVFEStateAfterGPGPUPreemption) { auto pCmd = reinterpret_cast(pCommandStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM))); - *pCmd = GfxFamily::cmdInitLoadRegisterImm; - pCmd->setRegisterOffset(CS_GPR_R0); - pCmd->setDataDword(GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER); + MI_LOAD_REGISTER_IMM cmd = GfxFamily::cmdInitLoadRegisterImm; + cmd.setRegisterOffset(CS_GPR_R0); + cmd.setDataDword(GPGPU_WALKER_COOKIE_VALUE_AFTER_WALKER); + *pCmd = cmd; } } } diff --git a/shared/source/helpers/blit_commands_helper_base.inl b/shared/source/helpers/blit_commands_helper_base.inl index 21acd660dc..04032d3e17 100644 --- a/shared/source/helpers/blit_commands_helper_base.inl +++ b/shared/source/helpers/blit_commands_helper_base.inl @@ -117,9 +117,8 @@ void BlitCommandsHelper::dispatchBlitCommandsForBuffer(const BlitProp } { - auto miArbCheckCmd = GfxFamily::cmdInitArbCheck; auto miArbCheckStream = linearStream.getSpaceForCmd(); - *miArbCheckStream = miArbCheckCmd; + *miArbCheckStream = GfxFamily::cmdInitArbCheck; } auto blitSize = width * height; diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl index 0fa8322c9c..001b2a0c14 100644 --- a/shared/source/helpers/hw_helper_base.inl +++ b/shared/source/helpers/hw_helper_base.inl @@ -110,17 +110,17 @@ void HwHelperHw::setRenderSurfaceStateForBuffer(const RootDeviceEnvironm auto gmmHelper = rootDeviceEnvironment.getGmmHelper(); auto surfaceState = reinterpret_cast(surfaceStateBuffer); - *surfaceState = Family::cmdInitRenderSurfaceState; + RENDER_SURFACE_STATE state = Family::cmdInitRenderSurfaceState; auto surfaceSize = alignUp(bufferSize, 4); SURFACE_STATE_BUFFER_LENGTH Length = {0}; Length.Length = static_cast(surfaceSize - 1); - surfaceState->setWidth(Length.SurfaceState.Width + 1); - surfaceState->setHeight(Length.SurfaceState.Height + 1); - surfaceState->setDepth(Length.SurfaceState.Depth + 1); + state.setWidth(Length.SurfaceState.Width + 1); + state.setHeight(Length.SurfaceState.Height + 1); + state.setDepth(Length.SurfaceState.Depth + 1); if (pitch) { - surfaceState->setSurfacePitch(pitch); + state.setSurfacePitch(pitch); } // The graphics allocation for Host Ptr surface will be created in makeResident call and GPU address is expected to be the same as CPU address @@ -129,34 +129,35 @@ void HwHelperHw::setRenderSurfaceStateForBuffer(const RootDeviceEnvironm auto bufferStateSize = (gfxAlloc != nullptr) ? gfxAlloc->getUnderlyingBufferSize() : bufferSize; - surfaceState->setSurfaceType(static_cast(surfaceType)); + state.setSurfaceType(static_cast(surfaceType)); - surfaceState->setSurfaceFormat(SURFACE_FORMAT::SURFACE_FORMAT_RAW); - surfaceState->setSurfaceVerticalAlignment(RENDER_SURFACE_STATE::SURFACE_VERTICAL_ALIGNMENT_VALIGN_4); - surfaceState->setSurfaceHorizontalAlignment(RENDER_SURFACE_STATE::SURFACE_HORIZONTAL_ALIGNMENT_HALIGN_4); + state.setSurfaceFormat(SURFACE_FORMAT::SURFACE_FORMAT_RAW); + state.setSurfaceVerticalAlignment(RENDER_SURFACE_STATE::SURFACE_VERTICAL_ALIGNMENT_VALIGN_4); + state.setSurfaceHorizontalAlignment(RENDER_SURFACE_STATE::SURFACE_HORIZONTAL_ALIGNMENT_HALIGN_4); - surfaceState->setTileMode(RENDER_SURFACE_STATE::TILE_MODE_LINEAR); - surfaceState->setVerticalLineStride(0); - surfaceState->setVerticalLineStrideOffset(0); + state.setTileMode(RENDER_SURFACE_STATE::TILE_MODE_LINEAR); + state.setVerticalLineStride(0); + state.setVerticalLineStrideOffset(0); if ((isAligned(bufferStateAddress) && isAligned(bufferStateSize)) || isReadOnly) { - surfaceState->setMemoryObjectControlState(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER)); + state.setMemoryObjectControlState(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER)); } else { - surfaceState->setMemoryObjectControlState(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED)); + state.setMemoryObjectControlState(gmmHelper->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER_CACHELINE_MISALIGNED)); } - surfaceState->setSurfaceBaseAddress(bufferStateAddress); + state.setSurfaceBaseAddress(bufferStateAddress); Gmm *gmm = gfxAlloc ? gfxAlloc->getDefaultGmm() : nullptr; if (gmm && gmm->isRenderCompressed && !forceNonAuxMode && GraphicsAllocation::AllocationType::BUFFER_COMPRESSED == gfxAlloc->getAllocationType()) { // Its expected to not program pitch/qpitch/baseAddress for Aux surface in CCS scenarios - surfaceState->setCoherencyType(RENDER_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT); - surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E); + state.setCoherencyType(RENDER_SURFACE_STATE::COHERENCY_TYPE_GPU_COHERENT); + state.setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_CCS_E); } else { - surfaceState->setCoherencyType(RENDER_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT); - surfaceState->setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE); + state.setCoherencyType(RENDER_SURFACE_STATE::COHERENCY_TYPE_IA_COHERENT); + state.setAuxiliarySurfaceMode(AUXILIARY_SURFACE_MODE::AUXILIARY_SURFACE_MODE_AUX_NONE); } + *surfaceState = state; } template