mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 14:55:24 +08:00
RelaxedOrdering: Optimize GPU Queue stall by adding early return
Related-To: NEO-7458 Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
a104636b31
commit
a969636b6a
@@ -37,6 +37,8 @@ class LinearStream {
|
||||
uint64_t getGpuBase() const;
|
||||
void setGpuBase(uint64_t gpuAddress);
|
||||
|
||||
uint64_t getCurrentGpuAddressPosition() const;
|
||||
|
||||
void overrideMaxSize(size_t newMaxSize);
|
||||
void replaceBuffer(void *buffer, size_t bufferSize);
|
||||
GraphicsAllocation *getGraphicsAllocation() const;
|
||||
@@ -112,4 +114,9 @@ inline GraphicsAllocation *LinearStream::getGraphicsAllocation() const {
|
||||
inline void LinearStream::replaceGraphicsAllocation(GraphicsAllocation *gfxAllocation) {
|
||||
graphicsAllocation = gfxAllocation;
|
||||
}
|
||||
|
||||
inline uint64_t LinearStream::getCurrentGpuAddressPosition() const {
|
||||
return (getGpuBase() + getUsed());
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -148,8 +148,6 @@ class DirectSubmissionHw {
|
||||
|
||||
size_t getSizeEnd();
|
||||
|
||||
uint64_t getCommandBufferPositionGpuAddress(void *position);
|
||||
|
||||
void dispatchPartitionRegisterConfiguration();
|
||||
size_t getSizePartitionRegisterConfigurationSection();
|
||||
|
||||
|
||||
@@ -82,7 +82,7 @@ DirectSubmissionHw<GfxFamily, Dispatcher>::DirectSubmissionHw(const DirectSubmis
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingSchedulerSection(uint32_t value) {
|
||||
uint64_t schedulerStartAddress = getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
|
||||
uint64_t schedulerStartAddress = ringCommandStream.getCurrentGpuAddressPosition();
|
||||
uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress();
|
||||
|
||||
// 1. Init section
|
||||
@@ -409,7 +409,7 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
|
||||
if (ringCommandStream.getAvailableSpace() < requiredSize) {
|
||||
switchRingBuffers();
|
||||
}
|
||||
uint64_t gpuStartVa = getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
|
||||
uint64_t gpuStartVa = ringCommandStream.getCurrentGpuAddressPosition();
|
||||
|
||||
if (!this->partitionConfigSet) {
|
||||
dispatchPartitionRegisterConfiguration();
|
||||
@@ -558,14 +558,6 @@ inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeEnd() {
|
||||
return size;
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
inline uint64_t DirectSubmissionHw<GfxFamily, Dispatcher>::getCommandBufferPositionGpuAddress(void *position) {
|
||||
void *currentBase = ringCommandStream.getCpuBase();
|
||||
|
||||
size_t offset = ptrDiff(position, currentBase);
|
||||
return ringCommandStream.getGraphicsAllocation()->getGpuAddress() + static_cast<uint64_t>(offset);
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatch() {
|
||||
size_t size = getSizeSemaphoreSection(false);
|
||||
@@ -624,8 +616,8 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
|
||||
}
|
||||
|
||||
dispatchStartSection(commandStreamAddress);
|
||||
void *returnPosition = ringCommandStream.getSpace(0);
|
||||
uint64_t returnGpuPointer = getCommandBufferPositionGpuAddress(returnPosition);
|
||||
|
||||
uint64_t returnGpuPointer = ringCommandStream.getCurrentGpuAddressPosition();
|
||||
|
||||
if (this->relaxedOrderingEnabled) {
|
||||
dispatchRelaxedOrderingReturnPtrRegs(relaxedOrderingReturnPtrCmdStream, returnGpuPointer);
|
||||
@@ -659,13 +651,21 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchRelaxedOrderingQueueStall() {
|
||||
LinearStream bbStartStream(ringCommandStream.getSpace(EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart()),
|
||||
EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart());
|
||||
|
||||
LriHelper<GfxFamily>::program(&ringCommandStream, CS_GPR_R5, 1, true);
|
||||
dispatchSemaphoreSection(currentQueueWorkCount, false);
|
||||
|
||||
// patch conditional bb_start with current GPU address
|
||||
EncodeBatchBufferStartOrEnd<GfxFamily>::programConditionalDataRegBatchBufferStart(bbStartStream, ringCommandStream.getCurrentGpuAddressPosition(),
|
||||
CS_GPR_R1, 0, CompareOperation::Equal, false);
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatchRelaxedOrderingQueueStall() {
|
||||
return getSizeSemaphoreSection(false) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM);
|
||||
return getSizeSemaphoreSection(false) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM) +
|
||||
EncodeBatchBufferStartOrEnd<GfxFamily>::getCmdSizeConditionalDataRegBatchBufferStart();
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
@@ -762,8 +762,6 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
|
||||
requiredMinimalSize += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>() + RelaxedOrderingHelper::getSizeReturnPtrRegs<GfxFamily>();
|
||||
}
|
||||
|
||||
getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));
|
||||
|
||||
if (ringCommandStream.getAvailableSpace() < requiredMinimalSize) {
|
||||
switchRingBuffers();
|
||||
}
|
||||
@@ -828,7 +826,7 @@ template <typename GfxFamily, typename Dispatcher>
|
||||
inline uint64_t DirectSubmissionHw<GfxFamily, Dispatcher>::switchRingBuffers() {
|
||||
GraphicsAllocation *nextRingBuffer = switchRingBuffersAllocations();
|
||||
void *flushPtr = ringCommandStream.getSpace(0);
|
||||
uint64_t currentBufferGpuVa = getCommandBufferPositionGpuAddress(flushPtr);
|
||||
uint64_t currentBufferGpuVa = ringCommandStream.getCurrentGpuAddressPosition();
|
||||
|
||||
if (ringStart) {
|
||||
dispatchSwitchRingBufferSection(nextRingBuffer->getGpuAddress());
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Intel Corporation
|
||||
* Copyright (C) 2021-2022 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -11,8 +11,7 @@ namespace NEO {
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
inline void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchPrefetchMitigation() {
|
||||
auto addressToJump = ptrOffset(ringCommandStream.getSpace(0u), getSizeStartSection());
|
||||
dispatchStartSection(getCommandBufferPositionGpuAddress(addressToJump));
|
||||
dispatchStartSection(ringCommandStream.getCurrentGpuAddressPosition() + getSizeStartSection());
|
||||
}
|
||||
|
||||
template <typename GfxFamily, typename Dispatcher>
|
||||
|
||||
Reference in New Issue
Block a user