Improve obtaining LinearStream during enqueue call

- Move logic to enqueueHandler to cover all scenarios
- Create BlockedCommandsData not only for Kernel enqueue
- KernelOperation cleanup

Change-Id: Ie4a673cbbc986c685996a38ab296444d38e7bbd5
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2019-07-18 21:15:50 +02:00
committed by sys_ocldev
parent 1460713d69
commit 95c2dcd8b0
18 changed files with 264 additions and 220 deletions

View File

@@ -337,7 +337,7 @@ class CommandQueueHw : public CommandQueue {
bool &blocking,
const MultiDispatchInfo &multiDispatchInfo,
TimestampPacketContainer *previousTimestampPacketNodes,
KernelOperation *blockedCommandsData,
std::unique_ptr<KernelOperation> &blockedCommandsData,
EventsRequest &eventsRequest,
bool slmUsed,
EventBuilder &externalEventBuilder,
@@ -385,6 +385,29 @@ class CommandQueueHw : public CommandQueue {
MOCKABLE_VIRTUAL void dispatchAuxTranslation(MultiDispatchInfo &multiDispatchInfo, MemObjsForAuxTranslation &memObjsForAuxTranslation,
AuxTranslationDirection auxTranslationDirection);
/// Selects the LinearStream that the current enqueue call should record into.
///
/// If the queue is blocked and there is at least one dispatch to program, a
/// dedicated LinearStream is created, given a command buffer through the GPGPU
/// command stream receiver, and wrapped in a new KernelOperation that is stored
/// in @p blockedCommandsData. In every other case the stream is taken from
/// getCommandStream() and @p blockedCommandsData is left untouched.
///
/// @param csrDependencies       forwarded to getCommandStream()
/// @param profilingRequired     forwarded to getCommandStream()
/// @param perfCountersRequired  forwarded to getCommandStream()
/// @param blitEnqueue           forwarded to getCommandStream()
/// @param blockedQueue          true when the enqueue is blocked
/// @param multiDispatchInfo     dispatches to program; an empty set never gets a dedicated stream
/// @param blockedCommandsData   [out] receives the KernelOperation built around the dedicated stream
/// @param surfaces              forwarded to getCommandStream()
/// @param numSurfaces           forwarded to getCommandStream()
/// @return pointer to the stream to use for this enqueue
template <uint32_t commandType>
LinearStream *obtainCommandStream(const CsrDependencies &csrDependencies, bool profilingRequired,
                                  bool perfCountersRequired, bool blitEnqueue, bool blockedQueue,
                                  const MultiDispatchInfo &multiDispatchInfo,
                                  std::unique_ptr<KernelOperation> &blockedCommandsData,
                                  Surface **surfaces, size_t numSurfaces) {
    const bool needsDedicatedStream = blockedQueue && !multiDispatchInfo.empty();

    if (!needsDedicatedStream) {
        // Regular path: use the stream provided by getCommandStream().
        return &getCommandStream<GfxFamily, commandType>(*this, csrDependencies, profilingRequired,
                                                         perfCountersRequired, blitEnqueue, multiDispatchInfo,
                                                         surfaces, numSurfaces);
    }

    // Blocked path: request a 64KB page minus the overfetch area as usable space,
    // with the overfetch area passed separately as the additional size.
    constexpr size_t overfetchReserve = CSRequirements::csOverfetchSize;
    constexpr size_t usableStreamSize = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize;

    auto *dedicatedStream = new LinearStream();
    auto &csr = getGpgpuCommandStreamReceiver();
    csr.ensureCommandBufferAllocation(*dedicatedStream, usableStreamSize, overfetchReserve);

    // NOTE(review): KernelOperation presumably takes ownership of the raw stream
    // pointer - confirm against its constructor.
    blockedCommandsData = std::make_unique<KernelOperation>(dedicatedStream, *csr.getInternalAllocationStorage());
    return dedicatedStream;
}
private:
bool isTaskLevelUpdateRequired(const uint32_t &taskLevel, const cl_event *eventWaitList, const cl_uint &numEventsInWaitList, unsigned int commandType);
void obtainTaskLevelAndBlockedStatus(unsigned int &taskLevel, cl_uint &numEventsInWaitList, const cl_event *&eventWaitList, bool &blockQueueStatus, unsigned int commandType) override;
@@ -414,7 +437,7 @@ class CommandQueueHw : public CommandQueue {
bool blockQueue,
DeviceQueueHw<GfxFamily> *devQueueHw,
CsrDependencies &csrDeps,
KernelOperation *&blockedCommandsData,
KernelOperation *blockedCommandsData,
TimestampPacketContainer &previousTimestampPacketNodes,
PreemptionMode preemption);
};

View File

@@ -165,7 +165,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
bool profilingRequired = (this->isProfilingEnabled() && event != nullptr);
bool perfCountersRequired = (this->isPerfCountersEnabled() && event != nullptr);
KernelOperation *blockedCommandsData = nullptr;
std::unique_ptr<KernelOperation> blockedCommandsData;
std::unique_ptr<PrintfHandler> printfHandler;
bool slmUsed = multiDispatchInfo.usesSlm() || parentKernel;
auto preemption = PreemptionHelper::taskPreemptionMode(*device, multiDispatchInfo);
@@ -227,8 +227,8 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
}
}
auto &commandStream = getCommandStream<GfxFamily, commandType>(*this, csrDeps, profilingRequired, perfCountersRequired,
blitEnqueue, multiDispatchInfo, surfacesForResidency, numSurfaceForResidency);
auto &commandStream = *obtainCommandStream<commandType>(csrDeps, profilingRequired, perfCountersRequired, blitEnqueue, blockQueue,
multiDispatchInfo, blockedCommandsData, surfacesForResidency, numSurfaceForResidency);
auto commandStreamStart = commandStream.getUsed();
if (eventBuilder.getEvent() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
@@ -241,7 +241,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
processDispatchForBlitEnqueue(multiDispatchInfo, previousTimestampPacketNodes, eventsRequest, commandStream, commandType);
} else if (multiDispatchInfo.empty() == false) {
processDispatchForKernels<commandType>(multiDispatchInfo, printfHandler, eventBuilder.getEvent(),
hwTimeStamps, parentKernel, blockQueue, devQueueHw, csrDeps, blockedCommandsData,
hwTimeStamps, parentKernel, blockQueue, devQueueHw, csrDeps, blockedCommandsData.get(),
previousTimestampPacketNodes, preemption);
} else if (isCacheFlushCommand(commandType)) {
processDispatchForCacheFlush(surfacesForResidency, numSurfaceForResidency, &commandStream, csrDeps);
@@ -396,7 +396,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
bool blockQueue,
DeviceQueueHw<GfxFamily> *devQueueHw,
CsrDependencies &csrDeps,
KernelOperation *&blockedCommandsData,
KernelOperation *blockedCommandsData,
TimestampPacketContainer &previousTimestampPacketNodes,
PreemptionMode preemption) {
TagNode<HwPerfCounter> *hwPerfCounter = nullptr;
@@ -437,13 +437,12 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
*this,
multiDispatchInfo,
csrDeps,
&blockedCommandsData,
blockedCommandsData,
hwTimeStamps,
hwPerfCounter,
&previousTimestampPacketNodes,
timestampPacketContainer.get(),
preemption,
blockQueue,
commandType);
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
@@ -738,7 +737,7 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
bool &blocking,
const MultiDispatchInfo &multiDispatchInfo,
TimestampPacketContainer *previousTimestampPacketNodes,
KernelOperation *blockedCommandsData,
std::unique_ptr<KernelOperation> &blockedCommandsData,
EventsRequest &eventsRequest,
bool slmUsed,
EventBuilder &externalEventBuilder,
@@ -795,10 +794,9 @@ void CommandQueueHw<GfxFamily>::enqueueBlocked(
allSurfaces.push_back(surface->duplicate());
}
PreemptionMode preemptionMode = PreemptionHelper::taskPreemptionMode(*device, multiDispatchInfo);
auto kernelOperation = std::unique_ptr<KernelOperation>(blockedCommandsData); // marking ownership
auto cmd = std::make_unique<CommandComputeKernel>(
*this,
std::move(kernelOperation),
std::move(blockedCommandsData),
allSurfaces,
shouldFlushDC(commandType, printfHandler.get()),
slmUsed,

View File

@@ -39,14 +39,13 @@ class HardwareInterface {
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
const CsrDependencies &csrDependencies,
KernelOperation **blockedCommandsData,
KernelOperation *blockedCommandsData,
TagNode<HwTimeStamps> *hwTimeStamps,
TagNode<HwPerfCounter> *hwPerfCounter,
TimestampPacketContainer *previousTimestampPacketNodes,
TimestampPacketContainer *currentTimestampPacketNodes,
PreemptionMode preemptionMode,
bool blockQueue,
uint32_t commandType = 0);
uint32_t commandType);
static void getDefaultDshSpace(
const size_t &offsetInterfaceDescriptorTable,

View File

@@ -26,13 +26,12 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
const CsrDependencies &csrDependencies,
KernelOperation **blockedCommandsData,
KernelOperation *blockedCommandsData,
TagNode<HwTimeStamps> *hwTimeStamps,
TagNode<HwPerfCounter> *hwPerfCounter,
TimestampPacketContainer *previousTimestampPacketNodes,
TimestampPacketContainer *currentTimestampPacketNodes,
PreemptionMode preemptionMode,
bool blockQueue,
uint32_t commandType) {
LinearStream *commandStream = nullptr;
@@ -49,19 +48,11 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
}
// Allocate command stream and indirect heaps
obtainIndirectHeaps(commandQueue, multiDispatchInfo, blockQueue, dsh, ioh, ssh);
if (blockQueue) {
constexpr static auto additionalAllocationSize = CSRequirements::csOverfetchSize;
constexpr static auto allocationSize = MemoryConstants::pageSize64k - additionalAllocationSize;
commandStream = new LinearStream();
commandQueue.getGpgpuCommandStreamReceiver().ensureCommandBufferAllocation(*commandStream, allocationSize, additionalAllocationSize);
using UniqueIH = std::unique_ptr<IndirectHeap>;
*blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh),
UniqueIH(ssh), *commandQueue.getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
if (parentKernel) {
(*blockedCommandsData)->doNotFreeISH = true;
}
bool blockedQueue = (blockedCommandsData != nullptr);
obtainIndirectHeaps(commandQueue, multiDispatchInfo, blockedQueue, dsh, ioh, ssh);
if (blockedQueue) {
blockedCommandsData->setHeaps(dsh, ioh, ssh);
commandStream = blockedCommandsData->commandStream.get();
} else {
commandStream = &commandQueue.getCS(0);
}