compute-runtime/shared/source/command_stream/command_stream_receiver.cpp
Maciej Plewka 0a16dc6c47 Fix multi thread usage of external host alloc
This is a fixed re-upload of the original commit after an automatic revert.
With this commit, OpenCL tracks whether an external host memory allocation is
used from multiple threads and ensures the task count is updated in all
threads before the allocation is destroyed.

Resolves: NEO-6807

Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
2022-05-05 13:32:15 +02:00

/*
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/built_ins/built_ins.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/experimental_command_buffer.h"
#include "shared/source/command_stream/preemption.h"
#include "shared/source/command_stream/scratch_space_controller.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/direct_submission/direct_submission_controller.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/gmm_helper/page_table_mngr.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/array_count.h"
#include "shared/source/helpers/cache_policy.h"
#include "shared/source/helpers/flush_stamp.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/pause_on_gpu_properties.h"
#include "shared/source/helpers/ray_tracing_helper.h"
#include "shared/source/helpers/string.h"
#include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/surface.h"
#include "shared/source/os_interface/hw_info_config.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/os_interface.h"
#include "shared/source/os_interface/sys_calls_common.h"
#include "shared/source/utilities/cpuintrinsics.h"
#include "shared/source/utilities/tag_allocator.h"
#include "shared/source/utilities/wait_util.h"
namespace NEO {
// Global table of CommandStreamReceiver factories for HW and tests
CommandStreamReceiverCreateFunc commandStreamReceiverFactory[2 * IGFX_MAX_CORE] = {};
CommandStreamReceiver::CommandStreamReceiver(ExecutionEnvironment &executionEnvironment,
uint32_t rootDeviceIndex,
const DeviceBitfield deviceBitfield)
: executionEnvironment(executionEnvironment), rootDeviceIndex(rootDeviceIndex), deviceBitfield(deviceBitfield) {
residencyAllocations.reserve(startingResidencyContainerSize);
latestSentStatelessMocsConfig = CacheSettings::unknownMocs;
submissionAggregator.reset(new SubmissionAggregator());
if (ApiSpecificConfig::getApiType() == ApiSpecificConfig::L0) {
this->dispatchMode = DispatchMode::ImmediateDispatch;
}
if (DebugManager.flags.CsrDispatchMode.get()) {
this->dispatchMode = (DispatchMode)DebugManager.flags.CsrDispatchMode.get();
}
flushStamp.reset(new FlushStampTracker(true));
for (int i = 0; i < IndirectHeap::Type::NUM_TYPES; ++i) {
indirectHeap[i] = nullptr;
}
internalAllocationStorage = std::make_unique<InternalAllocationStorage>(*this);
const auto &hwInfo = peekHwInfo();
uint32_t subDeviceCount = static_cast<uint32_t>(deviceBitfield.count());
bool platformImplicitScaling = HwHelper::get(hwInfo.platform.eRenderCoreFamily).platformSupportsImplicitScaling(hwInfo);
if (NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling) &&
subDeviceCount > 1 &&
DebugManager.flags.EnableStaticPartitioning.get() != 0) {
this->activePartitions = subDeviceCount;
this->staticWorkPartitioningEnabled = true;
}
}
CommandStreamReceiver::~CommandStreamReceiver() {
if (userPauseConfirmation) {
{
std::unique_lock<SpinLock> lock{debugPauseStateLock};
*debugPauseStateAddress = DebugPauseState::terminate;
}
userPauseConfirmation->join();
}
for (int i = 0; i < IndirectHeap::Type::NUM_TYPES; ++i) {
if (indirectHeap[i] != nullptr) {
auto allocation = indirectHeap[i]->getGraphicsAllocation();
if (allocation != nullptr) {
internalAllocationStorage->storeAllocation(std::unique_ptr<GraphicsAllocation>(allocation), REUSABLE_ALLOCATION);
}
delete indirectHeap[i];
}
}
cleanupResources();
internalAllocationStorage->cleanAllocationList(-1, REUSABLE_ALLOCATION);
internalAllocationStorage->cleanAllocationList(-1, TEMPORARY_ALLOCATION);
getMemoryManager()->unregisterEngineForCsr(this);
}
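// Flushes the given batch buffer and advances taskCount; latestFlushedTaskCount is only
// updated here when tag updates from waits are disabled.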
SubmissionStatus CommandStreamReceiver::submitBatchBuffer(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency) {
this->latestSentTaskCount = taskCount + 1;
SubmissionStatus retVal = this->flush(batchBuffer, allocationsForResidency);
if (!isUpdateTagFromWaitEnabled()) {
this->latestFlushedTaskCount = taskCount + 1;
}
taskCount++;
return retVal;
}
void CommandStreamReceiver::makeResident(MultiGraphicsAllocation &gfxAllocation) {
makeResident(*gfxAllocation.getGraphicsAllocation(rootDeviceIndex));
}
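// Marks an allocation as required for the upcoming submission: adds it to the residency
// container (unless MakeEachAllocationResident overrides this), updates its task counts for
// this context and accounts its size in totalMemoryUsed when it is not yet resident.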
void CommandStreamReceiver::makeResident(GraphicsAllocation &gfxAllocation) {
auto submissionTaskCount = this->taskCount + 1;
if (gfxAllocation.isResidencyTaskCountBelow(submissionTaskCount, osContext->getContextId())) {
auto pushAllocations = true;
if (DebugManager.flags.MakeEachAllocationResident.get() != -1) {
pushAllocations = !DebugManager.flags.MakeEachAllocationResident.get();
}
if (pushAllocations) {
this->getResidencyAllocations().push_back(&gfxAllocation);
}
checkForNewResources(submissionTaskCount, gfxAllocation.getTaskCount(osContext->getContextId()), gfxAllocation);
gfxAllocation.updateTaskCount(submissionTaskCount, osContext->getContextId());
if (!gfxAllocation.isResident(osContext->getContextId())) {
this->totalMemoryUsed += gfxAllocation.getUnderlyingBufferSize();
}
}
gfxAllocation.updateResidencyTaskCount(submissionTaskCount, osContext->getContextId());
}
void CommandStreamReceiver::processEviction() {
this->getEvictionAllocations().clear();
}
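// Queues a resident allocation for eviction (or re-arms its evictable flag) and releases its
// residency in this context unless it is marked as always resident.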
void CommandStreamReceiver::makeNonResident(GraphicsAllocation &gfxAllocation) {
if (gfxAllocation.isResident(osContext->getContextId())) {
if (gfxAllocation.peekEvictable()) {
this->getEvictionAllocations().push_back(&gfxAllocation);
} else {
gfxAllocation.setEvictable(true);
}
}
if (!gfxAllocation.isAlwaysResident(this->osContext->getContextId())) {
gfxAllocation.releaseResidencyInOsContext(this->osContext->getContextId());
}
}
void CommandStreamReceiver::makeSurfacePackNonResident(ResidencyContainer &allocationsForResidency) {
for (auto &surface : allocationsForResidency) {
this->makeNonResident(*surface);
}
allocationsForResidency.clear();
this->processEviction();
}
void CommandStreamReceiver::makeResidentHostPtrAllocation(GraphicsAllocation *gfxAllocation) {
makeResident(*gfxAllocation);
}
WaitStatus CommandStreamReceiver::waitForTaskCount(uint32_t requiredTaskCount) {
auto address = getTagAddress();
if (address) {
this->downloadTagAllocation(requiredTaskCount);
return baseWaitFunction(address, WaitParams{false, false, 0}, requiredTaskCount);
}
return WaitStatus::Ready;
}
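// Waits for requiredTaskCount only when the selected allocation list is non-empty, then asks
// the internal storage to clean that list up to the given task count.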
WaitStatus CommandStreamReceiver::waitForTaskCountAndCleanAllocationList(uint32_t requiredTaskCount, uint32_t allocationUsage) {
WaitStatus waitStatus{WaitStatus::Ready};
auto &list = allocationUsage == TEMPORARY_ALLOCATION ? internalAllocationStorage->getTemporaryAllocations() : internalAllocationStorage->getAllocationsForReuse();
if (!list.peekIsEmpty()) {
waitStatus = this->CommandStreamReceiver::waitForTaskCount(requiredTaskCount);
}
internalAllocationStorage->cleanAllocationList(requiredTaskCount, allocationUsage);
return waitStatus;
}
WaitStatus CommandStreamReceiver::waitForTaskCountAndCleanTemporaryAllocationList(uint32_t requiredTaskCount) {
return waitForTaskCountAndCleanAllocationList(requiredTaskCount, TEMPORARY_ALLOCATION);
}
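// Grows the command stream when it cannot fit minimumRequiredSize: reuses a COMMAND_BUFFER
// allocation from internal storage when available, otherwise allocates a new one, and parks
// the previous buffer on the reusable-allocation list.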
void CommandStreamReceiver::ensureCommandBufferAllocation(LinearStream &commandStream, size_t minimumRequiredSize, size_t additionalAllocationSize) {
if (commandStream.getAvailableSpace() >= minimumRequiredSize) {
return;
}
auto alignment = MemoryConstants::pageSize64k;
if (DebugManager.flags.ForceCommandBufferAlignment.get() != -1) {
alignment = DebugManager.flags.ForceCommandBufferAlignment.get() * MemoryConstants::kiloByte;
}
const auto allocationSize = alignUp(minimumRequiredSize + additionalAllocationSize, alignment);
constexpr static auto allocationType = AllocationType::COMMAND_BUFFER;
auto allocation = this->getInternalAllocationStorage()->obtainReusableAllocation(allocationSize, allocationType).release();
if (allocation == nullptr) {
const AllocationProperties commandStreamAllocationProperties{rootDeviceIndex, true, allocationSize, allocationType,
isMultiOsContextCapable(), false, osContext->getDeviceBitfield()};
allocation = this->getMemoryManager()->allocateGraphicsMemoryWithProperties(commandStreamAllocationProperties);
}
DEBUG_BREAK_IF(allocation == nullptr);
if (commandStream.getGraphicsAllocation() != nullptr) {
getInternalAllocationStorage()->storeAllocation(std::unique_ptr<GraphicsAllocation>(commandStream.getGraphicsAllocation()), REUSABLE_ALLOCATION);
}
commandStream.replaceBuffer(allocation->getUnderlyingBuffer(), allocationSize - additionalAllocationSize);
commandStream.replaceGraphicsAllocation(allocation);
}
MemoryManager *CommandStreamReceiver::getMemoryManager() const {
DEBUG_BREAK_IF(!executionEnvironment.memoryManager);
return executionEnvironment.memoryManager.get();
}
LinearStream &CommandStreamReceiver::getCS(size_t minRequiredSize) {
constexpr static auto additionalAllocationSize = MemoryConstants::cacheLineSize + CSRequirements::csOverfetchSize;
ensureCommandBufferAllocation(this->commandStream, minRequiredSize, additionalAllocationSize);
return commandStream;
}
OSInterface *CommandStreamReceiver::getOSInterface() const {
return executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->osInterface.get();
}
GmmHelper *CommandStreamReceiver::peekGmmHelper() const {
return executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->gmmHelper.get();
}
uint64_t CommandStreamReceiver::getWorkPartitionAllocationGpuAddress() const {
if (isStaticWorkPartitioningEnabled()) {
return getWorkPartitionAllocation()->getGpuAddress();
}
return 0;
}
bool CommandStreamReceiver::isRcs() const {
return this->osContext->getEngineType() == aub_stream::ENGINE_RCS;
}
bool CommandStreamReceiver::skipResourceCleanup() const {
return this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->skipResourceCleanup();
}
bool CommandStreamReceiver::isGpuHangDetected() const {
if (DebugManager.flags.DisableGpuHangDetection.get()) {
return false;
}
return this->osContext && this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->isGpuHangDetected(*osContext);
}
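// Releases CSR-owned allocations (debug surface, command stream, tags, fence, preemption,
// per-DSS, clear-color and work-partition buffers) after waiting for the latest flushed task;
// skipped entirely when the driver model requests skipping resource cleanup.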
void CommandStreamReceiver::cleanupResources() {
if (this->skipResourceCleanup()) {
return;
}
waitForTaskCountAndCleanAllocationList(this->latestFlushedTaskCount, TEMPORARY_ALLOCATION);
waitForTaskCountAndCleanAllocationList(this->latestFlushedTaskCount, REUSABLE_ALLOCATION);
if (debugSurface) {
getMemoryManager()->freeGraphicsMemory(debugSurface);
debugSurface = nullptr;
}
if (commandStream.getCpuBase()) {
getMemoryManager()->freeGraphicsMemory(commandStream.getGraphicsAllocation());
commandStream.replaceGraphicsAllocation(nullptr);
commandStream.replaceBuffer(nullptr, 0);
}
if (tagsMultiAllocation) {
// Null tag address to prevent waiting for tag update when freeing it
tagAllocation = nullptr;
tagAddress = nullptr;
DEBUG_BREAK_IF(tagAllocation != nullptr);
DEBUG_BREAK_IF(tagAddress != nullptr);
for (auto graphicsAllocation : tagsMultiAllocation->getGraphicsAllocations()) {
getMemoryManager()->freeGraphicsMemory(graphicsAllocation);
}
delete tagsMultiAllocation;
tagsMultiAllocation = nullptr;
}
if (globalFenceAllocation) {
getMemoryManager()->freeGraphicsMemory(globalFenceAllocation);
globalFenceAllocation = nullptr;
}
if (preemptionAllocation) {
getMemoryManager()->freeGraphicsMemory(preemptionAllocation);
preemptionAllocation = nullptr;
}
if (perDssBackedBuffer) {
getMemoryManager()->freeGraphicsMemory(perDssBackedBuffer);
perDssBackedBuffer = nullptr;
}
if (clearColorAllocation) {
getMemoryManager()->freeGraphicsMemory(clearColorAllocation);
clearColorAllocation = nullptr;
}
if (workPartitionAllocation) {
getMemoryManager()->freeGraphicsMemory(workPartitionAllocation);
workPartitionAllocation = nullptr;
}
}
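// Flushes batched submissions if the requested task count has not been flushed yet, then
// falls back to polling the completion tag.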
WaitStatus CommandStreamReceiver::waitForCompletionWithTimeout(const WaitParams &params, uint32_t taskCountToWait) {
uint32_t latestSentTaskCount = this->latestFlushedTaskCount;
if (latestSentTaskCount < taskCountToWait) {
if (!this->flushBatchedSubmissions()) {
const auto isGpuHang{isGpuHangDetected()};
return isGpuHang ? WaitStatus::GpuHang : WaitStatus::NotReady;
}
}
return baseWaitFunction(getTagAddress(), params, taskCountToWait);
}
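// Polls the completion tag of every active partition, periodically checking for a GPU hang
// and honoring the optional timeout; returns Ready only when all partitions have reached
// taskCountToWait.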
WaitStatus CommandStreamReceiver::baseWaitFunction(volatile uint32_t *pollAddress, const WaitParams &params, uint32_t taskCountToWait) {
std::chrono::microseconds elapsedTimeSinceGpuHangCheck{0};
std::chrono::high_resolution_clock::time_point waitStartTime, lastHangCheckTime, currentTime;
int64_t timeDiff = 0;
uint32_t latestSentTaskCount = this->latestFlushedTaskCount;
if (latestSentTaskCount < taskCountToWait) {
this->flushTagUpdate();
}
volatile uint32_t *partitionAddress = pollAddress;
waitStartTime = std::chrono::high_resolution_clock::now();
lastHangCheckTime = waitStartTime;
for (uint32_t i = 0; i < activePartitions; i++) {
while (*partitionAddress < taskCountToWait && timeDiff <= params.waitTimeout) {
this->downloadTagAllocation(taskCountToWait);
if (!params.indefinitelyPoll && WaitUtils::waitFunction(partitionAddress, taskCountToWait)) {
break;
}
currentTime = std::chrono::high_resolution_clock::now();
elapsedTimeSinceGpuHangCheck = std::chrono::duration_cast<std::chrono::microseconds>(currentTime - lastHangCheckTime);
if (elapsedTimeSinceGpuHangCheck.count() >= gpuHangCheckPeriod.count()) {
lastHangCheckTime = currentTime;
if (isGpuHangDetected()) {
return WaitStatus::GpuHang;
}
}
if (params.enableTimeout) {
timeDiff = std::chrono::duration_cast<std::chrono::microseconds>(currentTime - waitStartTime).count();
}
}
partitionAddress = ptrOffset(partitionAddress, this->postSyncWriteOffset);
}
partitionAddress = pollAddress;
for (uint32_t i = 0; i < activePartitions; i++) {
if (*partitionAddress < taskCountToWait) {
return WaitStatus::NotReady;
}
partitionAddress = ptrOffset(partitionAddress, this->postSyncWriteOffset);
}
return WaitStatus::Ready;
}
void CommandStreamReceiver::setTagAllocation(GraphicsAllocation *allocation) {
this->tagAllocation = allocation;
UNRECOVERABLE_IF(allocation == nullptr);
this->tagAddress = reinterpret_cast<uint32_t *>(allocation->getUnderlyingBuffer());
this->debugPauseStateAddress = reinterpret_cast<DebugPauseState *>(
reinterpret_cast<uint8_t *>(allocation->getUnderlyingBuffer()) + debugPauseStateAddressOffset);
}
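// Creates the tag buffer as a multi-root-device allocation: on the Level Zero path only the
// local root device is used, otherwise the tag buffer is shared across all root devices with
// the same product family.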
MultiGraphicsAllocation &CommandStreamReceiver::createTagsMultiAllocation() {
RootDeviceIndicesContainer rootDeviceIndices;
if (ApiSpecificConfig::getApiType() == ApiSpecificConfig::L0) {
rootDeviceIndices.push_back(rootDeviceIndex);
} else {
for (auto index = 0u; index < this->executionEnvironment.rootDeviceEnvironments.size(); index++) {
if (this->executionEnvironment.rootDeviceEnvironments[index].get()->getHardwareInfo()->platform.eProductFamily ==
this->executionEnvironment.rootDeviceEnvironments[this->rootDeviceIndex].get()->getHardwareInfo()->platform.eProductFamily) {
rootDeviceIndices.push_back(index);
}
}
}
auto maxRootDeviceIndex = *std::max_element(rootDeviceIndices.begin(), rootDeviceIndices.end(), std::less<uint32_t const>());
auto allocations = new MultiGraphicsAllocation(maxRootDeviceIndex);
AllocationProperties unifiedMemoryProperties{rootDeviceIndices.at(0), MemoryConstants::pageSize, AllocationType::TAG_BUFFER, systemMemoryBitfield};
this->getMemoryManager()->createMultiGraphicsAllocationInSystemMemoryPool(rootDeviceIndices, unifiedMemoryProperties, *allocations);
return *allocations;
}
FlushStamp CommandStreamReceiver::obtainCurrentFlushStamp() const {
return flushStamp->peekStamp();
}
void CommandStreamReceiver::setRequiredScratchSizes(uint32_t newRequiredScratchSize, uint32_t newRequiredPrivateScratchSize) {
if (newRequiredScratchSize > requiredScratchSize) {
requiredScratchSize = newRequiredScratchSize;
}
if (newRequiredPrivateScratchSize > requiredPrivateScratchSize) {
requiredPrivateScratchSize = newRequiredPrivateScratchSize;
}
}
GraphicsAllocation *CommandStreamReceiver::getScratchAllocation() {
return scratchSpaceController->getScratchSpaceAllocation();
}
void CommandStreamReceiver::initProgrammingFlags() {
isPreambleSent = false;
GSBAFor32BitProgrammed = false;
bindingTableBaseAddressRequired = true;
mediaVfeStateDirty = true;
lastVmeSubslicesConfig = false;
lastSentL3Config = 0;
lastMediaSamplerConfig = -1;
lastPreemptionMode = PreemptionMode::Initial;
latestSentStatelessMocsConfig = 0;
lastSentUseGlobalAtomics = false;
}
void CommandStreamReceiver::programForAubSubCapture(bool wasActiveInPreviousEnqueue, bool isActive) {
if (!wasActiveInPreviousEnqueue && isActive) {
// force CSR reprogramming upon subcapture activation
this->initProgrammingFlags();
}
if (wasActiveInPreviousEnqueue && !isActive) {
// flush BB upon subcapture deactivation
this->flushBatchedSubmissions();
}
}
ResidencyContainer &CommandStreamReceiver::getResidencyAllocations() {
return this->residencyAllocations;
}
ResidencyContainer &CommandStreamReceiver::getEvictionAllocations() {
return this->evictionAllocations;
}
AubSubCaptureStatus CommandStreamReceiver::checkAndActivateAubSubCapture(const std::string &kernelName) { return {false, false}; }
void CommandStreamReceiver::addAubComment(const char *comment) {}
void CommandStreamReceiver::downloadAllocation(GraphicsAllocation &gfxAllocation) {
if (this->downloadAllocationImpl) {
this->downloadAllocationImpl(gfxAllocation);
}
}
void CommandStreamReceiver::startControllingDirectSubmissions() {
auto controller = this->executionEnvironment.directSubmissionController.get();
if (controller) {
controller->startControlling();
}
}
GraphicsAllocation *CommandStreamReceiver::allocateDebugSurface(size_t size) {
UNRECOVERABLE_IF(debugSurface != nullptr);
debugSurface = getMemoryManager()->allocateGraphicsMemoryWithProperties({rootDeviceIndex, size, AllocationType::INTERNAL_HOST_MEMORY, getOsContext().getDeviceBitfield()});
return debugSurface;
}
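// Returns the indirect heap of the requested type, recycling its backing allocation through
// the reuse list and reallocating when the remaining space is smaller than minRequiredSize.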
IndirectHeap &CommandStreamReceiver::getIndirectHeap(IndirectHeap::Type heapType,
size_t minRequiredSize) {
DEBUG_BREAK_IF(static_cast<uint32_t>(heapType) >= arrayCount(indirectHeap));
auto &heap = indirectHeap[heapType];
GraphicsAllocation *heapMemory = nullptr;
if (heap)
heapMemory = heap->getGraphicsAllocation();
if (heap && heap->getAvailableSpace() < minRequiredSize && heapMemory) {
internalAllocationStorage->storeAllocation(std::unique_ptr<GraphicsAllocation>(heapMemory), REUSABLE_ALLOCATION);
heapMemory = nullptr;
}
if (!heapMemory) {
allocateHeapMemory(heapType, minRequiredSize, heap);
}
return *heap;
}
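// (Re)creates backing storage for an indirect heap: surface state heaps get a fixed maximum
// size, indirect object heaps may be placed in the internal heap when 4 GB heaps can be used,
// and reusable allocations are preferred over fresh ones.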
void CommandStreamReceiver::allocateHeapMemory(IndirectHeap::Type heapType,
size_t minRequiredSize, IndirectHeap *&indirectHeap) {
size_t reservedSize = 0;
auto finalHeapSize = getDefaultHeapSize();
if (IndirectHeap::Type::SURFACE_STATE == heapType) {
finalHeapSize = defaultSshSize;
}
bool requireInternalHeap = IndirectHeap::Type::INDIRECT_OBJECT == heapType ? canUse4GbHeaps : false;
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
requireInternalHeap = false;
}
minRequiredSize += reservedSize;
finalHeapSize = alignUp(std::max(finalHeapSize, minRequiredSize), MemoryConstants::pageSize);
auto allocationType = AllocationType::LINEAR_STREAM;
if (requireInternalHeap) {
allocationType = AllocationType::INTERNAL_HEAP;
}
auto heapMemory = internalAllocationStorage->obtainReusableAllocation(finalHeapSize, allocationType).release();
if (!heapMemory) {
heapMemory = getMemoryManager()->allocateGraphicsMemoryWithProperties({rootDeviceIndex, true, finalHeapSize, allocationType,
isMultiOsContextCapable(), false, osContext->getDeviceBitfield()});
} else {
finalHeapSize = std::max(heapMemory->getUnderlyingBufferSize(), finalHeapSize);
}
if (IndirectHeap::Type::SURFACE_STATE == heapType) {
DEBUG_BREAK_IF(minRequiredSize > defaultSshSize - MemoryConstants::pageSize);
finalHeapSize = defaultSshSize - MemoryConstants::pageSize;
}
if (indirectHeap) {
indirectHeap->replaceBuffer(heapMemory->getUnderlyingBuffer(), finalHeapSize);
indirectHeap->replaceGraphicsAllocation(heapMemory);
} else {
indirectHeap = new IndirectHeap(heapMemory, requireInternalHeap);
indirectHeap->overrideMaxSize(finalHeapSize);
}
scratchSpaceController->reserveHeap(heapType, indirectHeap);
}
void CommandStreamReceiver::releaseIndirectHeap(IndirectHeap::Type heapType) {
DEBUG_BREAK_IF(static_cast<uint32_t>(heapType) >= arrayCount(indirectHeap));
auto &heap = indirectHeap[heapType];
if (heap) {
auto heapMemory = heap->getGraphicsAllocation();
if (heapMemory != nullptr)
internalAllocationStorage->storeAllocation(std::unique_ptr<GraphicsAllocation>(heapMemory), REUSABLE_ALLOCATION);
heap->replaceBuffer(nullptr, 0);
heap->replaceGraphicsAllocation(nullptr);
}
}
void CommandStreamReceiver::setExperimentalCmdBuffer(std::unique_ptr<ExperimentalCommandBuffer> &&cmdBuffer) {
experimentalCmdBuffer = std::move(cmdBuffer);
}
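// Worker-thread loop behind the PauseOnEnqueue/PauseOnBlitCopy debug flags: spins on the
// debug pause state under a spin lock, prompts the user before and/or after the workload,
// and exits once the state becomes terminate.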
void *CommandStreamReceiver::asyncDebugBreakConfirmation(void *arg) {
auto self = reinterpret_cast<CommandStreamReceiver *>(arg);
do {
auto debugPauseStateValue = DebugPauseState::waitingForUserStartConfirmation;
if (DebugManager.flags.PauseOnGpuMode.get() != PauseOnGpuProperties::PauseMode::AfterWorkload) {
do {
{
std::unique_lock<SpinLock> lock{self->debugPauseStateLock};
debugPauseStateValue = *self->debugPauseStateAddress;
}
if (debugPauseStateValue == DebugPauseState::terminate) {
return nullptr;
}
std::this_thread::yield();
} while (debugPauseStateValue != DebugPauseState::waitingForUserStartConfirmation);
std::cout << "Debug break: Press enter to start workload" << std::endl;
self->debugConfirmationFunction();
debugPauseStateValue = DebugPauseState::hasUserStartConfirmation;
{
std::unique_lock<SpinLock> lock{self->debugPauseStateLock};
*self->debugPauseStateAddress = debugPauseStateValue;
}
}
if (DebugManager.flags.PauseOnGpuMode.get() != PauseOnGpuProperties::PauseMode::BeforeWorkload) {
do {
{
std::unique_lock<SpinLock> lock{self->debugPauseStateLock};
debugPauseStateValue = *self->debugPauseStateAddress;
}
if (debugPauseStateValue == DebugPauseState::terminate) {
return nullptr;
}
std::this_thread::yield();
} while (debugPauseStateValue != DebugPauseState::waitingForUserEndConfirmation);
std::cout << "Debug break: Workload ended, press enter to continue" << std::endl;
self->debugConfirmationFunction();
{
std::unique_lock<SpinLock> lock{self->debugPauseStateLock};
*self->debugPauseStateAddress = DebugPauseState::hasUserEndConfirmation;
}
}
} while (DebugManager.flags.PauseOnEnqueue.get() == PauseOnGpuProperties::DebugFlagValues::OnEachEnqueue || DebugManager.flags.PauseOnBlitCopy.get() == PauseOnGpuProperties::DebugFlagValues::OnEachEnqueue);
return nullptr;
}
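// Creates the tag multi-allocation, seeds the completion tag for every sub-device partition,
// initializes the debug pause state and, when a pause debug flag is set, starts the
// user-confirmation thread.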
bool CommandStreamReceiver::initializeTagAllocation() {
this->tagsMultiAllocation = &this->createTagsMultiAllocation();
auto tagAllocation = tagsMultiAllocation->getGraphicsAllocation(rootDeviceIndex);
if (!tagAllocation) {
return false;
}
this->setTagAllocation(tagAllocation);
auto initValue = DebugManager.flags.EnableNullHardware.get() ? static_cast<uint32_t>(-1) : initialHardwareTag;
auto tagAddress = this->tagAddress;
uint32_t subDevices = static_cast<uint32_t>(this->deviceBitfield.count());
for (uint32_t i = 0; i < subDevices; i++) {
*tagAddress = initValue;
tagAddress = ptrOffset(tagAddress, this->postSyncWriteOffset);
}
*this->debugPauseStateAddress = DebugManager.flags.EnableNullHardware.get() ? DebugPauseState::disabled : DebugPauseState::waitingForFirstSemaphore;
PRINT_DEBUG_STRING(DebugManager.flags.PrintTagAllocationAddress.get(), stdout,
"\nCreated tag allocation %p for engine %u\n",
this->tagAddress, static_cast<uint32_t>(osContext->getEngineType()));
if (DebugManager.flags.PauseOnEnqueue.get() != -1 || DebugManager.flags.PauseOnBlitCopy.get() != -1) {
userPauseConfirmation = Thread::create(CommandStreamReceiver::asyncDebugBreakConfirmation, reinterpret_cast<void *>(this));
}
return true;
}
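// Used with static work partitioning: writes a {logical id, physical sub-device id} pair into
// each enabled sub-device's bank of the WORK_PARTITION_SURFACE allocation.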
bool CommandStreamReceiver::createWorkPartitionAllocation(const Device &device) {
if (!staticWorkPartitioningEnabled) {
return false;
}
UNRECOVERABLE_IF(device.getNumGenericSubDevices() < 2);
AllocationProperties properties{this->rootDeviceIndex, true, 4096u, AllocationType::WORK_PARTITION_SURFACE, true, false, deviceBitfield};
this->workPartitionAllocation = getMemoryManager()->allocateGraphicsMemoryWithProperties(properties);
if (this->workPartitionAllocation == nullptr) {
return false;
}
uint32_t logicalId = 0;
for (uint32_t deviceIndex = 0; deviceIndex < deviceBitfield.size(); deviceIndex++) {
if (!deviceBitfield.test(deviceIndex)) {
continue;
}
const uint32_t copySrc[2] = {logicalId++, deviceIndex};
DeviceBitfield copyBitfield{};
copyBitfield.set(deviceIndex);
auto copySuccess = MemoryTransferHelper::transferMemoryToAllocationBanks(device, workPartitionAllocation, 0, copySrc, sizeof(copySrc), copyBitfield);
if (!copySuccess) {
return false;
}
}
return true;
}
bool CommandStreamReceiver::createGlobalFenceAllocation() {
auto hwInfo = executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->getHardwareInfo();
if (!HwHelper::get(hwInfo->platform.eRenderCoreFamily).isFenceAllocationRequired(*hwInfo)) {
return true;
}
DEBUG_BREAK_IF(this->globalFenceAllocation != nullptr);
this->globalFenceAllocation = getMemoryManager()->allocateGraphicsMemoryWithProperties({rootDeviceIndex, MemoryConstants::pageSize, AllocationType::GLOBAL_FENCE, osContext->getDeviceBitfield()});
return this->globalFenceAllocation != nullptr;
}
bool CommandStreamReceiver::createPreemptionAllocation() {
auto hwInfo = executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->getHardwareInfo();
size_t preemptionSurfaceSize = hwInfo->capabilityTable.requiredPreemptionSurfaceSize;
if (DebugManager.flags.OverrideCsrAllocationSize.get() > 0) {
preemptionSurfaceSize = DebugManager.flags.OverrideCsrAllocationSize.get();
}
AllocationProperties properties{rootDeviceIndex, true, preemptionSurfaceSize, AllocationType::PREEMPTION, isMultiOsContextCapable(), false, deviceBitfield};
properties.flags.uncacheable = hwInfo->workaroundTable.flags.waCSRUncachable;
properties.alignment = HwHelper::get(hwInfo->platform.eRenderCoreFamily).getPreemptionAllocationAlignment();
this->preemptionAllocation = getMemoryManager()->allocateGraphicsMemoryWithProperties(properties);
return this->preemptionAllocation != nullptr;
}
std::unique_lock<CommandStreamReceiver::MutexType> CommandStreamReceiver::obtainUniqueOwnership() {
return std::unique_lock<CommandStreamReceiver::MutexType>(this->ownershipMutex);
}
std::unique_lock<CommandStreamReceiver::MutexType> CommandStreamReceiver::obtainHostPtrSurfaceCreationLock() {
return std::unique_lock<CommandStreamReceiver::MutexType>(this->hostPtrSurfaceCreationMutex);
}
AllocationsList &CommandStreamReceiver::getTemporaryAllocations() { return internalAllocationStorage->getTemporaryAllocations(); }
AllocationsList &CommandStreamReceiver::getAllocationsForReuse() { return internalAllocationStorage->getAllocationsForReuse(); }
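// Wraps a user host pointer in an EXTERNAL_HOST_PTR allocation under the host-pointer creation
// lock, reusing a matching temporary allocation when one exists and falling back to an internal
// copy when host-pointer import fails and copying is allowed. hostPtrTaskCountAssignment is
// incremented so that each thread using the allocation updates its task count before the
// temporary allocation can be destroyed (see NEO-6807 in the commit message above).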
bool CommandStreamReceiver::createAllocationForHostSurface(HostPtrSurface &surface, bool requiresL3Flush) {
std::unique_lock<decltype(hostPtrSurfaceCreationMutex)> lock = this->obtainHostPtrSurfaceCreationLock();
auto allocation = internalAllocationStorage->obtainTemporaryAllocationWithPtr(surface.getSurfaceSize(), surface.getMemoryPointer(), AllocationType::EXTERNAL_HOST_PTR);
if (allocation == nullptr) {
auto memoryManager = getMemoryManager();
AllocationProperties properties{rootDeviceIndex,
false, // allocateMemory
surface.getSurfaceSize(), AllocationType::EXTERNAL_HOST_PTR,
false, // isMultiStorageAllocation
osContext->getDeviceBitfield()};
properties.flags.flushL3RequiredForRead = properties.flags.flushL3RequiredForWrite = requiresL3Flush;
allocation.reset(memoryManager->allocateGraphicsMemoryWithProperties(properties, surface.getMemoryPointer()));
if (allocation == nullptr && surface.peekIsPtrCopyAllowed()) {
// Try with no host pointer allocation and copy
allocation.reset(memoryManager->allocateInternalGraphicsMemoryWithHostCopy(rootDeviceIndex,
internalAllocationStorage->getDeviceBitfield(),
surface.getMemoryPointer(),
surface.getSurfaceSize()));
}
}
if (allocation == nullptr) {
return false;
}
allocation->hostPtrTaskCountAssignment++;
allocation->updateTaskCount(0u, osContext->getContextId());
surface.setAllocation(allocation.get());
internalAllocationStorage->storeAllocation(std::move(allocation), TEMPORARY_ALLOCATION);
return true;
}
TagAllocatorBase *CommandStreamReceiver::getEventTsAllocator() {
if (profilingTimeStampAllocator.get() == nullptr) {
RootDeviceIndicesContainer rootDeviceIndices = {rootDeviceIndex};
profilingTimeStampAllocator = std::make_unique<TagAllocator<HwTimeStamps>>(rootDeviceIndices, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize,
sizeof(HwTimeStamps), false, osContext->getDeviceBitfield());
}
return profilingTimeStampAllocator.get();
}
TagAllocatorBase *CommandStreamReceiver::getEventPerfCountAllocator(const uint32_t tagSize) {
if (perfCounterAllocator.get() == nullptr) {
RootDeviceIndicesContainer rootDeviceIndices = {rootDeviceIndex};
perfCounterAllocator = std::make_unique<TagAllocator<HwPerfCounter>>(
rootDeviceIndices, getMemoryManager(), getPreferredTagPoolSize(), MemoryConstants::cacheLineSize, tagSize, false, osContext->getDeviceBitfield());
}
return perfCounterAllocator.get();
}
size_t CommandStreamReceiver::getPreferredTagPoolSize() const {
if (DebugManager.flags.DisableTimestampPacketOptimizations.get()) {
return 1;
}
return 2048;
}
bool CommandStreamReceiver::expectMemory(const void *gfxAddress, const void *srcAddress,
size_t length, uint32_t compareOperation) {
auto isMemoryEqual = (memcmp(gfxAddress, srcAddress, length) == 0);
auto isEqualMemoryExpected = (compareOperation == AubMemDump::CmdServicesMemTraceMemoryCompare::CompareOperationValues::CompareEqual);
return (isMemoryEqual == isEqualMemoryExpected);
}
bool CommandStreamReceiver::needsPageTableManager() const {
auto hwInfo = executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->getHardwareInfo();
if (pageTableManager.get() != nullptr) {
return false;
}
return HwInfoConfig::get(hwInfo->platform.eProductFamily)->isPageTableManagerSupported(*hwInfo);
}
void CommandStreamReceiver::printDeviceIndex() {
if (DebugManager.flags.PrintDeviceAndEngineIdOnSubmission.get()) {
printf("%u: Submission to RootDevice Index: %u, Sub-Devices Mask: %lu, EngineId: %u (%s, %s)\n",
SysCalls::getProcessId(),
this->getRootDeviceIndex(),
this->osContext->getDeviceBitfield().to_ulong(),
this->osContext->getEngineType(),
EngineHelpers::engineTypeToString(this->osContext->getEngineType()).c_str(),
EngineHelpers::engineUsageToString(this->osContext->getEngineUsage()).c_str());
}
}
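// When the new-resource implicit flush heuristic is active, flags the CSR as having new
// resources whenever a never-used, non-ISA allocation is made resident.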
void CommandStreamReceiver::checkForNewResources(uint32_t submittedTaskCount, uint32_t allocationTaskCount, GraphicsAllocation &gfxAllocation) {
if (useNewResourceImplicitFlush) {
if (allocationTaskCount == GraphicsAllocation::objectNotUsed && !GraphicsAllocation::isIsaAllocationType(gfxAllocation.getAllocationType())) {
newResources = true;
if (DebugManager.flags.ProvideVerboseImplicitFlush.get()) {
printf("New resource detected of type %llu\n", static_cast<unsigned long long>(gfxAllocation.getAllocationType()));
}
}
}
}
bool CommandStreamReceiver::checkImplicitFlushForGpuIdle() {
if (useGpuIdleImplicitFlush) {
if (this->taskCount == *getTagAddress()) {
return true;
}
}
return false;
}
void CommandStreamReceiver::downloadTagAllocation(uint32_t taskCountToWait) {
if (this->getTagAllocation()) {
if (taskCountToWait && taskCountToWait <= this->peekLatestFlushedTaskCount()) {
this->downloadAllocation(*this->getTagAllocation());
}
}
}
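// Returns true only when the tag of every active partition has reached taskCountToWait.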
bool CommandStreamReceiver::testTaskCountReady(volatile uint32_t *pollAddress, uint32_t taskCountToWait) {
this->downloadTagAllocation(taskCountToWait);
for (uint32_t i = 0; i < activePartitions; i++) {
if (!WaitUtils::waitFunction(pollAddress, taskCountToWait)) {
return false;
}
pollAddress = ptrOffset(pollAddress, this->postSyncWriteOffset);
}
return true;
}
const HardwareInfo &CommandStreamReceiver::peekHwInfo() const {
return *peekRootDeviceEnvironment().getHardwareInfo();
}
const RootDeviceEnvironment &CommandStreamReceiver::peekRootDeviceEnvironment() const {
return *executionEnvironment.rootDeviceEnvironments[rootDeviceIndex];
}
uint32_t CommandStreamReceiver::getCompletionValue(const GraphicsAllocation &gfxAllocation) {
if (completionFenceValuePointer) {
return *completionFenceValuePointer;
}
auto osContextId = osContext->getContextId();
return gfxAllocation.getTaskCount(osContextId);
}
bool CommandStreamReceiver::createPerDssBackedBuffer(Device &device) {
UNRECOVERABLE_IF(perDssBackedBuffer != nullptr);
auto size = RayTracingHelper::getTotalMemoryBackedFifoSize(device);
perDssBackedBuffer = getMemoryManager()->allocateGraphicsMemoryWithProperties({rootDeviceIndex, size, AllocationType::BUFFER, device.getDeviceBitfield()});
return perDssBackedBuffer != nullptr;
}
} // namespace NEO