mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 06:49:52 +08:00
Disable implicit scaling for cooperative kernels
When implicit scaling is disabled use useSingleSubdeviceValue = true. Resolves: NEO-5757 Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
b5d5784b81
commit
29c64c3dd0
@@ -44,7 +44,8 @@ struct EncodeDispatchKernel {
|
||||
bool &requiresUncachedMocs,
|
||||
bool useGlobalAtomics,
|
||||
uint32_t &partitionCount,
|
||||
bool isInternal);
|
||||
bool isInternal,
|
||||
bool isCooperative);
|
||||
|
||||
static void encodeAdditionalWalkerFields(const HardwareInfo &hwInfo, WALKER_TYPE &walkerCmd);
|
||||
|
||||
@@ -52,7 +53,8 @@ struct EncodeDispatchKernel {
|
||||
|
||||
static void *getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset);
|
||||
|
||||
static size_t estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart, Vec3<size_t> groupCount, bool isInternal);
|
||||
static size_t estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart, Vec3<size_t> groupCount,
|
||||
bool isInternal, bool isCooperative);
|
||||
|
||||
static bool isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
|
||||
size_t *lws,
|
||||
|
||||
@@ -26,7 +26,7 @@ template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
|
||||
const void *pThreadGroupDimensions, bool isIndirect, bool isPredicate, DispatchKernelEncoderI *dispatchInterface,
|
||||
uint64_t eventAddress, bool isTimestampEvent, bool L3FlushEnable, Device *device, PreemptionMode preemptionMode,
|
||||
bool &requiresUncachedMocs, bool useGlobalAtomics, uint32_t &partitionCount, bool isInternal) {
|
||||
bool &requiresUncachedMocs, bool useGlobalAtomics, uint32_t &partitionCount, bool isInternal, bool isCooperative) {
|
||||
|
||||
using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
|
||||
using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
|
||||
@@ -49,7 +49,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
|
||||
if (!isIndirect) {
|
||||
threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
|
||||
}
|
||||
size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device, threadStartVec, threadDimsVec, isInternal);
|
||||
size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device, threadStartVec, threadDimsVec,
|
||||
isInternal, isCooperative);
|
||||
if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
|
||||
auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
|
||||
*bbEnd = Family::cmdInitBatchBufferEnd;
|
||||
@@ -319,7 +320,9 @@ template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart, Vec3<size_t> groupCount, bool isInternal) {
|
||||
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart,
|
||||
Vec3<size_t> groupCount, bool isInternal,
|
||||
bool isCooperative) {
|
||||
using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
|
||||
using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
|
||||
using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;
|
||||
|
||||
@@ -38,7 +38,7 @@ template <typename Family>
|
||||
void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
|
||||
const void *pThreadGroupDimensions, bool isIndirect, bool isPredicate, DispatchKernelEncoderI *dispatchInterface,
|
||||
uint64_t eventAddress, bool isTimestampEvent, bool L3FlushEnable, Device *device, PreemptionMode preemptionMode,
|
||||
bool &requiresUncachedMocs, bool useGlobalAtomics, uint32_t &partitionCount, bool isInternal) {
|
||||
bool &requiresUncachedMocs, bool useGlobalAtomics, uint32_t &partitionCount, bool isInternal, bool isCooperative) {
|
||||
using SHARED_LOCAL_MEMORY_SIZE = typename Family::INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;
|
||||
using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS;
|
||||
using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;
|
||||
@@ -59,7 +59,8 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
|
||||
if (!isIndirect) {
|
||||
threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
|
||||
}
|
||||
size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device, threadStartVec, threadDimsVec, isInternal);
|
||||
size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(device, threadStartVec, threadDimsVec,
|
||||
isInternal, isCooperative);
|
||||
if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
|
||||
auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
|
||||
*bbEnd = Family::cmdInitBatchBufferEnd;
|
||||
@@ -256,7 +257,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
|
||||
|
||||
PreemptionHelper::applyPreemptionWaCmdsBegin<Family>(listCmdBufferStream, *device);
|
||||
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(device->getDeviceBitfield(), true) &&
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(device->getDeviceBitfield(), !isCooperative) &&
|
||||
!isInternal) {
|
||||
const uint64_t workPartitionAllocationGpuVa = device->getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
|
||||
ImplicitScalingDispatch<Family>::dispatchCommands(*listCmdBufferStream,
|
||||
@@ -421,15 +422,17 @@ void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart, Vec3<size_t> groupCount,
|
||||
bool isInternal) {
|
||||
size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, Vec3<size_t> groupStart,
|
||||
Vec3<size_t> groupCount, bool isInternal,
|
||||
bool isCooperative) {
|
||||
size_t totalSize = sizeof(WALKER_TYPE);
|
||||
totalSize += PreemptionHelper::getPreemptionWaCsSize<Family>(*device);
|
||||
totalSize += EncodeStates<Family>::getAdjustStateComputeModeSize();
|
||||
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForIndirectParams();
|
||||
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
|
||||
totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(device->getDeviceBitfield(), true) &&
|
||||
|
||||
if (ImplicitScalingHelper::isImplicitScalingEnabled(device->getDeviceBitfield(), !isCooperative) &&
|
||||
!isInternal) {
|
||||
const bool staticPartitioning = device->getDefaultEngine().commandStreamReceiver->isStaticWorkPartitioningEnabled();
|
||||
totalSize += ImplicitScalingDispatch<Family>::getSize(true, staticPartitioning, device->getDeviceBitfield(), groupStart, groupCount);
|
||||
|
||||
@@ -83,7 +83,7 @@ class CommandStreamReceiver {
|
||||
uint32_t taskLevel, DispatchFlags &dispatchFlags, Device &device) = 0;
|
||||
|
||||
virtual bool flushBatchedSubmissions() = 0;
|
||||
bool submitBatchBuffer(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency);
|
||||
MOCKABLE_VIRTUAL bool submitBatchBuffer(BatchBuffer &batchBuffer, ResidencyContainer &allocationsForResidency);
|
||||
virtual void programHardwareContext(LinearStream &cmdStream) = 0;
|
||||
virtual size_t getCmdsSizeForHardwareContext() const = 0;
|
||||
|
||||
|
||||
@@ -58,7 +58,7 @@ class TbxCommandStreamReceiverHw : public CommandStreamReceiverSimulatedHw<GfxFa
|
||||
AubSubCaptureStatus checkAndActivateAubSubCapture(const MultiDispatchInfo &dispatchInfo) override;
|
||||
|
||||
// Family specific version
|
||||
MOCKABLE_VIRTUAL void submitBatchBuffer(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead);
|
||||
MOCKABLE_VIRTUAL void submitBatchBufferTbx(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead);
|
||||
void pollForCompletion() override;
|
||||
|
||||
void dumpAllocation(GraphicsAllocation &gfxAllocation) override;
|
||||
|
||||
@@ -241,7 +241,7 @@ bool TbxCommandStreamReceiverHw<GfxFamily>::flush(BatchBuffer &batchBuffer, Resi
|
||||
}
|
||||
}
|
||||
|
||||
submitBatchBuffer(
|
||||
submitBatchBufferTbx(
|
||||
batchBufferGpuAddress, pBatchBuffer, sizeBatchBuffer,
|
||||
this->getMemoryBank(batchBuffer.commandBufferAllocation),
|
||||
this->getPPGTTAdditionalBits(batchBuffer.commandBufferAllocation),
|
||||
@@ -256,7 +256,7 @@ bool TbxCommandStreamReceiverHw<GfxFamily>::flush(BatchBuffer &batchBuffer, Resi
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void TbxCommandStreamReceiverHw<GfxFamily>::submitBatchBuffer(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead) {
|
||||
void TbxCommandStreamReceiverHw<GfxFamily>::submitBatchBufferTbx(uint64_t batchBufferGpuAddress, const void *batchBuffer, size_t batchBufferSize, uint32_t memoryBank, uint64_t entryBits, bool overrideRingHead) {
|
||||
if (hardwareContextController) {
|
||||
if (batchBufferSize) {
|
||||
hardwareContextController->submit(batchBufferGpuAddress, batchBuffer, batchBufferSize, memoryBank, MemoryConstants::pageSize64k, overrideRingHead);
|
||||
|
||||
Reference in New Issue
Block a user