DispatchKernelEncoder refactor

Replace parts of the DispatchKernelEncoderI interface with direct KernelDescriptor access, removing the per-field getter methods in favor of getKernelDescriptor().

Change-Id: I1c780b04a2d3d1de0fb75d5413a0dde8b41bbe07
This commit is contained in:
Jaroslaw Chodor
2020-04-07 14:07:31 +02:00
committed by sys_ocldev
parent ea56bde3fb
commit 2c25777f3c
27 changed files with 180 additions and 383 deletions

View File

@ -155,7 +155,7 @@ struct CommandListCoreFamily : CommandListImp {
void applyMemoryRangesBarrier(uint32_t numRanges, const size_t *pRangeSizes,
const void **pRanges);
ze_result_t setGroupSizeIndirect(uint32_t offsets[3], void *crossThreadAddress, uint32_t lws[3]);
ze_result_t setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]);
void appendEventForProfiling(ze_event_handle_t hEvent, bool beforeWalker);
void appendSignalEventPostWalker(ze_event_handle_t hEvent);

View File

@ -1176,13 +1176,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::prepareIndirectParams(const ze
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::setGroupSizeIndirect(uint32_t offsets[3],
void *crossThreadAddress,
uint32_t lws[3]) {
ze_result_t CommandListCoreFamily<gfxCoreFamily>::setGlobalWorkSizeIndirect(NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, uint32_t lws[3]) {
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
NEO::EncodeIndirectParams<GfxFamily>::setGroupSizeIndirect(commandContainer, offsets, crossThreadAddress, lws);
NEO::EncodeIndirectParams<GfxFamily>::setGlobalWorkSizeIndirect(commandContainer, offsets, crossThreadAddress, lws);
return ZE_RESULT_SUCCESS;
}

View File

@ -29,10 +29,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
const auto kernel = Kernel::fromHandle(hKernel);
UNRECOVERABLE_IF(kernel == nullptr);
const auto functionImmutableData = kernel->getImmutableData();
commandListPerThreadScratchSize = std::max(commandListPerThreadScratchSize, kernel->getPerThreadScratchSize());
commandListPerThreadScratchSize = std::max<std::uint32_t>(commandListPerThreadScratchSize, kernel->getImmutableData()->getDescriptor().kernelAttributes.perThreadScratchSize[0]);
auto functionPreemptionMode = obtainFunctionPreemptionMode(kernel);
commandListPreemptionMode = std::min(commandListPreemptionMode, functionPreemptionMode);
auto kernelPreemptionMode = obtainFunctionPreemptionMode(kernel);
commandListPreemptionMode = std::min(commandListPreemptionMode, kernelPreemptionMode);
if (!isIndirect) {
kernel->setGroupCount(pThreadGroupDimensions->groupCountX,

View File

@ -98,8 +98,6 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
virtual ze_result_t setArgBufferWithAlloc(uint32_t argIndex, const void *argVal, NEO::GraphicsAllocation *allocation) = 0;
virtual ze_result_t setArgRedescribedImage(uint32_t argIndex, ze_image_handle_t argVal) = 0;
virtual bool getGroupCountOffsets(uint32_t *locations) = 0;
virtual bool getGroupSizeOffsets(uint32_t *locations) = 0;
virtual ze_result_t setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
uint32_t groupSizeZ) = 0;
virtual ze_result_t suggestGroupSize(uint32_t globalSizeX, uint32_t globalSizeY,
@ -113,22 +111,6 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
virtual const std::vector<NEO::GraphicsAllocation *> &getResidencyContainer() const = 0;
virtual void getGroupSize(uint32_t &outGroupSizeX, uint32_t &outGroupSizeY, uint32_t &outGroupSizeZ) const = 0;
virtual uint32_t getThreadsPerThreadGroup() const = 0;
virtual uint32_t getThreadExecutionMask() const = 0;
virtual const uint8_t *getCrossThreadData() const = 0;
virtual uint32_t getCrossThreadDataSize() const = 0;
virtual const uint8_t *getPerThreadData() const = 0;
virtual uint32_t getPerThreadDataSizeForWholeThreadGroup() const = 0;
virtual uint32_t getPerThreadDataSize() const = 0;
virtual const uint8_t *getSurfaceStateHeapData() const = 0;
virtual uint32_t getSurfaceStateHeapDataSize() const = 0;
virtual const uint8_t *getDynamicStateHeapData() const = 0;
virtual size_t getDynamicStateHeapDataSize() const = 0;
virtual UnifiedMemoryControls getUnifiedMemoryControls() const = 0;
virtual bool hasIndirectAllocationsAllowed() const = 0;

View File

@ -77,7 +77,7 @@ struct KernelHw : public KernelImp {
}
std::copy(this->groupSize, this->groupSize + 3, cloned->groupSize);
cloned->threadsPerThreadGroup = this->threadsPerThreadGroup;
cloned->numThreadsPerThreadGroup = this->numThreadsPerThreadGroup;
cloned->threadExecutionMask = this->threadExecutionMask;
if (this->surfaceStateHeapDataSize > 0) {

View File

@ -185,14 +185,9 @@ ze_result_t KernelImp::setArgumentValue(uint32_t argIndex, size_t argSize,
}
void KernelImp::setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) {
uint32_t groupSizeX;
uint32_t groupSizeY;
uint32_t groupSizeZ;
getGroupSize(groupSizeX, groupSizeY, groupSizeZ);
const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
uint32_t globalWorkSize[3] = {groupCountX * groupSizeX, groupCountY * groupSizeY,
groupCountZ * groupSizeZ};
uint32_t globalWorkSize[3] = {groupCountX * groupSize[0], groupCountY * groupSize[1],
groupCountZ * groupSize[2]};
auto dst = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkSize, globalWorkSize);
@ -200,30 +195,6 @@ void KernelImp::setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32
NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.numWorkGroups, groupCount);
}
bool KernelImp::getGroupCountOffsets(uint32_t *locations) {
const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
for (int i = 0; i < 3; i++) {
if (NEO::isValidOffset(desc.payloadMappings.dispatchTraits.numWorkGroups[i])) {
locations[i] = desc.payloadMappings.dispatchTraits.numWorkGroups[i];
} else {
return false;
}
}
return true;
}
bool KernelImp::getGroupSizeOffsets(uint32_t *locations) {
const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
for (int i = 0; i < 3; i++) {
if (NEO::isValidOffset(desc.payloadMappings.dispatchTraits.globalWorkSize[i])) {
locations[i] = desc.payloadMappings.dispatchTraits.globalWorkSize[i];
} else {
return false;
}
}
return true;
}
ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
uint32_t groupSizeZ) {
if ((0 == groupSizeX) || (0 == groupSizeY) || (0 == groupSizeZ)) {
@ -267,8 +238,8 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
this->groupSize[2] = groupSizeZ;
auto simdSize = kernelImmData->getDescriptor().kernelAttributes.simdSize;
this->threadsPerThreadGroup = static_cast<uint32_t>((itemsInGroup + simdSize - 1u) / simdSize);
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / threadsPerThreadGroup;
this->numThreadsPerThreadGroup = static_cast<uint32_t>((itemsInGroup + simdSize - 1u) / simdSize);
this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / numThreadsPerThreadGroup;
patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ);
auto remainderSimdLanes = itemsInGroup & (simdSize - 1u);
@ -297,7 +268,7 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz
uint32_t numThreadsPerSubSlice = (uint32_t)deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU;
uint32_t localMemSize = (uint32_t)deviceInfo.localMemSize;
NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, this->hasBarriers(), simd, this->getSlmTotalSize(),
NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, kernelImmData->getDescriptor().kernelAttributes.flags.usesBarriers, simd, this->getSlmTotalSize(),
coreFamily, numThreadsPerSubSlice, localMemSize,
usesImages, false);
NEO::computeWorkgroupSizeND(wsInfo, retGroupSize, workItems, dim);
@ -672,92 +643,12 @@ bool KernelImp::hasIndirectAllocationsAllowed() const {
unifiedMemoryControls.indirectSharedAllocationsAllowed);
}
bool KernelImp::hasBarriers() {
return getImmutableData()->getDescriptor().kernelAttributes.flags.usesBarriers;
}
uint32_t KernelImp::getSlmTotalSize() {
uint32_t KernelImp::getSlmTotalSize() const {
return slmArgsTotalSize + getImmutableData()->getDescriptor().kernelAttributes.slmInlineSize;
}
uint32_t KernelImp::getBindingTableOffset() {
return getImmutableData()->getDescriptor().payloadMappings.bindingTable.tableOffset;
}
uint32_t KernelImp::getBorderColor() {
return getImmutableData()->getDescriptor().payloadMappings.samplerTable.borderColor;
}
uint32_t KernelImp::getSamplerTableOffset() {
return getImmutableData()->getDescriptor().payloadMappings.samplerTable.tableOffset;
}
uint32_t KernelImp::getNumSurfaceStates() {
return getImmutableData()->getDescriptor().payloadMappings.bindingTable.numEntries;
}
uint32_t KernelImp::getNumSamplers() {
return getImmutableData()->getDescriptor().payloadMappings.samplerTable.numSamplers;
}
uint32_t KernelImp::getSimdSize() {
return getImmutableData()->getDescriptor().kernelAttributes.simdSize;
}
uint32_t KernelImp::getSizeCrossThreadData() {
return getCrossThreadDataSize();
}
uint32_t KernelImp::getPerThreadScratchSize() {
return getImmutableData()->getDescriptor().kernelAttributes.perThreadScratchSize[0];
}
uint32_t KernelImp::getThreadsPerThreadGroupCount() {
return getThreadsPerThreadGroup();
}
uint32_t KernelImp::getSizePerThreadData() {
return getPerThreadDataSize();
}
uint32_t KernelImp::getSizePerThreadDataForWholeGroup() {
return getPerThreadDataSizeForWholeThreadGroup();
}
uint32_t KernelImp::getSizeSurfaceStateHeapData() {
return getSurfaceStateHeapDataSize();
}
uint32_t KernelImp::getPerThreadExecutionMask() {
return getThreadExecutionMask();
}
uint32_t *KernelImp::getCountOffsets() {
return groupCountOffsets;
}
uint32_t *KernelImp::getSizeOffsets() {
return groupSizeOffsets;
}
uint32_t *KernelImp::getLocalWorkSize() {
if (hasGroupSize()) {
getGroupSize(localWorkSize[0], localWorkSize[1], localWorkSize[2]);
}
return localWorkSize;
}
uint32_t KernelImp::getNumGrfRequired() {
return getImmutableData()->getDescriptor().kernelAttributes.numGrfRequired;
}
NEO::GraphicsAllocation *KernelImp::getIsaAllocation() {
NEO::GraphicsAllocation *KernelImp::getIsaAllocation() const {
return getImmutableData()->getIsaGraphicsAllocation();
}
bool KernelImp::hasGroupCounts() {
return getGroupCountOffsets(groupCountOffsets);
}
bool KernelImp::hasGroupSize() {
return getGroupSizeOffsets(groupSizeOffsets);
}
const void *KernelImp::getSurfaceStateHeap() {
return getSurfaceStateHeapData();
}
const void *KernelImp::getDynamicStateHeap() {
return getDynamicStateHeapData();
}
const void *KernelImp::getCrossThread() {
return getCrossThreadData();
}
const void *KernelImp::getPerThread() {
return getPerThreadData();
}
bool KernelImp::isInlineDataRequired() {
return getImmutableData()->getDescriptor().kernelAttributes.flags.passInlineData;
}
uint8_t KernelImp::getNumLocalIdChannels() {
return getImmutableData()->getDescriptor().kernelAttributes.numLocalIdChannels;
}
} // namespace L0

View File

@ -42,10 +42,6 @@ struct KernelImp : Kernel {
void setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) override;
bool getGroupCountOffsets(uint32_t *locations) override;
bool getGroupSizeOffsets(uint32_t *locations) override;
ze_result_t setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
uint32_t groupSizeZ) override;
@ -62,13 +58,6 @@ struct KernelImp : Kernel {
return residencyContainer;
}
void getGroupSize(uint32_t &outGroupSizeX, uint32_t &outGroupSizeY,
uint32_t &outGroupSizeZ) const override {
outGroupSizeX = this->groupSize[0];
outGroupSizeY = this->groupSize[1];
outGroupSizeZ = this->groupSize[2];
}
ze_result_t setArgImmediate(uint32_t argIndex, size_t argSize, const void *argVal);
ze_result_t setArgBuffer(uint32_t argIndex, size_t argSize, const void *argVal);
@ -89,7 +78,7 @@ struct KernelImp : Kernel {
uint32_t getPerThreadDataSizeForWholeThreadGroup() const override { return perThreadDataSizeForWholeThreadGroup; }
uint32_t getPerThreadDataSize() const override { return perThreadDataSize; }
uint32_t getThreadsPerThreadGroup() const override { return threadsPerThreadGroup; }
uint32_t getNumThreadsPerThreadGroup() const override { return numThreadsPerThreadGroup; }
uint32_t getThreadExecutionMask() const override { return threadExecutionMask; }
NEO::GraphicsAllocation *getPrintfBufferAllocation() override { return this->printfBuffer; }
@ -99,41 +88,20 @@ struct KernelImp : Kernel {
uint32_t getSurfaceStateHeapDataSize() const override { return surfaceStateHeapDataSize; }
const uint8_t *getDynamicStateHeapData() const override { return dynamicStateHeapData.get(); }
size_t getDynamicStateHeapDataSize() const override { return dynamicStateHeapDataSize; }
const KernelImmutableData *getImmutableData() const override { return kernelImmData; }
UnifiedMemoryControls getUnifiedMemoryControls() const override { return unifiedMemoryControls; }
bool hasIndirectAllocationsAllowed() const override;
bool hasBarriers() override;
uint32_t getSlmTotalSize() override;
uint32_t getBindingTableOffset() override;
uint32_t getBorderColor() override;
uint32_t getSamplerTableOffset() override;
uint32_t getNumSurfaceStates() override;
uint32_t getNumSamplers() override;
uint32_t getSimdSize() override;
uint32_t getSizeCrossThreadData() override;
uint32_t getPerThreadScratchSize() override;
uint32_t getThreadsPerThreadGroupCount() override;
uint32_t getSizePerThreadData() override;
uint32_t getSizePerThreadDataForWholeGroup() override;
uint32_t getSizeSurfaceStateHeapData() override;
uint32_t getPerThreadExecutionMask() override;
uint32_t *getCountOffsets() override;
uint32_t *getSizeOffsets() override;
uint32_t *getLocalWorkSize() override;
uint32_t getNumGrfRequired() override;
NEO::GraphicsAllocation *getIsaAllocation() override;
bool hasGroupCounts() override;
bool hasGroupSize() override;
const void *getSurfaceStateHeap() override;
const void *getDynamicStateHeap() override;
const void *getCrossThread() override;
const void *getPerThread() override;
bool isInlineDataRequired() override;
uint8_t getNumLocalIdChannels() override;
const NEO::KernelDescriptor &getKernelDescriptor() const override {
return kernelImmData->getDescriptor();
}
const uint32_t *getGroupSize() const override {
return groupSize;
}
uint32_t getSlmTotalSize() const override;
NEO::GraphicsAllocation *getIsaAllocation() const override;
protected:
KernelImp() = default;
@ -153,7 +121,7 @@ struct KernelImp : Kernel {
NEO::GraphicsAllocation *printfBuffer = nullptr;
uint32_t groupSize[3] = {0u, 0u, 0u};
uint32_t threadsPerThreadGroup = 0u;
uint32_t numThreadsPerThreadGroup = 0u;
uint32_t threadExecutionMask = 0u;
std::unique_ptr<uint8_t[]> crossThreadData = 0;

View File

@ -12,26 +12,5 @@
namespace L0 {
namespace ult {
TEST(Kernel, givenPassInlineDataTrueWhenCallingIsInlineDataRequiredThenTrueIsReturned) {
Mock<Kernel> kernel;
kernel.descriptor.kernelAttributes.flags.passInlineData = true;
EXPECT_TRUE(kernel.isInlineDataRequired());
}
TEST(Kernel, givenPassInlineDataFalseWhenCallingIsInlineDataRequiredThenFalseIsReturned) {
Mock<Kernel> kernel;
kernel.descriptor.kernelAttributes.flags.passInlineData = false;
EXPECT_FALSE(kernel.isInlineDataRequired());
}
TEST(Kernel, whenGettingLocalIdsChannelNumberThenCorrectValueIsReturned) {
Mock<Kernel> kernel;
kernel.descriptor.kernelAttributes.numLocalIdChannels = 3;
EXPECT_EQ(3u, kernel.getNumLocalIdChannels());
}
} // namespace ult
} // namespace L0

View File

@ -51,7 +51,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
static void setAdditionalInfo(
INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor,
const Kernel &kernel,
const uint32_t threadsPerThreadGroup);
const uint32_t numThreadsPerThreadGroup);
inline static uint32_t additionalSizeRequiredDsh();
@ -64,7 +64,7 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
size_t bindingTablePointer,
size_t offsetSamplerState,
uint32_t numSamplers,
uint32_t threadsPerThreadGroup,
uint32_t numThreadsPerThreadGroup,
const Kernel &kernel,
uint32_t bindingTablePrefetchSize,
PreemptionMode preemptionMode,

View File

@ -144,8 +144,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueCopyBufferToImageTest, WhenCopyingBufferToIma
auto localWorkSize = std::min(
maxLocalSize, Image2dDefaults::imageDesc.image_width * Image2dDefaults::imageDesc.image_height);
auto simd = 32u;
auto threadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd);
EXPECT_EQ(threadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup());
auto numThreadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd);
EXPECT_EQ(numThreadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup());
EXPECT_NE(0u, interfaceDescriptorData.getCrossThreadConstantDataReadLength());
EXPECT_NE(0u, interfaceDescriptorData.getConstantIndirectUrbEntryReadLength());

View File

@ -146,8 +146,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueCopyImageTest, WhenCopyingImageThenInterfaceD
auto localWorkSize = std::min(maxLocalSize,
Image2dDefaults::imageDesc.image_width * Image2dDefaults::imageDesc.image_height);
auto simd = 32u;
auto threadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd);
EXPECT_EQ(threadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup());
auto numThreadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd);
EXPECT_EQ(numThreadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup());
EXPECT_NE(0u, interfaceDescriptorData.getCrossThreadConstantDataReadLength());
EXPECT_NE(0u, interfaceDescriptorData.getConstantIndirectUrbEntryReadLength());

View File

@ -145,8 +145,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueCopyImageToBufferTest, WhenCopyingImageToBuff
auto localWorkSize = std::min(
maxLocalSize, Image2dDefaults::imageDesc.image_width * Image2dDefaults::imageDesc.image_height);
auto simd = 32u;
auto threadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd);
EXPECT_EQ(threadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup());
auto numThreadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd);
EXPECT_EQ(numThreadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup());
EXPECT_NE(0u, interfaceDescriptorData.getCrossThreadConstantDataReadLength());
EXPECT_NE(0u, interfaceDescriptorData.getConstantIndirectUrbEntryReadLength());

View File

@ -153,8 +153,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueFillImageTest, WhenFillingImageThenInterfaceD
auto localWorkSize = std::min(maxLocalSize,
Image2dDefaults::imageDesc.image_width * Image2dDefaults::imageDesc.image_height);
auto simd = 32u;
auto threadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd);
EXPECT_EQ(threadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup());
auto numThreadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd);
EXPECT_EQ(numThreadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup());
EXPECT_NE(0u, interfaceDescriptorData.getCrossThreadConstantDataReadLength());
EXPECT_NE(0u, interfaceDescriptorData.getConstantIndirectUrbEntryReadLength());

View File

@ -154,8 +154,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueReadImageTest, WhenReadingImageThenInterfaceD
auto localWorkSize = 4u;
auto simd = 32u;
auto threadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd);
EXPECT_EQ(threadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup());
auto numThreadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd);
EXPECT_EQ(numThreadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup());
EXPECT_NE(0u, interfaceDescriptorData.getCrossThreadConstantDataReadLength());
EXPECT_NE(0u, interfaceDescriptorData.getConstantIndirectUrbEntryReadLength());

View File

@ -155,8 +155,8 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueWriteImageTest, WhenWritingImageThenInterface
// EnqueueWriteImage uses a byte copy. Need to convert to bytes.
auto localWorkSize = 2 * 2 * sizeof(float);
auto simd = 32;
auto threadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd);
EXPECT_EQ(threadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup());
auto numThreadsPerThreadGroup = Math::divideAndRoundUp(localWorkSize, simd);
EXPECT_EQ(numThreadsPerThreadGroup, interfaceDescriptorData.getNumberOfThreadsInGpgpuThreadGroup());
EXPECT_NE(0u, interfaceDescriptorData.getCrossThreadConstantDataReadLength());
EXPECT_NE(0u, interfaceDescriptorData.getConstantIndirectUrbEntryReadLength());

View File

@ -12,6 +12,7 @@
#include "shared/source/helpers/register_offsets.h"
#include "shared/source/helpers/simd_helper.h"
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
#include "shared/source/kernel/kernel_arg_descriptor.h"
#include <algorithm>
@ -96,9 +97,8 @@ struct EncodeIndirectParams {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using MI_MATH = typename GfxFamily::MI_MATH;
using MI_MATH_ALU_INST_INLINE = typename GfxFamily::MI_MATH_ALU_INST_INLINE;
static void setGroupCountIndirect(CommandContainer &container, uint32_t offsets[3], void *crossThreadAddress);
static void setGroupSizeIndirect(CommandContainer &container, uint32_t offsets[3], void *crossThreadAddress, uint32_t lws[3]);
static void setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress);
static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws);
static size_t getCmdsSizeForIndirectParams();
static size_t getCmdsSizeForSetGroupSizeIndirect();

View File

@ -199,17 +199,23 @@ void EncodeMathMMIO<Family>::encodeAluAdd(MI_MATH_ALU_INST_INLINE *pAluParam,
}
template <typename Family>
void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, uint32_t offsets[3], void *crossThreadAddress) {
EncodeStoreMMIO<Family>::encode(container, GPUGPU_DISPATCHDIMX, ptrOffset(reinterpret_cast<uint64_t>(crossThreadAddress), offsets[0]));
EncodeStoreMMIO<Family>::encode(container, GPUGPU_DISPATCHDIMY, ptrOffset(reinterpret_cast<uint64_t>(crossThreadAddress), offsets[1]));
EncodeStoreMMIO<Family>::encode(container, GPUGPU_DISPATCHDIMZ, ptrOffset(reinterpret_cast<uint64_t>(crossThreadAddress), offsets[2]));
void EncodeIndirectParams<Family>::setGroupCountIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress) {
for (int i = 0; i < 3; ++i) {
if (NEO::isUndefinedOffset(offsets[i])) {
continue;
}
EncodeStoreMMIO<Family>::encode(container, GPUGPU_DISPATCHDIM[i], ptrOffset(reinterpret_cast<uint64_t>(crossThreadAddress), offsets[i]));
}
}
template <typename Family>
void EncodeIndirectParams<Family>::setGroupSizeIndirect(CommandContainer &container, uint32_t offsets[3], void *crossThreadAddress, uint32_t lws[3]) {
EncodeMathMMIO<Family>::encodeMulRegVal(container, GPUGPU_DISPATCHDIMX, lws[0], ptrOffset(reinterpret_cast<uint64_t>(crossThreadAddress), offsets[0]));
EncodeMathMMIO<Family>::encodeMulRegVal(container, GPUGPU_DISPATCHDIMY, lws[1], ptrOffset(reinterpret_cast<uint64_t>(crossThreadAddress), offsets[1]));
EncodeMathMMIO<Family>::encodeMulRegVal(container, GPUGPU_DISPATCHDIMZ, lws[2], ptrOffset(reinterpret_cast<uint64_t>(crossThreadAddress), offsets[2]));
void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], void *crossThreadAddress, const uint32_t *lws) {
for (int i = 0; i < 3; ++i) {
if (NEO::isUndefinedOffset(offsets[i])) {
continue;
}
EncodeMathMMIO<Family>::encodeMulRegVal(container, GPUGPU_DISPATCHDIM[i], lws[i], ptrOffset(reinterpret_cast<uint64_t>(crossThreadAddress), offsets[i]));
}
}
template <typename Family>

View File

@ -29,9 +29,10 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;
auto sizeCrossThreadData = dispatchInterface->getSizeCrossThreadData();
auto sizePerThreadData = dispatchInterface->getSizePerThreadData();
auto sizePerThreadDataForWholeGroup = dispatchInterface->getSizePerThreadDataForWholeGroup();
auto &kernelDescriptor = dispatchInterface->getKernelDescriptor();
auto sizeCrossThreadData = dispatchInterface->getCrossThreadDataSize();
auto sizePerThreadData = dispatchInterface->getPerThreadDataSize();
auto sizePerThreadDataForWholeGroup = dispatchInterface->getPerThreadDataSizeForWholeThreadGroup();
LinearStream *listCmdBufferStream = container.getCommandStream();
@ -58,26 +59,26 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
EncodeStates<Family>::adjustStateComputeMode(*container.getCommandStream(), container.lastSentNumGrfRequired, nullptr, false, false);
EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), false);
auto threadsPerThreadGroup = dispatchInterface->getThreadsPerThreadGroupCount();
idd.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
auto numThreadsPerThreadGroup = dispatchInterface->getNumThreadsPerThreadGroup();
idd.setNumberOfThreadsInGpgpuThreadGroup(numThreadsPerThreadGroup);
idd.setBarrierEnable(dispatchInterface->hasBarriers());
idd.setBarrierEnable(kernelDescriptor.kernelAttributes.flags.usesBarriers);
idd.setSharedLocalMemorySize(
dispatchInterface->getSlmTotalSize() > 0
? static_cast<typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE>(HardwareCommandsHelper<Family>::computeSlmValues(dispatchInterface->getSlmTotalSize()))
: INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE_ENCODES_0K);
{
auto bindingTableStateCount = dispatchInterface->getNumSurfaceStates();
uint32_t bindingTableStateCount = kernelDescriptor.payloadMappings.bindingTable.numEntries;
uint32_t bindingTablePointer = 0u;
if (bindingTableStateCount > 0u) {
auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSizeSurfaceStateHeapData(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
auto ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
bindingTablePointer = static_cast<uint32_t>(HardwareCommandsHelper<Family>::pushBindingTableAndSurfaceStates(
*ssh, bindingTableStateCount,
dispatchInterface->getSurfaceStateHeap(),
dispatchInterface->getSizeSurfaceStateHeapData(), bindingTableStateCount,
dispatchInterface->getBindingTableOffset()));
dispatchInterface->getSurfaceStateHeapData(),
dispatchInterface->getSurfaceStateHeapDataSize(), bindingTableStateCount,
kernelDescriptor.payloadMappings.bindingTable.tableOffset));
}
idd.setBindingTablePointer(bindingTablePointer);
@ -96,12 +97,12 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
uint32_t samplerStateOffset = 0;
uint32_t samplerCount = 0;
if (dispatchInterface->getNumSamplers() > 0) {
samplerCount = dispatchInterface->getNumSamplers();
samplerStateOffset = EncodeStates<Family>::copySamplerState(heap, dispatchInterface->getSamplerTableOffset(),
dispatchInterface->getNumSamplers(),
dispatchInterface->getBorderColor(),
dispatchInterface->getDynamicStateHeap());
if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) {
samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers;
samplerStateOffset = EncodeStates<Family>::copySamplerState(heap, kernelDescriptor.payloadMappings.samplerTable.tableOffset,
kernelDescriptor.payloadMappings.samplerTable.numSamplers,
kernelDescriptor.payloadMappings.samplerTable.borderColor,
dispatchInterface->getDynamicStateHeapData());
}
idd.setSamplerStatePointer(samplerStateOffset);
@ -129,21 +130,17 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
offsetThreadData = heapIndirect->getHeapGpuStartOffset() + static_cast<uint64_t>(heapIndirect->getUsed() - sizeThreadData);
memcpy_s(ptr, sizeCrossThreadData,
dispatchInterface->getCrossThread(), sizeCrossThreadData);
dispatchInterface->getCrossThreadData(), sizeCrossThreadData);
if (isIndirect) {
void *gpuPtr = reinterpret_cast<void *>(heapIndirect->getHeapGpuBase() + heapIndirect->getUsed() - sizeThreadData);
if (dispatchInterface->hasGroupCounts()) {
EncodeIndirectParams<Family>::setGroupCountIndirect(container, dispatchInterface->getCountOffsets(), gpuPtr);
}
if (dispatchInterface->hasGroupSize()) {
EncodeIndirectParams<Family>::setGroupSizeIndirect(container, dispatchInterface->getSizeOffsets(), gpuPtr, dispatchInterface->getLocalWorkSize());
}
EncodeIndirectParams<Family>::setGroupCountIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups, gpuPtr);
EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize, gpuPtr, dispatchInterface->getGroupSize());
}
ptr = ptrOffset(ptr, sizeCrossThreadData);
memcpy_s(ptr, sizePerThreadDataForWholeGroup,
dispatchInterface->getPerThread(), sizePerThreadDataForWholeGroup);
dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup);
}
auto slmSizeNew = dispatchInterface->getSlmTotalSize();
@ -185,14 +182,14 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
cmd.setThreadGroupIdZDimension(threadDims[2]);
}
auto simdSize = dispatchInterface->getSimdSize();
auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
auto simdSizeOp = getSimdConfig<WALKER_TYPE>(simdSize);
cmd.setSimdSize(simdSizeOp);
cmd.setRightExecutionMask(dispatchInterface->getPerThreadExecutionMask());
cmd.setRightExecutionMask(dispatchInterface->getThreadExecutionMask());
cmd.setBottomExecutionMask(0xffffffff);
cmd.setThreadWidthCounterMaximum(threadsPerThreadGroup);
cmd.setThreadWidthCounterMaximum(numThreadsPerThreadGroup);
cmd.setPredicateEnable(isPredicate);

View File

@ -25,8 +25,8 @@ uint32_t KernelHelper::getMaxWorkGroupCount(uint32_t simd, uint32_t availableThr
workGroupSize *= localWorkSize[i];
}
auto threadsPerThreadGroup = static_cast<uint32_t>(Math::divideAndRoundUp(workGroupSize, simd));
auto maxWorkGroupsCount = availableThreadCount / threadsPerThreadGroup;
auto numThreadsPerThreadGroup = static_cast<uint32_t>(Math::divideAndRoundUp(workGroupSize, simd));
auto maxWorkGroupsCount = availableThreadCount / numThreadsPerThreadGroup;
if (numberOfBarriers > 0) {
auto maxWorkGroupsCountDueToBarrierUsage = dssCount * (maxBarrierCount / numberOfBarriers);

View File

@ -19,6 +19,8 @@ constexpr uint32_t GPUGPU_DISPATCHDIMX = 0x2500;
constexpr uint32_t GPUGPU_DISPATCHDIMY = 0x2504;
constexpr uint32_t GPUGPU_DISPATCHDIMZ = 0x2508;
constexpr uint32_t GPUGPU_DISPATCHDIM[3] = {GPUGPU_DISPATCHDIMX, GPUGPU_DISPATCHDIMY, GPUGPU_DISPATCHDIMZ};
constexpr uint32_t CS_GPR_R0 = 0x2600;
constexpr uint32_t CS_GPR_R1 = 0x2608;
constexpr uint32_t CS_GPR_R2 = 0x2610;

View File

@ -10,42 +10,28 @@
namespace NEO {
class GraphicsAllocation;
struct KernelDescriptor;
struct DispatchKernelEncoderI {
public:
virtual bool hasBarriers() = 0;
virtual uint32_t getSlmTotalSize() = 0;
virtual uint32_t getBindingTableOffset() = 0;
virtual uint32_t getBorderColor() = 0;
virtual uint32_t getSamplerTableOffset() = 0;
virtual uint32_t getNumSurfaceStates() = 0;
virtual uint32_t getNumSamplers() = 0;
virtual uint32_t getSimdSize() = 0;
virtual uint32_t getSizeCrossThreadData() = 0;
virtual uint32_t getPerThreadScratchSize() = 0;
virtual uint32_t getPerThreadExecutionMask() = 0;
virtual uint32_t getSizePerThreadData() = 0;
virtual uint32_t getSizePerThreadDataForWholeGroup() = 0;
virtual uint32_t getSizeSurfaceStateHeapData() = 0;
virtual uint32_t *getCountOffsets() = 0;
virtual uint32_t *getSizeOffsets() = 0;
virtual uint32_t *getLocalWorkSize() = 0;
virtual uint32_t getNumGrfRequired() = 0;
virtual uint32_t getThreadsPerThreadGroupCount() = 0;
virtual GraphicsAllocation *getIsaAllocation() = 0;
virtual bool hasGroupCounts() = 0;
virtual bool hasGroupSize() = 0;
virtual const void *getSurfaceStateHeap() = 0;
virtual const void *getDynamicStateHeap() = 0;
virtual const void *getCrossThread() = 0;
virtual const void *getPerThread() = 0;
virtual bool isInlineDataRequired() = 0;
virtual uint8_t getNumLocalIdChannels() = 0;
virtual ~DispatchKernelEncoderI() = default;
protected:
uint32_t groupCountOffsets[3] = {};
uint32_t groupSizeOffsets[3] = {};
uint32_t localWorkSize[3] = {};
virtual const KernelDescriptor &getKernelDescriptor() const = 0;
virtual const uint32_t *getGroupSize() const = 0;
virtual uint32_t getSlmTotalSize() const = 0;
virtual const uint8_t *getCrossThreadData() const = 0;
virtual uint32_t getCrossThreadDataSize() const = 0;
virtual uint32_t getThreadExecutionMask() const = 0;
virtual uint32_t getNumThreadsPerThreadGroup() const = 0;
virtual const uint8_t *getPerThreadData() const = 0;
virtual uint32_t getPerThreadDataSize() const = 0;
virtual uint32_t getPerThreadDataSizeForWholeThreadGroup() const = 0;
virtual const uint8_t *getSurfaceStateHeapData() const = 0;
virtual uint32_t getSurfaceStateHeapDataSize() const = 0;
virtual GraphicsAllocation *getIsaAllocation() const = 0;
virtual const uint8_t *getDynamicStateHeapData() const = 0;
};
} // namespace NEO

View File

@ -55,7 +55,7 @@ void populateKernelDescriptor(KernelDescriptor &dst, const SPatchExecutionEnviro
dst.kernelAttributes.flags.requiresSubgroupIndependentForwardProgress = (0 != execEnv.SubgroupIndependentForwardProgressRequired);
dst.kernelAttributes.numGrfRequired = execEnv.NumGRFRequired;
dst.kernelAttributes.flags.useGlobalAtomics = execEnv.HasGlobalAtomics;
dst.kernelAttributes.flags.usesStatelessWrites = 0U;
dst.kernelAttributes.flags.usesStatelessWrites = (execEnv.StatelessWritesCount > 0U);
}
void populateKernelDescriptor(KernelDescriptor &dst, const SPatchSamplerStateArray &token) {
@ -85,7 +85,6 @@ void populateKernelDescriptor(KernelDescriptor &dst, const SPatchInterfaceDescri
void populateKernelDescriptor(KernelDescriptor &dst, const SPatchThreadPayload &token) {
dst.kernelAttributes.flags.perThreadDataHeaderIsPresent = (0U != token.HeaderPresent);
dst.kernelAttributes.numLocalIdChannels = token.LocalIDXPresent + token.LocalIDYPresent + token.LocalIDZPresent;
;
dst.kernelAttributes.flags.usesFlattenedLocalIds = (0U != token.LocalIDFlattenedPresent);
dst.kernelAttributes.flags.perThreadDataUnusedGrfIsPresent = (0U != token.UnusedPerThreadConstantPresent);
dst.kernelAttributes.flags.passInlineData = (0 != token.PassInlineData);

View File

@ -92,7 +92,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenSlmTotalSizeEqualZeroW
EXPECT_EQ(expectedValue, interfaceDescriptorData->getSharedLocalMemorySize());
}
HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givennumBindingTableOneWhenDispatchingKernelThenBindingTableOffsetIsCorrect) {
HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenOneBindingTableEntryWhenDispatchingKernelThenBindingTableOffsetIsCorrect) {
using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE;
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
uint32_t numBindingTable = 1;
@ -107,10 +107,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givennumBindingTableOneWhen
uint32_t dims[] = {2, 1, 1};
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
EXPECT_CALL(*dispatchInterface.get(), getNumSurfaceStates()).WillRepeatedly(::testing::Return(numBindingTable));
EXPECT_CALL(*dispatchInterface.get(), getSurfaceStateHeap()).WillRepeatedly(::testing::Return(&bindingTableState));
EXPECT_CALL(*dispatchInterface.get(), getSizeSurfaceStateHeapData()).WillRepeatedly(::testing::Return(static_cast<uint32_t>(sizeof(BINDING_TABLE_STATE))));
EXPECT_CALL(*dispatchInterface.get(), getBindingTableOffset()).WillRepeatedly(::testing::Return(0));
dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.numEntries = numBindingTable;
dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.tableOffset = 0U;
const uint8_t *sshData = reinterpret_cast<uint8_t *>(&bindingTableState);
EXPECT_CALL(*dispatchInterface.get(), getSurfaceStateHeapData()).WillRepeatedly(::testing::Return(sshData));
EXPECT_CALL(*dispatchInterface.get(), getSurfaceStateHeapDataSize()).WillRepeatedly(::testing::Return(static_cast<uint32_t>(sizeof(BINDING_TABLE_STATE))));
EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dims, false, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled);
auto interfaceDescriptorData = static_cast<INTERFACE_DESCRIPTOR_DATA *>(cmdContainer->getIddBlock());
@ -132,10 +133,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNumBindingTableZeroWhen
uint32_t dims[] = {2, 1, 1};
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
EXPECT_CALL(*dispatchInterface.get(), getNumSurfaceStates()).WillRepeatedly(::testing::Return(numBindingTable));
EXPECT_CALL(*dispatchInterface.get(), getSurfaceStateHeap()).WillRepeatedly(::testing::Return(&bindingTableState));
EXPECT_CALL(*dispatchInterface.get(), getSizeSurfaceStateHeapData()).WillRepeatedly(::testing::Return(static_cast<uint32_t>(sizeof(BINDING_TABLE_STATE))));
EXPECT_CALL(*dispatchInterface.get(), getBindingTableOffset()).WillRepeatedly(::testing::Return(0));
dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.numEntries = numBindingTable;
dispatchInterface->kernelDescriptor.payloadMappings.bindingTable.tableOffset = 0U;
const uint8_t *sshData = reinterpret_cast<uint8_t *>(&bindingTableState);
EXPECT_CALL(*dispatchInterface.get(), getSurfaceStateHeapData()).WillRepeatedly(::testing::Return(sshData));
EXPECT_CALL(*dispatchInterface.get(), getSurfaceStateHeapDataSize()).WillRepeatedly(::testing::Return(static_cast<uint32_t>(sizeof(BINDING_TABLE_STATE))));
EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dims, false, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled);
auto interfaceDescriptorData = static_cast<INTERFACE_DESCRIPTOR_DATA *>(cmdContainer->getIddBlock());
@ -156,10 +158,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNumSamplersOneWhenDispa
uint32_t dims[] = {2, 1, 1};
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
EXPECT_CALL(*dispatchInterface.get(), getNumSamplers()).WillRepeatedly(::testing::Return(numSamplers));
EXPECT_CALL(*dispatchInterface.get(), getSamplerTableOffset()).WillRepeatedly(::testing::Return(0));
EXPECT_CALL(*dispatchInterface.get(), getBorderColor()).WillRepeatedly(::testing::Return(0));
EXPECT_CALL(*dispatchInterface.get(), getDynamicStateHeap()).WillRepeatedly(::testing::Return(&samplerState));
dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.numSamplers = numSamplers;
dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.tableOffset = 0U;
dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.borderColor = 0U;
const uint8_t *dshData = reinterpret_cast<uint8_t *>(&samplerState);
EXPECT_CALL(*dispatchInterface.get(), getDynamicStateHeapData()).WillRepeatedly(::testing::Return(dshData));
EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dims, false, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled);
auto interfaceDescriptorData = static_cast<INTERFACE_DESCRIPTOR_DATA *>(cmdContainer->getIddBlock());
@ -186,10 +189,11 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNumSamplersZeroWhenDisp
uint32_t dims[] = {2, 1, 1};
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
EXPECT_CALL(*dispatchInterface.get(), getNumSamplers()).WillRepeatedly(::testing::Return(numSamplers));
EXPECT_CALL(*dispatchInterface.get(), getSamplerTableOffset()).WillRepeatedly(::testing::Return(0));
EXPECT_CALL(*dispatchInterface.get(), getBorderColor()).WillRepeatedly(::testing::Return(0));
EXPECT_CALL(*dispatchInterface.get(), getDynamicStateHeap()).WillRepeatedly(::testing::Return(&samplerState));
dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.numSamplers = numSamplers;
dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.tableOffset = 0U;
dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.borderColor = 0U;
const uint8_t *dshData = reinterpret_cast<uint8_t *>(&samplerState);
EXPECT_CALL(*dispatchInterface.get(), getDynamicStateHeapData()).WillRepeatedly(::testing::Return(dshData));
EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dims, false, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled);
auto interfaceDescriptorData = static_cast<INTERFACE_DESCRIPTOR_DATA *>(cmdContainer->getIddBlock());
@ -203,16 +207,14 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNumSamplersZeroWhenDisp
EXPECT_NE(memcmp(pSmplr, &samplerState, sizeof(SAMPLER_STATE)), 0);
}
HWTEST_F(CommandEncodeStatesTest, givenIndarectOffsetsCountsWhenDispatchingKernelThenCorrestMIStoreOffsetsSet) {
HWTEST_F(CommandEncodeStatesTest, givenIndirectOffsetsCountsWhenDispatchingKernelThenCorrestMIStoreOffsetsSet) {
using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
uint32_t dims[] = {2, 1, 1};
uint32_t offsets[] = {0x10, 0x20, 0x30};
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
EXPECT_CALL(*dispatchInterface.get(), hasGroupCounts()).WillRepeatedly(::testing::Return(true));
EXPECT_CALL(*dispatchInterface.get(), getCountOffsets()).WillRepeatedly(::testing::Return(offsets));
EXPECT_CALL(*dispatchInterface.get(), hasGroupSize()).WillRepeatedly(::testing::Return(false));
dispatchInterface->kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = offsets[0];
dispatchInterface->kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = offsets[1];
dispatchInterface->kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = offsets[2];
EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dims, true, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled);
GenCmdList commands;
@ -233,11 +235,10 @@ HWTEST_F(CommandEncodeStatesTest, givenIndarectOffsetsSizeWhenDispatchingKernelT
uint32_t lws[] = {1, 1, 1};
std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());
EXPECT_CALL(*dispatchInterface.get(), hasGroupCounts()).WillRepeatedly(::testing::Return(false));
EXPECT_CALL(*dispatchInterface.get(), getSizeOffsets()).WillRepeatedly(::testing::Return(offsets));
EXPECT_CALL(*dispatchInterface.get(), hasGroupSize()).WillRepeatedly(::testing::Return(true));
EXPECT_CALL(*dispatchInterface.get(), getLocalWorkSize()).WillRepeatedly(::testing::Return(lws));
EXPECT_CALL(*dispatchInterface.get(), getGroupSize()).WillRepeatedly(::testing::Return(lws));
dispatchInterface->kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = offsets[0];
dispatchInterface->kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[1] = offsets[1];
dispatchInterface->kernelDescriptor.payloadMappings.dispatchTraits.globalWorkSize[2] = offsets[2];
EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dims, true, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled);
GenCmdList commands;

View File

@ -185,11 +185,11 @@ HWTEST_F(CommandEncoderMathTest, setGroupSizeIndirect) {
CommandContainer cmdContainer;
cmdContainer.initialize(pDevice);
uint32_t offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)};
CrossThreadDataOffset offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)};
uint32_t crossThreadAdress[3] = {};
uint32_t lws[3] = {2, 1, 1};
EncodeIndirectParams<FamilyType>::setGroupSizeIndirect(cmdContainer, offsets, crossThreadAdress, lws);
EncodeIndirectParams<FamilyType>::setGlobalWorkSizeIndirect(cmdContainer, offsets, crossThreadAdress, lws);
GenCmdList commands;
CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed());
@ -211,7 +211,7 @@ HWTEST_F(CommandEncoderMathTest, setGroupCountIndirect) {
CommandContainer cmdContainer;
cmdContainer.initialize(pDevice);
uint32_t offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)};
CrossThreadDataOffset offsets[3] = {0, sizeof(uint32_t), 2 * sizeof(uint32_t)};
uint32_t crossThreadAdress[3] = {};
EncodeIndirectParams<FamilyType>::setGroupCountIndirect(cmdContainer, offsets, crossThreadAdress);

View File

@ -129,6 +129,9 @@ TEST(KernelDescriptorFromPatchtokens, GivenExecutionEnvironmentThenSetsProperPar
EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.useGlobalAtomics);
EXPECT_FALSE(kernelDescriptor.kernelAttributes.flags.usesStatelessWrites);
execEnv.StatelessWritesCount = 1U;
NEO::populateKernelDescriptor(kernelDescriptor, kernelTokens, 4);
EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.usesStatelessWrites);
}
TEST(KernelDescriptorFromPatchtokens, GivenThreadPayloadThenSetsProperPartsOfDescriptor) {

View File

@ -12,34 +12,28 @@ using namespace NEO;
using ::testing::Return;
MockDispatchKernelEncoder::MockDispatchKernelEncoder() {
EXPECT_CALL(*this, getIsaAllocation).WillRepeatedly(Return(&mockAllocation));
EXPECT_CALL(*this, getSizeCrossThreadData).WillRepeatedly(Return(crossThreadSize));
EXPECT_CALL(*this, getSizePerThreadData).WillRepeatedly(Return(perThreadSize));
EXPECT_CALL(*this, getKernelDescriptor).WillRepeatedly(::testing::ReturnRef(kernelDescriptor));
EXPECT_CALL(*this, getIsaAllocation).WillRepeatedly(Return(&mockAllocation));
EXPECT_CALL(*this, getCrossThreadDataSize).WillRepeatedly(Return(crossThreadSize));
EXPECT_CALL(*this, getPerThreadDataSize).WillRepeatedly(Return(perThreadSize));
EXPECT_CALL(*this, getCrossThreadData).WillRepeatedly(Return(dataCrossThread));
EXPECT_CALL(*this, getPerThreadData).WillRepeatedly(Return(dataPerThread));
EXPECT_CALL(*this, getCrossThread).WillRepeatedly(Return(&dataCrossThread));
EXPECT_CALL(*this, getPerThread).WillRepeatedly(Return(&dataPerThread));
expectAnyMockFunctionCall();
}
void MockDispatchKernelEncoder::expectAnyMockFunctionCall() {
EXPECT_CALL(*this, hasBarriers()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getGroupSize()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getSlmTotalSize()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getBindingTableOffset()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getBorderColor()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getSamplerTableOffset()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getNumSurfaceStates()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getNumSamplers()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getSimdSize()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getPerThreadScratchSize()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getPerThreadExecutionMask()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getSizePerThreadDataForWholeGroup()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getSizeSurfaceStateHeapData()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getCountOffsets()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getSizeOffsets()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getLocalWorkSize()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getNumGrfRequired()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getThreadsPerThreadGroupCount()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, hasGroupCounts()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getSurfaceStateHeap()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getDynamicStateHeap()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, isInlineDataRequired()).Times(::testing::AnyNumber());
}
EXPECT_CALL(*this, getThreadExecutionMask()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getNumThreadsPerThreadGroup()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getPerThreadDataSizeForWholeThreadGroup()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getSurfaceStateHeapData()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getSurfaceStateHeapDataSize()).Times(::testing::AnyNumber());
EXPECT_CALL(*this, getDynamicStateHeapData()).Times(::testing::AnyNumber());
}

View File

@ -7,6 +7,7 @@
#pragma once
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "opencl/test/unit_test/mocks/mock_graphics_allocation.h"
@ -20,34 +21,24 @@ class GraphicsAllocation;
struct MockDispatchKernelEncoder : public DispatchKernelEncoderI {
public:
MockDispatchKernelEncoder();
MOCK_METHOD0(hasBarriers, bool());
MOCK_METHOD0(getSlmTotalSize, uint32_t());
MOCK_METHOD0(getBindingTableOffset, uint32_t());
MOCK_METHOD0(getBorderColor, uint32_t());
MOCK_METHOD0(getSamplerTableOffset, uint32_t());
MOCK_METHOD0(getNumSurfaceStates, uint32_t());
MOCK_METHOD0(getNumSamplers, uint32_t());
MOCK_METHOD0(getSimdSize, uint32_t());
MOCK_METHOD0(getSizeCrossThreadData, uint32_t());
MOCK_METHOD0(getPerThreadScratchSize, uint32_t());
MOCK_METHOD0(getPerThreadExecutionMask, uint32_t());
MOCK_METHOD0(getSizePerThreadData, uint32_t());
MOCK_METHOD0(getSizePerThreadDataForWholeGroup, uint32_t());
MOCK_METHOD0(getSizeSurfaceStateHeapData, uint32_t());
MOCK_METHOD0(getCountOffsets, uint32_t *());
MOCK_METHOD0(getSizeOffsets, uint32_t *());
MOCK_METHOD0(getLocalWorkSize, uint32_t *());
MOCK_METHOD0(getNumGrfRequired, uint32_t());
MOCK_METHOD0(getThreadsPerThreadGroupCount, uint32_t());
MOCK_METHOD0(getIsaAllocation, GraphicsAllocation *());
MOCK_METHOD0(hasGroupCounts, bool());
MOCK_METHOD0(hasGroupSize, bool());
MOCK_METHOD0(getSurfaceStateHeap, const void *());
MOCK_METHOD0(getDynamicStateHeap, const void *());
MOCK_METHOD0(getCrossThread, const void *());
MOCK_METHOD0(getPerThread, const void *());
MOCK_METHOD0(isInlineDataRequired, bool());
MOCK_METHOD0(getNumLocalIdChannels, uint8_t());
MOCK_CONST_METHOD0(getKernelDescriptor, const KernelDescriptor &());
MOCK_CONST_METHOD0(getGroupSize, const uint32_t *());
MOCK_CONST_METHOD0(getSlmTotalSize, uint32_t());
MOCK_CONST_METHOD0(getCrossThreadData, const uint8_t *());
MOCK_CONST_METHOD0(getCrossThreadDataSize, uint32_t());
MOCK_CONST_METHOD0(getThreadExecutionMask, uint32_t());
MOCK_CONST_METHOD0(getNumThreadsPerThreadGroup, uint32_t());
MOCK_CONST_METHOD0(getPerThreadData, const uint8_t *());
MOCK_CONST_METHOD0(getPerThreadDataSize, uint32_t());
MOCK_CONST_METHOD0(getPerThreadDataSizeForWholeThreadGroup, uint32_t());
MOCK_CONST_METHOD0(getSurfaceStateHeapData, const uint8_t *());
MOCK_CONST_METHOD0(getSurfaceStateHeapDataSize, uint32_t());
MOCK_CONST_METHOD0(getIsaAllocation, GraphicsAllocation *());
MOCK_CONST_METHOD0(getDynamicStateHeapData, const uint8_t *());
void expectAnyMockFunctionCall();
@ -56,5 +47,6 @@ struct MockDispatchKernelEncoder : public DispatchKernelEncoderI {
static constexpr uint32_t perThreadSize = 0x20;
uint8_t dataCrossThread[crossThreadSize];
uint8_t dataPerThread[perThreadSize];
KernelDescriptor kernelDescriptor;
};
} // namespace NEO