Workgroup walk order

Change-Id: Id02db6a383e21dc17be64655e7f51a84103b2e0b
This commit is contained in:
Chodor, Jaroslaw
2018-08-06 11:35:59 +02:00
committed by sys_ocldev
parent dfd331c568
commit c10d0d79f5
13 changed files with 235 additions and 37 deletions

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -24,6 +24,8 @@
#include "runtime/helpers/aligned_memory.h"
#include "runtime/utilities/cpu_info.h"
#include <array>
namespace OCLRT {
struct uint16x8_t;
@@ -38,9 +40,9 @@ const uint16_t initialLocalID[] = {
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
// Lookup table for generating LocalIDs based on the SIMD width of the kernel.
// Each entry is statically bound to the uint16x8_t instantiation of
// generateLocalIDsSimd for its SIMD width (8, 16 or 32).
void (*LocalIDHelper::generateSimd8)(void *buffer, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup) = generateLocalIDsSimd<uint16x8_t, 8>;
void (*LocalIDHelper::generateSimd16)(void *buffer, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup) = generateLocalIDsSimd<uint16x8_t, 32>;
void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder) = generateLocalIDsSimd<uint16x8_t, 8>;
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder) = generateLocalIDsSimd<uint16x8_t, 32>;
// Initialize the lookup table based on CPU capabilities
LocalIDHelper::LocalIDHelper() {
@@ -55,14 +57,14 @@ LocalIDHelper::LocalIDHelper() {
LocalIDHelper LocalIDHelper::initializer;
// Traditional entry point for generating local IDs: computes the number of
// threads per workgroup via getThreadsPerWG() from the total work-item count
// and forwards to the LocalIDHelper generator selected by `simd`
// (32, 16, or the SIMD8 generator for any other value).
void generateLocalIDs(void *buffer, uint32_t simd, size_t lwsX, size_t lwsY, size_t lwsZ) {
auto threadsPerWorkGroup = getThreadsPerWG(simd, lwsX * lwsY * lwsZ);
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
if (simd == 32) {
LocalIDHelper::generateSimd32(buffer, lwsX, lwsY, threadsPerWorkGroup);
LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
} else if (simd == 16) {
LocalIDHelper::generateSimd16(buffer, lwsX, lwsY, threadsPerWorkGroup);
LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
} else {
// every SIMD width other than 32 and 16 is routed to the SIMD8 generator
LocalIDHelper::generateSimd8(buffer, lwsX, lwsY, threadsPerWorkGroup);
LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
}
}
} // namespace OCLRT

View File

@@ -22,10 +22,12 @@
#pragma once
#include <cstdint>
#include <algorithm>
#include "runtime/helpers/ptr_math.h"
#include <algorithm>
#include <array>
#include <cstdint>
namespace OCLRT {
union GRF {
float fRegs[8];
@@ -60,9 +62,9 @@ inline size_t getPerThreadSizeLocalIDs(uint32_t simd, uint32_t numChannels = 3)
}
struct LocalIDHelper {
static void (*generateSimd8)(void *buffer, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
static void (*generateSimd16)(void *buffer, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
static void (*generateSimd32)(void *buffer, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
static void (*generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
static void (*generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
static void (*generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
static LocalIDHelper initializer;
@@ -73,7 +75,10 @@ struct LocalIDHelper {
extern const uint16_t initialLocalID[];
// SIMD-width specific local ID generator, parameterized by the vector type
// (Vec) used for the computation and by the SIMD width of the kernel.
template <typename Vec, int simd>
void generateLocalIDsSimd(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup,
const std::array<uint8_t, 3> &dimensionsOrder);
// Generic entry point: generates local IDs into `buffer` for a kernel of the
// given SIMD width and local workgroup size.
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
const std::array<uint8_t, 3> &dimensionsOrder);
void generateLocalIDs(void *buffer, uint32_t simd, size_t lwsX, size_t lwsY, size_t lwsZ);
} // namespace OCLRT

View File

@@ -22,15 +22,22 @@
#include "runtime/command_queue/local_id_gen.h"
#include <array>
namespace OCLRT {
template <typename Vec, int simd>
inline void generateLocalIDsSimd(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup) {
inline void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup,
const std::array<uint8_t, 3> &dimensionsOrder) {
const int passes = simd / Vec::numChannels;
int pass = 0;
const Vec vLwsX(static_cast<uint16_t>(lwsX));
const Vec vLwsY(static_cast<uint16_t>(lwsY));
uint32_t xDimNum = dimensionsOrder[0];
uint32_t yDimNum = dimensionsOrder[1];
uint32_t zDimNum = dimensionsOrder[2];
const Vec vLwsX(localWorkgroupSize[xDimNum]);
const Vec vLwsY(localWorkgroupSize[yDimNum]);
auto zero = Vec::zero();
auto one = Vec::one();
@@ -113,9 +120,9 @@ inline void generateLocalIDsSimd(void *b, size_t lwsX, size_t lwsY, size_t threa
} while (xWrap);
for (size_t i = 0; i < threadsPerWorkGroup; ++i) {
x.store(buffer);
y.store(ptrOffset(buffer, threadSkipSize));
z.store(ptrOffset(buffer, 2 * threadSkipSize));
x.store(ptrOffset(buffer, xDimNum * threadSkipSize));
y.store(ptrOffset(buffer, yDimNum * threadSkipSize));
z.store(ptrOffset(buffer, zDimNum * threadSkipSize));
x += vSimdX;
y += vSimdY;

View File

@@ -24,8 +24,10 @@
#include "runtime/command_queue/local_id_gen.inl"
#include "runtime/helpers/uint16_avx2.h"
#include <array>
namespace OCLRT {
// Explicit instantiations of generateLocalIDsSimd for the uint16x16_t vector
// type (SIMD32 and SIMD16 kernels).
template void generateLocalIDsSimd<uint16x16_t, 32>(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
template void generateLocalIDsSimd<uint16x16_t, 16>(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
template void generateLocalIDsSimd<uint16x16_t, 32>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
template void generateLocalIDsSimd<uint16x16_t, 16>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
} // namespace OCLRT
#endif

View File

@@ -23,8 +23,10 @@
#include "runtime/command_queue/local_id_gen.inl"
#include "runtime/helpers/uint16_sse4.h"
#include <array>
namespace OCLRT {
// Explicit instantiations of generateLocalIDsSimd for the uint16x8_t vector
// type (SIMD32, SIMD16 and SIMD8 kernels).
template void generateLocalIDsSimd<uint16x8_t, 32>(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
template void generateLocalIDsSimd<uint16x8_t, 16>(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
template void generateLocalIDsSimd<uint16x8_t, 8>(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
template void generateLocalIDsSimd<uint16x8_t, 32>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
template void generateLocalIDsSimd<uint16x8_t, 16>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
template void generateLocalIDsSimd<uint16x8_t, 8>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
} // namespace OCLRT

View File

@@ -371,7 +371,8 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
ioh,
simd,
numChannels,
localWorkSize);
localWorkSize,
kernel.getKernelInfo().workgroupDimensionsOrder);
// send interface descriptor data
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];

View File

@@ -24,13 +24,16 @@
#include "runtime/helpers/debug_helpers.h"
#include "runtime/helpers/per_thread_data.h"
#include <array>
namespace OCLRT {
size_t PerThreadDataHelper::sendPerThreadData(
LinearStream &indirectHeap,
uint32_t simd,
uint32_t numChannels,
const size_t localWorkSizes[3]) {
const size_t localWorkSizes[3],
const std::array<uint8_t, 3> &workgroupWalkOrder) {
auto offsetPerThreadData = indirectHeap.getUsed();
if (numChannels) {
auto localWorkSize = localWorkSizes[0] * localWorkSizes[1] * localWorkSizes[2];
@@ -39,7 +42,11 @@ size_t PerThreadDataHelper::sendPerThreadData(
// Generate local IDs
DEBUG_BREAK_IF(numChannels != 3);
generateLocalIDs(pDest, simd, localWorkSizes[0], localWorkSizes[1], localWorkSizes[2]);
generateLocalIDs(pDest, static_cast<uint16_t>(simd),
std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizes[0]),
static_cast<uint16_t>(localWorkSizes[1]),
static_cast<uint16_t>(localWorkSizes[2])}},
std::array<uint8_t, 3>{{workgroupWalkOrder[0], workgroupWalkOrder[1], workgroupWalkOrder[2]}});
}
return offsetPerThreadData;
}

View File

@@ -21,6 +21,7 @@
*/
#pragma once
#include <array>
#include <cstdint>
#include <cstddef>
#include "runtime/command_queue/local_id_gen.h"
@@ -47,7 +48,16 @@ struct PerThreadDataHelper {
LinearStream &indirectHeap,
uint32_t simd,
uint32_t numChannels,
const size_t localWorkSizes[3]);
const size_t localWorkSizes[3],
const std::array<uint8_t, 3> &workgroupWalkOrder);
// Convenience overload for callers that do not care about the walk order:
// forwards to the five-argument sendPerThreadData() with the identity
// order {0, 1, 2} and returns whatever that overload returns.
static size_t sendPerThreadData(
    LinearStream &indirectHeap,
    uint32_t simd,
    uint32_t numChannels,
    const size_t localWorkSizes[3]) {
    // Double braces keep this initializer consistent with the other
    // std::array initializers in the code base and avoid missing-braces warnings.
    return sendPerThreadData(indirectHeap, simd, numChannels, localWorkSizes, std::array<uint8_t, 3>{{0, 1, 2}});
}
static inline uint32_t getNumLocalIdChannels(const iOpenCL::SPatchThreadPayload &threadPayload) {
return threadPayload.LocalIDXPresent +

View File

@@ -26,13 +26,15 @@
#include "kernel_arg_info.h"
#include "patch_info.h"
#include "runtime/helpers/hw_info.h"
#include <algorithm>
#include <array>
#include <cstdint>
#include <cmath>
#include <vector>
#include <map>
#include <string>
#include <unordered_map>
#include <map>
#include <vector>
namespace OCLRT {
class BuiltinDispatchInfoBuilder;
@@ -243,6 +245,8 @@ struct KernelInfo {
char *crossThreadData = nullptr;
size_t reqdWorkGroupSize[3];
size_t requiredSubGroupSize = 0;
std::array<uint8_t, 3> workgroupWalkOrder = {{0, 1, 2}};
std::array<uint8_t, 3> workgroupDimensionsOrder = {{0, 1, 2}};
uint32_t gpuPointerSize = 0;
const BuiltinDispatchInfoBuilder *builtinDispatchBuilder = nullptr;
uint32_t argumentsToPatchNum = 0;

View File

@@ -516,6 +516,15 @@ cl_int Program::parsePatchList(KernelInfo &kernelInfo) {
DEBUG_BREAK_IF(!(kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeY > 0));
DEBUG_BREAK_IF(!(kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeZ > 0));
}
kernelInfo.workgroupWalkOrder[0] = 0;
kernelInfo.workgroupWalkOrder[1] = 1;
kernelInfo.workgroupWalkOrder[2] = 2;
for (uint32_t i = 0; i < 3; ++i) {
// inverts the walk order mapping (from ORDER_ID->DIM_ID to DIM_ID->ORDER_ID)
kernelInfo.workgroupDimensionsOrder[kernelInfo.workgroupWalkOrder[i]] = i;
}
if (kernelInfo.patchInfo.executionEnvironment->CompiledForGreaterThan4GBBuffers == false) {
kernelInfo.requiresSshForBuffers = true;
}
@@ -537,7 +546,10 @@ cl_int Program::parsePatchList(KernelInfo &kernelInfo) {
"\n .UsesFencesForReadWriteImages", kernelInfo.patchInfo.executionEnvironment->UsesFencesForReadWriteImages,
"\n .UsesStatelessSpillFill", kernelInfo.patchInfo.executionEnvironment->UsesStatelessSpillFill,
"\n .IsCoherent", kernelInfo.patchInfo.executionEnvironment->IsCoherent,
"\n .SubgroupIndependentForwardProgressRequired", kernelInfo.patchInfo.executionEnvironment->SubgroupIndependentForwardProgressRequired);
"\n .SubgroupIndependentForwardProgressRequired", kernelInfo.patchInfo.executionEnvironment->SubgroupIndependentForwardProgressRequired,
"\n .WorkgroupWalkOrderDim0", kernelInfo.workgroupWalkOrder[0],
"\n .WorkgroupWalkOrderDim1", kernelInfo.workgroupWalkOrder[1],
"\n .WorkgroupWalkOrderDim2", kernelInfo.workgroupWalkOrder[2]);
break;
case PATCH_TOKEN_DATA_PARAMETER_STREAM:

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -172,6 +172,42 @@ struct LocalIDFixture : public ::testing::TestWithParam<std::tuple<int, int, int
}
}
void validateWalkOrder(uint32_t simd, uint32_t localWorkgroupSizeX, uint32_t localWorkgroupSizeY, uint32_t localWorkgroupSizeZ,
const std::array<uint8_t, 3> &dimensionsOrder) {
std::array<uint8_t, 3> walkOrder = {};
for (uint32_t i = 0; i < 3; ++i) {
// inverts the walk order mapping (from DIM_ID->ORDER_ID to ORDER_ID->DIM_ID)
walkOrder[dimensionsOrder[i]] = i;
}
auto skipPerThread = simd == 32 ? 32 : 16;
auto pBufferX = buffer;
auto pBufferY = pBufferX + skipPerThread;
auto pBufferZ = pBufferY + skipPerThread;
decltype(pBufferX) ids[] = {pBufferX, pBufferY, pBufferZ};
uint32_t sizes[] = {localWorkgroupSizeX, localWorkgroupSizeY, localWorkgroupSizeZ};
uint32_t flattenedId = 0;
for (uint32_t id2 = 0; id2 < sizes[walkOrder[2]]; ++id2) {
for (uint32_t id1 = 0; id1 < sizes[walkOrder[1]]; ++id1) {
for (uint32_t id0 = 0; id0 < sizes[walkOrder[0]]; ++id0) {
uint32_t threadId = flattenedId / simd;
uint32_t channelId = flattenedId % simd;
uint16_t foundId0 = ids[walkOrder[0]][channelId + threadId * skipPerThread * 3];
uint16_t foundId1 = ids[walkOrder[1]][channelId + threadId * skipPerThread * 3];
uint16_t foundId2 = ids[walkOrder[2]][channelId + threadId * skipPerThread * 3];
if ((id0 != foundId0) || (id1 != foundId1) || (id2 != foundId2)) {
EXPECT_EQ(id0, foundId0) << simd << " X @ (" << id0 << ", " << id1 << ", " << id2 << ") - flat " << flattenedId;
EXPECT_EQ(id1, foundId1) << simd << " Y @ (" << id0 << ", " << id1 << ", " << id2 << ") - flat " << flattenedId;
EXPECT_EQ(id2, foundId2) << simd << " Z @ (" << id0 << ", " << id1 << ", " << id2 << ") - flat " << flattenedId;
}
++flattenedId;
}
}
}
}
void dumpBuffer(uint32_t simd, uint32_t lwsX, uint32_t lwsY, uint32_t lwsZ) {
auto workSize = lwsX * lwsY * lwsZ;
auto threads = (workSize + simd - 1) / simd;
@@ -211,15 +247,41 @@ struct LocalIDFixture : public ::testing::TestWithParam<std::tuple<int, int, int
};
// Generates local IDs with the identity dimensions order {0, 1, 2} and runs
// the fixture's validateIDWithinLimits() check on the result.
TEST_P(LocalIDFixture, checkIDWithinLimits) {
generateLocalIDs(buffer, simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
std::array<uint8_t, 3>{{0, 1, 2}});
validateIDWithinLimits(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
}
// Generates local IDs with the identity dimensions order {0, 1, 2} and runs
// the fixture's validateAllWorkItemsCovered() check on the result.
TEST_P(LocalIDFixture, checkAllWorkItemsCovered) {
generateLocalIDs(buffer, simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
std::array<uint8_t, 3>{{0, 1, 2}});
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
}
// Identity order {0, 1, 2}: dimension i is walked at position i (X, Y, Z).
TEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) {
    const std::array<uint8_t, 3> dimensionsOrder{{0, 1, 2}};
    const std::array<uint16_t, 3> localWorkgroupSize{{static_cast<uint16_t>(localWorkSizeX),
                                                      static_cast<uint16_t>(localWorkSizeY),
                                                      static_cast<uint16_t>(localWorkSizeZ)}};

    generateLocalIDs(buffer, simd, localWorkgroupSize, dimensionsOrder);

    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
    validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
}
// Order {1, 0, 2}: X gets walk position 1 and Y position 0, i.e. the
// workgroup is walked Y first, then X, then Z.
TEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) {
    const std::array<uint8_t, 3> dimensionsOrder{{1, 0, 2}};
    const std::array<uint16_t, 3> localWorkgroupSize{{static_cast<uint16_t>(localWorkSizeX),
                                                      static_cast<uint16_t>(localWorkSizeY),
                                                      static_cast<uint16_t>(localWorkSizeZ)}};

    generateLocalIDs(buffer, simd, localWorkgroupSize, dimensionsOrder);

    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
    validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
}
// Order {2, 1, 0}: X gets walk position 2 and Z position 0, i.e. the
// workgroup is walked Z first, then Y, then X.
TEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
    const std::array<uint8_t, 3> dimensionsOrder{{2, 1, 0}};
    const std::array<uint16_t, 3> localWorkgroupSize{{static_cast<uint16_t>(localWorkSizeX),
                                                      static_cast<uint16_t>(localWorkSizeY),
                                                      static_cast<uint16_t>(localWorkSizeZ)}};

    generateLocalIDs(buffer, simd, localWorkgroupSize, dimensionsOrder);

    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
    validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
}
TEST_P(LocalIDFixture, sizeCalculationLocalIDs) {
auto workItems = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
auto sizeTotalPerThreadData = getThreadsPerWG(simd, workItems) * getPerThreadSizeLocalIDs(simd);

View File

@@ -347,6 +347,74 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, sendIndirectStateResourceUsage)
EXPECT_GE(KernelCommandsHelper<FamilyType>::getSizeRequiredCS(), usedAfterCS - usedBeforeCS);
}
// Verifies that sendIndirectState() places local IDs in the indirect object
// heap that were generated with the kernel's workgroupDimensionsOrder: the
// heap content is compared with IDs generated directly via generateLocalIDs()
// for the same (non-default) order.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, whenSendingIndirectStateThenKernelsWalkOrderIsTakenIntoAccount) {
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);
std::unique_ptr<Image> img(Image2dHelper<>::create(pContext));
// Build a built-in (CopyImageToImage3d) dispatch only to obtain a real kernel
// whose patch info can be reused below.
MultiDispatchInfo multiDispatchInfo;
auto &builder = BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
cmdQ.getContext(), cmdQ.getDevice());
BuiltinDispatchInfoBuilder::BuiltinOpParams dc;
dc.srcMemObj = img.get();
dc.dstMemObj = img.get();
dc.size = {1, 1, 1};
builder.buildDispatchInfos(multiDispatchInfo, dc);
ASSERT_NE(0u, multiDispatchInfo.size());
auto kernel = multiDispatchInfo.begin()->getKernel();
ASSERT_NE(nullptr, kernel);
// Distinct sizes per dimension so that a wrong walk order changes the IDs.
const size_t localWorkSizeX = 2;
const size_t localWorkSizeY = 3;
const size_t localWorkSizeZ = 4;
const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ};
auto &commandStream = cmdQ.getCS();
auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);
// Reserve room in the DSH for the interface descriptor that
// sendIndirectState() is pointed at through IDToffset.
dsh.align(KernelCommandsHelper<FamilyType>::alignInterfaceDescriptorData);
size_t IDToffset = dsh.getUsed();
dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));
// Kernel info with a reversed walk order; {2, 1, 0} is its own inverse, so
// workgroupWalkOrder and workgroupDimensionsOrder hold the same values.
KernelInfo modifiedKernelInfo = {};
modifiedKernelInfo.patchInfo = kernel->getKernelInfo().patchInfo;
modifiedKernelInfo.workgroupWalkOrder[0] = 2;
modifiedKernelInfo.workgroupWalkOrder[1] = 1;
modifiedKernelInfo.workgroupWalkOrder[2] = 0;
modifiedKernelInfo.workgroupDimensionsOrder[0] = 2;
modifiedKernelInfo.workgroupDimensionsOrder[1] = 1;
modifiedKernelInfo.workgroupDimensionsOrder[2] = 0;
MockKernel mockKernel{kernel->getProgram(), modifiedKernelInfo, kernel->getDevice(), false};
KernelCommandsHelper<FamilyType>::sendIndirectState(
commandStream,
dsh,
ioh,
ssh,
mockKernel,
modifiedKernelInfo.getMaxSimdSize(),
localWorkSizes,
IDToffset,
0,
pDevice->getPreemptionMode(),
nullptr);
// Expected size of the local IDs: threads = ceil(work items / simd), and per
// thread 3 channels of uint16_t with 32 entries for SIMD32, 16 otherwise.
size_t numThreads = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
numThreads = (numThreads + modifiedKernelInfo.getMaxSimdSize() - 1) / modifiedKernelInfo.getMaxSimdSize();
size_t expectedIohSize = ((modifiedKernelInfo.getMaxSimdSize() == 32) ? 32 : 16) * 3 * numThreads * sizeof(uint16_t);
ASSERT_LE(expectedIohSize, ioh.getUsed());
auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
generateLocalIDs(expectedLocalIds, modifiedKernelInfo.getMaxSimdSize(),
std::array<uint16_t, 3>{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}},
std::array<uint8_t, 3>{{modifiedKernelInfo.workgroupDimensionsOrder[0], modifiedKernelInfo.workgroupDimensionsOrder[1], modifiedKernelInfo.workgroupDimensionsOrder[2]}});
// The comparison starts at the base of the IOH.
// NOTE(review): this assumes the local IDs are the first data written to the
// heap by sendIndirectState() for this mock kernel - confirm.
EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
alignedFree(expectedLocalIds);
}
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointer) {
typedef typename FamilyType::BINDING_TABLE_STATE BINDING_TABLE_STATE;
typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;

View File

@@ -440,6 +440,22 @@ TEST_F(KernelDataTest, KernelAttributesInfo) {
EXPECT_EQ_CONST(PATCH_TOKEN_KERNEL_ATTRIBUTES_INFO, pKernelInfo->patchInfo.pKernelAttributesInfo->Token);
}
// Decodes a kernel whose patch list consists of a single zero-initialized
// execution-environment token and expects both order arrays of the resulting
// KernelInfo to be the identity {0, 1, 2}.
TEST_F(KernelDataTest, WhenDecodingExecutionEnvironmentTokenThenWalkOrderIsForcedToXMajor) {
    const std::array<uint8_t, 3> expectedWalkOrder = {{0, 1, 2}};
    const std::array<uint8_t, 3> expectedDimsIds = {{0, 1, 2}};

    iOpenCL::SPatchExecutionEnvironment executionEnvironment = {};
    executionEnvironment.Size = sizeof(executionEnvironment);
    executionEnvironment.Token = PATCH_TOKEN_EXECUTION_ENVIRONMENT;

    // The token is the whole patch list handed to the decoder.
    patchListSize = executionEnvironment.Size;
    pPatchList = &executionEnvironment;

    buildAndDecode();

    EXPECT_EQ(expectedWalkOrder, pKernelInfo->workgroupWalkOrder);
    EXPECT_EQ(expectedDimsIds, pKernelInfo->workgroupDimensionsOrder);
}
// Test all the different data parameters with the same "made up" data
class DataParameterTest : public KernelDataTest, public testing::WithParamInterface<uint32_t> {};