mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-22 10:17:01 +08:00
Workgroup walk order
Change-Id: Id02db6a383e21dc17be64655e7f51a84103b2e0b
This commit is contained in:
committed by
sys_ocldev
parent
dfd331c568
commit
c10d0d79f5
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
@@ -24,6 +24,8 @@
|
||||
#include "runtime/helpers/aligned_memory.h"
|
||||
#include "runtime/utilities/cpu_info.h"
|
||||
|
||||
#include <array>
|
||||
|
||||
namespace OCLRT {
|
||||
|
||||
struct uint16x8_t;
|
||||
@@ -38,9 +40,9 @@ const uint16_t initialLocalID[] = {
|
||||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
|
||||
// Lookup table for generating LocalIDs based on the SIMD of the kernel
|
||||
void (*LocalIDHelper::generateSimd8)(void *buffer, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup) = generateLocalIDsSimd<uint16x8_t, 8>;
|
||||
void (*LocalIDHelper::generateSimd16)(void *buffer, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup) = generateLocalIDsSimd<uint16x8_t, 16>;
|
||||
void (*LocalIDHelper::generateSimd32)(void *buffer, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup) = generateLocalIDsSimd<uint16x8_t, 32>;
|
||||
void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder) = generateLocalIDsSimd<uint16x8_t, 8>;
|
||||
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder) = generateLocalIDsSimd<uint16x8_t, 16>;
|
||||
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder) = generateLocalIDsSimd<uint16x8_t, 32>;
|
||||
|
||||
// Initialize the lookup table based on CPU capabilities
|
||||
LocalIDHelper::LocalIDHelper() {
|
||||
@@ -55,14 +57,14 @@ LocalIDHelper::LocalIDHelper() {
|
||||
LocalIDHelper LocalIDHelper::initializer;
|
||||
|
||||
//traditional function to generate local IDs
|
||||
void generateLocalIDs(void *buffer, uint32_t simd, size_t lwsX, size_t lwsY, size_t lwsZ) {
|
||||
auto threadsPerWorkGroup = getThreadsPerWG(simd, lwsX * lwsY * lwsZ);
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder) {
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
|
||||
if (simd == 32) {
|
||||
LocalIDHelper::generateSimd32(buffer, lwsX, lwsY, threadsPerWorkGroup);
|
||||
LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
|
||||
} else if (simd == 16) {
|
||||
LocalIDHelper::generateSimd16(buffer, lwsX, lwsY, threadsPerWorkGroup);
|
||||
LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
|
||||
} else {
|
||||
LocalIDHelper::generateSimd8(buffer, lwsX, lwsY, threadsPerWorkGroup);
|
||||
LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
|
||||
}
|
||||
}
|
||||
} // namespace OCLRT
|
||||
|
||||
@@ -22,10 +22,12 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <algorithm>
|
||||
#include "runtime/helpers/ptr_math.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
|
||||
namespace OCLRT {
|
||||
union GRF {
|
||||
float fRegs[8];
|
||||
@@ -60,9 +62,9 @@ inline size_t getPerThreadSizeLocalIDs(uint32_t simd, uint32_t numChannels = 3)
|
||||
}
|
||||
|
||||
struct LocalIDHelper {
|
||||
static void (*generateSimd8)(void *buffer, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
|
||||
static void (*generateSimd16)(void *buffer, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
|
||||
static void (*generateSimd32)(void *buffer, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
|
||||
static void (*generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
static void (*generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
static void (*generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
|
||||
static LocalIDHelper initializer;
|
||||
|
||||
@@ -73,7 +75,10 @@ struct LocalIDHelper {
|
||||
extern const uint16_t initialLocalID[];
|
||||
|
||||
template <typename Vec, int simd>
|
||||
void generateLocalIDsSimd(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
|
||||
void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup,
|
||||
const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
|
||||
const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
|
||||
void generateLocalIDs(void *buffer, uint32_t simd, size_t lwsX, size_t lwsY, size_t lwsZ);
|
||||
} // namespace OCLRT
|
||||
@@ -22,15 +22,22 @@
|
||||
|
||||
#include "runtime/command_queue/local_id_gen.h"
|
||||
|
||||
#include <array>
|
||||
|
||||
namespace OCLRT {
|
||||
|
||||
template <typename Vec, int simd>
|
||||
inline void generateLocalIDsSimd(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup) {
|
||||
inline void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup,
|
||||
const std::array<uint8_t, 3> &dimensionsOrder) {
|
||||
const int passes = simd / Vec::numChannels;
|
||||
int pass = 0;
|
||||
|
||||
const Vec vLwsX(static_cast<uint16_t>(lwsX));
|
||||
const Vec vLwsY(static_cast<uint16_t>(lwsY));
|
||||
uint32_t xDimNum = dimensionsOrder[0];
|
||||
uint32_t yDimNum = dimensionsOrder[1];
|
||||
uint32_t zDimNum = dimensionsOrder[2];
|
||||
|
||||
const Vec vLwsX(localWorkgroupSize[xDimNum]);
|
||||
const Vec vLwsY(localWorkgroupSize[yDimNum]);
|
||||
|
||||
auto zero = Vec::zero();
|
||||
auto one = Vec::one();
|
||||
@@ -113,9 +120,9 @@ inline void generateLocalIDsSimd(void *b, size_t lwsX, size_t lwsY, size_t threa
|
||||
} while (xWrap);
|
||||
|
||||
for (size_t i = 0; i < threadsPerWorkGroup; ++i) {
|
||||
x.store(buffer);
|
||||
y.store(ptrOffset(buffer, threadSkipSize));
|
||||
z.store(ptrOffset(buffer, 2 * threadSkipSize));
|
||||
x.store(ptrOffset(buffer, xDimNum * threadSkipSize));
|
||||
y.store(ptrOffset(buffer, yDimNum * threadSkipSize));
|
||||
z.store(ptrOffset(buffer, zDimNum * threadSkipSize));
|
||||
|
||||
x += vSimdX;
|
||||
y += vSimdY;
|
||||
|
||||
@@ -24,8 +24,10 @@
|
||||
#include "runtime/command_queue/local_id_gen.inl"
|
||||
#include "runtime/helpers/uint16_avx2.h"
|
||||
|
||||
#include <array>
|
||||
|
||||
namespace OCLRT {
|
||||
template void generateLocalIDsSimd<uint16x16_t, 32>(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
|
||||
template void generateLocalIDsSimd<uint16x16_t, 16>(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
|
||||
template void generateLocalIDsSimd<uint16x16_t, 32>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
template void generateLocalIDsSimd<uint16x16_t, 16>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
} // namespace OCLRT
|
||||
#endif
|
||||
@@ -23,8 +23,10 @@
|
||||
#include "runtime/command_queue/local_id_gen.inl"
|
||||
#include "runtime/helpers/uint16_sse4.h"
|
||||
|
||||
#include <array>
|
||||
|
||||
namespace OCLRT {
|
||||
template void generateLocalIDsSimd<uint16x8_t, 32>(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
|
||||
template void generateLocalIDsSimd<uint16x8_t, 16>(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
|
||||
template void generateLocalIDsSimd<uint16x8_t, 8>(void *b, size_t lwsX, size_t lwsY, size_t threadsPerWorkGroup);
|
||||
template void generateLocalIDsSimd<uint16x8_t, 32>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
template void generateLocalIDsSimd<uint16x8_t, 16>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
template void generateLocalIDsSimd<uint16x8_t, 8>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
} // namespace OCLRT
|
||||
|
||||
@@ -371,7 +371,8 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
ioh,
|
||||
simd,
|
||||
numChannels,
|
||||
localWorkSize);
|
||||
localWorkSize,
|
||||
kernel.getKernelInfo().workgroupDimensionsOrder);
|
||||
|
||||
// send interface descriptor data
|
||||
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
|
||||
|
||||
@@ -24,13 +24,16 @@
|
||||
#include "runtime/helpers/debug_helpers.h"
|
||||
#include "runtime/helpers/per_thread_data.h"
|
||||
|
||||
#include <array>
|
||||
|
||||
namespace OCLRT {
|
||||
|
||||
size_t PerThreadDataHelper::sendPerThreadData(
|
||||
LinearStream &indirectHeap,
|
||||
uint32_t simd,
|
||||
uint32_t numChannels,
|
||||
const size_t localWorkSizes[3]) {
|
||||
const size_t localWorkSizes[3],
|
||||
const std::array<uint8_t, 3> &workgroupWalkOrder) {
|
||||
auto offsetPerThreadData = indirectHeap.getUsed();
|
||||
if (numChannels) {
|
||||
auto localWorkSize = localWorkSizes[0] * localWorkSizes[1] * localWorkSizes[2];
|
||||
@@ -39,7 +42,11 @@ size_t PerThreadDataHelper::sendPerThreadData(
|
||||
|
||||
// Generate local IDs
|
||||
DEBUG_BREAK_IF(numChannels != 3);
|
||||
generateLocalIDs(pDest, simd, localWorkSizes[0], localWorkSizes[1], localWorkSizes[2]);
|
||||
generateLocalIDs(pDest, static_cast<uint16_t>(simd),
|
||||
std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizes[0]),
|
||||
static_cast<uint16_t>(localWorkSizes[1]),
|
||||
static_cast<uint16_t>(localWorkSizes[2])}},
|
||||
std::array<uint8_t, 3>{{workgroupWalkOrder[0], workgroupWalkOrder[1], workgroupWalkOrder[2]}});
|
||||
}
|
||||
return offsetPerThreadData;
|
||||
}
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include <cstddef>
|
||||
#include "runtime/command_queue/local_id_gen.h"
|
||||
@@ -47,7 +48,16 @@ struct PerThreadDataHelper {
|
||||
LinearStream &indirectHeap,
|
||||
uint32_t simd,
|
||||
uint32_t numChannels,
|
||||
const size_t localWorkSizes[3]);
|
||||
const size_t localWorkSizes[3],
|
||||
const std::array<uint8_t, 3> &workgroupWalkOrder);
|
||||
|
||||
// Convenience overload for callers that do not specify a walk order.
// Forwards to the five-argument sendPerThreadData with the identity order
// {0, 1, 2} and returns whatever that overload returns.
static size_t sendPerThreadData(
    LinearStream &indirectHeap,
    uint32_t simd,
    uint32_t numChannels,
    const size_t localWorkSizes[3]) {
    // Double braces match every other std::array initializer in this change and
    // avoid relying on brace elision (which triggers -Wmissing-braces on some compilers).
    return sendPerThreadData(indirectHeap, simd, numChannels, localWorkSizes, std::array<uint8_t, 3>{{0, 1, 2}});
}
|
||||
|
||||
static inline uint32_t getNumLocalIdChannels(const iOpenCL::SPatchThreadPayload &threadPayload) {
|
||||
return threadPayload.LocalIDXPresent +
|
||||
|
||||
@@ -26,13 +26,15 @@
|
||||
#include "kernel_arg_info.h"
|
||||
#include "patch_info.h"
|
||||
#include "runtime/helpers/hw_info.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
|
||||
namespace OCLRT {
|
||||
class BuiltinDispatchInfoBuilder;
|
||||
@@ -243,6 +245,8 @@ struct KernelInfo {
|
||||
char *crossThreadData = nullptr;
|
||||
size_t reqdWorkGroupSize[3];
|
||||
size_t requiredSubGroupSize = 0;
|
||||
std::array<uint8_t, 3> workgroupWalkOrder = {{0, 1, 2}};
|
||||
std::array<uint8_t, 3> workgroupDimensionsOrder = {{0, 1, 2}};
|
||||
uint32_t gpuPointerSize = 0;
|
||||
const BuiltinDispatchInfoBuilder *builtinDispatchBuilder = nullptr;
|
||||
uint32_t argumentsToPatchNum = 0;
|
||||
|
||||
@@ -516,6 +516,15 @@ cl_int Program::parsePatchList(KernelInfo &kernelInfo) {
|
||||
DEBUG_BREAK_IF(!(kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeY > 0));
|
||||
DEBUG_BREAK_IF(!(kernelInfo.patchInfo.executionEnvironment->RequiredWorkGroupSizeZ > 0));
|
||||
}
|
||||
kernelInfo.workgroupWalkOrder[0] = 0;
|
||||
kernelInfo.workgroupWalkOrder[1] = 1;
|
||||
kernelInfo.workgroupWalkOrder[2] = 2;
|
||||
|
||||
for (uint32_t i = 0; i < 3; ++i) {
|
||||
// inverts the walk order mapping (from ORDER_ID->DIM_ID to DIM_ID->ORDER_ID)
|
||||
kernelInfo.workgroupDimensionsOrder[kernelInfo.workgroupWalkOrder[i]] = i;
|
||||
}
|
||||
|
||||
if (kernelInfo.patchInfo.executionEnvironment->CompiledForGreaterThan4GBBuffers == false) {
|
||||
kernelInfo.requiresSshForBuffers = true;
|
||||
}
|
||||
@@ -537,7 +546,10 @@ cl_int Program::parsePatchList(KernelInfo &kernelInfo) {
|
||||
"\n .UsesFencesForReadWriteImages", kernelInfo.patchInfo.executionEnvironment->UsesFencesForReadWriteImages,
|
||||
"\n .UsesStatelessSpillFill", kernelInfo.patchInfo.executionEnvironment->UsesStatelessSpillFill,
|
||||
"\n .IsCoherent", kernelInfo.patchInfo.executionEnvironment->IsCoherent,
|
||||
"\n .SubgroupIndependentForwardProgressRequired", kernelInfo.patchInfo.executionEnvironment->SubgroupIndependentForwardProgressRequired);
|
||||
"\n .SubgroupIndependentForwardProgressRequired", kernelInfo.patchInfo.executionEnvironment->SubgroupIndependentForwardProgressRequired,
|
||||
"\n .WorkgroupWalkOrderDim0", kernelInfo.workgroupWalkOrder[0],
|
||||
"\n .WorkgroupWalkOrderDim1", kernelInfo.workgroupWalkOrder[1],
|
||||
"\n .WorkgroupWalkOrderDim2", kernelInfo.workgroupWalkOrder[2]);
|
||||
break;
|
||||
|
||||
case PATCH_TOKEN_DATA_PARAMETER_STREAM:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
@@ -172,6 +172,42 @@ struct LocalIDFixture : public ::testing::TestWithParam<std::tuple<int, int, int
|
||||
}
|
||||
}
|
||||
|
||||
void validateWalkOrder(uint32_t simd, uint32_t localWorkgroupSizeX, uint32_t localWorkgroupSizeY, uint32_t localWorkgroupSizeZ,
|
||||
const std::array<uint8_t, 3> &dimensionsOrder) {
|
||||
std::array<uint8_t, 3> walkOrder = {};
|
||||
for (uint32_t i = 0; i < 3; ++i) {
|
||||
// inverts the walk order mapping (from DIM_ID->ORDER_ID to ORDER_ID->DIM_ID)
|
||||
walkOrder[dimensionsOrder[i]] = i;
|
||||
}
|
||||
|
||||
auto skipPerThread = simd == 32 ? 32 : 16;
|
||||
|
||||
auto pBufferX = buffer;
|
||||
auto pBufferY = pBufferX + skipPerThread;
|
||||
auto pBufferZ = pBufferY + skipPerThread;
|
||||
decltype(pBufferX) ids[] = {pBufferX, pBufferY, pBufferZ};
|
||||
uint32_t sizes[] = {localWorkgroupSizeX, localWorkgroupSizeY, localWorkgroupSizeZ};
|
||||
|
||||
uint32_t flattenedId = 0;
|
||||
for (uint32_t id2 = 0; id2 < sizes[walkOrder[2]]; ++id2) {
|
||||
for (uint32_t id1 = 0; id1 < sizes[walkOrder[1]]; ++id1) {
|
||||
for (uint32_t id0 = 0; id0 < sizes[walkOrder[0]]; ++id0) {
|
||||
uint32_t threadId = flattenedId / simd;
|
||||
uint32_t channelId = flattenedId % simd;
|
||||
uint16_t foundId0 = ids[walkOrder[0]][channelId + threadId * skipPerThread * 3];
|
||||
uint16_t foundId1 = ids[walkOrder[1]][channelId + threadId * skipPerThread * 3];
|
||||
uint16_t foundId2 = ids[walkOrder[2]][channelId + threadId * skipPerThread * 3];
|
||||
if ((id0 != foundId0) || (id1 != foundId1) || (id2 != foundId2)) {
|
||||
EXPECT_EQ(id0, foundId0) << simd << " X @ (" << id0 << ", " << id1 << ", " << id2 << ") - flat " << flattenedId;
|
||||
EXPECT_EQ(id1, foundId1) << simd << " Y @ (" << id0 << ", " << id1 << ", " << id2 << ") - flat " << flattenedId;
|
||||
EXPECT_EQ(id2, foundId2) << simd << " Z @ (" << id0 << ", " << id1 << ", " << id2 << ") - flat " << flattenedId;
|
||||
}
|
||||
++flattenedId;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void dumpBuffer(uint32_t simd, uint32_t lwsX, uint32_t lwsY, uint32_t lwsZ) {
|
||||
auto workSize = lwsX * lwsY * lwsZ;
|
||||
auto threads = (workSize + simd - 1) / simd;
|
||||
@@ -211,15 +247,41 @@ struct LocalIDFixture : public ::testing::TestWithParam<std::tuple<int, int, int
|
||||
};
|
||||
|
||||
TEST_P(LocalIDFixture, checkIDWithinLimits) {
|
||||
generateLocalIDs(buffer, simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
|
||||
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
||||
std::array<uint8_t, 3>{{0, 1, 2}});
|
||||
validateIDWithinLimits(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
|
||||
}
|
||||
|
||||
TEST_P(LocalIDFixture, checkAllWorkItemsCovered) {
|
||||
generateLocalIDs(buffer, simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
|
||||
generateLocalIDs(buffer, simd, std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizeX), static_cast<uint16_t>(localWorkSizeY), static_cast<uint16_t>(localWorkSizeZ)}},
|
||||
std::array<uint8_t, 3>{{0, 1, 2}});
|
||||
validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
|
||||
}
|
||||
|
||||
// Identity dimension order {0, 1, 2}: X is the fastest-changing dimension.
// Generated IDs must cover every work item and follow that walk order.
TEST_P(LocalIDFixture, WhenWalkOrderIsXyzThenProperLocalIdsAreGenerated) {
    const std::array<uint8_t, 3> dimensionsOrder = {{0, 1, 2}};
    const std::array<uint16_t, 3> localWorkgroupSize = {{static_cast<uint16_t>(localWorkSizeX),
                                                         static_cast<uint16_t>(localWorkSizeY),
                                                         static_cast<uint16_t>(localWorkSizeZ)}};

    generateLocalIDs(buffer, simd, localWorkgroupSize, dimensionsOrder);

    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
    validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
}
|
||||
|
||||
// Dimension order {1, 0, 2} (X -> order 1, Y -> order 0, Z -> order 2):
// Y is the fastest-changing dimension.
// Generated IDs must cover every work item and follow that walk order.
TEST_P(LocalIDFixture, WhenWalkOrderIsYxzThenProperLocalIdsAreGenerated) {
    const std::array<uint8_t, 3> dimensionsOrder = {{1, 0, 2}};
    const std::array<uint16_t, 3> localWorkgroupSize = {{static_cast<uint16_t>(localWorkSizeX),
                                                         static_cast<uint16_t>(localWorkSizeY),
                                                         static_cast<uint16_t>(localWorkSizeZ)}};

    generateLocalIDs(buffer, simd, localWorkgroupSize, dimensionsOrder);

    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
    validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
}
|
||||
|
||||
// Dimension order {2, 1, 0} (X -> order 2, Y -> order 1, Z -> order 0):
// Z is the fastest-changing dimension.
// Generated IDs must cover every work item and follow that walk order.
TEST_P(LocalIDFixture, WhenWalkOrderIsZyxThenProperLocalIdsAreGenerated) {
    const std::array<uint8_t, 3> dimensionsOrder = {{2, 1, 0}};
    const std::array<uint16_t, 3> localWorkgroupSize = {{static_cast<uint16_t>(localWorkSizeX),
                                                         static_cast<uint16_t>(localWorkSizeY),
                                                         static_cast<uint16_t>(localWorkSizeZ)}};

    generateLocalIDs(buffer, simd, localWorkgroupSize, dimensionsOrder);

    validateAllWorkItemsCovered(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ);
    validateWalkOrder(simd, localWorkSizeX, localWorkSizeY, localWorkSizeZ, dimensionsOrder);
}
|
||||
|
||||
TEST_P(LocalIDFixture, sizeCalculationLocalIDs) {
|
||||
auto workItems = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
|
||||
auto sizeTotalPerThreadData = getThreadsPerWG(simd, workItems) * getPerThreadSizeLocalIDs(simd);
|
||||
|
||||
@@ -347,6 +347,74 @@ HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, sendIndirectStateResourceUsage)
|
||||
EXPECT_GE(KernelCommandsHelper<FamilyType>::getSizeRequiredCS(), usedAfterCS - usedBeforeCS);
|
||||
}
|
||||
|
||||
// Sends indirect state for a kernel whose KernelInfo carries a reversed walk order
// ({2, 1, 0}) and checks that the bytes at the start of the indirect object heap
// equal the local IDs generated directly with the same dimension order.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, whenSendingIndirectStateThenKernelsWalkOrderIsTakenIntoAccount) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> img(Image2dHelper<>::create(pContext));

    // Obtain a real kernel from the image-copy builtin; only its patch info is reused below.
    MultiDispatchInfo multiDispatchInfo;
    auto &builder = BuiltIns::getInstance().getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                          cmdQ.getContext(), cmdQ.getDevice());

    BuiltinDispatchInfoBuilder::BuiltinOpParams dc;
    dc.srcMemObj = img.get();
    dc.dstMemObj = img.get();
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);
    ASSERT_NE(0u, multiDispatchInfo.size());

    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    const size_t localWorkSizeX = 2;
    const size_t localWorkSizeY = 3;
    const size_t localWorkSizeZ = 4;
    const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ};

    auto &commandStream = cmdQ.getCS();
    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // Reserve room for the interface descriptor and remember where it lives.
    dsh.align(KernelCommandsHelper<FamilyType>::alignInterfaceDescriptorData);
    size_t IDToffset = dsh.getUsed();
    dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

    // Kernel info copy with a reversed walk order; with {2, 1, 0} the inverse
    // mapping (dimensions order) is the same permutation.
    KernelInfo modifiedKernelInfo = {};
    modifiedKernelInfo.patchInfo = kernel->getKernelInfo().patchInfo;
    modifiedKernelInfo.workgroupWalkOrder = {{2, 1, 0}};
    modifiedKernelInfo.workgroupDimensionsOrder = {{2, 1, 0}};
    MockKernel mockKernel{kernel->getProgram(), modifiedKernelInfo, kernel->getDevice(), false};

    KernelCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        mockKernel,
        modifiedKernelInfo.getMaxSimdSize(),
        localWorkSizes,
        IDToffset,
        0,
        pDevice->getPreemptionMode(),
        nullptr);

    // Threads needed for the work group (rounded up), then the byte size of their
    // local IDs: three channels of 32 (SIMD32) or 16 uint16_t entries per thread.
    size_t numThreads = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
    numThreads = (numThreads + modifiedKernelInfo.getMaxSimdSize() - 1) / modifiedKernelInfo.getMaxSimdSize();
    size_t expectedIohSize = ((modifiedKernelInfo.getMaxSimdSize() == 32) ? 32 : 16) * 3 * numThreads * sizeof(uint16_t);
    ASSERT_LE(expectedIohSize, ioh.getUsed());

    // Build the reference IDs with the same sizes and dimension order, then compare bytes.
    const std::array<uint16_t, 3> referenceSizes = {{localWorkSizeX, localWorkSizeY, localWorkSizeZ}};
    const std::array<uint8_t, 3> referenceOrder = {{modifiedKernelInfo.workgroupDimensionsOrder[0],
                                                    modifiedKernelInfo.workgroupDimensionsOrder[1],
                                                    modifiedKernelInfo.workgroupDimensionsOrder[2]}};
    auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
    generateLocalIDs(expectedLocalIds, modifiedKernelInfo.getMaxSimdSize(), referenceSizes, referenceOrder);
    EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
    alignedFree(expectedLocalIds);
}
|
||||
|
||||
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointer) {
|
||||
typedef typename FamilyType::BINDING_TABLE_STATE BINDING_TABLE_STATE;
|
||||
typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
|
||||
|
||||
@@ -440,6 +440,22 @@ TEST_F(KernelDataTest, KernelAttributesInfo) {
|
||||
EXPECT_EQ_CONST(PATCH_TOKEN_KERNEL_ATTRIBUTES_INFO, pKernelInfo->patchInfo.pKernelAttributesInfo->Token);
|
||||
}
|
||||
|
||||
// Decodes a patch list holding only a zero-initialized execution-environment token
// and expects both walk-order arrays in the resulting KernelInfo to be {0, 1, 2}.
TEST_F(KernelDataTest, WhenDecodingExecutionEnvironmentTokenThenWalkOrderIsForcedToXMajor) {
    iOpenCL::SPatchExecutionEnvironment execEnvToken = {};
    execEnvToken.Size = sizeof(SPatchExecutionEnvironment);
    execEnvToken.Token = PATCH_TOKEN_EXECUTION_ENVIRONMENT;

    // The token is the entire patch list handed to the decoder.
    patchListSize = execEnvToken.Size;
    pPatchList = &execEnvToken;

    buildAndDecode();

    const std::array<uint8_t, 3> expectedWalkOrder = {{0, 1, 2}};
    const std::array<uint8_t, 3> expectedDimsIds = {{0, 1, 2}};
    EXPECT_EQ(expectedWalkOrder, pKernelInfo->workgroupWalkOrder);
    EXPECT_EQ(expectedDimsIds, pKernelInfo->workgroupDimensionsOrder);
}
|
||||
|
||||
// Test all the different data parameters with the same "made up" data
|
||||
class DataParameterTest : public KernelDataTest, public testing::WithParamInterface<uint32_t> {};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user