Apply (2/4)x4x1 layout when generating local ids for kernel with images only

- For SIMD8 apply 2x4x1 layout
- For SIMD16/SIMD32 apply 4x4x1 layout

Change-Id: I31bceb49387011c66da5f96ad2a71125b96d4cda
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
Mateusz Jablonski
2018-08-08 12:49:36 +02:00
committed by sys_ocldev
parent cf971158c6
commit 47f3dad619
9 changed files with 289 additions and 26 deletions

View File

@@ -57,9 +57,12 @@ LocalIDHelper::LocalIDHelper() {
LocalIDHelper LocalIDHelper::initializer;
//traditional function to generate local IDs
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder) {
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool hasKernelOnlyImages) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
if (simd == 32) {
bool use4x4Layout = hasKernelOnlyImages && isCompatibleWith4x4Layout(localWorkgroupSize, dimensionsOrder, simd);
if (use4x4Layout) {
generateLocalIDsWith4x4Layout(buffer, localWorkgroupSize, simd);
} else if (simd == 32) {
LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
} else if (simd == 16) {
LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
@@ -67,4 +70,57 @@ void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3>
LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
}
}
// Reports whether the tiled (2x4 for SIMD8, 4x4-based for SIMD16/SIMD32)
// local-id layout can be used for the given work-group.
//
// localWorkgroupSize - work-group size in X, Y, Z.
// dimensionsOrder    - walk order; only an order starting with {0, 1} qualifies.
// simd               - SIMD width; 8 and 32 are special-cased, every other
//                      value is treated like 16.
//
// Returns true when the walk order starts with X then Y and both the X and Y
// sizes are multiples of the tile width/height chosen for this SIMD width.
// The Z size is not inspected here.
bool isCompatibleWith4x4Layout(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd) {
    // Any walk order other than X first, Y second rules the layout out.
    if (dimensionsOrder.at(0) != 0 || dimensionsOrder.at(1) != 1) {
        return false;
    }

    const uint16_t sizeX = localWorkgroupSize.at(0);
    const uint16_t sizeY = localWorkgroupSize.at(1);
    const bool narrowSimd = (simd == 8u);

    // Tile width: 2 lanes for SIMD8, 4 lanes otherwise.
    const uint8_t tileWidth = narrowSimd ? 2u : 4u;

    // Tile height: 4 for SIMD8 or when Y is exactly 4; otherwise as many
    // rows of tileWidth as fit in one SIMD row (16 or 32 lanes).
    uint8_t tileHeight = 4u;
    if (!narrowSimd && sizeY != 4u) {
        const uint8_t lanesPerRow = (simd == 32u) ? 32u : 16u;
        tileHeight = static_cast<uint8_t>(lanesPerRow / tileWidth);
    }

    // Both deltas are powers of two, so the mask test is a divisibility test.
    const bool xFits = (sizeX & (tileWidth - 1)) == 0;
    const bool yFits = (sizeY & (tileHeight - 1)) == 0;
    return xFits && yFits;
}
// Writes local ids into `b` using a tiled layout: each SIMD thread covers
// tiles that are xDelta lanes wide (2 for SIMD8, 4 otherwise) instead of a
// linear run along X.
//
// b                  - output buffer, written as uint16_t. For every generated
//                      thread three rows of `rowWidth` entries are emitted in
//                      the order X, Y, Z; only the first `simd` lanes of each
//                      row are written (rowWidth is 32 for SIMD32, else 16).
// localWorkgroupSize - work-group size in X, Y, Z.
// simd               - SIMD width; 8 and 32 are special-cased, every other
//                      value is treated like 16.
//
// Ids are wrapped into the work-group size with a true modulo, so sizes that
// are not powers of two (for example a Z size of 3, or an X size of 12) produce
// correct ids. For power-of-two sizes the result equals the former
// `value & (size - 1)` masking.
inline void generateLocalIDsWith4x4Layout(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd) {
    const uint16_t sizeX = localWorkgroupSize.at(0);
    const uint16_t sizeY = localWorkgroupSize.at(1);
    const uint16_t sizeZ = localWorkgroupSize.at(2);

    const uint8_t rowWidth = simd == 32u ? 32u : 16u;
    // difference between corresponding values in consecutive X rows
    const uint8_t xDelta = simd == 8u ? 2u : 4u;
    // difference between corresponding values in consecutive Y rows
    const uint8_t yDelta = (simd == 8u || sizeY == 4u) ? 4u : rowWidth / xDelta;
    // lane index at which a SIMD32 thread with a 4-row tile starts its second tile
    const uint16_t lanesPerTile = static_cast<uint16_t>(yDelta * xDelta);

    const std::array<uint16_t, 3> replicationFactors{{static_cast<uint16_t>(sizeX / xDelta),
                                                      static_cast<uint16_t>(sizeY / yDelta),
                                                      sizeZ}};

    // When a SIMD32 thread cannot be filled along Y, its second half continues
    // with the next tile along X, or along Z if X has a single tile only.
    const bool singleTileInY = replicationFactors.at(1) == 1 && simd == 32u;
    const bool earlyGrowX = singleTileInY && replicationFactors.at(0) > 1;
    const bool earlyGrowZ = singleTileInY && !earlyGrowX && replicationFactors.at(2) > 1;

    auto buffer = reinterpret_cast<uint16_t *>(b);
    size_t offset = 0u; // size_t so the running index cannot wrap at 65535

    // The loop bodies only execute when every replication factor is non-zero,
    // which implies every work-group size is non-zero, so the modulo is safe.
    for (uint16_t z = 0u; z < replicationFactors.at(2); z++) {
        for (uint16_t y = 0u; y < replicationFactors.at(1); y++) {
            for (uint16_t x = 0u; x < replicationFactors.at(0); x++) {
                // row for X
                for (uint16_t i = 0u; i < simd; i++) {
                    if (earlyGrowX && i == lanesPerTile) {
                        x++; // intentionally advances the outer loop: this thread consumed two X tiles
                    }
                    const auto xValue = xDelta * x + (i & (xDelta - 1));
                    buffer[offset + i] = static_cast<uint16_t>(xValue % sizeX);
                }
                offset += rowWidth;

                // row for Y
                for (uint16_t i = 0u; i < simd; i++) {
                    const auto yValue = yDelta * y + i / xDelta;
                    buffer[offset + i] = static_cast<uint16_t>(yValue % sizeY);
                }
                offset += rowWidth;

                // row for Z
                for (uint16_t i = 0u; i < simd; i++) {
                    if (earlyGrowZ && i == lanesPerTile) {
                        z++; // intentionally advances the outer loop: this thread consumed two Z slices
                    }
                    buffer[offset + i] = static_cast<uint16_t>(z % sizeZ);
                }
                offset += rowWidth;
            }
        }
    }
}
} // namespace OCLRT

View File

@@ -79,6 +79,8 @@ void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroup
const std::array<uint8_t, 3> &dimensionsOrder);
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
const std::array<uint8_t, 3> &dimensionsOrder);
const std::array<uint8_t, 3> &dimensionsOrder, bool hasKernelOnlyImages);
void generateLocalIDsWith4x4Layout(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd);
bool isCompatibleWith4x4Layout(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd);
} // namespace OCLRT

View File

@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, Intel Corporation
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -28,6 +28,8 @@
#include <cstdint>
#include <cstddef>
#include <algorithm>
#include <memory>
#include <functional>
#ifdef _MSC_VER
#define ALIGNAS(x) __declspec(align(x))
@@ -112,3 +114,6 @@ template <typename T>
inline bool isAligned(T *ptr) {
return (reinterpret_cast<uintptr_t>(ptr) & (alignof(T) - 1)) == 0;
}
inline auto allocateAlignedMemory(size_t bytes, size_t alignment) {
return std::unique_ptr<void, std::function<decltype(alignedFree)>>(alignedMalloc(bytes, alignment), alignedFree);
}

View File

@@ -375,7 +375,8 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
simd,
numChannels,
localWorkSize,
kernel.getKernelInfo().workgroupDimensionsOrder);
kernel.getKernelInfo().workgroupDimensionsOrder,
kernel.usesOnlyImages());
// send interface descriptor data
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];

View File

@@ -33,7 +33,8 @@ size_t PerThreadDataHelper::sendPerThreadData(
uint32_t simd,
uint32_t numChannels,
const size_t localWorkSizes[3],
const std::array<uint8_t, 3> &workgroupWalkOrder) {
const std::array<uint8_t, 3> &workgroupWalkOrder,
bool hasKernelOnlyImages) {
auto offsetPerThreadData = indirectHeap.getUsed();
if (numChannels) {
auto localWorkSize = localWorkSizes[0] * localWorkSizes[1] * localWorkSizes[2];
@@ -46,7 +47,8 @@ size_t PerThreadDataHelper::sendPerThreadData(
std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizes[0]),
static_cast<uint16_t>(localWorkSizes[1]),
static_cast<uint16_t>(localWorkSizes[2])}},
std::array<uint8_t, 3>{{workgroupWalkOrder[0], workgroupWalkOrder[1], workgroupWalkOrder[2]}});
std::array<uint8_t, 3>{{workgroupWalkOrder[0], workgroupWalkOrder[1], workgroupWalkOrder[2]}},
hasKernelOnlyImages);
}
return offsetPerThreadData;
}

View File

@@ -49,15 +49,8 @@ struct PerThreadDataHelper {
uint32_t simd,
uint32_t numChannels,
const size_t localWorkSizes[3],
const std::array<uint8_t, 3> &workgroupWalkOrder);
static size_t sendPerThreadData(
LinearStream &indirectHeap,
uint32_t simd,
uint32_t numChannels,
const size_t localWorkSizes[3]) {
return sendPerThreadData(indirectHeap, simd, numChannels, localWorkSizes, std::array<uint8_t, 3>{{0, 1, 2}});
}
const std::array<uint8_t, 3> &workgroupWalkOrder,
bool hasKernelOnlyImages);
static inline uint32_t getNumLocalIdChannels(const iOpenCL::SPatchThreadPayload &threadPayload) {
return threadPayload.LocalIDXPresent +