mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-09 22:43:00 +08:00
Apply a (2/4)x4x1 layout when generating local IDs for kernels that use images only
- For SIMD8, apply a 2x4x1 layout. - For SIMD16/SIMD32, apply a 4x4x1 layout. Change-Id: I31bceb49387011c66da5f96ad2a71125b96d4cda Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
This commit is contained in:
committed by
sys_ocldev
parent
cf971158c6
commit
47f3dad619
@@ -57,9 +57,12 @@ LocalIDHelper::LocalIDHelper() {
|
||||
LocalIDHelper LocalIDHelper::initializer;
|
||||
|
||||
//traditional function to generate local IDs
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder) {
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool hasKernelOnlyImages) {
|
||||
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
|
||||
if (simd == 32) {
|
||||
bool use4x4Layout = hasKernelOnlyImages && isCompatibleWith4x4Layout(localWorkgroupSize, dimensionsOrder, simd);
|
||||
if (use4x4Layout) {
|
||||
generateLocalIDsWith4x4Layout(buffer, localWorkgroupSize, simd);
|
||||
} else if (simd == 32) {
|
||||
LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
|
||||
} else if (simd == 16) {
|
||||
LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
|
||||
@@ -67,4 +70,57 @@ void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3>
|
||||
LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder);
|
||||
}
|
||||
}
|
||||
|
||||
// Tells whether the (2/4)x4x1 local-id layout may be used for the given
// local work-group size, walk order and SIMD width. The layout requires the
// default X-major walk order (X varies fastest, then Y) and X/Y sizes that
// are whole multiples of the layout's per-row steps.
bool isCompatibleWith4x4Layout(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd) {
    // Entries per id row in the payload: 32 for SIMD32, 16 otherwise.
    const uint8_t rowWidth = (simd == 32u) ? 32u : 16u;
    // Step between corresponding values in consecutive X rows of the layout.
    const uint8_t xDelta = (simd == 8u) ? 2u : 4u;
    // Step between corresponding values in consecutive Y rows of the layout.
    uint8_t yDelta;
    if (simd == 8u || localWorkgroupSize.at(1) == 4u) {
        yDelta = 4u;
    } else {
        yDelta = rowWidth / xDelta;
    }
    const bool defaultWalkOrder = (dimensionsOrder.at(0) == 0) && (dimensionsOrder.at(1) == 1);
    // xDelta/yDelta are powers of two, so remainder-by-delta is equivalent to
    // the original bitmask alignment test.
    const bool xSizeAligned = (localWorkgroupSize.at(0) % xDelta) == 0;
    const bool ySizeAligned = (localWorkgroupSize.at(1) % yDelta) == 0;
    return defaultWalkOrder && xSizeAligned && ySizeAligned;
}
|
||||
|
||||
// Generates per-channel local-id rows using the (2/4)x4x1 layout for kernels
// that access images only: each SIMD row covers an xDelta x yDelta tile of the
// work-group, emitting one X row, one Y row and one Z row per tile.
//
// b                  - destination buffer; written as uint16_t entries,
//                      rowWidth entries per channel row
// localWorkgroupSize - enqueued local sizes; caller must have validated them
//                      with isCompatibleWith4x4Layout()
// simd               - kernel SIMD width (8, 16 or 32)
//
// NOTE(review): the `inline` specifier is dropped — the header declares this
// function without `inline`, and an inline definition confined to one .cpp
// makes any call from another translation unit ill-formed (inline functions
// must be defined in every TU that odr-uses them).
void generateLocalIDsWith4x4Layout(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd) {
    uint8_t rowWidth = simd == 32u ? 32u : 16u;
    uint8_t xDelta = simd == 8u ? 2u : 4u;                                                  // difference between corresponding values in consecutive X rows
    uint8_t yDelta = simd == 8u || localWorkgroupSize.at(1) == 4u ? 4u : rowWidth / xDelta; // difference between corresponding values in consecutive Y rows
    // Number of xDelta x yDelta x 1 tiles needed to cover the work-group in
    // each dimension; one group of three id rows is emitted per tile.
    std::array<uint16_t, 3> replicationFactors{{static_cast<uint16_t>(localWorkgroupSize.at(0) / xDelta),
                                                static_cast<uint16_t>(localWorkgroupSize.at(1) / yDelta),
                                                static_cast<uint16_t>(localWorkgroupSize.at(2))}};
    // For SIMD32 with a single Y tile, one 32-wide row holds two consecutive
    // tiles, so the X (or Z) counter is bumped in the middle of the row.
    bool earlyGrowX = replicationFactors.at(1) == 1 && simd == 32u && replicationFactors.at(0) > 1;
    bool earlyGrowZ = replicationFactors.at(1) == 1 && simd == 32u && !earlyGrowX && replicationFactors.at(2) > 1;
    auto buffer = reinterpret_cast<uint16_t *>(b);
    uint16_t offset = 0u;
    for (uint16_t z = 0u; z < replicationFactors.at(2); z++) {
        for (uint16_t y = 0u; y < replicationFactors.at(1); y++) {
            for (uint16_t x = 0u; x < replicationFactors.at(0); x++) {
                // row for X
                for (uint8_t i = 0u; i < simd; i++) {
                    if (earlyGrowX && i == yDelta * xDelta) {
                        x++; // second tile starts mid-row; the outer loop's x++ then skips past it
                    }
                    auto xValue = xDelta * x + (i & (xDelta - 1)); // xDelta is a power of two, so the mask is i % xDelta
                    // NOTE(review): wrap with %, not `& (size - 1)` — the
                    // compatibility check only requires sizes to be multiples
                    // of the deltas (and does not constrain Z at all), so the
                    // bitmask corrupted ids for non-power-of-two sizes
                    // (e.g. X == 12: 4 & 11 == 0). For power-of-two sizes the
                    // remainder is bit-identical to the old mask.
                    buffer[offset + i] = xValue % localWorkgroupSize.at(0);
                }
                offset += rowWidth;
                // row for Y
                for (uint8_t i = 0u; i < simd; i++) {
                    auto yValue = yDelta * y + i / xDelta;
                    buffer[offset + i] = yValue % localWorkgroupSize.at(1);
                }
                offset += rowWidth;
                // row for Z
                for (uint8_t i = 0u; i < simd; i++) {
                    if (earlyGrowZ && i == yDelta * xDelta) {
                        z++; // second Z slice starts mid-row; lanes past the work-group size are unused
                    }
                    auto zValue = z;
                    buffer[offset + i] = zValue % localWorkgroupSize.at(2);
                }
                offset += rowWidth;
            }
        }
    }
}
|
||||
} // namespace OCLRT
|
||||
|
||||
@@ -79,6 +79,8 @@ void generateLocalIDsSimd(void *b, const std::array<uint16_t, 3> &localWorkgroup
|
||||
const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
|
||||
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize,
|
||||
const std::array<uint8_t, 3> &dimensionsOrder);
|
||||
const std::array<uint8_t, 3> &dimensionsOrder, bool hasKernelOnlyImages);
|
||||
void generateLocalIDsWith4x4Layout(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t simd);
|
||||
|
||||
bool isCompatibleWith4x4Layout(const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, uint16_t simd);
|
||||
} // namespace OCLRT
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2017, Intel Corporation
|
||||
* Copyright (c) 2017 - 2018, Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
@@ -28,6 +28,8 @@
|
||||
#include <cstdint>
|
||||
#include <cstddef>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <functional>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define ALIGNAS(x) __declspec(align(x))
|
||||
@@ -112,3 +114,6 @@ template <typename T>
|
||||
inline bool isAligned(T *ptr) {
|
||||
return (reinterpret_cast<uintptr_t>(ptr) & (alignof(T) - 1)) == 0;
|
||||
}
|
||||
// Allocates `bytes` bytes aligned to `alignment` and wraps the result in a
// unique_ptr so the memory is released through alignedFree (RAII).
// Assumes alignedMalloc/alignedFree are the project's aligned allocation pair
// declared elsewhere in this header — confirm.
// NOTE(review): the deleter is type-erased through std::function (signature of
// alignedFree); a plain function-pointer deleter would be lighter-weight, but
// switching it would change the deduced return type seen by callers.
inline auto allocateAlignedMemory(size_t bytes, size_t alignment) {
    return std::unique_ptr<void, std::function<decltype(alignedFree)>>(alignedMalloc(bytes, alignment), alignedFree);
}
|
||||
|
||||
@@ -375,7 +375,8 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
simd,
|
||||
numChannels,
|
||||
localWorkSize,
|
||||
kernel.getKernelInfo().workgroupDimensionsOrder);
|
||||
kernel.getKernelInfo().workgroupDimensionsOrder,
|
||||
kernel.usesOnlyImages());
|
||||
|
||||
// send interface descriptor data
|
||||
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
|
||||
|
||||
@@ -33,7 +33,8 @@ size_t PerThreadDataHelper::sendPerThreadData(
|
||||
uint32_t simd,
|
||||
uint32_t numChannels,
|
||||
const size_t localWorkSizes[3],
|
||||
const std::array<uint8_t, 3> &workgroupWalkOrder) {
|
||||
const std::array<uint8_t, 3> &workgroupWalkOrder,
|
||||
bool hasKernelOnlyImages) {
|
||||
auto offsetPerThreadData = indirectHeap.getUsed();
|
||||
if (numChannels) {
|
||||
auto localWorkSize = localWorkSizes[0] * localWorkSizes[1] * localWorkSizes[2];
|
||||
@@ -46,7 +47,8 @@ size_t PerThreadDataHelper::sendPerThreadData(
|
||||
std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSizes[0]),
|
||||
static_cast<uint16_t>(localWorkSizes[1]),
|
||||
static_cast<uint16_t>(localWorkSizes[2])}},
|
||||
std::array<uint8_t, 3>{{workgroupWalkOrder[0], workgroupWalkOrder[1], workgroupWalkOrder[2]}});
|
||||
std::array<uint8_t, 3>{{workgroupWalkOrder[0], workgroupWalkOrder[1], workgroupWalkOrder[2]}},
|
||||
hasKernelOnlyImages);
|
||||
}
|
||||
return offsetPerThreadData;
|
||||
}
|
||||
|
||||
@@ -49,15 +49,8 @@ struct PerThreadDataHelper {
|
||||
uint32_t simd,
|
||||
uint32_t numChannels,
|
||||
const size_t localWorkSizes[3],
|
||||
const std::array<uint8_t, 3> &workgroupWalkOrder);
|
||||
|
||||
static size_t sendPerThreadData(
|
||||
LinearStream &indirectHeap,
|
||||
uint32_t simd,
|
||||
uint32_t numChannels,
|
||||
const size_t localWorkSizes[3]) {
|
||||
return sendPerThreadData(indirectHeap, simd, numChannels, localWorkSizes, std::array<uint8_t, 3>{{0, 1, 2}});
|
||||
}
|
||||
const std::array<uint8_t, 3> &workgroupWalkOrder,
|
||||
bool hasKernelOnlyImages);
|
||||
|
||||
static inline uint32_t getNumLocalIdChannels(const iOpenCL::SPatchThreadPayload &threadPayload) {
|
||||
return threadPayload.LocalIDXPresent +
|
||||
|
||||
Reference in New Issue
Block a user