Add pipe control before and after buffer translation

Change-Id: I4ee32c410e1ac2bcdb3ceae203cd461de79146a5
This commit is contained in:
Kamil Diedrich 2018-12-17 15:23:35 +01:00 committed by sys_ocldev
parent c9e667d601
commit 4b1871bf0e
12 changed files with 165 additions and 20 deletions

View File

@ -557,6 +557,9 @@ void CommandQueue::releaseIndirectHeap(IndirectHeap::Type heapType) {
void CommandQueue::dispatchAuxTranslation(MultiDispatchInfo &multiDispatchInfo, MemObjsForAuxTranslation &memObjsForAuxTranslation,
AuxTranslationDirection auxTranslationDirection) {
if (!multiDispatchInfo.empty()) {
multiDispatchInfo.rbegin()->setPipeControlRequired(true);
}
auto &builder = getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, getContext(), getDevice());
BuiltinDispatchInfoBuilder::BuiltinOpParams dispatchParams;
@ -564,6 +567,8 @@ void CommandQueue::dispatchAuxTranslation(MultiDispatchInfo &multiDispatchInfo,
dispatchParams.auxTranslationDirection = auxTranslationDirection;
builder.buildDispatchInfos(multiDispatchInfo, dispatchParams);
multiDispatchInfo.rbegin()->setPipeControlRequired(true);
}
void CommandQueue::obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes) {

View File

@ -9,6 +9,7 @@
#include "runtime/helpers/base_object.h"
#include "runtime/helpers/engine_control.h"
#include "runtime/helpers/task_information.h"
#include "runtime/helpers/dispatch_info.h"
#include "instrumentation.h"
#include <atomic>
#include <cstdint>
@ -27,6 +28,7 @@ class Kernel;
class MemObj;
class PerformanceCounters;
struct CompletionStamp;
struct MultiDispatchInfo;
enum class QueuePriority {
LOW,

View File

@ -63,7 +63,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount
auto &builder = getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, getContext(), getDevice());
builtInLock.takeOwnership(builder, this->context);
kernel->fillWithBuffersForAuxTranslation(memObjsForAuxTranslation);
dispatchAuxTranslation(multiDispatchInfo, memObjsForAuxTranslation, AuxTranslationDirection::AuxToNonAux);
if (!memObjsForAuxTranslation.empty()) {
dispatchAuxTranslation(multiDispatchInfo, memObjsForAuxTranslation, AuxTranslationDirection::AuxToNonAux);
}
}
if (kernel->getKernelInfo().builtinDispatchBuilder == nullptr) {
@ -85,7 +87,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface *(&surfaces)[surfaceCount
buffer->getGraphicsAllocation()->setAllocationType(GraphicsAllocation::AllocationType::BUFFER);
}
} else {
dispatchAuxTranslation(multiDispatchInfo, memObjsForAuxTranslation, AuxTranslationDirection::NonAuxToAux);
// Post-kernel pass: the buffers were translated AuxToNonAux before the kernel,
// so they must be translated back in the opposite direction (NonAuxToAux) here.
// Skipped entirely when the kernel has no buffers needing translation.
if (!memObjsForAuxTranslation.empty()) {
    dispatchAuxTranslation(multiDispatchInfo, memObjsForAuxTranslation, AuxTranslationDirection::NonAuxToAux);
}
}
}
}

View File

@ -387,8 +387,14 @@ template <typename GfxFamily>
size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, cl_uint numEventsInWaitList, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
size_t expectedSizeCS = 0;
Kernel *parentKernel = multiDispatchInfo.peekParentKernel();
if (multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->isAuxTranslationRequired()) {
expectedSizeCS += sizeof(PIPE_CONTROL);
}
for (auto &dispatchInfo : multiDispatchInfo) {
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, dispatchInfo.getKernel());
if (dispatchInfo.isPipeControlRequired()) {
expectedSizeCS += sizeof(PIPE_CONTROL);
}
}
if (parentKernel) {
SchedulerKernel &scheduler = commandQueue.getDevice().getExecutionEnvironment()->getBuiltIns()->getSchedulerKernel(parentKernel->getContext());

View File

@ -30,6 +30,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
LinearStream *commandStream = nullptr;
IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
auto parentKernel = multiDispatchInfo.peekParentKernel();
auto mainKernel = multiDispatchInfo.peekMainKernel();
for (auto &dispatchInfo : multiDispatchInfo) {
// Compute local workgroup sizes
@ -109,10 +110,16 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
// Emit a command-streamer-stalling PIPE_CONTROL ahead of the walkers when the main
// kernel requires aux translation. mainKernel is null-checked so the condition matches
// the one getTotalSizeRequiredCS uses when reserving space for this command.
if (mainKernel != nullptr && mainKernel->isAuxTranslationRequired()) {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    auto pPipeControlCmd = static_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
    *pPipeControlCmd = GfxFamily::cmdInitPipeControl;
    pPipeControlCmd->setCommandStreamerStallEnable(true);
}
size_t currentDispatchIndex = 0;
for (auto &dispatchInfo : multiDispatchInfo) {
auto &kernel = *dispatchInfo.getKernel();
DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
@ -152,7 +159,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
*kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
*kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
if ((&kernel == multiDispatchInfo.peekMainKernel()) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
if ((&kernel == mainKernel) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
*kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
*kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
*kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
@ -166,7 +173,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
*kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
*kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
if (&kernel == multiDispatchInfo.peekMainKernel()) {
if (&kernel == mainKernel) {
*kernel.numWorkGroupsX = static_cast<uint32_t>(twgs.x);
*kernel.numWorkGroupsY = static_cast<uint32_t>(twgs.y);
*kernel.numWorkGroupsZ = static_cast<uint32_t>(twgs.z);
@ -231,6 +238,12 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
GpgpuWalkerHelper<GfxFamily>::adjustWalkerData(commandStream, walkerCmd, kernel, dispatchInfo);
dispatchWorkarounds(commandStream, commandQueue, kernel, false);
if (dispatchInfo.isPipeControlRequired()) {
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
auto pPipeControlCmd = static_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
*pPipeControlCmd = GfxFamily::cmdInitPipeControl;
pPipeControlCmd->setCommandStreamerStallEnable(true);
}
currentDispatchIndex++;
}
dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);

View File

@ -25,6 +25,8 @@ class DispatchInfo {
: kernel(k), dim(d), gws(gws), elws(elws), offset(offset), agws(0, 0, 0), lws(0, 0, 0), twgs(0, 0, 0), nwgs(0, 0, 0), swgs(0, 0, 0) {}
DispatchInfo(Kernel *k, uint32_t d, Vec3<size_t> gws, Vec3<size_t> elws, Vec3<size_t> offset, Vec3<size_t> agws, Vec3<size_t> lws, Vec3<size_t> twgs, Vec3<size_t> nwgs, Vec3<size_t> swgs)
: kernel(k), dim(d), gws(gws), elws(elws), offset(offset), agws(agws), lws(lws), twgs(twgs), nwgs(nwgs), swgs(swgs) {}
bool isPipeControlRequired() const { return pipeControlRequired; }
// Stores whether a PIPE_CONTROL is required for this dispatch.
// The parameter is named after the flag it sets (it was previously called `blocking`).
void setPipeControlRequired(bool required) {
    this->pipeControlRequired = required;
}
bool usesSlm() const;
bool usesStatelessPrintfSurface() const;
uint32_t getRequiredScratchSize() const;
@ -50,6 +52,7 @@ class DispatchInfo {
void setStartOfWorkgroups(const Vec3<size_t> &swgs) { this->swgs = swgs; }
protected:
bool pipeControlRequired = false;
Kernel *kernel = nullptr;
uint32_t dim = 0;
@ -106,14 +109,38 @@ struct MultiDispatchInfo {
return ret;
}
DispatchInfo *begin() {
return dispatchInfos.begin();
}
const DispatchInfo *begin() const {
return dispatchInfos.begin();
}
std::reverse_iterator<DispatchInfo *> rbegin() {
return dispatchInfos.rbegin();
}
std::reverse_iterator<const DispatchInfo *> crbegin() const {
return dispatchInfos.crbegin();
}
DispatchInfo *end() {
return dispatchInfos.end();
}
const DispatchInfo *end() const {
return dispatchInfos.end();
}
std::reverse_iterator<DispatchInfo *> rend() {
return dispatchInfos.rend();
}
std::reverse_iterator<const DispatchInfo *> crend() const {
return dispatchInfos.crend();
}
void push(const DispatchInfo &dispatchInfo) {
dispatchInfos.push_back(dispatchInfo);
}

View File

@ -372,7 +372,7 @@ class Kernel : public BaseObject<_cl_kernel> {
return usingImagesOnly;
}
void fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &buffersForAuxTranslation);
void fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsForAuxTranslation);
bool requiresCacheFlushCommand() const;

View File

@ -18,6 +18,8 @@ class StackVec {
public:
using iterator = DataType *;
using const_iterator = const DataType *;
using reverse_iterator = std::reverse_iterator<iterator>;
using const_reverse_iterator = std::reverse_iterator<const_iterator>;
static const size_t onStackCaps = OnStackCapacity;
@ -181,6 +183,14 @@ class StackVec {
return onStackMem;
}
// Mutable reverse iterator whose base() is end(), i.e. it starts at the last element.
reverse_iterator rbegin() {
    iterator pastLast = end();
    return reverse_iterator(pastLast);
}

// Read-only counterpart of rbegin(); its base() is the const end().
const_reverse_iterator crbegin() const {
    const_iterator pastLast = end();
    return const_reverse_iterator(pastLast);
}
const_iterator begin() const {
if (dynamicMem) {
return dynamicMem->data();
@ -197,6 +207,14 @@ class StackVec {
return onStackMem + onStackSize;
}
// Mutable reverse past-the-end iterator; its base() is begin().
reverse_iterator rend() {
    iterator first = begin();
    return reverse_iterator(first);
}

// Read-only counterpart of rend(); its base() is the const begin().
const_reverse_iterator crend() const {
    const_iterator first = begin();
    return const_reverse_iterator(first);
}
const_iterator end() const {
if (dynamicMem) {
return dynamicMem->data() + dynamicMem->size();

View File

@ -1108,3 +1108,38 @@ HWTEST_F(DispatchWalkerTest, WhenCallingDefaultWaMethodsThenExpectNothing) {
size_t actualSize = GpgpuWalkerHelper<GENX>::getSizeForWADisableLSQCROPERFforOCL(&kernel);
EXPECT_EQ(expectedSize, actualSize);
}
// Checks that dispatchWalker places at least one PIPE_CONTROL in the command stream
// when the kernel is marked as requiring aux translation and one of the dispatches
// is flagged as requiring a pipe control.
HWTEST_F(DispatchWalkerTest, givenKernelWhenAuxTranslationWithoutParentKernelThenPipeControlAdded) {
    MockKernel kernel(program.get(), kernelInfo, *pDevice);
    kernelInfo.workloadInfo.workDimOffset = 0;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    // Remember where the stream starts so the emitted commands can be parsed afterwards.
    auto &commandStream = pCmdQ->getCS(0);
    void *streamBase = commandStream.getCpuBase();

    kernel.auxTranslationRequired = true;
    MockMultiDispatchInfo multiDispatchInfo(&kernel);

    // Additional 1-D dispatch of a single work item, explicitly flagged for a pipe control.
    const Vec3<size_t> unitSize(1, 1, 1);
    const Vec3<size_t> zeroOffset(0, 0, 0);
    DispatchInfo flaggedDispatch(&kernel, 1, unitSize, unitSize, zeroOffset);
    flaggedDispatch.setPipeControlRequired(true);
    multiDispatchInfo.push(flaggedDispatch);

    HardwareInterface<FamilyType>::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        0,
        nullptr,
        nullptr,
        nullptr,
        nullptr,
        nullptr,
        nullptr,
        pDevice->getPreemptionMode(),
        false);

    const auto usedBytes = commandStream.getUsed();
    GenCmdList parsedCommands;
    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(parsedCommands, streamBase, usedBytes));

    auto pipeControlIt = find<typename FamilyType::PIPE_CONTROL *>(parsedCommands.begin(), parsedCommands.end());
    ASSERT_NE(parsedCommands.end(), pipeControlIt);
}

View File

@ -665,6 +665,7 @@ struct EnqueueAuxKernelTests : public EnqueueKernelTest {
Kernel *lastKernel = nullptr;
for (const auto &dispatchInfo : multiDispatchInfo) {
lastKernel = dispatchInfo.getKernel();
dispatchInfos.emplace_back(dispatchInfo);
}
dispatchAuxTranslationInputs.emplace_back(lastKernel, multiDispatchInfo.size(), memObjsForAuxTranslation, auxTranslationDirection);
}
@ -674,31 +675,20 @@ struct EnqueueAuxKernelTests : public EnqueueKernelTest {
CommandQueueHw<FamilyType>::waitUntilComplete(taskCountToWait, flushStampToWait, useQuickKmdSleep);
}
std::vector<DispatchInfo> dispatchInfos;
std::vector<std::tuple<Kernel *, size_t, MemObjsForAuxTranslation, AuxTranslationDirection>> dispatchAuxTranslationInputs;
uint32_t waitCalled = 0;
};
};
HWTEST_F(EnqueueAuxKernelTests, givenKernelWithRequiredAuxTranslationWhenEnqueuedThenGuardKernelWithAuxTranslations) {
HWTEST_F(EnqueueAuxKernelTests, givenKernelWithRequiredAuxTranslationAndWithoutArgumentsWhenEnqueuedThenNoGuardKernelWithAuxTranslations) {
MockKernelWithInternals mockKernel(*pDevice, context);
MyCmdQ<FamilyType> cmdQ(context, pDevice);
size_t gws[3] = {1, 0, 0};
mockKernel.mockKernel->auxTranslationRequired = true;
cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(2u, cmdQ.dispatchAuxTranslationInputs.size());
// before kernel
EXPECT_EQ(0u, std::get<size_t>(cmdQ.dispatchAuxTranslationInputs.at(0)));
EXPECT_EQ(AuxTranslationDirection::AuxToNonAux, std::get<AuxTranslationDirection>(cmdQ.dispatchAuxTranslationInputs.at(0)));
// after kernel
EXPECT_EQ(1u, std::get<size_t>(cmdQ.dispatchAuxTranslationInputs.at(1)));
EXPECT_EQ(AuxTranslationDirection::NonAuxToAux, std::get<AuxTranslationDirection>(cmdQ.dispatchAuxTranslationInputs.at(1)));
mockKernel.mockKernel->auxTranslationRequired = false;
cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(2u, cmdQ.dispatchAuxTranslationInputs.size()); // not changed
EXPECT_EQ(0u, cmdQ.dispatchAuxTranslationInputs.size());
}
HWTEST_F(EnqueueAuxKernelTests, givenMultipleArgsWhenAuxTranslationIsRequiredThenPickOnlyApplicableBuffers) {
@ -738,11 +728,20 @@ HWTEST_F(EnqueueAuxKernelTests, givenMultipleArgsWhenAuxTranslationIsRequiredThe
cmdQ.enqueueKernel(mockKernel.mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr);
EXPECT_EQ(2u, cmdQ.dispatchAuxTranslationInputs.size());
EXPECT_EQ(1u, std::get<MemObjsForAuxTranslation>(cmdQ.dispatchAuxTranslationInputs.at(0)).size()); // before kernel
EXPECT_EQ(1u, std::get<MemObjsForAuxTranslation>(cmdQ.dispatchAuxTranslationInputs.at(1)).size()); // after kernel
EXPECT_EQ(&buffer2, *std::get<MemObjsForAuxTranslation>(cmdQ.dispatchAuxTranslationInputs.at(0)).begin());
EXPECT_EQ(&buffer2, *std::get<MemObjsForAuxTranslation>(cmdQ.dispatchAuxTranslationInputs.at(1)).begin());
// Count the recorded dispatches flagged as requiring a PIPE_CONTROL.
// Iterates by const reference (as the recording loop in MyCmdQ does) so no
// DispatchInfo is copied just to read its flag.
uint32_t pipeControlCount = 0;
for (const auto &dispatchInfo : cmdQ.dispatchInfos) {
    if (dispatchInfo.isPipeControlRequired()) {
        ++pipeControlCount;
    }
}
EXPECT_EQ(4u, pipeControlCount);
}
HWTEST_F(EnqueueAuxKernelTests, givenKernelWithRequiredAuxTranslationWhenEnqueuedThenDispatchAuxTranslationBuiltin) {
@ -821,6 +820,14 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueAuxKernelTests, givenParentKernelWhenAuxTrans
EXPECT_EQ(GraphicsAllocation::AllocationType::BUFFER, buffer0.getGraphicsAllocation()->getAllocationType());
EXPECT_EQ(GraphicsAllocation::AllocationType::BUFFER_COMPRESSED, buffer1.getGraphicsAllocation()->getAllocationType());
EXPECT_EQ(GraphicsAllocation::AllocationType::BUFFER, buffer2.getGraphicsAllocation()->getAllocationType());
// Count the recorded dispatches flagged as requiring a PIPE_CONTROL.
// Iterates by const reference so no DispatchInfo is copied just to read its flag.
uint32_t pipeControlCount = 0;
for (const auto &dispatchInfo : cmdQ.dispatchInfos) {
    if (dispatchInfo.isPipeControlRequired()) {
        ++pipeControlCount;
    }
}
EXPECT_EQ(1u, pipeControlCount);
}
}

View File

@ -307,4 +307,21 @@ TEST_F(DispatchInfoTest, givenKernelWhenMultiDispatchInfoIsCreatedThenQueryParen
EXPECT_EQ(nullptr, multiDispatchInfo.peekParentKernel());
EXPECT_EQ(builtInKernel.get(), multiDispatchInfo.peekMainKernel());
}
// Reverse-iterator accessors of MultiDispatchInfo: the explicitly spelled-out types
// verify the declared return types, and base() ties each reverse iterator back to
// the matching forward boundary (rbegin/crbegin -> end, rend/crend -> begin).
{
MultiDispatchInfo multiDispatchInfo;
multiDispatchInfo.push(parentKernelDispatchInfo);
multiDispatchInfo.push(baseDispatchInfo);
multiDispatchInfo.push(builtInDispatchInfo);
std::reverse_iterator<DispatchInfo *> rend = multiDispatchInfo.rend();
std::reverse_iterator<const DispatchInfo *> crend = multiDispatchInfo.crend();
std::reverse_iterator<DispatchInfo *> rbegin = multiDispatchInfo.rbegin();
std::reverse_iterator<const DispatchInfo *> crbegin = multiDispatchInfo.crbegin();
// A reverse iterator's base() is the forward position it was constructed from.
EXPECT_EQ(rbegin.base(), multiDispatchInfo.end());
EXPECT_EQ(crbegin.base(), multiDispatchInfo.end());
EXPECT_EQ(rend.base(), multiDispatchInfo.begin());
EXPECT_EQ(crend.base(), multiDispatchInfo.begin());
}
}

View File

@ -1429,6 +1429,17 @@ TEST(StackVec, Clear) {
ASSERT_EQ(0U, v2.size());
}
// The reverse-iterator accessors of StackVec must wrap the forward boundaries:
// rend()/crend() are based on begin(), rbegin()/crbegin() on end().
TEST(StackVec, ReverseBeginningFunctions) {
    using SingleSlotVec = StackVec<int, 1>;
    SingleSlotVec vec;
    vec.push_back(5);

    // Mutable reverse iterators.
    ASSERT_EQ(vec.begin(), vec.rend().base());
    ASSERT_EQ(vec.end(), vec.rbegin().base());

    // Const reverse iterators.
    ASSERT_EQ(vec.begin(), vec.crend().base());
    ASSERT_EQ(vec.end(), vec.crbegin().base());
}
TEST(StackVec, ConstMemberFunctions) {
using VecType = StackVec<int, 3>;
VecType v;