mirror of
https://github.com/intel/compute-runtime.git
synced 2025-11-10 05:49:51 +08:00
Don't use packets without profiling data to calculate kernel duration
Change-Id: I710348835f8884a3b244502f53ff4e4980441654 Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
sys_ocldev
parent
77f88f9a96
commit
44a6d70ced
@@ -519,6 +519,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForCacheFlush(Surface **surfaces,
|
||||
uint64_t postSyncAddress = 0;
|
||||
if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacketNodeForPostSync = timestampPacketContainer->peekNodes().at(0);
|
||||
timestampPacketNodeForPostSync->setProfilingCapable(false);
|
||||
postSyncAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNodeForPostSync);
|
||||
}
|
||||
|
||||
|
||||
@@ -143,6 +143,7 @@ void HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
uint64_t postSyncAddress = 0;
|
||||
if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacketNodeForPostSync = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
|
||||
timestampPacketNodeForPostSync->setProfilingCapable(false);
|
||||
postSyncAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNodeForPostSync);
|
||||
}
|
||||
HardwareCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(commandStream, commandQueue, mainKernel, postSyncAddress);
|
||||
|
||||
@@ -256,9 +256,11 @@ bool Event::calcProfilingData() {
|
||||
|
||||
if (DebugManager.flags.PrintTimestampPacketContents.get()) {
|
||||
for (auto i = 0u; i < timestamps.size(); i++) {
|
||||
std::cout << "Timestamp " << i << ", "
|
||||
<< "profiling capable: " << timestamps[i]->isProfilingCapable() << ", ";
|
||||
for (auto j = 0u; j < timestamps[i]->tagForCpuAccess->packetsUsed; j++) {
|
||||
const auto &packet = timestamps[i]->tagForCpuAccess->packets[j];
|
||||
std::cout << "Timestamp " << i << ", packet " << j << ": "
|
||||
std::cout << "packet " << j << ": "
|
||||
<< "global start: " << packet.globalStart << ", "
|
||||
<< "global end: " << packet.globalEnd << ", "
|
||||
<< "context start: " << packet.contextStart << ", "
|
||||
@@ -271,6 +273,9 @@ bool Event::calcProfilingData() {
|
||||
uint64_t globalEndTS = timestamps[0]->tagForCpuAccess->packets[0].globalEnd;
|
||||
|
||||
for (const auto &timestamp : timestamps) {
|
||||
if (!timestamp->isProfilingCapable()) {
|
||||
continue;
|
||||
}
|
||||
for (auto i = 0u; i < timestamp->tagForCpuAccess->packetsUsed; ++i) {
|
||||
const auto &packet = timestamp->tagForCpuAccess->packets[i];
|
||||
if (globalStartTS > packet.globalStart) {
|
||||
|
||||
@@ -1123,6 +1123,30 @@ TEST_F(ProfilingTimestampPacketsTest, givenMultiOsContextCapableSetToTrueWhenCal
|
||||
EXPECT_EQ(350u, ev->getEndTimeStamp());
|
||||
}
|
||||
|
||||
// Checks that Event::calcProfilingData() ignores a timestamp node that was
// flagged as not profiling capable when it derives the event start/end.
TEST_F(ProfilingTimestampPacketsTest, givenTimestampPacketWithoutProfilingDataWhenCalculatingThenDontUseThatPacket) {
    // Node 0 stays profiling capable; its global range is the expected result.
    int capableGlobalStart = 20;
    int capableGlobalEnd = 51;
    int capableContextStart = 21;
    int capableContextEnd = 50;

    // Node 1 begins one tick earlier and finishes one tick later than node 0,
    // so the reported range would differ if this node were taken into account.
    int ignoredGlobalStart = capableGlobalStart - 1;
    int ignoredGlobalEnd = capableGlobalEnd + 1;
    int ignoredContextStart = capableContextStart - 1;
    int ignoredContextEnd = capableContextEnd + 1;

    addTimestampNodeMultiOsContext(&capableGlobalStart, &capableGlobalEnd, &capableContextStart, &capableContextEnd, 1);
    addTimestampNodeMultiOsContext(&ignoredGlobalStart, &ignoredGlobalEnd, &ignoredContextStart, &ignoredContextEnd, 1);

    auto &mockDevice = reinterpret_cast<MockDevice &>(cmdQ->getDevice());
    mockDevice.getUltCommandStreamReceiver<DEFAULT_TEST_FAMILY_NAME>().multiOsContextCapable = true;

    // Exclude the second node from profiling calculations.
    ev->timestampPacketContainer->peekNodes()[1]->setProfilingCapable(false);

    ev->calcProfilingData();

    const auto expectedStart = static_cast<uint64_t>(capableGlobalStart);
    const auto expectedEnd = static_cast<uint64_t>(capableGlobalEnd);
    EXPECT_EQ(expectedStart, ev->getStartTimeStamp());
    EXPECT_EQ(expectedEnd, ev->getEndTimeStamp());
}
|
||||
|
||||
TEST_F(ProfilingTimestampPacketsTest, givenPrintTimestampPacketContentsSetWhenCalcProfilingDataThenTimeStampsArePrinted) {
|
||||
DebugManagerStateRestore restorer;
|
||||
DebugManager.flags.PrintTimestampPacketContents.set(true);
|
||||
@@ -1148,8 +1172,10 @@ TEST_F(ProfilingTimestampPacketsTest, givenPrintTimestampPacketContentsSetWhenCa
|
||||
|
||||
std::string output = testing::internal::GetCapturedStdout();
|
||||
std::stringstream expected;
|
||||
|
||||
expected << "Timestamp 0, profiling capable: " << ev->timestampPacketContainer->peekNodes()[0]->isProfilingCapable() << ", ";
|
||||
for (int i = 0; i < 16; i++) {
|
||||
expected << "Timestamp 0, packet " << i << ": "
|
||||
expected << "packet " << i << ": "
|
||||
<< "global start: " << globalStart[i] << ", "
|
||||
<< "global end: " << globalEnd[i] << ", "
|
||||
<< "context start: " << contextStart[i] << ", "
|
||||
|
||||
@@ -310,10 +310,12 @@ TEST_F(TagAllocatorTest, whenNewTagIsTakenThenItIsInitialized) {
|
||||
MockTagAllocator<TimeStamps> tagAllocator(memoryManager, 1, 2, deviceBitfield);
|
||||
tagAllocator.getFreeTagsHead()->tagForCpuAccess->start = 3;
|
||||
tagAllocator.getFreeTagsHead()->tagForCpuAccess->end = 4;
|
||||
tagAllocator.getFreeTagsHead()->setProfilingCapable(false);
|
||||
|
||||
auto node = tagAllocator.getTag();
|
||||
EXPECT_EQ(1u, node->tagForCpuAccess->start);
|
||||
EXPECT_EQ(2u, node->tagForCpuAccess->end);
|
||||
EXPECT_TRUE(node->isProfilingCapable());
|
||||
}
|
||||
|
||||
TEST_F(TagAllocatorTest, givenMultipleReferencesOnTagWhenReleasingThenReturnWhenAllRefCountsAreReleased) {
|
||||
|
||||
@@ -46,11 +46,16 @@ struct TagNode : public IDNode<TagNode<TagType>>, NonCopyableOrMovableClass {
|
||||
doNotReleaseNodes = doNotRelease;
|
||||
}
|
||||
|
||||
// Stores the flag that isProfilingCapable() reports for this node.
void setProfilingCapable(bool capable) {
    profilingCapable = capable;
}
|
||||
|
||||
// Returns the flag last stored by setProfilingCapable() (true by default).
bool isProfilingCapable() const {
    return profilingCapable;
}
|
||||
|
||||
// Atomically bumps the implicit CPU dependency counter by one.
void incImplicitCpuDependenciesCount() {
    implicitCpuDependenciesCount.fetch_add(1);
}
|
||||
|
||||
void initialize() {
|
||||
tagForCpuAccess->initialize();
|
||||
implicitCpuDependenciesCount.store(0);
|
||||
setProfilingCapable(true);
|
||||
}
|
||||
|
||||
// Returns the current value of the implicit CPU dependency counter.
uint32_t getImplicitCpuDependenciesCount() const {
    const uint32_t currentCount = implicitCpuDependenciesCount.load();
    return currentCount;
}
|
||||
@@ -64,6 +69,7 @@ struct TagNode : public IDNode<TagNode<TagType>>, NonCopyableOrMovableClass {
|
||||
std::atomic<uint32_t> refCount{0};
|
||||
std::atomic<uint32_t> implicitCpuDependenciesCount{0};
|
||||
bool doNotReleaseNodes = false;
|
||||
bool profilingCapable = true;
|
||||
|
||||
template <typename TagType2>
|
||||
friend class TagAllocator;
|
||||
|
||||
Reference in New Issue
Block a user