[CodeGen] Extend Performance Counter to track per-scop information.

Previously, we generated a single performance counter shared by all scops.
Now, we generate that aggregate information as well as a per-scop
performance counter, which provides finer-grained information.

This patch needed a way to generate a unique name for a `Scop`.
The start region, end region, and function name combined provide a
unique `Scop` name. So, `Scop` has a new public API (`getEntryExitStr`)
that returns its start (entry) and end (exit) region names.

Differential Revision: https://reviews.llvm.org/D33723

llvm-svn: 304528
This commit is contained in:
Siddharth Bhat
2017-06-02 08:01:22 +00:00
parent af199153cc
commit 07bee290de
7 changed files with 212 additions and 34 deletions

View File

@@ -25,8 +25,10 @@ class PerfMonitor {
public:
/// Create a new performance monitor.
///
/// @param S The scop for which to generate fine-grained performance
/// monitoring information.
/// @param M The module for which to generate the performance monitor.
PerfMonitor(llvm::Module *M);
PerfMonitor(const Scop &S, llvm::Module *M);
/// Initialize the performance monitor.
///
@@ -48,12 +50,18 @@ private:
llvm::Module *M;
PollyIRBuilder Builder;
// The scop to profile against.
const Scop &S;
/// Indicates if performance profiling is supported on this architecture.
bool Supported;
/// The cycle counter at the beginning of the program execution.
llvm::Value *CyclesTotalStartPtr;
/// The total number of cycles spent in the current scop S.
llvm::Value *CyclesInCurrentScopPtr;
/// The total number of cycles spent within scops.
llvm::Value *CyclesInScopsPtr;
@@ -89,6 +97,12 @@ private:
/// into the module (or obtain references to them if they already exist).
void addGlobalVariables();
/// Add per-scop tracking to module.
///
/// Insert the global variable which is used to track the number of cycles
/// this scop runs.
void addScopCounter();
/// Get a reference to the intrinsic "i64 @llvm.x86.rdtscp(i8*)".
///
/// The rdtscp function returns the current value of the processor's
@@ -126,6 +140,12 @@ private:
/// This function finalizes the performance measurements and prints the
/// results to stdout. It is expected to be registered with 'atexit()'.
llvm::Function *insertFinalReporting();
/// Append Scop reporting data to "__polly_perf_final_reporting".
///
/// This function appends the current scop (S)'s information to the final
/// printing function.
void AppendScopReporting();
};
} // namespace polly

View File

@@ -2329,6 +2329,14 @@ public:
/// Check if the SCoP has been optimized by the scheduler.
bool isOptimized() const { return IsOptimized; }
/// Get the name of the entry and exit blocks of this Scop.
///
/// These along with the function name can uniquely identify a Scop.
///
/// @return std::pair whose first element is the entry name & second element
/// is the exit name.
std::pair<std::string, std::string> getEntryExitStr() const;
/// Get the name of this Scop.
std::string getNameStr() const;

View File

@@ -4125,6 +4125,12 @@ std::string Scop::getInvalidContextStr() const {
}
// Build the name of this Scop in the form "<entry>---<exit>", where both
// parts are the strings returned by getEntryExitStr().
std::string Scop::getNameStr() const {
  const std::pair<std::string, std::string> EntryExit = getEntryExitStr();
  return EntryExit.first + "---" + EntryExit.second;
}
std::pair<std::string, std::string> Scop::getEntryExitStr() const {
std::string ExitName, EntryName;
raw_string_ostream ExitStr(ExitName);
raw_string_ostream EntryStr(EntryName);
@@ -4138,7 +4144,7 @@ std::string Scop::getNameStr() const {
} else
ExitName = "FunctionExit";
return EntryName + "---" + ExitName;
return std::make_pair(EntryName, ExitName);
}
__isl_give isl_set *Scop::getContext() const { return isl_set_copy(Context); }

View File

@@ -184,7 +184,7 @@ static bool CodeGen(Scop &S, IslAstInfo &AI, LoopInfo &LI, DominatorTree &DT,
IslNodeBuilder NodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock);
if (PerfMonitoring) {
PerfMonitor P(EnteringBB->getParent()->getParent());
PerfMonitor P(S, EnteringBB->getParent()->getParent());
P.initialize();
P.insertRegionStart(SplitBlock->getTerminator());

View File

@@ -11,8 +11,10 @@
#include "polly/CodeGen/PerfMonitor.h"
#include "polly/CodeGen/RuntimeDebugBuilder.h"
#include "polly/ScopInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/Intrinsics.h"
#include <sstream>
using namespace llvm;
using namespace polly;
@@ -60,51 +62,73 @@ Function *PerfMonitor::getRDTSCP() {
return Intrinsic::getDeclaration(M, Intrinsic::x86_rdtscp);
}
PerfMonitor::PerfMonitor(Module *M) : M(M), Builder(M->getContext()) {
// Create a performance monitor for scop S that emits into module M.
// Profiling is flagged as supported only when the module's target triple
// names the x86_64 architecture.
PerfMonitor::PerfMonitor(const Scop &S, Module *M)
    : M(M), Builder(M->getContext()), S(S) {
  const Triple TargetTriple(M->getTargetTriple());
  Supported = (TargetTriple.getArch() == llvm::Triple::x86_64);
}
// Obtain the global variable called Name in module M and store it in
// *Location, creating the variable first if the module does not have it yet.
//
// @param M            The module to look the global up in / add it to.
// @param Name         The name of the global variable.
// @param InitialValue Initializer of a newly created global; its type is also
//                     used as the type of the global.
// @param Location     Receives the existing or newly created global.
//
// A newly created global gets weak linkage and the initial-exec TLS model.
//
// NOTE(review): the `true` passed as third GlobalVariable argument appears to
// mark the global as constant (the regression test expects `constant` in the
// emitted IR), although the counters are stored to at run time. It is left
// unchanged here -- confirm whether this is intended.
static void TryRegisterGlobal(Module *M, const char *Name,
                              Constant *InitialValue, Value **Location) {
  // Reuse the global if an earlier scop in this module already registered it.
  if (GlobalVariable *Existing = M->getGlobalVariable(Name)) {
    *Location = Existing;
    return;
  }

  *Location = new GlobalVariable(
      *M, InitialValue->getType(), true, GlobalValue::WeakAnyLinkage,
      InitialValue, Name, nullptr, GlobalVariable::InitialExecTLSModel);
}
// Compose the name of the per-scop performance counter for S. The name is
// built from the enclosing function's name plus the scop's entry and exit
// names, so that it is unique and usable as an LLVM name.
static std::string GetScopUniqueVarname(const Scop &S) {
  const std::pair<std::string, std::string> EntryExit = S.getEntryExitStr();

  std::string VarName = "__polly_perf_cycles_in_";
  VarName += std::string(S.getFunction().getName());
  VarName += "_from__";
  VarName += EntryExit.first;
  VarName += "__to__";
  VarName += EntryExit.second;
  return VarName;
}
// Register the cycle counter of the current scop S (or look it up if it
// already exists) and remember it in CyclesInCurrentScopPtr. The counter is a
// 64-bit global initialized to zero.
void PerfMonitor::addScopCounter() {
  const std::string CounterName = GetScopUniqueVarname(S);
  auto *InitialCycles = Builder.getInt64(0);
  TryRegisterGlobal(M, CounterName.c_str(), InitialCycles,
                    &CyclesInCurrentScopPtr);
}
// Register the module-level globals used by the performance monitor (or
// obtain them if they already exist): the total-start cycle counter, the
// "initialized" flag, the cycles-in-scops accumulator, the scop-start cycle
// counter and the rdtscp write location.
// NOTE(review): this span is rendered from a diff and shows the removed
// lambda-based calls next to the added TryRegisterGlobal(M, ...) calls.
void PerfMonitor::addGlobalVariables() {
auto TryRegisterGlobal = [=](const char *Name, Constant *InitialValue,
Value **Location) {
*Location = M->getGlobalVariable(Name);
if (!*Location)
*Location = new GlobalVariable(
*M, InitialValue->getType(), true, GlobalValue::WeakAnyLinkage,
InitialValue, Name, nullptr, GlobalVariable::InitialExecTLSModel);
};
TryRegisterGlobal("__polly_perf_cycles_total_start", Builder.getInt64(0),
TryRegisterGlobal(M, "__polly_perf_cycles_total_start", Builder.getInt64(0),
&CyclesTotalStartPtr);
TryRegisterGlobal("__polly_perf_initialized", Builder.getInt1(0),
TryRegisterGlobal(M, "__polly_perf_initialized", Builder.getInt1(0),
&AlreadyInitializedPtr);
TryRegisterGlobal("__polly_perf_cycles_in_scops", Builder.getInt64(0),
TryRegisterGlobal(M, "__polly_perf_cycles_in_scops", Builder.getInt64(0),
&CyclesInScopsPtr);
TryRegisterGlobal("__polly_perf_cycles_in_scop_start", Builder.getInt64(0),
TryRegisterGlobal(M, "__polly_perf_cycles_in_scop_start", Builder.getInt64(0),
&CyclesInScopStartPtr);
TryRegisterGlobal("__polly_perf_write_loation", Builder.getInt32(0),
TryRegisterGlobal(M, "__polly_perf_write_loation", Builder.getInt32(0),
&RDTSCPWriteLocation);
}
static const char *InitFunctionName = "__polly_perf_init";
static const char *FinalReportingFunctionName = "__polly_perf_final";
static BasicBlock *FinalStartBB = nullptr;
static ReturnInst *ReturnFromFinal = nullptr;
Function *PerfMonitor::insertFinalReporting() {
// Create new function.
GlobalValue::LinkageTypes Linkage = Function::WeakODRLinkage;
FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), {}, false);
Function *ExitFn =
Function::Create(Ty, Linkage, FinalReportingFunctionName, M);
BasicBlock *Start = BasicBlock::Create(M->getContext(), "start", ExitFn);
Builder.SetInsertPoint(Start);
FinalStartBB = BasicBlock::Create(M->getContext(), "start", ExitFn);
Builder.SetInsertPoint(FinalStartBB);
if (!Supported) {
RuntimeDebugBuilder::createCPUPrinter(
@@ -128,23 +152,42 @@ Function *PerfMonitor::insertFinalReporting() {
RuntimeDebugBuilder::createCPUPrinter(Builder, "Total: ", CyclesTotal, "\n");
RuntimeDebugBuilder::createCPUPrinter(Builder, "Scops: ", CyclesInScops,
"\n");
// Finalize function.
Builder.CreateRetVoid();
ReturnFromFinal = Builder.CreateRetVoid();
return ExitFn;
}
// Append the report for the current scop S to the final reporting function.
//
// The previously emitted return of the reporting block is removed, a volatile
// load of this scop's cycle counter and a print of
// "Scop(<function> |from: <entry> |to: <exit>): <cycles>" are emitted at the
// end of the block, and a fresh return is created and remembered so that the
// next scop can append its own report in the same way.
void PerfMonitor::AppendScopReporting() {
  // Both pointers start out as nullptr and ReturnFromFinal is only recorded
  // once insertFinalReporting() has emitted the trailing return. Without a
  // recorded block/return there is nothing to append to, and dereferencing
  // ReturnFromFinal below would crash.
  if (!FinalStartBB || !ReturnFromFinal)
    return;

  // Drop the old terminator and continue emitting at the end of the block.
  Builder.SetInsertPoint(FinalStartBB);
  ReturnFromFinal->eraseFromParent();

  Value *CyclesInCurrentScop =
      Builder.CreateLoad(this->CyclesInCurrentScopPtr, true);

  std::string EntryName, ExitName;
  std::tie(EntryName, ExitName) = S.getEntryExitStr();

  RuntimeDebugBuilder::createCPUPrinter(
      Builder, "Scop(", S.getFunction().getName(), " |from: ", EntryName,
      " |to: ", ExitName, "): ", CyclesInCurrentScop, "\n");

  // Re-terminate the block and remember the new return for later appends.
  ReturnFromFinal = Builder.CreateRetVoid();
}
static Function *FinalReporting = nullptr;
// Set up performance monitoring for the current scop: register the shared
// globals and this scop's counter, create the final-reporting and init
// functions (and register the init function as a global constructor) if
// FinalReporting has not been created yet, and append this scop's report.
// NOTE(review): this span is rendered from a diff and interleaves removed and
// added lines (e.g. the InitFunctionName lookup and the repeated InitFn
// lines).
void PerfMonitor::initialize() {
addGlobalVariables();
addScopCounter();
Function *F = M->getFunction(InitFunctionName);
if (F)
return;
// Ensure that we only add the final reporting function once.
// On later invocations, append to the reporting function.
if (!FinalReporting) {
FinalReporting = insertFinalReporting();
// initialize
Function *FinalReporting = insertFinalReporting();
Function *InitFn = insertInitFunction(FinalReporting);
addToGlobalConstructors(InitFn);
Function *InitFn = insertInitFunction(FinalReporting);
addToGlobalConstructors(InitFn);
}
AppendScopReporting();
}
Function *PerfMonitor::insertInitFunction(Function *FinalReporting) {
@@ -223,4 +266,8 @@ void PerfMonitor::insertRegionEnd(Instruction *InsertBefore) {
Value *CyclesInScops = Builder.CreateLoad(CyclesInScopsPtr, true);
CyclesInScops = Builder.CreateAdd(CyclesInScops, CyclesInScop);
Builder.CreateStore(CyclesInScops, CyclesInScopsPtr, true);
Value *CyclesInCurrentScop = Builder.CreateLoad(CyclesInCurrentScopPtr, true);
CyclesInCurrentScop = Builder.CreateAdd(CyclesInCurrentScop, CyclesInScop);
Builder.CreateStore(CyclesInCurrentScop, CyclesInCurrentScopPtr, true);
}

View File

@@ -49,7 +49,6 @@ return:
; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: %9 = add i64 %8, %7
; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: br label %return
; CHECK: define weak_odr void @__polly_perf_final() {
@@ -66,8 +65,6 @@ return:
; CHECK-NEXT: %9 = call i32 @fflush(i8* null)
; CHECK-NEXT: %10 = call i32 (...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @9, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @7, i32 0, i32 0), i64 %3, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @8, i32 0, i32 0))
; CHECK-NEXT: %11 = call i32 @fflush(i8* null)
; CHECK-NEXT: ret void
; CHECK-NEXT: }
; CHECK: define weak_odr void @__polly_perf_init() {

View File

@@ -0,0 +1,100 @@
; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
; RUN: -S < %s | FileCheck %s
; void f(long A[], long N) {
; long i;
; if (true)
; for (i = 0; i < N; ++i)
; A[i] = i;
; }
; void g(long A[], long N) {
; long i;
; if (true)
; for (i = 0; i < N; ++i)
; A[i] = i;
; }
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
define void @f(i64* %A, i64 %N) nounwind {
entry:
fence seq_cst
br label %next
next:
br i1 true, label %for.i, label %return
for.i:
%indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
%scevgep = getelementptr i64, i64* %A, i64 %indvar
store i64 %indvar, i64* %scevgep
%indvar.next = add nsw i64 %indvar, 1
%exitcond = icmp eq i64 %indvar.next, %N
br i1 %exitcond, label %return, label %for.i
return:
fence seq_cst
ret void
}
define void @g(i64* %A, i64 %N) nounwind {
entry:
fence seq_cst
br label %next
next:
br i1 true, label %for.i, label %return
for.i:
%indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
%scevgep = getelementptr i64, i64* %A, i64 %indvar
store i64 %indvar, i64* %scevgep
%indvar.next = add nsw i64 %indvar, 1
%exitcond = icmp eq i64 %indvar.next, %N
br i1 %exitcond, label %return, label %for.i
return:
fence seq_cst
ret void
}
; Declaration of globals
; CHECK: @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old" = weak thread_local(initialexec) constant i64 0
; CHECK: @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old" = weak thread_local(initialexec) constant i64 0
; Bumping up counter in f
; CHECK: polly.merge_new_and_old: ; preds = %polly.exiting, %return.region_exiting
; CHECK-NEXT: %5 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start
; CHECK-NEXT: %6 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
; CHECK-NEXT: %7 = sub i64 %6, %5
; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: %9 = add i64 %8, %7
; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: %10 = load volatile i64, i64* @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old"
; CHECK-NEXT: %11 = add i64 %10, %7
; CHECK-NEXT: store volatile i64 %11, i64* @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old"
; CHECK-NEXT: br label %return
; Bumping up counter in g
; CHECK: polly.merge_new_and_old: ; preds = %polly.exiting, %return.region_exiting
; CHECK-NEXT: %5 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start
; CHECK-NEXT: %6 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
; CHECK-NEXT: %7 = sub i64 %6, %5
; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: %9 = add i64 %8, %7
; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops
; CHECK-NEXT: %10 = load volatile i64, i64* @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old"
; CHECK-NEXT: %11 = add i64 %10, %7
; CHECK-NEXT: store volatile i64 %11, i64* @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old"
; CHECK-NEXT: br label %return
; Final reporting prints
; CHECK: %12 = load volatile i64, i64* @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old"
; CHECK-NEXT: %13 = call i32 (...) @printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @18, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @10, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @11, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([9 x i8], [9 x i8] addrspace(4)* @12, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @13, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([7 x i8], [7 x i8] addrspace(4)* @14, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([25 x i8], [25 x i8] addrspace(4)* @15, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(4)* @16, i32 0, i32 0), i64 %12, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @17, i32 0, i32 0))
; CHECK-NEXT: %14 = call i32 @fflush(i8* null)
; CHECK-NEXT: %15 = load volatile i64, i64* @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old"
; CHECK-NEXT: %16 = call i32 (...) @printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @27, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @19, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @20, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([9 x i8], [9 x i8] addrspace(4)* @21, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @22, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([7 x i8], [7 x i8] addrspace(4)* @23, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([25 x i8], [25 x i8] addrspace(4)* @24, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(4)* @25, i32 0, i32 0), i64 %15, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @26, i32 0, i32 0))
; CHECK-NEXT: %17 = call i32 @fflush(i8* null)
; CHECK-NEXT: ret void