mirror of
https://github.com/intel/llvm.git
synced 2026-02-02 02:00:03 +08:00
[CodeGen] Extend Performance Counter to track per-scop information.
Previously, we generated a single performance counter shared by all scops. Now we generate both the old aggregate information and a per-scop performance counter that provides finer-grained information. This patch needed a way to generate a unique name for a `Scop`. The entry block name, exit block name, and function name combined provide a unique `Scop` name, so `Scop` has a new public API that returns its entry and exit block names. Differential Revision: https://reviews.llvm.org/D33723 llvm-svn: 304528
This commit is contained in:
@@ -25,8 +25,10 @@ class PerfMonitor {
|
||||
public:
|
||||
/// Create a new performance monitor.
|
||||
///
|
||||
/// @param S The scop for which to generate fine-grained performance
|
||||
/// monitoring information.
|
||||
/// @param M The module for which to generate the performance monitor.
|
||||
PerfMonitor(llvm::Module *M);
|
||||
PerfMonitor(const Scop &S, llvm::Module *M);
|
||||
|
||||
/// Initialize the performance monitor.
|
||||
///
|
||||
@@ -48,12 +50,18 @@ private:
|
||||
llvm::Module *M;
|
||||
PollyIRBuilder Builder;
|
||||
|
||||
// The scop to profile against.
|
||||
const Scop &S;
|
||||
|
||||
/// Indicates if performance profiling is supported on this architecture.
|
||||
bool Supported;
|
||||
|
||||
/// The cycle counter at the beginning of the program execution.
|
||||
llvm::Value *CyclesTotalStartPtr;
|
||||
|
||||
/// The total number of cycles spent in the current scop S.
|
||||
llvm::Value *CyclesInCurrentScopPtr;
|
||||
|
||||
/// The total number of cycles spent within scops.
|
||||
llvm::Value *CyclesInScopsPtr;
|
||||
|
||||
@@ -89,6 +97,12 @@ private:
|
||||
/// into the module (or obtain references to them if they already exist).
|
||||
void addGlobalVariables();
|
||||
|
||||
/// Add per-scop tracking to module.
|
||||
///
|
||||
/// Insert the global variable which is used to track the number of cycles
|
||||
/// this scop runs.
|
||||
void addScopCounter();
|
||||
|
||||
/// Get a reference to the intrinsic "i64 @llvm.x86.rdtscp(i8*)".
|
||||
///
|
||||
/// The rdtscp function returns the current value of the processor's
|
||||
@@ -126,6 +140,12 @@ private:
|
||||
/// This function finalizes the performance measurements and prints the
|
||||
/// results to stdout. It is expected to be registered with 'atexit()'.
|
||||
llvm::Function *insertFinalReporting();
|
||||
|
||||
/// Append Scop reporting data to "__polly_perf_final".
|
||||
///
|
||||
/// This function appends the current scop (S)'s information to the final
|
||||
/// printing function.
|
||||
void AppendScopReporting();
|
||||
};
|
||||
} // namespace polly
|
||||
|
||||
|
||||
@@ -2329,6 +2329,14 @@ public:
|
||||
/// Check if the SCoP has been optimized by the scheduler.
|
||||
bool isOptimized() const { return IsOptimized; }
|
||||
|
||||
/// Get the name of the entry and exit blocks of this Scop.
|
||||
///
|
||||
/// These along with the function name can uniquely identify a Scop.
|
||||
///
|
||||
/// @return std::pair whose first element is the entry name & second element
|
||||
/// is the exit name.
|
||||
std::pair<std::string, std::string> getEntryExitStr() const;
|
||||
|
||||
/// Get the name of this Scop.
|
||||
std::string getNameStr() const;
|
||||
|
||||
|
||||
@@ -4125,6 +4125,12 @@ std::string Scop::getInvalidContextStr() const {
|
||||
}
|
||||
|
||||
std::string Scop::getNameStr() const {
  // The scop name is its entry name and exit name joined by "---".
  const std::pair<std::string, std::string> EntryExit = getEntryExitStr();
  return EntryExit.first + "---" + EntryExit.second;
}
|
||||
|
||||
std::pair<std::string, std::string> Scop::getEntryExitStr() const {
|
||||
std::string ExitName, EntryName;
|
||||
raw_string_ostream ExitStr(ExitName);
|
||||
raw_string_ostream EntryStr(EntryName);
|
||||
@@ -4138,7 +4144,7 @@ std::string Scop::getNameStr() const {
|
||||
} else
|
||||
ExitName = "FunctionExit";
|
||||
|
||||
return EntryName + "---" + ExitName;
|
||||
return std::make_pair(EntryName, ExitName);
|
||||
}
|
||||
|
||||
__isl_give isl_set *Scop::getContext() const { return isl_set_copy(Context); }
|
||||
|
||||
@@ -184,7 +184,7 @@ static bool CodeGen(Scop &S, IslAstInfo &AI, LoopInfo &LI, DominatorTree &DT,
|
||||
IslNodeBuilder NodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock);
|
||||
|
||||
if (PerfMonitoring) {
|
||||
PerfMonitor P(EnteringBB->getParent()->getParent());
|
||||
PerfMonitor P(S, EnteringBB->getParent()->getParent());
|
||||
P.initialize();
|
||||
P.insertRegionStart(SplitBlock->getTerminator());
|
||||
|
||||
|
||||
@@ -11,8 +11,10 @@
|
||||
|
||||
#include "polly/CodeGen/PerfMonitor.h"
|
||||
#include "polly/CodeGen/RuntimeDebugBuilder.h"
|
||||
#include "polly/ScopInfo.h"
|
||||
#include "llvm/ADT/Triple.h"
|
||||
#include "llvm/IR/Intrinsics.h"
|
||||
#include <sstream>
|
||||
|
||||
using namespace llvm;
|
||||
using namespace polly;
|
||||
@@ -60,51 +62,73 @@ Function *PerfMonitor::getRDTSCP() {
|
||||
return Intrinsic::getDeclaration(M, Intrinsic::x86_rdtscp);
|
||||
}
|
||||
|
||||
PerfMonitor::PerfMonitor(Module *M) : M(M), Builder(M->getContext()) {
|
||||
PerfMonitor::PerfMonitor(const Scop &S, Module *M)
|
||||
: M(M), Builder(M->getContext()), S(S) {
|
||||
if (Triple(M->getTargetTriple()).getArch() == llvm::Triple::x86_64)
|
||||
Supported = true;
|
||||
else
|
||||
Supported = false;
|
||||
}
|
||||
|
||||
/// Obtain the global variable @p Name of module @p M, creating it if needed.
///
/// @param M            The module that is searched and, if necessary, extended.
/// @param Name         The name of the global variable.
/// @param InitialValue The initializer of a newly created global; its type is
///                     also used as the type of the new global.
/// @param Location     Out-parameter that receives the existing or the newly
///                     created global variable.
static void TryRegisterGlobal(Module *M, const char *Name,
                              Constant *InitialValue, Value **Location) {
  *Location = M->getGlobalVariable(Name);

  // NOTE(review): the literal `true` below is presumably the isConstant flag
  // of the GlobalVariable constructor, yet the generated code stores to these
  // globals -- confirm this is intended before changing it, since the test
  // expectations currently match a `constant` global.
  if (!*Location)
    *Location = new GlobalVariable(
        *M, InitialValue->getType(), true, GlobalValue::WeakAnyLinkage,
        InitialValue, Name, nullptr, GlobalVariable::InitialExecTLSModel);
}
|
||||
|
||||
// Generate a unique name that is usable as a LLVM name for a scop to name its
|
||||
// performance counter.
|
||||
static std::string GetScopUniqueVarname(const Scop &S) {
|
||||
std::stringstream Name;
|
||||
std::string EntryString, ExitString;
|
||||
std::tie(EntryString, ExitString) = S.getEntryExitStr();
|
||||
|
||||
Name << "__polly_perf_cycles_in_" << std::string(S.getFunction().getName())
|
||||
<< "_from__" << EntryString << "__to__" << ExitString;
|
||||
return Name.str();
|
||||
}
|
||||
|
||||
void PerfMonitor::addScopCounter() {
|
||||
const std::string varname = GetScopUniqueVarname(S);
|
||||
TryRegisterGlobal(M, varname.c_str(), Builder.getInt64(0),
|
||||
&CyclesInCurrentScopPtr);
|
||||
}
|
||||
|
||||
void PerfMonitor::addGlobalVariables() {
|
||||
auto TryRegisterGlobal = [=](const char *Name, Constant *InitialValue,
|
||||
Value **Location) {
|
||||
*Location = M->getGlobalVariable(Name);
|
||||
|
||||
if (!*Location)
|
||||
*Location = new GlobalVariable(
|
||||
*M, InitialValue->getType(), true, GlobalValue::WeakAnyLinkage,
|
||||
InitialValue, Name, nullptr, GlobalVariable::InitialExecTLSModel);
|
||||
};
|
||||
|
||||
TryRegisterGlobal("__polly_perf_cycles_total_start", Builder.getInt64(0),
|
||||
TryRegisterGlobal(M, "__polly_perf_cycles_total_start", Builder.getInt64(0),
|
||||
&CyclesTotalStartPtr);
|
||||
|
||||
TryRegisterGlobal("__polly_perf_initialized", Builder.getInt1(0),
|
||||
TryRegisterGlobal(M, "__polly_perf_initialized", Builder.getInt1(0),
|
||||
&AlreadyInitializedPtr);
|
||||
|
||||
TryRegisterGlobal("__polly_perf_cycles_in_scops", Builder.getInt64(0),
|
||||
TryRegisterGlobal(M, "__polly_perf_cycles_in_scops", Builder.getInt64(0),
|
||||
&CyclesInScopsPtr);
|
||||
|
||||
TryRegisterGlobal("__polly_perf_cycles_in_scop_start", Builder.getInt64(0),
|
||||
TryRegisterGlobal(M, "__polly_perf_cycles_in_scop_start", Builder.getInt64(0),
|
||||
&CyclesInScopStartPtr);
|
||||
|
||||
TryRegisterGlobal("__polly_perf_write_loation", Builder.getInt32(0),
|
||||
TryRegisterGlobal(M, "__polly_perf_write_loation", Builder.getInt32(0),
|
||||
&RDTSCPWriteLocation);
|
||||
}
|
||||
|
||||
static const char *InitFunctionName = "__polly_perf_init";
|
||||
static const char *FinalReportingFunctionName = "__polly_perf_final";
|
||||
|
||||
static BasicBlock *FinalStartBB = nullptr;
|
||||
static ReturnInst *ReturnFromFinal = nullptr;
|
||||
|
||||
Function *PerfMonitor::insertFinalReporting() {
|
||||
// Create new function.
|
||||
GlobalValue::LinkageTypes Linkage = Function::WeakODRLinkage;
|
||||
FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), {}, false);
|
||||
Function *ExitFn =
|
||||
Function::Create(Ty, Linkage, FinalReportingFunctionName, M);
|
||||
BasicBlock *Start = BasicBlock::Create(M->getContext(), "start", ExitFn);
|
||||
Builder.SetInsertPoint(Start);
|
||||
FinalStartBB = BasicBlock::Create(M->getContext(), "start", ExitFn);
|
||||
Builder.SetInsertPoint(FinalStartBB);
|
||||
|
||||
if (!Supported) {
|
||||
RuntimeDebugBuilder::createCPUPrinter(
|
||||
@@ -128,23 +152,42 @@ Function *PerfMonitor::insertFinalReporting() {
|
||||
RuntimeDebugBuilder::createCPUPrinter(Builder, "Total: ", CyclesTotal, "\n");
|
||||
RuntimeDebugBuilder::createCPUPrinter(Builder, "Scops: ", CyclesInScops,
|
||||
"\n");
|
||||
|
||||
// Finalize function.
|
||||
Builder.CreateRetVoid();
|
||||
ReturnFromFinal = Builder.CreateRetVoid();
|
||||
return ExitFn;
|
||||
}
|
||||
|
||||
void PerfMonitor::AppendScopReporting() {
|
||||
Builder.SetInsertPoint(FinalStartBB);
|
||||
ReturnFromFinal->eraseFromParent();
|
||||
|
||||
Value *CyclesInCurrentScop =
|
||||
Builder.CreateLoad(this->CyclesInCurrentScopPtr, true);
|
||||
std::string EntryName, ExitName;
|
||||
std::tie(EntryName, ExitName) = S.getEntryExitStr();
|
||||
|
||||
RuntimeDebugBuilder::createCPUPrinter(
|
||||
Builder, "Scop(", S.getFunction().getName(), " |from: ", EntryName,
|
||||
" |to: ", ExitName, "): ", CyclesInCurrentScop, "\n");
|
||||
|
||||
ReturnFromFinal = Builder.CreateRetVoid();
|
||||
}
|
||||
|
||||
static Function *FinalReporting = nullptr;
|
||||
|
||||
void PerfMonitor::initialize() {
|
||||
addGlobalVariables();
|
||||
addScopCounter();
|
||||
|
||||
Function *F = M->getFunction(InitFunctionName);
|
||||
if (F)
|
||||
return;
|
||||
// Ensure that we only add the final reporting function once.
|
||||
// On later invocations, append to the reporting function.
|
||||
if (!FinalReporting) {
|
||||
FinalReporting = insertFinalReporting();
|
||||
|
||||
// initialize
|
||||
Function *FinalReporting = insertFinalReporting();
|
||||
Function *InitFn = insertInitFunction(FinalReporting);
|
||||
addToGlobalConstructors(InitFn);
|
||||
Function *InitFn = insertInitFunction(FinalReporting);
|
||||
addToGlobalConstructors(InitFn);
|
||||
}
|
||||
|
||||
AppendScopReporting();
|
||||
}
|
||||
|
||||
Function *PerfMonitor::insertInitFunction(Function *FinalReporting) {
|
||||
@@ -223,4 +266,8 @@ void PerfMonitor::insertRegionEnd(Instruction *InsertBefore) {
|
||||
Value *CyclesInScops = Builder.CreateLoad(CyclesInScopsPtr, true);
|
||||
CyclesInScops = Builder.CreateAdd(CyclesInScops, CyclesInScop);
|
||||
Builder.CreateStore(CyclesInScops, CyclesInScopsPtr, true);
|
||||
|
||||
Value *CyclesInCurrentScop = Builder.CreateLoad(CyclesInCurrentScopPtr, true);
|
||||
CyclesInCurrentScop = Builder.CreateAdd(CyclesInCurrentScop, CyclesInScop);
|
||||
Builder.CreateStore(CyclesInCurrentScop, CyclesInCurrentScopPtr, true);
|
||||
}
|
||||
|
||||
@@ -49,7 +49,6 @@ return:
|
||||
; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops
|
||||
; CHECK-NEXT: %9 = add i64 %8, %7
|
||||
; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops
|
||||
; CHECK-NEXT: br label %return
|
||||
|
||||
|
||||
; CHECK: define weak_odr void @__polly_perf_final() {
|
||||
@@ -66,8 +65,6 @@ return:
|
||||
; CHECK-NEXT: %9 = call i32 @fflush(i8* null)
|
||||
; CHECK-NEXT: %10 = call i32 (...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @9, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(4)* @7, i32 0, i32 0), i64 %3, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @8, i32 0, i32 0))
|
||||
; CHECK-NEXT: %11 = call i32 @fflush(i8* null)
|
||||
; CHECK-NEXT: ret void
|
||||
; CHECK-NEXT: }
|
||||
|
||||
|
||||
; CHECK: define weak_odr void @__polly_perf_init() {
|
||||
|
||||
100
polly/test/Isl/CodeGen/perf_monitoring_per_scop.ll
Normal file
100
polly/test/Isl/CodeGen/perf_monitoring_per_scop.ll
Normal file
@@ -0,0 +1,100 @@
|
||||
; RUN: opt %loadPolly -polly-codegen -polly-codegen-perf-monitoring \
|
||||
; RUN: -S < %s | FileCheck %s
|
||||
|
||||
; void f(long A[], long N) {
|
||||
; long i;
|
||||
; if (true)
|
||||
; for (i = 0; i < N; ++i)
|
||||
; A[i] = i;
|
||||
; }
|
||||
; void g(long A[], long N) {
|
||||
; long i;
|
||||
; if (true)
|
||||
; for (i = 0; i < N; ++i)
|
||||
; A[i] = i;
|
||||
; }
|
||||
|
||||
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
|
||||
target triple = "x86_64-unknown-linux-gnu"
|
||||
|
||||
define void @f(i64* %A, i64 %N) nounwind {
|
||||
entry:
|
||||
fence seq_cst
|
||||
br label %next
|
||||
|
||||
next:
|
||||
br i1 true, label %for.i, label %return
|
||||
|
||||
for.i:
|
||||
%indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
|
||||
%scevgep = getelementptr i64, i64* %A, i64 %indvar
|
||||
store i64 %indvar, i64* %scevgep
|
||||
%indvar.next = add nsw i64 %indvar, 1
|
||||
%exitcond = icmp eq i64 %indvar.next, %N
|
||||
br i1 %exitcond, label %return, label %for.i
|
||||
|
||||
return:
|
||||
fence seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
define void @g(i64* %A, i64 %N) nounwind {
|
||||
entry:
|
||||
fence seq_cst
|
||||
br label %next
|
||||
|
||||
next:
|
||||
br i1 true, label %for.i, label %return
|
||||
|
||||
for.i:
|
||||
%indvar = phi i64 [ 0, %next], [ %indvar.next, %for.i ]
|
||||
%scevgep = getelementptr i64, i64* %A, i64 %indvar
|
||||
store i64 %indvar, i64* %scevgep
|
||||
%indvar.next = add nsw i64 %indvar, 1
|
||||
%exitcond = icmp eq i64 %indvar.next, %N
|
||||
br i1 %exitcond, label %return, label %for.i
|
||||
|
||||
return:
|
||||
fence seq_cst
|
||||
ret void
|
||||
}
|
||||
|
||||
; Declaration of globals
|
||||
; CHECK: @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old" = weak thread_local(initialexec) constant i64 0
|
||||
; CHECK: @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old" = weak thread_local(initialexec) constant i64 0
|
||||
|
||||
; Bumping up counter in f
|
||||
; CHECK: polly.merge_new_and_old: ; preds = %polly.exiting, %return.region_exiting
|
||||
; CHECK-NEXT: %5 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start
|
||||
; CHECK-NEXT: %6 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
|
||||
; CHECK-NEXT: %7 = sub i64 %6, %5
|
||||
; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops
|
||||
; CHECK-NEXT: %9 = add i64 %8, %7
|
||||
; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops
|
||||
; CHECK-NEXT: %10 = load volatile i64, i64* @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old"
|
||||
; CHECK-NEXT: %11 = add i64 %10, %7
|
||||
; CHECK-NEXT: store volatile i64 %11, i64* @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old"
|
||||
; CHECK-NEXT: br label %return
|
||||
|
||||
; Bumping up counter in g
|
||||
; CHECK: polly.merge_new_and_old: ; preds = %polly.exiting, %return.region_exiting
|
||||
; CHECK-NEXT: %5 = load volatile i64, i64* @__polly_perf_cycles_in_scop_start
|
||||
; CHECK-NEXT: %6 = call i64 @llvm.x86.rdtscp(i8* bitcast (i32* @__polly_perf_write_loation to i8*))
|
||||
; CHECK-NEXT: %7 = sub i64 %6, %5
|
||||
; CHECK-NEXT: %8 = load volatile i64, i64* @__polly_perf_cycles_in_scops
|
||||
; CHECK-NEXT: %9 = add i64 %8, %7
|
||||
; CHECK-NEXT: store volatile i64 %9, i64* @__polly_perf_cycles_in_scops
|
||||
; CHECK-NEXT: %10 = load volatile i64, i64* @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old"
|
||||
; CHECK-NEXT: %11 = add i64 %10, %7
|
||||
; CHECK-NEXT: store volatile i64 %11, i64* @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old"
|
||||
; CHECK-NEXT: br label %return
|
||||
|
||||
; Final reporting prints
|
||||
; CHECK: %12 = load volatile i64, i64* @"__polly_perf_cycles_in_f_from__%next__to__%polly.merge_new_and_old"
|
||||
; CHECK-NEXT: %13 = call i32 (...) @printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @18, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @10, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @11, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([9 x i8], [9 x i8] addrspace(4)* @12, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @13, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([7 x i8], [7 x i8] addrspace(4)* @14, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([25 x i8], [25 x i8] addrspace(4)* @15, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(4)* @16, i32 0, i32 0), i64 %12, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @17, i32 0, i32 0))
|
||||
; CHECK-NEXT: %14 = call i32 @fflush(i8* null)
|
||||
; CHECK-NEXT: %15 = load volatile i64, i64* @"__polly_perf_cycles_in_g_from__%next__to__%polly.merge_new_and_old"
|
||||
; CHECK-NEXT: %16 = call i32 (...) @printf(i8* getelementptr inbounds ([20 x i8], [20 x i8]* @27, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @19, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @20, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([9 x i8], [9 x i8] addrspace(4)* @21, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @22, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([7 x i8], [7 x i8] addrspace(4)* @23, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([25 x i8], [25 x i8] addrspace(4)* @24, i32 0, i32 0), i8 addrspace(4)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(4)* @25, i32 0, i32 0), i64 %15, i8 addrspace(4)* getelementptr inbounds ([2 x i8], [2 x i8] addrspace(4)* @26, i32 0, i32 0))
|
||||
; CHECK-NEXT: %17 = call i32 @fflush(i8* null)
|
||||
; CHECK-NEXT: ret void
|
||||
Reference in New Issue
Block a user