[XRay][compiler-rt] Runtime changes to support custom event logging

Summary:
This change implements support for the custom event logging sleds and
intrinsics at runtime. For now it only supports handling the sleds in
x86_64, with the implementations for other architectures stubbed out to
do nothing.

NOTE: Work in progress, uploaded for exposition/exploration purposes.

Depends on D27503, D30018, and D33032.

Reviewers: echristo, javed.absar, timshen

Subscribers: mehdi_amini, nemanjai, llvm-commits

Differential Revision: https://reviews.llvm.org/D30630

llvm-svn: 302857
This commit is contained in:
Dean Michael Berris
2017-05-12 01:07:41 +00:00
parent a7bbe4481a
commit 29e16deb17
14 changed files with 429 additions and 140 deletions

View File

@@ -25,6 +25,7 @@ enum XRayEntryType {
EXIT = 1,
TAIL = 2,
LOG_ARGS_ENTRY = 3,
CUSTOM_EVENT = 4,
};
/// Provide a function to invoke for when instrumentation points are hit. This
@@ -64,6 +65,9 @@ extern int __xray_set_handler_arg1(void (*)(int32_t, XRayEntryType, uint64_t));
/// Returns 1 on success, 0 on error.
extern int __xray_remove_handler_arg1();
/// Provide a function to invoke when XRay encounters a custom event.
extern int __xray_set_customevent_handler(void (*entry)(void*, std::size_t));
enum XRayPatchingStatus {
NOT_INITIALIZED = 0,
SUCCESS = 1,

View File

@@ -18,8 +18,7 @@
#include <atomic>
#include <cassert>
extern "C" void __clear_cache(void* start, void* end);
extern "C" void __clear_cache(void *start, void *end);
namespace __xray {
@@ -86,8 +85,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
uint32_t(PatchOpcodes::PO_B32), std::memory_order_release);
}
__clear_cache(reinterpret_cast<char*>(FirstAddress),
reinterpret_cast<char*>(CurAddress));
__clear_cache(reinterpret_cast<char *>(FirstAddress),
reinterpret_cast<char *>(CurAddress));
return true;
}
@@ -107,6 +106,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit);
}
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySled &Sled)
XRAY_NEVER_INSTRUMENT { // FIXME: Implement in aarch64?
return false;
}
// FIXME: Maybe implement this better?
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }

View File

@@ -18,7 +18,7 @@
#include <atomic>
#include <cassert>
extern "C" void __clear_cache(void* start, void* end);
extern "C" void __clear_cache(void *start, void *end);
namespace __xray {
@@ -122,8 +122,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
uint32_t(PatchOpcodes::PO_B20), std::memory_order_release);
}
__clear_cache(reinterpret_cast<char*>(FirstAddress),
reinterpret_cast<char*>(CurAddress));
__clear_cache(reinterpret_cast<char *>(FirstAddress),
reinterpret_cast<char *>(CurAddress));
return true;
}
@@ -143,6 +143,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit);
}
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySled &Sled)
XRAY_NEVER_INSTRUMENT { // FIXME: Implement in arm?
return false;
}
// FIXME: Maybe implement this better?
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }

View File

@@ -29,6 +29,7 @@ struct alignas(16) MetadataRecord {
NewCPUId,
TSCWrap,
WalltimeMarker,
CustomEventMarker,
};
// Use 7 bits to identify this record type.
/* RecordKinds */ uint8_t RecordKind : 7;

View File

@@ -41,45 +41,12 @@ namespace __xray {
// Global BufferQueue.
std::shared_ptr<BufferQueue> BQ;
__sanitizer::atomic_sint32_t LoggingStatus = {
XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
__sanitizer::atomic_sint32_t LogFlushStatus = {
XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
std::unique_ptr<FDRLoggingOptions> FDROptions;
FDRLoggingOptions FDROptions;
XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax,
void *Options,
size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
if (OptionsSize != sizeof(FDRLoggingOptions))
return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load(
&LoggingStatus, __sanitizer::memory_order_acquire));
s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
if (!__sanitizer::atomic_compare_exchange_strong(
&LoggingStatus, &CurrentStatus,
XRayLogInitStatus::XRAY_LOG_INITIALIZING,
__sanitizer::memory_order_release))
return static_cast<XRayLogInitStatus>(CurrentStatus);
FDROptions.reset(new FDRLoggingOptions());
memcpy(FDROptions.get(), Options, OptionsSize);
bool Success = false;
BQ = std::make_shared<BufferQueue>(BufferSize, BufferMax, Success);
if (!Success) {
Report("BufferQueue init failed.\n");
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
}
// Install the actual handleArg0 handler after initialising the buffers.
__xray_set_handler(fdrLoggingHandleArg0);
__sanitizer::atomic_store(&LoggingStatus,
XRayLogInitStatus::XRAY_LOG_INITIALIZED,
__sanitizer::memory_order_release);
Report("XRay FDR init successful.\n");
return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
}
__sanitizer::SpinMutex FDROptionsMutex;
// Must finalize before flushing.
XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
@@ -108,7 +75,11 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
// (fixed-sized) and let the tools reading the buffers deal with the data
// afterwards.
//
int Fd = FDROptions->Fd;
int Fd = -1;
{
__sanitizer::SpinMutexLock Guard(&FDROptionsMutex);
Fd = FDROptions.Fd;
}
if (Fd == -1)
Fd = getLogFD();
if (Fd == -1) {
@@ -120,8 +91,8 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
// Test for required CPU features and cache the cycle frequency
static bool TSCSupported = probeRequiredCPUFeatures();
static uint64_t CycleFrequency = TSCSupported ? getTSCFrequency()
: __xray::NanosecondsPerSecond;
static uint64_t CycleFrequency =
TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond;
XRayFileHeader Header;
Header.Version = 1;
@@ -192,8 +163,8 @@ XRayLogInitStatus fdrLoggingReset() XRAY_NEVER_INSTRUMENT {
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
}
void fdrLoggingHandleArg0(int32_t FuncId,
XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
static std::tuple<uint64_t, unsigned char>
getTimestamp() XRAY_NEVER_INSTRUMENT {
// We want to get the TSC as early as possible, so that we can check whether
// we've seen this CPU before. We also do it before we load anything else, to
// allow for forward progress with the scheduling.
@@ -203,7 +174,7 @@ void fdrLoggingHandleArg0(int32_t FuncId,
// Test once for required CPU features
static bool TSCSupported = probeRequiredCPUFeatures();
if(TSCSupported) {
if (TSCSupported) {
TSC = __xray::readTSC(CPU);
} else {
// FIXME: This code needs refactoring as it appears in multiple locations
@@ -216,9 +187,102 @@ void fdrLoggingHandleArg0(int32_t FuncId,
CPU = 0;
TSC = TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec;
}
return std::make_tuple(TSC, CPU);
}
__xray_fdr_internal::processFunctionHook(FuncId, Entry, TSC, CPU,
clock_gettime, LoggingStatus, BQ);
void fdrLoggingHandleArg0(int32_t FuncId,
XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
auto TSC_CPU = getTimestamp();
__xray_fdr_internal::processFunctionHook(FuncId, Entry, std::get<0>(TSC_CPU),
std::get<1>(TSC_CPU), clock_gettime,
LoggingStatus, BQ);
}
void fdrLoggingHandleCustomEvent(void *Event,
std::size_t EventSize) XRAY_NEVER_INSTRUMENT {
using namespace __xray_fdr_internal;
auto TSC_CPU = getTimestamp();
auto &TSC = std::get<0>(TSC_CPU);
auto &CPU = std::get<1>(TSC_CPU);
thread_local bool Running = false;
RecursionGuard Guard{Running};
if (!Guard) {
assert(Running && "RecursionGuard is buggy!");
return;
}
if (EventSize > std::numeric_limits<int32_t>::max()) {
using Empty = struct {};
static Empty Once = [&] {
Report("Event size too large = %zu ; > max = %d\n", EventSize,
std::numeric_limits<int32_t>::max());
return Empty();
}();
(void)Once;
}
int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, clock_gettime))
return;
// Here we need to prepare the log to handle:
// - The metadata record we're going to write. (16 bytes)
// - The additional data we're going to write. Currently, that's the size of
// the event we're going to dump into the log as free-form bytes.
if (!prepareBuffer(clock_gettime, MetadataRecSize + EventSize)) {
LocalBQ = nullptr;
return;
}
// Write the custom event metadata record, which consists of the following
// information:
// - 8 bytes (64-bits) for the full TSC when the event started.
// - 4 bytes (32-bits) for the length of the data.
MetadataRecord CustomEvent;
CustomEvent.Type = uint8_t(RecordType::Metadata);
CustomEvent.RecordKind =
uint8_t(MetadataRecord::RecordKinds::CustomEventMarker);
constexpr auto TSCSize = sizeof(std::get<0>(TSC_CPU));
std::memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t));
std::memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
std::memcpy(RecordPtr, &CustomEvent, sizeof(CustomEvent));
RecordPtr += sizeof(CustomEvent);
std::memcpy(RecordPtr, Event, ReducedEventSize);
endBufferIfFull();
}
XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax,
void *Options,
size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
if (OptionsSize != sizeof(FDRLoggingOptions))
return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load(
&LoggingStatus, __sanitizer::memory_order_acquire));
s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
if (!__sanitizer::atomic_compare_exchange_strong(
&LoggingStatus, &CurrentStatus,
XRayLogInitStatus::XRAY_LOG_INITIALIZING,
__sanitizer::memory_order_release))
return static_cast<XRayLogInitStatus>(CurrentStatus);
{
__sanitizer::SpinMutexLock Guard(&FDROptionsMutex);
memcpy(&FDROptions, Options, OptionsSize);
}
bool Success = false;
BQ = std::make_shared<BufferQueue>(BufferSize, BufferMax, Success);
if (!Success) {
Report("BufferQueue init failed.\n");
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
}
// Install the actual handleArg0 handler after initialising the buffers.
__xray_set_handler(fdrLoggingHandleArg0);
__xray_set_customevent_handler(fdrLoggingHandleCustomEvent);
__sanitizer::atomic_store(&LoggingStatus,
XRayLogInitStatus::XRAY_LOG_INITIALIZED,
__sanitizer::memory_order_release);
Report("XRay FDR init successful.\n");
return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
}
} // namespace __xray

View File

@@ -37,6 +37,9 @@
namespace __xray {
__sanitizer::atomic_sint32_t LoggingStatus = {
XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
/// We expose some of the state transitions when FDR logging mode is operating
/// such that we can simulate a series of log events that may occur without
/// and test with determinism without worrying about the real CPU time.
@@ -123,12 +126,21 @@ thread_local uint8_t NumTailCalls = 0;
constexpr auto MetadataRecSize = sizeof(MetadataRecord);
constexpr auto FunctionRecSize = sizeof(FunctionRecord);
// We use a thread_local variable to keep track of which CPUs we've already
// run, and the TSC times for these CPUs. This allows us to stop repeating the
// CPU field in the function records.
//
// We assume that we'll support only 65536 CPUs for x86_64.
thread_local uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
thread_local uint64_t LastTSC = 0;
thread_local uint64_t LastFunctionEntryTSC = 0;
class ThreadExitBufferCleanup {
std::weak_ptr<BufferQueue> Buffers;
std::shared_ptr<BufferQueue> &Buffers;
BufferQueue::Buffer &Buffer;
public:
explicit ThreadExitBufferCleanup(std::weak_ptr<BufferQueue> BQ,
explicit ThreadExitBufferCleanup(std::shared_ptr<BufferQueue> &BQ,
BufferQueue::Buffer &Buffer)
XRAY_NEVER_INSTRUMENT : Buffers(BQ),
Buffer(Buffer) {}
@@ -142,17 +154,24 @@ public:
// the queue.
assert((RecordPtr + MetadataRecSize) - static_cast<char *>(Buffer.Buffer) >=
static_cast<ptrdiff_t>(MetadataRecSize));
if (auto BQ = Buffers.lock()) {
if (Buffers) {
writeEOBMetadata();
auto EC = BQ->releaseBuffer(Buffer);
auto EC = Buffers->releaseBuffer(Buffer);
if (EC != BufferQueue::ErrorCode::Ok)
Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer,
BufferQueue::getErrorString(EC));
Buffers = nullptr;
return;
}
}
};
// Make sure a thread that's ever called handleArg0 has a thread-local
// live reference to the buffer queue for this particular instance of
// FDRLogging, and that we're going to clean it up when the thread exits.
thread_local std::shared_ptr<BufferQueue> LocalBQ = nullptr;
thread_local ThreadExitBufferCleanup Cleanup(LocalBQ, Buffer);
class RecursionGuard {
bool &Running;
const bool Valid;
@@ -176,7 +195,7 @@ public:
}
};
static inline bool loggingInitialized(
inline bool loggingInitialized(
const __sanitizer::atomic_sint32_t &LoggingStatus) XRAY_NEVER_INSTRUMENT {
return __sanitizer::atomic_load(&LoggingStatus,
__sanitizer::memory_order_acquire) ==
@@ -185,7 +204,7 @@ static inline bool loggingInitialized(
} // namespace
static inline void writeNewBufferPreamble(pid_t Tid, timespec TS,
inline void writeNewBufferPreamble(pid_t Tid, timespec TS,
char *&MemPtr) XRAY_NEVER_INSTRUMENT {
static constexpr int InitRecordsCount = 2;
std::aligned_storage<sizeof(MetadataRecord)>::type Records[InitRecordsCount];
@@ -222,9 +241,8 @@ static inline void writeNewBufferPreamble(pid_t Tid, timespec TS,
NumTailCalls = 0;
}
static inline void setupNewBuffer(int (*wall_clock_reader)(clockid_t,
struct timespec *))
XRAY_NEVER_INSTRUMENT {
inline void setupNewBuffer(int (*wall_clock_reader)(
clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT {
RecordPtr = static_cast<char *>(Buffer.Buffer);
pid_t Tid = syscall(SYS_gettid);
timespec TS{0, 0};
@@ -235,7 +253,7 @@ static inline void setupNewBuffer(int (*wall_clock_reader)(clockid_t,
NumTailCalls = 0;
}
static inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC,
inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC,
char *&MemPtr) XRAY_NEVER_INSTRUMENT {
MetadataRecord NewCPUId;
NewCPUId.Type = uint8_t(RecordType::Metadata);
@@ -253,12 +271,12 @@ static inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC,
NumTailCalls = 0;
}
static inline void writeNewCPUIdMetadata(uint16_t CPU,
inline void writeNewCPUIdMetadata(uint16_t CPU,
uint64_t TSC) XRAY_NEVER_INSTRUMENT {
writeNewCPUIdMetadata(CPU, TSC, RecordPtr);
}
static inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT {
inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT {
MetadataRecord EOBMeta;
EOBMeta.Type = uint8_t(RecordType::Metadata);
EOBMeta.RecordKind = uint8_t(MetadataRecord::RecordKinds::EndOfBuffer);
@@ -269,11 +287,11 @@ static inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT {
NumTailCalls = 0;
}
static inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT {
inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT {
writeEOBMetadata(RecordPtr);
}
static inline void writeTSCWrapMetadata(uint64_t TSC,
inline void writeTSCWrapMetadata(uint64_t TSC,
char *&MemPtr) XRAY_NEVER_INSTRUMENT {
MetadataRecord TSCWrap;
TSCWrap.Type = uint8_t(RecordType::Metadata);
@@ -289,11 +307,11 @@ static inline void writeTSCWrapMetadata(uint64_t TSC,
NumTailCalls = 0;
}
static inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
writeTSCWrapMetadata(TSC, RecordPtr);
}
static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
XRayEntryType EntryType,
char *&MemPtr) XRAY_NEVER_INSTRUMENT {
std::aligned_storage<sizeof(FunctionRecord), alignof(FunctionRecord)>::type
@@ -339,6 +357,17 @@ static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
FuncRecord.RecordKind =
uint8_t(FunctionRecord::RecordKinds::FunctionTailExit);
break;
case XRayEntryType::CUSTOM_EVENT: {
// This is a bug in patching, so we'll report it once and move on.
static bool Once = [&] {
Report("Internal error: patched an XRay custom event call as a function; "
"func id = %d\n",
FuncId);
return true;
}();
(void)Once;
return;
}
}
std::memcpy(MemPtr, &AlignedFuncRecordBuffer, sizeof(FunctionRecord));
@@ -346,8 +375,9 @@ static inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
}
static uint64_t thresholdTicks() {
static uint64_t TicksPerSec = probeRequiredCPUFeatures() ? getTSCFrequency() :
__xray::NanosecondsPerSecond;
static uint64_t TicksPerSec = probeRequiredCPUFeatures()
? getTSCFrequency()
: __xray::NanosecondsPerSecond;
static const uint64_t ThresholdTicks =
TicksPerSec * flags()->xray_fdr_log_func_duration_threshold_us / 1000000;
return ThresholdTicks;
@@ -397,9 +427,8 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
RewindingRecordPtr -= FunctionRecSize;
RewindingTSC -= ExpectedTailExit.TSCDelta;
AlignedFuncStorage FunctionEntryBuffer;
const auto &ExpectedFunctionEntry =
*reinterpret_cast<FunctionRecord *>(std::memcpy(
&FunctionEntryBuffer, RewindingRecordPtr, FunctionRecSize));
const auto &ExpectedFunctionEntry = *reinterpret_cast<FunctionRecord *>(
std::memcpy(&FunctionEntryBuffer, RewindingRecordPtr, FunctionRecSize));
assert(ExpectedFunctionEntry.RecordKind ==
uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
"Expected to find function entry when rewinding tail call.");
@@ -422,7 +451,7 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
}
}
static inline bool releaseThreadLocalBuffer(BufferQueue *BQ) {
inline bool releaseThreadLocalBuffer(BufferQueue *BQ) {
auto EC = BQ->releaseBuffer(Buffer);
if (EC != BufferQueue::ErrorCode::Ok) {
Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer,
@@ -432,11 +461,29 @@ static inline bool releaseThreadLocalBuffer(BufferQueue *BQ) {
return true;
}
static inline void processFunctionHook(
int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU,
int (*wall_clock_reader)(clockid_t, struct timespec *),
__sanitizer::atomic_sint32_t &LoggingStatus,
const std::shared_ptr<BufferQueue> &BQ) XRAY_NEVER_INSTRUMENT {
inline bool prepareBuffer(int (*wall_clock_reader)(clockid_t,
struct timespec *),
size_t MaxSize) XRAY_NEVER_INSTRUMENT {
char *BufferStart = static_cast<char *>(Buffer.Buffer);
if ((RecordPtr + MaxSize) > (BufferStart + Buffer.Size - MetadataRecSize)) {
writeEOBMetadata();
if (!releaseThreadLocalBuffer(LocalBQ.get()))
return false;
auto EC = LocalBQ->getBuffer(Buffer);
if (EC != BufferQueue::ErrorCode::Ok) {
Report("Failed to acquire a buffer; error=%s\n",
BufferQueue::getErrorString(EC));
return false;
}
setupNewBuffer(wall_clock_reader);
}
return true;
}
inline bool isLogInitializedAndReady(
std::shared_ptr<BufferQueue> &LocalBQ, uint64_t TSC, unsigned char CPU,
int (*wall_clock_reader)(clockid_t,
struct timespec *)) XRAY_NEVER_INSTRUMENT {
// Bail out right away if logging is not initialized yet.
// We should take the opportunity to release the buffer though.
auto Status = __sanitizer::atomic_load(&LoggingStatus,
@@ -446,44 +493,19 @@ static inline void processFunctionHook(
(Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) {
writeEOBMetadata();
if (!releaseThreadLocalBuffer(BQ.get()))
return;
if (!releaseThreadLocalBuffer(LocalBQ.get()))
return false;
RecordPtr = nullptr;
LocalBQ = nullptr;
return false;
}
return;
}
// We use a thread_local variable to keep track of which CPUs we've already
// run, and the TSC times for these CPUs. This allows us to stop repeating the
// CPU field in the function records.
//
// We assume that we'll support only 65536 CPUs for x86_64.
thread_local uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
thread_local uint64_t LastTSC = 0;
thread_local uint64_t LastFunctionEntryTSC = 0;
// Make sure a thread that's ever called handleArg0 has a thread-local
// live reference to the buffer queue for this particular instance of
// FDRLogging, and that we're going to clean it up when the thread exits.
thread_local auto LocalBQ = BQ;
thread_local ThreadExitBufferCleanup Cleanup(LocalBQ, Buffer);
// Prevent signal handler recursion, so in case we're already in a log writing
// mode and the signal handler comes in (and is also instrumented) then we
// don't want to be clobbering potentially partial writes already happening in
// the thread. We use a simple thread_local latch to only allow one on-going
// handleArg0 to happen at any given time.
thread_local bool Running = false;
RecursionGuard Guard{Running};
if (!Guard) {
assert(Running == true && "RecursionGuard is buggy!");
return;
return false;
}
if (!loggingInitialized(LoggingStatus) || LocalBQ->finalizing()) {
writeEOBMetadata();
if (!releaseThreadLocalBuffer(BQ.get()))
return;
if (!releaseThreadLocalBuffer(LocalBQ.get()))
return false;
RecordPtr = nullptr;
}
@@ -496,19 +518,57 @@ static inline void processFunctionHook(
LS != XRayLogInitStatus::XRAY_LOG_FINALIZED)
Report("Failed to acquire a buffer; error=%s\n",
BufferQueue::getErrorString(EC));
return;
return false;
}
setupNewBuffer(wall_clock_reader);
}
if (CurrentCPU == std::numeric_limits<uint16_t>::max()) {
// This means this is the first CPU this thread has ever run on. We set the
// current CPU and record this as the first TSC we've seen.
// This means this is the first CPU this thread has ever run on. We set
// the current CPU and record this as the first TSC we've seen.
CurrentCPU = CPU;
writeNewCPUIdMetadata(CPU, TSC);
}
return true;
} // namespace __xray_fdr_internal
inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT {
auto BufferStart = static_cast<char *>(Buffer.Buffer);
if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) {
writeEOBMetadata();
if (!releaseThreadLocalBuffer(LocalBQ.get()))
return;
RecordPtr = nullptr;
}
}
inline void processFunctionHook(
int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU,
int (*wall_clock_reader)(clockid_t, struct timespec *),
__sanitizer::atomic_sint32_t &LoggingStatus,
const std::shared_ptr<BufferQueue> &BQ) XRAY_NEVER_INSTRUMENT {
// Prevent signal handler recursion, so in case we're already in a log writing
// mode and the signal handler comes in (and is also instrumented) then we
// don't want to be clobbering potentially partial writes already happening in
// the thread. We use a simple thread_local latch to only allow one on-going
// handleArg0 to happen at any given time.
thread_local bool Running = false;
RecursionGuard Guard{Running};
if (!Guard) {
assert(Running == true && "RecursionGuard is buggy!");
return;
}
// In case the reference has been cleaned up before, we make sure we
// initialize it to the provided BufferQueue.
if (LocalBQ == nullptr)
LocalBQ = BQ;
if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, wall_clock_reader))
return;
// Before we go setting up writing new function entries, we need to be really
// careful about the pointer math we're doing. This means we need to ensure
// that the record we are about to write is going to fit into the buffer,
@@ -545,24 +605,14 @@ static inline void processFunctionHook(
// bytes in the end of the buffer, we need to write out the EOB, get a new
// Buffer, set it up properly before doing any further writing.
//
char *BufferStart = static_cast<char *>(Buffer.Buffer);
if ((RecordPtr + (MetadataRecSize + FunctionRecSize)) - BufferStart <
static_cast<ptrdiff_t>(MetadataRecSize)) {
writeEOBMetadata();
if (!releaseThreadLocalBuffer(LocalBQ.get()))
return;
auto EC = LocalBQ->getBuffer(Buffer);
if (EC != BufferQueue::ErrorCode::Ok) {
Report("Failed to acquire a buffer; error=%s\n",
BufferQueue::getErrorString(EC));
return;
}
setupNewBuffer(wall_clock_reader);
if (!prepareBuffer(wall_clock_reader, FunctionRecSize + MetadataRecSize)) {
LocalBQ = nullptr;
return;
}
// By this point, we are now ready to write at most 24 bytes (one metadata
// record and one function record).
BufferStart = static_cast<char *>(Buffer.Buffer);
auto BufferStart = static_cast<char *>(Buffer.Buffer);
assert((RecordPtr + (MetadataRecSize + FunctionRecSize)) - BufferStart >=
static_cast<ptrdiff_t>(MetadataRecSize) &&
"Misconfigured BufferQueue provided; Buffer size not large enough.");
@@ -586,7 +636,6 @@ static inline void processFunctionHook(
// FunctionRecord. In this case we write down just a FunctionRecord with
// the correct TSC delta.
//
uint32_t RecordTSCDelta = 0;
if (CPU != CurrentCPU) {
// We've moved to a new CPU.
@@ -619,21 +668,27 @@ static inline void processFunctionHook(
break;
rewindRecentCall(TSC, LastTSC, LastFunctionEntryTSC, FuncId);
return; // without writing log.
case XRayEntryType::CUSTOM_EVENT: {
// This is a bug in patching, so we'll report it once and move on.
static bool Once = [&] {
Report("Internal error: patched an XRay custom event call as a function; "
"func id = %d",
FuncId);
return true;
}();
(void)Once;
return;
}
}
writeFunctionRecord(FuncId, RecordTSCDelta, Entry, RecordPtr);
// If we've exhausted the buffer by this time, we then release the buffer to
// make sure that other threads may start using this buffer.
if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) {
writeEOBMetadata();
if (!releaseThreadLocalBuffer(LocalBQ.get()))
return;
RecordPtr = nullptr;
}
endBufferIfFull();
}
} // namespace __xray_fdr_internal
} // namespace __xray
#endif // XRAY_XRAY_FDR_LOGGING_IMPL_H

View File

@@ -50,6 +50,9 @@ __sanitizer::atomic_uintptr_t XRayPatchedFunction{0};
// This is the function to call from the arg1-enabled sleds/trampolines.
__sanitizer::atomic_uintptr_t XRayArgLogger{0};
// This is the function to call when we encounter a custom event log call.
__sanitizer::atomic_uintptr_t XRayPatchedCustomEvent{0};
// MProtectHelper is an RAII wrapper for calls to mprotect(...) that will undo
// any successful mprotect(...) changes. This is used to make a page writeable
// and executable, and upon destruction if it was successful in doing so returns
@@ -97,7 +100,19 @@ int __xray_set_handler(void (*entry)(int32_t,
__sanitizer::memory_order_acquire)) {
__sanitizer::atomic_store(&__xray::XRayPatchedFunction,
reinterpret_cast<uint64_t>(entry),
reinterpret_cast<uintptr_t>(entry),
__sanitizer::memory_order_release);
return 1;
}
return 0;
}
int __xray_set_customevent_handler(void (*entry)(void *, size_t))
XRAY_NEVER_INSTRUMENT {
if (__sanitizer::atomic_load(&XRayInitialized,
__sanitizer::memory_order_acquire)) {
__sanitizer::atomic_store(&__xray::XRayPatchedCustomEvent,
reinterpret_cast<uintptr_t>(entry),
__sanitizer::memory_order_release);
return 1;
}
@@ -161,6 +176,9 @@ inline bool patchSled(const XRaySledEntry &Sled, bool Enable,
case XRayEntryType::LOG_ARGS_ENTRY:
Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry);
break;
case XRayEntryType::CUSTOM_EVENT:
Success = patchCustomEvent(Enable, FuncId, Sled);
break;
default:
Report("Unsupported sled kind '%d' @%04x\n", Sled.Address, int(Sled.Kind));
return false;
@@ -301,6 +319,7 @@ int __xray_set_handler_arg1(void (*Handler)(int32_t, XRayEntryType, uint64_t)) {
__sanitizer::memory_order_release);
return 1;
}
int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); }
uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT {

View File

@@ -60,6 +60,7 @@ bool patchFunctionEntry(bool Enable, uint32_t FuncId,
bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled);
bool patchFunctionTailExit(bool Enable, uint32_t FuncId,
const XRaySledEntry &Sled);
bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled);
} // namespace __xray
@@ -70,6 +71,7 @@ extern void __xray_FunctionEntry();
extern void __xray_FunctionExit();
extern void __xray_FunctionTailExit();
extern void __xray_ArgLoggerEntry();
extern void __xray_CustomEvent();
}
#endif

View File

@@ -95,7 +95,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
// B #44
if (Enable) {
uint32_t LoTracingHookAddr = reinterpret_cast<int32_t>(TracingHook) & 0xffff;
uint32_t LoTracingHookAddr =
reinterpret_cast<int32_t>(TracingHook) & 0xffff;
uint32_t HiTracingHookAddr =
(reinterpret_cast<int32_t>(TracingHook) >> 16) & 0xffff;
uint32_t LoFunctionID = FuncId & 0xffff;
@@ -151,6 +152,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
}
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
// FIXME: Implement in mips?
return false;
}
} // namespace __xray
extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {

View File

@@ -93,7 +93,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
if (Enable) {
uint32_t LoTracingHookAddr =
reinterpret_cast<int64_t>(TracingHook) & 0xffff;
uint32_t HiTracingHookAddr = (reinterpret_cast<int64_t>(TracingHook) >> 16) & 0xffff;
uint32_t HiTracingHookAddr =
(reinterpret_cast<int64_t>(TracingHook) >> 16) & 0xffff;
uint32_t HigherTracingHookAddr =
(reinterpret_cast<int64_t>(TracingHook) >> 32) & 0xffff;
uint32_t HighestTracingHookAddr =
@@ -160,6 +161,11 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
}
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
// FIXME: Implement in mips64?
return false;
}
} // namespace __xray
extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {

View File

@@ -93,6 +93,12 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
// FIXME: Maybe implement this better?
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
// FIXME: Implement in powerpc64?
return false;
}
} // namespace __xray
extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {

View File

@@ -176,9 +176,15 @@ __xray_ArgLoggerEntry:
je .Larg1entryFail
.Larg1entryLog:
movq %rdi, %rdx // first argument will become the third
xorq %rsi, %rsi // XRayEntryType::ENTRY into the second
movl %r10d, %edi // 32-bit function ID becomes the first
// First argument will become the third
movq %rdi, %rdx
// XRayEntryType::ENTRY into the second
xorq %rsi, %rsi
// 32-bit function ID becomes the first
movl %r10d, %edi
callq *%rax
.Larg1entryFail:
@@ -189,4 +195,38 @@ __xray_ArgLoggerEntry:
.size __xray_ArgLoggerEntry, .Larg1entryEnd-__xray_ArgLoggerEntry
.cfi_endproc
//===----------------------------------------------------------------------===//
.global __xray_CustomEvent
.align 16, 0x90
.type __xray_CustomEvent,@function
__xray_CustomEvent:
.cfi_startproc
subq $16, %rsp
.cfi_def_cfa_offset 24
movq %rbp, 8(%rsp)
movq %rax, 0(%rsp)
// We take two arguments to this trampoline, which should be in rdi and rsi
// already. We also make sure that we stash %rax because we use that register
// to call the logging handler.
movq _ZN6__xray22XRayPatchedCustomEventE(%rip), %rax
testq %rax,%rax
je .LcustomEventCleanup
// At this point we know that rcx and rdx already has the data, so we just
// call the logging handler.
callq *%rax
.LcustomEventCleanup:
movq 0(%rsp), %rax
movq 8(%rsp), %rbp
addq $16, %rsp
.cfi_def_cfa_offset 8
retq
.Ltmp8:
.size __xray_CustomEvent, .Ltmp8-__xray_CustomEvent
.cfi_endproc
NO_EXEC_STACK_DIRECTIVE

View File

@@ -75,8 +75,10 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
static constexpr uint8_t CallOpCode = 0xe8;
static constexpr uint16_t MovR10Seq = 0xba41;
static constexpr uint16_t Jmp9Seq = 0x09eb;
static constexpr uint16_t Jmp20Seq = 0x14eb;
static constexpr uint8_t JmpOpCode = 0xe9;
static constexpr uint8_t RetOpCode = 0xc3;
static constexpr uint16_t NopwSeq = 0x9066;
static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
@@ -201,6 +203,40 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
return true;
}
bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
// Here we do the dance of replacing the following sled:
//
// xray_sled_n:
// jmp +19 // 2 bytes
// ...
//
// With the following:
//
// nopw // 2 bytes*
// ...
//
// We need to do this in the following order:
//
// 1. Overwrite the 5-byte nop with the call (relative), where (relative) is
// the relative offset to the __xray_CustomEvent trampoline.
// 2. Do a two-byte atomic write over the 'jmp +24' to turn it into a 'nopw'.
// This allows us to "enable" this code once the changes have committed.
//
// The "unpatch" should just turn the 'nopw' back to a 'jmp +24'.
//
if (Enable) {
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), NopwSeq,
std::memory_order_release);
} else {
std::atomic_store_explicit(
reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp20Seq,
std::memory_order_release);
}
return false;
}
// We determine whether the CPU we're running on has the correct features we
// need. In x86_64 this will be rdtscp support.
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {

View File

@@ -0,0 +1,38 @@
// Use the clang feature for custom xray event logging.
//
// RUN: %clangxx_xray -std=c++11 %s -o %t
// RUN: XRAY_OPTIONS="patch_premain=false verbosity=1 xray_naive_log=false xray_logfile_base=custom-event-logging.xray-" %run %t 2>&1 | FileCheck %s
//
#include <cstdio>
#include "xray/xray_interface.h"
[[clang::xray_always_instrument]] void foo() {
static constexpr char CustomLogged[] = "hello custom logging!";
printf("before calling the custom logging...\n");
__xray_customevent(CustomLogged, sizeof(CustomLogged));
printf("after calling the custom logging...\n");
}
void myprinter(void* ptr, size_t size) {
printf("%.*s\n", static_cast<int>(size), static_cast<const char*>(ptr));
}
int main() {
foo();
// CHECK: before calling the custom logging...
// CHECK-NEXT: after calling the custom logging...
printf("setting up custom event handler...\n");
// CHECK-NEXT: setting up custom event handler...
__xray_set_customevent_handler(myprinter);
__xray_patch();
// CHECK-NEXT: before calling the custom logging...
foo();
// CHECK-NEXT: hello custom logging!
// CHECK-NEXT: after calling the custom logging...
printf("removing custom event handler...\n");
// CHECK-NEXT: removing custom event handler...
__xray_set_customevent_handler(nullptr);
foo();
// CHECK-NEXT: before calling the custom logging...
// CHECK-NEXT: after calling the custom logging...
}