llvm/lldb/source/Breakpoint/WatchpointAlgorithms.cpp
[lldb] Add support for large watchpoints in lldb (#79962)

This patch is the next piece of work in my Large Watchpoint proposal,
https://discourse.llvm.org/t/rfc-large-watchpoint-support-in-lldb/72116

This patch breaks a user's watchpoint into one or more WatchpointResources which reflect what the hardware registers can cover. This means we can watch objects larger than 8 bytes, and we can watch unaligned address ranges. On a typical 64-bit target with 4 watchpoint registers you can watch 32 bytes of memory if the start address is doubleword aligned. Additionally, if the remote stub implements AArch64 MASK style watchpoints (e.g. debugserver on Darwin), we can watch any power-of-2 size region of memory up to 2GB, aligned to that same size.

I updated the Watchpoint constructor and CommandObjectWatchpoint to create a CompilerType of Array<UInt8> when the size of the watched region is greater than pointer-size and we don't have a variable type to use. For pointer-size and smaller, we can display the watched granule as an integer value; for larger-than-pointer-size we will display it as an array of bytes. I have `watchpoint list` now print the WatchpointResources used to implement the watchpoint.

I added a WatchpointAlgorithm class which has a top-level static method that takes an enum flag mask WatchpointHardwareFeature and a user address and size, and returns a vector of WatchpointResources covering the request. It does not take into account the number of watchpoint registers the target has, or the number still available for use. Right now there is only one algorithm, which monitors power-of-2 regions of memory. For up to pointer-size, this is what Intel hardware supports. AArch64 Byte Address Select watchpoints can watch any number of contiguous bytes in a pointer-size memory granule; that is not currently supported, so if you ask to watch bytes 3-5, the algorithm will watch the entire doubleword (8 bytes). The newly default "modify" style means we will silently ignore modifications to bytes outside the watched range.

I've temporarily skipped TestLargeWatchpoint.py for all targets. It was only run on Darwin when using the in-tree debugserver, which was a proxy for "debugserver supports MASK watchpoints". I'll be adding the aforementioned feature flag from the stub, enabling full mask watchpoints when a debugserver with that feature is present, and re-enabling this test. I added a new TestUnalignedLargeWatchpoint.py which only has one test, but it's a great one: watching a 22-byte range that is unaligned and requires four 8-byte watchpoints to cover.

I also added a unit test, WatchpointAlgorithmsTests, which has a number of simple tests against WatchpointAlgorithms::PowerOf2Watchpoints. I think there are interesting possible alternative approaches to how we cover these requests; I note in the unit test that a user requesting a watch on address 0x12e0 of 120 bytes will be covered by two watchpoints today, a 128-byte watchpoint at 0x1280 and another at 0x1300. But it could be done with a 16-byte watchpoint at 0x12e0 and a 128-byte watchpoint at 0x1300, which would have fewer false positives/private stops. As we try refining this, it's helpful to have a collection of tests to make sure things don't regress.

I tested this on arm64 macOS, (genuine) x86_64 macOS, and AArch64 Ubuntu. I have not modified the Windows process plugins yet; I might try that as a standalone patch. I'd be making the change blind, but the necessary changes (see ProcessGDBRemote::EnableWatchpoint) are pretty small, so it might be obvious enough that I can change it and see what the Windows CI thinks.

There isn't yet a packet (or a qSupported feature query) for the gdb remote serial protocol stub to communicate its watchpoint capabilities to lldb. I'll be doing that in a patch right after this lands, having debugserver advertise its capability of AArch64 MASK watchpoints, and having ProcessGDBRemote add eWatchpointHardwareArmMASK to WatchpointAlgorithms so we can watch larger than 32-byte requests on Darwin.

I haven't yet tackled WatchpointResource *sharing* by multiple Watchpoints. This is all part of the goal: especially when we may be watching a larger memory range than the user requested, if they then add another watchpoint next to their first request, it may be covered by the same WatchpointResource (hardware watchpoint register). Also, one "read" watchpoint and one "write" watchpoint on the same memory granule need to be handled, making the WatchpointResource cover all requests. As WatchpointResources aren't shared among multiple Watchpoints yet, there's no handling of running the conditions/commands/etc of multiple Watchpoints when their shared WatchpointResource is hit.

The goal beyond "large watchpoint" is to unify (much more) the Watchpoint and Breakpoint behavior and commands. I have a feeling I may be slowly chipping away at this for a while.

Re-landing this patch after fixing two undefined behaviors in WatchpointAlgorithms found by UBSan and by failures on different CI bots.

rdar://108234227

2024-01-31 21:01:59 -08:00
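
As a quick illustration of the entry point this file implements, here is a hedged sketch of how a caller might atomize a user's request; the variable names user_addr, user_size, and process_sp are assumed for the example, not taken from lldb's actual call sites:

    // Illustrative sketch only, not lldb's real call site.
    ArchSpec arch = process_sp->GetTarget().GetArchitecture();
    std::vector<WatchpointResourceSP> resources =
        WatchpointAlgorithms::AtomizeWatchpointRequest(
            user_addr, user_size, /*read=*/false, /*write=*/true,
            eWatchpointHardwareArmMASK, arch);
    // Each resulting WatchpointResource is sized and aligned so that a
    // single hardware watchpoint register can cover it.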
//===-- WatchpointAlgorithms.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "lldb/Breakpoint/WatchpointAlgorithms.h"
#include "lldb/Breakpoint/WatchpointResource.h"
#include "lldb/Target/Process.h"
#include "lldb/Utility/ArchSpec.h"
#include "lldb/Utility/LLDBLog.h"
#include "lldb/Utility/Log.h"
#include <algorithm>
#include <utility>
#include <vector>
using namespace lldb;
using namespace lldb_private;

std::vector<WatchpointResourceSP>
WatchpointAlgorithms::AtomizeWatchpointRequest(
    addr_t addr, size_t size, bool read, bool write,
    WatchpointHardwareFeature supported_features, ArchSpec &arch) {
  std::vector<Region> entries;

  if (supported_features & eWatchpointHardwareArmMASK) {
    entries =
        PowerOf2Watchpoints(addr, size,
                            /*min_byte_size*/ 1,
                            /*max_byte_size*/ INT32_MAX,
                            /*address_byte_size*/ arch.GetAddressByteSize());
  } else {
    // As a fallback, assume we can watch any power-of-2 number of bytes,
    // up through the size of an address in the target.
    entries =
        PowerOf2Watchpoints(addr, size,
                            /*min_byte_size*/ 1,
                            /*max_byte_size*/ arch.GetAddressByteSize(),
                            /*address_byte_size*/ arch.GetAddressByteSize());
  }
  Log *log = GetLog(LLDBLog::Watchpoints);
  LLDB_LOGV(log, "AtomizeWatchpointRequest user request addr {0:x} size {1}",
            addr, size);

  std::vector<WatchpointResourceSP> resources;
  for (Region &ent : entries) {
    LLDB_LOGV(log, "AtomizeWatchpointRequest creating resource {0:x} size {1}",
              ent.addr, ent.size);
    WatchpointResourceSP wp_res_sp =
        std::make_shared<WatchpointResource>(ent.addr, ent.size, read, write);
    resources.push_back(wp_res_sp);
  }
  return resources;
}
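
// Example (illustrative, exercising the fallback path above): on a 64-bit
// target without MASK support, a request to watch 22 bytes at unaligned
// address 0x1003 is atomized into four 8-byte WatchpointResources at
// 0x1000, 0x1008, 0x1010, and 0x1018, which together cover bytes
// 0x1003-0x1018 inclusive. This is the case TestUnalignedLargeWatchpoint.py
// (mentioned in the commit message) exercises.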
// This should be `std::bit_ceil(aligned_size)` but
// that requires C++20.
// Calculates the smallest integral power of two that is not smaller
// than input.
static uint64_t bit_ceil(uint64_t input) {
  if (input <= 1 || llvm::popcount(input) == 1)
    return input;
  return 1ULL << (64 - llvm::countl_zero(input));
}
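
// Illustrative values (not part of the original file):
//   bit_ceil(3) == 4, bit_ceil(8) == 8, bit_ceil(9) == 16,
//   bit_ceil(120) == 128.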
/// Convert a user's watchpoint request (\a user_addr and \a user_size)
/// into hardware watchpoints, for a target that can watch a power-of-2
/// region of memory (1, 2, 4, 8, etc), aligned to that same power-of-2
/// memory address.
///
/// If a user asks to watch 4 bytes at address 0x1002 (0x1002-0x1005
/// inclusive) we can implement this with two 2-byte watchpoints
/// (0x1002 and 0x1004) or with an 8-byte watchpoint at 0x1000.
/// A 4-byte watchpoint at 0x1002 would not be properly 4-byte aligned.
///
/// If a user asks to watch 16 bytes at 0x1000, and this target supports
/// 8-byte watchpoints, we can implement this with two 8-byte watchpoints
/// at 0x1000 and 0x1008.
std::vector<WatchpointAlgorithms::Region>
WatchpointAlgorithms::PowerOf2Watchpoints(addr_t user_addr, size_t user_size,
                                          size_t min_byte_size,
                                          size_t max_byte_size,
                                          uint32_t address_byte_size) {
  Log *log = GetLog(LLDBLog::Watchpoints);
  LLDB_LOGV(log,
            "PowerOf2Watchpoints user request addr {0:x} size {1} "
            "min_byte_size {2}, max_byte_size {3}, address_byte_size {4}",
            user_addr, user_size, min_byte_size, max_byte_size,
            address_byte_size);

  // Can't watch zero bytes.
  if (user_size == 0)
    return {};

  size_t aligned_size = std::max(user_size, min_byte_size);
  /// Round up \a aligned_size to the next power-of-2 size:
  ///   user_size == 8 -> aligned_size == 8
  ///   user_size == 9 -> aligned_size == 16
  aligned_size = bit_ceil(aligned_size);

  addr_t aligned_start = user_addr & ~(aligned_size - 1);
  // Does this power-of-2 memory range, aligned to a power-of-2 boundary
  // that the hardware can watch, completely cover the requested region?
  if (aligned_size <= max_byte_size &&
      aligned_start + aligned_size >= user_addr + user_size)
    return {{aligned_start, aligned_size}};
  // If the maximum region we can watch is larger than the aligned
  // size, try increasing the region size by one power of 2 and see
  // if aligning to that amount can cover the requested region.
  //
  // Increasing the aligned_size repeatedly instead of splitting the
  // watchpoint can result in us watching large regions of memory
  // unintentionally when we could use a few small watchpoints instead. e.g.
  //   user_addr 0x3ff8 user_size 32
  // can be watched with four 8-byte watchpoints, but if it's done with one
  // MASK watchpoint, it would need to be a 32KB watchpoint (a 16KB
  // watchpoint at 0x0 only covers 0x0000-0x4000). A user request
  // at the end of a power-of-2 region can lead to these undesirably
  // large watchpoints and many false positive hits to ignore.
  if (max_byte_size >= (aligned_size << 1)) {
    aligned_size <<= 1;
    aligned_start = user_addr & ~(aligned_size - 1);
    if (aligned_size <= max_byte_size &&
        aligned_start + aligned_size >= user_addr + user_size)
      return {{aligned_start, aligned_size}};
    // Go back to our original aligned size, to try the multiple
    // watchpoint approach.
    aligned_size >>= 1;
  }
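
  // Worked example of the doubling path above (illustrative): user_addr
  // 0x1004 with user_size 8 rounds to aligned_size 8, but the 8-byte
  // granule at 0x1000 misses bytes 0x1008-0x100b; doubling to a single
  // 16-byte watchpoint at 0x1000 covers the entire request with one
  // hardware resource.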
  // We need to split the user's watchpoint into two or more watchpoints
  // that can be monitored by hardware, because of alignment and/or size
  // reasons.
  aligned_size = std::min(aligned_size, max_byte_size);
  aligned_start = user_addr & ~(aligned_size - 1);

  std::vector<Region> result;
  addr_t current_address = aligned_start;
  const addr_t user_end_address = user_addr + user_size;
  while (current_address + aligned_size < user_end_address) {
    result.push_back({current_address, aligned_size});
    current_address += aligned_size;
  }
  if (current_address < user_end_address)
    result.push_back({current_address, aligned_size});

  return result;
}
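
// Worked example (illustrative), matching the case called out in the commit
// message's WatchpointAlgorithmsTests notes: a request for user_addr 0x12e0,
// user_size 120 rounds to aligned_size 128. Neither a 128-byte granule at
// 0x1280 nor a 256-byte granule at 0x1200 reaches the end of the request at
// 0x1358, so the split path returns two 128-byte Regions, one at 0x1280 and
// one at 0x1300.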