Path: blob/main/contrib/llvm-project/lldb/source/DataFormatters/StringPrinter.cpp
39587 views
//===-- StringPrinter.cpp -------------------------------------------------===//1//2// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.3// See https://llvm.org/LICENSE.txt for license information.4// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception5//6//===----------------------------------------------------------------------===//78#include "lldb/DataFormatters/StringPrinter.h"910#include "lldb/Core/Debugger.h"11#include "lldb/Core/ValueObject.h"12#include "lldb/Target/Language.h"13#include "lldb/Target/Process.h"14#include "lldb/Target/Target.h"15#include "lldb/Utility/Status.h"1617#include "llvm/ADT/StringExtras.h"18#include "llvm/Support/ConvertUTF.h"1920#include <cctype>21#include <locale>22#include <memory>2324using namespace lldb;25using namespace lldb_private;26using namespace lldb_private::formatters;27using GetPrintableElementType = StringPrinter::GetPrintableElementType;28using StringElementType = StringPrinter::StringElementType;2930/// DecodedCharBuffer stores the decoded contents of a single character. It31/// avoids managing memory on the heap by copying decoded bytes into an in-line32/// buffer.33class DecodedCharBuffer {34public:35DecodedCharBuffer(std::nullptr_t) {}3637DecodedCharBuffer(const uint8_t *bytes, size_t size) : m_size(size) {38if (size > MaxLength)39llvm_unreachable("unsupported length");40memcpy(m_data, bytes, size);41}4243DecodedCharBuffer(const char *bytes, size_t size)44: DecodedCharBuffer(reinterpret_cast<const uint8_t *>(bytes), size) {}4546const uint8_t *GetBytes() const { return m_data; }4748size_t GetSize() const { return m_size; }4950private:51static constexpr unsigned MaxLength = 16;5253size_t m_size = 0;54uint8_t m_data[MaxLength] = {0};55};5657using EscapingHelper =58std::function<DecodedCharBuffer(uint8_t *, uint8_t *, uint8_t *&)>;5960// we define this for all values of type but only implement it for those we61// care about that's good because we get linker errors for any unsupported type62template <StringElementType type>63static DecodedCharBuffer64GetPrintableImpl(uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next,65StringPrinter::EscapeStyle escape_style);6667// Mimic isprint() for Unicode codepoints.68static bool isprint32(char32_t codepoint) {69if (codepoint <= 0x1F || codepoint == 0x7F) // C070{71return false;72}73if (codepoint >= 0x80 && codepoint <= 0x9F) // C174{75return false;76}77if (codepoint == 0x2028 || codepoint == 0x2029) // line/paragraph separators78{79return false;80}81if (codepoint == 0x200E || codepoint == 0x200F ||82(codepoint >= 0x202A &&83codepoint <= 0x202E)) // bidirectional text control84{85return false;86}87if (codepoint >= 0xFFF9 &&88codepoint <= 0xFFFF) // interlinears and generally specials89{90return false;91}92return true;93}9495DecodedCharBuffer attemptASCIIEscape(llvm::UTF32 c,96StringPrinter::EscapeStyle escape_style) {97const bool is_swift_escape_style =98escape_style == StringPrinter::EscapeStyle::Swift;99switch (c) {100case 0:101return {"\\0", 2};102case '\a':103return {"\\a", 2};104case '\b':105if (is_swift_escape_style)106return nullptr;107return {"\\b", 2};108case '\f':109if (is_swift_escape_style)110return nullptr;111return {"\\f", 2};112case '\n':113return {"\\n", 2};114case '\r':115return {"\\r", 2};116case '\t':117return {"\\t", 2};118case '\v':119if (is_swift_escape_style)120return nullptr;121return {"\\v", 2};122case '\"':123return {"\\\"", 2};124case '\'':125if (is_swift_escape_style)126return {"\\'", 2};127return nullptr;128case '\\':129return {"\\\\", 2};130}131return nullptr;132}133134template <>135DecodedCharBuffer GetPrintableImpl<StringElementType::ASCII>(136uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next,137StringPrinter::EscapeStyle escape_style) {138// The ASCII helper always advances 1 byte at a time.139next = buffer + 1;140141DecodedCharBuffer retval = attemptASCIIEscape(*buffer, escape_style);142if (retval.GetSize())143return retval;144145// Use llvm's locale-independent isPrint(char), instead of the libc146// implementation which may give different results on different platforms.147if (llvm::isPrint(*buffer))148return {buffer, 1};149150unsigned escaped_len;151constexpr unsigned max_buffer_size = 7;152uint8_t data[max_buffer_size];153switch (escape_style) {154case StringPrinter::EscapeStyle::CXX:155// Prints 4 characters, then a \0 terminator.156escaped_len = snprintf((char *)data, max_buffer_size, "\\x%02x", *buffer);157break;158case StringPrinter::EscapeStyle::Swift:159// Prints up to 6 characters, then a \0 terminator.160escaped_len = snprintf((char *)data, max_buffer_size, "\\u{%x}", *buffer);161break;162}163lldbassert(escaped_len > 0 && "unknown string escape style");164return {data, escaped_len};165}166167template <>168DecodedCharBuffer GetPrintableImpl<StringElementType::UTF8>(169uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next,170StringPrinter::EscapeStyle escape_style) {171// If the utf8 encoded length is invalid (i.e., not in the closed interval172// [1;4]), or if there aren't enough bytes to print, or if the subsequence173// isn't valid utf8, fall back to printing an ASCII-escaped subsequence.174if (!llvm::isLegalUTF8Sequence(buffer, buffer_end))175return GetPrintableImpl<StringElementType::ASCII>(buffer, buffer_end, next,176escape_style);177178// Convert the valid utf8 sequence to a utf32 codepoint. This cannot fail.179llvm::UTF32 codepoint = 0;180const llvm::UTF8 *buffer_for_conversion = buffer;181llvm::ConversionResult result = llvm::convertUTF8Sequence(182&buffer_for_conversion, buffer_end, &codepoint, llvm::strictConversion);183assert(result == llvm::conversionOK &&184"Failed to convert legal utf8 sequence");185UNUSED_IF_ASSERT_DISABLED(result);186187// The UTF8 helper always advances by the utf8 encoded length.188const unsigned utf8_encoded_len = buffer_for_conversion - buffer;189next = buffer + utf8_encoded_len;190191DecodedCharBuffer retval = attemptASCIIEscape(codepoint, escape_style);192if (retval.GetSize())193return retval;194if (isprint32(codepoint))195return {buffer, utf8_encoded_len};196197unsigned escaped_len;198constexpr unsigned max_buffer_size = 13;199uint8_t data[max_buffer_size];200switch (escape_style) {201case StringPrinter::EscapeStyle::CXX:202// Prints 10 characters, then a \0 terminator.203escaped_len = snprintf((char *)data, max_buffer_size, "\\U%08x", codepoint);204break;205case StringPrinter::EscapeStyle::Swift:206// Prints up to 12 characters, then a \0 terminator.207escaped_len = snprintf((char *)data, max_buffer_size, "\\u{%x}", codepoint);208break;209}210lldbassert(escaped_len > 0 && "unknown string escape style");211return {data, escaped_len};212}213214// Given a sequence of bytes, this function returns: a sequence of bytes to215// actually print out + a length the following unscanned position of the buffer216// is in next217static DecodedCharBuffer GetPrintable(StringElementType type, uint8_t *buffer,218uint8_t *buffer_end, uint8_t *&next,219StringPrinter::EscapeStyle escape_style) {220if (!buffer || buffer >= buffer_end)221return {nullptr};222223switch (type) {224case StringElementType::ASCII:225return GetPrintableImpl<StringElementType::ASCII>(buffer, buffer_end, next,226escape_style);227case StringElementType::UTF8:228return GetPrintableImpl<StringElementType::UTF8>(buffer, buffer_end, next,229escape_style);230default:231return {nullptr};232}233}234235static EscapingHelper236GetDefaultEscapingHelper(GetPrintableElementType elem_type,237StringPrinter::EscapeStyle escape_style) {238switch (elem_type) {239case GetPrintableElementType::UTF8:240case GetPrintableElementType::ASCII:241return [escape_style, elem_type](uint8_t *buffer, uint8_t *buffer_end,242uint8_t *&next) -> DecodedCharBuffer {243return GetPrintable(elem_type == GetPrintableElementType::UTF8244? StringElementType::UTF8245: StringElementType::ASCII,246buffer, buffer_end, next, escape_style);247};248}249llvm_unreachable("bad element type");250}251252/// Read a string encoded in accordance with \tparam SourceDataType from a253/// host-side LLDB buffer, then pretty-print it to a stream using \p style.254template <typename SourceDataType>255static bool DumpEncodedBufferToStream(256GetPrintableElementType style,257llvm::ConversionResult (*ConvertFunction)(const SourceDataType **,258const SourceDataType *,259llvm::UTF8 **, llvm::UTF8 *,260llvm::ConversionFlags),261const StringPrinter::ReadBufferAndDumpToStreamOptions &dump_options) {262assert(dump_options.GetStream() && "need a Stream to print the string to");263Stream &stream(*dump_options.GetStream());264if (dump_options.GetPrefixToken() != nullptr)265stream.Printf("%s", dump_options.GetPrefixToken());266if (dump_options.GetQuote() != 0)267stream.Printf("%c", dump_options.GetQuote());268auto data(dump_options.GetData());269auto source_size(dump_options.GetSourceSize());270if (data.GetByteSize() && data.GetDataStart() && data.GetDataEnd()) {271const int bufferSPSize = data.GetByteSize();272if (dump_options.GetSourceSize() == 0) {273const int origin_encoding = 8 * sizeof(SourceDataType);274source_size = bufferSPSize / (origin_encoding / 4);275}276277const SourceDataType *data_ptr =278(const SourceDataType *)data.GetDataStart();279const SourceDataType *data_end_ptr = data_ptr + source_size;280281const bool zero_is_terminator = dump_options.GetBinaryZeroIsTerminator();282283if (zero_is_terminator) {284while (data_ptr < data_end_ptr) {285if (!*data_ptr) {286data_end_ptr = data_ptr;287break;288}289data_ptr++;290}291292data_ptr = (const SourceDataType *)data.GetDataStart();293}294295lldb::WritableDataBufferSP utf8_data_buffer_sp;296llvm::UTF8 *utf8_data_ptr = nullptr;297llvm::UTF8 *utf8_data_end_ptr = nullptr;298299if (ConvertFunction) {300utf8_data_buffer_sp =301std::make_shared<DataBufferHeap>(4 * bufferSPSize, 0);302utf8_data_ptr = (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes();303utf8_data_end_ptr = utf8_data_ptr + utf8_data_buffer_sp->GetByteSize();304ConvertFunction(&data_ptr, data_end_ptr, &utf8_data_ptr,305utf8_data_end_ptr, llvm::lenientConversion);306if (!zero_is_terminator)307utf8_data_end_ptr = utf8_data_ptr;308// needed because the ConvertFunction will change the value of the309// data_ptr.310utf8_data_ptr =311(llvm::UTF8 *)utf8_data_buffer_sp->GetBytes();312} else {313// just copy the pointers - the cast is necessary to make the compiler314// happy but this should only happen if we are reading UTF8 data315utf8_data_ptr = const_cast<llvm::UTF8 *>(316reinterpret_cast<const llvm::UTF8 *>(data_ptr));317utf8_data_end_ptr = const_cast<llvm::UTF8 *>(318reinterpret_cast<const llvm::UTF8 *>(data_end_ptr));319}320321const bool escape_non_printables = dump_options.GetEscapeNonPrintables();322EscapingHelper escaping_callback;323if (escape_non_printables)324escaping_callback =325GetDefaultEscapingHelper(style, dump_options.GetEscapeStyle());326327// since we tend to accept partial data (and even partially malformed data)328// we might end up with no NULL terminator before the end_ptr hence we need329// to take a slower route and ensure we stay within boundaries330for (; utf8_data_ptr < utf8_data_end_ptr;) {331if (zero_is_terminator && !*utf8_data_ptr)332break;333334if (escape_non_printables) {335uint8_t *next_data = nullptr;336auto printable =337escaping_callback(utf8_data_ptr, utf8_data_end_ptr, next_data);338auto printable_bytes = printable.GetBytes();339auto printable_size = printable.GetSize();340341// We failed to figure out how to print this string.342if (!printable_bytes || !next_data)343return false;344345for (unsigned c = 0; c < printable_size; c++)346stream.Printf("%c", *(printable_bytes + c));347utf8_data_ptr = (uint8_t *)next_data;348} else {349stream.Printf("%c", *utf8_data_ptr);350utf8_data_ptr++;351}352}353}354if (dump_options.GetQuote() != 0)355stream.Printf("%c", dump_options.GetQuote());356if (dump_options.GetSuffixToken() != nullptr)357stream.Printf("%s", dump_options.GetSuffixToken());358if (dump_options.GetIsTruncated())359stream.Printf("...");360return true;361}362363lldb_private::formatters::StringPrinter::ReadStringAndDumpToStreamOptions::364ReadStringAndDumpToStreamOptions(ValueObject &valobj)365: ReadStringAndDumpToStreamOptions() {366SetEscapeNonPrintables(367valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables());368}369370lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions::371ReadBufferAndDumpToStreamOptions(ValueObject &valobj)372: ReadBufferAndDumpToStreamOptions() {373SetEscapeNonPrintables(374valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables());375}376377lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions::378ReadBufferAndDumpToStreamOptions(379const ReadStringAndDumpToStreamOptions &options)380: ReadBufferAndDumpToStreamOptions() {381SetStream(options.GetStream());382SetPrefixToken(options.GetPrefixToken());383SetSuffixToken(options.GetSuffixToken());384SetQuote(options.GetQuote());385SetEscapeNonPrintables(options.GetEscapeNonPrintables());386SetBinaryZeroIsTerminator(options.GetBinaryZeroIsTerminator());387SetEscapeStyle(options.GetEscapeStyle());388}389390namespace lldb_private {391392namespace formatters {393394template <typename SourceDataType>395static bool ReadEncodedBufferAndDumpToStream(396StringElementType elem_type,397const StringPrinter::ReadStringAndDumpToStreamOptions &options,398llvm::ConversionResult (*ConvertFunction)(const SourceDataType **,399const SourceDataType *,400llvm::UTF8 **, llvm::UTF8 *,401llvm::ConversionFlags)) {402assert(options.GetStream() && "need a Stream to print the string to");403if (!options.GetStream())404return false;405406if (options.GetLocation() == 0 ||407options.GetLocation() == LLDB_INVALID_ADDRESS)408return false;409410lldb::TargetSP target_sp = options.GetTargetSP();411if (!target_sp)412return false;413414constexpr int type_width = sizeof(SourceDataType);415constexpr int origin_encoding = 8 * type_width;416if (origin_encoding != 8 && origin_encoding != 16 && origin_encoding != 32)417return false;418// If not UTF8 or ASCII, conversion to UTF8 is necessary.419if (origin_encoding != 8 && !ConvertFunction)420return false;421422bool needs_zero_terminator = options.GetNeedsZeroTermination();423424bool is_truncated = false;425const auto max_size = target_sp->GetMaximumSizeOfStringSummary();426427uint32_t sourceSize;428if (elem_type == StringElementType::ASCII && !options.GetSourceSize()) {429// FIXME: The NSString formatter sets HasSourceSize(true) when the size is430// actually unknown, as well as SetBinaryZeroIsTerminator(false). IIUC the431// C++ formatter also sets SetBinaryZeroIsTerminator(false) when it doesn't432// mean to. I don't see how this makes sense: we should fix the formatters.433//434// Until then, the behavior that's expected for ASCII strings with unknown435// lengths is to read up to the max size and then null-terminate. Do that.436sourceSize = max_size;437needs_zero_terminator = true;438} else if (options.HasSourceSize()) {439sourceSize = options.GetSourceSize();440if (!options.GetIgnoreMaxLength()) {441if (sourceSize > max_size) {442sourceSize = max_size;443is_truncated = true;444}445}446} else {447sourceSize = max_size;448needs_zero_terminator = true;449}450451const int bufferSPSize = sourceSize * type_width;452lldb::WritableDataBufferSP buffer_sp(new DataBufferHeap(bufferSPSize, 0));453454// Check if we got bytes. We never get any bytes if we have an empty455// string, but we still continue so that we end up actually printing456// an empty string ("").457if (sourceSize != 0 && !buffer_sp->GetBytes())458return false;459460Status error;461char *buffer = reinterpret_cast<char *>(buffer_sp->GetBytes());462463if (elem_type == StringElementType::ASCII)464target_sp->ReadCStringFromMemory(options.GetLocation(), buffer,465bufferSPSize, error);466else if (needs_zero_terminator)467target_sp->ReadStringFromMemory(options.GetLocation(), buffer,468bufferSPSize, error, type_width);469else470target_sp->ReadMemory(options.GetLocation(), buffer, bufferSPSize, error);471if (error.Fail()) {472options.GetStream()->Printf("unable to read data");473return true;474}475476StringPrinter::ReadBufferAndDumpToStreamOptions dump_options(options);477dump_options.SetData(478DataExtractor(buffer_sp, target_sp->GetArchitecture().GetByteOrder(),479target_sp->GetArchitecture().GetAddressByteSize()));480dump_options.SetSourceSize(sourceSize);481dump_options.SetIsTruncated(is_truncated);482dump_options.SetNeedsZeroTermination(needs_zero_terminator);483if (needs_zero_terminator)484dump_options.SetBinaryZeroIsTerminator(true);485486GetPrintableElementType print_style = (elem_type == StringElementType::ASCII)487? GetPrintableElementType::ASCII488: GetPrintableElementType::UTF8;489return DumpEncodedBufferToStream(print_style, ConvertFunction, dump_options);490}491492template <>493bool StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF8>(494const ReadStringAndDumpToStreamOptions &options) {495return ReadEncodedBufferAndDumpToStream<llvm::UTF8>(StringElementType::UTF8,496options, nullptr);497}498499template <>500bool StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF16>(501const ReadStringAndDumpToStreamOptions &options) {502return ReadEncodedBufferAndDumpToStream<llvm::UTF16>(503StringElementType::UTF16, options, llvm::ConvertUTF16toUTF8);504}505506template <>507bool StringPrinter::ReadStringAndDumpToStream<StringElementType::UTF32>(508const ReadStringAndDumpToStreamOptions &options) {509return ReadEncodedBufferAndDumpToStream<llvm::UTF32>(510StringElementType::UTF32, options, llvm::ConvertUTF32toUTF8);511}512513template <>514bool StringPrinter::ReadStringAndDumpToStream<StringElementType::ASCII>(515const ReadStringAndDumpToStreamOptions &options) {516return ReadEncodedBufferAndDumpToStream<char>(StringElementType::ASCII,517options, nullptr);518}519520template <>521bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF8>(522const ReadBufferAndDumpToStreamOptions &options) {523return DumpEncodedBufferToStream<llvm::UTF8>(GetPrintableElementType::UTF8,524nullptr, options);525}526527template <>528bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF16>(529const ReadBufferAndDumpToStreamOptions &options) {530return DumpEncodedBufferToStream(GetPrintableElementType::UTF8,531llvm::ConvertUTF16toUTF8, options);532}533534template <>535bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::UTF32>(536const ReadBufferAndDumpToStreamOptions &options) {537return DumpEncodedBufferToStream(GetPrintableElementType::UTF8,538llvm::ConvertUTF32toUTF8, options);539}540541template <>542bool StringPrinter::ReadBufferAndDumpToStream<StringElementType::ASCII>(543const ReadBufferAndDumpToStreamOptions &options) {544// Treat ASCII the same as UTF8.545//546// FIXME: This is probably not the right thing to do (well, it's debatable).547// If an ASCII-encoded string happens to contain a sequence of invalid bytes548// that forms a valid UTF8 character, we'll print out that character. This is549// good if you're playing fast and loose with encodings (probably good for550// std::string users), but maybe not so good if you care about your string551// formatter respecting the semantics of your selected string encoding. In552// the latter case you'd want to see the character byte sequence ('\x..'), not553// the UTF8 character itself.554return ReadBufferAndDumpToStream<StringElementType::UTF8>(options);555}556557} // namespace formatters558559} // namespace lldb_private560561562