CoCalc -- parse.cpp

GitHub Repository: stenzek/duckstation
Path: blob/master/dep/rapidyaml/src/c4/yml/parse.cpp
⁴²⁶² views
1
#include "c4/yml/parse.hpp"
2
#include "c4/error.hpp"
3
#include "c4/utf.hpp"
4
#include <c4/dump.hpp>
5

6
#include <ctype.h>
7
#include <stdarg.h>
8
#include <stdio.h>
9

10
#include "c4/yml/detail/parser_dbg.hpp"
11
#ifdef RYML_DBG
12
#include "c4/yml/detail/print.hpp"
13
#endif
14

15
#ifndef RYML_ERRMSG_SIZE
16
    #define RYML_ERRMSG_SIZE 1024
17
#endif
18

19
//#define RYML_WITH_TAB_TOKENS
20
#ifdef RYML_WITH_TAB_TOKENS
21
#define _RYML_WITH_TAB_TOKENS(...) __VA_ARGS__
22
#define _RYML_WITH_OR_WITHOUT_TAB_TOKENS(with, without) with
23
#else
24
#define _RYML_WITH_TAB_TOKENS(...)
25
#define _RYML_WITH_OR_WITHOUT_TAB_TOKENS(with, without) without
26
#endif
27

28

29
#if defined(_MSC_VER)
30
#   pragma warning(push)
31
#   pragma warning(disable: 4296/*expression is always 'boolean_value'*/)
32
#elif defined(__clang__)
33
#   pragma clang diagnostic push
34
#   pragma clang diagnostic ignored "-Wtype-limits" // to remove a warning on an assertion that a size_t >= 0. Later on, this size_t will turn into a template argument, and then it can become < 0.
35
#   pragma clang diagnostic ignored "-Wformat-nonliteral"
36
#   pragma clang diagnostic ignored "-Wold-style-cast"
37
#elif defined(__GNUC__)
38
#   pragma GCC diagnostic push
39
#   pragma GCC diagnostic ignored "-Wtype-limits" // to remove a warning on an assertion that a size_t >= 0. Later on, this size_t will turn into a template argument, and then it can become < 0.
40
#   pragma GCC diagnostic ignored "-Wformat-nonliteral"
41
#   pragma GCC diagnostic ignored "-Wold-style-cast"
42
#   if __GNUC__ >= 7
43
#       pragma GCC diagnostic ignored "-Wduplicated-branches"
44
#   endif
45
#endif
46

47
namespace c4 {
48
namespace yml {
49

50
namespace {
51

52
template<class DumpFn, class ...Args>
53
void _parse_dump(DumpFn dumpfn, c4::csubstr fmt, Args&& ...args)
54
{
55
    char writebuf[256];
56
    auto results = c4::format_dump_resume(dumpfn, writebuf, fmt, std::forward<Args>(args)...);
57
    // resume writing if the results failed to fit the buffer
58
    if(C4_UNLIKELY(results.bufsize > sizeof(writebuf))) // bufsize will be that of the largest element serialized. Eg int(1), will require 1 byte.
59
    {
60
        results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward<Args>(args)...);
61
        if(C4_UNLIKELY(results.bufsize > sizeof(writebuf)))
62
        {
63
            results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward<Args>(args)...);
64
        }
65
    }
66
}
67

68
/** @return false when @p s begins with one of the tokens listed below
 * (or is exactly "-"); true otherwise. */
bool _is_scalar_next__runk(csubstr s)
{
    // key/value separators, flow/comment/directive/anchor markers, complex keys
    if(s.begins_with(": ") || s.begins_with_any("#,{}[]%&") || s.begins_with("? "))
        return false;
    // sequence entry markers
    if(s == "-" || s.begins_with("- "))
        return false;
    // colon immediately followed by a quote
    if(s.begins_with(":\"") || s.begins_with(":'"))
        return false;
    return true;
}
72

73
/** @return false when @p s begins with any of "[{!&", with "? " or "- ",
 * or is exactly "-"; true otherwise. */
bool _is_scalar_next__rseq_rval(csubstr s)
{
    if(s.begins_with_any("[{!&"))
        return false;
    if(s.begins_with("? "))
        return false;
    const bool is_seq_entry = (s.begins_with("- ") || s == "-");
    return ! is_seq_entry;
}
77

78
/** @return false when @p s begins with ": ", "? ", any of "#,!&" or
 * (only when tab tokens are enabled) ":\t"; true otherwise. */
bool _is_scalar_next__rmap(csubstr s)
{
    if(s.begins_with(": ") || s.begins_with("? "))
        return false;
    _RYML_WITH_TAB_TOKENS(if(s.begins_with(":\t")) return false;)
    return ! s.begins_with_any("#,!&");
}
82

83
/** @return false when @p s begins with "- ", with '{' or '[', or is
 * exactly "-"; true otherwise. */
bool _is_scalar_next__rmap_val(csubstr s)
{
    if(s == "-" || s.begins_with("- "))
        return false;
    return ! s.begins_with_any("{[");
}
87

88
/** @return true when @p s is exactly "---" or "...", or begins with one
 * of those markers immediately followed by a space or a tab. */
bool _is_doc_sep(csubstr s)
{
    const bool has_marker = s.begins_with("---") || s.begins_with("...");
    if( ! has_marker)
        return false;
    // the marker is three characters long: either nothing follows it,
    // or what follows must start with whitespace
    if(s.len == 3u)
        return true;
    return s.sub(3).begins_with_any(" \t");
}
99

100
/** @p i is set to the first non whitespace character after the line
101
 * @return the number of empty lines after the initial position */
102
size_t count_following_newlines(csubstr r, size_t *C4_RESTRICT i, size_t indentation)
103
{
104
    RYML_ASSERT(r[*i] == '\n');
105
    size_t numnl_following = 0;
106
    ++(*i);
107
    for( ; *i < r.len; ++(*i))
108
    {
109
        if(r.str[*i] == '\n')
110
        {
111
            ++numnl_following;
112
            if(indentation) // skip the indentation after the newline
113
            {
114
                size_t stop = *i + indentation;
115
                for( ; *i < r.len; ++(*i))
116
                {
117
                    if(r.str[*i] != ' ' && r.str[*i] != '\r')
118
                        break;
119
                    RYML_ASSERT(*i < stop);
120
                }
121
                C4_UNUSED(stop);
122
            }
123
        }
124
        else if(r.str[*i] == ' ' || r.str[*i] == '\t' || r.str[*i] == '\r')  // skip leading whitespace
125
            ;
126
        else
127
            break;
128
    }
129
    return numnl_following;
130
}
131

132
} // anon namespace
133

134

135
//-----------------------------------------------------------------------------
136

137
/** Release the buffers owned by the parser, then reset every member. */
Parser::~Parser()
{
    _free(); // give back the newline offsets, the filter arena and the stack
    _clr();  // leave all members in their cleared state
}
142

143
/** Construct an idle parser.
 * @param cb   callbacks handed to the state stack
 * @param opts parser options, stored as given
 *
 * All other members start out empty/zero. A single default state is
 * pushed to the stack and made the current state. */
Parser::Parser(Callbacks const& cb, ParserOptions opts)
    : m_options(opts)
    , m_file()
    , m_buf()
    , m_root_id(NONE)
    , m_tree()
    , m_stack(cb)
    , m_state()
    , m_key_tag_indentation(0)
    , m_key_tag2_indentation(0)
    , m_key_tag()
    , m_key_tag2()
    , m_val_tag_indentation(0)
    , m_val_tag()
    , m_key_anchor_was_before(false)
    , m_key_anchor_indentation(0)
    , m_key_anchor()
    , m_val_anchor_indentation(0)
    , m_val_anchor()
    , m_filter_arena()
    , m_newline_offsets()
    , m_newline_offsets_size(0)
    , m_newline_offsets_capacity(0)
    , m_newline_offsets_buf()
{
    // the parser always works on the top of the stack, so seed it
    State initial{};
    m_stack.push(initial);
    m_state = &m_stack.top();
}
171

172
/** Move-construct from @p that.
 *
 * The stack is moved; every other member, including the pointers to the
 * filter arena and to the newline offsets, is copied as-is. @p that is
 * then cleared, so it no longer refers to any of those buffers. */
Parser::Parser(Parser &&that)
    : m_options(that.m_options)
    , m_file(that.m_file)
    , m_buf(that.m_buf)
    , m_root_id(that.m_root_id)
    , m_tree(that.m_tree)
    , m_stack(std::move(that.m_stack))
    , m_state(&m_stack.top()) // points into this object's own stack
    , m_key_tag_indentation(that.m_key_tag_indentation)
    , m_key_tag2_indentation(that.m_key_tag2_indentation)
    , m_key_tag(that.m_key_tag)
    , m_key_tag2(that.m_key_tag2)
    , m_val_tag_indentation(that.m_val_tag_indentation)
    , m_val_tag(that.m_val_tag)
    , m_key_anchor_was_before(that.m_key_anchor_was_before)
    , m_key_anchor_indentation(that.m_key_anchor_indentation)
    , m_key_anchor(that.m_key_anchor)
    , m_val_anchor_indentation(that.m_val_anchor_indentation)
    , m_val_anchor(that.m_val_anchor)
    , m_filter_arena(that.m_filter_arena)
    , m_newline_offsets(that.m_newline_offsets)
    , m_newline_offsets_size(that.m_newline_offsets_size)
    , m_newline_offsets_capacity(that.m_newline_offsets_capacity)
    , m_newline_offsets_buf(that.m_newline_offsets_buf)
{
    // the buffers now belong to this object: wipe the source's members
    that._clr();
}
199

200
/** Copy-construct from @p that.
 *
 * Plain members and the stack are copied. The newline offsets are
 * copied into a newly sized array of the same capacity, and a filter
 * arena of the same length is allocated (its contents are not copied). */
Parser::Parser(Parser const& that)
    : m_options(that.m_options)
    , m_file(that.m_file)
    , m_buf(that.m_buf)
    , m_root_id(that.m_root_id)
    , m_tree(that.m_tree)
    , m_stack(that.m_stack)
    , m_state(&m_stack.top()) // points into this object's own stack
    , m_key_tag_indentation(that.m_key_tag_indentation)
    , m_key_tag2_indentation(that.m_key_tag2_indentation)
    , m_key_tag(that.m_key_tag)
    , m_key_tag2(that.m_key_tag2)
    , m_val_tag_indentation(that.m_val_tag_indentation)
    , m_val_tag(that.m_val_tag)
    , m_key_anchor_was_before(that.m_key_anchor_was_before)
    , m_key_anchor_indentation(that.m_key_anchor_indentation)
    , m_key_anchor(that.m_key_anchor)
    , m_val_anchor_indentation(that.m_val_anchor_indentation)
    , m_val_anchor(that.m_val_anchor)
    , m_filter_arena()
    , m_newline_offsets()
    , m_newline_offsets_size()
    , m_newline_offsets_capacity()
    , m_newline_offsets_buf()
{
    if(that.m_newline_offsets_capacity)
    {
        _resize_locations(that.m_newline_offsets_capacity);
        _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity == that.m_newline_offsets_capacity);
        memcpy(m_newline_offsets, that.m_newline_offsets, that.m_newline_offsets_size * sizeof(size_t));
        m_newline_offsets_size = that.m_newline_offsets_size;
        // keep the record of which buffer the offsets refer to together
        // with the offsets themselves, as copy-assignment does
        m_newline_offsets_buf = that.m_newline_offsets_buf;
    }
    if(that.m_filter_arena.len)
    {
        _resize_filter_arena(that.m_filter_arena.len);
    }
}
237

238
/** Move-assign from @p that.
 *
 * The buffers currently held are freed, the stack is moved, and every
 * other member (including the pointers to the filter arena and to the
 * newline offsets) is copied as-is; @p that is then cleared.
 * Assigning an object to itself is a no-op.
 * @return *this */
Parser& Parser::operator=(Parser &&that)
{
    // without this guard a self-move would free our own buffers and
    // then clear the object
    if(this == &that)
        return *this;
    _free();
    m_options = (that.m_options);
    m_file = (that.m_file);
    m_buf = (that.m_buf);
    m_root_id = (that.m_root_id);
    m_tree = (that.m_tree);
    m_stack = std::move(that.m_stack);
    m_state = (&m_stack.top());
    m_key_tag_indentation = (that.m_key_tag_indentation);
    m_key_tag2_indentation = (that.m_key_tag2_indentation);
    m_key_tag = (that.m_key_tag);
    m_key_tag2 = (that.m_key_tag2);
    m_val_tag_indentation = (that.m_val_tag_indentation);
    m_val_tag = (that.m_val_tag);
    m_key_anchor_was_before = (that.m_key_anchor_was_before);
    m_key_anchor_indentation = (that.m_key_anchor_indentation);
    m_key_anchor = (that.m_key_anchor);
    m_val_anchor_indentation = (that.m_val_anchor_indentation);
    m_val_anchor = (that.m_val_anchor);
    m_filter_arena = that.m_filter_arena;
    m_newline_offsets = (that.m_newline_offsets);
    m_newline_offsets_size = (that.m_newline_offsets_size);
    m_newline_offsets_capacity = (that.m_newline_offsets_capacity);
    m_newline_offsets_buf = (that.m_newline_offsets_buf);
    // the buffers now belong to this object: wipe the source's members
    that._clr();
    return *this;
}
267

268
/** Copy-assign from @p that.
 *
 * The buffers currently held are freed first. Plain members and the
 * stack are then copied, a filter arena of the same length is allocated
 * (its contents are not copied), and the newline offsets are copied into
 * an array resized to hold them. Assigning an object to itself is a no-op.
 * @return *this */
Parser& Parser::operator=(Parser const& that)
{
    // without this guard the _free() below would destroy the very data
    // we are about to copy from
    if(this == &that)
        return *this;
    _free();
    m_options = (that.m_options);
    m_file = (that.m_file);
    m_buf = (that.m_buf);
    m_root_id = (that.m_root_id);
    m_tree = (that.m_tree);
    m_stack = that.m_stack;
    m_state = &m_stack.top();
    m_key_tag_indentation = (that.m_key_tag_indentation);
    m_key_tag2_indentation = (that.m_key_tag2_indentation);
    m_key_tag = (that.m_key_tag);
    m_key_tag2 = (that.m_key_tag2);
    m_val_tag_indentation = (that.m_val_tag_indentation);
    m_val_tag = (that.m_val_tag);
    m_key_anchor_was_before = (that.m_key_anchor_was_before);
    m_key_anchor_indentation = (that.m_key_anchor_indentation);
    m_key_anchor = (that.m_key_anchor);
    m_val_anchor_indentation = (that.m_val_anchor_indentation);
    m_val_anchor = (that.m_val_anchor);
    if(that.m_filter_arena.len > 0)
        _resize_filter_arena(that.m_filter_arena.len);
    if(that.m_newline_offsets_capacity > m_newline_offsets_capacity)
        _resize_locations(that.m_newline_offsets_capacity);
    _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_capacity);
    _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_size);
    // when there is nothing to copy both pointers may be null, and
    // memcpy() must not be given null pointers even for a size of zero
    if(that.m_newline_offsets_size)
        memcpy(m_newline_offsets, that.m_newline_offsets, that.m_newline_offsets_size * sizeof(size_t));
    m_newline_offsets_size = that.m_newline_offsets_size;
    m_newline_offsets_buf = that.m_newline_offsets_buf;
    return *this;
}
300

301
void Parser::_clr()
302
{
303
    m_options = {};
304
    m_file = {};
305
    m_buf = {};
306
    m_root_id = {};
307
    m_tree = {};
308
    m_stack.clear();
309
    m_state = {};
310
    m_key_tag_indentation = {};
311
    m_key_tag2_indentation = {};
312
    m_key_tag = {};
313
    m_key_tag2 = {};
314
    m_val_tag_indentation = {};
315
    m_val_tag = {};
316
    m_key_anchor_was_before = {};
317
    m_key_anchor_indentation = {};
318
    m_key_anchor = {};
319
    m_val_anchor_indentation = {};
320
    m_val_anchor = {};
321
    m_filter_arena = {};
322
    m_newline_offsets = {};
323
    m_newline_offsets_size = {};
324
    m_newline_offsets_capacity = {};
325
    m_newline_offsets_buf = {};
326
}
327

328
void Parser::_free()
329
{
330
    if(m_newline_offsets)
331
    {
332
        _RYML_CB_FREE(m_stack.m_callbacks, m_newline_offsets, size_t, m_newline_offsets_capacity);
333
        m_newline_offsets = nullptr;
334
        m_newline_offsets_size = 0u;
335
        m_newline_offsets_capacity = 0u;
336
        m_newline_offsets_buf = 0u;
337
    }
338
    if(m_filter_arena.len)
339
    {
340
        _RYML_CB_FREE(m_stack.m_callbacks, m_filter_arena.str, char, m_filter_arena.len);
341
        m_filter_arena = {};
342
    }
343
    m_stack._free();
344
}
345

346

347
//-----------------------------------------------------------------------------
348
void Parser::_reset()
349
{
350
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.size() == 1);
351
    m_stack.clear();
352
    m_stack.push({});
353
    m_state = &m_stack.top();
354
    m_state->reset(m_file.str, m_root_id);
355

356
    m_key_tag_indentation = 0;
357
    m_key_tag2_indentation = 0;
358
    m_key_tag.clear();
359
    m_key_tag2.clear();
360
    m_val_tag_indentation = 0;
361
    m_val_tag.clear();
362
    m_key_anchor_was_before = false;
363
    m_key_anchor_indentation = 0;
364
    m_key_anchor.clear();
365
    m_val_anchor_indentation = 0;
366
    m_val_anchor.clear();
367

368
    if(m_options.locations())
369
    {
370
        _prepare_locations();
371
    }
372
}
373

374
//-----------------------------------------------------------------------------
375
template<class DumpFn>
376
void Parser::_fmt_msg(DumpFn &&dumpfn) const
377
{
378
    auto const& lc = m_state->line_contents;
379
    csubstr contents = lc.stripped;
380
    if(contents.len)
381
    {
382
        // print the yaml src line
383
        size_t offs = 3u + to_chars(substr{}, m_state->pos.line) + to_chars(substr{}, m_state->pos.col);
384
        if(m_file.len)
385
        {
386
            _parse_dump(dumpfn, "{}:", m_file);
387
            offs += m_file.len + 1;
388
        }
389
        _parse_dump(dumpfn, "{}:{}: ", m_state->pos.line, m_state->pos.col);
390
        csubstr maybe_full_content = (contents.len < 80u ? contents : contents.first(80u));
391
        csubstr maybe_ellipsis = (contents.len < 80u ? csubstr{} : csubstr("..."));
392
        _parse_dump(dumpfn, "{}{}  (size={})\n", maybe_full_content, maybe_ellipsis, contents.len);
393
        // highlight the remaining portion of the previous line
394
        size_t firstcol = (size_t)(lc.rem.begin() - lc.full.begin());
395
        size_t lastcol = firstcol + lc.rem.len;
396
        for(size_t i = 0; i < offs + firstcol; ++i)
397
            dumpfn(" ");
398
        dumpfn("^");
399
        for(size_t i = 1, e = (lc.rem.len < 80u ? lc.rem.len : 80u); i < e; ++i)
400
            dumpfn("~");
401
        _parse_dump(dumpfn, "{}  (cols {}-{})\n", maybe_ellipsis, firstcol+1, lastcol+1);
402
    }
403
    else
404
    {
405
        dumpfn("\n");
406
    }
407

408
#ifdef RYML_DBG
409
    // next line: print the state flags
410
    {
411
        char flagbuf_[64];
412
        _parse_dump(dumpfn, "top state: {}\n", _prfl(flagbuf_, m_state->flags));
413
    }
414
#endif
415
}
416

417

418
//-----------------------------------------------------------------------------
419
template<class ...Args>
420
void Parser::_err(csubstr fmt, Args const& C4_RESTRICT ...args) const
421
{
422
    char errmsg[RYML_ERRMSG_SIZE];
423
    detail::_SubstrWriter writer(errmsg);
424
    auto dumpfn = [&writer](csubstr s){ writer.append(s); };
425
    _parse_dump(dumpfn, fmt, args...);
426
    writer.append('\n');
427
    _fmt_msg(dumpfn);
428
    size_t len = writer.pos < RYML_ERRMSG_SIZE ? writer.pos : RYML_ERRMSG_SIZE;
429
    m_tree->m_callbacks.m_error(errmsg, len, m_state->pos, m_tree->m_callbacks.m_user_data);
430
}
431

432
//-----------------------------------------------------------------------------
433
#ifdef RYML_DBG
434
template<class ...Args>
435
void Parser::_dbg(csubstr fmt, Args const& C4_RESTRICT ...args) const
436
{
437
    auto dumpfn = [](csubstr s){ fwrite(s.str, 1, s.len, stdout); };
438
    _parse_dump(dumpfn, fmt, args...);
439
    dumpfn("\n");
440
    _fmt_msg(dumpfn);
441
}
442
#endif
443

444
//-----------------------------------------------------------------------------
445
bool Parser::_finished_file() const
446
{
447
    bool ret = m_state->pos.offset >= m_buf.len;
448
    if(ret)
449
    {
450
        _c4dbgp("finished file!!!");
451
    }
452
    return ret;
453
}
454

455
//-----------------------------------------------------------------------------
456
bool Parser::_finished_line() const
457
{
458
    return m_state->line_contents.rem.empty();
459
}
460

461
//-----------------------------------------------------------------------------
462
/** Parse @p buf in place.
 * @param file    stored as the file name (used when formatting messages)
 * @param buf     the buffer to parse
 * @param t       stored as the tree being worked on
 * @param node_id stored as the root id
 *
 * After resetting the parser, lines are scanned and handled one at a
 * time until the end of the buffer is reached. */
void Parser::parse_in_place(csubstr file, substr buf, Tree *t, size_t node_id)
{
    m_file = file;
    m_buf = buf;
    m_root_id = node_id;
    m_tree = t;
    _reset();
    while( ! _finished_file())
    {
        _scan_line();
        // keep handling until the whole line is consumed
        for( ; ! _finished_line(); )
        {
            _handle_line();
        }
        // it may have finished because of multiline blocks
        if(_finished_file())
        {
            break;
        }
        _line_ended();
    }
    _handle_finished_file();
}
480

481
//-----------------------------------------------------------------------------
482
void Parser::_handle_finished_file()
483
{
484
    _end_stream();
485
}
486

487
//-----------------------------------------------------------------------------
488
void Parser::_handle_line()
489
{
490
    _c4dbgq("\n-----------");
491
    _c4dbgt("handling line={}, offset={}B", m_state->pos.line, m_state->pos.offset);
492
    _RYML_CB_ASSERT(m_stack.m_callbacks,  ! m_state->line_contents.rem.empty());
493
    if(has_any(RSEQ))
494
    {
495
        if(has_any(FLOW))
496
        {
497
            if(_handle_seq_flow())
498
                return;
499
        }
500
        else
501
        {
502
            if(_handle_seq_blck())
503
                return;
504
        }
505
    }
506
    else if(has_any(RMAP))
507
    {
508
        if(has_any(FLOW))
509
        {
510
            if(_handle_map_flow())
511
                return;
512
        }
513
        else
514
        {
515
            if(_handle_map_blck())
516
                return;
517
        }
518
    }
519
    else if(has_any(RUNK))
520
    {
521
        if(_handle_unk())
522
            return;
523
    }
524

525
    if(_handle_top())
526
        return;
527
}
528

529

530
//-----------------------------------------------------------------------------
531
bool Parser::_handle_unk()
532
{
533
    _c4dbgp("handle_unk");
534

535
    csubstr rem = m_state->line_contents.rem;
536
    const bool start_as_child = (node(m_state) == nullptr);
537

538
    if(C4_UNLIKELY(has_any(NDOC)))
539
    {
540
        if(rem == "---" || rem.begins_with("--- "))
541
        {
542
            _start_new_doc(rem);
543
            return true;
544
        }
545
        auto trimmed = rem.triml(' ');
546
        if(trimmed == "---" || trimmed.begins_with("--- "))
547
        {
548
            _RYML_CB_ASSERT(m_stack.m_callbacks, rem.len >= trimmed.len);
549
            _line_progressed(rem.len - trimmed.len);
550
            _start_new_doc(trimmed);
551
            _save_indentation();
552
            return true;
553
        }
554
        else if(trimmed.begins_with("..."))
555
        {
556
            _end_stream();
557
        }
558
        else if(trimmed.first_of("#%") == csubstr::npos) // neither a doc nor a tag
559
        {
560
            _c4dbgpf("starting implicit doc to accomodate unexpected tokens: '{}'", rem);
561
            size_t indref = m_state->indref;
562
            _push_level();
563
            _start_doc();
564
            _set_indentation(indref);
565
        }
566
        _RYML_CB_ASSERT(m_stack.m_callbacks, !trimmed.empty());
567
    }
568

569
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT|RSEQ|RMAP));
570
    if(m_state->indref > 0)
571
    {
572
        csubstr ws = rem.left_of(rem.first_not_of(' '));
573
        if(m_state->indref <= ws.len)
574
        {
575
            _c4dbgpf("skipping base indentation of {}", m_state->indref);
576
            _line_progressed(m_state->indref);
577
            rem = rem.sub(m_state->indref);
578
        }
579
    }
580

581
    if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t")))
582
    {
583
        _c4dbgpf("it's a seq (as_child={})", start_as_child);
584
        _move_key_anchor_to_val_anchor();
585
        _move_key_tag_to_val_tag();
586
        _push_level();
587
        _start_seq(start_as_child);
588
        _save_indentation();
589
        _line_progressed(2);
590
        return true;
591
    }
592
    else if(rem == '-')
593
    {
594
        _c4dbgpf("it's a seq (as_child={})", start_as_child);
595
        _move_key_anchor_to_val_anchor();
596
        _move_key_tag_to_val_tag();
597
        _push_level();
598
        _start_seq(start_as_child);
599
        _save_indentation();
600
        _line_progressed(1);
601
        return true;
602
    }
603
    else if(rem.begins_with('['))
604
    {
605
        _c4dbgpf("it's a seq, flow (as_child={})", start_as_child);
606
        _move_key_anchor_to_val_anchor();
607
        _move_key_tag_to_val_tag();
608
        _push_level(/*explicit flow*/true);
609
        _start_seq(start_as_child);
610
        add_flags(FLOW);
611
        _line_progressed(1);
612
        return true;
613
    }
614
    else if(rem.begins_with('{'))
615
    {
616
        _c4dbgpf("it's a map, flow (as_child={})", start_as_child);
617
        _move_key_anchor_to_val_anchor();
618
        _move_key_tag_to_val_tag();
619
        _push_level(/*explicit flow*/true);
620
        _start_map(start_as_child);
621
        addrem_flags(FLOW|RKEY, RVAL);
622
        _line_progressed(1);
623
        return true;
624
    }
625
    else if(rem.begins_with("? "))
626
    {
627
        _c4dbgpf("it's a map (as_child={}) + this key is complex", start_as_child);
628
        _move_key_anchor_to_val_anchor();
629
        _move_key_tag_to_val_tag();
630
        _push_level();
631
        _start_map(start_as_child);
632
        addrem_flags(RKEY|QMRK, RVAL);
633
        _save_indentation();
634
        _line_progressed(2);
635
        return true;
636
    }
637
    else if(rem.begins_with(": ") && !has_any(SSCL))
638
    {
639
        _c4dbgp("it's a map with an empty key");
640
        _move_key_anchor_to_val_anchor();
641
        _move_key_tag_to_val_tag();
642
        _push_level();
643
        _start_map(start_as_child);
644
        _store_scalar_null(rem.str);
645
        addrem_flags(RVAL, RKEY);
646
        _save_indentation();
647
        _line_progressed(2);
648
        return true;
649
    }
650
    else if(rem == ':' && !has_any(SSCL))
651
    {
652
        _c4dbgp("it's a map with an empty key");
653
        _move_key_anchor_to_val_anchor();
654
        _move_key_tag_to_val_tag();
655
        _push_level();
656
        _start_map(start_as_child);
657
        _store_scalar_null(rem.str);
658
        addrem_flags(RVAL, RKEY);
659
        _save_indentation();
660
        _line_progressed(1);
661
        return true;
662
    }
663
    else if(_handle_types())
664
    {
665
        return true;
666
    }
667
    else if(!rem.begins_with('*') && _handle_key_anchors_and_refs())
668
    {
669
        return true;
670
    }
671
    else if(has_any(SSCL))
672
    {
673
        _c4dbgpf("there's a stored scalar: '{}'", m_state->scalar);
674

675
        csubstr saved_scalar;
676
        bool is_quoted = false;
677
        if(_scan_scalar_unk(&saved_scalar, &is_quoted))
678
        {
679
            rem = m_state->line_contents.rem;
680
            _c4dbgpf("... and there's also a scalar next! '{}'", saved_scalar);
681
            if(rem.begins_with_any(" \t"))
682
            {
683
                size_t n = rem.first_not_of(" \t");
684
                _c4dbgpf("skipping {} spaces/tabs", n);
685
                rem = rem.sub(n);
686
                _line_progressed(n);
687
            }
688
        }
689

690
        _c4dbgpf("rem='{}'", rem);
691

692
        if(rem.begins_with(", "))
693
        {
694
            _c4dbgpf("got a ',' -- it's a seq (as_child={})", start_as_child);
695
            _start_seq(start_as_child);
696
            add_flags(FLOW);
697
            _append_val(_consume_scalar());
698
            _line_progressed(2);
699
        }
700
        else if(rem.begins_with(','))
701
        {
702
            _c4dbgpf("got a ',' -- it's a seq (as_child={})", start_as_child);
703
            _start_seq(start_as_child);
704
            add_flags(FLOW);
705
            _append_val(_consume_scalar());
706
            _line_progressed(1);
707
        }
708
        else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
709
        {
710
            _c4dbgpf("got a ': ' -- it's a map (as_child={})", start_as_child);
711
            _start_map_unk(start_as_child); // wait for the val scalar to append the key-val pair
712
            _line_progressed(2);
713
        }
714
        else if(rem == ":" || rem.begins_with(":\"") || rem.begins_with(":'"))
715
        {
716
            if(rem == ":") { _c4dbgpf("got a ':' -- it's a map (as_child={})", start_as_child); }
717
            else { _c4dbgpf("got a '{}' -- it's a map (as_child={})", rem.first(2), start_as_child); }
718
            _start_map_unk(start_as_child); // wait for the val scalar to append the key-val pair
719
            _line_progressed(1); // advance only 1
720
        }
721
        #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
722
        else if(rem.begins_with('}'))
723
        {
724
            if(!has_all(RMAP|FLOW))
725
            {
726
                _c4err("invalid token: not reading a map");
727
            }
728
            if(!has_all(SSCL))
729
            {
730
                _c4err("no scalar stored");
731
            }
732
            _append_key_val(saved_scalar, is_quoted);
733
            _stop_map();
734
            _line_progressed(1);
735
            saved_scalar.clear();
736
            is_quoted = false;
737
        }
738
        #endif
739
        else if(rem.begins_with("..."))
740
        {
741
            _c4dbgp("got stream end '...'");
742
            _end_stream();
743
            _line_progressed(3);
744
        }
745
        else if(rem.begins_with('#'))
746
        {
747
            _c4dbgpf("it's a comment: '{}'", rem);
748
            _scan_comment();
749
            return true;
750
        }
751
        else if(_handle_key_anchors_and_refs())
752
        {
753
            return true;
754
        }
755
        else if(rem.begins_with(" ") || rem.begins_with("\t"))
756
        {
757
            size_t n = rem.first_not_of(" \t");
758
            if(n == npos)
759
                n = rem.len;
760
            _c4dbgpf("has {} spaces/tabs, skip...", n);
761
            _line_progressed(n);
762
            return true;
763
        }
764
        else if(rem.empty())
765
        {
766
            // nothing to do
767
        }
768
        else if(rem == "---" || rem.begins_with("--- "))
769
        {
770
            _c4dbgp("caught ---: starting doc");
771
            _start_new_doc(rem);
772
            return true;
773
        }
774
        else if(rem.begins_with('%'))
775
        {
776
            _c4dbgp("caught a directive: ignoring...");
777
            _line_progressed(rem.len);
778
            return true;
779
        }
780
        else
781
        {
782
            _c4err("parse error");
783
        }
784

785
        if(is_quoted || (! saved_scalar.empty()))
786
        {
787
            _store_scalar(saved_scalar, is_quoted);
788
        }
789

790
        return true;
791
    }
792
    else
793
    {
794
        _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_any(SSCL));
795
        csubstr scalar;
796
        size_t indentation = m_state->line_contents.indentation; // save
797
        bool is_quoted;
798
        if(_scan_scalar_unk(&scalar, &is_quoted))
799
        {
800
            _c4dbgpf("got a {} scalar", is_quoted ? "quoted" : "");
801
            rem = m_state->line_contents.rem;
802
            {
803
                size_t first = rem.first_not_of(" \t");
804
                if(first && first != npos)
805
                {
806
                    _c4dbgpf("skip {} whitespace characters", first);
807
                   _line_progressed(first);
808
                   rem = rem.sub(first);
809
                }
810
            }
811
            _store_scalar(scalar, is_quoted);
812
            if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
813
            {
814
                _c4dbgpf("got a ': ' next -- it's a map (as_child={})", start_as_child);
815
                _push_level();
816
                _start_map(start_as_child); // wait for the val scalar to append the key-val pair
817
                _set_indentation(indentation);
818
                _line_progressed(2); // call this AFTER saving the indentation
819
            }
820
            else if(rem.begins_with(':'))
821
            {
822
                _c4dbgpf("got a ':' next -- it's a map (as_child={})", start_as_child);
823
                _push_level();
824
                _start_map(start_as_child); // wait for the val scalar to append the key-val pair
825
                _set_indentation(indentation);
826
                _line_progressed(1); // call this AFTER saving the indentation
827
            }
828
            else
829
            {
830
                // we still don't know whether it's a seq or a map
831
                // so just store the scalar
832
            }
833
            return true;
834
        }
835
        else if(rem.begins_with_any(" \t"))
836
        {
837
            csubstr ws = rem.left_of(rem.first_not_of(" \t"));
838
            rem = rem.right_of(ws);
839
            if(has_all(RTOP) && rem.begins_with("---"))
840
            {
841
                _c4dbgp("there's a doc starting, and it's indented");
842
                _set_indentation(ws.len);
843
            }
844
            _c4dbgpf("skipping {} spaces/tabs", ws.len);
845
            _line_progressed(ws.len);
846
            return true;
847
        }
848
    }
849

850
    return false;
851
}
852

853

854
//-----------------------------------------------------------------------------
855
/** Progress the line past the leading run of @p c in the remainder of
 * the current line, which must begin with @p c. */
C4_ALWAYS_INLINE void Parser::_skipchars(char c)
{
    const csubstr remaining = m_state->line_contents.rem;
    _RYML_CB_ASSERT(m_stack.m_callbacks, remaining.begins_with(c));
    const size_t first_other = remaining.first_not_of(c);
    // maybe the line is just whitespace: then skip all of it
    const size_t pos = (first_other != npos) ? first_other : remaining.len;
    _c4dbgpf("skip {} '{}'", pos, c);
    _line_progressed(pos);
}
864

865
template<size_t N>
866
C4_ALWAYS_INLINE void Parser::_skipchars(const char (&chars)[N])
867
{
868
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begins_with_any(chars));
869
    size_t pos = m_state->line_contents.rem.first_not_of(chars);
870
    if(pos == npos)
871
        pos = m_state->line_contents.rem.len; // maybe the line is just whitespace
872
    _c4dbgpf("skip {} characters", pos);
873
    _line_progressed(pos);
874
}
875

876

877
//-----------------------------------------------------------------------------
878
/** Process the next token of the current line while inside a flow
 * sequence (`[...]`). The state must have RSEQ|FLOW set and RKEY clear.
 *
 * Leading spaces, comments and the closing `]` are handled first. What
 * follows depends on whether a value is pending (RVAL) or a separator is
 * expected after a value (RNXT); any other state is an internal error.
 *
 * @return true on every handled path. If _c4err() returns instead of
 *         aborting, control also falls through to the final `return true`.
 */
bool Parser::_handle_seq_flow()
879
{
880
    _c4dbgpf("handle_seq_flow: node_id={} level={}", m_state->node_id, m_state->level);
881
    csubstr rem = m_state->line_contents.rem;
882

883
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
884
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|FLOW));
885

886
    if(rem.begins_with(' '))
887
    {
888
        // with explicit flow, indentation does not matter
889
        _c4dbgp("starts with spaces");
890
        _skipchars(' ');
891
        return true;
892
    }
893
    _RYML_WITH_TAB_TOKENS(else if(rem.begins_with('\t'))
894
    {
895
        _c4dbgp("starts with tabs");
896
        _skipchars('\t');
897
        return true;
898
    })
899
    else if(rem.begins_with('#'))
900
    {
901
        _c4dbgp("it's a comment");
902
        rem = _scan_comment(); // also progresses the line
903
        return true;
904
    }
905
    else if(rem.begins_with(']'))
906
    {
907
        _c4dbgp("end the sequence");
908
        _pop_level();
909
        _line_progressed(1);
910
        // NOTE(review): this test runs after _pop_level(), so it presumably
        // inspects the enclosing level; if that level is an implicit map
        // nested in a seq, it is closed as well.
        if(has_all(RSEQIMAP))
911
        {
912
            _stop_seqimap();
913
            _pop_level();
914
        }
915
        return true;
916
    }
917

918
    // RVAL: a value is expected at the current position
    if(has_any(RVAL))
919
    {
920
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
921
        bool is_quoted;
922
        if(_scan_scalar_seq_flow(&rem, &is_quoted))
923
        {
924
            _c4dbgp("it's a scalar");
925
            addrem_flags(RNXT, RVAL);
926
            _append_val(rem, is_quoted);
927
            return true;
928
        }
929
        else if(rem.begins_with('['))
930
        {
931
            _c4dbgp("val is a child seq");
932
            addrem_flags(RNXT, RVAL); // before _push_level!
933
            _push_level(/*explicit flow*/true);
934
            _start_seq();
935
            add_flags(FLOW);
936
            _line_progressed(1);
937
            return true;
938
        }
939
        else if(rem.begins_with('{'))
940
        {
941
            _c4dbgp("val is a child map");
942
            addrem_flags(RNXT, RVAL); // before _push_level!
943
            _push_level(/*explicit flow*/true);
944
            _start_map();
945
            addrem_flags(FLOW|RKEY, RVAL);
946
            _line_progressed(1);
947
            return true;
948
        }
949
        else if(rem == ':')
950
        {
951
            _c4dbgpf("found ':' -- there's an implicit map in the seq node[{}]", m_state->node_id);
952
            _start_seqimap();
953
            _line_progressed(1);
954
            return true;
955
        }
956
        else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
957
        {
958
            _c4dbgpf("found ': ' -- there's an implicit map in the seq node[{}]", m_state->node_id);
959
            _start_seqimap();
960
            _line_progressed(2);
961
            return true;
962
        }
963
        else if(rem.begins_with("? "))
964
        {
965
            _c4dbgpf("found '? ' -- there's an implicit map in the seq node[{}]", m_state->node_id);
966
            _start_seqimap();
967
            _line_progressed(2);
968
            _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(SSCL) && m_state->scalar == "");
969
            addrem_flags(QMRK|RKEY, RVAL|SSCL);
970
            return true;
971
        }
972
        else if(_handle_types())
973
        {
974
            return true;
975
        }
976
        else if(_handle_val_anchors_and_refs())
977
        {
978
            return true;
979
        }
980
        else if(rem.begins_with(", "))
981
        {
982
            _c4dbgp("found ',' -- the value was null");
983
            // NOTE(review): the null is anchored one char before the comma,
            // presumably to give the empty value a source position -- confirm
            // against _append_val_null().
            _append_val_null(rem.str - 1);
984
            _line_progressed(2);
985
            return true;
986
        }
987
        else if(rem.begins_with(','))
988
        {
989
            _c4dbgp("found ',' -- the value was null");
990
            _append_val_null(rem.str - 1);
991
            _line_progressed(1);
992
            return true;
993
        }
994
        else if(rem.begins_with('\t'))
995
        {
996
            _skipchars('\t');
997
            return true;
998
        }
999
        else
1000
        {
1001
            _c4err("parse error");
1002
        }
1003
    }
1004
    // RNXT: a value was just read, so only a separator (or the ':' of an
    // implicit map) is accepted here
    else if(has_any(RNXT))
1005
    {
1006
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
1007
        if(rem.begins_with(", "))
1008
        {
1009
            _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW));
1010
            _c4dbgp("seq: expect next val");
1011
            addrem_flags(RVAL, RNXT);
1012
            _line_progressed(2);
1013
            return true;
1014
        }
1015
        else if(rem.begins_with(','))
1016
        {
1017
            _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW));
1018
            _c4dbgp("seq: expect next val");
1019
            addrem_flags(RVAL, RNXT);
1020
            _line_progressed(1);
1021
            return true;
1022
        }
1023
        else if(rem == ':')
1024
        {
1025
            _c4dbgpf("found ':' -- there's an implicit map in the seq node[{}]", m_state->node_id);
1026
            _start_seqimap();
1027
            _line_progressed(1);
1028
            return true;
1029
        }
1030
        else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
1031
        {
1032
            _c4dbgpf("found ': ' -- there's an implicit map in the seq node[{}]", m_state->node_id);
1033
            _start_seqimap();
1034
            _line_progressed(2);
1035
            return true;
1036
        }
1037
        else
1038
        {
1039
            _c4err("was expecting a comma");
1040
        }
1041
    }
1042
    else
1043
    {
1044
        _c4err("internal error");
1045
    }
1046

1047
    // reached only when one of the _c4err() calls above returns
    return true;
1048
}
1049

1050
//-----------------------------------------------------------------------------
1051
/** Process the next token of the current line while inside a block
 * (indentation-based) sequence. The state must have RSEQ set and both
 * RKEY and FLOW clear.
 *
 * Comments are handled first; the rest depends on whether a new entry is
 * expected (RNXT) or the value of the current entry is pending (RVAL).
 *
 * NOTE(review): the debug label below still reads "handle_seq_impl", which
 * does not match this function's name -- presumably left over from a
 * rename.
 *
 * @return true when a token was handled; false when neither RNXT nor RVAL
 *         is set, or when _c4err() returns instead of aborting.
 */
bool Parser::_handle_seq_blck()
1052
{
1053
    _c4dbgpf("handle_seq_impl: node_id={} level={}", m_state->node_id, m_state->level);
1054
    csubstr rem = m_state->line_contents.rem;
1055

1056
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ));
1057
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
1058
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW));
1059

1060
    if(rem.begins_with('#'))
1061
    {
1062
        _c4dbgp("it's a comment");
1063
        rem = _scan_comment();
1064
        return true;
1065
    }
1066
    // RNXT: the previous entry is complete; only a new dash, whitespace or
    // a document/stream marker is accepted
    if(has_any(RNXT))
1067
    {
1068
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
1069

1070
        if(_handle_indentation())
1071
            return true;
1072

1073
        if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t")))
1074
        {
1075
            _c4dbgp("expect another val");
1076
            addrem_flags(RVAL, RNXT);
1077
            _line_progressed(2);
1078
            return true;
1079
        }
1080
        else if(rem == '-')
1081
        {
1082
            _c4dbgp("expect another val");
1083
            addrem_flags(RVAL, RNXT);
1084
            _line_progressed(1);
1085
            return true;
1086
        }
1087
        else if(rem.begins_with_any(" \t"))
1088
        {
1089
            _RYML_CB_ASSERT(m_stack.m_callbacks,  ! _at_line_begin());
1090
            _skipchars(" \t");
1091
            return true;
1092
        }
1093
        else if(rem.begins_with("..."))
1094
        {
1095
            _c4dbgp("got stream end '...'");
1096
            _end_stream();
1097
            _line_progressed(3);
1098
            return true;
1099
        }
1100
        else if(rem.begins_with("---"))
1101
        {
1102
            _c4dbgp("got document start '---'");
1103
            _start_new_doc(rem);
1104
            return true;
1105
        }
1106
        else
1107
        {
1108
            _c4err("parse error");
1109
        }
1110
    }
1111
    // RVAL: the value of the current entry is expected
    else if(has_any(RVAL))
1112
    {
1113
        // there can be empty values
1114
        if(_handle_indentation())
1115
            return true;
1116

1117
        csubstr s;
1118
        bool is_quoted;
1119
        if(_scan_scalar_seq_blck(&s, &is_quoted)) // this also progresses the line
1120
        {
1121
            _c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : "");
1122

1123
            // re-read the remainder: the scan above moved the line position
            rem = m_state->line_contents.rem;
1124
            if(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(rem.begins_with_any(" \t"), rem.begins_with(' ')))
1125
            {
1126
                _c4dbgp("skipping whitespace...");
1127
                size_t skip = rem.first_not_of(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
1128
                if(skip == csubstr::npos)
1129
                    skip = rem.len; // maybe the line is just whitespace
1130
                _line_progressed(skip);
1131
                rem = rem.sub(skip);
1132
            }
1133

1134
            _c4dbgpf("rem=[{}]~~~{}~~~", rem.len, rem);
1135
            // a ':' following the scalar turns it into the first key of a
            // nested map, unless what follows is a comment
            if(!rem.begins_with('#') && (rem.ends_with(':') || rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))))
1136
            {
1137
                _c4dbgp("actually, the scalar is the first key of a map, and it opens a new scope");
1138
                if(m_key_anchor.empty())
1139
                    _move_val_anchor_to_key_anchor();
1140
                if(m_key_tag.empty())
1141
                    _move_val_tag_to_key_tag();
1142
                addrem_flags(RNXT, RVAL); // before _push_level! This prepares the current level for popping by setting it to RNXT
1143
                _push_level();
1144
                _start_map();
1145
                _store_scalar(s, is_quoted);
1146
                if( ! _maybe_set_indentation_from_anchor_or_tag())
1147
                {
1148
                    _c4dbgpf("set indentation from scalar: {}", m_state->scalar_col);
1149
                    _set_indentation(m_state->scalar_col); // this is the column where the scalar starts
1150
                }
1151
                _move_key_tag2_to_key_tag();
1152
                addrem_flags(RVAL, RKEY);
1153
                _line_progressed(1);
1154
            }
1155
            else
1156
            {
1157
                _c4dbgp("appending val to current seq");
1158
                _append_val(s, is_quoted);
1159
                addrem_flags(RNXT, RVAL);
1160
            }
1161
            return true;
1162
        }
1163
        else if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t")))
1164
        {
1165
            // the dash is consumed only when a nested seq was started; when
            // the helper returns false the line position is left unchanged
            if(_rval_dash_start_or_continue_seq())
1166
                _line_progressed(2);
1167
            return true;
1168
        }
1169
        else if(rem == '-')
1170
        {
1171
            if(_rval_dash_start_or_continue_seq())
1172
                _line_progressed(1);
1173
            return true;
1174
        }
1175
        else if(rem.begins_with('['))
1176
        {
1177
            _c4dbgp("val is a child seq, flow");
1178
            addrem_flags(RNXT, RVAL); // before _push_level!
1179
            _push_level(/*explicit flow*/true);
1180
            _start_seq();
1181
            add_flags(FLOW);
1182
            _line_progressed(1);
1183
            return true;
1184
        }
1185
        else if(rem.begins_with('{'))
1186
        {
1187
            _c4dbgp("val is a child map, flow");
1188
            addrem_flags(RNXT, RVAL); // before _push_level!
1189
            _push_level(/*explicit flow*/true);
1190
            _start_map();
1191
            addrem_flags(FLOW|RKEY, RVAL);
1192
            _line_progressed(1);
1193
            return true;
1194
        }
1195
        else if(rem.begins_with("? "))
1196
        {
1197
            _c4dbgp("val is a child map + this key is complex");
1198
            addrem_flags(RNXT, RVAL); // before _push_level!
1199
            _push_level();
1200
            _start_map();
1201
            addrem_flags(QMRK|RKEY, RVAL);
1202
            _save_indentation();
1203
            _line_progressed(2);
1204
            return true;
1205
        }
1206
        else if(rem.begins_with(' '))
1207
        {
1208
            csubstr spc = rem.left_of(rem.first_not_of(' '));
1209
            // both branches advance by the same amount; they differ only in
            // the debug message
            if(_at_line_begin())
1210
            {
1211
                _c4dbgpf("skipping value indentation: {} spaces", spc.len);
1212
                _line_progressed(spc.len);
1213
                return true;
1214
            }
1215
            else
1216
            {
1217
                _c4dbgpf("skipping {} spaces", spc.len);
1218
                _line_progressed(spc.len);
1219
                return true;
1220
            }
1221
        }
1222
        else if(_handle_types())
1223
        {
1224
            return true;
1225
        }
1226
        else if(_handle_val_anchors_and_refs())
1227
        {
1228
            return true;
1229
        }
1230
        /* pathological case:
1231
         * - &key : val
1232
         * - &key :
1233
         * - : val
1234
         */
1235
        else if((!has_all(SSCL)) &&
1236
                (rem.begins_with(": ") || rem.left_of(rem.find("#")).trimr("\t") == ":"))
1237
        {
1238
            if(!m_val_anchor.empty() || !m_val_tag.empty())
1239
            {
1240
                _c4dbgp("val is a child map + this key is empty, with anchors or tags");
1241
                addrem_flags(RNXT, RVAL); // before _push_level!
1242
                _move_val_tag_to_key_tag();
1243
                _move_val_anchor_to_key_anchor();
1244
                _push_level();
1245
                _start_map();
1246
                _store_scalar_null(rem.str);
1247
                addrem_flags(RVAL, RKEY);
1248
                RYML_CHECK(_maybe_set_indentation_from_anchor_or_tag()); // one of them must exist
1249
                _line_progressed(rem.begins_with(": ") ? 2u : 1u);
1250
                return true;
1251
            }
1252
            else
1253
            {
1254
                _c4dbgp("val is a child map + this key is empty, no anchors or tags");
1255
                addrem_flags(RNXT, RVAL); // before _push_level!
1256
                // capture the seq's indentation before _push_level(): the
                // child map's indentation is derived from it below
                size_t ind = m_state->indref;
1257
                _push_level();
1258
                _start_map();
1259
                _store_scalar_null(rem.str);
1260
                addrem_flags(RVAL, RKEY);
1261
                _c4dbgpf("set indentation from map anchor: {}", ind + 2);
1262
                _set_indentation(ind + 2); // this is the column where the map starts
1263
                _line_progressed(rem.begins_with(": ") ? 2u : 1u);
1264
                return true;
1265
            }
1266
        }
1267
        else
1268
        {
1269
            _c4err("parse error");
1270
        }
1271
    }
1272

1273
    // reached when neither RNXT nor RVAL is set, or when _c4err() returns
    return false;
1274
}
1275

1276
//-----------------------------------------------------------------------------
1277

1278
bool Parser::_rval_dash_start_or_continue_seq()
1279
{
1280
    size_t ind = m_state->line_contents.current_col();
1281
    _RYML_CB_ASSERT(m_stack.m_callbacks, ind >= m_state->indref);
1282
    size_t delta_ind = ind - m_state->indref;
1283
    if( ! delta_ind)
1284
    {
1285
        _c4dbgp("prev val was empty");
1286
        addrem_flags(RNXT, RVAL);
1287
        _append_val_null(&m_state->line_contents.full[ind]);
1288
        return false;
1289
    }
1290
    _c4dbgp("val is a nested seq, indented");
1291
    addrem_flags(RNXT, RVAL); // before _push_level!
1292
    _push_level();
1293
    _start_seq();
1294
    _save_indentation();
1295
    return true;
1296
}
1297

1298
//-----------------------------------------------------------------------------
1299
/** Process the next token of the current line while inside a flow map
 * (`{...}`). The state must have RMAP|FLOW set.
 *
 * Leading spaces, comments and the closing `}` are handled first. What
 * follows depends on which of RNXT (separator expected), RKEY (key
 * expected) or RVAL (value expected) is set; any other state is an
 * internal error.
 *
 * @return true when a token was handled; false only when _c4err() returns
 *         instead of aborting.
 */
bool Parser::_handle_map_flow()
1300
{
1301
    // explicit flow, ie, inside {}, separated by commas
1302
    _c4dbgpf("handle_map_flow: node_id={}  level={}", m_state->node_id, m_state->level);
1303
    csubstr rem = m_state->line_contents.rem;
1304

1305
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP|FLOW));
1306

1307
    if(rem.begins_with(' '))
1308
    {
1309
        // with explicit flow, indentation does not matter
1310
        _c4dbgp("starts with spaces");
1311
        _skipchars(' ');
1312
        return true;
1313
    }
1314
    _RYML_WITH_TAB_TOKENS(else if(rem.begins_with('\t'))
1315
    {
1316
        // with explicit flow, indentation does not matter
1317
        _c4dbgp("starts with tabs");
1318
        _skipchars('\t');
1319
        return true;
1320
    })
1321
    else if(rem.begins_with('#'))
1322
    {
1323
        _c4dbgp("it's a comment");
1324
        rem = _scan_comment(); // also progresses the line
1325
        return true;
1326
    }
1327
    else if(rem.begins_with('}'))
1328
    {
1329
        _c4dbgp("end the map");
1330
        // a scalar is still stored: it is a key whose value never came
        if(has_all(SSCL))
1331
        {
1332
            _c4dbgp("the last val was null");
1333
            _append_key_val_null(rem.str - 1);
1334
            rem_flags(RVAL);
1335
        }
1336
        _pop_level();
1337
        _line_progressed(1);
1338
        if(has_all(RSEQIMAP))
1339
        {
1340
            _c4dbgp("stopping implicitly nested 1x map");
1341
            _stop_seqimap();
1342
            _pop_level();
1343
        }
1344
        return true;
1345
    }
1346

1347
    // RNXT: a key/value pair was completed; only a comma is accepted
    if(has_any(RNXT))
1348
    {
1349
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
1350
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
1351
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RSEQIMAP));
1352

1353
        if(rem.begins_with(", "))
1354
        {
1355
            _c4dbgp("seq: expect next keyval");
1356
            addrem_flags(RKEY, RNXT);
1357
            _line_progressed(2);
1358
            return true;
1359
        }
1360
        else if(rem.begins_with(','))
1361
        {
1362
            _c4dbgp("seq: expect next keyval");
1363
            addrem_flags(RKEY, RNXT);
1364
            _line_progressed(1);
1365
            return true;
1366
        }
1367
        else
1368
        {
1369
            _c4err("parse error");
1370
        }
1371
    }
1372
    // RKEY: a key is expected
    else if(has_any(RKEY))
1373
    {
1374
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
1375
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
1376

1377
        bool is_quoted;
1378
        // scan for a key only when no scalar is stored yet. This branch does
        // not return: the token following the scalar is examined below.
        if(has_none(SSCL) && _scan_scalar_map_flow(&rem, &is_quoted))
1379
        {
1380
            _c4dbgp("it's a scalar");
1381
            _store_scalar(rem, is_quoted);
1382
            rem = m_state->line_contents.rem;
1383
            csubstr trimmed = rem.triml(" \t");
1384
            // NOTE(review): the tab-token clause tests `rem`, not `trimmed`;
            // it looks redundant, since begins_with_any(":,}") already
            // accepts a leading ':' -- confirm.
            if(trimmed.len && (trimmed.begins_with(": ") || trimmed.begins_with_any(":,}") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))))
1385
            {
1386
                _RYML_CB_ASSERT(m_stack.m_callbacks, trimmed.str >= rem.str);
1387
                size_t num = static_cast<size_t>(trimmed.str - rem.str);
1388
                _c4dbgpf("trimming {} whitespace after the scalar: '{}' --> '{}'", num, rem, rem.sub(num));
1389
                rem = rem.sub(num);
1390
                _line_progressed(num);
1391
            }
1392
        }
1393

1394
        if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
1395
        {
1396
            _c4dbgp("wait for val");
1397
            addrem_flags(RVAL, RKEY|QMRK);
1398
            _line_progressed(2);
1399
            if(!has_all(SSCL))
1400
            {
1401
                _c4dbgp("no key was found, defaulting to empty key ''");
1402
                _store_scalar_null(rem.str);
1403
            }
1404
            return true;
1405
        }
1406
        else if(rem == ':')
1407
        {
1408
            _c4dbgp("wait for val");
1409
            addrem_flags(RVAL, RKEY|QMRK);
1410
            _line_progressed(1);
1411
            if(!has_all(SSCL))
1412
            {
1413
                _c4dbgp("no key was found, defaulting to empty key ''");
1414
                _store_scalar_null(rem.str);
1415
            }
1416
            return true;
1417
        }
1418
        else if(rem.begins_with('?'))
1419
        {
1420
            _c4dbgp("complex key");
1421
            add_flags(QMRK);
1422
            _line_progressed(1);
1423
            return true;
1424
        }
1425
        else if(rem.begins_with(','))
1426
        {
1427
            _c4dbgp("prev scalar was a key with null value");
1428
            _append_key_val_null(rem.str - 1);
1429
            _line_progressed(1);
1430
            return true;
1431
        }
1432
        else if(rem.begins_with('}'))
1433
        {
1434
            _c4dbgp("map terminates after a key...");
1435
            _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL));
1436
            _c4dbgp("the last val was null");
1437
            _append_key_val_null(rem.str - 1);
1438
            rem_flags(RVAL);
1439
            if(has_all(RSEQIMAP))
1440
            {
1441
                _c4dbgp("stopping implicitly nested 1x map");
1442
                _stop_seqimap();
1443
                _pop_level();
1444
            }
1445
            _pop_level();
1446
            _line_progressed(1);
1447
            return true;
1448
        }
1449
        else if(_handle_types())
1450
        {
1451
            return true;
1452
        }
1453
        else if(_handle_key_anchors_and_refs())
1454
        {
1455
            return true;
1456
        }
1457
        else if(rem == "")
1458
        {
1459
            return true;
1460
        }
1461
        else
1462
        {
1463
            // last resort: accept ':' or '#' after skipping spaces/tabs
            size_t pos = rem.first_not_of(" \t");
1464
            // rem is whitespace only: keep it unchanged, so that neither
            // check below matches and the parse error is raised
            if(pos == csubstr::npos)
1465
               pos = 0;
1466
            rem = rem.sub(pos);
1467
            if(rem.begins_with(':'))
1468
            {
1469
                _c4dbgp("wait for val");
1470
                addrem_flags(RVAL, RKEY|QMRK);
1471
                _line_progressed(pos + 1);
1472
                if(!has_all(SSCL))
1473
                {
1474
                    _c4dbgp("no key was found, defaulting to empty key ''");
1475
                    _store_scalar_null(rem.str);
1476
                }
1477
                return true;
1478
            }
1479
            else if(rem.begins_with('#'))
1480
            {
1481
                _c4dbgp("it's a comment");
1482
                _line_progressed(pos);
1483
                rem = _scan_comment(); // also progresses the line
1484
                return true;
1485
            }
1486
            else
1487
            {
1488
                _c4err("parse error");
1489
            }
1490
        }
1491
    }
1492
    // RVAL: a key is stored (SSCL) and its value is expected
    else if(has_any(RVAL))
1493
    {
1494
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
1495
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
1496
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL));
1497
        bool is_quoted;
1498
        if(_scan_scalar_map_flow(&rem, &is_quoted))
1499
        {
1500
            _c4dbgp("it's a scalar");
1501
            addrem_flags(RNXT, RVAL|RKEY);
1502
            _append_key_val(rem, is_quoted);
1503
            if(has_all(RSEQIMAP))
1504
            {
1505
                _c4dbgp("stopping implicitly nested 1x map");
1506
                _stop_seqimap();
1507
                _pop_level();
1508
            }
1509
            return true;
1510
        }
1511
        else if(rem.begins_with('['))
1512
        {
1513
            _c4dbgp("val is a child seq");
1514
            addrem_flags(RNXT, RVAL|RKEY); // before _push_level!
1515
            _push_level(/*explicit flow*/true);
1516
            _move_scalar_from_top();
1517
            _start_seq();
1518
            add_flags(FLOW);
1519
            _line_progressed(1);
1520
            return true;
1521
        }
1522
        else if(rem.begins_with('{'))
1523
        {
1524
            _c4dbgp("val is a child map");
1525
            addrem_flags(RNXT, RVAL|RKEY); // before _push_level!
1526
            _push_level(/*explicit flow*/true);
1527
            _move_scalar_from_top();
1528
            _start_map();
1529
            addrem_flags(FLOW|RKEY, RNXT|RVAL);
1530
            _line_progressed(1);
1531
            return true;
1532
        }
1533
        else if(_handle_types())
1534
        {
1535
            return true;
1536
        }
1537
        else if(_handle_val_anchors_and_refs())
1538
        {
1539
            return true;
1540
        }
1541
        else if(rem.begins_with(','))
1542
        {
1543
            _c4dbgp("appending empty val");
1544
            _append_key_val_null(rem.str - 1);
1545
            addrem_flags(RKEY, RVAL);
1546
            _line_progressed(1);
1547
            if(has_any(RSEQIMAP))
1548
            {
1549
                _c4dbgp("stopping implicitly nested 1x map");
1550
                _stop_seqimap();
1551
                _pop_level();
1552
            }
1553
            return true;
1554
        }
1555
        else if(has_any(RSEQIMAP) && rem.begins_with(']'))
1556
        {
1557
            _c4dbgp("stopping implicitly nested 1x map");
1558
            if(has_any(SSCL))
1559
            {
1560
                _append_key_val_null(rem.str - 1);
1561
            }
1562
            // the ']' is not consumed by this branch (no _line_progressed call)
            _stop_seqimap();
1563
            _pop_level();
1564
            return true;
1565
        }
1566
        else
1567
        {
1568
            _c4err("parse error");
1569
        }
1570
    }
1571
    else
1572
    {
1573
        _c4err("internal error");
1574
    }
1575

1576
    // reached only when one of the _c4err() calls above returns
    return false;
1577
}
1578

1579
//-----------------------------------------------------------------------------
1580
/** Process the next token of the current line while inside a block
 * (indentation-based) map. The state must have RMAP set and FLOW clear.
 *
 * Comments are handled first. A pending RNXT is converted to RKEY, then
 * indentation changes are handled. What follows depends on whether a key
 * (RKEY) or a value (RVAL) is expected; any other state is an internal
 * error.
 *
 * @return true when a token was handled; false only when _c4err() returns
 *         instead of aborting.
 */
bool Parser::_handle_map_blck()
1581
{
1582
    _c4dbgpf("handle_map_blck: node_id={}  level={}", m_state->node_id, m_state->level);
1583
    csubstr rem = m_state->line_contents.rem;
1584

1585
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP));
1586
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW));
1587

1588
    if(rem.begins_with('#'))
1589
    {
1590
        _c4dbgp("it's a comment");
1591
        rem = _scan_comment();
1592
        return true;
1593
    }
1594

1595
    if(has_any(RNXT))
1596
    {
1597
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
1598
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
1599
        // actually, we don't need RNXT in indent-based maps.
1600
        addrem_flags(RKEY, RNXT);
1601
    }
1602

1603
    if(_handle_indentation())
1604
    {
1605
        _c4dbgp("indentation token");
1606
        return true;
1607
    }
1608

1609
    // RKEY: a key is expected
    if(has_any(RKEY))
1610
    {
1611
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
1612
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
1613

1614
        _c4dbgp("RMAP|RKEY read scalar?");
1615
        bool is_quoted;
1616
        if(_scan_scalar_map_blck(&rem, &is_quoted)) // this also progresses the line
1617
        {
1618
            _c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : "");
1619
            if(has_all(QMRK|SSCL))
1620
            {
1621
                _c4dbgpf("current key is QMRK; SSCL is set. so take store scalar='{}' as key and add an empty val", m_state->scalar);
1622
                _append_key_val_null(rem.str - 1);
1623
            }
1624
            _store_scalar(rem, is_quoted);
1625
            if(has_all(QMRK|RSET))
1626
            {
1627
                _c4dbgp("it's a complex key, so use null value '~'");
1628
                _append_key_val_null(rem.str);
1629
            }
1630
            // from here on rem is the unparsed remainder, not the scalar
            rem = m_state->line_contents.rem;
1631

1632
            if(rem.begins_with(':'))
1633
            {
1634
                _c4dbgp("wait for val");
1635
                addrem_flags(RVAL, RKEY|QMRK);
1636
                _line_progressed(1);
1637
                rem = m_state->line_contents.rem;
1638
                if(rem.begins_with_any(" \t"))
1639
                {
1640
                    _RYML_CB_ASSERT(m_stack.m_callbacks,  ! _at_line_begin());
1641
                    rem = rem.left_of(rem.first_not_of(" \t"));
1642
                    _c4dbgpf("skip {} spaces/tabs", rem.len);
1643
                    _line_progressed(rem.len);
1644
                }
1645
            }
1646
            return true;
1647
        }
1648
        else if(rem.begins_with_any(" \t"))
1649
        {
1650
            size_t pos = rem.first_not_of(" \t");
1651
            if(pos == npos)
1652
                pos = rem.len;
1653
            _c4dbgpf("skip {} spaces/tabs", pos);
1654
            _line_progressed(pos);
1655
            return true;
1656
        }
1657
        else if(rem == '?' || rem.begins_with("? "))
1658
        {
1659
            _c4dbgp("it's a complex key");
1660
            _line_progressed(rem.begins_with("? ") ? 2u : 1u);
1661
            // a scalar stored from before gets a null value first
            if(has_any(SSCL))
1662
                _append_key_val_null(rem.str - 1);
1663
            add_flags(QMRK);
1664
            return true;
1665
        }
1666
        else if(has_all(QMRK) && rem.begins_with(':'))
1667
        {
1668
            _c4dbgp("complex key finished");
1669
            if(!has_any(SSCL))
1670
                _store_scalar_null(rem.str);
1671
            addrem_flags(RVAL, RKEY|QMRK);
1672
            _line_progressed(1);
1673
            rem = m_state->line_contents.rem;
1674
            if(rem.begins_with(' '))
1675
            {
1676
                _RYML_CB_ASSERT(m_stack.m_callbacks,  ! _at_line_begin());
1677
                _skipchars(' ');
1678
            }
1679
            return true;
1680
        }
1681
        else if(rem == ':' || rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
1682
        {
1683
            _c4dbgp("key finished");
1684
            if(!has_all(SSCL))
1685
            {
1686
                _c4dbgp("key was empty...");
1687
                _store_scalar_null(rem.str);
1688
                rem_flags(QMRK);
1689
            }
1690
            addrem_flags(RVAL, RKEY);
1691
            _line_progressed(rem == ':' ? 1 : 2);
1692
            return true;
1693
        }
1694
        else if(rem.begins_with("..."))
1695
        {
1696
            _c4dbgp("end current document");
1697
            _end_stream();
1698
            _line_progressed(3);
1699
            return true;
1700
        }
1701
        else if(rem.begins_with("---"))
1702
        {
1703
            _c4dbgp("start new document '---'");
1704
            _start_new_doc(rem);
1705
            return true;
1706
        }
1707
        else if(_handle_types())
1708
        {
1709
            return true;
1710
        }
1711
        else if(_handle_key_anchors_and_refs())
1712
        {
1713
            return true;
1714
        }
1715
        else
1716
        {
1717
            _c4err("parse error");
1718
        }
1719
    }
1720
    // RVAL: the value of the current key is expected
    else if(has_any(RVAL))
1721
    {
1722
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
1723
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
1724

1725
        _c4dbgp("RMAP|RVAL read scalar?");
1726
        csubstr s;
1727
        bool is_quoted;
1728
        if(_scan_scalar_map_blck(&s, &is_quoted)) // this also progresses the line
1729
        {
1730
            _c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : "");
1731

1732
            rem = m_state->line_contents.rem;
1733

1734
            if(rem.begins_with(": "))
1735
            {
1736
                _c4dbgp("actually, the scalar is the first key of a map");
1737
                addrem_flags(RKEY, RVAL); // before _push_level! This prepares the current level for popping by setting it to RKEY
1738
                _push_level();
1739
                _move_scalar_from_top();
1740
                _move_val_anchor_to_key_anchor();
1741
                _start_map();
1742
                _save_indentation(m_state->scalar_col);
1743
                addrem_flags(RVAL, RKEY);
1744
                _line_progressed(2);
1745
            }
1746
            else if(rem.begins_with(':'))
1747
            {
1748
                _c4dbgp("actually, the scalar is the first key of a map, and it opens a new scope");
1749
                addrem_flags(RKEY, RVAL); // before _push_level! This prepares the current level for popping by setting it to RKEY
1750
                _push_level();
1751
                _move_scalar_from_top();
1752
                _move_val_anchor_to_key_anchor();
1753
                _start_map();
1754
                _save_indentation(/*behind*/s.len);
1755
                addrem_flags(RVAL, RKEY);
1756
                _line_progressed(1);
1757
            }
1758
            else
1759
            {
1760
                _c4dbgp("appending keyval to current map");
1761
                _append_key_val(s, is_quoted);
1762
                addrem_flags(RKEY, RVAL);
1763
            }
1764
            return true;
1765
        }
1766
        else if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t")))
1767
        {
1768
            _c4dbgp("val is a nested seq, indented");
1769
            addrem_flags(RKEY, RVAL); // before _push_level!
1770
            _push_level();
1771
            _move_scalar_from_top();
1772
            _start_seq();
1773
            _save_indentation();
1774
            _line_progressed(2);
1775
            return true;
1776
        }
1777
        else if(rem == '-')
1778
        {
1779
            _c4dbgp("maybe a seq. start unknown, indented");
1780
            _start_unk();
1781
            _save_indentation();
1782
            _line_progressed(1);
1783
            return true;
1784
        }
1785
        else if(rem.begins_with('['))
1786
        {
1787
            _c4dbgp("val is a child seq, flow");
1788
            addrem_flags(RKEY, RVAL); // before _push_level!
1789
            _push_level(/*explicit flow*/true);
1790
            _move_scalar_from_top();
1791
            _start_seq();
1792
            add_flags(FLOW);
1793
            _line_progressed(1);
1794
            return true;
1795
        }
1796
        else if(rem.begins_with('{'))
1797
        {
1798
            _c4dbgp("val is a child map, flow");
1799
            addrem_flags(RKEY, RVAL); // before _push_level!
1800
            _push_level(/*explicit flow*/true);
1801
            _move_scalar_from_top();
1802
            _start_map();
1803
            addrem_flags(FLOW|RKEY, RVAL);
1804
            _line_progressed(1);
1805
            return true;
1806
        }
1807
        else if(rem.begins_with(' '))
1808
        {
1809
            csubstr spc = rem.left_of(rem.first_not_of(' '));
1810
            // both branches advance by the same amount; they differ only in
            // the debug message
            if(_at_line_begin())
1811
            {
1812
                _c4dbgpf("skipping value indentation: {} spaces", spc.len);
1813
                _line_progressed(spc.len);
1814
                return true;
1815
            }
1816
            else
1817
            {
1818
                _c4dbgpf("skipping {} spaces", spc.len);
1819
                _line_progressed(spc.len);
1820
                return true;
1821
            }
1822
        }
1823
        else if(_handle_types())
1824
        {
1825
            return true;
1826
        }
1827
        else if(_handle_val_anchors_and_refs())
1828
        {
1829
            return true;
1830
        }
1831
        else if(rem.begins_with("--- ") || rem == "---" || rem.begins_with("---\t"))
1832
        {
1833
            _start_new_doc(rem);
1834
            return true;
1835
        }
1836
        else if(rem.begins_with("..."))
1837
        {
1838
            _c4dbgp("end current document");
1839
            _end_stream();
1840
            _line_progressed(3);
1841
            return true;
1842
        }
1843
        else
1844
        {
1845
            _c4err("parse error");
1846
        }
1847
    }
1848
    else
1849
    {
1850
        _c4err("internal error");
1851
    }
1852

1853
    // reached only when one of the _c4err() calls above returns
    return false;
1854
}
1855

1856

1857
//-----------------------------------------------------------------------------
1858
bool Parser::_handle_top()
1859
{
1860
    _c4dbgp("handle_top");
1861
    csubstr rem = m_state->line_contents.rem;
1862

1863
    if(rem.begins_with('#'))
1864
    {
1865
        _c4dbgp("a comment line");
1866
        _scan_comment();
1867
        return true;
1868
    }
1869

1870
    csubstr trimmed = rem.triml(' ');
1871

1872
    if(trimmed.begins_with('%'))
1873
    {
1874
        _handle_directive(trimmed);
1875
        _line_progressed(rem.len);
1876
        return true;
1877
    }
1878
    else if(trimmed.begins_with("--- ") || trimmed == "---" || trimmed.begins_with("---\t"))
1879
    {
1880
        _start_new_doc(rem);
1881
        if(trimmed.len < rem.len)
1882
        {
1883
            _line_progressed(rem.len - trimmed.len);
1884
            _save_indentation();
1885
        }
1886
        return true;
1887
    }
1888
    else if(trimmed.begins_with("..."))
1889
    {
1890
        _c4dbgp("end current document");
1891
        _end_stream();
1892
        if(trimmed.len < rem.len)
1893
        {
1894
            _line_progressed(rem.len - trimmed.len);
1895
        }
1896
        _line_progressed(3);
1897
        return true;
1898
    }
1899
    else
1900
    {
1901
        _c4err("parse error");
1902
    }
1903

1904
    return false;
1905
}
1906

1907

1908
//-----------------------------------------------------------------------------
1909

1910
bool Parser::_handle_key_anchors_and_refs()
1911
{
1912
    _RYML_CB_ASSERT(m_stack.m_callbacks, !has_any(RVAL));
1913
    const csubstr rem = m_state->line_contents.rem;
1914
    if(rem.begins_with('&'))
1915
    {
1916
        _c4dbgp("found a key anchor!!!");
1917
        if(has_all(QMRK|SSCL))
1918
        {
1919
            _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY));
1920
            _c4dbgp("there is a stored key, so this anchor is for the next element");
1921
            _append_key_val_null(rem.str - 1);
1922
            rem_flags(QMRK);
1923
            return true;
1924
        }
1925
        csubstr anchor = rem.left_of(rem.first_of(' '));
1926
        _line_progressed(anchor.len);
1927
        anchor = anchor.sub(1); // skip the first character
1928
        _move_key_anchor_to_val_anchor();
1929
        _c4dbgpf("key anchor value: '{}'", anchor);
1930
        m_key_anchor = anchor;
1931
        m_key_anchor_indentation = m_state->line_contents.current_col(rem);
1932
        return true;
1933
    }
1934
    else if(C4_UNLIKELY(rem.begins_with('*')))
1935
    {
1936
        _c4err("not implemented - this should have been catched elsewhere");
1937
        C4_NEVER_REACH();
1938
        return false;
1939
    }
1940
    return false;
1941
}
1942

1943
bool Parser::_handle_val_anchors_and_refs()
1944
{
1945
    _RYML_CB_ASSERT(m_stack.m_callbacks, !has_any(RKEY));
1946
    const csubstr rem = m_state->line_contents.rem;
1947
    if(rem.begins_with('&'))
1948
    {
1949
        csubstr anchor = rem.left_of(rem.first_of(' '));
1950
        _line_progressed(anchor.len);
1951
        anchor = anchor.sub(1); // skip the first character
1952
        _c4dbgpf("val: found an anchor: '{}', indentation={}!!!", anchor, m_state->line_contents.current_col(rem));
1953
        if(m_val_anchor.empty())
1954
        {
1955
            _c4dbgpf("save val anchor: '{}'", anchor);
1956
            m_val_anchor = anchor;
1957
            m_val_anchor_indentation = m_state->line_contents.current_col(rem);
1958
        }
1959
        else
1960
        {
1961
            _c4dbgpf("there is a pending val anchor '{}'", m_val_anchor);
1962
            if(m_tree->is_seq(m_state->node_id))
1963
            {
1964
                if(m_tree->has_children(m_state->node_id))
1965
                {
1966
                    _c4dbgpf("current node={} is a seq, has {} children", m_state->node_id, m_tree->num_children(m_state->node_id));
1967
                    _c4dbgpf("... so take the new one as a key anchor '{}'", anchor);
1968
                    m_key_anchor = anchor;
1969
                    m_key_anchor_indentation = m_state->line_contents.current_col(rem);
1970
                }
1971
                else
1972
                {
1973
                    _c4dbgpf("current node={} is a seq, has no children", m_state->node_id);
1974
                    if(m_tree->has_val_anchor(m_state->node_id))
1975
                    {
1976
                        _c4dbgpf("... node={} already has val anchor: '{}'", m_state->node_id, m_tree->val_anchor(m_state->node_id));
1977
                        _c4dbgpf("... so take the new one as a key anchor '{}'", anchor);
1978
                        m_key_anchor = anchor;
1979
                        m_key_anchor_indentation = m_state->line_contents.current_col(rem);
1980
                    }
1981
                    else
1982
                    {
1983
                        _c4dbgpf("... so set pending val anchor: '{}' on current node {}", m_val_anchor, m_state->node_id);
1984
                        m_tree->set_val_anchor(m_state->node_id, m_val_anchor);
1985
                        m_val_anchor = anchor;
1986
                        m_val_anchor_indentation = m_state->line_contents.current_col(rem);
1987
                    }
1988
                }
1989
            }
1990
        }
1991
        return true;
1992
    }
1993
    else if(C4_UNLIKELY(rem.begins_with('*')))
1994
    {
1995
        _c4err("not implemented - this should have been catched elsewhere");
1996
        C4_NEVER_REACH();
1997
        return false;
1998
    }
1999
    return false;
2000
}
2001

2002
void Parser::_move_key_anchor_to_val_anchor()
2003
{
2004
    if(m_key_anchor.empty())
2005
        return;
2006
    _c4dbgpf("move current key anchor to val slot: key='{}' -> val='{}'", m_key_anchor, m_val_anchor);
2007
    if(!m_val_anchor.empty())
2008
        _c4err("triple-pending anchor");
2009
    m_val_anchor = m_key_anchor;
2010
    m_val_anchor_indentation = m_key_anchor_indentation;
2011
    m_key_anchor = {};
2012
    m_key_anchor_indentation = {};
2013
}
2014

2015
void Parser::_move_val_anchor_to_key_anchor()
2016
{
2017
    if(m_val_anchor.empty())
2018
        return;
2019
    if(!_token_is_from_this_line(m_val_anchor))
2020
        return;
2021
    _c4dbgpf("move current val anchor to key slot: key='{}' <- val='{}'", m_key_anchor, m_val_anchor);
2022
    if(!m_key_anchor.empty())
2023
        _c4err("triple-pending anchor");
2024
    m_key_anchor = m_val_anchor;
2025
    m_key_anchor_indentation = m_val_anchor_indentation;
2026
    m_val_anchor = {};
2027
    m_val_anchor_indentation = {};
2028
}
2029

2030
void Parser::_move_key_tag_to_val_tag()
2031
{
2032
    if(m_key_tag.empty())
2033
        return;
2034
    _c4dbgpf("move key tag to val tag: key='{}' -> val='{}'", m_key_tag, m_val_tag);
2035
    m_val_tag = m_key_tag;
2036
    m_val_tag_indentation = m_key_tag_indentation;
2037
    m_key_tag.clear();
2038
    m_key_tag_indentation = 0;
2039
}
2040

2041
void Parser::_move_val_tag_to_key_tag()
2042
{
2043
    if(m_val_tag.empty())
2044
        return;
2045
    if(!_token_is_from_this_line(m_val_tag))
2046
        return;
2047
    _c4dbgpf("move val tag to key tag: key='{}' <- val='{}'", m_key_tag, m_val_tag);
2048
    m_key_tag = m_val_tag;
2049
    m_key_tag_indentation = m_val_tag_indentation;
2050
    m_val_tag.clear();
2051
    m_val_tag_indentation = 0;
2052
}
2053

2054
void Parser::_move_key_tag2_to_key_tag()
2055
{
2056
    if(m_key_tag2.empty())
2057
        return;
2058
    _c4dbgpf("move key tag2 to key tag: key='{}' <- key2='{}'", m_key_tag, m_key_tag2);
2059
    m_key_tag = m_key_tag2;
2060
    m_key_tag_indentation = m_key_tag2_indentation;
2061
    m_key_tag2.clear();
2062
    m_key_tag2_indentation = 0;
2063
}
2064

2065

2066
//-----------------------------------------------------------------------------
2067

2068
bool Parser::_handle_types()
2069
{
2070
    csubstr rem = m_state->line_contents.rem.triml(' ');
2071
    csubstr t;
2072

2073
    if(rem.begins_with("!!"))
2074
    {
2075
        _c4dbgp("begins with '!!'");
2076
        t = rem.left_of(rem.first_of(" ,"));
2077
        _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 2);
2078
        //t = t.sub(2);
2079
        if(t == "!!set")
2080
            add_flags(RSET);
2081
    }
2082
    else if(rem.begins_with("!<"))
2083
    {
2084
        _c4dbgp("begins with '!<'");
2085
        t = rem.left_of(rem.first_of('>'), true);
2086
        _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 2);
2087
        //t = t.sub(2, t.len-1);
2088
    }
2089
    else if(rem.begins_with("!h!"))
2090
    {
2091
        _c4dbgp("begins with '!h!'");
2092
        t = rem.left_of(rem.first_of(' '));
2093
        _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 3);
2094
        //t = t.sub(3);
2095
    }
2096
    else if(rem.begins_with('!'))
2097
    {
2098
        _c4dbgp("begins with '!'");
2099
        t = rem.left_of(rem.first_of(' '));
2100
        _RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 1);
2101
        //t = t.sub(1);
2102
    }
2103

2104
    if(t.empty())
2105
        return false;
2106

2107
    if(has_all(QMRK|SSCL))
2108
    {
2109
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY));
2110
        _c4dbgp("there is a stored key, so this tag is for the next element");
2111
        _append_key_val_null(rem.str - 1);
2112
        rem_flags(QMRK);
2113
    }
2114

2115
    #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
2116
    const char *tag_beginning = rem.str;
2117
    #endif
2118
    size_t tag_indentation = m_state->line_contents.current_col(t);
2119
    _c4dbgpf("there was a tag: '{}', indentation={}", t, tag_indentation);
2120
    _RYML_CB_ASSERT(m_stack.m_callbacks, t.end() > m_state->line_contents.rem.begin());
2121
    _line_progressed(static_cast<size_t>(t.end() - m_state->line_contents.rem.begin()));
2122
    {
2123
        size_t pos = m_state->line_contents.rem.first_not_of(" \t");
2124
        if(pos != csubstr::npos)
2125
            _line_progressed(pos);
2126
    }
2127

2128
    if(has_all(RMAP|RKEY))
2129
    {
2130
        _c4dbgpf("saving map key tag '{}'", t);
2131
        _RYML_CB_ASSERT(m_stack.m_callbacks, m_key_tag.empty());
2132
        m_key_tag = t;
2133
        m_key_tag_indentation = tag_indentation;
2134
    }
2135
    else if(has_all(RMAP|RVAL))
2136
    {
2137
        /* foo: !!str
2138
         * !!str : bar  */
2139
        rem = m_state->line_contents.rem;
2140
        rem = rem.left_of(rem.find("#"));
2141
        rem = rem.trimr(" \t");
2142
        _c4dbgpf("rem='{}'", rem);
2143
        #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
2144
        if(rem == ':' || rem.begins_with(": "))
2145
        {
2146
            _c4dbgp("the last val was null, and this is a tag from a null key");
2147
            _append_key_val_null(tag_beginning - 1);
2148
            _store_scalar_null(rem.str - 1);
2149
            // do not change the flag to key, it is ~
2150
            _RYML_CB_ASSERT(m_stack.m_callbacks, rem.begin() > m_state->line_contents.rem.begin());
2151
            size_t token_len = rem == ':' ? 1 : 2;
2152
            _line_progressed(static_cast<size_t>(token_len + rem.begin() - m_state->line_contents.rem.begin()));
2153
        }
2154
        #endif
2155
        _c4dbgpf("saving map val tag '{}'", t);
2156
        _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_tag.empty());
2157
        m_val_tag = t;
2158
        m_val_tag_indentation = tag_indentation;
2159
    }
2160
    else if(has_all(RSEQ|RVAL) || has_all(RTOP|RUNK|NDOC))
2161
    {
2162
        if(m_val_tag.empty())
2163
        {
2164
            _c4dbgpf("saving seq/doc val tag '{}'", t);
2165
            m_val_tag = t;
2166
            m_val_tag_indentation = tag_indentation;
2167
        }
2168
        else
2169
        {
2170
            _c4dbgpf("saving seq/doc key tag '{}'", t);
2171
            m_key_tag = t;
2172
            m_key_tag_indentation = tag_indentation;
2173
        }
2174
    }
2175
    else if(has_all(RTOP|RUNK) || has_any(RUNK))
2176
    {
2177
        rem = m_state->line_contents.rem;
2178
        rem = rem.left_of(rem.find("#"));
2179
        rem = rem.trimr(" \t");
2180
        if(rem.empty())
2181
        {
2182
            _c4dbgpf("saving val tag '{}'", t);
2183
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_tag.empty());
2184
            m_val_tag = t;
2185
            m_val_tag_indentation = tag_indentation;
2186
        }
2187
        else
2188
        {
2189
            _c4dbgpf("saving key tag '{}'", t);
2190
            if(m_key_tag.empty())
2191
            {
2192
                m_key_tag = t;
2193
                m_key_tag_indentation = tag_indentation;
2194
            }
2195
            else
2196
            {
2197
                /* handle this case:
2198
                 * !!str foo: !!map
2199
                 *   !!int 1: !!float 20.0
2200
                 *   !!int 3: !!float 40.0
2201
                 *
2202
                 * (m_key_tag would be !!str and m_key_tag2 would be !!int)
2203
                 */
2204
                m_key_tag2 = t;
2205
                m_key_tag2_indentation = tag_indentation;
2206
            }
2207
        }
2208
    }
2209
    else
2210
    {
2211
        _c4err("internal error");
2212
    }
2213

2214
    if(m_val_tag.not_empty())
2215
    {
2216
        YamlTag_e tag = to_tag(t);
2217
        if(tag == TAG_STR)
2218
        {
2219
            _c4dbgpf("tag '{}' is a str-type tag", t);
2220
            if(has_all(RTOP|RUNK|NDOC))
2221
            {
2222
                _c4dbgpf("docval. slurping the string. pos={}", m_state->pos.offset);
2223
                csubstr scalar = _slurp_doc_scalar();
2224
                _c4dbgpf("docval. after slurp: {}, at node {}: '{}'", m_state->pos.offset, m_state->node_id, scalar);
2225
                m_tree->to_val(m_state->node_id, scalar, DOC);
2226
                _c4dbgpf("docval. val tag {} -> {}", m_val_tag, normalize_tag(m_val_tag));
2227
                m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag));
2228
                m_val_tag.clear();
2229
                if(!m_val_anchor.empty())
2230
                {
2231
                    _c4dbgpf("setting val anchor[{}]='{}'", m_state->node_id, m_val_anchor);
2232
                    m_tree->set_val_anchor(m_state->node_id, m_val_anchor);
2233
                    m_val_anchor.clear();
2234
                }
2235
                _end_stream();
2236
            }
2237
        }
2238
    }
2239
    return true;
2240
}
2241

2242
//-----------------------------------------------------------------------------
2243
/** Read the scalar that is the value of a document.
 *
 * If the current line has nothing left, the next line is scanned first.
 * Leading spaces/tabs are skipped, then an optional val anchor, then
 * spaces/tabs again. Single-quoted, double-quoted and block (`|`, `>`)
 * scalars are delegated to their dedicated scanners. Otherwise the rest
 * of the line is taken, and passed to _extend_scanned_scalar() when the
 * line end was reached.
 *
 * @return the scanned scalar. */
csubstr Parser::_slurp_doc_scalar()
{
    csubstr cur = m_state->line_contents.rem;
    size_t start = m_state->pos.offset;
    // re-read the remainder and the buffer offset after the cursor moved
    auto resync = [&]{
        cur = m_state->line_contents.rem;
        start = m_state->pos.offset;
    };

    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.full.find("---") != csubstr::npos);
    _c4dbgpf("slurp 0 '{}'. REM='{}'", cur, m_buf.sub(m_state->pos.offset));
    if(cur.len == 0)
    {
        // nothing left on this line: continue on the next one
        _line_ended();
        _scan_line();
        resync();
    }

    size_t num_ws = cur.first_not_of(" \t");
    _c4dbgpf("slurp 1 '{}'. REM='{}'", cur, m_buf.sub(m_state->pos.offset));
    if(num_ws != npos)
    {
        _line_progressed(num_ws);
        resync();
        _c4dbgpf("slurp 2 '{}'. REM='{}'", cur, m_buf.sub(m_state->pos.offset));
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_anchor.empty());
    _handle_val_anchors_and_refs();
    if( ! m_val_anchor.empty())
    {
        // an anchor was consumed: skip the blanks that follow it
        num_ws = m_state->line_contents.rem.first_not_of(" \t");
        if(num_ws != npos)
            _line_progressed(num_ws);
        resync();
        _c4dbgpf("slurp 3 '{}'. REM='{}'", cur, m_buf.sub(m_state->pos.offset));
    }

    if(cur.begins_with('\''))
    {
        m_state->scalar_col = m_state->line_contents.current_col(cur);
        return _scan_squot_scalar();
    }
    if(cur.begins_with('"'))
    {
        m_state->scalar_col = m_state->line_contents.current_col(cur);
        return _scan_dquot_scalar();
    }
    if(cur.begins_with('|') || cur.begins_with('>'))
        return _scan_block();

    _c4dbgpf("slurp 4 '{}'. REM='{}'", cur, m_buf.sub(m_state->pos.offset));

    // plain scalar: consume up to the end of the current remainder
    m_state->scalar_col = m_state->line_contents.current_col(cur);
    _RYML_CB_ASSERT(m_stack.m_callbacks, cur.end() >= m_buf.begin() + start);
    _line_progressed(static_cast<size_t>(cur.end() - (m_buf.begin() + start)));

    _c4dbgpf("slurp 5 '{}'. REM='{}'", cur, m_buf.sub(m_state->pos.offset));

    if(_at_line_end())
    {
        _c4dbgpf("at line end. curr='{}'", cur);
        cur = _extend_scanned_scalar(cur);
    }

    _c4dbgpf("scalar was '{}'", cur);

    return cur;
}
2315

2316

2317
//-----------------------------------------------------------------------------
2318

2319
/** Scan a scalar while reading a value of a block sequence
 * (RSEQ|RVAL, neither RKEY nor FLOW).
 *
 * Quoted scalars and block scalars (`|`, `>`) are delegated to their
 * dedicated scanners. A plain scalar stops before a trailing `:`, or at
 * the first `: ` / ` #` token, and has trailing blanks removed.
 *
 * @param scalar [out] receives the scanned scalar; written only when the
 *        function returns true
 * @param quoted [out] set to true for quoted and block scalars, false
 *        for plain scalars; written only when the function returns true
 * @return true if a scalar was scanned, false if there is none here. */
bool Parser::_scan_scalar_seq_blck(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RSEQ));
    _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RVAL));
    _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_any(RKEY));
    _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_any(FLOW));

    csubstr txt = m_state->line_contents.rem;
    if(txt.len == 0)
        return false;
    txt = txt.trim(" \t");
    if(txt.len == 0)
        return false;

    // scalars with their own scanners
    if(txt.begins_with('\''))
    {
        _c4dbgp("got a ': scanning single-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(txt);
        *scalar = _scan_squot_scalar();
        *quoted = true;
        return true;
    }
    if(txt.begins_with('"'))
    {
        _c4dbgp("got a \": scanning double-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(txt);
        *scalar = _scan_dquot_scalar();
        *quoted = true;
        return true;
    }
    if(txt.begins_with('|') || txt.begins_with('>'))
    {
        *scalar = _scan_block();
        *quoted = true;
        return true;
    }
    if(has_any(RTOP) && _is_doc_sep(txt))
        return false;

    _c4dbgp("RSEQ|RVAL");
    if( ! _is_scalar_next__rseq_rval(txt))
        return false;
    _RYML_WITH_TAB_TOKENS(else if(txt.begins_with("-\t"))
        return false;
    )

    // cut the scalar before a trailing ':' or before the first separator token
    if(txt.ends_with(':'))
    {
        --txt.len;
    }
    else
    {
        auto sep = txt.first_of_any(": " _RYML_WITH_TAB_TOKENS( , ":\t"), " #");
        if(sep)
            txt.len = sep.pos;
    }
    txt = txt.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));

    if(txt.empty())
        return false;

    // consume everything up to the end of the scalar
    m_state->scalar_col = m_state->line_contents.current_col(txt);
    _RYML_CB_ASSERT(m_stack.m_callbacks, txt.str >= m_state->line_contents.rem.str);
    _line_progressed(static_cast<size_t>(txt.str - m_state->line_contents.rem.str) + txt.len);

    if(_at_line_end() && txt != '~')
    {
        _c4dbgpf("at line end. curr='{}'", txt);
        txt = _extend_scanned_scalar(txt);
    }

    _c4dbgpf("scalar was '{}'", txt);

    *quoted = false;
    *scalar = txt;
    return true;
}
2398

2399
/** Scan a scalar while reading a key or a value of a block map
 * (RMAP with RKEY or RVAL, not FLOW).
 *
 * Quoted scalars and block scalars (`|`, `>`) are delegated to their
 * dedicated scanners. For a key, the scalar ends at the colon token
 * (`: `, or a `:` that is the last character); for a value, it ends at
 * a ` #` / tab-`#` comment. Scalars beginning with `---` (and, for keys,
 * `...`) are rejected.
 *
 * @param scalar [out] receives the scanned scalar; written only when the
 *        function returns true
 * @param quoted [out] set to true for quoted and block scalars, false
 *        for plain scalars; written only when the function returns true
 * @return true if a scalar was scanned, false if there is none here. */
bool Parser::_scan_scalar_map_blck(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
{
    _c4dbgp("_scan_scalar_map_blck");
    _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RMAP));
    _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_any(FLOW));
    _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RKEY|RVAL));

    csubstr txt = m_state->line_contents.rem;
    #ifdef RYML_NO_COVERAGE__TO_BE_DELETED__OR_REFACTORED
    if(txt.len == 0)
        return false;
    #endif
    txt = txt.trim(" \t");
    if(txt.len == 0)
        return false;

    // scalars with their own scanners
    if(txt.begins_with('\''))
    {
        _c4dbgp("got a ': scanning single-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(txt);
        *scalar = _scan_squot_scalar();
        *quoted = true;
        return true;
    }
    if(txt.begins_with('"'))
    {
        _c4dbgp("got a \": scanning double-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(txt);
        *scalar = _scan_dquot_scalar();
        *quoted = true;
        return true;
    }
    if(txt.begins_with('|') || txt.begins_with('>'))
    {
        *scalar = _scan_block();
        *quoted = true;
        return true;
    }
    if(has_any(RTOP) && _is_doc_sep(txt))
        return false;

    if( ! _is_scalar_next__rmap(txt))
        return false;

    // locate the colon token; a lone ':' only counts as the last character
    size_t colon = txt.find(": ");
    if(colon == npos)
    {
        _RYML_WITH_OR_WITHOUT_TAB_TOKENS(
            // with tab tokens
            colon = txt.find(":\t");
            if(colon == npos)
            {
                _RYML_CB_ASSERT(m_stack.m_callbacks, txt.len > 0);
                colon = txt.find(':');
                if(colon != txt.len-1)
                    colon = npos;
            }
            ,
            // without tab tokens
            colon = txt.find(':');
            _RYML_CB_ASSERT(m_stack.m_callbacks, txt.len > 0);
            if(colon != txt.len-1)
                colon = npos;
        )
    }

    if(has_all(RKEY))
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, !txt.begins_with(' '));
        if(has_any(QMRK))
        {
            _c4dbgp("RMAP|RKEY|CPLX");
            _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP));
            if(txt.begins_with("? ") || txt == '?')
                return false;
            // key text: before the colon token, before any '#', no trailing blanks
            txt = txt.left_of(colon);
            txt = txt.left_of(txt.first_of("#"));
            txt = txt.trimr(" \t");
            if(txt.begins_with("---") || txt.begins_with("..."))
                return false;
        }
        else
        {
            _c4dbgp("RMAP|RKEY");
            _RYML_CB_CHECK(m_stack.m_callbacks, !txt.begins_with('{'));
            if(txt.begins_with("? ") || txt == '?')
                return false;
            // key text: before the colon token, no trailing blanks
            txt = txt.left_of(colon);
            txt = txt.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
            if(txt.begins_with("---") || txt.begins_with("..."))
                return false;
        }
    }
    else if(has_all(RVAL))
    {
        _c4dbgp("RMAP|RVAL");
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(QMRK));
        if( ! _is_scalar_next__rmap_val(txt))
            return false;
        _RYML_WITH_TAB_TOKENS(
        else if(txt.begins_with("-\t"))
            return false;
        )
        _c4dbgp("RMAP|RVAL: scalar");
        // strip a trailing comment introduced by " #" or "\t#"
        txt = txt.left_of(txt.find(" #"));
        txt = txt.left_of(txt.find("\t#"));
        txt = txt.trim(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
        if(txt.begins_with("---"))
            return false;
        #ifdef RYML_NO_COVERAGE__TO_BE_DELETED__OR_REFACTORED
        else if(txt.begins_with("..."))
            return false;
        #endif
    }

    if(txt.empty())
        return false;

    // consume everything up to the end of the scalar
    m_state->scalar_col = m_state->line_contents.current_col(txt);
    _RYML_CB_ASSERT(m_stack.m_callbacks, txt.str >= m_state->line_contents.rem.str);
    _line_progressed(static_cast<size_t>(txt.str - m_state->line_contents.rem.str) + txt.len);

    if(_at_line_end() && txt != '~')
    {
        _c4dbgpf("at line end. curr='{}'", txt);
        txt = _extend_scanned_scalar(txt);
    }

    _c4dbgpf("scalar was '{}'", txt);

    *quoted = false;
    *scalar = txt;
    return true;
}
2543

2544
/** Scan a scalar while reading a value of a flow sequence
 * (RSEQ|FLOW|RVAL, not RKEY).
 *
 * Quoted scalars are delegated to their dedicated scanners. A plain
 * scalar ends at the first `,` or `]`, and is further cut before a
 * trailing `:` or at the first `: ` / ` #` token; trailing blanks are
 * removed.
 *
 * @param scalar [out] receives the scanned scalar; written only when the
 *        function returns true
 * @param quoted [out] set to true for quoted scalars, false for plain
 *        scalars; written only when the function returns true
 * @return true if a scalar was scanned, false if there is none here. */
bool Parser::_scan_scalar_seq_flow(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RSEQ));
    _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(FLOW));
    _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RVAL));
    _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_any(RKEY));

    csubstr txt = m_state->line_contents.rem;
    if(txt.len == 0)
        return false;
    txt = txt.trim(" \t");
    if(txt.len == 0)
        return false;

    // scalars with their own scanners
    if(txt.begins_with('\''))
    {
        _c4dbgp("got a ': scanning single-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(txt);
        *scalar = _scan_squot_scalar();
        *quoted = true;
        return true;
    }
    if(txt.begins_with('"'))
    {
        _c4dbgp("got a \": scanning double-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(txt);
        *scalar = _scan_dquot_scalar();
        *quoted = true;
        return true;
    }

    if(has_all(RVAL))
    {
        _c4dbgp("RSEQ|RVAL");
        if( ! _is_scalar_next__rseq_rval(txt))
            return false;
        _RYML_WITH_TAB_TOKENS(else if(txt.begins_with("-\t"))
            return false;
        )
        _c4dbgp("RSEQ|RVAL|FLOW");
        // the element ends at the next ',' or at the closing ']'
        txt = txt.left_of(txt.first_of(",]"));
        // cut the scalar before a trailing ':' or before the first separator token
        if(txt.ends_with(':'))
        {
            --txt.len;
        }
        else
        {
            auto sep = txt.first_of_any(": " _RYML_WITH_TAB_TOKENS( , ":\t"), " #");
            if(sep)
                txt.len = sep.pos;
        }
        txt = txt.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
    }

    if(txt.empty())
        return false;

    // consume everything up to the end of the scalar
    m_state->scalar_col = m_state->line_contents.current_col(txt);
    _RYML_CB_ASSERT(m_stack.m_callbacks, txt.str >= m_state->line_contents.rem.str);
    _line_progressed(static_cast<size_t>(txt.str - m_state->line_contents.rem.str) + txt.len);

    if(_at_line_end() && txt != '~')
    {
        _c4dbgpf("at line end. curr='{}'", txt);
        txt = _extend_scanned_scalar(txt);
    }

    _c4dbgpf("scalar was '{}'", txt);

    *quoted = false;
    *scalar = txt;
    return true;
}
2617

2618
/** Scan a scalar while reading a key or a value of a flow map
 * (RMAP|FLOW with RKEY or RVAL).
 *
 * Quoted scalars are delegated to their dedicated scanners. For a key,
 * the scalar ends at the colon token (`: `, or a `:` that is the last
 * character) and, outside of QMRK, at the first `,` or `}`. For a value,
 * it ends at the first `,` or `}` (`,` or `]` when RSEQIMAP is set) and
 * at a ` #` / tab-`#` comment.
 *
 * @param scalar [out] receives the scanned scalar; written only when the
 *        function returns true
 * @param quoted [out] set to true for quoted scalars, false for plain
 *        scalars; written only when the function returns true
 * @return true if a scalar was scanned, false if there is none here. */
bool Parser::_scan_scalar_map_flow(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RMAP));
    _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(FLOW));
    _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RKEY|RVAL));

    csubstr txt = m_state->line_contents.rem;
    if(txt.len == 0)
        return false;
    txt = txt.trim(" \t");
    if(txt.len == 0)
        return false;

    // scalars with their own scanners
    if(txt.begins_with('\''))
    {
        _c4dbgp("got a ': scanning single-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(txt);
        *scalar = _scan_squot_scalar();
        *quoted = true;
        return true;
    }
    if(txt.begins_with('"'))
    {
        _c4dbgp("got a \": scanning double-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(txt);
        *scalar = _scan_dquot_scalar();
        *quoted = true;
        return true;
    }

    if( ! _is_scalar_next__rmap(txt))
        return false;

    if(has_all(RKEY))
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, !txt.begins_with(' '));
        // locate the colon token; a lone ':' only counts as the last character
        size_t colon = txt.find(": ");
        if(colon == npos)
        {
            _RYML_WITH_OR_WITHOUT_TAB_TOKENS(
                // with tab tokens
                colon = txt.find(":\t");
                if(colon == npos)
                {
                    _RYML_CB_ASSERT(m_stack.m_callbacks, txt.len > 0);
                    colon = txt.find(':');
                    if(colon != txt.len-1)
                        colon = npos;
                }
                ,
                // without tab tokens
                colon = txt.find(':');
                _RYML_CB_ASSERT(m_stack.m_callbacks, txt.len > 0);
                if(colon != txt.len-1)
                    colon = npos;
            )
        }
        if(txt.begins_with("? ") || txt == '?')
            return false;
        if(has_any(QMRK))
        {
            _c4dbgp("RMAP|RKEY|CPLX");
            _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP));
            // key text: before the colon token, before any '#' or ':', no trailing blanks
            txt = txt.left_of(colon);
            txt = txt.left_of(txt.first_of("#"));
            txt = txt.left_of(txt.first_of(':'));
            txt = txt.trimr(" \t");
            if(txt.begins_with("---") || txt.begins_with("..."))
                return false;
        }
        else
        {
            _RYML_CB_CHECK(m_stack.m_callbacks, !txt.begins_with('{'));
            _c4dbgp("RMAP|RKEY");
            txt = txt.left_of(colon);
            txt = txt.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
            _c4dbgpf("RMAP|RKEY|FLOW: '{}'", txt);
            // the key also ends at the next ',' or at the closing '}'
            txt = txt.left_of(txt.first_of(",}"));
            if(txt.ends_with(':'))
                --txt.len;
        }
    }
    else if(has_all(RVAL))
    {
        _c4dbgp("RMAP|RVAL");
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(QMRK));
        if( ! _is_scalar_next__rmap_val(txt))
            return false;
        _RYML_WITH_TAB_TOKENS(else if(txt.begins_with("-\t"))
            return false;
        )
        _c4dbgp("RMAP|RVAL|FLOW");
        // the closing bracket differs when RSEQIMAP is set
        const csubstr terminators = has_none(RSEQIMAP) ? csubstr(",}") : csubstr(",]");
        txt = txt.left_of(txt.first_of(terminators));
        // strip a trailing comment introduced by " #" or "\t#"
        txt = txt.left_of(txt.find(" #"));
        txt = txt.left_of(txt.find("\t#"));
        txt = txt.trim(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
    }

    if(txt.empty())
        return false;

    // consume everything up to the end of the scalar
    m_state->scalar_col = m_state->line_contents.current_col(txt);
    _RYML_CB_ASSERT(m_stack.m_callbacks, txt.str >= m_state->line_contents.rem.str);
    _line_progressed(static_cast<size_t>(txt.str - m_state->line_contents.rem.str) + txt.len);

    if(_at_line_end() && txt != '~')
    {
        _c4dbgpf("at line end. curr='{}'", txt);
        txt = _extend_scanned_scalar(txt);
    }

    _c4dbgpf("scalar was '{}'", txt);

    *quoted = false;
    *scalar = txt;
    return true;
}
2740

2741
// Scan a scalar from the remainder of the current line while the parser is
// in the RUNK state (asserted below).
//
// @param scalar [out] receives the scanned scalar on success
// @param quoted [out] set to true when the scalar came from the single-quoted,
//               double-quoted or block ('|' / '>') scanners, false for a
//               plain scalar
// @return true when a scalar was produced; false when the remainder is empty
//         or whitespace-only, when a document separator is found at RTOP,
//         when _is_scalar_next__runk() rejects the text, or when nothing is
//         left after trimming.
bool Parser::_scan_scalar_unk(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
2742
{
2743
    _RYML_CB_ASSERT(m_stack.m_callbacks,  has_any(RUNK));
2744

2745
    csubstr s = m_state->line_contents.rem;
2746
    if(s.len == 0)
2747
        return false;
2748
    s = s.trim(" \t");
2749
    if(s.len == 0)
2750
        return false;
2751

2752
    // quoted and block scalars are delegated to their dedicated scanners
    if(s.begins_with('\''))
2753
    {
2754
        _c4dbgp("got a ': scanning single-quoted scalar");
2755
        m_state->scalar_col = m_state->line_contents.current_col(s);
2756
        *scalar = _scan_squot_scalar();
2757
        *quoted = true;
2758
        return true;
2759
    }
2760
    else if(s.begins_with('"'))
2761
    {
2762
        _c4dbgp("got a \": scanning double-quoted scalar");
2763
        m_state->scalar_col = m_state->line_contents.current_col(s);
2764
        *scalar = _scan_dquot_scalar();
2765
        *quoted = true;
2766
        return true;
2767
    }
2768
    else if(s.begins_with('|') || s.begins_with('>'))
2769
    {
2770
        // note: block scalars are also reported as quoted
        *scalar = _scan_block();
2771
        *quoted = true;
2772
        return true;
2773
    }
2774
    else if(has_any(RTOP) && _is_doc_sep(s))
2775
    {
2776
        return false;
2777
    }
2778

2779
    _c4dbgpf("RUNK '[{}]~~~{}~~~", s.len, s);
2780
    if( ! _is_scalar_next__runk(s))
2781
    {
2782
        _c4dbgp("RUNK: no scalar next");
2783
        return false;
2784
    }
2785
    // plain scalar: first cut at " #", then at the first applicable
    // terminator in this order: ": ", a trailing ':', (with tab tokens) ":\t",
    // and otherwise at the first ','
    size_t pos = s.find(" #");
2786
    if(pos != npos)
2787
    {
2788
        _c4dbgpf("RUNK: found ' #' at {}", pos);
2789
        s = s.left_of(pos);
2790
    }
2791
    pos = s.find(": ");
2792
    if(pos != npos)
2793
    {
2794
        _c4dbgpf("RUNK: found ': ' at {}", pos);
2795
        s = s.left_of(pos);
2796
    }
2797
    else if(s.ends_with(':'))
2798
    {
2799
        _c4dbgp("RUNK: ends with ':'");
2800
        s = s.left_of(s.len-1);
2801
    }
2802
    _RYML_WITH_TAB_TOKENS(
2803
    else if((pos = s.find(":\t")) != npos) // TABS
2804
    {
2805
        _c4dbgp("RUNK: ends with ':\\t'");
2806
        s = s.left_of(pos);
2807
    })
2808
    else
2809
    {
2810
        _c4dbgp("RUNK: trimming left of ,");
2811
        s = s.left_of(s.first_of(','));
2812
    }
2813
    s = s.trim(" \t");
2814
    _c4dbgpf("RUNK: scalar=[{}]~~~{}~~~", s.len, s);
2815

2816
    if(s.empty())
2817
        return false;
2818

2819
    // record the scalar's column and consume the line up to the scalar's end
    m_state->scalar_col = m_state->line_contents.current_col(s);
2820
    _RYML_CB_ASSERT(m_stack.m_callbacks, s.str >= m_state->line_contents.rem.str);
2821
    _line_progressed(static_cast<size_t>(s.str - m_state->line_contents.rem.str) + s.len);
2822

2823
    // a scalar ending its line may continue on the following lines; a lone
    // '~' is never extended
    if(_at_line_end() && s != '~')
2824
    {
2825
        _c4dbgpf("at line end. curr=[{}]~~~{}~~", s.len, s);
2826
        s = _extend_scanned_scalar(s);
2827
    }
2828

2829
    _c4dbgpf("scalar was [{}]~~~{}~~~", s.len, s);
2830

2831
    *scalar = s;
2832
    *quoted = false;
2833
    return true;
2834
}
2835

2836

2837
//-----------------------------------------------------------------------------
2838

2839
/** Try to extend a scalar that ended at the end of its line with content
 * from the following line(s).
 *
 * Three cases are handled:
 *  - explicit ('?') map keys go through _scan_complex_key();
 *  - plain scalars outside flow context go through _scan_plain_scalar_blck();
 *  - plain scalars in flow context go through _scan_plain_scalar_flow().
 * Scalars starting with '*' are returned untouched.
 *
 * @param s the scalar scanned so far
 * @return the (possibly extended and filtered) scalar */
csubstr Parser::_extend_scanned_scalar(csubstr s)
{
    // explicit key
    if(has_all(RMAP|RKEY|QMRK))
    {
        const size_t key_indentation = has_any(FLOW) ? 0 : m_state->scalar_col;
        _c4dbgpf("extend_scalar: explicit key! indref={} scalar_indentation={} scalar_col={}", m_state->indref, key_indentation, m_state->scalar_col);
        const csubstr next_line = _scan_to_next_nonempty_line(key_indentation);
        if(next_line.empty())
            return s;
        substr extended = _scan_complex_key(s, next_line).trimr(" \t\r\n");
        if(extended != s)
            s = _filter_plain_scalar(extended, key_indentation);
        return s;
    }

    // cannot be a plain scalar if it starts with * (that's an anchor reference)
    if(s.begins_with_any("*"))
        return s;

    // from here on: plain (unquoted) scalars that continue to the next line
    _c4dbgpf("extend_scalar: line ended, scalar='{}'", s);

    if(has_none(FLOW))
    {
        size_t continuation_indentation = m_state->indref + 1;
        if(has_all(RUNK) && continuation_indentation == 1)
            continuation_indentation = 0;
        const csubstr next_line = _scan_to_next_nonempty_line(continuation_indentation);
        if( ! next_line.empty())
        {
            _c4dbgpf("rscalar[IMPL]: state_indref={} state_indentation={} scalar_indentation={}", m_state->indref, m_state->line_contents.indentation, continuation_indentation);
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.full.is_super(next_line));
            substr extended = _scan_plain_scalar_blck(s, next_line, continuation_indentation);
            // only filter when the block scan did not shrink the scalar
            if(extended.len >= s.len)
                s = _filter_plain_scalar(extended, continuation_indentation);
        }
        return s;
    }

    // flow context: continuation lines are accepted at any indentation
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW));
    const csubstr next_line = _scan_to_next_nonempty_line(/*indentation*/0);
    if( ! next_line.empty())
    {
        _c4dbgp("rscalar[FLOW]");
        substr extended = _scan_plain_scalar_flow(s, next_line);
        s = _filter_plain_scalar(extended, /*indentation*/0);
    }
    return s;
}
2887

2888

2889
//-----------------------------------------------------------------------------
2890

2891
/** Extend a plain scalar in flow context over the following lines.
 *
 * Lines are appended until one of the characters "[]{}?#," is found, or -
 * when reading a map key or in the unknown state - until a colon token
 * appears before such a character. Running out of lines is an error.
 *
 * @param currscalar the scalar scanned so far
 * @param peeked_line the next non-empty line
 * @return the span of the source buffer from the start of currscalar up to
 *         the current parse offset, trimmed of trailing newlines and spaces */
substr Parser::_scan_plain_scalar_flow(csubstr currscalar, csubstr peeked_line)
{
    static constexpr const csubstr stop_chars = "[]{}?#,";
    size_t stop_pos = peeked_line.first_of(stop_chars);
    for(bool first_line = true; stop_pos != 0; first_line = false)
    {
        if(has_all(RMAP|RKEY) || has_any(RUNK))
        {
            const csubstr line_content = peeked_line.triml(' ').trimr("\r\n");
            if(line_content.begins_with(": ") || line_content == ':')
            {
                _c4dbgpf("rscalar[FLOW]: map value starts on the peeked line: '{}'", peeked_line);
                peeked_line = peeked_line.first(0);
                break;
            }
            auto colon = peeked_line.first_of_any(": ", ":");
            if(colon && colon.pos < stop_pos)
            {
                peeked_line = peeked_line.first(colon.pos);
                _c4dbgpf("rscalar[FLOW]: found colon at {}. peeked='{}'", colon.pos, peeked_line);
                _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.end() >= m_state->line_contents.rem.begin());
                _line_progressed(static_cast<size_t>(peeked_line.end() - m_state->line_contents.rem.begin()));
                break;
            }
        }
        if(stop_pos != npos)
        {
            _c4dbgpf("rscalar[FLOW]: found special character '{}' at {}, stopping: '{}'", peeked_line[stop_pos], stop_pos, peeked_line.left_of(stop_pos).trimr("\r\n"));
            peeked_line = peeked_line.left_of(stop_pos);
            _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.end() >= m_state->line_contents.rem.begin());
            _line_progressed(static_cast<size_t>(peeked_line.end() - m_state->line_contents.rem.begin()));
            break;
        }
        _c4dbgpf("rscalar[FLOW]: append another line, full: '{}'", peeked_line.trimr("\r\n"));
        // on the first pass the advance is skipped; only later passes
        // need to consume the line explicitly
        if( ! first_line)
        {
            RYML_CHECK(_advance_to_peeked());
        }
        peeked_line = _scan_to_next_nonempty_line(/*indentation*/0);
        if(peeked_line.empty())
        {
            _c4err("expected token or continuation");
        }
        stop_pos = peeked_line.first_of(stop_chars);
    }
    substr full(m_buf.str + (currscalar.str - m_buf.str), m_buf.begin() + m_state->pos.offset);
    return full.trimr("\n\r ");
}
2945

2946

2947
//-----------------------------------------------------------------------------
2948

2949
/** Extend a plain scalar in block context over the following lines.
 *
 * Scanning stops at a document marker ("..." or "---"), at a non-blank
 * line with less than the required indentation, at a " #" comment, or
 * when the file ends. A ": " token or a line ending with ':' inside the
 * continuation is reported as an error.
 *
 * @param currscalar the scalar scanned so far; must lie inside the source buffer
 * @param peeked_line the next non-empty line
 * @param indentation the number of leading spaces required of continuation lines
 * @return the span of the source buffer covering the whole scalar, trimmed
 *         of trailing newlines and spaces */
substr Parser::_scan_plain_scalar_blck(csubstr currscalar, csubstr peeked_line, size_t indentation)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(currscalar));
    // NOTE. there's a problem with _scan_to_next_nonempty_line(), as it counts
    // newlines twice, so instead of starting from m_state->pos.offset the
    // length is counted directly from the end of the given scalar
    _RYML_CB_ASSERT(m_stack.m_callbacks, currscalar.end() >= m_buf.begin());
    const size_t scalar_end = static_cast<size_t>(currscalar.end() - m_buf.begin());
    _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.begins_with(' ', indentation));
    for(;;)
    {
        _c4dbgpf("rscalar[IMPL]: continuing... ref_indentation={}", indentation);
        if(peeked_line.begins_with("...") || peeked_line.begins_with("---"))
        {
            _c4dbgpf("rscalar[IMPL]: document termination next -- bail now '{}'", peeked_line.trimr("\r\n"));
            break;
        }
        if( ! peeked_line.begins_with(' ', indentation)) // is the line deindented?
        {
            if( ! peeked_line.trim(" \r\n\t").empty()) // is the line not blank?
            {
                _c4dbgpf("rscalar[IMPL]: deindented line, not blank -- bail now '{}'", peeked_line.trimr("\r\n"));
                break;
            }
            _c4dbgpf("rscalar[IMPL]: line is blank and has less indentation: ref={} line={}: '{}'", indentation, peeked_line.first_not_of(' ') == csubstr::npos ? 0 : peeked_line.first_not_of(' '), peeked_line.trimr("\r\n"));
            _c4dbgpf("rscalar[IMPL]: ... searching for a line starting at indentation {}", indentation);
            const csubstr continuation = _scan_to_next_nonempty_line(indentation);
            if(continuation.empty())
            {
                _c4dbgp("rscalar[IMPL]: ... finished.");
                break;
            }
            _c4dbgp("rscalar[IMPL]: ... continuing.");
            peeked_line = continuation;
        }

        _c4dbgpf("rscalar[IMPL]: line contents: '{}'", peeked_line.right_of(indentation, true).trimr("\r\n"));
        const size_t colon_space_pos = peeked_line.find(": ");
        if(colon_space_pos != npos)
        {
            _line_progressed(colon_space_pos);
            _c4err("': ' is not a valid token in plain flow (unquoted) scalars");
        }
        else if(peeked_line.ends_with(':'))
        {
            _line_progressed(peeked_line.find(':'));
            _c4err("lines cannot end with ':' in plain flow (unquoted) scalars");
        }
        else
        {
            // a comment ends the scalar; this is deliberately not an error
            const size_t comment_pos = peeked_line.find(" #");
            if(comment_pos != npos)
            {
                _line_progressed(comment_pos);
                break;
            }
        }

        _c4dbgpf("rscalar[IMPL]: append another line: (len={})'{}'", peeked_line.len, peeked_line.trimr("\r\n"));
        if( ! _advance_to_peeked())
        {
            _c4dbgp("rscalar[IMPL]: file finishes after the scalar");
            break;
        }
        peeked_line = m_state->line_contents.rem;
    }
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= scalar_end);
    substr full(m_buf.str + (currscalar.str - m_buf.str),
                currscalar.len + (m_state->pos.offset - scalar_end));
    return full.trimr("\r\n ");
}
3017

3018
// Extend the scalar of an explicit key over the following lines.
//
// Scanning stops when the peeked line starts with a document marker
// ("..." or "---"), when it contains any of "?:[]{}" or a "- " token,
// when a following line contains ": ", or when there are no more lines.
//
// @param currscalar the scalar scanned so far; asserted to lie inside m_buf
// @param peeked_line the next non-empty line
// @return the span of m_buf starting at currscalar and lengthened by the
//         distance the parse offset advanced past the end of currscalar
//         (not trimmed here)
substr Parser::_scan_complex_key(csubstr currscalar, csubstr peeked_line)
3019
{
3020
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(currscalar));
3021
    // NOTE. there's a problem with _scan_to_next_nonempty_line(), as it counts newlines twice
3022
    // size_t offs = m_state->pos.offset;   // so we workaround by directly counting from the end of the given scalar
3023
    _RYML_CB_ASSERT(m_stack.m_callbacks, currscalar.end() >= m_buf.begin());
3024
    size_t offs = static_cast<size_t>(currscalar.end() - m_buf.begin());
3025
    while(true)
3026
    {
3027
        _c4dbgp("rcplxkey: continuing...");
3028
        if(peeked_line.begins_with("...") || peeked_line.begins_with("---"))
3029
        {
3030
            _c4dbgpf("rcplxkey: document termination next -- bail now '{}'", peeked_line.trimr("\r\n"));
3031
            break;
3032
        }
3033
        else
3034
        {
3035
            // look for structural characters first, then for a "- " token
            size_t pos = peeked_line.first_of("?:[]{}");
3036
            if(pos == csubstr::npos)
3037
            {
3038
                pos = peeked_line.find("- ");
3039
            }
3040
            if(pos != csubstr::npos)
3041
            {
3042
                _c4dbgpf("rcplxkey: found special characters at pos={}: '{}'", pos, peeked_line.trimr("\r\n"));
3043
                _line_progressed(pos);
3044
                break;
3045
            }
3046
        }
3047

3048
        _c4dbgpf("rcplxkey: no special chars found '{}'", peeked_line.trimr("\r\n"));
3049
        csubstr next_peeked = _scan_to_next_nonempty_line(0);
3050
        if(next_peeked.empty())
3051
        {
3052
            _c4dbgp("rcplxkey: empty ... finished.");
3053
            break;
3054
        }
3055
        _c4dbgp("rcplxkey: ... continuing.");
3056
        peeked_line = next_peeked;
3057

3058
        _c4dbgpf("rcplxkey: line contents: '{}'", peeked_line.trimr("\r\n"));
3059
        size_t colpos;
3060
        if((colpos = peeked_line.find(": ")) != npos)
3061
        {
3062
            _c4dbgp("rcplxkey: found ': ', stopping.");
3063
            _line_progressed(colpos);
3064
            break;
3065
        }
3066
        #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
3067
        // NOTE(review): ends_with() yields a bool, so colpos becomes 0 or 1
        // here rather than the position of the colon -- confirm before
        // re-enabling this branch.
        else if((colpos = peeked_line.ends_with(':')))
3068
        {
3069
            _c4dbgp("rcplxkey: ends with ':', stopping.");
3070
            _line_progressed(colpos);
3071
            break;
3072
        }
3073
        #endif
3074
        _c4dbgpf("rcplxkey: append another line: (len={})'{}'", peeked_line.len, peeked_line.trimr("\r\n"));
3075
        if(!_advance_to_peeked())
3076
        {
3077
            _c4dbgp("rcplxkey: file finishes after the scalar");
3078
            break;
3079
        }
3080
        peeked_line = m_state->line_contents.rem;
3081
    }
3082
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= offs);
3083
    substr full(m_buf.str + (currscalar.str - m_buf.str),
3084
                currscalar.len + (m_state->pos.offset - offs));
3085
    return full;
3086
}
3087

3088
//! scans to the next non-blank line starting with the given indentation
3089
csubstr Parser::_scan_to_next_nonempty_line(size_t indentation)
3090
{
3091
    csubstr next_peeked;
3092
    while(true)
3093
    {
3094
        _c4dbgpf("rscalar: ... curr offset: {} indentation={}", m_state->pos.offset, indentation);
3095
        next_peeked = _peek_next_line(m_state->pos.offset);
3096
        csubstr next_peeked_triml = next_peeked.triml(' ');
3097
        _c4dbgpf("rscalar: ... next peeked line='{}'", next_peeked.trimr("\r\n"));
3098
        if(next_peeked_triml.begins_with('#'))
3099
        {
3100
            _c4dbgp("rscalar: ... first non-space character is #");
3101
            return {};
3102
        }
3103
        else if(next_peeked.begins_with(' ', indentation))
3104
        {
3105
            _c4dbgpf("rscalar: ... begins at same indentation {}, assuming continuation", indentation);
3106
            _advance_to_peeked();
3107
            return next_peeked;
3108
        }
3109
        else   // check for de-indentation
3110
        {
3111
            csubstr trimmed = next_peeked_triml.trimr("\t\r\n");
3112
            _c4dbgpf("rscalar: ... deindented! trimmed='{}'", trimmed);
3113
            if(!trimmed.empty())
3114
            {
3115
                _c4dbgp("rscalar: ... and not empty. bailing out.");
3116
                return {};
3117
            }
3118
        }
3119
        if(!_advance_to_peeked())
3120
        {
3121
            _c4dbgp("rscalar: file finished");
3122
            return {};
3123
        }
3124
    }
3125
    return {};
3126
}
3127

3128
// returns false when the file finished
3129
bool Parser::_advance_to_peeked()
3130
{
3131
    _line_progressed(m_state->line_contents.rem.len);
3132
    _line_ended(); // advances to the peeked-at line, consuming all remaining (probably newline) characters on the current line
3133
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.first_of("\r\n") == csubstr::npos);
3134
    _c4dbgpf("advance to peeked: scan more... pos={} len={}", m_state->pos.offset, m_buf.len);
3135
    _scan_line();  // puts the peeked-at line in the buffer
3136
    if(_finished_file())
3137
    {
3138
        _c4dbgp("rscalar: finished file!");
3139
        return false;
3140
    }
3141
    return true;
3142
}
3143

3144
//-----------------------------------------------------------------------------
3145

3146
//! Returns 1 when the two characters form a combined line ending
//! ("\n\r" or "\r\n"), and 0 otherwise.
C4_ALWAYS_INLINE size_t _extend_from_combined_newline(char nl, char following)
{
    const bool lf_then_cr = (nl == '\n') && (following == '\r');
    const bool cr_then_lf = (nl == '\r') && (following == '\n');
    return (lf_then_cr || cr_then_lf) ? size_t(1) : size_t(0);
}
3150

3151
//! look for the next newline chars, and jump to the right of those
3152
csubstr from_next_line(csubstr rem)
3153
{
3154
    size_t nlpos = rem.first_of("\r\n");
3155
    if(nlpos == csubstr::npos)
3156
        return {};
3157
    const char nl = rem[nlpos];
3158
    rem = rem.right_of(nlpos);
3159
    if(rem.empty())
3160
        return {};
3161
    if(_extend_from_combined_newline(nl, rem.front()))
3162
        rem = rem.sub(1);
3163
    return rem;
3164
}
3165

3166
/** Returns the line following the one that contains the given buffer
 * offset, including its trailing newline character(s), without changing
 * the parser state.
 *
 * @param pos offset into the source buffer; npos means the current parse offset
 * @return the next line, or an empty string when there is none */
csubstr Parser::_peek_next_line(size_t pos) const
{
    pos = (pos == npos) ? m_state->pos.offset : pos;

    // look for the next newline chars, and jump to the right of those
    csubstr line{};
    if(pos < m_buf.len)
        line = from_next_line(m_buf.sub(pos));
    if(line.empty())
    {
        _c4dbgpf("peek next line @ {}: (len=0)''", pos);
        return {};
    }

    // now get everything up to and including the following newline chars
    size_t last = line.first_of("\r\n");
    if((last != csubstr::npos) && (last + 1 < line.len))
        last += _extend_from_combined_newline(line[last], line[last+1]);
    line = line.left_of(last, /*include_pos*/true);

    _c4dbgpf("peek next line @ {}: (len={})'{}'", pos, line.len, line.trimr("\r\n"));
    return line;
}
3192

3193

3194
//-----------------------------------------------------------------------------
3195
/** Resets the line contents with the line of buf that starts at offset.
 *
 * Two spans are computed and handed to reset(): the line stripped of its
 * newline characters, and the full line including one line ending (an
 * optional '\r' followed by an optional '\n').
 *
 * @param buf the source buffer
 * @param offset where the line starts; may be equal to buf.len, in which
 *        case both spans are empty */
void Parser::LineContents::reset_with_next_line(csubstr buf, size_t offset)
{
    RYML_ASSERT(offset <= buf.len);
    // use pointer arithmetic rather than &buf[offset]: the assertion above
    // accepts offset == buf.len, which is not a valid index for operator[]
    char const* const buf_end = buf.end();
    char const* C4_RESTRICT b = buf.str + offset;
    char const* C4_RESTRICT e = b;
    // get the current line stripped of newline chars
    while(e < buf_end && (*e != '\n' && *e != '\r'))
        ++e;
    RYML_ASSERT(e >= b);
    const csubstr stripped_ = buf.sub(offset, static_cast<size_t>(e - b));
    // advance pos to include the first line ending
    if(e != buf_end && *e == '\r')
        ++e;
    if(e != buf_end && *e == '\n')
        ++e;
    RYML_ASSERT(e >= b);
    const csubstr full_ = buf.sub(offset, static_cast<size_t>(e - b));
    reset(full_, stripped_);
}
3214

3215
void Parser::_scan_line()
3216
{
3217
    if(m_state->pos.offset >= m_buf.len)
3218
    {
3219
        m_state->line_contents.reset(m_buf.last(0), m_buf.last(0));
3220
        return;
3221
    }
3222
    m_state->line_contents.reset_with_next_line(m_buf, m_state->pos.offset);
3223
}
3224

3225

3226
//-----------------------------------------------------------------------------
3227
/** Advances the parse position within the current line.
 *
 * @param ahead number of characters to advance; offset and column grow by
 *        this amount and the remainder of the line shrinks by it */
void Parser::_line_progressed(size_t ahead)
{
    auto &pos = m_state->pos;
    auto &line = m_state->line_contents;
    _c4dbgpf("line[{}] ({} cols) progressed by {}:  col {}-->{}   offset {}-->{}", pos.line, line.full.len, ahead, pos.col, pos.col+ahead, pos.offset, pos.offset+ahead);
    pos.offset += ahead;
    pos.col += ahead;
    // the column may go at most one past the last stripped character
    _RYML_CB_ASSERT(m_stack.m_callbacks, pos.col <= line.stripped.len+1);
    line.rem = line.rem.sub(ahead);
}
3235

3236
void Parser::_line_ended()
3237
{
3238
    _c4dbgpf("line[{}] ({} cols) ended! offset {}-->{}", m_state->pos.line, m_state->line_contents.full.len, m_state->pos.offset, m_state->pos.offset+m_state->line_contents.full.len - m_state->line_contents.stripped.len);
3239
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col == m_state->line_contents.stripped.len+1);
3240
    m_state->pos.offset += m_state->line_contents.full.len - m_state->line_contents.stripped.len;
3241
    ++m_state->pos.line;
3242
    m_state->pos.col = 1;
3243
}
3244

3245
void Parser::_line_ended_undo()
3246
{
3247
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col == 1u);
3248
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.line > 0u);
3249
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= m_state->line_contents.full.len - m_state->line_contents.stripped.len);
3250
    size_t delta = m_state->line_contents.full.len - m_state->line_contents.stripped.len;
3251
    _c4dbgpf("line[{}] undo ended! line {}-->{}, offset {}-->{}", m_state->pos.line, m_state->pos.line, m_state->pos.line - 1, m_state->pos.offset, m_state->pos.offset - delta);
3252
    m_state->pos.offset -= delta;
3253
    --m_state->pos.line;
3254
    m_state->pos.col = m_state->line_contents.stripped.len + 1u;
3255
    // don't forget to undo also the changes to the remainder of the line
3256
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= m_buf.len || m_buf[m_state->pos.offset] == '\n' || m_buf[m_state->pos.offset] == '\r');
3257
    m_state->line_contents.rem = m_buf.sub(m_state->pos.offset, 0);
3258
}
3259

3260

3261
//-----------------------------------------------------------------------------
3262
/** Stores the given value as the reference indentation of the current state.
 *
 * @param indentation the new reference indentation */
void Parser::_set_indentation(size_t indentation)
{
    auto &indref = m_state->indref;
    indref = indentation;
    _c4dbgpf("state[{}]: saving indentation: {}", m_state-m_stack.begin(), indref);
}
3267

3268
/** Stores as reference indentation the number of characters already
 * consumed on the current line, minus the given amount.
 *
 * @param behind how many characters to subtract; must not exceed the
 *        number of characters consumed so far */
void Parser::_save_indentation(size_t behind)
{
    const auto &line = m_state->line_contents;
    auto &indref = m_state->indref;
    _RYML_CB_ASSERT(m_stack.m_callbacks, line.rem.begin() >= line.full.begin());
    indref = static_cast<size_t>(line.rem.begin() - line.full.begin());
    _RYML_CB_ASSERT(m_stack.m_callbacks, behind <= indref);
    indref -= behind;
    _c4dbgpf("state[{}]: saving indentation: {}", m_state-m_stack.begin(), indref);
}
3276

3277
bool Parser::_maybe_set_indentation_from_anchor_or_tag()
3278
{
3279
    if(m_key_anchor.not_empty())
3280
    {
3281
        _c4dbgpf("set indentation from key anchor: {}", m_key_anchor_indentation);
3282
        _set_indentation(m_key_anchor_indentation); // this is the column where the anchor starts
3283
        return true;
3284
    }
3285
    else if(m_key_tag.not_empty())
3286
    {
3287
        _c4dbgpf("set indentation from key tag: {}", m_key_tag_indentation);
3288
        _set_indentation(m_key_tag_indentation); // this is the column where the tag starts
3289
        return true;
3290
    }
3291
    return false;
3292
}
3293

3294

3295
//-----------------------------------------------------------------------------
3296
/** Applies key anchor / reference information to the given node.
 *
 * A pending key anchor is written to the node and then cleared.
 * Otherwise, for an unquoted key: a key starting with '*' is stored as a
 * key reference (without the '*'), and a "<<" key is stored as a key
 * reference whose value - or every child value when the node is a
 * sequence - must start with '*', else an error is raised.
 *
 * @param node_id the node to update; it must have a key */
void Parser::_write_key_anchor(size_t node_id)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->has_key(node_id));

    if( ! m_key_anchor.empty())
    {
        _c4dbgpf("node={}: set key anchor to '{}'", node_id, m_key_anchor);
        m_tree->set_key_anchor(node_id, m_key_anchor);
        m_key_anchor.clear();
        m_key_anchor_was_before = false;
        m_key_anchor_indentation = 0;
        return;
    }

    // quoted keys are never references
    if(m_tree->is_key_quoted(node_id))
        return;

    const csubstr key = m_tree->key(node_id);
    if(key.begins_with('*'))
    {
        _c4dbgpf("node={}: set key reference: '{}'", node_id, key);
        m_tree->set_key_ref(node_id, key.sub(1));
        return;
    }
    if(key != "<<")
        return;

    m_tree->set_key_ref(node_id, key);
    _c4dbgpf("node={}: it's an inheriting reference", node_id);
    if(m_tree->is_seq(node_id))
    {
        _c4dbgpf("node={}: inheriting from seq of {}", node_id, m_tree->num_children(node_id));
        size_t child = m_tree->first_child(node_id);
        while(child != NONE)
        {
            if( ! (m_tree->val(child).begins_with('*')))
                _c4err("malformed reference: '{}'", m_tree->val(child));
            child = m_tree->next_sibling(child);
        }
    }
    else if( ! m_tree->val(node_id).begins_with('*'))
    {
         _c4err("malformed reference: '{}'", m_tree->val(node_id));
    }
}
3336

3337
//-----------------------------------------------------------------------------
3338
/** Applies value anchor / reference information to the given node.
 *
 * A pending value anchor is written to the node and then cleared. After
 * that, an unquoted value starting with '*' is stored as a value
 * reference (without the '*'); such a node must not have a value anchor.
 *
 * @param node_id the node to update */
void Parser::_write_val_anchor(size_t node_id)
{
    if( ! m_val_anchor.empty())
    {
        _c4dbgpf("node={}: set val anchor to '{}'", node_id, m_val_anchor);
        m_tree->set_val_anchor(node_id, m_val_anchor);
        m_val_anchor.clear();
    }

    csubstr val = "";
    if(m_tree->has_val(node_id))
        val = m_tree->val(node_id);

    if(m_tree->is_val_quoted(node_id) || ! val.begins_with('*'))
        return;

    _c4dbgpf("node={}: set val reference: '{}'", node_id, val);
    RYML_CHECK(!m_tree->has_val_anchor(node_id));
    m_tree->set_val_ref(node_id, val.sub(1));
}
3354

3355
//-----------------------------------------------------------------------------
3356
void Parser::_push_level(bool explicit_flow_chars)
3357
{
3358
    _c4dbgpf("pushing level! currnode={}  currlevel={} stacksize={} stackcap={}", m_state->node_id, m_state->level, m_stack.size(), m_stack.capacity());
3359
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state == &m_stack.top());
3360
    if(node(m_state) == nullptr)
3361
    {
3362
        _c4dbgp("pushing level! actually no, current node is null");
3363
        //_RYML_CB_ASSERT(m_stack.m_callbacks,  ! explicit_flow_chars);
3364
        return;
3365
    }
3366
    flag_t st = RUNK;
3367
    if(explicit_flow_chars || has_all(FLOW))
3368
    {
3369
        st |= FLOW;
3370
    }
3371
    m_stack.push_top();
3372
    m_state = &m_stack.top();
3373
    set_flags(st);
3374
    m_state->node_id = (size_t)NONE;
3375
    m_state->indref = (size_t)NONE;
3376
    ++m_state->level;
3377
    _c4dbgpf("pushing level: now, currlevel={}", m_state->level);
3378
}
3379

3380
void Parser::_pop_level()
3381
{
3382
    _c4dbgpf("popping level! currnode={} currlevel={}", m_state->node_id, m_state->level);
3383
    if(has_any(RMAP) || m_tree->is_map(m_state->node_id))
3384
    {
3385
        _stop_map();
3386
    }
3387
    if(has_any(RSEQ) || m_tree->is_seq(m_state->node_id))
3388
    {
3389
        _stop_seq();
3390
    }
3391
    if(m_tree->is_doc(m_state->node_id))
3392
    {
3393
        _stop_doc();
3394
    }
3395
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.size() > 1);
3396
    _prepare_pop();
3397
    m_stack.pop();
3398
    m_state = &m_stack.top();
3399
    /*if(has_any(RMAP))
3400
    {
3401
        _toggle_key_val();
3402
    }*/
3403
    if(m_state->line_contents.indentation == 0)
3404
    {
3405
        //_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RTOP));
3406
        add_flags(RTOP);
3407
    }
3408
    _c4dbgpf("popping level: now, currnode={} currlevel={}", m_state->node_id, m_state->level);
3409
}
3410

3411
//-----------------------------------------------------------------------------
3412
void Parser::_start_unk(bool /*as_child*/)
3413
{
3414
    _c4dbgp("start_unk");
3415
    _push_level();
3416
    _move_scalar_from_top();
3417
}
3418

3419
//-----------------------------------------------------------------------------
3420
void Parser::_start_doc(bool as_child)
3421
{
3422
    _c4dbgpf("start_doc (as child={})", as_child);
3423
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id));
3424
    size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id;
3425
    _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE);
3426
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_root(parent_id));
3427
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id));
3428
    if(as_child)
3429
    {
3430
        _c4dbgpf("start_doc: parent={}", parent_id);
3431
        if( ! m_tree->is_stream(parent_id))
3432
        {
3433
            _c4dbgp("start_doc: rearranging with root as STREAM");
3434
            m_tree->set_root_as_stream();
3435
        }
3436
        m_state->node_id = m_tree->append_child(parent_id);
3437
        m_tree->to_doc(m_state->node_id);
3438
    }
3439
    #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
3440
    else
3441
    {
3442
        _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(parent_id) || m_tree->empty(parent_id));
3443
        m_state->node_id = parent_id;
3444
        if( ! m_tree->is_doc(parent_id))
3445
        {
3446
            m_tree->to_doc(parent_id, DOC);
3447
        }
3448
    }
3449
    #endif
3450
    _c4dbgpf("start_doc: id={}", m_state->node_id);
3451
    add_flags(RUNK|RTOP|NDOC);
3452
    _handle_types();
3453
    rem_flags(NDOC);
3454
}
3455

3456
void Parser::_stop_doc()
3457
{
3458
    size_t doc_node = m_state->node_id;
3459
    _c4dbgpf("stop_doc[{}]", doc_node);
3460
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_doc(doc_node));
3461
    if(!m_tree->is_seq(doc_node) && !m_tree->is_map(doc_node) && !m_tree->is_val(doc_node))
3462
    {
3463
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(SSCL));
3464
        _c4dbgpf("stop_doc[{}]: there was nothing; adding null val", doc_node);
3465
        m_tree->to_val(doc_node, {}, DOC);
3466
    }
3467
}
3468

3469
void Parser::_end_stream()
3470
{
3471
    _c4dbgpf("end_stream, level={} node_id={}", m_state->level, m_state->node_id);
3472
    _RYML_CB_ASSERT(m_stack.m_callbacks,  ! m_stack.empty());
3473
    NodeData *added = nullptr;
3474
    if(has_any(SSCL))
3475
    {
3476
        if(m_tree->is_seq(m_state->node_id))
3477
        {
3478
            _c4dbgp("append val...");
3479
            added = _append_val(_consume_scalar());
3480
        }
3481
        else if(m_tree->is_map(m_state->node_id))
3482
        {
3483
            _c4dbgp("append null key val...");
3484
            added = _append_key_val_null(m_state->line_contents.rem.str);
3485
            #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
3486
            if(has_any(RSEQIMAP))
3487
            {
3488
                _stop_seqimap();
3489
                _pop_level();
3490
            }
3491
            #endif
3492
        }
3493
        else if(m_tree->is_doc(m_state->node_id) || m_tree->type(m_state->node_id) == NOTYPE)
3494
        {
3495
            NodeType_e quoted = has_any(QSCL) ? VALQUO : NOTYPE; // do this before consuming the scalar
3496
            csubstr scalar = _consume_scalar();
3497
            _c4dbgpf("node[{}]: to docval '{}'{}", m_state->node_id, scalar, quoted == VALQUO ? ", quoted" : "");
3498
            m_tree->to_val(m_state->node_id, scalar, DOC|quoted);
3499
            added = m_tree->get(m_state->node_id);
3500
        }
3501
        else
3502
        {
3503
            _c4err("internal error");
3504
        }
3505
    }
3506
    else if(has_all(RSEQ|RVAL) && has_none(FLOW))
3507
    {
3508
        _c4dbgp("add last...");
3509
        added = _append_val_null(m_state->line_contents.rem.str);
3510
    }
3511
    else if(!m_val_tag.empty() && (m_tree->is_doc(m_state->node_id) || m_tree->type(m_state->node_id) == NOTYPE))
3512
    {
3513
        csubstr scalar = m_state->line_contents.rem.first(0);
3514
        _c4dbgpf("node[{}]: add null scalar as docval", m_state->node_id);
3515
        m_tree->to_val(m_state->node_id, scalar, DOC);
3516
        added = m_tree->get(m_state->node_id);
3517
    }
3518

3519
    if(added)
3520
    {
3521
        size_t added_id = m_tree->id(added);
3522
        if(m_tree->is_seq(m_state->node_id) || m_tree->is_doc(m_state->node_id))
3523
        {
3524
            if(!m_key_anchor.empty())
3525
            {
3526
                _c4dbgpf("node[{}]: move key to val anchor: '{}'", added_id, m_key_anchor);
3527
                m_val_anchor = m_key_anchor;
3528
                m_key_anchor = {};
3529
            }
3530
            if(!m_key_tag.empty())
3531
            {
3532
                _c4dbgpf("node[{}]: move key to val tag: '{}'", added_id, m_key_tag);
3533
                m_val_tag = m_key_tag;
3534
                m_key_tag = {};
3535
            }
3536
        }
3537
        #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
3538
        if(!m_key_anchor.empty())
3539
        {
3540
            _c4dbgpf("node[{}]: set key anchor='{}'", added_id, m_key_anchor);
3541
            m_tree->set_key_anchor(added_id, m_key_anchor);
3542
            m_key_anchor = {};
3543
        }
3544
        #endif
3545
        if(!m_val_anchor.empty())
3546
        {
3547
            _c4dbgpf("node[{}]: set val anchor='{}'", added_id, m_val_anchor);
3548
            m_tree->set_val_anchor(added_id, m_val_anchor);
3549
            m_val_anchor = {};
3550
        }
3551
        #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
3552
        if(!m_key_tag.empty())
3553
        {
3554
            _c4dbgpf("node[{}]: set key tag='{}' -> '{}'", added_id, m_key_tag, normalize_tag(m_key_tag));
3555
            m_tree->set_key_tag(added_id, normalize_tag(m_key_tag));
3556
            m_key_tag = {};
3557
        }
3558
        #endif
3559
        if(!m_val_tag.empty())
3560
        {
3561
            _c4dbgpf("node[{}]: set val tag='{}' -> '{}'", added_id, m_val_tag, normalize_tag(m_val_tag));
3562
            m_tree->set_val_tag(added_id, normalize_tag(m_val_tag));
3563
            m_val_tag = {};
3564
        }
3565
    }
3566

3567
    while(m_stack.size() > 1)
3568
    {
3569
        _c4dbgpf("popping level: {} (stack sz={})", m_state->level, m_stack.size());
3570
        _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_any(SSCL, &m_stack.top()));
3571
        if(has_all(RSEQ|FLOW))
3572
            _err("closing ] not found");
3573
        _pop_level();
3574
    }
3575
    add_flags(NDOC);
3576
}
3577

3578
/** Handle a `---` token: end the current stream and open a new document
 * at a fresh stack level, keeping the reference indentation that was
 * current after the previous stream ended.
 *
 * @param rem the remainder of the line; only used to assert that it
 *        begins with `---`. */
void Parser::_start_new_doc(csubstr rem)
{
    _c4dbgp("_start_new_doc");
    _RYML_CB_ASSERT(m_stack.m_callbacks, rem.begins_with("---"));
    C4_UNUSED(rem);

    _end_stream();

    // read the indentation before pushing: m_state changes below
    const size_t saved_indref = m_state->indref;
    _c4dbgpf("start a document, indentation={}", saved_indref);
    _line_progressed(3); // skip the three characters of the `---` token
    _push_level();
    _start_doc();
    _set_indentation(saved_indref);
}
3593

3594

3595
//-----------------------------------------------------------------------------
3596
void Parser::_start_map(bool as_child)
3597
{
3598
    _c4dbgpf("start_map (as child={})", as_child);
3599
    addrem_flags(RMAP|RVAL, RKEY|RUNK);
3600
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id));
3601
    size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id;
3602
    _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE);
3603
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id));
3604
    if(as_child)
3605
    {
3606
        m_state->node_id = m_tree->append_child(parent_id);
3607
        if(has_all(SSCL))
3608
        {
3609
            type_bits key_quoted = NOTYPE;
3610
            if(m_state->flags & QSCL) // before consuming the scalar
3611
                key_quoted |= KEYQUO;
3612
            csubstr key = _consume_scalar();
3613
            m_tree->to_map(m_state->node_id, key, key_quoted);
3614
            _c4dbgpf("start_map: id={} key='{}'", m_state->node_id, m_tree->key(m_state->node_id));
3615
            _write_key_anchor(m_state->node_id);
3616
            if( ! m_key_tag.empty())
3617
            {
3618
                _c4dbgpf("node[{}]: set key tag='{}' -> '{}'", m_state->node_id, m_key_tag, normalize_tag(m_key_tag));
3619
                m_tree->set_key_tag(m_state->node_id, normalize_tag(m_key_tag));
3620
                m_key_tag.clear();
3621
            }
3622
        }
3623
        else
3624
        {
3625
            m_tree->to_map(m_state->node_id);
3626
            _c4dbgpf("start_map: id={}", m_state->node_id);
3627
        }
3628
        m_tree->_p(m_state->node_id)->m_val.scalar.str = m_state->line_contents.rem.str;
3629
        _write_val_anchor(m_state->node_id);
3630
    }
3631
    else
3632
    {
3633
        _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE);
3634
        m_state->node_id = parent_id;
3635
        _c4dbgpf("start_map: id={}", m_state->node_id);
3636
        type_bits as_doc = 0;
3637
        if(m_tree->is_doc(m_state->node_id))
3638
            as_doc |= DOC;
3639
        if(!m_tree->is_map(parent_id))
3640
        {
3641
            RYML_CHECK(!m_tree->has_children(parent_id));
3642
            m_tree->to_map(parent_id, as_doc);
3643
        }
3644
        else
3645
        {
3646
            m_tree->_add_flags(parent_id, as_doc);
3647
        }
3648
        _move_scalar_from_top();
3649
        if(m_key_anchor.not_empty())
3650
            m_key_anchor_was_before = true;
3651
        _write_val_anchor(parent_id);
3652
        if(m_stack.size() >= 2)
3653
        {
3654
            State const& parent_state = m_stack.top(1);
3655
            if(parent_state.flags & RSET)
3656
                add_flags(RSET);
3657
        }
3658
        m_tree->_p(parent_id)->m_val.scalar.str = m_state->line_contents.rem.str;
3659
    }
3660
    if( ! m_val_tag.empty())
3661
    {
3662
        _c4dbgpf("node[{}]: set val tag='{}' -> '{}'", m_state->node_id, m_val_tag, normalize_tag(m_val_tag));
3663
        m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag));
3664
        m_val_tag.clear();
3665
    }
3666
}
3667

3668
void Parser::_start_map_unk(bool as_child)
3669
{
3670
    _c4dbgpf("start_map_unk (as child={})", as_child);
3671
    if(!m_key_anchor_was_before)
3672
    {
3673
        _c4dbgpf("stash key anchor before starting map... '{}'", m_key_anchor);
3674
        csubstr ka = m_key_anchor;
3675
        m_key_anchor = {};
3676
        _start_map(as_child);
3677
        m_key_anchor = ka;
3678
    }
3679
    else
3680
    {
3681
        _start_map(as_child);
3682
        m_key_anchor_was_before = false;
3683
    }
3684
    if(m_key_tag2.not_empty())
3685
    {
3686
        m_key_tag = m_key_tag2;
3687
        m_key_tag_indentation = m_key_tag2_indentation;
3688
        m_key_tag2.clear();
3689
        m_key_tag2_indentation = 0;
3690
    }
3691
}
3692

3693
void Parser::_stop_map()
3694
{
3695
    _c4dbgpf("stop_map[{}]", m_state->node_id);
3696
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(m_state->node_id));
3697
    if(has_all(QMRK|RKEY) && !has_all(SSCL))
3698
    {
3699
        _c4dbgpf("stop_map[{}]: RKEY", m_state->node_id);
3700
        _store_scalar_null(m_state->line_contents.rem.str);
3701
        _append_key_val_null(m_state->line_contents.rem.str);
3702
    }
3703
}
3704

3705

3706
//-----------------------------------------------------------------------------
3707
void Parser::_start_seq(bool as_child)
3708
{
3709
    _c4dbgpf("start_seq (as child={})", as_child);
3710
    if(has_all(RTOP|RUNK))
3711
    {
3712
        _c4dbgpf("start_seq: moving key tag to val tag: '{}'", m_key_tag);
3713
        m_val_tag = m_key_tag;
3714
        m_key_tag.clear();
3715
    }
3716
    addrem_flags(RSEQ|RVAL, RUNK);
3717
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id));
3718
    size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id;
3719
    _RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE);
3720
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id));
3721
    if(as_child)
3722
    {
3723
        m_state->node_id = m_tree->append_child(parent_id);
3724
        if(has_all(SSCL))
3725
        {
3726
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(parent_id));
3727
            type_bits key_quoted = 0;
3728
            if(m_state->flags & QSCL) // before consuming the scalar
3729
                key_quoted |= KEYQUO;
3730
            csubstr key = _consume_scalar();
3731
            m_tree->to_seq(m_state->node_id, key, key_quoted);
3732
            _c4dbgpf("start_seq: id={} name='{}'", m_state->node_id, m_tree->key(m_state->node_id));
3733
            _write_key_anchor(m_state->node_id);
3734
            if( ! m_key_tag.empty())
3735
            {
3736
                _c4dbgpf("start_seq[{}]: set key tag='{}' -> '{}'", m_state->node_id, m_key_tag, normalize_tag(m_key_tag));
3737
                m_tree->set_key_tag(m_state->node_id, normalize_tag(m_key_tag));
3738
                m_key_tag.clear();
3739
            }
3740
        }
3741
        else
3742
        {
3743
            type_bits as_doc = 0;
3744
            _RYML_CB_ASSERT(m_stack.m_callbacks, !m_tree->is_doc(m_state->node_id));
3745
            m_tree->to_seq(m_state->node_id, as_doc);
3746
            _c4dbgpf("start_seq: id={}{}", m_state->node_id, as_doc ? " as doc" : "");
3747
        }
3748
        _write_val_anchor(m_state->node_id);
3749
        m_tree->_p(m_state->node_id)->m_val.scalar.str = m_state->line_contents.rem.str;
3750
    }
3751
    else
3752
    {
3753
        m_state->node_id = parent_id;
3754
        type_bits as_doc = 0;
3755
        if(m_tree->is_doc(m_state->node_id))
3756
            as_doc |= DOC;
3757
        if(!m_tree->is_seq(parent_id))
3758
        {
3759
            RYML_CHECK(!m_tree->has_children(parent_id));
3760
            m_tree->to_seq(parent_id, as_doc);
3761
        }
3762
        else
3763
        {
3764
            m_tree->_add_flags(parent_id, as_doc);
3765
        }
3766
        _move_scalar_from_top();
3767
        _c4dbgpf("start_seq: id={}{}", m_state->node_id, as_doc ? " as_doc" : "");
3768
        _write_val_anchor(parent_id);
3769
        m_tree->_p(parent_id)->m_val.scalar.str = m_state->line_contents.rem.str;
3770
    }
3771
    if( ! m_val_tag.empty())
3772
    {
3773
        _c4dbgpf("start_seq[{}]: set val tag='{}' -> '{}'", m_state->node_id, m_val_tag, normalize_tag(m_val_tag));
3774
        m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag));
3775
        m_val_tag.clear();
3776
    }
3777
}
3778

3779
void Parser::_stop_seq()
3780
{
3781
    _c4dbgp("stop_seq");
3782
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(m_state->node_id));
3783
}
3784

3785

3786
//-----------------------------------------------------------------------------
3787
void Parser::_start_seqimap()
3788
{
3789
    _c4dbgpf("start_seqimap at node={}. has_children={}", m_state->node_id, m_tree->has_children(m_state->node_id));
3790
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|FLOW));
3791
    // create a map, and turn the last scalar of this sequence
3792
    // into the key of the map's first child. This scalar was
3793
    // understood to be a value in the sequence, but it is
3794
    // actually a key of a map, implicitly opened here.
3795
    // Eg [val, key: val]
3796
    //
3797
    // Yep, YAML is crazy.
3798
    if(m_tree->has_children(m_state->node_id) && m_tree->has_val(m_tree->last_child(m_state->node_id)))
3799
    {
3800
        size_t prev = m_tree->last_child(m_state->node_id);
3801
        NodeType ty = m_tree->_p(prev)->m_type; // don't use type() because it masks out the quotes
3802
        NodeScalar tmp = m_tree->valsc(prev);
3803
        _c4dbgpf("has children and last child={} has val. saving the scalars, val='{}' quoted={}", prev, tmp.scalar, ty.is_val_quoted());
3804
        m_tree->remove(prev);
3805
        _push_level();
3806
        _start_map();
3807
        _store_scalar(tmp.scalar, ty.is_val_quoted());
3808
        m_key_anchor = tmp.anchor;
3809
        m_key_tag = tmp.tag;
3810
    }
3811
    else
3812
    {
3813
        _c4dbgpf("node {} has no children yet, using empty key", m_state->node_id);
3814
        _push_level();
3815
        _start_map();
3816
        _store_scalar_null(m_state->line_contents.rem.str);
3817
    }
3818
    add_flags(RSEQIMAP|FLOW);
3819
}
3820

3821
void Parser::_stop_seqimap()
3822
{
3823
    _c4dbgp("stop_seqimap");
3824
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQIMAP));
3825
}
3826

3827

3828
//-----------------------------------------------------------------------------
3829
/** Append a val child to the current node, which must be a seq.
 *
 * @param val the scalar to set as the child's val
 * @param quoted when nonzero, the child is flagged with VALQUO
 * @return the data of the new child, with the pending val tag and val
 *         anchor already written to it */
NodeData* Parser::_append_val(csubstr val, flag_t quoted)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks,  ! has_all(SSCL));
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) != nullptr);
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(m_state->node_id));
    type_bits val_flags = quoted ? VALQUO : NOTYPE;
    _c4dbgpf("append val: '{}' to parent id={} (level={}){}", val, m_state->node_id, m_state->level, quoted ? " VALQUO!" : "");
    const size_t child = m_tree->append_child(m_state->node_id);
    m_tree->to_val(child, val, val_flags);
    _c4dbgpf("append val: id={} val='{}'", child, m_tree->get(child)->m_val.scalar);
    if( ! m_val_tag.empty())
    {
        _c4dbgpf("append val[{}]: set val tag='{}' -> '{}'", child, m_val_tag, normalize_tag(m_val_tag));
        m_tree->set_val_tag(child, normalize_tag(m_val_tag));
        m_val_tag.clear();
    }
    _write_val_anchor(child);
    return m_tree->get(child);
}
3848

3849
/** Append a keyval child to the current node, which must be a map. The
 * key is the scalar stored in the current state, which is consumed.
 *
 * @param val the scalar to set as the child's val
 * @param val_quoted when nonzero, the child is flagged with VALQUO
 * @return the data of the new child, with pending key/val tags and
 *         anchors already written to it. QMRK is removed from the state
 *         flags. */
NodeData* Parser::_append_key_val(csubstr val, flag_t val_quoted)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(m_state->node_id));
    // gather the quote flags before consuming the scalar
    type_bits quote_flags = 0;
    if(m_state->flags & QSCL)
    {
        quote_flags |= KEYQUO;
    }
    if(val_quoted)
    {
        quote_flags |= VALQUO;
    }
    csubstr key = _consume_scalar();
    _c4dbgpf("append keyval: '{}' '{}' to parent id={} (level={}){}{}", key, val, m_state->node_id, m_state->level, (quote_flags & KEYQUO) ? " KEYQUO!" : "", (quote_flags & VALQUO) ? " VALQUO!" : "");
    const size_t child = m_tree->append_child(m_state->node_id);
    m_tree->to_keyval(child, key, val, quote_flags);
    _c4dbgpf("append keyval: id={} key='{}' val='{}'", child, m_tree->key(child), m_tree->val(child));
    if( ! m_key_tag.empty())
    {
        _c4dbgpf("append keyval[{}]: set key tag='{}' -> '{}'", child, m_key_tag, normalize_tag(m_key_tag));
        m_tree->set_key_tag(child, normalize_tag(m_key_tag));
        m_key_tag.clear();
    }
    if( ! m_val_tag.empty())
    {
        _c4dbgpf("append keyval[{}]: set val tag='{}' -> '{}'", child, m_val_tag, normalize_tag(m_val_tag));
        m_tree->set_val_tag(child, normalize_tag(m_val_tag));
        m_val_tag.clear();
    }
    _write_key_anchor(child);
    _write_val_anchor(child);
    rem_flags(QMRK);
    return m_tree->get(child);
}
3879

3880

3881
//-----------------------------------------------------------------------------
3882
/** Store a scalar in the current state, to be consumed later.
 *
 * It is an error (RYML_CHECK) to store a scalar while another one is
 * still stored.
 *
 * @param s the scalar to store
 * @param is_quoted any nonzero value marks the scalar as quoted (QSCL) */
void Parser::_store_scalar(csubstr s, flag_t is_quoted)
{
    _c4dbgpf("state[{}]: storing scalar '{}' (flag: {}) (old scalar='{}')",
             m_state-m_stack.begin(), s, m_state->flags & SSCL, m_state->scalar);
    RYML_CHECK(has_none(SSCL));
    // Test truthiness rather than multiplying: `is_quoted * QSCL` is only
    // the QSCL bit when is_quoted is exactly 1, and any other nonzero
    // flag_t would set unrelated bits.
    flag_t new_flags = SSCL;
    if(is_quoted)
        new_flags |= QSCL;
    add_flags(new_flags);
    m_state->scalar = s;
}
3890

3891
/** Take the scalar stored in the current state.
 *
 * It is an error (RYML_CHECK) to call this when no scalar is stored.
 *
 * @return the stored scalar; the state's scalar is cleared and the
 *         SSCL and QSCL flags are removed. */
csubstr Parser::_consume_scalar()
{
    _c4dbgpf("state[{}]: consuming scalar '{}' (flag: {}))", m_state-m_stack.begin(), m_state->scalar, m_state->flags & SSCL);
    RYML_CHECK(m_state->flags & SSCL);
    const csubstr taken = m_state->scalar;
    rem_flags(SSCL | QSCL);
    m_state->scalar.clear();
    return taken;
}
3900

3901
void Parser::_move_scalar_from_top()
3902
{
3903
    if(m_stack.size() < 2) return;
3904
    State &prev = m_stack.top(1);
3905
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state == &m_stack.top());
3906
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state != &prev);
3907
    if(prev.flags & SSCL)
3908
    {
3909
        _c4dbgpf("moving scalar '{}' from state[{}] to state[{}] (overwriting '{}')", prev.scalar, &prev-m_stack.begin(), m_state-m_stack.begin(), m_state->scalar);
3910
        add_flags(prev.flags & (SSCL | QSCL));
3911
        m_state->scalar = prev.scalar;
3912
        rem_flags(SSCL | QSCL, &prev);
3913
        prev.scalar.clear();
3914
    }
3915
}
3916

3917
//-----------------------------------------------------------------------------
3918
/** @todo this function is a monster and needs love. Likely, it needs
3919
 * to be split like _scan_scalar_*() */
3920
bool Parser::_handle_indentation()
3921
{
3922
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW));
3923
    if( ! _at_line_begin())
3924
        return false;
3925

3926
    size_t ind = m_state->line_contents.indentation;
3927
    csubstr rem = m_state->line_contents.rem;
3928
    /** @todo instead of trimming, we should use the indentation index from above */
3929
    csubstr remt = rem.triml(' ');
3930

3931
    if(remt.empty() || remt.begins_with('#')) // this is a blank or comment line
3932
    {
3933
        _line_progressed(rem.size());
3934
        return true;
3935
    }
3936

3937
    _c4dbgpf("indentation? ind={} indref={}", ind, m_state->indref);
3938
    if(ind == m_state->indref)
3939
    {
3940
        _c4dbgpf("same indentation: {}", ind);
3941
        if(!rem.sub(ind).begins_with('-'))
3942
        {
3943
            _c4dbgp("does not begin with -");
3944
            if(has_any(RMAP))
3945
            {
3946
                if(has_all(SSCL|RVAL))
3947
                {
3948
                    _c4dbgp("add with null val");
3949
                    _append_key_val_null(rem.str + ind - 1);
3950
                    addrem_flags(RKEY, RVAL);
3951
                }
3952
            }
3953
            else if(has_any(RSEQ))
3954
            {
3955
                if(m_stack.size() > 2) // do not pop to root level
3956
                {
3957
                    if(has_any(RNXT))
3958
                    {
3959
                        _c4dbgp("end the indentless seq");
3960
                        _pop_level();
3961
                        return true;
3962
                    }
3963
                    else if(has_any(RVAL))
3964
                    {
3965
                        _c4dbgp("add with null val");
3966
                        _append_val_null(rem.str);
3967
                        _c4dbgp("end the indentless seq");
3968
                        _pop_level();
3969
                        return true;
3970
                    }
3971
                }
3972
            }
3973
        }
3974
        _line_progressed(ind);
3975
        return ind > 0;
3976
    }
3977
    else if(ind < m_state->indref)
3978
    {
3979
        _c4dbgpf("smaller indentation ({} < {})!!!", ind, m_state->indref);
3980
        if(has_all(RVAL))
3981
        {
3982
            _c4dbgp("there was an empty val -- appending");
3983
            if(has_all(RMAP))
3984
            {
3985
                _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL));
3986
                _append_key_val_null(rem.sub(ind).str - 1);
3987
            }
3988
            else if(has_all(RSEQ))
3989
            {
3990
                _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(SSCL));
3991
                _append_val_null(rem.sub(ind).str - 1);
3992
            }
3993
        }
3994
        // search the stack frame to jump to based on its indentation
3995
        State const* popto = nullptr;
3996
        _RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.is_contiguous()); // this search relies on the stack being contiguous
3997
        for(State const* s = m_state-1; s >= m_stack.begin(); --s)
3998
        {
3999
            _c4dbgpf("searching for state with indentation {}. curr={} (level={},node={})", ind, s->indref, s->level, s->node_id);
4000
            if(s->indref == ind)
4001
            {
4002
                _c4dbgpf("gotit!!! level={} node={}", s->level, s->node_id);
4003
                popto = s;
4004
                // while it may be tempting to think we're done at this
4005
                // point, we must still determine whether we're jumping to a
4006
                // parent with the same indentation. Consider this case with
4007
                // an indentless sequence:
4008
                //
4009
                // product:
4010
                // - sku: BL394D
4011
                //   quantity: 4
4012
                //   description: Basketball
4013
                //   price: 450.00
4014
                // - sku: BL4438H
4015
                //   quantity: 1
4016
                //   description: Super Hoop
4017
                //   price: 2392.00  # jumping one level here would be wrong.
4018
                // tax: 1234.5       # we must jump two levels
4019
                if(popto > m_stack.begin())
4020
                {
4021
                    auto parent = popto - 1;
4022
                    if(parent->indref == popto->indref)
4023
                    {
4024
                        _c4dbgpf("the parent (level={},node={}) has the same indentation ({}). is this in an indentless sequence?", parent->level, parent->node_id, popto->indref);
4025
                        _c4dbgpf("isseq(popto)={} ismap(parent)={}", m_tree->is_seq(popto->node_id), m_tree->is_map(parent->node_id));
4026
                        if(m_tree->is_seq(popto->node_id) && m_tree->is_map(parent->node_id))
4027
                        {
4028
                            if( ! remt.begins_with('-'))
4029
                            {
4030
                                _c4dbgp("this is an indentless sequence");
4031
                                popto = parent;
4032
                            }
4033
                            else
4034
                            {
4035
                                _c4dbgp("not an indentless sequence");
4036
                            }
4037
                        }
4038
                    }
4039
                }
4040
                break;
4041
            }
4042
        }
4043
        if(!popto || popto >= m_state || popto->level >= m_state->level)
4044
        {
4045
            _c4err("parse error: incorrect indentation?");
4046
        }
4047
        _c4dbgpf("popping {} levels: from level {} to level {}", m_state->level-popto->level, m_state->level, popto->level);
4048
        while(m_state != popto)
4049
        {
4050
            _c4dbgpf("popping level {} (indentation={})", m_state->level, m_state->indref);
4051
            _pop_level();
4052
        }
4053
        _RYML_CB_ASSERT(m_stack.m_callbacks, ind == m_state->indref);
4054
        _line_progressed(ind);
4055
        return true;
4056
    }
4057
    else
4058
    {
4059
        _c4dbgpf("larger indentation ({} > {})!!!", ind, m_state->indref);
4060
        _RYML_CB_ASSERT(m_stack.m_callbacks, ind > m_state->indref);
4061
        if(has_all(RMAP|RVAL))
4062
        {
4063
            if(_is_scalar_next__rmap_val(remt) && (!remt.first_of_any(": ", "? ")) && (!remt.ends_with(":")))
4064
            {
4065
                _c4dbgpf("actually it seems a value: '{}'", remt);
4066
            }
4067
            else
4068
            {
4069
                addrem_flags(RKEY, RVAL);
4070
                _start_unk();
4071
                //_move_scalar_from_top();
4072
                _line_progressed(ind);
4073
                _save_indentation();
4074
                return true;
4075
            }
4076
        }
4077
        else if(has_all(RSEQ|RVAL))
4078
        {
4079
            // nothing to do here
4080
        }
4081
        else
4082
        {
4083
            _c4err("parse error - indentation should not increase at this point");
4084
        }
4085
    }
4086

4087
    return false;
4088
}
4089

4090
//-----------------------------------------------------------------------------
4091
/** Consume the rest of the current line as a comment.
 *
 * The remainder of the line must begin with `#`.
 *
 * @return the comment text, without the `#` and without the spaces that
 *         follow it */
csubstr Parser::_scan_comment()
{
    const csubstr line = m_state->line_contents.rem;
    _RYML_CB_ASSERT(m_stack.m_callbacks, line.begins_with('#'));
    _line_progressed(line.len);
    // drop the # character
    csubstr text = line.sub(1);
    // drop the leading whitespace
    const size_t first_nonspace = text.first_not_of(' ');
    text = text.right_of(first_nonspace, /*include_pos*/true);
    _c4dbgpf("comment was '{}'", text);
    return text;
}
4103

4104
//-----------------------------------------------------------------------------
4105
/** Scan a single-quoted scalar starting at the current position.
 *
 * Leading spaces are skipped and the next character must be the opening
 * quote. The scan then runs line by line until it finds a single quote
 * that is not followed by another one (two single quotes are an escaped
 * quote). Reaching the end of the file first is an error.
 *
 * @return the scalar between the quotes. It is passed through
 *         _filter_squot_scalar() when it has escaped quotes, spans
 *         several lines, or has a line that is blank or begins with a
 *         space at the line's start. */
csubstr Parser::_scan_squot_scalar()
{
    // quoted scalars can spread over multiple lines!
    // nice explanation here: http://yaml-multiline.info/

    // a span to the end of the file
    size_t start = m_state->pos.offset;
    substr s = m_buf.sub(start);
    if(s.begins_with(' '))
    {
        // skip the spaces before the opening quote
        s = s.triml(' ');
        _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.sub(start).is_super(s));
        _RYML_CB_ASSERT(m_stack.m_callbacks, s.begin() >= m_buf.sub(start).begin());
        _line_progressed((size_t)(s.begin() - m_buf.sub(start).begin()));
    }
    start = m_state->pos.offset; // now the offset of the opening quote
    _RYML_CB_ASSERT(m_stack.m_callbacks, s.begins_with('\''));

    // skip the opening quote
    _line_progressed(1);
    s = s.sub(1);

    bool needs_filter = false;

    size_t numlines = 1; // we already have one line
    size_t pos = npos; // find the pos of the matching quote
    while( ! _finished_file())
    {
        const csubstr line = m_state->line_contents.rem;
        bool line_is_blank = true;
        _c4dbgpf("scanning single quoted scalar @ line[{}]: ~~~{}~~~", m_state->pos.line, line);
        for(size_t i = 0; i < line.len; ++i)
        {
            const char c = line.str[i];
            if(c != '\'')
            {
                if(c != ' ')
                    line_is_blank = false;
                continue;
            }
            // single quotes are escaped with two single quotes, so the
            // closing quote is the first one without another after it
            const bool is_escaped = (i+1 < line.len) && (line.str[i+1] == '\'');
            if( ! is_escaped)
            {
                pos = i;
                break;
            }
            needs_filter = true; // needs filter to remove escaped quotes
            ++i; // skip the escaped quote
        }

        // leading whitespace also needs filtering
        if( ! needs_filter)
        {
            needs_filter = (numlines > 1)
                || line_is_blank
                || (_at_line_begin() && line.begins_with(' '));
        }

        if(pos != npos)
        {
            _RYML_CB_ASSERT(m_stack.m_callbacks, pos >= 0 && pos < m_buf.len);
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf[m_state->pos.offset + pos] == '\'');
            _line_progressed(pos + 1); // progress beyond the quote
            pos = m_state->pos.offset - start - 1; // but we stop before it
            break;
        }

        // no closing quote in this line: consume it and go to the next
        _line_progressed(line.len);
        ++numlines;
        _line_ended();
        _scan_line();
    }

    if(pos == npos)
    {
        _c4err("reached end of file while looking for closing quote");
    }
    else
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, pos > 0);
        _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() >= m_buf.begin() && s.end() <= m_buf.end());
        _RYML_CB_ASSERT(m_stack.m_callbacks, s.end() == m_buf.end() || *s.end() == '\'');
        // pos counts from the opening quote, while s begins right after it
        s = s.sub(0, pos-1);
    }

    if( ! needs_filter)
    {
        _c4dbgpf("final scalar: \"{}\"", s);
        return s;
    }

    csubstr filtered = _filter_squot_scalar(s);
    _RYML_CB_ASSERT(m_stack.m_callbacks, filtered.len <= s.len || s.empty() || s.trim(' ').empty());
    _c4dbgpf("final scalar: \"{}\"", filtered);
    return filtered;
}
4207

4208
//-----------------------------------------------------------------------------
4209
csubstr Parser::_scan_dquot_scalar()
{
    // Scans a double-quoted scalar beginning at the current position and
    // returns its contents without the enclosing quotes. The scalar may
    // spread over several lines (overview: http://yaml-multiline.info/).
    // When escapes, several lines, or blank/leading whitespace are seen,
    // the contents are passed through _filter_dquot_scalar().

    // begin with a view reaching to the end of the buffer
    size_t quote_offset = m_state->pos.offset;
    substr scalar = m_buf.sub(quote_offset);
    if(scalar.begins_with(' '))
    {
        // consume the spaces that precede the opening quote
        substr untrimmed = m_buf.sub(quote_offset);
        scalar = scalar.triml(' ');
        _RYML_CB_ASSERT(m_stack.m_callbacks, untrimmed.is_super(scalar));
        _RYML_CB_ASSERT(m_stack.m_callbacks, scalar.begin() >= untrimmed.begin());
        _line_progressed((size_t)(scalar.begin() - untrimmed.begin()));
    }
    quote_offset = m_state->pos.offset; // account for the skipped spaces
    _RYML_CB_ASSERT(m_stack.m_callbacks, scalar.begins_with('"'));

    // step over the opening quote
    _line_progressed(1);
    scalar = scalar.sub(1);

    bool needs_filter = false;
    size_t lines_seen = 1;   // the current line is already counted
    size_t close_pos = npos; // where the matching quote is
    while( ! _finished_file())
    {
        const csubstr rest = m_state->line_contents.rem;
        bool line_is_blank = true;
        _c4dbgpf("scanning double quoted scalar @ line[{}]:  line='{}'", m_state->pos.line, rest);
        for(size_t k = 0; k < rest.len; ++k)
        {
            const char ch = rest.str[k];
            line_is_blank = line_is_blank && (ch == ' ');
            if(ch == '"')
            {
                close_pos = k;
                break;
            }
            if(ch == '\\')
            {
                // every backslash is an escape, and escapes need filtering
                needs_filter = true;
                const char escaped = (k+1 < rest.len) ? rest.str[k+1] : '~';
                if(escaped == '"' || escaped == '\\')
                    ++k; // the escaped char must not be read as a quote/escape
            }
        }

        // multiple lines and blank/leading whitespace need filtering too
        if( ! needs_filter)
        {
            needs_filter = (lines_seen > 1)
                || line_is_blank
                || (_at_line_begin() && rest.begins_with(' '));
        }

        if(close_pos != npos)
        {
            _RYML_CB_ASSERT(m_stack.m_callbacks, close_pos >= 0 && close_pos < m_buf.len);
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf[m_state->pos.offset + close_pos] == '"');
            _line_progressed(close_pos + 1); // go beyond the closing quote...
            close_pos = m_state->pos.offset - quote_offset - 1; // ...but remember where it sits
            break;
        }

        // no closing quote on this line: consume it and read the next one
        _line_progressed(rest.len);
        ++lines_seen;
        _line_ended();
        _scan_line();
    }

    if(close_pos == npos)
    {
        _c4err("reached end of file looking for closing quote");
    }
    else
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, close_pos > 0);
        _RYML_CB_ASSERT(m_stack.m_callbacks, scalar.end() == m_buf.end() || *scalar.end() == '"');
        _RYML_CB_ASSERT(m_stack.m_callbacks, scalar.end() >= m_buf.begin() && scalar.end() <= m_buf.end());
        // cut the view so that it ends right before the closing quote
        scalar = scalar.sub(0, close_pos-1);
    }

    if(needs_filter)
    {
        csubstr filtered = _filter_dquot_scalar(scalar);
        _c4dbgpf("final scalar: [{}]\"{}\"", filtered.len, filtered);
        _RYML_CB_ASSERT(m_stack.m_callbacks, filtered.len <= scalar.len || scalar.empty() || scalar.trim(' ').empty());
        return filtered;
    }

    _c4dbgpf("final scalar: \"{}\"", scalar);

    return scalar;
}
4308

4309
//-----------------------------------------------------------------------------
4310
csubstr Parser::_scan_block()
{
    // Scans a block scalar (literal '|' or folded '>'): reads the header
    // spec (chomping indicator and optional indentation digits), collects
    // the raw lines of the block, and returns them after filtering with
    // _filter_block_scalar().
    // nice explanation here: http://yaml-multiline.info/
    csubstr header = m_state->line_contents.rem;
    {
        const csubstr unindented = header.triml(' ');
        if(unindented.str > header.str)
        {
            _c4dbgp("skipping whitespace");
            _RYML_CB_ASSERT(m_stack.m_callbacks, unindented.str >= header.str);
            _line_progressed(static_cast<size_t>(unindented.str - header.str));
            header = unindented;
        }
    }
    _RYML_CB_ASSERT(m_stack.m_callbacks, header.begins_with('|') || header.begins_with('>'));

    _c4dbgpf("scanning block: specs=\"{}\"", header);

    // parse the spec
    const BlockStyle_e style = header.begins_with('>') ? BLOCK_FOLD : BLOCK_LITERAL;
    BlockChomp_e chomp = CHOMP_CLIP; // clip, unless + or - is given
    size_t indentation = npos;       // must be discovered when the spec has no digits
    csubstr digits;
    if(header.len > 1)
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, header.begins_with_any("|>"));
        csubstr tail = header.sub(1);
        _c4dbgpf("scanning block: spec is multichar: '{}'", tail);
        _RYML_CB_ASSERT(m_stack.m_callbacks, tail.len >= 1);
        const size_t chomp_at = tail.first_of("-+");
        _c4dbgpf("scanning block: spec chomp char at {}", chomp_at);
        if(chomp_at != npos)
        {
            const char indicator = tail[chomp_at];
            if(indicator == '-')
                chomp = CHOMP_STRIP;
            else if(indicator == '+')
                chomp = CHOMP_KEEP;
            // keep only the side of the chomp indicator that may hold digits
            tail = (chomp_at == 0) ? tail.sub(1) : tail.first(chomp_at);
        }
        // from here to the end, only digits are considered
        digits = tail.left_of(tail.first_not_of("0123456789"));
        if( ! digits.empty())
        {
            if( ! c4::atou(digits, &indentation))
                _c4err("parse error: could not read decimal");
            _c4dbgpf("scanning block: indentation specified: {}. add {} from curr state -> {}", indentation, m_state->indref, indentation+m_state->indref);
            indentation += m_state->indref;
        }
    }

    // finish the current line
    _line_progressed(header.len);
    _line_ended();
    _scan_line();

    _c4dbgpf("scanning block: style={}  chomp={}  indentation={}", style==BLOCK_FOLD ? "fold" : "literal", chomp==CHOMP_CLIP ? "clip" : (chomp==CHOMP_STRIP ? "strip" : "keep"), indentation);

    // the block starts out empty, already pointing at the right place
    substr raw_block(m_buf.data() + m_state->pos.offset, size_t(0));// m_state->line_contents.full.sub(0, 0);
    _RYML_CB_ASSERT(m_stack.m_callbacks, raw_block.begin() == m_state->line_contents.full.begin());

    // Append every full line of the block to raw_block; newlines are
    // stripped later as needed.
    //
    // Without an explicit indentation indicator, the indentation is
    // taken from the first non-empty line. See
    // https://yaml.org/spec/1.2.2/#8111-block-indentation-indicator
    size_t lines_taken = 0;
    size_t first_line = m_state->pos.line;
    size_t prov_indentation = npos;
    LineContents peek;
    while( ! _finished_file())
    {
        // look at the next line without advancing yet
        peek.reset_with_next_line(m_buf, m_state->pos.offset);
        _c4dbgpf("scanning block: peeking at '{}'", peek.stripped);
        // evaluate termination conditions
        if(indentation != npos)
        {
            // a non-empty line with less indentation ends the block
            if(peek.indentation < indentation && ( ! peek.rem.trim(" \t\r\n").empty()))
            {
                if( ! raw_block.len)
                {
                    _c4err("indentation decreased without any scalar");
                }
                else
                {
                    _c4dbgpf("scanning block: indentation decreased ref={} thisline={}", indentation, peek.indentation);
                }
                break;
            }
            // at zero indentation, document markers end the block
            if(indentation == 0)
            {
                const bool at_doc_marker =
                    peek.rem == "..." || peek.rem.begins_with("... ")
                    || peek.rem == "---" || peek.rem.begins_with("--- ");
                if(at_doc_marker)
                {
                    _c4dbgp("scanning block: stop. indentation=0 and stream ended");
                    break;
                }
            }
        }
        else
        {
            const size_t first_nonws = peek.stripped.first_not_of(' ');
            _c4dbgpf("scanning block: indentation ref not set. firstnonws={}", first_nonws);
            if(first_nonws != npos) // non-empty line
            {
                _c4dbgpf("scanning block: line not empty. indref={} indprov={} indentation={}", m_state->indref, prov_indentation, peek.indentation);
                if(prov_indentation != npos)
                {
                    // a line indented less than the provisional reference
                    // terminates the block (the original carries a
                    // commented-out parse error for this case)
                    if(peek.indentation < prov_indentation)
                        break;
                    _c4dbgpf("scanning block: set indentation ref from provisional indentation: provisional_ref={}, thisline={}", prov_indentation, peek.indentation);
                    indentation = peek.indentation;
                }
                else
                {
                    if(peek.indentation < m_state->indref)
                    {
                        _c4dbgpf("scanning block: block terminated indentation={} < indref={}", peek.indentation, m_state->indref);
                        if(raw_block.len == 0)
                        {
                            _c4dbgp("scanning block: was empty, undo next line");
                            _line_ended_undo();
                        }
                        break;
                    }
                    if(peek.indentation == m_state->indref && has_any(RSEQ|RMAP))
                    {
                        _c4dbgpf("scanning block: block terminated. reading container and indentation={}==indref={}", peek.indentation, m_state->indref);
                        break;
                    }
                    _c4dbgpf("scanning block: set indentation ref from this line: ref={}", peek.indentation);
                    indentation = peek.indentation;
                }
            }
            else // empty line
            {
                _c4dbgpf("scanning block: line empty or {} spaces. line_indentation={} prov_indentation={}", peek.stripped.len, peek.indentation, prov_indentation);
                if(prov_indentation == npos)
                {
                    prov_indentation = (peek.indentation == 0) ? has_any(RSEQ|RVAL) : peek.indentation;
                    _c4dbgpf("scanning block: initialize provisional_ref={}", prov_indentation);
                    if(prov_indentation == npos)
                    {
                        prov_indentation = (peek.stripped.len == 0) ? has_any(RSEQ|RVAL) : peek.stripped.len;
                        _c4dbgpf("scanning block: initialize provisional_ref={}", prov_indentation);
                    }
                }
                else
                {
                    if(peek.stripped.len >= prov_indentation)
                    {
                        _c4dbgpf("scanning block: increase provisional_ref {} -> {}", prov_indentation, peek.stripped.len);
                        prov_indentation = peek.stripped.len;
                    }
                    #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
                    else if(peek.indentation >= prov_indentation && peek.indentation != npos)
                    {
                        _c4dbgpf("scanning block: increase provisional_ref {} -> {}", prov_indentation, peek.indentation);
                        prov_indentation = peek.indentation;
                    }
                    #endif
                }
            }
        }
        // the block continues on this line, so advance now
        m_state->line_contents = peek;
        _c4dbgpf("scanning block: append '{}'", m_state->line_contents.rem);
        raw_block.len += m_state->line_contents.full.len;
        _line_progressed(m_state->line_contents.rem.len);
        _line_ended();
        ++lines_taken;
    }
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.line == (first_line + lines_taken) || (raw_block.len == 0));
    C4_UNUSED(lines_taken);
    C4_UNUSED(first_line);

    if(indentation == npos)
    {
        _c4dbgpf("scanning block: set indentation from provisional: {}", prov_indentation);
        indentation = prov_indentation;
    }

    if(lines_taken)
        _line_ended_undo();

    _c4dbgpf("scanning block: raw=~~~{}~~~", raw_block);

    // ok! now the newlines and spaces are stripped according to the specs
    header = _filter_block_scalar(raw_block, style, chomp, indentation);

    _c4dbgpf("scanning block: final=~~~{}~~~", header);

    return header;
}
4516

4517

4518
//-----------------------------------------------------------------------------
4519

4520
template<bool backslash_is_escape, bool keep_trailing_whitespace>
4521
bool Parser::_filter_nl(substr r, size_t *C4_RESTRICT i, size_t *C4_RESTRICT pos, size_t indentation)
4522
{
4523
    // a debugging scaffold:
4524
    #if 0
4525
    #define _c4dbgfnl(fmt, ...) _c4dbgpf("filter_nl[{}]: " fmt, *i, __VA_ARGS__)
4526
    #else
4527
    #define _c4dbgfnl(...)
4528
    #endif
4529

4530
    const char curr = r[*i];
4531
    bool replaced = false;
4532

4533
    _RYML_CB_ASSERT(m_stack.m_callbacks, indentation != npos);
4534
    _RYML_CB_ASSERT(m_stack.m_callbacks, curr == '\n');
4535

4536
    _c4dbgfnl("found newline. sofar=[{}]~~~{}~~~", *pos, m_filter_arena.first(*pos));
4537
    size_t ii = *i;
4538
    size_t numnl_following = count_following_newlines(r, &ii, indentation);
4539
    if(numnl_following)
4540
    {
4541
        _c4dbgfnl("{} consecutive (empty) lines {} in the middle. totalws={}", 1+numnl_following, ii < r.len ? "in the middle" : "at the end", ii - *i);
4542
        for(size_t j = 0; j < numnl_following; ++j)
4543
            m_filter_arena.str[(*pos)++] = '\n';
4544
    }
4545
    else
4546
    {
4547
        if(r.first_not_of(" \t", *i+1) != npos)
4548
        {
4549
            m_filter_arena.str[(*pos)++] = ' ';
4550
            _c4dbgfnl("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, r.len, *pos, m_filter_arena.first(*pos));
4551
            replaced = true;
4552
        }
4553
        else
4554
        {
4555
            if C4_IF_CONSTEXPR (keep_trailing_whitespace)
4556
            {
4557
                m_filter_arena.str[(*pos)++] = ' ';
4558
                _c4dbgfnl("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, r.len, *pos, m_filter_arena.first(*pos));
4559
                replaced = true;
4560
            }
4561
            else
4562
            {
4563
                _c4dbgfnl("last newline, everything else is whitespace. ii={}/{}", ii, r.len);
4564
                *i = r.len;
4565
            }
4566
        }
4567
        if C4_IF_CONSTEXPR (backslash_is_escape)
4568
        {
4569
            if(ii < r.len && r.str[ii] == '\\')
4570
            {
4571
                const char next = ii+1 < r.len ? r.str[ii+1] : '\0';
4572
                if(next == ' ' || next == '\t')
4573
                {
4574
                    _c4dbgfnl("extend skip to backslash{}", "");
4575
                    ++ii;
4576
                }
4577
            }
4578
        }
4579
    }
4580
    *i = ii - 1; // correct for the loop increment
4581

4582
    #undef _c4dbgfnl
4583

4584
    return replaced;
4585
}
4586

4587

4588
//-----------------------------------------------------------------------------
4589

4590
template<bool keep_trailing_whitespace>
4591
void Parser::_filter_ws(substr r, size_t *C4_RESTRICT i, size_t *C4_RESTRICT pos)
4592
{
4593
    // a debugging scaffold:
4594
    #if 0
4595
    #define _c4dbgfws(fmt, ...) _c4dbgpf("filt_nl[{}]: " fmt, *i, __VA_ARGS__)
4596
    #else
4597
    #define _c4dbgfws(...)
4598
    #endif
4599

4600
    const char curr = r[*i];
4601
    _c4dbgfws("found whitespace '{}'", _c4prc(curr));
4602
    _RYML_CB_ASSERT(m_stack.m_callbacks, curr == ' ' || curr == '\t');
4603

4604
    size_t first = *i > 0 ? r.first_not_of(" \t", *i) : r.first_not_of(' ', *i);
4605
    if(first != npos)
4606
    {
4607
        if(r[first] == '\n' || r[first] == '\r') // skip trailing whitespace
4608
        {
4609
            _c4dbgfws("whitespace is trailing on line. firstnonws='{}'@{}", _c4prc(r[first]), first);
4610
            *i = first - 1; // correct for the loop increment
4611
        }
4612
        else // a legit whitespace
4613
        {
4614
            m_filter_arena.str[(*pos)++] = curr;
4615
            _c4dbgfws("legit whitespace. sofar=[{}]~~~{}~~~", *pos, m_filter_arena.first(*pos));
4616
        }
4617
    }
4618
    else
4619
    {
4620
        _c4dbgfws("... everything else is trailing whitespace{}", "");
4621
        if C4_IF_CONSTEXPR (keep_trailing_whitespace)
4622
            for(size_t j = *i; j < r.len; ++j)
4623
                m_filter_arena.str[(*pos)++] = r[j];
4624
        *i = r.len;
4625
    }
4626

4627
    #undef _c4dbgfws
4628
}
4629

4630

4631
//-----------------------------------------------------------------------------
4632
/** Filter a plain (unquoted) scalar: leading spaces/tabs are trimmed,
 * whitespace and newlines are processed by _filter_ws() / _filter_nl(),
 * and carriage returns are dropped.
 *
 * @param s the raw scalar
 * @param indentation forwarded to _filter_nl()
 * @return the filtered scalar; when anything was filtered this is the
 *         result of _finish_filter_arena() */
csubstr Parser::_filter_plain_scalar(substr s, size_t indentation)
{
    // a debugging scaffold:
    #if 0
    #define _c4dbgfps(...) _c4dbgpf("filt_plain_scalar" __VA_ARGS__)
    #else
    #define _c4dbgfps(...)
    #endif

    _c4dbgfps("before=~~~{}~~~", s);

    substr r = s.triml(" \t");
    _grow_filter_arena(r.len);
    size_t pos = 0; // the filtered size
    bool filtered_chars = false;
    for(size_t i = 0; i < r.len; ++i)
    {
        const char curr = r.str[i];
        _c4dbgfps("[{}]: '{}'", i, _c4prc(curr));
        switch(curr)
        {
        case ' ':
        case '\t':
            _filter_ws</*keep_trailing_ws*/false>(r, &i, &pos);
            break;
        case '\n':
            // accumulate: a later newline must not clear a replacement
            // that an earlier one reported
            if(_filter_nl</*backslash_is_escape*/false, /*keep_trailing_ws*/false>(r, &i, &pos, indentation))
                filtered_chars = true;
            break;
        case '\r': // skip \r --- https://stackoverflow.com/questions/1885900
            break;
        default:
            m_filter_arena.str[pos++] = curr;
            break;
        }
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
    if(pos < r.len || filtered_chars)
    {
        r = _finish_filter_arena(r, pos);
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len);
    _c4dbgfps("#filteredchars={} after=~~~{}~~~", s.len - r.len, r);

    #undef _c4dbgfps
    return r;
}
4681

4682

4683
//-----------------------------------------------------------------------------
4684
/** Filter the contents of a single-quoted scalar: whitespace and newlines
 * are processed by _filter_ws() / _filter_nl(), carriage returns are
 * dropped, and each pair of consecutive single quotes becomes one quote.
 *
 * @param s the scalar contents, without the enclosing quotes
 * @return the filtered scalar; when anything was filtered this is the
 *         result of _finish_filter_arena() */
csubstr Parser::_filter_squot_scalar(substr s)
{
    // a debugging scaffold:
    #if 0
    #define _c4dbgfsq(...) _c4dbgpf("filt_squo_scalar" __VA_ARGS__)
    #else
    #define _c4dbgfsq(...)
    #endif

    // from the YAML spec for single-quoted scalars:
    // https://yaml.org/spec/1.2-old/spec.html#style/flow/single-quoted

    _c4dbgfsq(": before=~~~{}~~~", s);

    _grow_filter_arena(s.len);
    substr r = s;
    size_t pos = 0; // the filtered size
    bool filtered_chars = false;
    for(size_t i = 0; i < r.len; ++i)
    {
        const char curr = r[i];
        _c4dbgfsq("[{}]: '{}'", i, _c4prc(curr));
        switch(curr)
        {
        case ' ':
        case '\t':
            _filter_ws</*keep_trailing_ws*/true>(r, &i, &pos);
            break;
        case '\n':
            // accumulate: a later newline must not clear the flag set by
            // an earlier newline or by an escaped quote
            if(_filter_nl</*backslash_is_escape*/false, /*keep_trailing_ws*/true>(r, &i, &pos, /*indentation*/0))
                filtered_chars = true;
            break;
        case '\r': // skip \r --- https://stackoverflow.com/questions/1885900
            break;
        case '\'':
        {
            const char next = i+1 < r.len ? r[i+1] : '\0';
            if(next == '\'')
            {
                _c4dbgfsq("[{}]: two consecutive quotes", i);
                filtered_chars = true;
                m_filter_arena.str[pos++] = '\'';
                ++i; // skip the second quote of the pair
            }
            // a quote that is not doubled is not copied to the output
            break;
        }
        default:
            m_filter_arena.str[pos++] = curr;
            break;
        }
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
    if(pos < r.len || filtered_chars)
    {
        r = _finish_filter_arena(r, pos);
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len);
    _c4dbgpf(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r);

    #undef _c4dbgfsq
    return r;
}
4747

4748

4749
//-----------------------------------------------------------------------------
4750
/** Filter the contents of a double-quoted scalar: whitespace and newlines
 * are processed by _filter_ws() / _filter_nl(), carriage returns are
 * dropped, and backslash escapes are replaced with the characters they
 * stand for. A backslash followed by an unrecognized character is
 * dropped and the character is then copied as-is.
 *
 * @param s the scalar contents, without the enclosing quotes
 * @return the filtered scalar; when anything was filtered this is the
 *         result of _finish_filter_arena() */
csubstr Parser::_filter_dquot_scalar(substr s)
{
    // a debugging scaffold:
    #if 0
    #define _c4dbgfdq(...) _c4dbgpf("filt_dquo_scalar" __VA_ARGS__)
    #else
    #define _c4dbgfdq(...)
    #endif

    _c4dbgfdq(": before=~~~{}~~~", s);

    // from the YAML spec for double-quoted scalars:
    // https://yaml.org/spec/1.2-old/spec.html#style/flow/double-quoted
    //
    // All leading and trailing white space characters are excluded
    // from the content. Each continuation line must therefore contain
    // at least one non-space character. Empty lines, if any, are
    // consumed as part of the line folding.

    // some escapes emit more bytes than they consume, so reserve extra
    // room per backslash
    // NOTE(review): the asserts below still expect the result not to be
    // longer than the input -- confirm how _finish_filter_arena() treats
    // pos > r.len.
    _grow_filter_arena(s.len + 2u * s.count('\\'));
    substr r = s;
    size_t pos = 0; // the filtered size
    bool filtered_chars = false;
    for(size_t i = 0; i < r.len; ++i)
    {
        const char curr = r[i];
        _c4dbgfdq("[{}]: '{}'", i, _c4prc(curr));
        if(curr == ' ' || curr == '\t')
        {
            _filter_ws</*keep_trailing_ws*/true>(r, &i, &pos);
        }
        else if(curr == '\n')
        {
            // Accumulate instead of assigning: _filter_nl() returns false
            // for runs of newlines, and assigning would clear the flag set
            // by an earlier escape. With an escape that grows the output
            // (eg \L) and a fold that shrinks it by the same amount, pos
            // equals r.len and the filtered result would be discarded.
            if(_filter_nl</*backslash_is_escape*/true, /*keep_trailing_ws*/true>(r, &i, &pos, /*indentation*/0))
                filtered_chars = true;
        }
        else if(curr == '\r')  // skip \r --- https://stackoverflow.com/questions/1885900
        {
            ;
        }
        else if(curr == '\\')
        {
            char next = i+1 < r.len ? r[i+1] : '\0';
            _c4dbgfdq("[{}]: backslash, next='{}'", i, _c4prc(next));
            filtered_chars = true;
            if(next == '\r')
            {
                if(i+2 < r.len && r[i+2] == '\n')
                {
                    ++i; // newline escaped with \ -- skip both (add only one as i is loop-incremented)
                    next = '\n';
                    _c4dbgfdq("[{}]: was \\r\\n, now next='\\n'", i);
                }
            }
            // remember the loop will also increment i
            switch(next)
            {
            case '\n': // escaped line break: drop it and the following leading whitespace
            {
                size_t ii = i + 2;
                while(ii < r.len && (r.str[ii] == ' ' || r.str[ii] == '\t'))
                    ++ii;
                i = ii - 1;
                break;
            }
            case '"':
            case '/':
            case ' ':
            case '\t': // escapes for json compatibility
                m_filter_arena.str[pos++] = next;
                ++i;
                break;
            case '\r': // \r not followed by \n: only the backslash is dropped here
                break;
            case 'n':
                m_filter_arena.str[pos++] = '\n';
                ++i;
                break;
            case 'r':
                m_filter_arena.str[pos++] = '\r';
                ++i;
                break;
            case 't':
                m_filter_arena.str[pos++] = '\t';
                ++i;
                break;
            case '\\':
                m_filter_arena.str[pos++] = '\\';
                ++i;
                break;
            case 'x': // UTF8
            {
                if(i + 1u + 2u >= r.len)
                    _c4err("\\x requires 2 hex digits");
                uint8_t byteval = {};
                if(!read_hex(r.sub(i + 2u, 2u), &byteval))
                    _c4err("failed to read \\x codepoint");
                m_filter_arena.str[pos++] = *(char*)&byteval;
                i += 1u + 2u;
                break;
            }
            case 'u': // UTF16
            {
                if(i + 1u + 4u >= r.len)
                    _c4err("\\u requires 4 hex digits");
                char readbuf[8];
                csubstr codepoint = r.sub(i + 2u, 4u);
                uint32_t codepoint_val = {};
                if(!read_hex(codepoint, &codepoint_val))
                    _c4err("failed to parse \\u codepoint");
                size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val);
                C4_ASSERT(numbytes <= 4);
                memcpy(m_filter_arena.str + pos, readbuf, numbytes);
                pos += numbytes;
                i += 1u + 4u;
                break;
            }
            case 'U': // UTF32
            {
                if(i + 1u + 8u >= r.len)
                    _c4err("\\U requires 8 hex digits");
                char readbuf[8];
                csubstr codepoint = r.sub(i + 2u, 8u);
                uint32_t codepoint_val = {};
                if(!read_hex(codepoint, &codepoint_val))
                    _c4err("failed to parse \\U codepoint");
                size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val);
                C4_ASSERT(numbytes <= 4);
                memcpy(m_filter_arena.str + pos, readbuf, numbytes);
                pos += numbytes;
                i += 1u + 8u;
                break;
            }
            // https://yaml.org/spec/1.2.2/#rule-c-ns-esc-char
            case '0':
                m_filter_arena.str[pos++] = '\0';
                ++i;
                break;
            case 'b': // backspace
                m_filter_arena.str[pos++] = '\b';
                ++i;
                break;
            case 'f': // form feed
                m_filter_arena.str[pos++] = '\f';
                ++i;
                break;
            case 'a': // bell character
                m_filter_arena.str[pos++] = '\a';
                ++i;
                break;
            case 'v': // vertical tab
                m_filter_arena.str[pos++] = '\v';
                ++i;
                break;
            case 'e': // escape character
                m_filter_arena.str[pos++] = '\x1b';
                ++i;
                break;
            case '_': // unicode non breaking space \u00a0
                // https://www.compart.com/en/unicode/U+00a0
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x3e, 0xc2);
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x60, 0xa0);
                ++i;
                break;
            case 'N': // unicode next line \u0085
                // https://www.compart.com/en/unicode/U+0085
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x3e, 0xc2);
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x7b, 0x85);
                ++i;
                break;
            case 'L': // unicode line separator \u2028
                // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x1e, 0xe2);
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x80, 0x80);
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x58, 0xa8);
                ++i;
                break;
            case 'P': // unicode paragraph separator \u2029
                // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x1e, 0xe2);
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x80, 0x80);
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x57, 0xa9);
                ++i;
                break;
            default: // unrecognized escape: the backslash alone is dropped
                break;
            }
            _c4dbgfdq("[{}]: backslash...sofar=[{}]~~~{}~~~", i, pos, m_filter_arena.first(pos));
        }
        else
        {
            m_filter_arena.str[pos++] = curr;
        }
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
    if(pos < r.len || filtered_chars)
    {
        r = _finish_filter_arena(r, pos);
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len);
    _c4dbgpf(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r);

    #undef _c4dbgfdq

    return r;
}
4967

4968

4969
//-----------------------------------------------------------------------------
4970
bool Parser::_apply_chomp(substr buf, size_t *C4_RESTRICT pos, BlockChomp_e chomp)
{
    // Adjusts *pos (the filtered length) according to the chomping style,
    // looking at the newlines that end buf.first(*pos). Returns true when
    // the content had no trailing newline and the style is KEEP or CLIP.

    // length of the content once its trailing newlines are removed
    const size_t len_without_nl = buf.first(*pos).trimr('\n').len;
    const bool lacks_trailing_nl = (len_without_nl == *pos);
    switch(chomp)
    {
    case CHOMP_KEEP:
        if(lacks_trailing_nl)
        {
            // only reported: the write of the newline is disabled here
            _c4dbgpf("chomp=KEEP: add missing newline @{}", *pos);
            //m_filter_arena.str[(*pos)++] = '\n';
            return true;
        }
        return false;
    case CHOMP_CLIP:
        if(lacks_trailing_nl)
        {
            _c4dbgpf("chomp=CLIP: add missing newline @{}", *pos);
            m_filter_arena.str[(*pos)++] = '\n';
            return true;
        }
        // leave exactly one of the existing trailing newlines
        _c4dbgpf("chomp=CLIP: include single trailing newline @{}", len_without_nl+1);
        *pos = len_without_nl + 1;
        return false;
    case CHOMP_STRIP:
        _c4dbgpf("chomp=STRIP: strip {}-{}-{} newlines", *pos, len_without_nl, *pos-len_without_nl);
        *pos = len_without_nl;
        return false;
    default:
        _c4err("unknown chomp style");
    }
    return false;
}
5006

5007

5008
//-----------------------------------------------------------------------------
5009
/** Filter a block scalar in place.
 *
 * The filtered characters are first written to the filter arena
 * (m_filter_arena) and then copied back over the beginning of @p s with
 * _finish_filter_arena(). Carriage returns are dropped, up to
 * @p indentation leading spaces are removed from each line, and the
 * trailing newlines are adjusted with _apply_chomp().
 *
 * @param s the raw scalar; it is overwritten with the filtered result
 * @param style BLOCK_LITERAL keeps the line breaks; BLOCK_FOLD folds
 *        them. Any other value is reported through _c4err().
 * @param chomp the chomping rule forwarded to _apply_chomp()
 * @param indentation the number of leading spaces to remove per line
 * @return the filtered scalar, which is a view into @p s */
csubstr Parser::_filter_block_scalar(substr s, BlockStyle_e style, BlockChomp_e chomp, size_t indentation)
5010
{
5011
    // a debugging scaffold:
5012
    #if 0
5013
    #define _c4dbgfbl(fmt, ...) _c4dbgpf("filt_block" fmt, __VA_ARGS__)
5014
    #else
5015
    #define _c4dbgfbl(...)
5016
    #endif
5017

5018
    _c4dbgfbl(": indentation={} before=[{}]~~~{}~~~", indentation, s.len, s);
5019

    // a scalar made only of spaces, newlines and carriage returns is empty,
    // unless the trailing newlines are to be kept.
5020
    if(chomp != CHOMP_KEEP && s.trim(" \n\r").len == 0u)
5021
    {
5022
        _c4dbgp("filt_block: empty scalar");
5023
        return s.first(0);
5024
    }
5025

5026
    substr r = s;
5027

5028
    switch(style)
5029
    {
5030
    case BLOCK_LITERAL:
5031
        {
5032
            _c4dbgp("filt_block: style=literal");
5033
            // trim leading whitespace up to indentation
5034
            {
5035
                size_t numws = r.first_not_of(' ');
5036
                if(numws != npos)
5037
                {
5038
                    if(numws > indentation)
5039
                        r = r.sub(indentation);
5040
                    else
5041
                        r = r.sub(numws);
5042
                    _c4dbgfbl(": after triml=[{}]~~~{}~~~", r.len, r);
5043
                }
5044
                else
5045
                {
                    // the scalar consists only of spaces (or is empty)
5046
                    if(chomp != CHOMP_KEEP || r.len == 0)
5047
                    {
5048
                        _c4dbgfbl(": all spaces {}, return empty", r.len);
5049
                        return r.first(0);
5050
                    }
5051
                    else
5052
                    {
                        // with CHOMP_KEEP the result is a single newline
5053
                        r[0] = '\n';
5054
                        return r.first(1);
5055
                    }
5056
                }
5057
            }
5058
            _grow_filter_arena(s.len + 2u);  // use s.len! because we may need to add a newline at the end, so the leading indentation will allow space for that newline
5059
            size_t pos = 0; // the filtered size
5060
            for(size_t i = 0; i < r.len; ++i)
5061
            {
5062
                const char curr = r.str[i];
5063
                _c4dbgfbl("[{}]='{}'  pos={}", i, _c4prc(curr), pos);
                // carriage returns are never copied to the output
5064
                if(curr == '\r')
5065
                    continue;
5066
                m_filter_arena.str[pos++] = curr;
5067
                if(curr == '\n')
5068
                {
5069
                    _c4dbgfbl("[{}]: found newline", i);
5070
                    // skip indentation on the next line
5071
                    csubstr rem = r.sub(i+1);
5072
                    size_t first = rem.first_not_of(' ');
5073
                    if(first != npos)
5074
                    {
5075
                        _RYML_CB_ASSERT(m_stack.m_callbacks, first < rem.len);
5076
                        _RYML_CB_ASSERT(m_stack.m_callbacks, i+1+first < r.len);
5077
                        _c4dbgfbl("[{}]: {} spaces follow before next nonws character @ [{}]='{}'", i, first, i+1+first, rem.str[first]);
5078
                        if(first < indentation)
5079
                        {
5080
                            _c4dbgfbl("[{}]: skip {}<{} spaces from indentation", i, first, indentation);
5081
                            i += first;
5082
                        }
5083
                        else
5084
                        {
5085
                            _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
5086
                            i += indentation;
5087
                        }
5088
                    }
5089
                    else
5090
                    {
                        // only spaces (or nothing) remain after this newline
5091
                        _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 <= r.len);
5092
                        first = rem.len;
5093
                        _c4dbgfbl("[{}]: {} spaces to the end", i, first);
5094
                        if(first)
5095
                        {
5096
                            if(first < indentation)
5097
                            {
5098
                                _c4dbgfbl("[{}]: skip everything", i);
                                // undo the newline that was just written
5099
                                --pos;
5100
                                break;
5101
                            }
5102
                            else
5103
                            {
5104
                                _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
5105
                                i += indentation;
5106
                            }
5107
                        }
5108
                        else if(i+1 == r.len)
5109
                        {
                            // this newline is the last character of the scalar
5110
                            if(chomp == CHOMP_STRIP)
5111
                                --pos;
5112
                            break;
5113
                        }
5114
                    }
5115
                }
5116
            }
5117
            _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= pos);
5118
            _c4dbgfbl(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r);
            // adjust the trailing newlines; pos is updated in place
5119
            bool changed = _apply_chomp(m_filter_arena, &pos, chomp);
5120
            _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
5121
            _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= s.len);
5122
            if(pos < r.len || changed)
5123
            {
5124
                r = _finish_filter_arena(s, pos); // write into s
5125
            }
5126
            break;
5127
        }
5128
    case BLOCK_FOLD:
5129
        {
5130
            _c4dbgp("filt_block: style=fold");
5131
            _grow_filter_arena(r.len + 2);
5132
            size_t pos = 0; // the filtered size
5133
            bool filtered_chars = false; // set once a newline has been processed
5134
            bool started = false; // set once a character other than newline, CR or tab has been copied
5135
            bool is_indented = false; // whether the current line is indented beyond `indentation`
5136
            size_t i = r.first_not_of(' ');
5137
            _c4dbgfbl(": first non space at {}", i);
5138
            if(i > indentation)
5139
            {
5140
                is_indented = true;
5141
                i = indentation;
5142
            }
5143
            _c4dbgfbl(": start folding at {}, is_indented={}", i, (int)is_indented);
            // Used when the indentation changes between lines: writes
            // 1+numnl_following newlines, then copies the characters of
            // the new line from past the reference indentation up to (not
            // including) first_non_whitespace, skipping '\r'. It leaves i
            // at first_non_whitespace-1 so that the ++i of the outer loop
            // lands on first_non_whitespace.
5144
            auto on_change_indentation = [&](size_t numnl_following, size_t last_newl, size_t first_non_whitespace){
5145
                _c4dbgfbl("[{}]: add 1+{} newlines", i, numnl_following);
5146
                for(size_t j = 0; j < 1 + numnl_following; ++j)
5147
                    m_filter_arena.str[pos++] = '\n';
5148
                for(i = last_newl + 1 + indentation; i < first_non_whitespace; ++i)
5149
                {
5150
                    if(r.str[i] == '\r')
5151
                        continue;
5152
                    _c4dbgfbl("[{}]: add '{}'", i, _c4prc(r.str[i]));
5153
                    m_filter_arena.str[pos++] = r.str[i];
5154
                }
5155
                --i;
5156
            };
5157
            for( ; i < r.len; ++i)
5158
            {
5159
                const char curr = r.str[i];
5160
                _c4dbgfbl("[{}]='{}'", i, _c4prc(curr));
5161
                if(curr == '\n')
5162
                {
5163
                    filtered_chars = true;
5164
                    // skip indentation on the next line, and advance over the next non-indented blank lines as well
5165
                    size_t first_non_whitespace;
                    // starts at -1 so that the first increment below brings it to 0
5166
                    size_t numnl_following = (size_t)-1;
5167
                    while(r[i] == '\n')
5168
                    {
5169
                        ++numnl_following;
5170
                        csubstr rem = r.sub(i+1);
5171
                        size_t first = rem.first_not_of(' ');
5172
                        _c4dbgfbl("[{}]: found newline. first={} rem.len={}", i, first, rem.len);
5173
                        if(first != npos)
5174
                        {
5175
                            first_non_whitespace = first + i+1;
5176
                            while(first_non_whitespace < r.len && r[first_non_whitespace] == '\r')
5177
                                ++first_non_whitespace;
5178
                            _RYML_CB_ASSERT(m_stack.m_callbacks, first < rem.len);
5179
                            _RYML_CB_ASSERT(m_stack.m_callbacks, i+1+first < r.len);
5180
                            _c4dbgfbl("[{}]: {} spaces follow before next nonws character @ [{}]='{}'", i, first, i+1+first, _c4prc(rem.str[first]));
5181
                            if(first < indentation)
5182
                            {
5183
                                _c4dbgfbl("[{}]: skip {}<{} spaces from indentation", i, first, indentation);
5184
                                i += first;
5185
                            }
5186
                            else
5187
                            {
5188
                                _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
5189
                                i += indentation;
5190
                                if(first > indentation)
5191
                                {
5192
                                    _c4dbgfbl("[{}]: {} further indented than {}, stop newlining", i, first, indentation);
5193
                                    goto finished_counting_newlines;
5194
                                }
5195
                            }
5196
                            // prepare the next while loop iteration
5197
                            // by setting i at the next newline after
5198
                            // an empty line
5199
                            if(r[first_non_whitespace] == '\n')
5200
                                i = first_non_whitespace;
5201
                            else
5202
                                goto finished_counting_newlines;
5203
                        }
5204
                        else
5205
                        {
                            // only spaces (or nothing) remain until the end of the scalar
5206
                            _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 <= r.len);
5207
                            first = rem.len;
5208
                            first_non_whitespace = first + i+1;
5209
                            if(first)
5210
                            {
5211
                                _c4dbgfbl("[{}]: {} spaces to the end", i, first);
5212
                                if(first < indentation)
5213
                                {
5214
                                    _c4dbgfbl("[{}]: skip everything", i);
5215
                                    i += first;
5216
                                }
5217
                                else
5218
                                {
5219
                                    _c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
5220
                                    i += indentation;
5221
                                    if(first > indentation)
5222
                                    {
5223
                                        _c4dbgfbl("[{}]: {} spaces missing. not done yet", i, indentation - first);
5224
                                        goto finished_counting_newlines;
5225
                                    }
5226
                                }
5227
                            }
5228
                            else // if(i+1 == r.len)
5229
                            {
5230
                                _c4dbgfbl("[{}]: it's the final newline", i);
5231
                                _RYML_CB_ASSERT(m_stack.m_callbacks, i+1 == r.len);
5232
                                _RYML_CB_ASSERT(m_stack.m_callbacks, rem.len == 0);
5233
                            }
5234
                            goto end_of_scalar;
5235
                        }
5236
                    }
5237
                end_of_scalar:
5238
                    // Write all the trailing newlines. Since we're
5239
                    // at the end no folding is needed, so write every
5240
                    // newline (add 1).
5241
                    _c4dbgfbl("[{}]: add {} trailing newlines", i, 1+numnl_following);
5242
                    for(size_t j = 0; j < 1 + numnl_following; ++j)
5243
                        m_filter_arena.str[pos++] = '\n';
5244
                    break;
5245
                finished_counting_newlines:
                    // reached when a line with contents follows the run
                    // of newlines: decide between folding, keeping the
                    // newlines, or handling a change of indentation.
5246
                    _c4dbgfbl("[{}]: #newlines={} firstnonws={}", i, numnl_following, first_non_whitespace);
                    // tabs following the spaces also count as leading whitespace here
5247
                    while(first_non_whitespace < r.len && r[first_non_whitespace] == '\t')
5248
                        ++first_non_whitespace;
5249
                    _c4dbgfbl("[{}]: #newlines={} firstnonws={}", i, numnl_following, first_non_whitespace);
5250
                    _RYML_CB_ASSERT(m_stack.m_callbacks, first_non_whitespace <= r.len);
5251
                    size_t last_newl = r.last_of('\n', first_non_whitespace);
                    // the number of characters between the last newline and the first non-whitespace character
5252
                    size_t this_indentation = first_non_whitespace - last_newl - 1;
5253
                    _c4dbgfbl("[{}]: #newlines={} firstnonws={} lastnewl={} this_indentation={} vs indentation={}", i, numnl_following, first_non_whitespace, last_newl, this_indentation, indentation);
5254
                    _RYML_CB_ASSERT(m_stack.m_callbacks, first_non_whitespace >= last_newl + 1);
5255
                    _RYML_CB_ASSERT(m_stack.m_callbacks, this_indentation >= indentation);
5256
                    if(!started)
5257
                    {
                        // no contents were written yet: leading newlines are never folded
5258
                        _c4dbgfbl("[{}]: #newlines={}. write all leading newlines", i, numnl_following);
5259
                        for(size_t j = 0; j < 1 + numnl_following; ++j)
5260
                            m_filter_arena.str[pos++] = '\n';
5261
                        if(this_indentation > indentation)
5262
                        {
5263
                            is_indented = true;
5264
                            _c4dbgfbl("[{}]: advance ->{}", i, last_newl + indentation);
5265
                            i = last_newl + indentation;
5266
                        }
5267
                        else
5268
                        {
5269
                            i = first_non_whitespace - 1;
5270
                            _c4dbgfbl("[{}]: advance ->{}", i, first_non_whitespace);
5271
                        }
5272
                    }
5273
                    else if(this_indentation == indentation)
5274
                    {
5275
                        _c4dbgfbl("[{}]: same indentation", i);
5276
                        if(!is_indented)
5277
                        {
5278
                            if(numnl_following == 0)
5279
                            {
                                // a single line break between two lines at the reference indentation becomes a space
5280
                                _c4dbgfbl("[{}]: fold!", i);
5281
                                m_filter_arena.str[pos++] = ' ';
5282
                            }
5283
                            else
5284
                            {
                                // note that numnl_following newlines are written here (not 1+numnl_following)
5285
                                _c4dbgfbl("[{}]: add {} newlines", i, 1 + numnl_following);
5286
                                for(size_t j = 0; j < numnl_following; ++j)
5287
                                    m_filter_arena.str[pos++] = '\n';
5288
                            }
5289
                            i = first_non_whitespace - 1;
5290
                            _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace);
5291
                        }
5292
                        else
5293
                        {
5294
                            _c4dbgfbl("[{}]: back to ref indentation", i);
5295
                            is_indented = false;
5296
                            on_change_indentation(numnl_following, last_newl, first_non_whitespace);
5297
                            _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace);
5298
                        }
5299
                    }
5300
                    else
5301
                    {
5302
                        _c4dbgfbl("[{}]: increased indentation.", i);
5303
                        is_indented = true;
5304
                        _RYML_CB_ASSERT(m_stack.m_callbacks, this_indentation > indentation);
5305
                        on_change_indentation(numnl_following, last_newl, first_non_whitespace);
5306
                        _c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace);
5307
                    }
5308
                }
5309
                else if(curr != '\r')
5310
                {
                    // a regular character: copy it; a tab alone does not mark the contents as started
5311
                    if(curr != '\t')
5312
                        started = true;
5313
                    m_filter_arena.str[pos++] = curr;
5314
                }
5315
            }
5316
            _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
5317
            _c4dbgfbl(": #filteredchars={} after=[{}]~~~{}~~~", (int)s.len - (int)pos, pos, m_filter_arena.first(pos));
            // adjust the trailing newlines; pos is updated in place
5318
            bool changed = _apply_chomp(m_filter_arena, &pos, chomp);
5319
            if(pos < r.len || filtered_chars || changed)
5320
            {
5321
                r = _finish_filter_arena(s, pos); // write into s
5322
            }
5323
        }
5324
        break;
5325
    default:
5326
        _c4err("unknown block style");
5327
    }
5328

5329
    _c4dbgfbl(": final=[{}]~~~{}~~~", r.len, r);
5330

5331
    #undef _c4dbgfbl
5332

5333
    return r;
5334
}
5335

5336
//-----------------------------------------------------------------------------
5337
/** Count the lines in @p src: one more than the number of '\n'
 * characters it contains (so an empty buffer counts as one line). */
size_t Parser::_count_nlines(csubstr src)
{
    const size_t num_newlines = src.count('\n');
    return num_newlines + 1;
}
5341

5342
//-----------------------------------------------------------------------------
5343
/** Process a directive line.
 *
 * A `%TAG` directive is split into handle and prefix (separated by
 * spaces; anything after the prefix is discarded) and registered in the
 * tree together with the id of the node from which it applies. A `%YAML`
 * directive is only logged. Any other directive is silently ignored.
 *
 * @param directive_ the directive text, starting at the '%' */
void Parser::_handle_directive(csubstr directive_)
{
    if(directive_.begins_with("%TAG"))
    {
        _c4dbgpf("%TAG directive: {}", directive_);
        // what follows the "%TAG" token must start with a space
        csubstr rest = directive_.sub(4);
        if(!rest.begins_with(' '))
            _c4err("malformed tag directive: {}", directive_);
        rest = rest.triml(' ');
        // the handle runs up to the next space; a prefix must follow
        const size_t handle_len = rest.find(' ');
        if(handle_len == npos)
            _c4err("malformed tag directive: {}", directive_);
        TagDirective td;
        td.handle = rest.first(handle_len);
        // the prefix is the next space-delimited token
        rest = rest.sub(handle_len).triml(' ');
        const size_t prefix_len = rest.find(' ');
        td.prefix = (prefix_len != npos) ? rest.first(prefix_len) : rest;
        // the directive applies from the next node to be created...
        const size_t num_nodes = m_tree->size();
        td.next_node_id = num_nodes;
        if(num_nodes > 0)
        {
            // ...which is one further when the last node is a root that
            // already has a type and is not a stream.
            const size_t last = num_nodes - 1;
            if(m_tree->is_root(last) && m_tree->type(last) != NOTYPE && !m_tree->is_stream(last))
                ++td.next_node_id;
        }
        _c4dbgpf("%TAG: handle={} prefix={} next_node={}", td.handle, td.prefix, td.next_node_id);
        m_tree->add_tag_directive(td);
    }
    else if(directive_.begins_with("%YAML"))
    {
        _c4dbgpf("%YAML directive! ignoring...: {}", directive_);
    }
}
5378

5379
//-----------------------------------------------------------------------------
5380
/** Replace the flags of state @p s with @p f. In RYML_DBG builds the
 * new and previous flags are logged first. */
void Parser::set_flags(flag_t f, State * s)
{
#ifdef RYML_DBG
    char requested_buf[64];
    char before_buf[64];
    csubstr requested = _prfl(requested_buf, f);
    csubstr before = _prfl(before_buf, s->flags);
    _c4dbgpf("state[{}]: setting flags to {}: before={}", s-m_stack.begin(), requested, before);
#endif
    s->flags = f;
}
5390

5391
/** Turn on the flags @p on in state @p s, leaving the others untouched.
 * In RYML_DBG builds the added, previous and resulting flags are logged
 * first. */
void Parser::add_flags(flag_t on, State * s)
{
#ifdef RYML_DBG
    char added_buf[64];
    char before_buf[64];
    char after_buf[64];
    csubstr added = _prfl(added_buf, on);
    csubstr before = _prfl(before_buf, s->flags);
    csubstr after = _prfl(after_buf, s->flags|on);
    _c4dbgpf("state[{}]: adding flags {}: before={} after={}", s-m_stack.begin(), added, before, after);
#endif
    s->flags |= on;
}
5402

5403
/** Turn on the flags @p on and then turn off the flags @p off in state
 * @p s; a flag present in both ends up off. In RYML_DBG builds the
 * added, removed, previous and resulting flags are logged first. */
void Parser::addrem_flags(flag_t on, flag_t off, State * s)
{
#ifdef RYML_DBG
    char added_buf[64];
    char removed_buf[64];
    char before_buf[64];
    char after_buf[64];
    csubstr added = _prfl(added_buf, on);
    csubstr removed = _prfl(removed_buf, off);
    csubstr before = _prfl(before_buf, s->flags);
    csubstr after = _prfl(after_buf, ((s->flags|on)&(~off)));
    _c4dbgpf("state[{}]: adding flags {} / removing flags {}: before={} after={}", s-m_stack.begin(), added, removed, before, after);
#endif
    // order matters: additions first, removals last
    s->flags |= on;
    s->flags &= ~off;
}
5416

5417
/** Turn off the flags @p off in state @p s, leaving the others
 * untouched. In RYML_DBG builds the removed, previous and resulting
 * flags are logged first. */
void Parser::rem_flags(flag_t off, State * s)
{
#ifdef RYML_DBG
    char removed_buf[64];
    char before_buf[64];
    char after_buf[64];
    csubstr removed = _prfl(removed_buf, off);
    csubstr before = _prfl(before_buf, s->flags);
    csubstr after = _prfl(after_buf, s->flags&(~off));
    _c4dbgpf("state[{}]: removing flags {}: before={} after={}", s-m_stack.begin(), removed, before, after);
#endif
    s->flags &= ~off;
}
5428

5429
//-----------------------------------------------------------------------------
5430

5431
/** Write the names of the flags set in @p flags into @p buf, separated
 * by '|', and return the written part of @p buf.
 *
 * When the text does not fit, it is truncated at the end of @p buf: the
 * returned string never extends past the buffer and never includes
 * unwritten bytes. (Previously an oversized text made the function
 * return a view reaching beyond @p buf.)
 *
 * @param buf destination buffer; not null-terminated
 * @param flags the flags to print
 * @return the prefix of @p buf holding the (possibly truncated) text */
csubstr Parser::_prfl(substr buf, flag_t flags)
{
    size_t pos = 0; // length of the full text, even when it does not fit
    bool gotone = false;

    // copy as much of txt as fits at pos, then advance pos by the full length
    auto append = [&buf, &pos](csubstr txt){
        if(pos < buf.len)
        {
            const size_t room = buf.len - pos;
            memcpy(buf.str + pos, txt.str, txt.len < room ? txt.len : room);
        }
        pos += txt.len;
    };

    #define _prflag(fl)                                     \
    if((flags & fl) == (fl))                                \
    {                                                       \
        if(gotone)                                          \
            append("|");                                    \
        append(#fl);                                        \
        gotone = true;                                      \
    }

    _prflag(RTOP);
    _prflag(RUNK);
    _prflag(RMAP);
    _prflag(RSEQ);
    _prflag(FLOW);
    _prflag(QMRK);
    _prflag(RKEY);
    _prflag(RVAL);
    _prflag(RNXT);
    _prflag(SSCL);
    _prflag(QSCL);
    _prflag(RSET);
    _prflag(NDOC);
    _prflag(RSEQIMAP);

    #undef _prflag

    // clamp: pos may exceed the buffer when the text was truncated
    return buf.first(pos < buf.len ? pos : buf.len);
}
5473

5474

5475
//-----------------------------------------------------------------------------
5476
//-----------------------------------------------------------------------------
5477
//-----------------------------------------------------------------------------
5478

5479
void Parser::_grow_filter_arena(size_t num_characters_needed)
5480
{
5481
    _c4dbgpf("grow: arena={} numchars={}", m_filter_arena.len, num_characters_needed);
5482
    if(num_characters_needed <= m_filter_arena.len)
5483
        return;
5484
    size_t sz = m_filter_arena.len << 1;
5485
    _c4dbgpf("grow: sz={}", sz);
5486
    sz = num_characters_needed > sz ? num_characters_needed : sz;
5487
    _c4dbgpf("grow: sz={}", sz);
5488
    sz = sz < 128u ? 128u : sz;
5489
    _c4dbgpf("grow: sz={}", sz);
5490
    _RYML_CB_ASSERT(m_stack.m_callbacks, sz >= num_characters_needed);
5491
    _resize_filter_arena(sz);
5492
}
5493

5494
void Parser::_resize_filter_arena(size_t num_characters)
5495
{
5496
    if(num_characters > m_filter_arena.len)
5497
    {
5498
        _c4dbgpf("resize: sz={}", num_characters);
5499
        char *prev = m_filter_arena.str;
5500
        if(m_filter_arena.str)
5501
        {
5502
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_filter_arena.len > 0);
5503
            _RYML_CB_FREE(m_stack.m_callbacks, m_filter_arena.str, char, m_filter_arena.len);
5504
        }
5505
        m_filter_arena.str = _RYML_CB_ALLOC_HINT(m_stack.m_callbacks, char, num_characters, prev);
5506
        m_filter_arena.len = num_characters;
5507
    }
5508
}
5509

5510
/** Copy the first @p pos characters of the filter arena to the start of
 * @p dst.
 * @return the prefix of @p dst that was written */
substr Parser::_finish_filter_arena(substr dst, size_t pos)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
    _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= dst.len);
    substr written = dst.first(pos);
    memcpy(written.str, m_filter_arena.str, written.len);
    return written;
}
5517

5518

5519
//-----------------------------------------------------------------------------
5520
//-----------------------------------------------------------------------------
5521
//-----------------------------------------------------------------------------
5522

5523
/** Get the part of the source buffer that starts at the offset of
 * @p loc and runs to the end of the buffer. */
csubstr Parser::location_contents(Location const& loc) const
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, loc.offset < m_buf.len);
    csubstr from_location = m_buf.sub(loc.offset);
    return from_location;
}
5528

5529
/** Get the location of @p node, which must be valid. Forwards to the
 * overload taking the tree and the node id. */
Location Parser::location(ConstNodeRef node) const
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, node.valid());
    Tree const& tree = *node.tree();
    const size_t id = node.id();
    return location(tree, id);
}
5534

5535
/** Get the location of node @p node of @p tree. When no location can be
 * derived from the node (or its relatives), the location of the start
 * of the source buffer is returned instead. */
Location Parser::location(Tree const& tree, size_t node) const
{
    // try hard to avoid getting the location from a null string.
    Location found;
    const bool ok = _location_from_node(tree, node, &found, 0);
    if(!ok)
        return val_location(m_buf.str);
    return found;
}
5543

5544
/** Try to find a location for @p node, in this order: its key, its
 * value, its container start and — only for a top-level call
 * (@p level == 0) on a node with a type — its previous sibling, next
 * sibling and parent.
 *
 * @param loc receives the location when one is found
 * @param level recursion depth; relatives are not visited when nonzero
 * @return true when @p loc was set */
bool Parser::_location_from_node(Tree const& tree, size_t node, Location *C4_RESTRICT loc, size_t level) const
{
    // 1. the key, when there is one with a non-null string
    if(tree.has_key(node))
    {
        const csubstr key = tree.key(node);
        if(C4_LIKELY(key.str != nullptr))
        {
            _RYML_CB_ASSERT(m_stack.m_callbacks, key.is_sub(m_buf));
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(key));
            *loc = val_location(key.str);
            return true;
        }
    }

    // 2. the value, when there is one with a non-null string
    if(tree.has_val(node))
    {
        const csubstr value = tree.val(node);
        if(C4_LIKELY(value.str != nullptr))
        {
            _RYML_CB_ASSERT(m_stack.m_callbacks, value.is_sub(m_buf));
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(value));
            *loc = val_location(value.str);
            return true;
        }
    }

    // 3. the start of the container
    if(tree.is_container(node) && _location_from_cont(tree, node, loc))
        return true;

    // 4. the relatives; skipped for untyped nodes, and while already
    // visiting a relative, so that the search does not recurse further
    if(tree.type(node) == NOTYPE || level != 0)
        return false;

    const size_t relatives[] = {
        tree.prev_sibling(node),
        tree.next_sibling(node),
        tree.parent(node),
    };
    for(const size_t relative : relatives)
    {
        if(relative == NONE)
            continue;
        if(_location_from_node(tree, relative, loc, level+1))
            return true;
    }

    return false;
}
5609

5610
/** Set @p loc to the location where the container @p node starts.
 *
 * For a stream this is the front of the source buffer. For any other
 * container it is the pointer stored in the node's value scalar, or the
 * key of its first child when that key comes earlier in the buffer.
 *
 * @return always true */
bool Parser::_location_from_cont(Tree const& tree, size_t node, Location *C4_RESTRICT loc) const
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, tree.is_container(node));
    if(tree.is_stream(node))
    {
        // just return the front of the buffer
        *loc = val_location(m_buf.str);
        return true;
    }
    // this was stored in the container
    const char *start = tree._p(node)->m_val.scalar.str;
    if(tree.has_children(node))
    {
        const size_t first = tree.first_child(node);
        if(tree.has_key(first))
        {
            // when a map starts, the container was set after the key
            const csubstr key = tree.key(first);
            if(key.str != nullptr && key.str < start)
                start = key.str;
        }
    }
    *loc = val_location(start);
    return true;
}
5636

5637

5638
/** Translate a pointer into the source buffer into a Location (file
 * name, byte offset, line and column).
 *
 * The line is found by searching the stored newline offsets for the
 * first entry that is not smaller than the offset of @p val. This means
 * a newline character is located on the line it terminates. The linear
 * search (small inputs) and the bisection (large inputs) use the same
 * criterion; previously they differed when @p val pointed exactly at a
 * newline, and the linear search found nothing when @p val pointed at
 * the end of the buffer.
 *
 * @param val a pointer into the buffer that was parsed, or nullptr
 * @return the location; all-zero (with the file name) for a null @p val */
Location Parser::val_location(const char *val) const
{
    if(C4_UNLIKELY(val == nullptr))
        return {m_file, 0, 0, 0};

    _RYML_CB_CHECK(m_stack.m_callbacks, m_options.locations());
    // NOTE: if any of these checks fails, the parser needs to be
    // instantiated with locations enabled.
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.str == m_newline_offsets_buf.str);
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.len == m_newline_offsets_buf.len);
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_options.locations());
    _RYML_CB_ASSERT(m_stack.m_callbacks, !_locations_dirty());
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets != nullptr);
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets_size > 0);
    // NOTE: the pointer needs to belong to the buffer that was used to parse.
    csubstr src = m_buf;
    _RYML_CB_CHECK(m_stack.m_callbacks, val != nullptr || src.str == nullptr);
    _RYML_CB_CHECK(m_stack.m_callbacks, (val >= src.begin() && val <= src.end()) || (src.str == nullptr && val == nullptr));
    const size_t offset = (size_t)(val - src.begin());
    // ok. search the first stored newline at or after the given ptr
    size_t const* const first = m_newline_offsets;
    size_t const* const last = m_newline_offsets + m_newline_offsets_size;
    size_t const* lineptr = first;
    if(m_newline_offsets_size < 30) // TODO magic number
    {
        // just do a linear search if the size is small.
        while(lineptr < last && *lineptr < offset)
            ++lineptr;
    }
    else
    {
        // do a bisection search if the size is not small.
        //
        // We could use std::lower_bound but this is simple enough and
        // spares the include of <algorithm>.
        size_t count = m_newline_offsets_size;
        while(count)
        {
            const size_t step = count >> 1;
            size_t const* it = lineptr + step;
            if(*it < offset)
            {
                lineptr = ++it;
                count -= step + 1;
            }
            else
            {
                count = step;
            }
        }
    }
    // The last stored offset is the buffer length (see
    // _prepare_locations()) and offset was checked to be within the
    // buffer, so the search always finds an entry. Clamp anyway, to
    // never read past the end of the array.
    if(C4_UNLIKELY(lineptr == last))
        lineptr = last - 1;
    _RYML_CB_ASSERT(m_stack.m_callbacks, lineptr >= first);
    _RYML_CB_ASSERT(m_stack.m_callbacks, lineptr < last);
    _RYML_CB_ASSERT(m_stack.m_callbacks, *lineptr >= offset);
    Location loc;
    loc.name = m_file;
    loc.offset = offset;
    loc.line = (size_t)(lineptr - first);
    if(lineptr > first)
    {
        // the previous newline is strictly before offset, so this
        // cannot underflow
        loc.col = (offset - *(lineptr-1) - 1u);
    }
    else
    {
        loc.col = offset;
    }
    return loc;
}
5710

5711
void Parser::_prepare_locations()
5712
{
5713
    m_newline_offsets_buf = m_buf;
5714
    size_t numnewlines = 1u + m_buf.count('\n');
5715
    _resize_locations(numnewlines);
5716
    m_newline_offsets_size = 0;
5717
    for(size_t i = 0; i < m_buf.len; i++)
5718
        if(m_buf[i] == '\n')
5719
            m_newline_offsets[m_newline_offsets_size++] = i;
5720
    m_newline_offsets[m_newline_offsets_size++] = m_buf.len;
5721
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets_size == numnewlines);
5722
}
5723

5724
void Parser::_resize_locations(size_t numnewlines)
5725
{
5726
    if(numnewlines > m_newline_offsets_capacity)
5727
    {
5728
        if(m_newline_offsets)
5729
            _RYML_CB_FREE(m_stack.m_callbacks, m_newline_offsets, size_t, m_newline_offsets_capacity);
5730
        m_newline_offsets = _RYML_CB_ALLOC_HINT(m_stack.m_callbacks, size_t, numnewlines, m_newline_offsets);
5731
        m_newline_offsets_capacity = numnewlines;
5732
    }
5733
}
5734

5735
bool Parser::_locations_dirty() const
5736
{
5737
    return !m_newline_offsets_size;
5738
}
5739

5740
} // namespace yml
5741
} // namespace c4
5742

5743

5744
#if defined(_MSC_VER)
5745
#   pragma warning(pop)
5746
#elif defined(__clang__)
5747
#   pragma clang diagnostic pop
5748
#elif defined(__GNUC__)
5749
#   pragma GCC diagnostic pop
5750
#endif
5751

5752
Product

Resources

Company