Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
stenzek
GitHub Repository: stenzek/duckstation
Path: blob/master/dep/rapidyaml/src/c4/yml/parse.cpp
4262 views
1
#include "c4/yml/parse.hpp"
2
#include "c4/error.hpp"
3
#include "c4/utf.hpp"
4
#include <c4/dump.hpp>
5
6
#include <ctype.h>
7
#include <stdarg.h>
8
#include <stdio.h>
9
10
#include "c4/yml/detail/parser_dbg.hpp"
11
#ifdef RYML_DBG
12
#include "c4/yml/detail/print.hpp"
13
#endif
14
15
#ifndef RYML_ERRMSG_SIZE
16
#define RYML_ERRMSG_SIZE 1024
17
#endif
18
19
//#define RYML_WITH_TAB_TOKENS
20
#ifdef RYML_WITH_TAB_TOKENS
21
#define _RYML_WITH_TAB_TOKENS(...) __VA_ARGS__
22
#define _RYML_WITH_OR_WITHOUT_TAB_TOKENS(with, without) with
23
#else
24
#define _RYML_WITH_TAB_TOKENS(...)
25
#define _RYML_WITH_OR_WITHOUT_TAB_TOKENS(with, without) without
26
#endif
27
28
29
#if defined(_MSC_VER)
30
# pragma warning(push)
31
# pragma warning(disable: 4296/*expression is always 'boolean_value'*/)
32
#elif defined(__clang__)
33
# pragma clang diagnostic push
34
# pragma clang diagnostic ignored "-Wtype-limits" // to remove a warning on an assertion that a size_t >= 0. Later on, this size_t will turn into a template argument, and then it can become < 0.
35
# pragma clang diagnostic ignored "-Wformat-nonliteral"
36
# pragma clang diagnostic ignored "-Wold-style-cast"
37
#elif defined(__GNUC__)
38
# pragma GCC diagnostic push
39
# pragma GCC diagnostic ignored "-Wtype-limits" // to remove a warning on an assertion that a size_t >= 0. Later on, this size_t will turn into a template argument, and then it can become < 0.
40
# pragma GCC diagnostic ignored "-Wformat-nonliteral"
41
# pragma GCC diagnostic ignored "-Wold-style-cast"
42
# if __GNUC__ >= 7
43
# pragma GCC diagnostic ignored "-Wduplicated-branches"
44
# endif
45
#endif
46
47
namespace c4 {
48
namespace yml {
49
50
namespace {
51
52
template<class DumpFn, class ...Args>
53
void _parse_dump(DumpFn dumpfn, c4::csubstr fmt, Args&& ...args)
54
{
55
char writebuf[256];
56
auto results = c4::format_dump_resume(dumpfn, writebuf, fmt, std::forward<Args>(args)...);
57
// resume writing if the results failed to fit the buffer
58
if(C4_UNLIKELY(results.bufsize > sizeof(writebuf))) // bufsize will be that of the largest element serialized. Eg int(1), will require 1 byte.
59
{
60
results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward<Args>(args)...);
61
if(C4_UNLIKELY(results.bufsize > sizeof(writebuf)))
62
{
63
results = format_dump_resume(dumpfn, results, writebuf, fmt, std::forward<Args>(args)...);
64
}
65
}
66
}
67
68
/** @return false when @p s starts with one of the tokens tested below
 * (": ", any of "#,{}[]%&", "? ", "- ", ":\"", ":'") or is exactly "-";
 * true otherwise. */
bool _is_scalar_next__runk(csubstr s)
{
    if(s.begins_with(": ") || s.begins_with_any("#,{}[]%&") || s.begins_with("? "))
        return false;
    if(s == "-" || s.begins_with("- "))
        return false;
    if(s.begins_with(":\"") || s.begins_with(":'"))
        return false;
    return true;
}
72
73
/** @return false when @p s starts with any of "[{!&", with "? " or
 * with "- ", or is exactly "-"; true otherwise. */
bool _is_scalar_next__rseq_rval(csubstr s)
{
    if(s.begins_with_any("[{!&") || s.begins_with("? "))
        return false;
    if(s.begins_with("- ") || s == "-")
        return false;
    return true;
}
77
78
/** @return false when @p s starts with ": ", with any of "#,!&" or
 * with "? " (and, when tab tokens are enabled, with ":\t");
 * true otherwise. */
bool _is_scalar_next__rmap(csubstr s)
{
    if(s.begins_with(": ") || s.begins_with_any("#,!&") || s.begins_with("? "))
        return false;
    _RYML_WITH_TAB_TOKENS(if(s.begins_with(":\t")) return false;)
    return true;
}
82
83
/** @return false when @p s starts with "- " or with any of "{[",
 * or is exactly "-"; true otherwise. */
bool _is_scalar_next__rmap_val(csubstr s)
{
    const bool is_other_token = s.begins_with("- ") || s.begins_with_any("{[") || s == "-";
    return !is_other_token;
}
87
88
/** @return true when @p s is exactly "---" or "...", or starts with one
 * of those markers immediately followed by a space or a tab. */
bool _is_doc_sep(csubstr s)
{
    constexpr const csubstr dashes = "---";
    constexpr const csubstr ellipsis = "...";
    constexpr const csubstr whitesp = " \t";
    const bool starts_with_dashes = s.begins_with(dashes);
    if(!starts_with_dashes && !s.begins_with(ellipsis))
        return false;
    const csubstr marker = starts_with_dashes ? dashes : ellipsis;
    // either the marker is the whole string, or whitespace must follow it
    return s == marker || s.sub(3).begins_with_any(whitesp);
}
99
100
/** @p i is set to the first non whitespace character after the line
101
* @return the number of empty lines after the initial position */
102
size_t count_following_newlines(csubstr r, size_t *C4_RESTRICT i, size_t indentation)
103
{
104
RYML_ASSERT(r[*i] == '\n');
105
size_t numnl_following = 0;
106
++(*i);
107
for( ; *i < r.len; ++(*i))
108
{
109
if(r.str[*i] == '\n')
110
{
111
++numnl_following;
112
if(indentation) // skip the indentation after the newline
113
{
114
size_t stop = *i + indentation;
115
for( ; *i < r.len; ++(*i))
116
{
117
if(r.str[*i] != ' ' && r.str[*i] != '\r')
118
break;
119
RYML_ASSERT(*i < stop);
120
}
121
C4_UNUSED(stop);
122
}
123
}
124
else if(r.str[*i] == ' ' || r.str[*i] == '\t' || r.str[*i] == '\r') // skip leading whitespace
125
;
126
else
127
break;
128
}
129
return numnl_following;
130
}
131
132
} // anon namespace
133
134
135
//-----------------------------------------------------------------------------
136
137
/** Release the buffers owned by the parser, then reset every member. */
Parser::~Parser()
{
    this->_free();
    this->_clr();
}
142
143
/** Construct a parser with the given callbacks and options.
 *
 * All members start empty/zeroed, except that the state stack receives
 * one default-constructed State, which becomes the current state. */
Parser::Parser(Callbacks const& cb, ParserOptions opts)
    : m_options(opts)
    , m_file()
    , m_buf()
    , m_root_id(NONE)
    , m_tree()
    , m_stack(cb)
    , m_state()
    // key tags
    , m_key_tag_indentation(0)
    , m_key_tag2_indentation(0)
    , m_key_tag()
    , m_key_tag2()
    // val tag
    , m_val_tag_indentation(0)
    , m_val_tag()
    // key anchor
    , m_key_anchor_was_before(false)
    , m_key_anchor_indentation(0)
    , m_key_anchor()
    // val anchor
    , m_val_anchor_indentation(0)
    , m_val_anchor()
    // scratch buffers
    , m_filter_arena()
    , m_newline_offsets()
    , m_newline_offsets_size(0)
    , m_newline_offsets_capacity(0)
    , m_newline_offsets_buf()
{
    // start with a single state on the stack, and make it current
    m_stack.push(State{});
    m_state = &m_stack.top();
}
171
172
/** Move constructor: take over every member of @p that (the stack is
 * moved, everything else is copied member-wise, including the buffer
 * pointers), then clear @p that. */
Parser::Parser(Parser &&that)
    : m_options(that.m_options)
    , m_file(that.m_file)
    , m_buf(that.m_buf)
    , m_root_id(that.m_root_id)
    , m_tree(that.m_tree)
    , m_stack(std::move(that.m_stack))
    , m_state(&m_stack.top()) // point into our own stack, not into that's
    // key tags
    , m_key_tag_indentation(that.m_key_tag_indentation)
    , m_key_tag2_indentation(that.m_key_tag2_indentation)
    , m_key_tag(that.m_key_tag)
    , m_key_tag2(that.m_key_tag2)
    // val tag
    , m_val_tag_indentation(that.m_val_tag_indentation)
    , m_val_tag(that.m_val_tag)
    // key anchor
    , m_key_anchor_was_before(that.m_key_anchor_was_before)
    , m_key_anchor_indentation(that.m_key_anchor_indentation)
    , m_key_anchor(that.m_key_anchor)
    // val anchor
    , m_val_anchor_indentation(that.m_val_anchor_indentation)
    , m_val_anchor(that.m_val_anchor)
    // scratch buffers: the pointers are taken as they are
    , m_filter_arena(that.m_filter_arena)
    , m_newline_offsets(that.m_newline_offsets)
    , m_newline_offsets_size(that.m_newline_offsets_size)
    , m_newline_offsets_capacity(that.m_newline_offsets_capacity)
    , m_newline_offsets_buf(that.m_newline_offsets_buf)
{
    // reset the members of the source object
    that._clr();
}
199
200
/** Copy constructor: copy the parsing state of @p that member-wise.
 *
 * The newline offsets are copied into a freshly resized array, and the
 * filter arena is resized to the same length as that of @p that (its
 * contents are not copied here). */
Parser::Parser(Parser const& that)
    : m_options(that.m_options)
    , m_file(that.m_file)
    , m_buf(that.m_buf)
    , m_root_id(that.m_root_id)
    , m_tree(that.m_tree)
    , m_stack(that.m_stack)
    , m_state(&m_stack.top()) // point into our own copy of the stack
    // key tags
    , m_key_tag_indentation(that.m_key_tag_indentation)
    , m_key_tag2_indentation(that.m_key_tag2_indentation)
    , m_key_tag(that.m_key_tag)
    , m_key_tag2(that.m_key_tag2)
    // val tag
    , m_val_tag_indentation(that.m_val_tag_indentation)
    , m_val_tag(that.m_val_tag)
    // key anchor
    , m_key_anchor_was_before(that.m_key_anchor_was_before)
    , m_key_anchor_indentation(that.m_key_anchor_indentation)
    , m_key_anchor(that.m_key_anchor)
    // val anchor
    , m_val_anchor_indentation(that.m_val_anchor_indentation)
    , m_val_anchor(that.m_val_anchor)
    // scratch buffers start empty and are set up in the body
    , m_filter_arena()
    , m_newline_offsets()
    , m_newline_offsets_size()
    , m_newline_offsets_capacity()
    , m_newline_offsets_buf()
{
    const size_t src_capacity = that.m_newline_offsets_capacity;
    if(src_capacity)
    {
        _resize_locations(src_capacity);
        _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity == that.m_newline_offsets_capacity);
        memcpy(m_newline_offsets, that.m_newline_offsets, that.m_newline_offsets_size * sizeof(size_t));
        m_newline_offsets_size = that.m_newline_offsets_size;
    }
    const size_t src_arena_len = that.m_filter_arena.len;
    if(src_arena_len)
    {
        _resize_filter_arena(src_arena_len);
    }
}
237
238
/** Move assignment: release the buffers owned by this parser, take over
 * every member of @p that (the stack is moved, everything else is
 * copied member-wise, including the buffer pointers), then clear @p that.
 *
 * Self-assignment is a no-op.
 *
 * @return a reference to this parser */
Parser& Parser::operator=(Parser &&that)
{
    // Without this guard, a self-move would _free() our own buffers and
    // stack, then read them back and finally _clr() ourselves.
    if(this == &that)
        return *this;
    _free();
    m_options = (that.m_options);
    m_file = (that.m_file);
    m_buf = (that.m_buf);
    m_root_id = (that.m_root_id);
    m_tree = (that.m_tree);
    m_stack = std::move(that.m_stack);
    m_state = (&m_stack.top()); // point into our own stack
    m_key_tag_indentation = (that.m_key_tag_indentation);
    m_key_tag2_indentation = (that.m_key_tag2_indentation);
    m_key_tag = (that.m_key_tag);
    m_key_tag2 = (that.m_key_tag2);
    m_val_tag_indentation = (that.m_val_tag_indentation);
    m_val_tag = (that.m_val_tag);
    m_key_anchor_was_before = (that.m_key_anchor_was_before);
    m_key_anchor_indentation = (that.m_key_anchor_indentation);
    m_key_anchor = (that.m_key_anchor);
    m_val_anchor_indentation = (that.m_val_anchor_indentation);
    m_val_anchor = (that.m_val_anchor);
    // the buffer pointers are taken as they are
    m_filter_arena = that.m_filter_arena;
    m_newline_offsets = (that.m_newline_offsets);
    m_newline_offsets_size = (that.m_newline_offsets_size);
    m_newline_offsets_capacity = (that.m_newline_offsets_capacity);
    m_newline_offsets_buf = (that.m_newline_offsets_buf);
    // reset the members of the source object
    that._clr();
    return *this;
}
267
268
/** Copy assignment: release the buffers owned by this parser, then copy
 * the parsing state of @p that member-wise.
 *
 * The filter arena is resized to the length of that of @p that (its
 * contents are not copied here), and the newline offsets are copied
 * into this parser's own array.
 *
 * Self-assignment is a no-op.
 *
 * @return a reference to this parser */
Parser& Parser::operator=(Parser const& that)
{
    // Without this guard, self-assignment would _free() the stack and
    // buffers and then try to copy them from the freed object.
    if(this == &that)
        return *this;
    _free();
    m_options = (that.m_options);
    m_file = (that.m_file);
    m_buf = (that.m_buf);
    m_root_id = (that.m_root_id);
    m_tree = (that.m_tree);
    m_stack = that.m_stack;
    m_state = &m_stack.top(); // point into our own copy of the stack
    m_key_tag_indentation = (that.m_key_tag_indentation);
    m_key_tag2_indentation = (that.m_key_tag2_indentation);
    m_key_tag = (that.m_key_tag);
    m_key_tag2 = (that.m_key_tag2);
    m_val_tag_indentation = (that.m_val_tag_indentation);
    m_val_tag = (that.m_val_tag);
    m_key_anchor_was_before = (that.m_key_anchor_was_before);
    m_key_anchor_indentation = (that.m_key_anchor_indentation);
    m_key_anchor = (that.m_key_anchor);
    m_val_anchor_indentation = (that.m_val_anchor_indentation);
    m_val_anchor = (that.m_val_anchor);
    if(that.m_filter_arena.len > 0)
        _resize_filter_arena(that.m_filter_arena.len);
    if(that.m_newline_offsets_capacity > m_newline_offsets_capacity)
        _resize_locations(that.m_newline_offsets_capacity);
    _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_capacity);
    _RYML_CB_CHECK(m_stack.m_callbacks, m_newline_offsets_capacity >= that.m_newline_offsets_size);
    // memcpy must not be given null pointers, even for a size of zero;
    // when there is nothing to copy both arrays may be null.
    if(that.m_newline_offsets_size > 0)
        memcpy(m_newline_offsets, that.m_newline_offsets, that.m_newline_offsets_size * sizeof(size_t));
    m_newline_offsets_size = that.m_newline_offsets_size;
    m_newline_offsets_buf = that.m_newline_offsets_buf;
    return *this;
}
300
301
void Parser::_clr()
302
{
303
m_options = {};
304
m_file = {};
305
m_buf = {};
306
m_root_id = {};
307
m_tree = {};
308
m_stack.clear();
309
m_state = {};
310
m_key_tag_indentation = {};
311
m_key_tag2_indentation = {};
312
m_key_tag = {};
313
m_key_tag2 = {};
314
m_val_tag_indentation = {};
315
m_val_tag = {};
316
m_key_anchor_was_before = {};
317
m_key_anchor_indentation = {};
318
m_key_anchor = {};
319
m_val_anchor_indentation = {};
320
m_val_anchor = {};
321
m_filter_arena = {};
322
m_newline_offsets = {};
323
m_newline_offsets_size = {};
324
m_newline_offsets_capacity = {};
325
m_newline_offsets_buf = {};
326
}
327
328
void Parser::_free()
329
{
330
if(m_newline_offsets)
331
{
332
_RYML_CB_FREE(m_stack.m_callbacks, m_newline_offsets, size_t, m_newline_offsets_capacity);
333
m_newline_offsets = nullptr;
334
m_newline_offsets_size = 0u;
335
m_newline_offsets_capacity = 0u;
336
m_newline_offsets_buf = 0u;
337
}
338
if(m_filter_arena.len)
339
{
340
_RYML_CB_FREE(m_stack.m_callbacks, m_filter_arena.str, char, m_filter_arena.len);
341
m_filter_arena = {};
342
}
343
m_stack._free();
344
}
345
346
347
//-----------------------------------------------------------------------------
348
void Parser::_reset()
349
{
350
_RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.size() == 1);
351
m_stack.clear();
352
m_stack.push({});
353
m_state = &m_stack.top();
354
m_state->reset(m_file.str, m_root_id);
355
356
m_key_tag_indentation = 0;
357
m_key_tag2_indentation = 0;
358
m_key_tag.clear();
359
m_key_tag2.clear();
360
m_val_tag_indentation = 0;
361
m_val_tag.clear();
362
m_key_anchor_was_before = false;
363
m_key_anchor_indentation = 0;
364
m_key_anchor.clear();
365
m_val_anchor_indentation = 0;
366
m_val_anchor.clear();
367
368
if(m_options.locations())
369
{
370
_prepare_locations();
371
}
372
}
373
374
//-----------------------------------------------------------------------------
375
template<class DumpFn>
376
void Parser::_fmt_msg(DumpFn &&dumpfn) const
377
{
378
auto const& lc = m_state->line_contents;
379
csubstr contents = lc.stripped;
380
if(contents.len)
381
{
382
// print the yaml src line
383
size_t offs = 3u + to_chars(substr{}, m_state->pos.line) + to_chars(substr{}, m_state->pos.col);
384
if(m_file.len)
385
{
386
_parse_dump(dumpfn, "{}:", m_file);
387
offs += m_file.len + 1;
388
}
389
_parse_dump(dumpfn, "{}:{}: ", m_state->pos.line, m_state->pos.col);
390
csubstr maybe_full_content = (contents.len < 80u ? contents : contents.first(80u));
391
csubstr maybe_ellipsis = (contents.len < 80u ? csubstr{} : csubstr("..."));
392
_parse_dump(dumpfn, "{}{} (size={})\n", maybe_full_content, maybe_ellipsis, contents.len);
393
// highlight the remaining portion of the previous line
394
size_t firstcol = (size_t)(lc.rem.begin() - lc.full.begin());
395
size_t lastcol = firstcol + lc.rem.len;
396
for(size_t i = 0; i < offs + firstcol; ++i)
397
dumpfn(" ");
398
dumpfn("^");
399
for(size_t i = 1, e = (lc.rem.len < 80u ? lc.rem.len : 80u); i < e; ++i)
400
dumpfn("~");
401
_parse_dump(dumpfn, "{} (cols {}-{})\n", maybe_ellipsis, firstcol+1, lastcol+1);
402
}
403
else
404
{
405
dumpfn("\n");
406
}
407
408
#ifdef RYML_DBG
409
// next line: print the state flags
410
{
411
char flagbuf_[64];
412
_parse_dump(dumpfn, "top state: {}\n", _prfl(flagbuf_, m_state->flags));
413
}
414
#endif
415
}
416
417
418
//-----------------------------------------------------------------------------
419
template<class ...Args>
420
void Parser::_err(csubstr fmt, Args const& C4_RESTRICT ...args) const
421
{
422
char errmsg[RYML_ERRMSG_SIZE];
423
detail::_SubstrWriter writer(errmsg);
424
auto dumpfn = [&writer](csubstr s){ writer.append(s); };
425
_parse_dump(dumpfn, fmt, args...);
426
writer.append('\n');
427
_fmt_msg(dumpfn);
428
size_t len = writer.pos < RYML_ERRMSG_SIZE ? writer.pos : RYML_ERRMSG_SIZE;
429
m_tree->m_callbacks.m_error(errmsg, len, m_state->pos, m_tree->m_callbacks.m_user_data);
430
}
431
432
//-----------------------------------------------------------------------------
433
#ifdef RYML_DBG
434
template<class ...Args>
435
void Parser::_dbg(csubstr fmt, Args const& C4_RESTRICT ...args) const
436
{
437
auto dumpfn = [](csubstr s){ fwrite(s.str, 1, s.len, stdout); };
438
_parse_dump(dumpfn, fmt, args...);
439
dumpfn("\n");
440
_fmt_msg(dumpfn);
441
}
442
#endif
443
444
//-----------------------------------------------------------------------------
445
bool Parser::_finished_file() const
446
{
447
bool ret = m_state->pos.offset >= m_buf.len;
448
if(ret)
449
{
450
_c4dbgp("finished file!!!");
451
}
452
return ret;
453
}
454
455
//-----------------------------------------------------------------------------
456
bool Parser::_finished_line() const
457
{
458
return m_state->line_contents.rem.empty();
459
}
460
461
//-----------------------------------------------------------------------------
462
/** Parse the source in @p buf into node @p node_id of tree @p t.
 *
 * @param file the name stored as the current file (used in messages)
 * @param buf the source buffer
 * @param t the destination tree
 * @param node_id the id of the node stored as the root for this parse */
void Parser::parse_in_place(csubstr file, substr buf, Tree *t, size_t node_id)
{
    m_file = file;
    m_buf = buf;
    m_root_id = node_id;
    m_tree = t;
    _reset();
    for(;;)
    {
        if(_finished_file())
            break;
        _scan_line();
        // keep handling the line until all of it was consumed
        while( ! _finished_line())
            _handle_line();
        if(_finished_file())
            break; // it may have finished because of multiline blocks
        _line_ended();
    }
    _handle_finished_file();
}
480
481
//-----------------------------------------------------------------------------
482
void Parser::_handle_finished_file()
483
{
484
_end_stream();
485
}
486
487
//-----------------------------------------------------------------------------
488
void Parser::_handle_line()
489
{
490
_c4dbgq("\n-----------");
491
_c4dbgt("handling line={}, offset={}B", m_state->pos.line, m_state->pos.offset);
492
_RYML_CB_ASSERT(m_stack.m_callbacks, ! m_state->line_contents.rem.empty());
493
if(has_any(RSEQ))
494
{
495
if(has_any(FLOW))
496
{
497
if(_handle_seq_flow())
498
return;
499
}
500
else
501
{
502
if(_handle_seq_blck())
503
return;
504
}
505
}
506
else if(has_any(RMAP))
507
{
508
if(has_any(FLOW))
509
{
510
if(_handle_map_flow())
511
return;
512
}
513
else
514
{
515
if(_handle_map_blck())
516
return;
517
}
518
}
519
else if(has_any(RUNK))
520
{
521
if(_handle_unk())
522
return;
523
}
524
525
if(_handle_top())
526
return;
527
}
528
529
530
//-----------------------------------------------------------------------------
531
bool Parser::_handle_unk()
532
{
533
_c4dbgp("handle_unk");
534
535
csubstr rem = m_state->line_contents.rem;
536
const bool start_as_child = (node(m_state) == nullptr);
537
538
if(C4_UNLIKELY(has_any(NDOC)))
539
{
540
if(rem == "---" || rem.begins_with("--- "))
541
{
542
_start_new_doc(rem);
543
return true;
544
}
545
auto trimmed = rem.triml(' ');
546
if(trimmed == "---" || trimmed.begins_with("--- "))
547
{
548
_RYML_CB_ASSERT(m_stack.m_callbacks, rem.len >= trimmed.len);
549
_line_progressed(rem.len - trimmed.len);
550
_start_new_doc(trimmed);
551
_save_indentation();
552
return true;
553
}
554
else if(trimmed.begins_with("..."))
555
{
556
_end_stream();
557
}
558
else if(trimmed.first_of("#%") == csubstr::npos) // neither a doc nor a tag
559
{
560
_c4dbgpf("starting implicit doc to accomodate unexpected tokens: '{}'", rem);
561
size_t indref = m_state->indref;
562
_push_level();
563
_start_doc();
564
_set_indentation(indref);
565
}
566
_RYML_CB_ASSERT(m_stack.m_callbacks, !trimmed.empty());
567
}
568
569
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT|RSEQ|RMAP));
570
if(m_state->indref > 0)
571
{
572
csubstr ws = rem.left_of(rem.first_not_of(' '));
573
if(m_state->indref <= ws.len)
574
{
575
_c4dbgpf("skipping base indentation of {}", m_state->indref);
576
_line_progressed(m_state->indref);
577
rem = rem.sub(m_state->indref);
578
}
579
}
580
581
if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t")))
582
{
583
_c4dbgpf("it's a seq (as_child={})", start_as_child);
584
_move_key_anchor_to_val_anchor();
585
_move_key_tag_to_val_tag();
586
_push_level();
587
_start_seq(start_as_child);
588
_save_indentation();
589
_line_progressed(2);
590
return true;
591
}
592
else if(rem == '-')
593
{
594
_c4dbgpf("it's a seq (as_child={})", start_as_child);
595
_move_key_anchor_to_val_anchor();
596
_move_key_tag_to_val_tag();
597
_push_level();
598
_start_seq(start_as_child);
599
_save_indentation();
600
_line_progressed(1);
601
return true;
602
}
603
else if(rem.begins_with('['))
604
{
605
_c4dbgpf("it's a seq, flow (as_child={})", start_as_child);
606
_move_key_anchor_to_val_anchor();
607
_move_key_tag_to_val_tag();
608
_push_level(/*explicit flow*/true);
609
_start_seq(start_as_child);
610
add_flags(FLOW);
611
_line_progressed(1);
612
return true;
613
}
614
else if(rem.begins_with('{'))
615
{
616
_c4dbgpf("it's a map, flow (as_child={})", start_as_child);
617
_move_key_anchor_to_val_anchor();
618
_move_key_tag_to_val_tag();
619
_push_level(/*explicit flow*/true);
620
_start_map(start_as_child);
621
addrem_flags(FLOW|RKEY, RVAL);
622
_line_progressed(1);
623
return true;
624
}
625
else if(rem.begins_with("? "))
626
{
627
_c4dbgpf("it's a map (as_child={}) + this key is complex", start_as_child);
628
_move_key_anchor_to_val_anchor();
629
_move_key_tag_to_val_tag();
630
_push_level();
631
_start_map(start_as_child);
632
addrem_flags(RKEY|QMRK, RVAL);
633
_save_indentation();
634
_line_progressed(2);
635
return true;
636
}
637
else if(rem.begins_with(": ") && !has_any(SSCL))
638
{
639
_c4dbgp("it's a map with an empty key");
640
_move_key_anchor_to_val_anchor();
641
_move_key_tag_to_val_tag();
642
_push_level();
643
_start_map(start_as_child);
644
_store_scalar_null(rem.str);
645
addrem_flags(RVAL, RKEY);
646
_save_indentation();
647
_line_progressed(2);
648
return true;
649
}
650
else if(rem == ':' && !has_any(SSCL))
651
{
652
_c4dbgp("it's a map with an empty key");
653
_move_key_anchor_to_val_anchor();
654
_move_key_tag_to_val_tag();
655
_push_level();
656
_start_map(start_as_child);
657
_store_scalar_null(rem.str);
658
addrem_flags(RVAL, RKEY);
659
_save_indentation();
660
_line_progressed(1);
661
return true;
662
}
663
else if(_handle_types())
664
{
665
return true;
666
}
667
else if(!rem.begins_with('*') && _handle_key_anchors_and_refs())
668
{
669
return true;
670
}
671
else if(has_any(SSCL))
672
{
673
_c4dbgpf("there's a stored scalar: '{}'", m_state->scalar);
674
675
csubstr saved_scalar;
676
bool is_quoted = false;
677
if(_scan_scalar_unk(&saved_scalar, &is_quoted))
678
{
679
rem = m_state->line_contents.rem;
680
_c4dbgpf("... and there's also a scalar next! '{}'", saved_scalar);
681
if(rem.begins_with_any(" \t"))
682
{
683
size_t n = rem.first_not_of(" \t");
684
_c4dbgpf("skipping {} spaces/tabs", n);
685
rem = rem.sub(n);
686
_line_progressed(n);
687
}
688
}
689
690
_c4dbgpf("rem='{}'", rem);
691
692
if(rem.begins_with(", "))
693
{
694
_c4dbgpf("got a ',' -- it's a seq (as_child={})", start_as_child);
695
_start_seq(start_as_child);
696
add_flags(FLOW);
697
_append_val(_consume_scalar());
698
_line_progressed(2);
699
}
700
else if(rem.begins_with(','))
701
{
702
_c4dbgpf("got a ',' -- it's a seq (as_child={})", start_as_child);
703
_start_seq(start_as_child);
704
add_flags(FLOW);
705
_append_val(_consume_scalar());
706
_line_progressed(1);
707
}
708
else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
709
{
710
_c4dbgpf("got a ': ' -- it's a map (as_child={})", start_as_child);
711
_start_map_unk(start_as_child); // wait for the val scalar to append the key-val pair
712
_line_progressed(2);
713
}
714
else if(rem == ":" || rem.begins_with(":\"") || rem.begins_with(":'"))
715
{
716
if(rem == ":") { _c4dbgpf("got a ':' -- it's a map (as_child={})", start_as_child); }
717
else { _c4dbgpf("got a '{}' -- it's a map (as_child={})", rem.first(2), start_as_child); }
718
_start_map_unk(start_as_child); // wait for the val scalar to append the key-val pair
719
_line_progressed(1); // advance only 1
720
}
721
#ifdef RYML_NO_COVERAGE__TO_BE_DELETED
722
else if(rem.begins_with('}'))
723
{
724
if(!has_all(RMAP|FLOW))
725
{
726
_c4err("invalid token: not reading a map");
727
}
728
if(!has_all(SSCL))
729
{
730
_c4err("no scalar stored");
731
}
732
_append_key_val(saved_scalar, is_quoted);
733
_stop_map();
734
_line_progressed(1);
735
saved_scalar.clear();
736
is_quoted = false;
737
}
738
#endif
739
else if(rem.begins_with("..."))
740
{
741
_c4dbgp("got stream end '...'");
742
_end_stream();
743
_line_progressed(3);
744
}
745
else if(rem.begins_with('#'))
746
{
747
_c4dbgpf("it's a comment: '{}'", rem);
748
_scan_comment();
749
return true;
750
}
751
else if(_handle_key_anchors_and_refs())
752
{
753
return true;
754
}
755
else if(rem.begins_with(" ") || rem.begins_with("\t"))
756
{
757
size_t n = rem.first_not_of(" \t");
758
if(n == npos)
759
n = rem.len;
760
_c4dbgpf("has {} spaces/tabs, skip...", n);
761
_line_progressed(n);
762
return true;
763
}
764
else if(rem.empty())
765
{
766
// nothing to do
767
}
768
else if(rem == "---" || rem.begins_with("--- "))
769
{
770
_c4dbgp("caught ---: starting doc");
771
_start_new_doc(rem);
772
return true;
773
}
774
else if(rem.begins_with('%'))
775
{
776
_c4dbgp("caught a directive: ignoring...");
777
_line_progressed(rem.len);
778
return true;
779
}
780
else
781
{
782
_c4err("parse error");
783
}
784
785
if(is_quoted || (! saved_scalar.empty()))
786
{
787
_store_scalar(saved_scalar, is_quoted);
788
}
789
790
return true;
791
}
792
else
793
{
794
_RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(SSCL));
795
csubstr scalar;
796
size_t indentation = m_state->line_contents.indentation; // save
797
bool is_quoted;
798
if(_scan_scalar_unk(&scalar, &is_quoted))
799
{
800
_c4dbgpf("got a {} scalar", is_quoted ? "quoted" : "");
801
rem = m_state->line_contents.rem;
802
{
803
size_t first = rem.first_not_of(" \t");
804
if(first && first != npos)
805
{
806
_c4dbgpf("skip {} whitespace characters", first);
807
_line_progressed(first);
808
rem = rem.sub(first);
809
}
810
}
811
_store_scalar(scalar, is_quoted);
812
if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
813
{
814
_c4dbgpf("got a ': ' next -- it's a map (as_child={})", start_as_child);
815
_push_level();
816
_start_map(start_as_child); // wait for the val scalar to append the key-val pair
817
_set_indentation(indentation);
818
_line_progressed(2); // call this AFTER saving the indentation
819
}
820
else if(rem.begins_with(':'))
821
{
822
_c4dbgpf("got a ':' next -- it's a map (as_child={})", start_as_child);
823
_push_level();
824
_start_map(start_as_child); // wait for the val scalar to append the key-val pair
825
_set_indentation(indentation);
826
_line_progressed(1); // call this AFTER saving the indentation
827
}
828
else
829
{
830
// we still don't know whether it's a seq or a map
831
// so just store the scalar
832
}
833
return true;
834
}
835
else if(rem.begins_with_any(" \t"))
836
{
837
csubstr ws = rem.left_of(rem.first_not_of(" \t"));
838
rem = rem.right_of(ws);
839
if(has_all(RTOP) && rem.begins_with("---"))
840
{
841
_c4dbgp("there's a doc starting, and it's indented");
842
_set_indentation(ws.len);
843
}
844
_c4dbgpf("skipping {} spaces/tabs", ws.len);
845
_line_progressed(ws.len);
846
return true;
847
}
848
}
849
850
return false;
851
}
852
853
854
//-----------------------------------------------------------------------------
855
/** Progress the line past the leading run of character @p c.
 * The remaining portion of the line must begin with @p c. */
C4_ALWAYS_INLINE void Parser::_skipchars(char c)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begins_with(c));
    size_t num_skipped = m_state->line_contents.rem.first_not_of(c);
    if(num_skipped == npos)
    {
        // maybe the line is just whitespace
        num_skipped = m_state->line_contents.rem.len;
    }
    _c4dbgpf("skip {} '{}'", num_skipped, c);
    _line_progressed(num_skipped);
}
864
865
template<size_t N>
866
C4_ALWAYS_INLINE void Parser::_skipchars(const char (&chars)[N])
867
{
868
_RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begins_with_any(chars));
869
size_t pos = m_state->line_contents.rem.first_not_of(chars);
870
if(pos == npos)
871
pos = m_state->line_contents.rem.len; // maybe the line is just whitespace
872
_c4dbgpf("skip {} characters", pos);
873
_line_progressed(pos);
874
}
875
876
877
//-----------------------------------------------------------------------------
878
bool Parser::_handle_seq_flow()
879
{
880
_c4dbgpf("handle_seq_flow: node_id={} level={}", m_state->node_id, m_state->level);
881
csubstr rem = m_state->line_contents.rem;
882
883
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
884
_RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|FLOW));
885
886
if(rem.begins_with(' '))
887
{
888
// with explicit flow, indentation does not matter
889
_c4dbgp("starts with spaces");
890
_skipchars(' ');
891
return true;
892
}
893
_RYML_WITH_TAB_TOKENS(else if(rem.begins_with('\t'))
894
{
895
_c4dbgp("starts with tabs");
896
_skipchars('\t');
897
return true;
898
})
899
else if(rem.begins_with('#'))
900
{
901
_c4dbgp("it's a comment");
902
rem = _scan_comment(); // also progresses the line
903
return true;
904
}
905
else if(rem.begins_with(']'))
906
{
907
_c4dbgp("end the sequence");
908
_pop_level();
909
_line_progressed(1);
910
if(has_all(RSEQIMAP))
911
{
912
_stop_seqimap();
913
_pop_level();
914
}
915
return true;
916
}
917
918
if(has_any(RVAL))
919
{
920
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
921
bool is_quoted;
922
if(_scan_scalar_seq_flow(&rem, &is_quoted))
923
{
924
_c4dbgp("it's a scalar");
925
addrem_flags(RNXT, RVAL);
926
_append_val(rem, is_quoted);
927
return true;
928
}
929
else if(rem.begins_with('['))
930
{
931
_c4dbgp("val is a child seq");
932
addrem_flags(RNXT, RVAL); // before _push_level!
933
_push_level(/*explicit flow*/true);
934
_start_seq();
935
add_flags(FLOW);
936
_line_progressed(1);
937
return true;
938
}
939
else if(rem.begins_with('{'))
940
{
941
_c4dbgp("val is a child map");
942
addrem_flags(RNXT, RVAL); // before _push_level!
943
_push_level(/*explicit flow*/true);
944
_start_map();
945
addrem_flags(FLOW|RKEY, RVAL);
946
_line_progressed(1);
947
return true;
948
}
949
else if(rem == ':')
950
{
951
_c4dbgpf("found ':' -- there's an implicit map in the seq node[{}]", m_state->node_id);
952
_start_seqimap();
953
_line_progressed(1);
954
return true;
955
}
956
else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
957
{
958
_c4dbgpf("found ': ' -- there's an implicit map in the seq node[{}]", m_state->node_id);
959
_start_seqimap();
960
_line_progressed(2);
961
return true;
962
}
963
else if(rem.begins_with("? "))
964
{
965
_c4dbgpf("found '? ' -- there's an implicit map in the seq node[{}]", m_state->node_id);
966
_start_seqimap();
967
_line_progressed(2);
968
_RYML_CB_ASSERT(m_stack.m_callbacks, has_any(SSCL) && m_state->scalar == "");
969
addrem_flags(QMRK|RKEY, RVAL|SSCL);
970
return true;
971
}
972
else if(_handle_types())
973
{
974
return true;
975
}
976
else if(_handle_val_anchors_and_refs())
977
{
978
return true;
979
}
980
else if(rem.begins_with(", "))
981
{
982
_c4dbgp("found ',' -- the value was null");
983
_append_val_null(rem.str - 1);
984
_line_progressed(2);
985
return true;
986
}
987
else if(rem.begins_with(','))
988
{
989
_c4dbgp("found ',' -- the value was null");
990
_append_val_null(rem.str - 1);
991
_line_progressed(1);
992
return true;
993
}
994
else if(rem.begins_with('\t'))
995
{
996
_skipchars('\t');
997
return true;
998
}
999
else
1000
{
1001
_c4err("parse error");
1002
}
1003
}
1004
else if(has_any(RNXT))
1005
{
1006
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
1007
if(rem.begins_with(", "))
1008
{
1009
_RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW));
1010
_c4dbgp("seq: expect next val");
1011
addrem_flags(RVAL, RNXT);
1012
_line_progressed(2);
1013
return true;
1014
}
1015
else if(rem.begins_with(','))
1016
{
1017
_RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW));
1018
_c4dbgp("seq: expect next val");
1019
addrem_flags(RVAL, RNXT);
1020
_line_progressed(1);
1021
return true;
1022
}
1023
else if(rem == ':')
1024
{
1025
_c4dbgpf("found ':' -- there's an implicit map in the seq node[{}]", m_state->node_id);
1026
_start_seqimap();
1027
_line_progressed(1);
1028
return true;
1029
}
1030
else if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
1031
{
1032
_c4dbgpf("found ': ' -- there's an implicit map in the seq node[{}]", m_state->node_id);
1033
_start_seqimap();
1034
_line_progressed(2);
1035
return true;
1036
}
1037
else
1038
{
1039
_c4err("was expecting a comma");
1040
}
1041
}
1042
else
1043
{
1044
_c4err("internal error");
1045
}
1046
1047
return true;
1048
}
1049
1050
//-----------------------------------------------------------------------------
1051
bool Parser::_handle_seq_blck()
1052
{
1053
_c4dbgpf("handle_seq_impl: node_id={} level={}", m_state->node_id, m_state->level);
1054
csubstr rem = m_state->line_contents.rem;
1055
1056
_RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ));
1057
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
1058
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW));
1059
1060
if(rem.begins_with('#'))
1061
{
1062
_c4dbgp("it's a comment");
1063
rem = _scan_comment();
1064
return true;
1065
}
1066
if(has_any(RNXT))
1067
{
1068
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
1069
1070
if(_handle_indentation())
1071
return true;
1072
1073
if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t")))
1074
{
1075
_c4dbgp("expect another val");
1076
addrem_flags(RVAL, RNXT);
1077
_line_progressed(2);
1078
return true;
1079
}
1080
else if(rem == '-')
1081
{
1082
_c4dbgp("expect another val");
1083
addrem_flags(RVAL, RNXT);
1084
_line_progressed(1);
1085
return true;
1086
}
1087
else if(rem.begins_with_any(" \t"))
1088
{
1089
_RYML_CB_ASSERT(m_stack.m_callbacks, ! _at_line_begin());
1090
_skipchars(" \t");
1091
return true;
1092
}
1093
else if(rem.begins_with("..."))
1094
{
1095
_c4dbgp("got stream end '...'");
1096
_end_stream();
1097
_line_progressed(3);
1098
return true;
1099
}
1100
else if(rem.begins_with("---"))
1101
{
1102
_c4dbgp("got document start '---'");
1103
_start_new_doc(rem);
1104
return true;
1105
}
1106
else
1107
{
1108
_c4err("parse error");
1109
}
1110
}
1111
else if(has_any(RVAL))
1112
{
1113
// there can be empty values
1114
if(_handle_indentation())
1115
return true;
1116
1117
csubstr s;
1118
bool is_quoted;
1119
if(_scan_scalar_seq_blck(&s, &is_quoted)) // this also progresses the line
1120
{
1121
_c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : "");
1122
1123
rem = m_state->line_contents.rem;
1124
if(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(rem.begins_with_any(" \t"), rem.begins_with(' ')))
1125
{
1126
_c4dbgp("skipping whitespace...");
1127
size_t skip = rem.first_not_of(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
1128
if(skip == csubstr::npos)
1129
skip = rem.len; // maybe the line is just whitespace
1130
_line_progressed(skip);
1131
rem = rem.sub(skip);
1132
}
1133
1134
_c4dbgpf("rem=[{}]~~~{}~~~", rem.len, rem);
1135
if(!rem.begins_with('#') && (rem.ends_with(':') || rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))))
1136
{
1137
_c4dbgp("actually, the scalar is the first key of a map, and it opens a new scope");
1138
if(m_key_anchor.empty())
1139
_move_val_anchor_to_key_anchor();
1140
if(m_key_tag.empty())
1141
_move_val_tag_to_key_tag();
1142
addrem_flags(RNXT, RVAL); // before _push_level! This prepares the current level for popping by setting it to RNXT
1143
_push_level();
1144
_start_map();
1145
_store_scalar(s, is_quoted);
1146
if( ! _maybe_set_indentation_from_anchor_or_tag())
1147
{
1148
_c4dbgpf("set indentation from scalar: {}", m_state->scalar_col);
1149
_set_indentation(m_state->scalar_col); // this is the column where the scalar starts
1150
}
1151
_move_key_tag2_to_key_tag();
1152
addrem_flags(RVAL, RKEY);
1153
_line_progressed(1);
1154
}
1155
else
1156
{
1157
_c4dbgp("appending val to current seq");
1158
_append_val(s, is_quoted);
1159
addrem_flags(RNXT, RVAL);
1160
}
1161
return true;
1162
}
1163
else if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t")))
1164
{
1165
if(_rval_dash_start_or_continue_seq())
1166
_line_progressed(2);
1167
return true;
1168
}
1169
else if(rem == '-')
1170
{
1171
if(_rval_dash_start_or_continue_seq())
1172
_line_progressed(1);
1173
return true;
1174
}
1175
else if(rem.begins_with('['))
1176
{
1177
_c4dbgp("val is a child seq, flow");
1178
addrem_flags(RNXT, RVAL); // before _push_level!
1179
_push_level(/*explicit flow*/true);
1180
_start_seq();
1181
add_flags(FLOW);
1182
_line_progressed(1);
1183
return true;
1184
}
1185
else if(rem.begins_with('{'))
1186
{
1187
_c4dbgp("val is a child map, flow");
1188
addrem_flags(RNXT, RVAL); // before _push_level!
1189
_push_level(/*explicit flow*/true);
1190
_start_map();
1191
addrem_flags(FLOW|RKEY, RVAL);
1192
_line_progressed(1);
1193
return true;
1194
}
1195
else if(rem.begins_with("? "))
1196
{
1197
_c4dbgp("val is a child map + this key is complex");
1198
addrem_flags(RNXT, RVAL); // before _push_level!
1199
_push_level();
1200
_start_map();
1201
addrem_flags(QMRK|RKEY, RVAL);
1202
_save_indentation();
1203
_line_progressed(2);
1204
return true;
1205
}
1206
else if(rem.begins_with(' '))
1207
{
1208
csubstr spc = rem.left_of(rem.first_not_of(' '));
1209
if(_at_line_begin())
1210
{
1211
_c4dbgpf("skipping value indentation: {} spaces", spc.len);
1212
_line_progressed(spc.len);
1213
return true;
1214
}
1215
else
1216
{
1217
_c4dbgpf("skipping {} spaces", spc.len);
1218
_line_progressed(spc.len);
1219
return true;
1220
}
1221
}
1222
else if(_handle_types())
1223
{
1224
return true;
1225
}
1226
else if(_handle_val_anchors_and_refs())
1227
{
1228
return true;
1229
}
1230
/* pathological case:
1231
* - &key : val
1232
* - &key :
1233
* - : val
1234
*/
1235
else if((!has_all(SSCL)) &&
1236
(rem.begins_with(": ") || rem.left_of(rem.find("#")).trimr("\t") == ":"))
1237
{
1238
if(!m_val_anchor.empty() || !m_val_tag.empty())
1239
{
1240
_c4dbgp("val is a child map + this key is empty, with anchors or tags");
1241
addrem_flags(RNXT, RVAL); // before _push_level!
1242
_move_val_tag_to_key_tag();
1243
_move_val_anchor_to_key_anchor();
1244
_push_level();
1245
_start_map();
1246
_store_scalar_null(rem.str);
1247
addrem_flags(RVAL, RKEY);
1248
RYML_CHECK(_maybe_set_indentation_from_anchor_or_tag()); // one of them must exist
1249
_line_progressed(rem.begins_with(": ") ? 2u : 1u);
1250
return true;
1251
}
1252
else
1253
{
1254
_c4dbgp("val is a child map + this key is empty, no anchors or tags");
1255
addrem_flags(RNXT, RVAL); // before _push_level!
1256
size_t ind = m_state->indref;
1257
_push_level();
1258
_start_map();
1259
_store_scalar_null(rem.str);
1260
addrem_flags(RVAL, RKEY);
1261
_c4dbgpf("set indentation from map anchor: {}", ind + 2);
1262
_set_indentation(ind + 2); // this is the column where the map starts
1263
_line_progressed(rem.begins_with(": ") ? 2u : 1u);
1264
return true;
1265
}
1266
}
1267
else
1268
{
1269
_c4err("parse error");
1270
}
1271
}
1272
1273
return false;
1274
}
1275
1276
//-----------------------------------------------------------------------------
1277
1278
bool Parser::_rval_dash_start_or_continue_seq()
1279
{
1280
size_t ind = m_state->line_contents.current_col();
1281
_RYML_CB_ASSERT(m_stack.m_callbacks, ind >= m_state->indref);
1282
size_t delta_ind = ind - m_state->indref;
1283
if( ! delta_ind)
1284
{
1285
_c4dbgp("prev val was empty");
1286
addrem_flags(RNXT, RVAL);
1287
_append_val_null(&m_state->line_contents.full[ind]);
1288
return false;
1289
}
1290
_c4dbgp("val is a nested seq, indented");
1291
addrem_flags(RNXT, RVAL); // before _push_level!
1292
_push_level();
1293
_start_seq();
1294
_save_indentation();
1295
return true;
1296
}
1297
1298
//-----------------------------------------------------------------------------
1299
bool Parser::_handle_map_flow()
1300
{
1301
// explicit flow, ie, inside {}, separated by commas
1302
_c4dbgpf("handle_map_flow: node_id={} level={}", m_state->node_id, m_state->level);
1303
csubstr rem = m_state->line_contents.rem;
1304
1305
_RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP|FLOW));
1306
1307
if(rem.begins_with(' '))
1308
{
1309
// with explicit flow, indentation does not matter
1310
_c4dbgp("starts with spaces");
1311
_skipchars(' ');
1312
return true;
1313
}
1314
_RYML_WITH_TAB_TOKENS(else if(rem.begins_with('\t'))
1315
{
1316
// with explicit flow, indentation does not matter
1317
_c4dbgp("starts with tabs");
1318
_skipchars('\t');
1319
return true;
1320
})
1321
else if(rem.begins_with('#'))
1322
{
1323
_c4dbgp("it's a comment");
1324
rem = _scan_comment(); // also progresses the line
1325
return true;
1326
}
1327
else if(rem.begins_with('}'))
1328
{
1329
_c4dbgp("end the map");
1330
if(has_all(SSCL))
1331
{
1332
_c4dbgp("the last val was null");
1333
_append_key_val_null(rem.str - 1);
1334
rem_flags(RVAL);
1335
}
1336
_pop_level();
1337
_line_progressed(1);
1338
if(has_all(RSEQIMAP))
1339
{
1340
_c4dbgp("stopping implicitly nested 1x map");
1341
_stop_seqimap();
1342
_pop_level();
1343
}
1344
return true;
1345
}
1346
1347
if(has_any(RNXT))
1348
{
1349
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
1350
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
1351
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RSEQIMAP));
1352
1353
if(rem.begins_with(", "))
1354
{
1355
_c4dbgp("seq: expect next keyval");
1356
addrem_flags(RKEY, RNXT);
1357
_line_progressed(2);
1358
return true;
1359
}
1360
else if(rem.begins_with(','))
1361
{
1362
_c4dbgp("seq: expect next keyval");
1363
addrem_flags(RKEY, RNXT);
1364
_line_progressed(1);
1365
return true;
1366
}
1367
else
1368
{
1369
_c4err("parse error");
1370
}
1371
}
1372
else if(has_any(RKEY))
1373
{
1374
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
1375
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
1376
1377
bool is_quoted;
1378
if(has_none(SSCL) && _scan_scalar_map_flow(&rem, &is_quoted))
1379
{
1380
_c4dbgp("it's a scalar");
1381
_store_scalar(rem, is_quoted);
1382
rem = m_state->line_contents.rem;
1383
csubstr trimmed = rem.triml(" \t");
1384
if(trimmed.len && (trimmed.begins_with(": ") || trimmed.begins_with_any(":,}") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t"))))
1385
{
1386
_RYML_CB_ASSERT(m_stack.m_callbacks, trimmed.str >= rem.str);
1387
size_t num = static_cast<size_t>(trimmed.str - rem.str);
1388
_c4dbgpf("trimming {} whitespace after the scalar: '{}' --> '{}'", num, rem, rem.sub(num));
1389
rem = rem.sub(num);
1390
_line_progressed(num);
1391
}
1392
}
1393
1394
if(rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
1395
{
1396
_c4dbgp("wait for val");
1397
addrem_flags(RVAL, RKEY|QMRK);
1398
_line_progressed(2);
1399
if(!has_all(SSCL))
1400
{
1401
_c4dbgp("no key was found, defaulting to empty key ''");
1402
_store_scalar_null(rem.str);
1403
}
1404
return true;
1405
}
1406
else if(rem == ':')
1407
{
1408
_c4dbgp("wait for val");
1409
addrem_flags(RVAL, RKEY|QMRK);
1410
_line_progressed(1);
1411
if(!has_all(SSCL))
1412
{
1413
_c4dbgp("no key was found, defaulting to empty key ''");
1414
_store_scalar_null(rem.str);
1415
}
1416
return true;
1417
}
1418
else if(rem.begins_with('?'))
1419
{
1420
_c4dbgp("complex key");
1421
add_flags(QMRK);
1422
_line_progressed(1);
1423
return true;
1424
}
1425
else if(rem.begins_with(','))
1426
{
1427
_c4dbgp("prev scalar was a key with null value");
1428
_append_key_val_null(rem.str - 1);
1429
_line_progressed(1);
1430
return true;
1431
}
1432
else if(rem.begins_with('}'))
1433
{
1434
_c4dbgp("map terminates after a key...");
1435
_RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL));
1436
_c4dbgp("the last val was null");
1437
_append_key_val_null(rem.str - 1);
1438
rem_flags(RVAL);
1439
if(has_all(RSEQIMAP))
1440
{
1441
_c4dbgp("stopping implicitly nested 1x map");
1442
_stop_seqimap();
1443
_pop_level();
1444
}
1445
_pop_level();
1446
_line_progressed(1);
1447
return true;
1448
}
1449
else if(_handle_types())
1450
{
1451
return true;
1452
}
1453
else if(_handle_key_anchors_and_refs())
1454
{
1455
return true;
1456
}
1457
else if(rem == "")
1458
{
1459
return true;
1460
}
1461
else
1462
{
1463
size_t pos = rem.first_not_of(" \t");
1464
if(pos == csubstr::npos)
1465
pos = 0;
1466
rem = rem.sub(pos);
1467
if(rem.begins_with(':'))
1468
{
1469
_c4dbgp("wait for val");
1470
addrem_flags(RVAL, RKEY|QMRK);
1471
_line_progressed(pos + 1);
1472
if(!has_all(SSCL))
1473
{
1474
_c4dbgp("no key was found, defaulting to empty key ''");
1475
_store_scalar_null(rem.str);
1476
}
1477
return true;
1478
}
1479
else if(rem.begins_with('#'))
1480
{
1481
_c4dbgp("it's a comment");
1482
_line_progressed(pos);
1483
rem = _scan_comment(); // also progresses the line
1484
return true;
1485
}
1486
else
1487
{
1488
_c4err("parse error");
1489
}
1490
}
1491
}
1492
else if(has_any(RVAL))
1493
{
1494
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
1495
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
1496
_RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL));
1497
bool is_quoted;
1498
if(_scan_scalar_map_flow(&rem, &is_quoted))
1499
{
1500
_c4dbgp("it's a scalar");
1501
addrem_flags(RNXT, RVAL|RKEY);
1502
_append_key_val(rem, is_quoted);
1503
if(has_all(RSEQIMAP))
1504
{
1505
_c4dbgp("stopping implicitly nested 1x map");
1506
_stop_seqimap();
1507
_pop_level();
1508
}
1509
return true;
1510
}
1511
else if(rem.begins_with('['))
1512
{
1513
_c4dbgp("val is a child seq");
1514
addrem_flags(RNXT, RVAL|RKEY); // before _push_level!
1515
_push_level(/*explicit flow*/true);
1516
_move_scalar_from_top();
1517
_start_seq();
1518
add_flags(FLOW);
1519
_line_progressed(1);
1520
return true;
1521
}
1522
else if(rem.begins_with('{'))
1523
{
1524
_c4dbgp("val is a child map");
1525
addrem_flags(RNXT, RVAL|RKEY); // before _push_level!
1526
_push_level(/*explicit flow*/true);
1527
_move_scalar_from_top();
1528
_start_map();
1529
addrem_flags(FLOW|RKEY, RNXT|RVAL);
1530
_line_progressed(1);
1531
return true;
1532
}
1533
else if(_handle_types())
1534
{
1535
return true;
1536
}
1537
else if(_handle_val_anchors_and_refs())
1538
{
1539
return true;
1540
}
1541
else if(rem.begins_with(','))
1542
{
1543
_c4dbgp("appending empty val");
1544
_append_key_val_null(rem.str - 1);
1545
addrem_flags(RKEY, RVAL);
1546
_line_progressed(1);
1547
if(has_any(RSEQIMAP))
1548
{
1549
_c4dbgp("stopping implicitly nested 1x map");
1550
_stop_seqimap();
1551
_pop_level();
1552
}
1553
return true;
1554
}
1555
else if(has_any(RSEQIMAP) && rem.begins_with(']'))
1556
{
1557
_c4dbgp("stopping implicitly nested 1x map");
1558
if(has_any(SSCL))
1559
{
1560
_append_key_val_null(rem.str - 1);
1561
}
1562
_stop_seqimap();
1563
_pop_level();
1564
return true;
1565
}
1566
else
1567
{
1568
_c4err("parse error");
1569
}
1570
}
1571
else
1572
{
1573
_c4err("internal error");
1574
}
1575
1576
return false;
1577
}
1578
1579
//-----------------------------------------------------------------------------
1580
bool Parser::_handle_map_blck()
1581
{
1582
_c4dbgpf("handle_map_blck: node_id={} level={}", m_state->node_id, m_state->level);
1583
csubstr rem = m_state->line_contents.rem;
1584
1585
_RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RMAP));
1586
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW));
1587
1588
if(rem.begins_with('#'))
1589
{
1590
_c4dbgp("it's a comment");
1591
rem = _scan_comment();
1592
return true;
1593
}
1594
1595
if(has_any(RNXT))
1596
{
1597
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
1598
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
1599
// actually, we don't need RNXT in indent-based maps.
1600
addrem_flags(RKEY, RNXT);
1601
}
1602
1603
if(_handle_indentation())
1604
{
1605
_c4dbgp("indentation token");
1606
return true;
1607
}
1608
1609
if(has_any(RKEY))
1610
{
1611
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
1612
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RVAL));
1613
1614
_c4dbgp("RMAP|RKEY read scalar?");
1615
bool is_quoted;
1616
if(_scan_scalar_map_blck(&rem, &is_quoted)) // this also progresses the line
1617
{
1618
_c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : "");
1619
if(has_all(QMRK|SSCL))
1620
{
1621
_c4dbgpf("current key is QMRK; SSCL is set. so take store scalar='{}' as key and add an empty val", m_state->scalar);
1622
_append_key_val_null(rem.str - 1);
1623
}
1624
_store_scalar(rem, is_quoted);
1625
if(has_all(QMRK|RSET))
1626
{
1627
_c4dbgp("it's a complex key, so use null value '~'");
1628
_append_key_val_null(rem.str);
1629
}
1630
rem = m_state->line_contents.rem;
1631
1632
if(rem.begins_with(':'))
1633
{
1634
_c4dbgp("wait for val");
1635
addrem_flags(RVAL, RKEY|QMRK);
1636
_line_progressed(1);
1637
rem = m_state->line_contents.rem;
1638
if(rem.begins_with_any(" \t"))
1639
{
1640
_RYML_CB_ASSERT(m_stack.m_callbacks, ! _at_line_begin());
1641
rem = rem.left_of(rem.first_not_of(" \t"));
1642
_c4dbgpf("skip {} spaces/tabs", rem.len);
1643
_line_progressed(rem.len);
1644
}
1645
}
1646
return true;
1647
}
1648
else if(rem.begins_with_any(" \t"))
1649
{
1650
size_t pos = rem.first_not_of(" \t");
1651
if(pos == npos)
1652
pos = rem.len;
1653
_c4dbgpf("skip {} spaces/tabs", pos);
1654
_line_progressed(pos);
1655
return true;
1656
}
1657
else if(rem == '?' || rem.begins_with("? "))
1658
{
1659
_c4dbgp("it's a complex key");
1660
_line_progressed(rem.begins_with("? ") ? 2u : 1u);
1661
if(has_any(SSCL))
1662
_append_key_val_null(rem.str - 1);
1663
add_flags(QMRK);
1664
return true;
1665
}
1666
else if(has_all(QMRK) && rem.begins_with(':'))
1667
{
1668
_c4dbgp("complex key finished");
1669
if(!has_any(SSCL))
1670
_store_scalar_null(rem.str);
1671
addrem_flags(RVAL, RKEY|QMRK);
1672
_line_progressed(1);
1673
rem = m_state->line_contents.rem;
1674
if(rem.begins_with(' '))
1675
{
1676
_RYML_CB_ASSERT(m_stack.m_callbacks, ! _at_line_begin());
1677
_skipchars(' ');
1678
}
1679
return true;
1680
}
1681
else if(rem == ':' || rem.begins_with(": ") _RYML_WITH_TAB_TOKENS( || rem.begins_with(":\t")))
1682
{
1683
_c4dbgp("key finished");
1684
if(!has_all(SSCL))
1685
{
1686
_c4dbgp("key was empty...");
1687
_store_scalar_null(rem.str);
1688
rem_flags(QMRK);
1689
}
1690
addrem_flags(RVAL, RKEY);
1691
_line_progressed(rem == ':' ? 1 : 2);
1692
return true;
1693
}
1694
else if(rem.begins_with("..."))
1695
{
1696
_c4dbgp("end current document");
1697
_end_stream();
1698
_line_progressed(3);
1699
return true;
1700
}
1701
else if(rem.begins_with("---"))
1702
{
1703
_c4dbgp("start new document '---'");
1704
_start_new_doc(rem);
1705
return true;
1706
}
1707
else if(_handle_types())
1708
{
1709
return true;
1710
}
1711
else if(_handle_key_anchors_and_refs())
1712
{
1713
return true;
1714
}
1715
else
1716
{
1717
_c4err("parse error");
1718
}
1719
}
1720
else if(has_any(RVAL))
1721
{
1722
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RNXT));
1723
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RKEY));
1724
1725
_c4dbgp("RMAP|RVAL read scalar?");
1726
csubstr s;
1727
bool is_quoted;
1728
if(_scan_scalar_map_blck(&s, &is_quoted)) // this also progresses the line
1729
{
1730
_c4dbgpf("it's a{} scalar", is_quoted ? " quoted" : "");
1731
1732
rem = m_state->line_contents.rem;
1733
1734
if(rem.begins_with(": "))
1735
{
1736
_c4dbgp("actually, the scalar is the first key of a map");
1737
addrem_flags(RKEY, RVAL); // before _push_level! This prepares the current level for popping by setting it to RNXT
1738
_push_level();
1739
_move_scalar_from_top();
1740
_move_val_anchor_to_key_anchor();
1741
_start_map();
1742
_save_indentation(m_state->scalar_col);
1743
addrem_flags(RVAL, RKEY);
1744
_line_progressed(2);
1745
}
1746
else if(rem.begins_with(':'))
1747
{
1748
_c4dbgp("actually, the scalar is the first key of a map, and it opens a new scope");
1749
addrem_flags(RKEY, RVAL); // before _push_level! This prepares the current level for popping by setting it to RNXT
1750
_push_level();
1751
_move_scalar_from_top();
1752
_move_val_anchor_to_key_anchor();
1753
_start_map();
1754
_save_indentation(/*behind*/s.len);
1755
addrem_flags(RVAL, RKEY);
1756
_line_progressed(1);
1757
}
1758
else
1759
{
1760
_c4dbgp("appending keyval to current map");
1761
_append_key_val(s, is_quoted);
1762
addrem_flags(RKEY, RVAL);
1763
}
1764
return true;
1765
}
1766
else if(rem.begins_with("- ") _RYML_WITH_TAB_TOKENS( || rem.begins_with("-\t")))
1767
{
1768
_c4dbgp("val is a nested seq, indented");
1769
addrem_flags(RKEY, RVAL); // before _push_level!
1770
_push_level();
1771
_move_scalar_from_top();
1772
_start_seq();
1773
_save_indentation();
1774
_line_progressed(2);
1775
return true;
1776
}
1777
else if(rem == '-')
1778
{
1779
_c4dbgp("maybe a seq. start unknown, indented");
1780
_start_unk();
1781
_save_indentation();
1782
_line_progressed(1);
1783
return true;
1784
}
1785
else if(rem.begins_with('['))
1786
{
1787
_c4dbgp("val is a child seq, flow");
1788
addrem_flags(RKEY, RVAL); // before _push_level!
1789
_push_level(/*explicit flow*/true);
1790
_move_scalar_from_top();
1791
_start_seq();
1792
add_flags(FLOW);
1793
_line_progressed(1);
1794
return true;
1795
}
1796
else if(rem.begins_with('{'))
1797
{
1798
_c4dbgp("val is a child map, flow");
1799
addrem_flags(RKEY, RVAL); // before _push_level!
1800
_push_level(/*explicit flow*/true);
1801
_move_scalar_from_top();
1802
_start_map();
1803
addrem_flags(FLOW|RKEY, RVAL);
1804
_line_progressed(1);
1805
return true;
1806
}
1807
else if(rem.begins_with(' '))
1808
{
1809
csubstr spc = rem.left_of(rem.first_not_of(' '));
1810
if(_at_line_begin())
1811
{
1812
_c4dbgpf("skipping value indentation: {} spaces", spc.len);
1813
_line_progressed(spc.len);
1814
return true;
1815
}
1816
else
1817
{
1818
_c4dbgpf("skipping {} spaces", spc.len);
1819
_line_progressed(spc.len);
1820
return true;
1821
}
1822
}
1823
else if(_handle_types())
1824
{
1825
return true;
1826
}
1827
else if(_handle_val_anchors_and_refs())
1828
{
1829
return true;
1830
}
1831
else if(rem.begins_with("--- ") || rem == "---" || rem.begins_with("---\t"))
1832
{
1833
_start_new_doc(rem);
1834
return true;
1835
}
1836
else if(rem.begins_with("..."))
1837
{
1838
_c4dbgp("end current document");
1839
_end_stream();
1840
_line_progressed(3);
1841
return true;
1842
}
1843
else
1844
{
1845
_c4err("parse error");
1846
}
1847
}
1848
else
1849
{
1850
_c4err("internal error");
1851
}
1852
1853
return false;
1854
}
1855
1856
1857
//-----------------------------------------------------------------------------
1858
bool Parser::_handle_top()
1859
{
1860
_c4dbgp("handle_top");
1861
csubstr rem = m_state->line_contents.rem;
1862
1863
if(rem.begins_with('#'))
1864
{
1865
_c4dbgp("a comment line");
1866
_scan_comment();
1867
return true;
1868
}
1869
1870
csubstr trimmed = rem.triml(' ');
1871
1872
if(trimmed.begins_with('%'))
1873
{
1874
_handle_directive(trimmed);
1875
_line_progressed(rem.len);
1876
return true;
1877
}
1878
else if(trimmed.begins_with("--- ") || trimmed == "---" || trimmed.begins_with("---\t"))
1879
{
1880
_start_new_doc(rem);
1881
if(trimmed.len < rem.len)
1882
{
1883
_line_progressed(rem.len - trimmed.len);
1884
_save_indentation();
1885
}
1886
return true;
1887
}
1888
else if(trimmed.begins_with("..."))
1889
{
1890
_c4dbgp("end current document");
1891
_end_stream();
1892
if(trimmed.len < rem.len)
1893
{
1894
_line_progressed(rem.len - trimmed.len);
1895
}
1896
_line_progressed(3);
1897
return true;
1898
}
1899
else
1900
{
1901
_c4err("parse error");
1902
}
1903
1904
return false;
1905
}
1906
1907
1908
//-----------------------------------------------------------------------------
1909
1910
bool Parser::_handle_key_anchors_and_refs()
1911
{
1912
_RYML_CB_ASSERT(m_stack.m_callbacks, !has_any(RVAL));
1913
const csubstr rem = m_state->line_contents.rem;
1914
if(rem.begins_with('&'))
1915
{
1916
_c4dbgp("found a key anchor!!!");
1917
if(has_all(QMRK|SSCL))
1918
{
1919
_RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY));
1920
_c4dbgp("there is a stored key, so this anchor is for the next element");
1921
_append_key_val_null(rem.str - 1);
1922
rem_flags(QMRK);
1923
return true;
1924
}
1925
csubstr anchor = rem.left_of(rem.first_of(' '));
1926
_line_progressed(anchor.len);
1927
anchor = anchor.sub(1); // skip the first character
1928
_move_key_anchor_to_val_anchor();
1929
_c4dbgpf("key anchor value: '{}'", anchor);
1930
m_key_anchor = anchor;
1931
m_key_anchor_indentation = m_state->line_contents.current_col(rem);
1932
return true;
1933
}
1934
else if(C4_UNLIKELY(rem.begins_with('*')))
1935
{
1936
_c4err("not implemented - this should have been catched elsewhere");
1937
C4_NEVER_REACH();
1938
return false;
1939
}
1940
return false;
1941
}
1942
1943
bool Parser::_handle_val_anchors_and_refs()
1944
{
1945
_RYML_CB_ASSERT(m_stack.m_callbacks, !has_any(RKEY));
1946
const csubstr rem = m_state->line_contents.rem;
1947
if(rem.begins_with('&'))
1948
{
1949
csubstr anchor = rem.left_of(rem.first_of(' '));
1950
_line_progressed(anchor.len);
1951
anchor = anchor.sub(1); // skip the first character
1952
_c4dbgpf("val: found an anchor: '{}', indentation={}!!!", anchor, m_state->line_contents.current_col(rem));
1953
if(m_val_anchor.empty())
1954
{
1955
_c4dbgpf("save val anchor: '{}'", anchor);
1956
m_val_anchor = anchor;
1957
m_val_anchor_indentation = m_state->line_contents.current_col(rem);
1958
}
1959
else
1960
{
1961
_c4dbgpf("there is a pending val anchor '{}'", m_val_anchor);
1962
if(m_tree->is_seq(m_state->node_id))
1963
{
1964
if(m_tree->has_children(m_state->node_id))
1965
{
1966
_c4dbgpf("current node={} is a seq, has {} children", m_state->node_id, m_tree->num_children(m_state->node_id));
1967
_c4dbgpf("... so take the new one as a key anchor '{}'", anchor);
1968
m_key_anchor = anchor;
1969
m_key_anchor_indentation = m_state->line_contents.current_col(rem);
1970
}
1971
else
1972
{
1973
_c4dbgpf("current node={} is a seq, has no children", m_state->node_id);
1974
if(m_tree->has_val_anchor(m_state->node_id))
1975
{
1976
_c4dbgpf("... node={} already has val anchor: '{}'", m_state->node_id, m_tree->val_anchor(m_state->node_id));
1977
_c4dbgpf("... so take the new one as a key anchor '{}'", anchor);
1978
m_key_anchor = anchor;
1979
m_key_anchor_indentation = m_state->line_contents.current_col(rem);
1980
}
1981
else
1982
{
1983
_c4dbgpf("... so set pending val anchor: '{}' on current node {}", m_val_anchor, m_state->node_id);
1984
m_tree->set_val_anchor(m_state->node_id, m_val_anchor);
1985
m_val_anchor = anchor;
1986
m_val_anchor_indentation = m_state->line_contents.current_col(rem);
1987
}
1988
}
1989
}
1990
}
1991
return true;
1992
}
1993
else if(C4_UNLIKELY(rem.begins_with('*')))
1994
{
1995
_c4err("not implemented - this should have been catched elsewhere");
1996
C4_NEVER_REACH();
1997
return false;
1998
}
1999
return false;
2000
}
2001
2002
void Parser::_move_key_anchor_to_val_anchor()
2003
{
2004
if(m_key_anchor.empty())
2005
return;
2006
_c4dbgpf("move current key anchor to val slot: key='{}' -> val='{}'", m_key_anchor, m_val_anchor);
2007
if(!m_val_anchor.empty())
2008
_c4err("triple-pending anchor");
2009
m_val_anchor = m_key_anchor;
2010
m_val_anchor_indentation = m_key_anchor_indentation;
2011
m_key_anchor = {};
2012
m_key_anchor_indentation = {};
2013
}
2014
2015
void Parser::_move_val_anchor_to_key_anchor()
2016
{
2017
if(m_val_anchor.empty())
2018
return;
2019
if(!_token_is_from_this_line(m_val_anchor))
2020
return;
2021
_c4dbgpf("move current val anchor to key slot: key='{}' <- val='{}'", m_key_anchor, m_val_anchor);
2022
if(!m_key_anchor.empty())
2023
_c4err("triple-pending anchor");
2024
m_key_anchor = m_val_anchor;
2025
m_key_anchor_indentation = m_val_anchor_indentation;
2026
m_val_anchor = {};
2027
m_val_anchor_indentation = {};
2028
}
2029
2030
void Parser::_move_key_tag_to_val_tag()
2031
{
2032
if(m_key_tag.empty())
2033
return;
2034
_c4dbgpf("move key tag to val tag: key='{}' -> val='{}'", m_key_tag, m_val_tag);
2035
m_val_tag = m_key_tag;
2036
m_val_tag_indentation = m_key_tag_indentation;
2037
m_key_tag.clear();
2038
m_key_tag_indentation = 0;
2039
}
2040
2041
void Parser::_move_val_tag_to_key_tag()
2042
{
2043
if(m_val_tag.empty())
2044
return;
2045
if(!_token_is_from_this_line(m_val_tag))
2046
return;
2047
_c4dbgpf("move val tag to key tag: key='{}' <- val='{}'", m_key_tag, m_val_tag);
2048
m_key_tag = m_val_tag;
2049
m_key_tag_indentation = m_val_tag_indentation;
2050
m_val_tag.clear();
2051
m_val_tag_indentation = 0;
2052
}
2053
2054
void Parser::_move_key_tag2_to_key_tag()
2055
{
2056
if(m_key_tag2.empty())
2057
return;
2058
_c4dbgpf("move key tag2 to key tag: key='{}' <- key2='{}'", m_key_tag, m_key_tag2);
2059
m_key_tag = m_key_tag2;
2060
m_key_tag_indentation = m_key_tag2_indentation;
2061
m_key_tag2.clear();
2062
m_key_tag2_indentation = 0;
2063
}
2064
2065
2066
//-----------------------------------------------------------------------------
2067
2068
bool Parser::_handle_types()
2069
{
2070
csubstr rem = m_state->line_contents.rem.triml(' ');
2071
csubstr t;
2072
2073
if(rem.begins_with("!!"))
2074
{
2075
_c4dbgp("begins with '!!'");
2076
t = rem.left_of(rem.first_of(" ,"));
2077
_RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 2);
2078
//t = t.sub(2);
2079
if(t == "!!set")
2080
add_flags(RSET);
2081
}
2082
else if(rem.begins_with("!<"))
2083
{
2084
_c4dbgp("begins with '!<'");
2085
t = rem.left_of(rem.first_of('>'), true);
2086
_RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 2);
2087
//t = t.sub(2, t.len-1);
2088
}
2089
else if(rem.begins_with("!h!"))
2090
{
2091
_c4dbgp("begins with '!h!'");
2092
t = rem.left_of(rem.first_of(' '));
2093
_RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 3);
2094
//t = t.sub(3);
2095
}
2096
else if(rem.begins_with('!'))
2097
{
2098
_c4dbgp("begins with '!'");
2099
t = rem.left_of(rem.first_of(' '));
2100
_RYML_CB_ASSERT(m_stack.m_callbacks, t.len >= 1);
2101
//t = t.sub(1);
2102
}
2103
2104
if(t.empty())
2105
return false;
2106
2107
if(has_all(QMRK|SSCL))
2108
{
2109
_RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY));
2110
_c4dbgp("there is a stored key, so this tag is for the next element");
2111
_append_key_val_null(rem.str - 1);
2112
rem_flags(QMRK);
2113
}
2114
2115
#ifdef RYML_NO_COVERAGE__TO_BE_DELETED
2116
const char *tag_beginning = rem.str;
2117
#endif
2118
size_t tag_indentation = m_state->line_contents.current_col(t);
2119
_c4dbgpf("there was a tag: '{}', indentation={}", t, tag_indentation);
2120
_RYML_CB_ASSERT(m_stack.m_callbacks, t.end() > m_state->line_contents.rem.begin());
2121
_line_progressed(static_cast<size_t>(t.end() - m_state->line_contents.rem.begin()));
2122
{
2123
size_t pos = m_state->line_contents.rem.first_not_of(" \t");
2124
if(pos != csubstr::npos)
2125
_line_progressed(pos);
2126
}
2127
2128
if(has_all(RMAP|RKEY))
2129
{
2130
_c4dbgpf("saving map key tag '{}'", t);
2131
_RYML_CB_ASSERT(m_stack.m_callbacks, m_key_tag.empty());
2132
m_key_tag = t;
2133
m_key_tag_indentation = tag_indentation;
2134
}
2135
else if(has_all(RMAP|RVAL))
2136
{
2137
/* foo: !!str
2138
* !!str : bar */
2139
rem = m_state->line_contents.rem;
2140
rem = rem.left_of(rem.find("#"));
2141
rem = rem.trimr(" \t");
2142
_c4dbgpf("rem='{}'", rem);
2143
#ifdef RYML_NO_COVERAGE__TO_BE_DELETED
2144
if(rem == ':' || rem.begins_with(": "))
2145
{
2146
_c4dbgp("the last val was null, and this is a tag from a null key");
2147
_append_key_val_null(tag_beginning - 1);
2148
_store_scalar_null(rem.str - 1);
2149
// do not change the flag to key, it is ~
2150
_RYML_CB_ASSERT(m_stack.m_callbacks, rem.begin() > m_state->line_contents.rem.begin());
2151
size_t token_len = rem == ':' ? 1 : 2;
2152
_line_progressed(static_cast<size_t>(token_len + rem.begin() - m_state->line_contents.rem.begin()));
2153
}
2154
#endif
2155
_c4dbgpf("saving map val tag '{}'", t);
2156
_RYML_CB_ASSERT(m_stack.m_callbacks, m_val_tag.empty());
2157
m_val_tag = t;
2158
m_val_tag_indentation = tag_indentation;
2159
}
2160
else if(has_all(RSEQ|RVAL) || has_all(RTOP|RUNK|NDOC))
2161
{
2162
if(m_val_tag.empty())
2163
{
2164
_c4dbgpf("saving seq/doc val tag '{}'", t);
2165
m_val_tag = t;
2166
m_val_tag_indentation = tag_indentation;
2167
}
2168
else
2169
{
2170
_c4dbgpf("saving seq/doc key tag '{}'", t);
2171
m_key_tag = t;
2172
m_key_tag_indentation = tag_indentation;
2173
}
2174
}
2175
else if(has_all(RTOP|RUNK) || has_any(RUNK))
2176
{
2177
rem = m_state->line_contents.rem;
2178
rem = rem.left_of(rem.find("#"));
2179
rem = rem.trimr(" \t");
2180
if(rem.empty())
2181
{
2182
_c4dbgpf("saving val tag '{}'", t);
2183
_RYML_CB_ASSERT(m_stack.m_callbacks, m_val_tag.empty());
2184
m_val_tag = t;
2185
m_val_tag_indentation = tag_indentation;
2186
}
2187
else
2188
{
2189
_c4dbgpf("saving key tag '{}'", t);
2190
if(m_key_tag.empty())
2191
{
2192
m_key_tag = t;
2193
m_key_tag_indentation = tag_indentation;
2194
}
2195
else
2196
{
2197
/* handle this case:
2198
* !!str foo: !!map
2199
* !!int 1: !!float 20.0
2200
* !!int 3: !!float 40.0
2201
*
2202
* (m_key_tag would be !!str and m_key_tag2 would be !!int)
2203
*/
2204
m_key_tag2 = t;
2205
m_key_tag2_indentation = tag_indentation;
2206
}
2207
}
2208
}
2209
else
2210
{
2211
_c4err("internal error");
2212
}
2213
2214
if(m_val_tag.not_empty())
2215
{
2216
YamlTag_e tag = to_tag(t);
2217
if(tag == TAG_STR)
2218
{
2219
_c4dbgpf("tag '{}' is a str-type tag", t);
2220
if(has_all(RTOP|RUNK|NDOC))
2221
{
2222
_c4dbgpf("docval. slurping the string. pos={}", m_state->pos.offset);
2223
csubstr scalar = _slurp_doc_scalar();
2224
_c4dbgpf("docval. after slurp: {}, at node {}: '{}'", m_state->pos.offset, m_state->node_id, scalar);
2225
m_tree->to_val(m_state->node_id, scalar, DOC);
2226
_c4dbgpf("docval. val tag {} -> {}", m_val_tag, normalize_tag(m_val_tag));
2227
m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag));
2228
m_val_tag.clear();
2229
if(!m_val_anchor.empty())
2230
{
2231
_c4dbgpf("setting val anchor[{}]='{}'", m_state->node_id, m_val_anchor);
2232
m_tree->set_val_anchor(m_state->node_id, m_val_anchor);
2233
m_val_anchor.clear();
2234
}
2235
_end_stream();
2236
}
2237
}
2238
}
2239
return true;
2240
}
2241
2242
//-----------------------------------------------------------------------------
2243
/** Scan the scalar that makes up the value of a document.
 *
 * The current line is asserted to contain a `---` token. Leading
 * blanks are skipped, any anchor/reference is handed to
 * _handle_val_anchors_and_refs(), and then the scalar is scanned by the
 * scanner matching its first character (single-quoted, double-quoted,
 * block, or plain).
 *
 * @return the scanned scalar */
csubstr Parser::_slurp_doc_scalar()
{
    csubstr doc_scalar = m_state->line_contents.rem;
    size_t start_offset = m_state->pos.offset;
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.full.find("---") != csubstr::npos);
    _c4dbgpf("slurp 0 '{}'. REM='{}'", doc_scalar, m_buf.sub(m_state->pos.offset));
    if(doc_scalar.len == 0)
    {
        // nothing remains on this line: move on to the following one
        _line_ended();
        _scan_line();
        doc_scalar = m_state->line_contents.rem;
        start_offset = m_state->pos.offset;
    }

    size_t first_nonblank = doc_scalar.first_not_of(" \t");
    _c4dbgpf("slurp 1 '{}'. REM='{}'", doc_scalar, m_buf.sub(m_state->pos.offset));
    if(first_nonblank != npos)
    {
        _line_progressed(first_nonblank);
        doc_scalar = m_state->line_contents.rem;
        start_offset = m_state->pos.offset;
        _c4dbgpf("slurp 2 '{}'. REM='{}'", doc_scalar, m_buf.sub(m_state->pos.offset));
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, m_val_anchor.empty());
    _handle_val_anchors_and_refs();
    if(!m_val_anchor.empty())
    {
        // an anchor was stored: skip the blanks separating it from the scalar
        first_nonblank = m_state->line_contents.rem.first_not_of(" \t");
        if(first_nonblank != npos)
            _line_progressed(first_nonblank);
        doc_scalar = m_state->line_contents.rem;
        start_offset = m_state->pos.offset;
        _c4dbgpf("slurp 3 '{}'. REM='{}'", doc_scalar, m_buf.sub(m_state->pos.offset));
    }

    // quoted and block scalars have dedicated scanners
    if(doc_scalar.begins_with('\''))
    {
        m_state->scalar_col = m_state->line_contents.current_col(doc_scalar);
        return _scan_squot_scalar();
    }
    if(doc_scalar.begins_with('"'))
    {
        m_state->scalar_col = m_state->line_contents.current_col(doc_scalar);
        return _scan_dquot_scalar();
    }
    if(doc_scalar.begins_with('|') || doc_scalar.begins_with('>'))
        return _scan_block();

    _c4dbgpf("slurp 4 '{}'. REM='{}'", doc_scalar, m_buf.sub(m_state->pos.offset));

    // plain scalar: consume up to the end of what was taken from the line
    m_state->scalar_col = m_state->line_contents.current_col(doc_scalar);
    _RYML_CB_ASSERT(m_stack.m_callbacks, doc_scalar.end() >= m_buf.begin() + start_offset);
    _line_progressed(static_cast<size_t>(doc_scalar.end() - (m_buf.begin() + start_offset)));

    _c4dbgpf("slurp 5 '{}'. REM='{}'", doc_scalar, m_buf.sub(m_state->pos.offset));

    if(_at_line_end())
    {
        _c4dbgpf("at line end. curr='{}'", doc_scalar);
        doc_scalar = _extend_scanned_scalar(doc_scalar);
    }

    _c4dbgpf("scalar was '{}'", doc_scalar);

    return doc_scalar;
}
2315
2316
2317
//-----------------------------------------------------------------------------
2318
2319
/** Scan a scalar while reading a value of a block sequence.
 *
 * @param scalar [out] receives the scanned scalar when true is returned
 * @param quoted [out] set to true when the scalar came from the
 *        single-quoted, double-quoted or block scanners
 * @return false when no scalar starts at the current position */
bool Parser::_scan_scalar_seq_blck(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RSEQ));
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RVAL));
    _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(RKEY));
    _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(FLOW));

    csubstr tok = m_state->line_contents.rem;
    if(tok.len == 0)
        return false;
    tok = tok.trim(" \t");
    if(tok.len == 0)
        return false;

    // every branch below returns, so these are written as independent guards
    if(tok.begins_with('\''))
    {
        _c4dbgp("got a ': scanning single-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(tok);
        *scalar = _scan_squot_scalar();
        *quoted = true;
        return true;
    }
    if(tok.begins_with('"'))
    {
        _c4dbgp("got a \": scanning double-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(tok);
        *scalar = _scan_dquot_scalar();
        *quoted = true;
        return true;
    }
    if(tok.begins_with('|') || tok.begins_with('>'))
    {
        *scalar = _scan_block();
        *quoted = true;
        return true;
    }
    if(has_any(RTOP) && _is_doc_sep(tok))
        return false;

    _c4dbgp("RSEQ|RVAL");
    if( ! _is_scalar_next__rseq_rval(tok))
        return false;
    _RYML_WITH_TAB_TOKENS(if(tok.begins_with("-\t"))
        return false;
    )

    // cut the plain scalar at a trailing ':' or at the first key/comment token
    if( ! tok.ends_with(':'))
    {
        auto stop = tok.first_of_any(": " _RYML_WITH_TAB_TOKENS( , ":\t"), " #");
        if(stop)
            tok.len = stop.pos;
    }
    else
    {
        --tok.len;
    }
    tok = tok.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));

    if(tok.empty())
        return false;

    m_state->scalar_col = m_state->line_contents.current_col(tok);
    _RYML_CB_ASSERT(m_stack.m_callbacks, tok.str >= m_state->line_contents.rem.str);
    _line_progressed(static_cast<size_t>(tok.str - m_state->line_contents.rem.str) + tok.len);

    if(_at_line_end() && tok != '~')
    {
        _c4dbgpf("at line end. curr='{}'", tok);
        tok = _extend_scanned_scalar(tok);
    }

    _c4dbgpf("scalar was '{}'", tok);

    *scalar = tok;
    *quoted = false;
    return true;
}
2398
2399
/** Scan a scalar (key or value) while reading a block map.
 *
 * @param scalar [out] receives the scanned scalar when true is returned
 * @param quoted [out] set to true when the scalar came from the
 *        single-quoted, double-quoted or block scanners
 * @return false when no scalar starts at the current position */
bool Parser::_scan_scalar_map_blck(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
{
    _c4dbgp("_scan_scalar_map_blck");
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP));
    _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(FLOW));
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY|RVAL));

    csubstr tok = m_state->line_contents.rem;
    #ifdef RYML_NO_COVERAGE__TO_BE_DELETED__OR_REFACTORED
    if(tok.len == 0)
        return false;
    #endif
    tok = tok.trim(" \t");
    if(tok.len == 0)
        return false;

    // every branch below returns, so these are written as independent guards
    if(tok.begins_with('\''))
    {
        _c4dbgp("got a ': scanning single-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(tok);
        *scalar = _scan_squot_scalar();
        *quoted = true;
        return true;
    }
    if(tok.begins_with('"'))
    {
        _c4dbgp("got a \": scanning double-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(tok);
        *scalar = _scan_dquot_scalar();
        *quoted = true;
        return true;
    }
    if(tok.begins_with('|') || tok.begins_with('>'))
    {
        *scalar = _scan_block();
        *quoted = true;
        return true;
    }
    if(has_any(RTOP) && _is_doc_sep(tok))
        return false;

    if( ! _is_scalar_next__rmap(tok))
        return false;

    // position of the key/value separator, or npos when there is none
    size_t colon_token = tok.find(": ");
    if(colon_token == npos)
    {
        _RYML_WITH_OR_WITHOUT_TAB_TOKENS(
            // with tab tokens
            colon_token = tok.find(":\t");
            if(colon_token == npos)
            {
                _RYML_CB_ASSERT(m_stack.m_callbacks, tok.len > 0);
                colon_token = tok.find(':');
                if(colon_token != tok.len-1)
                    colon_token = npos;
            }
            ,
            // without tab tokens
            colon_token = tok.find(':');
            _RYML_CB_ASSERT(m_stack.m_callbacks, tok.len > 0);
            if(colon_token != tok.len-1)
                colon_token = npos;
        )
    }

    if(has_all(RKEY))
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, !tok.begins_with(' '));
        if(has_any(QMRK))
        {
            _c4dbgp("RMAP|RKEY|CPLX");
            _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP));
            if(tok.begins_with("? ") || tok == '?')
                return false;
            tok = tok.left_of(colon_token);
            tok = tok.left_of(tok.first_of("#"));
            tok = tok.trimr(" \t");
        }
        else
        {
            _c4dbgp("RMAP|RKEY");
            _RYML_CB_CHECK(m_stack.m_callbacks, !tok.begins_with('{'));
            if(tok.begins_with("? ") || tok == '?')
                return false;
            tok = tok.left_of(colon_token);
            tok = tok.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
        }
        // a document marker is never a key (applies to both key flavours)
        if(tok.begins_with("---") || tok.begins_with("..."))
            return false;
    }
    else if(has_all(RVAL))
    {
        _c4dbgp("RMAP|RVAL");
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(QMRK));
        if( ! _is_scalar_next__rmap_val(tok))
            return false;
        _RYML_WITH_TAB_TOKENS(
        else if(tok.begins_with("-\t"))
            return false;
        )
        _c4dbgp("RMAP|RVAL: scalar");
        tok = tok.left_of(tok.find(" #")); // drop a space-introduced comment
        tok = tok.left_of(tok.find("\t#")); // drop a tab-introduced comment
        tok = tok.trim(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
        if(tok.begins_with("---"))
            return false;
        #ifdef RYML_NO_COVERAGE__TO_BE_DELETED__OR_REFACTORED
        else if(tok.begins_with("..."))
            return false;
        #endif
    }

    if(tok.empty())
        return false;

    m_state->scalar_col = m_state->line_contents.current_col(tok);
    _RYML_CB_ASSERT(m_stack.m_callbacks, tok.str >= m_state->line_contents.rem.str);
    _line_progressed(static_cast<size_t>(tok.str - m_state->line_contents.rem.str) + tok.len);

    if(_at_line_end() && tok != '~')
    {
        _c4dbgpf("at line end. curr='{}'", tok);
        tok = _extend_scanned_scalar(tok);
    }

    _c4dbgpf("scalar was '{}'", tok);

    *scalar = tok;
    *quoted = false;
    return true;
}
2543
2544
/** Scan a scalar while reading a value of a flow sequence.
 *
 * @param scalar [out] receives the scanned scalar when true is returned
 * @param quoted [out] set to true when the scalar came from the
 *        single-quoted or double-quoted scanners
 * @return false when no scalar starts at the current position */
bool Parser::_scan_scalar_seq_flow(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RSEQ));
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(FLOW));
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RVAL));
    _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(RKEY));

    csubstr tok = m_state->line_contents.rem;
    if(tok.len == 0)
        return false;
    tok = tok.trim(" \t");
    if(tok.len == 0)
        return false;

    if(tok.begins_with('\''))
    {
        _c4dbgp("got a ': scanning single-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(tok);
        *scalar = _scan_squot_scalar();
        *quoted = true;
        return true;
    }
    if(tok.begins_with('"'))
    {
        _c4dbgp("got a \": scanning double-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(tok);
        *scalar = _scan_dquot_scalar();
        *quoted = true;
        return true;
    }

    if(has_all(RVAL))
    {
        _c4dbgp("RSEQ|RVAL");
        if( ! _is_scalar_next__rseq_rval(tok))
            return false;
        _RYML_WITH_TAB_TOKENS(if(tok.begins_with("-\t"))
            return false;
        )
        _c4dbgp("RSEQ|RVAL|FLOW");
        // the element ends at the next separator or at the closing bracket
        tok = tok.left_of(tok.first_of(",]"));
        if( ! tok.ends_with(':'))
        {
            auto stop = tok.first_of_any(": " _RYML_WITH_TAB_TOKENS( , ":\t"), " #");
            if(stop)
                tok.len = stop.pos;
        }
        else
        {
            --tok.len;
        }
        tok = tok.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
    }

    if(tok.empty())
        return false;

    m_state->scalar_col = m_state->line_contents.current_col(tok);
    _RYML_CB_ASSERT(m_stack.m_callbacks, tok.str >= m_state->line_contents.rem.str);
    _line_progressed(static_cast<size_t>(tok.str - m_state->line_contents.rem.str) + tok.len);

    if(_at_line_end() && tok != '~')
    {
        _c4dbgpf("at line end. curr='{}'", tok);
        tok = _extend_scanned_scalar(tok);
    }

    _c4dbgpf("scalar was '{}'", tok);

    *scalar = tok;
    *quoted = false;
    return true;
}
2617
2618
/** Scan a scalar (key or value) while reading a flow map.
 *
 * @param scalar [out] receives the scanned scalar when true is returned
 * @param quoted [out] set to true when the scalar came from the
 *        single-quoted or double-quoted scanners
 * @return false when no scalar starts at the current position */
bool Parser::_scan_scalar_map_flow(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP));
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(FLOW));
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RKEY|RVAL));

    csubstr tok = m_state->line_contents.rem;
    if(tok.len == 0)
        return false;
    tok = tok.trim(" \t");
    if(tok.len == 0)
        return false;

    if(tok.begins_with('\''))
    {
        _c4dbgp("got a ': scanning single-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(tok);
        *scalar = _scan_squot_scalar();
        *quoted = true;
        return true;
    }
    if(tok.begins_with('"'))
    {
        _c4dbgp("got a \": scanning double-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(tok);
        *scalar = _scan_dquot_scalar();
        *quoted = true;
        return true;
    }

    if( ! _is_scalar_next__rmap(tok))
        return false;

    if(has_all(RKEY))
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, !tok.begins_with(' '));
        // position of the key/value separator, or npos when there is none
        size_t colon_token = tok.find(": ");
        if(colon_token == npos)
        {
            _RYML_WITH_OR_WITHOUT_TAB_TOKENS(
                // with tab tokens
                colon_token = tok.find(":\t");
                if(colon_token == npos)
                {
                    _RYML_CB_ASSERT(m_stack.m_callbacks, tok.len > 0);
                    colon_token = tok.find(':');
                    if(colon_token != tok.len-1)
                        colon_token = npos;
                }
                ,
                // without tab tokens
                colon_token = tok.find(':');
                _RYML_CB_ASSERT(m_stack.m_callbacks, tok.len > 0);
                if(colon_token != tok.len-1)
                    colon_token = npos;
            )
        }
        if(tok.begins_with("? ") || tok == '?')
            return false;
        if(has_any(QMRK))
        {
            _c4dbgp("RMAP|RKEY|CPLX");
            _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RMAP));
            tok = tok.left_of(colon_token);
            tok = tok.left_of(tok.first_of("#"));
            tok = tok.left_of(tok.first_of(':'));
            tok = tok.trimr(" \t");
            if(tok.begins_with("---") || tok.begins_with("..."))
                return false;
        }
        else
        {
            _RYML_CB_CHECK(m_stack.m_callbacks, !tok.begins_with('{'));
            _c4dbgp("RMAP|RKEY");
            tok = tok.left_of(colon_token);
            tok = tok.trimr(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
            _c4dbgpf("RMAP|RKEY|FLOW: '{}'", tok);
            tok = tok.left_of(tok.first_of(",}"));
            if(tok.ends_with(':'))
                --tok.len;
        }
    }
    else if(has_all(RVAL))
    {
        _c4dbgp("RMAP|RVAL");
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_none(QMRK));
        if( ! _is_scalar_next__rmap_val(tok))
            return false;
        _RYML_WITH_TAB_TOKENS(if(tok.begins_with("-\t"))
            return false;
        )
        _c4dbgp("RMAP|RVAL|FLOW");
        // with RSEQIMAP set the value is closed by ']' instead of '}'
        const csubstr terminators = has_none(RSEQIMAP) ? csubstr(",}") : csubstr(",]");
        tok = tok.left_of(tok.first_of(terminators));
        tok = tok.left_of(tok.find(" #")); // drop a space-introduced comment
        tok = tok.left_of(tok.find("\t#")); // drop a tab-introduced comment
        tok = tok.trim(_RYML_WITH_OR_WITHOUT_TAB_TOKENS(" \t", ' '));
    }

    if(tok.empty())
        return false;

    m_state->scalar_col = m_state->line_contents.current_col(tok);
    _RYML_CB_ASSERT(m_stack.m_callbacks, tok.str >= m_state->line_contents.rem.str);
    _line_progressed(static_cast<size_t>(tok.str - m_state->line_contents.rem.str) + tok.len);

    if(_at_line_end() && tok != '~')
    {
        _c4dbgpf("at line end. curr='{}'", tok);
        tok = _extend_scanned_scalar(tok);
    }

    _c4dbgpf("scalar was '{}'", tok);

    *scalar = tok;
    *quoted = false;
    return true;
}
2740
2741
/** Scan a scalar while the RUNK flag is set.
 *
 * @param scalar [out] receives the scanned scalar when true is returned
 * @param quoted [out] set to true when the scalar came from the
 *        single-quoted, double-quoted or block scanners
 * @return false when no scalar starts at the current position */
bool Parser::_scan_scalar_unk(csubstr *C4_RESTRICT scalar, bool *C4_RESTRICT quoted)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, has_any(RUNK));

    csubstr tok = m_state->line_contents.rem;
    if(tok.len == 0)
        return false;
    tok = tok.trim(" \t");
    if(tok.len == 0)
        return false;

    // every branch below returns, so these are written as independent guards
    if(tok.begins_with('\''))
    {
        _c4dbgp("got a ': scanning single-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(tok);
        *scalar = _scan_squot_scalar();
        *quoted = true;
        return true;
    }
    if(tok.begins_with('"'))
    {
        _c4dbgp("got a \": scanning double-quoted scalar");
        m_state->scalar_col = m_state->line_contents.current_col(tok);
        *scalar = _scan_dquot_scalar();
        *quoted = true;
        return true;
    }
    if(tok.begins_with('|') || tok.begins_with('>'))
    {
        *scalar = _scan_block();
        *quoted = true;
        return true;
    }
    if(has_any(RTOP) && _is_doc_sep(tok))
        return false;

    _c4dbgpf("RUNK '[{}]~~~{}~~~", tok.len, tok);
    if( ! _is_scalar_next__runk(tok))
    {
        _c4dbgp("RUNK: no scalar next");
        return false;
    }

    // first drop a trailing comment...
    const size_t comment_pos = tok.find(" #");
    if(comment_pos != npos)
    {
        _c4dbgpf("RUNK: found ' #' at {}", comment_pos);
        tok = tok.left_of(comment_pos);
    }
    // ...then cut at the key separator, or failing that at the first comma
    size_t pos = tok.find(": ");
    if(pos != npos)
    {
        _c4dbgpf("RUNK: found ': ' at {}", pos);
        tok = tok.left_of(pos);
    }
    else if(tok.ends_with(':'))
    {
        _c4dbgp("RUNK: ends with ':'");
        tok = tok.left_of(tok.len-1);
    }
    _RYML_WITH_TAB_TOKENS(
    else if((pos = tok.find(":\t")) != npos) // TABS
    {
        _c4dbgp("RUNK: ends with ':\\t'");
        tok = tok.left_of(pos);
    })
    else
    {
        _c4dbgp("RUNK: trimming left of ,");
        tok = tok.left_of(tok.first_of(','));
    }
    tok = tok.trim(" \t");
    _c4dbgpf("RUNK: scalar=[{}]~~~{}~~~", tok.len, tok);

    if(tok.empty())
        return false;

    m_state->scalar_col = m_state->line_contents.current_col(tok);
    _RYML_CB_ASSERT(m_stack.m_callbacks, tok.str >= m_state->line_contents.rem.str);
    _line_progressed(static_cast<size_t>(tok.str - m_state->line_contents.rem.str) + tok.len);

    if(_at_line_end() && tok != '~')
    {
        _c4dbgpf("at line end. curr=[{}]~~~{}~~", tok.len, tok);
        tok = _extend_scanned_scalar(tok);
    }

    _c4dbgpf("scalar was [{}]~~~{}~~~", tok.len, tok);

    *scalar = tok;
    *quoted = false;
    return true;
}
2835
2836
2837
//-----------------------------------------------------------------------------
2838
2839
/** Try to extend an already scanned scalar over the following lines.
 *
 * Three cases are handled: explicit keys (RMAP|RKEY|QMRK), plain
 * scalars in block context, and plain scalars in flow context. A
 * scalar starting with '*' is returned untouched unless it is an
 * explicit key.
 *
 * @param s the scalar scanned so far
 * @return the (possibly extended and filtered) scalar */
csubstr Parser::_extend_scanned_scalar(csubstr s)
{
    if(has_all(RMAP|RKEY|QMRK))
    {
        const size_t key_indentation = has_any(FLOW) ? 0 : m_state->scalar_col;
        _c4dbgpf("extend_scalar: explicit key! indref={} scalar_indentation={} scalar_col={}", m_state->indref, key_indentation, m_state->scalar_col);
        const csubstr next_line = _scan_to_next_nonempty_line(key_indentation);
        if(!next_line.empty())
        {
            substr full = _scan_complex_key(s, next_line).trimr(" \t\r\n");
            if(full != s)
                s = _filter_plain_scalar(full, key_indentation);
        }
        return s;
    }

    // cannot be a plain scalar if it starts with * (that's an anchor reference)
    if(s.begins_with_any("*"))
        return s;

    // deal with plain (unquoted) scalars that continue to the next line
    _c4dbgpf("extend_scalar: line ended, scalar='{}'", s);
    if(has_none(FLOW))
    {
        size_t min_indentation = m_state->indref + 1;
        if(has_all(RUNK) && min_indentation == 1)
            min_indentation = 0;
        const csubstr next_line = _scan_to_next_nonempty_line(min_indentation);
        if(!next_line.empty())
        {
            _c4dbgpf("rscalar[IMPL]: state_indref={} state_indentation={} scalar_indentation={}", m_state->indref, m_state->line_contents.indentation, min_indentation);
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.full.is_super(next_line));
            substr full = _scan_plain_scalar_blck(s, next_line, min_indentation);
            if(full.len >= s.len)
                s = _filter_plain_scalar(full, min_indentation);
        }
    }
    else
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, has_all(FLOW));
        const csubstr next_line = _scan_to_next_nonempty_line(/*indentation*/0);
        if(!next_line.empty())
        {
            _c4dbgp("rscalar[FLOW]");
            substr full = _scan_plain_scalar_flow(s, next_line);
            s = _filter_plain_scalar(full, /*indentation*/0);
        }
    }

    return s;
}
2887
2888
2889
//-----------------------------------------------------------------------------
2890
2891
/** Extend a plain scalar in flow context across continuation lines.
 *
 * Lines are appended until one holds a flow token (one of `[]{}?#,`)
 * or, when reading a map key or in RUNK state, a colon token.
 *
 * @param currscalar the scalar scanned so far (must point into m_buf)
 * @param peeked_line the first candidate continuation line
 * @return the buffer range from the start of currscalar up to the
 *         current parse offset, right-trimmed of newlines and spaces */
substr Parser::_scan_plain_scalar_flow(csubstr currscalar, csubstr peeked_line)
{
    static constexpr const csubstr flow_tokens = "[]{}?#,";
    bool is_first_line = true;
    for(size_t pos = peeked_line.first_of(flow_tokens); pos != 0; pos = peeked_line.first_of(flow_tokens))
    {
        if(has_all(RMAP|RKEY) || has_any(RUNK))
        {
            const csubstr tpkl = peeked_line.triml(' ').trimr("\r\n");
            if(tpkl.begins_with(": ") || tpkl == ':')
            {
                _c4dbgpf("rscalar[FLOW]: map value starts on the peeked line: '{}'", peeked_line);
                peeked_line = peeked_line.first(0);
                break;
            }
            auto colon_pos = peeked_line.first_of_any(": ", ":");
            if(colon_pos && colon_pos.pos < pos)
            {
                peeked_line = peeked_line.first(colon_pos.pos);
                _c4dbgpf("rscalar[FLOW]: found colon at {}. peeked='{}'", colon_pos.pos, peeked_line);
                _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.end() >= m_state->line_contents.rem.begin());
                _line_progressed(static_cast<size_t>(peeked_line.end() - m_state->line_contents.rem.begin()));
                break;
            }
        }
        if(pos != npos)
        {
            _c4dbgpf("rscalar[FLOW]: found special character '{}' at {}, stopping: '{}'", peeked_line[pos], pos, peeked_line.left_of(pos).trimr("\r\n"));
            peeked_line = peeked_line.left_of(pos);
            _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.end() >= m_state->line_contents.rem.begin());
            _line_progressed(static_cast<size_t>(peeked_line.end() - m_state->line_contents.rem.begin()));
            break;
        }
        _c4dbgpf("rscalar[FLOW]: append another line, full: '{}'", peeked_line.trimr("\r\n"));
        // only lines peeked inside this loop need an explicit advance
        if(!is_first_line)
        {
            RYML_CHECK(_advance_to_peeked());
        }
        peeked_line = _scan_to_next_nonempty_line(/*indentation*/0);
        if(peeked_line.empty())
        {
            _c4err("expected token or continuation");
        }
        is_first_line = false;
    }
    // the pointer difference re-bases the const scalar pointer onto the mutable buffer
    substr full(m_buf.str + (currscalar.str - m_buf.str), m_buf.begin() + m_state->pos.offset);
    return full.trimr("\n\r ");
}
2945
2946
2947
//-----------------------------------------------------------------------------
2948
2949
/** Extend a plain scalar in block context across continuation lines.
 *
 * Scanning stops at a document marker, at a non-blank line with less
 * than the required indentation, at a " #" comment, or at the end of
 * the file. A line containing ": " or ending with ':' is reported
 * through _c4err().
 *
 * @param currscalar the scalar scanned so far (must point into m_buf)
 * @param peeked_line the first candidate continuation line
 * @param indentation number of leading spaces a continuation line needs
 * @return the buffer range covering the extended scalar, right-trimmed
 *         of newlines and spaces */
substr Parser::_scan_plain_scalar_blck(csubstr currscalar, csubstr peeked_line, size_t indentation)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(currscalar));
    // NOTE. there's a problem with _scan_to_next_nonempty_line(), as it counts newlines twice,
    // so instead of using m_state->pos.offset the length is measured from the end of the given scalar
    _RYML_CB_ASSERT(m_stack.m_callbacks, currscalar.end() >= m_buf.begin());
    const size_t offs = static_cast<size_t>(currscalar.end() - m_buf.begin());
    _RYML_CB_ASSERT(m_stack.m_callbacks, peeked_line.begins_with(' ', indentation));
    for(;;)
    {
        _c4dbgpf("rscalar[IMPL]: continuing... ref_indentation={}", indentation);
        if(peeked_line.begins_with("...") || peeked_line.begins_with("---"))
        {
            _c4dbgpf("rscalar[IMPL]: document termination next -- bail now '{}'", peeked_line.trimr("\r\n"));
            break;
        }
        if( ! peeked_line.begins_with(' ', indentation)) // is the line deindented?
        {
            if(!peeked_line.trim(" \r\n\t").empty()) // is the line not blank?
            {
                _c4dbgpf("rscalar[IMPL]: deindented line, not blank -- bail now '{}'", peeked_line.trimr("\r\n"));
                break;
            }
            _c4dbgpf("rscalar[IMPL]: line is blank and has less indentation: ref={} line={}: '{}'", indentation, peeked_line.first_not_of(' ') == csubstr::npos ? 0 : peeked_line.first_not_of(' '), peeked_line.trimr("\r\n"));
            _c4dbgpf("rscalar[IMPL]: ... searching for a line starting at indentation {}", indentation);
            const csubstr next_peeked = _scan_to_next_nonempty_line(indentation);
            if(next_peeked.empty())
            {
                _c4dbgp("rscalar[IMPL]: ... finished.");
                break;
            }
            _c4dbgp("rscalar[IMPL]: ... continuing.");
            peeked_line = next_peeked;
        }

        _c4dbgpf("rscalar[IMPL]: line contents: '{}'", peeked_line.right_of(indentation, true).trimr("\r\n"));
        const size_t colon_space = peeked_line.find(": ");
        if(colon_space != npos)
        {
            _line_progressed(colon_space);
            _c4err("': ' is not a valid token in plain flow (unquoted) scalars");
        }
        else if(peeked_line.ends_with(':'))
        {
            _line_progressed(peeked_line.find(':'));
            _c4err("lines cannot end with ':' in plain flow (unquoted) scalars");
        }
        else
        {
            // a comment ends the scalar without being an error
            const size_t comment_pos = peeked_line.find(" #");
            if(comment_pos != npos)
            {
                _line_progressed(comment_pos);
                break;
            }
        }

        _c4dbgpf("rscalar[IMPL]: append another line: (len={})'{}'", peeked_line.len, peeked_line.trimr("\r\n"));
        if(!_advance_to_peeked())
        {
            _c4dbgp("rscalar[IMPL]: file finishes after the scalar");
            break;
        }
        peeked_line = m_state->line_contents.rem;
    }
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= offs);
    // the pointer difference re-bases the const scalar pointer onto the mutable buffer
    substr full(m_buf.str + (currscalar.str - m_buf.str),
                currscalar.len + (m_state->pos.offset - offs));
    return full.trimr("\r\n ");
}
3017
3018
/** Extend an explicit key across the following lines.
 *
 * Scanning stops at a document marker, at a line holding one of
 * `?:[]{}` or "- ", at a line holding ": ", or at the end of the file.
 *
 * @param currscalar the key scanned so far (must point into m_buf)
 * @param peeked_line the first candidate continuation line
 * @return the buffer range covering the extended key (not trimmed) */
substr Parser::_scan_complex_key(csubstr currscalar, csubstr peeked_line)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(currscalar));
    // NOTE. there's a problem with _scan_to_next_nonempty_line(), as it counts newlines twice,
    // so instead of using m_state->pos.offset the length is measured from the end of the given scalar
    _RYML_CB_ASSERT(m_stack.m_callbacks, currscalar.end() >= m_buf.begin());
    const size_t offs = static_cast<size_t>(currscalar.end() - m_buf.begin());
    for(;;)
    {
        _c4dbgp("rcplxkey: continuing...");
        if(peeked_line.begins_with("...") || peeked_line.begins_with("---"))
        {
            _c4dbgpf("rcplxkey: document termination next -- bail now '{}'", peeked_line.trimr("\r\n"));
            break;
        }
        size_t special_pos = peeked_line.first_of("?:[]{}");
        if(special_pos == csubstr::npos)
            special_pos = peeked_line.find("- ");
        if(special_pos != csubstr::npos)
        {
            _c4dbgpf("rcplxkey: found special characters at pos={}: '{}'", special_pos, peeked_line.trimr("\r\n"));
            _line_progressed(special_pos);
            break;
        }

        _c4dbgpf("rcplxkey: no special chars found '{}'", peeked_line.trimr("\r\n"));
        const csubstr next_peeked = _scan_to_next_nonempty_line(0);
        if(next_peeked.empty())
        {
            _c4dbgp("rcplxkey: empty ... finished.");
            break;
        }
        _c4dbgp("rcplxkey: ... continuing.");
        peeked_line = next_peeked;

        _c4dbgpf("rcplxkey: line contents: '{}'", peeked_line.trimr("\r\n"));
        size_t colpos;
        if((colpos = peeked_line.find(": ")) != npos)
        {
            _c4dbgp("rcplxkey: found ': ', stopping.");
            _line_progressed(colpos);
            break;
        }
        // NOTE(review): in the disabled branch below colpos receives the result
        // of ends_with() rather than a position -- confirm before re-enabling.
        #ifdef RYML_NO_COVERAGE__TO_BE_DELETED
        else if((colpos = peeked_line.ends_with(':')))
        {
            _c4dbgp("rcplxkey: ends with ':', stopping.");
            _line_progressed(colpos);
            break;
        }
        #endif
        _c4dbgpf("rcplxkey: append another line: (len={})'{}'", peeked_line.len, peeked_line.trimr("\r\n"));
        if(!_advance_to_peeked())
        {
            _c4dbgp("rcplxkey: file finishes after the scalar");
            break;
        }
        peeked_line = m_state->line_contents.rem;
    }
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= offs);
    // the pointer difference re-bases the const scalar pointer onto the mutable buffer
    return substr(m_buf.str + (currscalar.str - m_buf.str),
                  currscalar.len + (m_state->pos.offset - offs));
}
3087
3088
//! scans to the next non-blank line starting with the given indentation
3089
csubstr Parser::_scan_to_next_nonempty_line(size_t indentation)
3090
{
3091
csubstr next_peeked;
3092
while(true)
3093
{
3094
_c4dbgpf("rscalar: ... curr offset: {} indentation={}", m_state->pos.offset, indentation);
3095
next_peeked = _peek_next_line(m_state->pos.offset);
3096
csubstr next_peeked_triml = next_peeked.triml(' ');
3097
_c4dbgpf("rscalar: ... next peeked line='{}'", next_peeked.trimr("\r\n"));
3098
if(next_peeked_triml.begins_with('#'))
3099
{
3100
_c4dbgp("rscalar: ... first non-space character is #");
3101
return {};
3102
}
3103
else if(next_peeked.begins_with(' ', indentation))
3104
{
3105
_c4dbgpf("rscalar: ... begins at same indentation {}, assuming continuation", indentation);
3106
_advance_to_peeked();
3107
return next_peeked;
3108
}
3109
else // check for de-indentation
3110
{
3111
csubstr trimmed = next_peeked_triml.trimr("\t\r\n");
3112
_c4dbgpf("rscalar: ... deindented! trimmed='{}'", trimmed);
3113
if(!trimmed.empty())
3114
{
3115
_c4dbgp("rscalar: ... and not empty. bailing out.");
3116
return {};
3117
}
3118
}
3119
if(!_advance_to_peeked())
3120
{
3121
_c4dbgp("rscalar: file finished");
3122
return {};
3123
}
3124
}
3125
return {};
3126
}
3127
3128
// returns false when the file finished
3129
bool Parser::_advance_to_peeked()
3130
{
3131
_line_progressed(m_state->line_contents.rem.len);
3132
_line_ended(); // advances to the peeked-at line, consuming all remaining (probably newline) characters on the current line
3133
_RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.first_of("\r\n") == csubstr::npos);
3134
_c4dbgpf("advance to peeked: scan more... pos={} len={}", m_state->pos.offset, m_buf.len);
3135
_scan_line(); // puts the peeked-at line in the buffer
3136
if(_finished_file())
3137
{
3138
_c4dbgp("rscalar: finished file!");
3139
return false;
3140
}
3141
return true;
3142
}
3143
3144
//-----------------------------------------------------------------------------
3145
3146
//! @return 1 when @p nl followed by @p following forms a two-character
//! line ending ("\n\r" or "\r\n"), 0 otherwise
C4_ALWAYS_INLINE size_t _extend_from_combined_newline(char nl, char following)
{
    const bool is_lf_cr = (nl == '\n') && (following == '\r');
    const bool is_cr_lf = (nl == '\r') && (following == '\n');
    return (is_lf_cr || is_cr_lf) ? 1u : 0u;
}
3150
3151
//! look for the next newline chars, and jump to the right of those
3152
csubstr from_next_line(csubstr rem)
3153
{
3154
size_t nlpos = rem.first_of("\r\n");
3155
if(nlpos == csubstr::npos)
3156
return {};
3157
const char nl = rem[nlpos];
3158
rem = rem.right_of(nlpos);
3159
if(rem.empty())
3160
return {};
3161
if(_extend_from_combined_newline(nl, rem.front()))
3162
rem = rem.sub(1);
3163
return rem;
3164
}
3165
3166
/** Look at the line following the one containing @p pos, without
 * changing the parser state.
 *
 * @param pos offset into m_buf; npos means the current parse offset
 * @return the following line including its newline characters, or an
 *         empty string when there is none */
csubstr Parser::_peek_next_line(size_t pos) const
{
    if(pos == npos)
        pos = m_state->pos.offset;

    // look for the next newline chars, and jump to the right of those
    csubstr line{};
    if(pos < m_buf.len)
        line = from_next_line(m_buf.sub(pos));
    if(line.empty())
    {
        _c4dbgpf("peek next line @ {}: (len=0)''", pos);
        return {};
    }

    // now get everything up to and including the following newline chars
    size_t nlpos = line.first_of("\r\n");
    if((nlpos != csubstr::npos) && (nlpos + 1 < line.len))
        nlpos += _extend_from_combined_newline(line[nlpos], line[nlpos+1]);
    line = line.left_of(nlpos, /*include_pos*/true);

    _c4dbgpf("peek next line @ {}: (len={})'{}'", pos, line.len, line.trimr("\r\n"));
    return line;
}
3192
3193
3194
//-----------------------------------------------------------------------------
3195
/** Reset the line contents with the line of @p buf starting at @p offset.
 *
 * Two views are computed and handed to reset():
 *  - the stripped line: from @p offset up to (not including) the first
 *    '\n' or '\r';
 *  - the full line: the stripped line plus its line ending, which is an
 *    optional '\r' followed by an optional '\n'.
 *
 * @param buf the source buffer
 * @param offset where the line starts; must be <= buf.len. An offset
 *        equal to buf.len yields two empty views at the end of the
 *        buffer.
 *
 * @note this works with indices into buf.str instead of taking the
 *       address of buf[offset]: the precondition allows
 *       offset == buf.len, and indexing at that position would refer
 *       to an element one past the end of the buffer. */
void Parser::LineContents::reset_with_next_line(csubstr buf, size_t offset)
{
    RYML_ASSERT(offset <= buf.len);
    size_t e = offset;
    // get the current line stripped of newline chars
    while(e < buf.len && (buf.str[e] != '\n' && buf.str[e] != '\r'))
        ++e;
    RYML_ASSERT(e >= offset);
    const csubstr stripped_ = buf.sub(offset, e - offset);
    // advance pos to include the first line ending
    if(e < buf.len && buf.str[e] == '\r')
        ++e;
    if(e < buf.len && buf.str[e] == '\n')
        ++e;
    RYML_ASSERT(e >= offset);
    const csubstr full_ = buf.sub(offset, e - offset);
    reset(full_, stripped_);
}
3214
3215
void Parser::_scan_line()
3216
{
3217
if(m_state->pos.offset >= m_buf.len)
3218
{
3219
m_state->line_contents.reset(m_buf.last(0), m_buf.last(0));
3220
return;
3221
}
3222
m_state->line_contents.reset_with_next_line(m_buf, m_state->pos.offset);
3223
}
3224
3225
3226
//-----------------------------------------------------------------------------
3227
/** Advance the current position within the current line.
 *
 * Adds @p ahead to both the offset and the column, and drops the first
 * @p ahead characters from the remainder of the line.
 *
 * @param ahead number of characters to advance */
void Parser::_line_progressed(size_t ahead)
{
    auto &pos = m_state->pos;
    auto &line = m_state->line_contents;
    _c4dbgpf("line[{}] ({} cols) progressed by {}: col {}-->{} offset {}-->{}", pos.line, line.full.len, ahead, pos.col, pos.col+ahead, pos.offset, pos.offset+ahead);
    pos.offset += ahead;
    pos.col += ahead;
    // the column may reach at most one past the stripped line
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col <= m_state->line_contents.stripped.len+1);
    line.rem = line.rem.sub(ahead);
}
3235
3236
void Parser::_line_ended()
3237
{
3238
_c4dbgpf("line[{}] ({} cols) ended! offset {}-->{}", m_state->pos.line, m_state->line_contents.full.len, m_state->pos.offset, m_state->pos.offset+m_state->line_contents.full.len - m_state->line_contents.stripped.len);
3239
_RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col == m_state->line_contents.stripped.len+1);
3240
m_state->pos.offset += m_state->line_contents.full.len - m_state->line_contents.stripped.len;
3241
++m_state->pos.line;
3242
m_state->pos.col = 1;
3243
}
3244
3245
void Parser::_line_ended_undo()
3246
{
3247
_RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.col == 1u);
3248
_RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.line > 0u);
3249
_RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= m_state->line_contents.full.len - m_state->line_contents.stripped.len);
3250
size_t delta = m_state->line_contents.full.len - m_state->line_contents.stripped.len;
3251
_c4dbgpf("line[{}] undo ended! line {}-->{}, offset {}-->{}", m_state->pos.line, m_state->pos.line, m_state->pos.line - 1, m_state->pos.offset, m_state->pos.offset - delta);
3252
m_state->pos.offset -= delta;
3253
--m_state->pos.line;
3254
m_state->pos.col = m_state->line_contents.stripped.len + 1u;
3255
// don't forget to undo also the changes to the remainder of the line
3256
_RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.offset >= m_buf.len || m_buf[m_state->pos.offset] == '\n' || m_buf[m_state->pos.offset] == '\r');
3257
m_state->line_contents.rem = m_buf.sub(m_state->pos.offset, 0);
3258
}
3259
3260
3261
//-----------------------------------------------------------------------------
3262
/** Store @p indentation as the reference indentation (indref) of the
 * current state.
 * @param indentation the value to store */
void Parser::_set_indentation(size_t indentation)
{
    auto &state = *m_state;
    state.indref = indentation;
    _c4dbgpf("state[{}]: saving indentation: {}", m_state-m_stack.begin(), state.indref);
}
3267
3268
/** Set the reference indentation (indref) of the current state from the
 * current position in the line.
 *
 * The indentation is the distance between the start of the full line
 * and the start of the remainder of the line, minus @p behind.
 *
 * @param behind number of columns to subtract; must not exceed the
 *        distance described above */
void Parser::_save_indentation(size_t behind)
{
    auto const& line = m_state->line_contents;
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->line_contents.rem.begin() >= m_state->line_contents.full.begin());
    // how far into the line the remainder starts
    m_state->indref = static_cast<size_t>(line.rem.begin() - line.full.begin());
    _RYML_CB_ASSERT(m_stack.m_callbacks, behind <= m_state->indref);
    m_state->indref -= behind;
    _c4dbgpf("state[{}]: saving indentation: {}", m_state-m_stack.begin(), m_state->indref);
}
3276
3277
bool Parser::_maybe_set_indentation_from_anchor_or_tag()
3278
{
3279
if(m_key_anchor.not_empty())
3280
{
3281
_c4dbgpf("set indentation from key anchor: {}", m_key_anchor_indentation);
3282
_set_indentation(m_key_anchor_indentation); // this is the column where the anchor starts
3283
return true;
3284
}
3285
else if(m_key_tag.not_empty())
3286
{
3287
_c4dbgpf("set indentation from key tag: {}", m_key_tag_indentation);
3288
_set_indentation(m_key_tag_indentation); // this is the column where the tag starts
3289
return true;
3290
}
3291
return false;
3292
}
3293
3294
3295
//-----------------------------------------------------------------------------
3296
/** Write the pending key anchor to a node, or mark the node key as a
 * reference.
 *
 * - When a key anchor is pending, it is set on the node and the pending
 *   key anchor state is cleared.
 * - Otherwise, when the key is not quoted:
 *   - a key starting with '*' is set as a key reference (without the
 *     leading '*');
 *   - a key equal to "<<" is set as a key reference, and the node val
 *     (or, for a sequence node, the val of every child) is required to
 *     start with '*'; an error is raised otherwise.
 *
 * @param node_id the node to write to; must have a key */
void Parser::_write_key_anchor(size_t node_id)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->has_key(node_id));
    if( ! m_key_anchor.empty())
    {
        _c4dbgpf("node={}: set key anchor to '{}'", node_id, m_key_anchor);
        m_tree->set_key_anchor(node_id, m_key_anchor);
        m_key_anchor.clear();
        m_key_anchor_was_before = false;
        m_key_anchor_indentation = 0;
        return;
    }
    if(m_tree->is_key_quoted(node_id))
        return; // quoted keys are never references

    const csubstr key = m_tree->key(node_id);
    if(key.begins_with('*'))
    {
        _c4dbgpf("node={}: set key reference: '{}'", node_id, key);
        m_tree->set_key_ref(node_id, key.sub(1));
        return;
    }
    if( ! (key == "<<"))
        return;

    m_tree->set_key_ref(node_id, key);
    _c4dbgpf("node={}: it's an inheriting reference", node_id);
    if(m_tree->is_seq(node_id))
    {
        _c4dbgpf("node={}: inheriting from seq of {}", node_id, m_tree->num_children(node_id));
        // every entry of the sequence must be a reference
        for(size_t child = m_tree->first_child(node_id); child != NONE; child = m_tree->next_sibling(child))
        {
            if( ! (m_tree->val(child).begins_with('*')))
                _c4err("malformed reference: '{}'", m_tree->val(child));
        }
    }
    else if( ! m_tree->val(node_id).begins_with('*'))
    {
        _c4err("malformed reference: '{}'", m_tree->val(node_id));
    }
}
3336
3337
//-----------------------------------------------------------------------------
3338
/** Write the pending val anchor to a node, and mark the node val as a
 * reference when it is unquoted and starts with '*'.
 *
 * A node cannot have both a val anchor and a val reference: this is
 * checked before the reference is set.
 *
 * @param node_id the node to write to */
void Parser::_write_val_anchor(size_t node_id)
{
    if( ! m_val_anchor.empty())
    {
        _c4dbgpf("node={}: set val anchor to '{}'", node_id, m_val_anchor);
        m_tree->set_val_anchor(node_id, m_val_anchor);
        m_val_anchor.clear();
    }
    csubstr val = "";
    if(m_tree->has_val(node_id))
        val = m_tree->val(node_id);
    const bool is_reference = !m_tree->is_val_quoted(node_id) && val.begins_with('*');
    if( ! is_reference)
        return;
    _c4dbgpf("node={}: set val reference: '{}'", node_id, val);
    RYML_CHECK(!m_tree->has_val_anchor(node_id));
    m_tree->set_val_ref(node_id, val.sub(1)); // without the leading '*'
}
3354
3355
//-----------------------------------------------------------------------------
3356
void Parser::_push_level(bool explicit_flow_chars)
3357
{
3358
_c4dbgpf("pushing level! currnode={} currlevel={} stacksize={} stackcap={}", m_state->node_id, m_state->level, m_stack.size(), m_stack.capacity());
3359
_RYML_CB_ASSERT(m_stack.m_callbacks, m_state == &m_stack.top());
3360
if(node(m_state) == nullptr)
3361
{
3362
_c4dbgp("pushing level! actually no, current node is null");
3363
//_RYML_CB_ASSERT(m_stack.m_callbacks, ! explicit_flow_chars);
3364
return;
3365
}
3366
flag_t st = RUNK;
3367
if(explicit_flow_chars || has_all(FLOW))
3368
{
3369
st |= FLOW;
3370
}
3371
m_stack.push_top();
3372
m_state = &m_stack.top();
3373
set_flags(st);
3374
m_state->node_id = (size_t)NONE;
3375
m_state->indref = (size_t)NONE;
3376
++m_state->level;
3377
_c4dbgpf("pushing level: now, currlevel={}", m_state->level);
3378
}
3379
3380
void Parser::_pop_level()
3381
{
3382
_c4dbgpf("popping level! currnode={} currlevel={}", m_state->node_id, m_state->level);
3383
if(has_any(RMAP) || m_tree->is_map(m_state->node_id))
3384
{
3385
_stop_map();
3386
}
3387
if(has_any(RSEQ) || m_tree->is_seq(m_state->node_id))
3388
{
3389
_stop_seq();
3390
}
3391
if(m_tree->is_doc(m_state->node_id))
3392
{
3393
_stop_doc();
3394
}
3395
_RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.size() > 1);
3396
_prepare_pop();
3397
m_stack.pop();
3398
m_state = &m_stack.top();
3399
/*if(has_any(RMAP))
3400
{
3401
_toggle_key_val();
3402
}*/
3403
if(m_state->line_contents.indentation == 0)
3404
{
3405
//_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(RTOP));
3406
add_flags(RTOP);
3407
}
3408
_c4dbgpf("popping level: now, currnode={} currlevel={}", m_state->node_id, m_state->level);
3409
}
3410
3411
//-----------------------------------------------------------------------------
3412
void Parser::_start_unk(bool /*as_child*/)
3413
{
3414
_c4dbgp("start_unk");
3415
_push_level();
3416
_move_scalar_from_top();
3417
}
3418
3419
//-----------------------------------------------------------------------------
3420
void Parser::_start_doc(bool as_child)
3421
{
3422
_c4dbgpf("start_doc (as child={})", as_child);
3423
_RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id));
3424
size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id;
3425
_RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE);
3426
_RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_root(parent_id));
3427
_RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id));
3428
if(as_child)
3429
{
3430
_c4dbgpf("start_doc: parent={}", parent_id);
3431
if( ! m_tree->is_stream(parent_id))
3432
{
3433
_c4dbgp("start_doc: rearranging with root as STREAM");
3434
m_tree->set_root_as_stream();
3435
}
3436
m_state->node_id = m_tree->append_child(parent_id);
3437
m_tree->to_doc(m_state->node_id);
3438
}
3439
#ifdef RYML_NO_COVERAGE__TO_BE_DELETED
3440
else
3441
{
3442
_RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(parent_id) || m_tree->empty(parent_id));
3443
m_state->node_id = parent_id;
3444
if( ! m_tree->is_doc(parent_id))
3445
{
3446
m_tree->to_doc(parent_id, DOC);
3447
}
3448
}
3449
#endif
3450
_c4dbgpf("start_doc: id={}", m_state->node_id);
3451
add_flags(RUNK|RTOP|NDOC);
3452
_handle_types();
3453
rem_flags(NDOC);
3454
}
3455
3456
void Parser::_stop_doc()
3457
{
3458
size_t doc_node = m_state->node_id;
3459
_c4dbgpf("stop_doc[{}]", doc_node);
3460
_RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_doc(doc_node));
3461
if(!m_tree->is_seq(doc_node) && !m_tree->is_map(doc_node) && !m_tree->is_val(doc_node))
3462
{
3463
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(SSCL));
3464
_c4dbgpf("stop_doc[{}]: there was nothing; adding null val", doc_node);
3465
m_tree->to_val(doc_node, {}, DOC);
3466
}
3467
}
3468
3469
void Parser::_end_stream()
3470
{
3471
_c4dbgpf("end_stream, level={} node_id={}", m_state->level, m_state->node_id);
3472
_RYML_CB_ASSERT(m_stack.m_callbacks, ! m_stack.empty());
3473
NodeData *added = nullptr;
3474
if(has_any(SSCL))
3475
{
3476
if(m_tree->is_seq(m_state->node_id))
3477
{
3478
_c4dbgp("append val...");
3479
added = _append_val(_consume_scalar());
3480
}
3481
else if(m_tree->is_map(m_state->node_id))
3482
{
3483
_c4dbgp("append null key val...");
3484
added = _append_key_val_null(m_state->line_contents.rem.str);
3485
#ifdef RYML_NO_COVERAGE__TO_BE_DELETED
3486
if(has_any(RSEQIMAP))
3487
{
3488
_stop_seqimap();
3489
_pop_level();
3490
}
3491
#endif
3492
}
3493
else if(m_tree->is_doc(m_state->node_id) || m_tree->type(m_state->node_id) == NOTYPE)
3494
{
3495
NodeType_e quoted = has_any(QSCL) ? VALQUO : NOTYPE; // do this before consuming the scalar
3496
csubstr scalar = _consume_scalar();
3497
_c4dbgpf("node[{}]: to docval '{}'{}", m_state->node_id, scalar, quoted == VALQUO ? ", quoted" : "");
3498
m_tree->to_val(m_state->node_id, scalar, DOC|quoted);
3499
added = m_tree->get(m_state->node_id);
3500
}
3501
else
3502
{
3503
_c4err("internal error");
3504
}
3505
}
3506
else if(has_all(RSEQ|RVAL) && has_none(FLOW))
3507
{
3508
_c4dbgp("add last...");
3509
added = _append_val_null(m_state->line_contents.rem.str);
3510
}
3511
else if(!m_val_tag.empty() && (m_tree->is_doc(m_state->node_id) || m_tree->type(m_state->node_id) == NOTYPE))
3512
{
3513
csubstr scalar = m_state->line_contents.rem.first(0);
3514
_c4dbgpf("node[{}]: add null scalar as docval", m_state->node_id);
3515
m_tree->to_val(m_state->node_id, scalar, DOC);
3516
added = m_tree->get(m_state->node_id);
3517
}
3518
3519
if(added)
3520
{
3521
size_t added_id = m_tree->id(added);
3522
if(m_tree->is_seq(m_state->node_id) || m_tree->is_doc(m_state->node_id))
3523
{
3524
if(!m_key_anchor.empty())
3525
{
3526
_c4dbgpf("node[{}]: move key to val anchor: '{}'", added_id, m_key_anchor);
3527
m_val_anchor = m_key_anchor;
3528
m_key_anchor = {};
3529
}
3530
if(!m_key_tag.empty())
3531
{
3532
_c4dbgpf("node[{}]: move key to val tag: '{}'", added_id, m_key_tag);
3533
m_val_tag = m_key_tag;
3534
m_key_tag = {};
3535
}
3536
}
3537
#ifdef RYML_NO_COVERAGE__TO_BE_DELETED
3538
if(!m_key_anchor.empty())
3539
{
3540
_c4dbgpf("node[{}]: set key anchor='{}'", added_id, m_key_anchor);
3541
m_tree->set_key_anchor(added_id, m_key_anchor);
3542
m_key_anchor = {};
3543
}
3544
#endif
3545
if(!m_val_anchor.empty())
3546
{
3547
_c4dbgpf("node[{}]: set val anchor='{}'", added_id, m_val_anchor);
3548
m_tree->set_val_anchor(added_id, m_val_anchor);
3549
m_val_anchor = {};
3550
}
3551
#ifdef RYML_NO_COVERAGE__TO_BE_DELETED
3552
if(!m_key_tag.empty())
3553
{
3554
_c4dbgpf("node[{}]: set key tag='{}' -> '{}'", added_id, m_key_tag, normalize_tag(m_key_tag));
3555
m_tree->set_key_tag(added_id, normalize_tag(m_key_tag));
3556
m_key_tag = {};
3557
}
3558
#endif
3559
if(!m_val_tag.empty())
3560
{
3561
_c4dbgpf("node[{}]: set val tag='{}' -> '{}'", added_id, m_val_tag, normalize_tag(m_val_tag));
3562
m_tree->set_val_tag(added_id, normalize_tag(m_val_tag));
3563
m_val_tag = {};
3564
}
3565
}
3566
3567
while(m_stack.size() > 1)
3568
{
3569
_c4dbgpf("popping level: {} (stack sz={})", m_state->level, m_stack.size());
3570
_RYML_CB_ASSERT(m_stack.m_callbacks, ! has_any(SSCL, &m_stack.top()));
3571
if(has_all(RSEQ|FLOW))
3572
_err("closing ] not found");
3573
_pop_level();
3574
}
3575
add_flags(NDOC);
3576
}
3577
3578
/** Handle a "---" token: end the current stream and start a new
 * document.
 *
 * The reference indentation read after ending the stream is applied to
 * the state of the new document.
 *
 * @param rem the remainder of the line; must begin with "---". It is
 *        only used in the assertion. */
void Parser::_start_new_doc(csubstr rem)
{
    _c4dbgp("_start_new_doc");
    _RYML_CB_ASSERT(m_stack.m_callbacks, rem.begins_with("---"));
    C4_UNUSED(rem);

    _end_stream();

    // read this only after ending the stream, which pops the stack
    const size_t saved_indref = m_state->indref;
    _c4dbgpf("start a document, indentation={}", saved_indref);
    _line_progressed(3); // skip the "---"
    _push_level();
    _start_doc();
    _set_indentation(saved_indref);
}
3593
3594
3595
//-----------------------------------------------------------------------------
3596
void Parser::_start_map(bool as_child)
3597
{
3598
_c4dbgpf("start_map (as child={})", as_child);
3599
addrem_flags(RMAP|RVAL, RKEY|RUNK);
3600
_RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id));
3601
size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id;
3602
_RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE);
3603
_RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id));
3604
if(as_child)
3605
{
3606
m_state->node_id = m_tree->append_child(parent_id);
3607
if(has_all(SSCL))
3608
{
3609
type_bits key_quoted = NOTYPE;
3610
if(m_state->flags & QSCL) // before consuming the scalar
3611
key_quoted |= KEYQUO;
3612
csubstr key = _consume_scalar();
3613
m_tree->to_map(m_state->node_id, key, key_quoted);
3614
_c4dbgpf("start_map: id={} key='{}'", m_state->node_id, m_tree->key(m_state->node_id));
3615
_write_key_anchor(m_state->node_id);
3616
if( ! m_key_tag.empty())
3617
{
3618
_c4dbgpf("node[{}]: set key tag='{}' -> '{}'", m_state->node_id, m_key_tag, normalize_tag(m_key_tag));
3619
m_tree->set_key_tag(m_state->node_id, normalize_tag(m_key_tag));
3620
m_key_tag.clear();
3621
}
3622
}
3623
else
3624
{
3625
m_tree->to_map(m_state->node_id);
3626
_c4dbgpf("start_map: id={}", m_state->node_id);
3627
}
3628
m_tree->_p(m_state->node_id)->m_val.scalar.str = m_state->line_contents.rem.str;
3629
_write_val_anchor(m_state->node_id);
3630
}
3631
else
3632
{
3633
_RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE);
3634
m_state->node_id = parent_id;
3635
_c4dbgpf("start_map: id={}", m_state->node_id);
3636
type_bits as_doc = 0;
3637
if(m_tree->is_doc(m_state->node_id))
3638
as_doc |= DOC;
3639
if(!m_tree->is_map(parent_id))
3640
{
3641
RYML_CHECK(!m_tree->has_children(parent_id));
3642
m_tree->to_map(parent_id, as_doc);
3643
}
3644
else
3645
{
3646
m_tree->_add_flags(parent_id, as_doc);
3647
}
3648
_move_scalar_from_top();
3649
if(m_key_anchor.not_empty())
3650
m_key_anchor_was_before = true;
3651
_write_val_anchor(parent_id);
3652
if(m_stack.size() >= 2)
3653
{
3654
State const& parent_state = m_stack.top(1);
3655
if(parent_state.flags & RSET)
3656
add_flags(RSET);
3657
}
3658
m_tree->_p(parent_id)->m_val.scalar.str = m_state->line_contents.rem.str;
3659
}
3660
if( ! m_val_tag.empty())
3661
{
3662
_c4dbgpf("node[{}]: set val tag='{}' -> '{}'", m_state->node_id, m_val_tag, normalize_tag(m_val_tag));
3663
m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag));
3664
m_val_tag.clear();
3665
}
3666
}
3667
3668
void Parser::_start_map_unk(bool as_child)
3669
{
3670
_c4dbgpf("start_map_unk (as child={})", as_child);
3671
if(!m_key_anchor_was_before)
3672
{
3673
_c4dbgpf("stash key anchor before starting map... '{}'", m_key_anchor);
3674
csubstr ka = m_key_anchor;
3675
m_key_anchor = {};
3676
_start_map(as_child);
3677
m_key_anchor = ka;
3678
}
3679
else
3680
{
3681
_start_map(as_child);
3682
m_key_anchor_was_before = false;
3683
}
3684
if(m_key_tag2.not_empty())
3685
{
3686
m_key_tag = m_key_tag2;
3687
m_key_tag_indentation = m_key_tag2_indentation;
3688
m_key_tag2.clear();
3689
m_key_tag2_indentation = 0;
3690
}
3691
}
3692
3693
void Parser::_stop_map()
3694
{
3695
_c4dbgpf("stop_map[{}]", m_state->node_id);
3696
_RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(m_state->node_id));
3697
if(has_all(QMRK|RKEY) && !has_all(SSCL))
3698
{
3699
_c4dbgpf("stop_map[{}]: RKEY", m_state->node_id);
3700
_store_scalar_null(m_state->line_contents.rem.str);
3701
_append_key_val_null(m_state->line_contents.rem.str);
3702
}
3703
}
3704
3705
3706
//-----------------------------------------------------------------------------
3707
void Parser::_start_seq(bool as_child)
3708
{
3709
_c4dbgpf("start_seq (as child={})", as_child);
3710
if(has_all(RTOP|RUNK))
3711
{
3712
_c4dbgpf("start_seq: moving key tag to val tag: '{}'", m_key_tag);
3713
m_val_tag = m_key_tag;
3714
m_key_tag.clear();
3715
}
3716
addrem_flags(RSEQ|RVAL, RUNK);
3717
_RYML_CB_ASSERT(m_stack.m_callbacks, node(m_stack.bottom()) == node(m_root_id));
3718
size_t parent_id = m_stack.size() < 2 ? m_root_id : m_stack.top(1).node_id;
3719
_RYML_CB_ASSERT(m_stack.m_callbacks, parent_id != NONE);
3720
_RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) == nullptr || node(m_state) == node(m_root_id));
3721
if(as_child)
3722
{
3723
m_state->node_id = m_tree->append_child(parent_id);
3724
if(has_all(SSCL))
3725
{
3726
_RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(parent_id));
3727
type_bits key_quoted = 0;
3728
if(m_state->flags & QSCL) // before consuming the scalar
3729
key_quoted |= KEYQUO;
3730
csubstr key = _consume_scalar();
3731
m_tree->to_seq(m_state->node_id, key, key_quoted);
3732
_c4dbgpf("start_seq: id={} name='{}'", m_state->node_id, m_tree->key(m_state->node_id));
3733
_write_key_anchor(m_state->node_id);
3734
if( ! m_key_tag.empty())
3735
{
3736
_c4dbgpf("start_seq[{}]: set key tag='{}' -> '{}'", m_state->node_id, m_key_tag, normalize_tag(m_key_tag));
3737
m_tree->set_key_tag(m_state->node_id, normalize_tag(m_key_tag));
3738
m_key_tag.clear();
3739
}
3740
}
3741
else
3742
{
3743
type_bits as_doc = 0;
3744
_RYML_CB_ASSERT(m_stack.m_callbacks, !m_tree->is_doc(m_state->node_id));
3745
m_tree->to_seq(m_state->node_id, as_doc);
3746
_c4dbgpf("start_seq: id={}{}", m_state->node_id, as_doc ? " as doc" : "");
3747
}
3748
_write_val_anchor(m_state->node_id);
3749
m_tree->_p(m_state->node_id)->m_val.scalar.str = m_state->line_contents.rem.str;
3750
}
3751
else
3752
{
3753
m_state->node_id = parent_id;
3754
type_bits as_doc = 0;
3755
if(m_tree->is_doc(m_state->node_id))
3756
as_doc |= DOC;
3757
if(!m_tree->is_seq(parent_id))
3758
{
3759
RYML_CHECK(!m_tree->has_children(parent_id));
3760
m_tree->to_seq(parent_id, as_doc);
3761
}
3762
else
3763
{
3764
m_tree->_add_flags(parent_id, as_doc);
3765
}
3766
_move_scalar_from_top();
3767
_c4dbgpf("start_seq: id={}{}", m_state->node_id, as_doc ? " as_doc" : "");
3768
_write_val_anchor(parent_id);
3769
m_tree->_p(parent_id)->m_val.scalar.str = m_state->line_contents.rem.str;
3770
}
3771
if( ! m_val_tag.empty())
3772
{
3773
_c4dbgpf("start_seq[{}]: set val tag='{}' -> '{}'", m_state->node_id, m_val_tag, normalize_tag(m_val_tag));
3774
m_tree->set_val_tag(m_state->node_id, normalize_tag(m_val_tag));
3775
m_val_tag.clear();
3776
}
3777
}
3778
3779
void Parser::_stop_seq()
3780
{
3781
_c4dbgp("stop_seq");
3782
_RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(m_state->node_id));
3783
}
3784
3785
3786
//-----------------------------------------------------------------------------
3787
void Parser::_start_seqimap()
3788
{
3789
_c4dbgpf("start_seqimap at node={}. has_children={}", m_state->node_id, m_tree->has_children(m_state->node_id));
3790
_RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQ|FLOW));
3791
// create a map, and turn the last scalar of this sequence
3792
// into the key of the map's first child. This scalar was
3793
// understood to be a value in the sequence, but it is
3794
// actually a key of a map, implicitly opened here.
3795
// Eg [val, key: val]
3796
//
3797
// Yep, YAML is crazy.
3798
if(m_tree->has_children(m_state->node_id) && m_tree->has_val(m_tree->last_child(m_state->node_id)))
3799
{
3800
size_t prev = m_tree->last_child(m_state->node_id);
3801
NodeType ty = m_tree->_p(prev)->m_type; // don't use type() because it masks out the quotes
3802
NodeScalar tmp = m_tree->valsc(prev);
3803
_c4dbgpf("has children and last child={} has val. saving the scalars, val='{}' quoted={}", prev, tmp.scalar, ty.is_val_quoted());
3804
m_tree->remove(prev);
3805
_push_level();
3806
_start_map();
3807
_store_scalar(tmp.scalar, ty.is_val_quoted());
3808
m_key_anchor = tmp.anchor;
3809
m_key_tag = tmp.tag;
3810
}
3811
else
3812
{
3813
_c4dbgpf("node {} has no children yet, using empty key", m_state->node_id);
3814
_push_level();
3815
_start_map();
3816
_store_scalar_null(m_state->line_contents.rem.str);
3817
}
3818
add_flags(RSEQIMAP|FLOW);
3819
}
3820
3821
void Parser::_stop_seqimap()
3822
{
3823
_c4dbgp("stop_seqimap");
3824
_RYML_CB_ASSERT(m_stack.m_callbacks, has_all(RSEQIMAP));
3825
}
3826
3827
3828
//-----------------------------------------------------------------------------
3829
/** Append a val as a new child of the current seq node.
 *
 * The pending val tag and val anchor are written to the new node. The
 * state must not have a stored scalar, and its node must be a seq.
 *
 * @param val the scalar to use as val
 * @param quoted when nonzero, the new node is marked as VALQUO
 * @return the data of the new node */
NodeData* Parser::_append_val(csubstr val, flag_t quoted)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, ! has_all(SSCL));
    _RYML_CB_ASSERT(m_stack.m_callbacks, node(m_state) != nullptr);
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_seq(m_state->node_id));
    type_bits more_flags = NOTYPE;
    if(quoted)
        more_flags = VALQUO;
    _c4dbgpf("append val: '{}' to parent id={} (level={}){}", val, m_state->node_id, m_state->level, quoted ? " VALQUO!" : "");
    const size_t child = m_tree->append_child(m_state->node_id);
    m_tree->to_val(child, val, more_flags);
    _c4dbgpf("append val: id={} val='{}'", child, m_tree->get(child)->m_val.scalar);
    if( ! m_val_tag.empty())
    {
        _c4dbgpf("append val[{}]: set val tag='{}' -> '{}'", child, m_val_tag, normalize_tag(m_val_tag));
        m_tree->set_val_tag(child, normalize_tag(m_val_tag));
        m_val_tag.clear();
    }
    _write_val_anchor(child);
    return m_tree->get(child);
}
3848
3849
/** Append a key-val pair as a new child of the current map node.
 *
 * The key is the scalar stored in the current state, which is consumed
 * here; the key is marked as KEYQUO when the state has QSCL. The
 * pending key/val tags and key/val anchors are written to the new node,
 * and QMRK is removed from the state flags.
 *
 * @param val the scalar to use as val
 * @param val_quoted when nonzero, the val is marked as VALQUO
 * @return the data of the new node */
NodeData* Parser::_append_key_val(csubstr val, flag_t val_quoted)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_map(m_state->node_id));
    // look at the quoted flag of the key before consuming the scalar
    const bool key_quoted = (m_state->flags & QSCL) != 0;
    type_bits more_flags = 0;
    if(key_quoted)
        more_flags |= KEYQUO;
    if(val_quoted)
        more_flags |= VALQUO;
    const csubstr key = _consume_scalar();
    _c4dbgpf("append keyval: '{}' '{}' to parent id={} (level={}){}{}", key, val, m_state->node_id, m_state->level, (more_flags & KEYQUO) ? " KEYQUO!" : "", (more_flags & VALQUO) ? " VALQUO!" : "");
    const size_t child = m_tree->append_child(m_state->node_id);
    m_tree->to_keyval(child, key, val, more_flags);
    _c4dbgpf("append keyval: id={} key='{}' val='{}'", child, m_tree->key(child), m_tree->val(child));
    if( ! m_key_tag.empty())
    {
        _c4dbgpf("append keyval[{}]: set key tag='{}' -> '{}'", child, m_key_tag, normalize_tag(m_key_tag));
        m_tree->set_key_tag(child, normalize_tag(m_key_tag));
        m_key_tag.clear();
    }
    if( ! m_val_tag.empty())
    {
        _c4dbgpf("append keyval[{}]: set val tag='{}' -> '{}'", child, m_val_tag, normalize_tag(m_val_tag));
        m_tree->set_val_tag(child, normalize_tag(m_val_tag));
        m_val_tag.clear();
    }
    _write_key_anchor(child);
    _write_val_anchor(child);
    rem_flags(QMRK);
    return m_tree->get(child);
}
3879
3880
3881
//-----------------------------------------------------------------------------
3882
/** Store a scalar in the current state, to be consumed later.
 *
 * The state must not already hold a scalar. The SSCL flag is added to
 * the state, together with QSCL when the scalar is quoted.
 *
 * @param s the scalar to store
 * @param is_quoted any nonzero value marks the scalar as quoted. This
 *        is treated as a truth value (like the quoted flags of
 *        _append_val() and _append_key_val()), so that a caller passing
 *        a value other than 0 or 1 still gets exactly the QSCL flag. */
void Parser::_store_scalar(csubstr s, flag_t is_quoted)
{
    _c4dbgpf("state[{}]: storing scalar '{}' (flag: {}) (old scalar='{}')",
             m_state-m_stack.begin(), s, m_state->flags & SSCL, m_state->scalar);
    RYML_CHECK(has_none(SSCL));
    add_flags(SSCL | (is_quoted ? flag_t(QSCL) : flag_t(0)));
    m_state->scalar = s;
}
3890
3891
/** Take the scalar stored in the current state.
 *
 * The state must hold a scalar (SSCL). The SSCL and QSCL flags are
 * removed and the stored scalar is cleared.
 *
 * @return the scalar that was stored */
csubstr Parser::_consume_scalar()
{
    _c4dbgpf("state[{}]: consuming scalar '{}' (flag: {}))", m_state-m_stack.begin(), m_state->scalar, m_state->flags & SSCL);
    RYML_CHECK(m_state->flags & SSCL);
    const csubstr taken = m_state->scalar;
    m_state->scalar.clear();
    rem_flags(SSCL | QSCL);
    return taken;
}
3900
3901
void Parser::_move_scalar_from_top()
3902
{
3903
if(m_stack.size() < 2) return;
3904
State &prev = m_stack.top(1);
3905
_RYML_CB_ASSERT(m_stack.m_callbacks, m_state == &m_stack.top());
3906
_RYML_CB_ASSERT(m_stack.m_callbacks, m_state != &prev);
3907
if(prev.flags & SSCL)
3908
{
3909
_c4dbgpf("moving scalar '{}' from state[{}] to state[{}] (overwriting '{}')", prev.scalar, &prev-m_stack.begin(), m_state-m_stack.begin(), m_state->scalar);
3910
add_flags(prev.flags & (SSCL | QSCL));
3911
m_state->scalar = prev.scalar;
3912
rem_flags(SSCL | QSCL, &prev);
3913
prev.scalar.clear();
3914
}
3915
}
3916
3917
//-----------------------------------------------------------------------------
3918
/** @todo this function is a monster and needs love. Likely, it needs
3919
* to be split like _scan_scalar_*() */
3920
bool Parser::_handle_indentation()
3921
{
3922
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(FLOW));
3923
if( ! _at_line_begin())
3924
return false;
3925
3926
size_t ind = m_state->line_contents.indentation;
3927
csubstr rem = m_state->line_contents.rem;
3928
/** @todo instead of trimming, we should use the indentation index from above */
3929
csubstr remt = rem.triml(' ');
3930
3931
if(remt.empty() || remt.begins_with('#')) // this is a blank or comment line
3932
{
3933
_line_progressed(rem.size());
3934
return true;
3935
}
3936
3937
_c4dbgpf("indentation? ind={} indref={}", ind, m_state->indref);
3938
if(ind == m_state->indref)
3939
{
3940
_c4dbgpf("same indentation: {}", ind);
3941
if(!rem.sub(ind).begins_with('-'))
3942
{
3943
_c4dbgp("does not begin with -");
3944
if(has_any(RMAP))
3945
{
3946
if(has_all(SSCL|RVAL))
3947
{
3948
_c4dbgp("add with null val");
3949
_append_key_val_null(rem.str + ind - 1);
3950
addrem_flags(RKEY, RVAL);
3951
}
3952
}
3953
else if(has_any(RSEQ))
3954
{
3955
if(m_stack.size() > 2) // do not pop to root level
3956
{
3957
if(has_any(RNXT))
3958
{
3959
_c4dbgp("end the indentless seq");
3960
_pop_level();
3961
return true;
3962
}
3963
else if(has_any(RVAL))
3964
{
3965
_c4dbgp("add with null val");
3966
_append_val_null(rem.str);
3967
_c4dbgp("end the indentless seq");
3968
_pop_level();
3969
return true;
3970
}
3971
}
3972
}
3973
}
3974
_line_progressed(ind);
3975
return ind > 0;
3976
}
3977
else if(ind < m_state->indref)
3978
{
3979
_c4dbgpf("smaller indentation ({} < {})!!!", ind, m_state->indref);
3980
if(has_all(RVAL))
3981
{
3982
_c4dbgp("there was an empty val -- appending");
3983
if(has_all(RMAP))
3984
{
3985
_RYML_CB_ASSERT(m_stack.m_callbacks, has_all(SSCL));
3986
_append_key_val_null(rem.sub(ind).str - 1);
3987
}
3988
else if(has_all(RSEQ))
3989
{
3990
_RYML_CB_ASSERT(m_stack.m_callbacks, has_none(SSCL));
3991
_append_val_null(rem.sub(ind).str - 1);
3992
}
3993
}
3994
// search the stack frame to jump to based on its indentation
3995
State const* popto = nullptr;
3996
_RYML_CB_ASSERT(m_stack.m_callbacks, m_stack.is_contiguous()); // this search relies on the stack being contiguous
3997
for(State const* s = m_state-1; s >= m_stack.begin(); --s)
3998
{
3999
_c4dbgpf("searching for state with indentation {}. curr={} (level={},node={})", ind, s->indref, s->level, s->node_id);
4000
if(s->indref == ind)
4001
{
4002
_c4dbgpf("gotit!!! level={} node={}", s->level, s->node_id);
4003
popto = s;
4004
// while it may be tempting to think we're done at this
4005
// point, we must still determine whether we're jumping to a
4006
// parent with the same indentation. Consider this case with
4007
// an indentless sequence:
4008
//
4009
// product:
4010
// - sku: BL394D
4011
// quantity: 4
4012
// description: Basketball
4013
// price: 450.00
4014
// - sku: BL4438H
4015
// quantity: 1
4016
// description: Super Hoop
4017
// price: 2392.00 # jumping one level here would be wrong.
4018
// tax: 1234.5 # we must jump two levels
4019
if(popto > m_stack.begin())
4020
{
4021
auto parent = popto - 1;
4022
if(parent->indref == popto->indref)
4023
{
4024
_c4dbgpf("the parent (level={},node={}) has the same indentation ({}). is this in an indentless sequence?", parent->level, parent->node_id, popto->indref);
4025
_c4dbgpf("isseq(popto)={} ismap(parent)={}", m_tree->is_seq(popto->node_id), m_tree->is_map(parent->node_id));
4026
if(m_tree->is_seq(popto->node_id) && m_tree->is_map(parent->node_id))
4027
{
4028
if( ! remt.begins_with('-'))
4029
{
4030
_c4dbgp("this is an indentless sequence");
4031
popto = parent;
4032
}
4033
else
4034
{
4035
_c4dbgp("not an indentless sequence");
4036
}
4037
}
4038
}
4039
}
4040
break;
4041
}
4042
}
4043
if(!popto || popto >= m_state || popto->level >= m_state->level)
4044
{
4045
_c4err("parse error: incorrect indentation?");
4046
}
4047
_c4dbgpf("popping {} levels: from level {} to level {}", m_state->level-popto->level, m_state->level, popto->level);
4048
while(m_state != popto)
4049
{
4050
_c4dbgpf("popping level {} (indentation={})", m_state->level, m_state->indref);
4051
_pop_level();
4052
}
4053
_RYML_CB_ASSERT(m_stack.m_callbacks, ind == m_state->indref);
4054
_line_progressed(ind);
4055
return true;
4056
}
4057
else
4058
{
4059
_c4dbgpf("larger indentation ({} > {})!!!", ind, m_state->indref);
4060
_RYML_CB_ASSERT(m_stack.m_callbacks, ind > m_state->indref);
4061
if(has_all(RMAP|RVAL))
4062
{
4063
if(_is_scalar_next__rmap_val(remt) && (!remt.first_of_any(": ", "? ")) && (!remt.ends_with(":")))
4064
{
4065
_c4dbgpf("actually it seems a value: '{}'", remt);
4066
}
4067
else
4068
{
4069
addrem_flags(RKEY, RVAL);
4070
_start_unk();
4071
//_move_scalar_from_top();
4072
_line_progressed(ind);
4073
_save_indentation();
4074
return true;
4075
}
4076
}
4077
else if(has_all(RSEQ|RVAL))
4078
{
4079
// nothing to do here
4080
}
4081
else
4082
{
4083
_c4err("parse error - indentation should not increase at this point");
4084
}
4085
}
4086
4087
return false;
4088
}
4089
4090
//-----------------------------------------------------------------------------
4091
/** Scan a comment out of the remainder of the current line.
 *
 * The remainder must start with '#'. All of it is reported as consumed
 * via _line_progressed().
 *
 * @return the comment text, without the '#' and without the spaces
 *         that directly follow it. */
csubstr Parser::_scan_comment()
{
    const csubstr remainder = m_state->line_contents.rem;
    _RYML_CB_ASSERT(m_stack.m_callbacks, remainder.begins_with('#'));
    // the comment takes everything that is left on this line
    _line_progressed(remainder.len);
    // drop the '#' marker, then the spaces leading the comment text
    const csubstr after_hash = remainder.sub(1);
    const size_t text_begin = after_hash.first_not_of(' ');
    const csubstr comment = after_hash.right_of(text_begin, /*include_pos*/true);
    _c4dbgpf("comment was '{}'", comment);
    return comment;
}
4103
4104
//-----------------------------------------------------------------------------
4105
/** Scan a single-quoted scalar beginning at the current parse position.
 *
 * Leading spaces are skipped and the opening quote is consumed. Lines are
 * then read until a quote is found that is not immediately followed by
 * another quote (two consecutive quotes are the escape for one quote).
 * On success the parse position is left just past the closing quote.
 *
 * @return the text between the quotes. It is passed through
 *         _filter_squot_scalar() when an escaped quote was seen, when
 *         more than one line was scanned, when a scanned line held only
 *         spaces, or when a line scanned from its beginning starts with
 *         a space.
 *
 * Reports through _c4err() when the file ends before a closing quote. */
csubstr Parser::_scan_squot_scalar()
{
    // quoted scalars can spread over multiple lines!
    // nice explanation here: http://yaml-multiline.info/

    // view from the current position up to the end of the buffer
    size_t start_offset = m_state->pos.offset;
    substr scalar = m_buf.sub(start_offset);
    if(scalar.begins_with(' '))
    {
        const substr untrimmed = scalar;
        scalar = scalar.triml(' ');
        _RYML_CB_ASSERT(m_stack.m_callbacks, untrimmed.is_super(scalar));
        _RYML_CB_ASSERT(m_stack.m_callbacks, scalar.begin() >= untrimmed.begin());
        _line_progressed(static_cast<size_t>(scalar.begin() - untrimmed.begin()));
    }
    // re-read the offset: it now accounts for any skipped spaces
    start_offset = m_state->pos.offset;
    _RYML_CB_ASSERT(m_stack.m_callbacks, scalar.begins_with('\''));

    // step over the opening quote
    _line_progressed(1);
    scalar = scalar.sub(1);

    bool needs_filter = false;
    size_t lines_seen = 1;   // the line holding the opening quote
    size_t closing = npos;   // position of the matching quote, once found
    while( ! _finished_file())
    {
        const csubstr line = m_state->line_contents.rem;
        bool line_is_blank = true;
        _c4dbgpf("scanning single quoted scalar @ line[{}]: ~~~{}~~~", m_state->pos.line, line);
        size_t i = 0;
        while(i < line.len)
        {
            const char c = line.str[i];
            if(c != '\'')
            {
                if(c != ' ')
                    line_is_blank = false;
                ++i;
                continue;
            }
            // a quote: it closes the scalar unless another quote follows
            const bool doubled = (i+1 < line.len) && (line.str[i+1] == '\'');
            if( ! doubled)
            {
                closing = i;
                break;
            }
            needs_filter = true; // the escaped quote must be collapsed later
            i += 2;              // step over both quotes
        }

        // leading whitespace also needs filtering
        if( ! needs_filter)
        {
            needs_filter = (lines_seen > 1)
                || line_is_blank
                || (_at_line_begin() && line.begins_with(' '));
        }

        if(closing != npos)
        {
            _RYML_CB_ASSERT(m_stack.m_callbacks, closing >= 0 && closing < m_buf.len);
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf[m_state->pos.offset + closing] == '\'');
            _line_progressed(closing + 1); // progress beyond the quote
            // turn the in-line position into a distance from the opening quote
            closing = m_state->pos.offset - start_offset - 1;
            break;
        }

        // no closing quote on this line: consume it and fetch the next one
        _line_progressed(line.len);
        ++lines_seen;
        _line_ended();
        _scan_line();
    }

    if(closing == npos)
    {
        _c4err("reached end of file while looking for closing quote");
    }
    else
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, closing > 0);
        _RYML_CB_ASSERT(m_stack.m_callbacks, scalar.end() >= m_buf.begin() && scalar.end() <= m_buf.end());
        _RYML_CB_ASSERT(m_stack.m_callbacks, scalar.end() == m_buf.end() || *scalar.end() == '\'');
        // scalar starts one past the opening quote, hence the -1
        scalar = scalar.sub(0, closing-1);
    }

    if(needs_filter)
    {
        csubstr filtered = _filter_squot_scalar(scalar);
        _RYML_CB_ASSERT(m_stack.m_callbacks, filtered.len <= scalar.len || scalar.empty() || scalar.trim(' ').empty());
        _c4dbgpf("final scalar: \"{}\"", filtered);
        return filtered;
    }

    _c4dbgpf("final scalar: \"{}\"", scalar);

    return scalar;
}
4207
4208
//-----------------------------------------------------------------------------
4209
/** Scan a double-quoted scalar beginning at the current parse position.
 *
 * Leading spaces are skipped and the opening quote is consumed. Lines are
 * then read until an unescaped '"' is found; a backslash followed by '"'
 * or by another backslash is stepped over as a pair. On success the parse
 * position is left just past the closing quote.
 *
 * @return the text between the quotes. It is passed through
 *         _filter_dquot_scalar() when a backslash was seen, when more
 *         than one line was scanned, when a scanned line held only
 *         spaces, or when a line scanned from its beginning starts with
 *         a space.
 *
 * Reports through _c4err() when the file ends before a closing quote. */
csubstr Parser::_scan_dquot_scalar()
{
    // quoted scalars can spread over multiple lines!
    // nice explanation here: http://yaml-multiline.info/

    // view from the current position up to the end of the buffer
    size_t start_offset = m_state->pos.offset;
    substr scalar = m_buf.sub(start_offset);
    if(scalar.begins_with(' '))
    {
        const substr untrimmed = scalar;
        scalar = scalar.triml(' ');
        _RYML_CB_ASSERT(m_stack.m_callbacks, untrimmed.is_super(scalar));
        _RYML_CB_ASSERT(m_stack.m_callbacks, scalar.begin() >= untrimmed.begin());
        _line_progressed(static_cast<size_t>(scalar.begin() - untrimmed.begin()));
    }
    // re-read the offset: it now accounts for any skipped spaces
    start_offset = m_state->pos.offset;
    _RYML_CB_ASSERT(m_stack.m_callbacks, scalar.begins_with('"'));

    // step over the opening quote
    _line_progressed(1);
    scalar = scalar.sub(1);

    bool needs_filter = false;
    size_t lines_seen = 1;   // the line holding the opening quote
    size_t closing = npos;   // position of the matching quote, once found
    while( ! _finished_file())
    {
        const csubstr line = m_state->line_contents.rem;
        bool line_is_blank = true;
        _c4dbgpf("scanning double quoted scalar @ line[{}]: line='{}'", m_state->pos.line, line);
        for(size_t i = 0; i < line.len; ++i)
        {
            const char c = line.str[i];
            line_is_blank = line_is_blank && (c == ' ');
            if(c == '"')
            {
                closing = i;
                break;
            }
            if(c != '\\')
                continue;
            // every \ is an escape
            needs_filter = true;
            const char after = (i+1 < line.len) ? line.str[i+1] : '~';
            if(after == '"' || after == '\\')
                ++i; // step over the escaped character as well
        }

        // leading whitespace also needs filtering
        if( ! needs_filter)
        {
            needs_filter = (lines_seen > 1)
                || line_is_blank
                || (_at_line_begin() && line.begins_with(' '));
        }

        if(closing != npos)
        {
            _RYML_CB_ASSERT(m_stack.m_callbacks, closing >= 0 && closing < m_buf.len);
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf[m_state->pos.offset + closing] == '"');
            _line_progressed(closing + 1); // progress beyond the quote
            // turn the in-line position into a distance from the opening quote
            closing = m_state->pos.offset - start_offset - 1;
            break;
        }

        // no closing quote on this line: consume it and fetch the next one
        _line_progressed(line.len);
        ++lines_seen;
        _line_ended();
        _scan_line();
    }

    if(closing == npos)
    {
        _c4err("reached end of file looking for closing quote");
    }
    else
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, closing > 0);
        _RYML_CB_ASSERT(m_stack.m_callbacks, scalar.end() == m_buf.end() || *scalar.end() == '"');
        _RYML_CB_ASSERT(m_stack.m_callbacks, scalar.end() >= m_buf.begin() && scalar.end() <= m_buf.end());
        // scalar starts one past the opening quote, hence the -1
        scalar = scalar.sub(0, closing-1);
    }

    if(needs_filter)
    {
        csubstr filtered = _filter_dquot_scalar(scalar);
        _c4dbgpf("final scalar: [{}]\"{}\"", filtered.len, filtered);
        _RYML_CB_ASSERT(m_stack.m_callbacks, filtered.len <= scalar.len || scalar.empty() || scalar.trim(' ').empty());
        return filtered;
    }

    _c4dbgpf("final scalar: \"{}\"", scalar);

    return scalar;
}
4308
4309
//-----------------------------------------------------------------------------
4310
/** Scan a block scalar whose header ('|' or '>') is at the current position.
 *
 * Steps, in order:
 *  1. skip spaces before the header and read it: '>' selects BLOCK_FOLD,
 *     otherwise BLOCK_LITERAL; an optional '-' / '+' selects
 *     CHOMP_STRIP / CHOMP_KEEP (default CHOMP_CLIP); optional digits give
 *     an explicit indentation, which is added to the current indref.
 *  2. consume the header line and peek at the following lines one at a
 *     time, appending each full line to a raw block until a termination
 *     condition is met.
 *  3. hand the raw block to _filter_block_scalar().
 *
 * When no explicit indentation is given it is taken from the first
 * non-empty line; empty lines seen before that only update a provisional
 * value. See https://yaml.org/spec/1.2.2/#8111-block-indentation-indicator
 *
 * @return the result of _filter_block_scalar() on the collected lines. */
csubstr Parser::_scan_block()
{
    // nice explanation here: http://yaml-multiline.info/
    csubstr header = m_state->line_contents.rem;
    {
        const csubstr no_lead = header.triml(' ');
        if(no_lead.str > header.str)
        {
            _c4dbgp("skipping whitespace");
            _RYML_CB_ASSERT(m_stack.m_callbacks, no_lead.str >= header.str);
            _line_progressed(static_cast<size_t>(no_lead.str - header.str));
            header = no_lead;
        }
    }
    _RYML_CB_ASSERT(m_stack.m_callbacks, header.begins_with('|') || header.begins_with('>'));

    _c4dbgpf("scanning block: specs=\"{}\"", header);

    // parse the spec
    const BlockStyle_e style = header.begins_with('>') ? BLOCK_FOLD : BLOCK_LITERAL;
    BlockChomp_e chomp = CHOMP_CLIP; // clip unless + or - are given
    size_t indentation = npos;       // stays npos until known
    if(header.len > 1)
    {
        _RYML_CB_ASSERT(m_stack.m_callbacks, header.begins_with_any("|>"));
        csubstr indicators = header.sub(1);
        _c4dbgpf("scanning block: spec is multichar: '{}'", indicators);
        _RYML_CB_ASSERT(m_stack.m_callbacks, indicators.len >= 1);
        const size_t chomp_pos = indicators.first_of("-+");
        _c4dbgpf("scanning block: spec chomp char at {}", chomp_pos);
        if(chomp_pos != npos)
        {
            switch(indicators[chomp_pos])
            {
            case '-':
                chomp = CHOMP_STRIP;
                break;
            case '+':
                chomp = CHOMP_KEEP;
                break;
            default:
                break;
            }
            // keep whichever side of the chomp char may hold the digits
            indicators = (chomp_pos == 0) ? indicators.sub(1) : indicators.first(chomp_pos);
        }
        // from here to the end, only digits are considered
        const csubstr digits = indicators.left_of(indicators.first_not_of("0123456789"));
        if( ! digits.empty())
        {
            if( ! c4::atou(digits, &indentation))
                _c4err("parse error: could not read decimal");
            _c4dbgpf("scanning block: indentation specified: {}. add {} from curr state -> {}", indentation, m_state->indref, indentation+m_state->indref);
            indentation += m_state->indref;
        }
    }

    // finish the current line
    _line_progressed(header.len);
    _line_ended();
    _scan_line();

    _c4dbgpf("scanning block: style={} chomp={} indentation={}", style==BLOCK_FOLD ? "fold" : "literal", chomp==CHOMP_CLIP ? "clip" : (chomp==CHOMP_STRIP ? "strip" : "keep"), indentation);

    // start with a zero-length block, already pointing at the right place
    substr raw(m_buf.data() + m_state->pos.offset, size_t(0));
    _RYML_CB_ASSERT(m_stack.m_callbacks, raw.begin() == m_state->line_contents.full.begin());

    // read every full line into the raw block,
    // from which newlines are to be stripped as needed.
    size_t num_lines = 0;
    size_t first_line = m_state->pos.line;
    size_t provisional_indentation = npos;
    LineContents peek;
    while( ! _finished_file())
    {
        // peek next line, but do not advance immediately
        peek.reset_with_next_line(m_buf, m_state->pos.offset);
        _c4dbgpf("scanning block: peeking at '{}'", peek.stripped);
        // evaluate termination conditions
        if(indentation != npos)
        {
            // stop when the line is deindented and not empty
            const bool deindented = peek.indentation < indentation
                && ( ! peek.rem.trim(" \t\r\n").empty());
            if(deindented)
            {
                if(raw.len)
                {
                    _c4dbgpf("scanning block: indentation decreased ref={} thisline={}", indentation, peek.indentation);
                }
                else
                {
                    _c4err("indentation decreased without any scalar");
                }
                break;
            }
            if(indentation == 0)
            {
                // at zero indentation a document marker ends the block
                const bool doc_marker =
                    (peek.rem == "..." || peek.rem.begins_with("... "))
                    ||
                    (peek.rem == "---" || peek.rem.begins_with("--- "));
                if(doc_marker)
                {
                    _c4dbgp("scanning block: stop. indentation=0 and stream ended");
                    break;
                }
            }
        }
        else
        {
            _c4dbgpf("scanning block: indentation ref not set. firstnonws={}", peek.stripped.first_not_of(' '));
            if(peek.stripped.first_not_of(' ') != npos) // non-empty line
            {
                _c4dbgpf("scanning block: line not empty. indref={} indprov={} indentation={}", m_state->indref, provisional_indentation, peek.indentation);
                if(provisional_indentation != npos)
                {
                    // empty lines came first: this line must not be
                    // shallower than what they established
                    if(peek.indentation < provisional_indentation)
                        break;
                    _c4dbgpf("scanning block: set indentation ref from provisional indentation: provisional_ref={}, thisline={}", provisional_indentation, peek.indentation);
                    indentation = peek.indentation;
                }
                else
                {
                    if(peek.indentation < m_state->indref)
                    {
                        _c4dbgpf("scanning block: block terminated indentation={} < indref={}", peek.indentation, m_state->indref);
                        if(raw.len == 0)
                        {
                            _c4dbgp("scanning block: was empty, undo next line");
                            _line_ended_undo();
                        }
                        break;
                    }
                    if(peek.indentation == m_state->indref && has_any(RSEQ|RMAP))
                    {
                        _c4dbgpf("scanning block: block terminated. reading container and indentation={}==indref={}", peek.indentation, m_state->indref);
                        break;
                    }
                    _c4dbgpf("scanning block: set indentation ref from this line: ref={}", peek.indentation);
                    indentation = peek.indentation;
                }
            }
            else // empty line
            {
                _c4dbgpf("scanning block: line empty or {} spaces. line_indentation={} prov_indentation={}", peek.stripped.len, peek.indentation, provisional_indentation);
                if(provisional_indentation != npos)
                {
                    if(peek.stripped.len >= provisional_indentation)
                    {
                        _c4dbgpf("scanning block: increase provisional_ref {} -> {}", provisional_indentation, peek.stripped.len);
                        provisional_indentation = peek.stripped.len;
                    }
#ifdef RYML_NO_COVERAGE__TO_BE_DELETED
                    else if(peek.indentation >= provisional_indentation && peek.indentation != npos)
                    {
                        _c4dbgpf("scanning block: increase provisional_ref {} -> {}", provisional_indentation, peek.indentation);
                        provisional_indentation = peek.indentation;
                    }
#endif
                }
                else
                {
                    provisional_indentation = peek.indentation ? peek.indentation : has_any(RSEQ|RVAL);
                    _c4dbgpf("scanning block: initialize provisional_ref={}", provisional_indentation);
                    if(provisional_indentation == npos)
                    {
                        provisional_indentation = peek.stripped.len ? peek.stripped.len : has_any(RSEQ|RVAL);
                        _c4dbgpf("scanning block: initialize provisional_ref={}", provisional_indentation);
                    }
                }
            }
        }
        // advance now that we know the block continues
        m_state->line_contents = peek;
        _c4dbgpf("scanning block: append '{}'", m_state->line_contents.rem);
        raw.len += m_state->line_contents.full.len;
        _line_progressed(m_state->line_contents.rem.len);
        _line_ended();
        ++num_lines;
    }
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_state->pos.line == (first_line + num_lines) || (raw.len == 0));
    C4_UNUSED(num_lines);
    C4_UNUSED(first_line);

    if(indentation == npos)
    {
        _c4dbgpf("scanning block: set indentation from provisional: {}", provisional_indentation);
        indentation = provisional_indentation;
    }

    if(num_lines)
        _line_ended_undo();

    _c4dbgpf("scanning block: raw=~~~{}~~~", raw);

    // ok! now we strip the newlines and spaces according to the specs
    const csubstr filtered = _filter_block_scalar(raw, style, chomp, indentation);

    _c4dbgpf("scanning block: final=~~~{}~~~", filtered);

    return filtered;
}
4516
4517
4518
//-----------------------------------------------------------------------------
4519
4520
template<bool backslash_is_escape, bool keep_trailing_whitespace>
4521
bool Parser::_filter_nl(substr r, size_t *C4_RESTRICT i, size_t *C4_RESTRICT pos, size_t indentation)
4522
{
4523
// a debugging scaffold:
4524
#if 0
4525
#define _c4dbgfnl(fmt, ...) _c4dbgpf("filter_nl[{}]: " fmt, *i, __VA_ARGS__)
4526
#else
4527
#define _c4dbgfnl(...)
4528
#endif
4529
4530
const char curr = r[*i];
4531
bool replaced = false;
4532
4533
_RYML_CB_ASSERT(m_stack.m_callbacks, indentation != npos);
4534
_RYML_CB_ASSERT(m_stack.m_callbacks, curr == '\n');
4535
4536
_c4dbgfnl("found newline. sofar=[{}]~~~{}~~~", *pos, m_filter_arena.first(*pos));
4537
size_t ii = *i;
4538
size_t numnl_following = count_following_newlines(r, &ii, indentation);
4539
if(numnl_following)
4540
{
4541
_c4dbgfnl("{} consecutive (empty) lines {} in the middle. totalws={}", 1+numnl_following, ii < r.len ? "in the middle" : "at the end", ii - *i);
4542
for(size_t j = 0; j < numnl_following; ++j)
4543
m_filter_arena.str[(*pos)++] = '\n';
4544
}
4545
else
4546
{
4547
if(r.first_not_of(" \t", *i+1) != npos)
4548
{
4549
m_filter_arena.str[(*pos)++] = ' ';
4550
_c4dbgfnl("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, r.len, *pos, m_filter_arena.first(*pos));
4551
replaced = true;
4552
}
4553
else
4554
{
4555
if C4_IF_CONSTEXPR (keep_trailing_whitespace)
4556
{
4557
m_filter_arena.str[(*pos)++] = ' ';
4558
_c4dbgfnl("single newline. convert to space. ii={}/{}. sofar=[{}]~~~{}~~~", ii, r.len, *pos, m_filter_arena.first(*pos));
4559
replaced = true;
4560
}
4561
else
4562
{
4563
_c4dbgfnl("last newline, everything else is whitespace. ii={}/{}", ii, r.len);
4564
*i = r.len;
4565
}
4566
}
4567
if C4_IF_CONSTEXPR (backslash_is_escape)
4568
{
4569
if(ii < r.len && r.str[ii] == '\\')
4570
{
4571
const char next = ii+1 < r.len ? r.str[ii+1] : '\0';
4572
if(next == ' ' || next == '\t')
4573
{
4574
_c4dbgfnl("extend skip to backslash{}", "");
4575
++ii;
4576
}
4577
}
4578
}
4579
}
4580
*i = ii - 1; // correct for the loop increment
4581
4582
#undef _c4dbgfnl
4583
4584
return replaced;
4585
}
4586
4587
4588
//-----------------------------------------------------------------------------
4589
4590
template<bool keep_trailing_whitespace>
4591
void Parser::_filter_ws(substr r, size_t *C4_RESTRICT i, size_t *C4_RESTRICT pos)
4592
{
4593
// a debugging scaffold:
4594
#if 0
4595
#define _c4dbgfws(fmt, ...) _c4dbgpf("filt_nl[{}]: " fmt, *i, __VA_ARGS__)
4596
#else
4597
#define _c4dbgfws(...)
4598
#endif
4599
4600
const char curr = r[*i];
4601
_c4dbgfws("found whitespace '{}'", _c4prc(curr));
4602
_RYML_CB_ASSERT(m_stack.m_callbacks, curr == ' ' || curr == '\t');
4603
4604
size_t first = *i > 0 ? r.first_not_of(" \t", *i) : r.first_not_of(' ', *i);
4605
if(first != npos)
4606
{
4607
if(r[first] == '\n' || r[first] == '\r') // skip trailing whitespace
4608
{
4609
_c4dbgfws("whitespace is trailing on line. firstnonws='{}'@{}", _c4prc(r[first]), first);
4610
*i = first - 1; // correct for the loop increment
4611
}
4612
else // a legit whitespace
4613
{
4614
m_filter_arena.str[(*pos)++] = curr;
4615
_c4dbgfws("legit whitespace. sofar=[{}]~~~{}~~~", *pos, m_filter_arena.first(*pos));
4616
}
4617
}
4618
else
4619
{
4620
_c4dbgfws("... everything else is trailing whitespace{}", "");
4621
if C4_IF_CONSTEXPR (keep_trailing_whitespace)
4622
for(size_t j = *i; j < r.len; ++j)
4623
m_filter_arena.str[(*pos)++] = r[j];
4624
*i = r.len;
4625
}
4626
4627
#undef _c4dbgfws
4628
}
4629
4630
4631
//-----------------------------------------------------------------------------
4632
/** Filter a plain (unquoted) scalar.
 *
 * Leading spaces/tabs are trimmed first. The remainder is then walked
 * character by character into m_filter_arena: whitespace goes through
 * _filter_ws() (trailing whitespace dropped), newlines through
 * _filter_nl(), '\r' is skipped, and everything else is copied.
 *
 * @param s the raw scalar
 * @param indentation forwarded to _filter_nl()
 * @return the trimmed input when nothing changed; otherwise the result
 *         of _finish_filter_arena() */
csubstr Parser::_filter_plain_scalar(substr s, size_t indentation)
{
    // a debugging scaffold:
    #if 0
    #define _c4dbgfps(...) _c4dbgpf("filt_plain_scalar" __VA_ARGS__)
    #else
    #define _c4dbgfps(...)
    #endif

    _c4dbgfps("before=~~~{}~~~", s);

    substr r = s.triml(" \t");
    _grow_filter_arena(r.len);
    size_t pos = 0; // the filtered size
    bool filtered_chars = false;
    for(size_t i = 0; i < r.len; ++i)
    {
        const char curr = r.str[i];
        _c4dbgfps("[{}]: '{}'", i, _c4prc(curr));
        switch(curr)
        {
        case ' ':
        case '\t':
            _filter_ws</*keep_trailing_ws*/false>(r, &i, &pos);
            break;
        case '\n':
            // the flag is sticky: a later newline that is not replaced
            // must not clear what an earlier one set
            if(_filter_nl</*backslash_is_escape*/false, /*keep_trailing_ws*/false>(r, &i, &pos, indentation))
                filtered_chars = true;
            break;
        case '\r': // skip \r --- https://stackoverflow.com/questions/1885900
            break;
        default:
            m_filter_arena.str[pos++] = curr;
            break;
        }
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
    // same length does not imply same content, hence the flag
    if(pos < r.len || filtered_chars)
    {
        r = _finish_filter_arena(r, pos);
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len);
    _c4dbgfps("#filteredchars={} after=~~~{}~~~", s.len - r.len, r);

    #undef _c4dbgfps
    return r;
}
4681
4682
4683
//-----------------------------------------------------------------------------
4684
/** Filter the contents of a single-quoted scalar.
 *
 * The input is walked character by character into m_filter_arena:
 * whitespace goes through _filter_ws() (trailing whitespace kept),
 * newlines through _filter_nl() with indentation 0, '\r' is skipped,
 * two consecutive quotes collapse into one, a quote not followed by
 * another quote is dropped, and everything else is copied.
 *
 * @param s the raw text between the quotes
 * @return the input when nothing changed; otherwise the result of
 *         _finish_filter_arena() */
csubstr Parser::_filter_squot_scalar(substr s)
{
    // a debugging scaffold:
    #if 0
    #define _c4dbgfsq(...) _c4dbgpf("filt_squo_scalar" __VA_ARGS__)
    #else
    #define _c4dbgfsq(...)
    #endif

    // from the YAML spec for single-quoted scalars:
    // https://yaml.org/spec/1.2-old/spec.html#style/flow/single-quoted

    _c4dbgfsq(": before=~~~{}~~~", s);

    _grow_filter_arena(s.len);
    substr r = s;
    size_t pos = 0; // the filtered size
    bool filtered_chars = false;
    for(size_t i = 0; i < r.len; ++i)
    {
        const char curr = r[i];
        _c4dbgfsq("[{}]: '{}'", i, _c4prc(curr));
        switch(curr)
        {
        case ' ':
        case '\t':
            _filter_ws</*keep_trailing_ws*/true>(r, &i, &pos);
            break;
        case '\n':
            // the flag is sticky: a later newline that is not replaced
            // must not clear what an earlier newline or quote set
            if(_filter_nl</*backslash_is_escape*/false, /*keep_trailing_ws*/true>(r, &i, &pos, /*indentation*/0))
                filtered_chars = true;
            break;
        case '\r': // skip \r --- https://stackoverflow.com/questions/1885900
            break;
        case '\'':
        {
            const char next = i+1 < r.len ? r[i+1] : '\0';
            if(next == '\'')
            {
                _c4dbgfsq("[{}]: two consecutive quotes", i);
                filtered_chars = true;
                m_filter_arena.str[pos++] = '\'';
                ++i; // step over the second quote
            }
            break;
        }
        default:
            m_filter_arena.str[pos++] = curr;
            break;
        }
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
    // same length does not imply same content, hence the flag
    if(pos < r.len || filtered_chars)
    {
        r = _finish_filter_arena(r, pos);
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len);
    _c4dbgpf(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r);

    #undef _c4dbgfsq
    return r;
}
4747
4748
4749
//-----------------------------------------------------------------------------
4750
/** Filter the contents of a double-quoted scalar.
 *
 * The input is walked character by character into m_filter_arena:
 * whitespace goes through _filter_ws() (trailing whitespace kept),
 * newlines through _filter_nl() with indentation 0 and backslash
 * handling enabled, '\r' is skipped, and backslash escapes are decoded
 * (see https://yaml.org/spec/1.2.2/#rule-c-ns-esc-char). A backslash
 * before a character that is not handled below is dropped and the
 * character is then processed normally.
 *
 * The "something changed" flag is sticky. It used to be overwritten by
 * every _filter_nl() result, so a newline that was not replaced by a
 * space cleared the flag set by an earlier escape; when the output
 * happened to be as long as the input, the filtered text was discarded.
 *
 * NOTE(review): \L and \P write 3 bytes for 2 input characters, so pos
 * can exceed r.len. How _finish_filter_arena() copes with that is not
 * visible from this function -- confirm.
 *
 * @param s the raw text between the quotes
 * @return the input when nothing changed; otherwise the result of
 *         _finish_filter_arena()
 *
 * Reports through _c4err() on truncated or malformed \x, \u, \U. */
csubstr Parser::_filter_dquot_scalar(substr s)
{
    // a debugging scaffold:
    #if 0
    #define _c4dbgfdq(...) _c4dbgpf("filt_dquo_scalar" __VA_ARGS__)
    #else
    #define _c4dbgfdq(...)
    #endif

    _c4dbgfdq(": before=~~~{}~~~", s);

    // from the YAML spec for double-quoted scalars:
    // https://yaml.org/spec/1.2-old/spec.html#style/flow/double-quoted
    //
    // All leading and trailing white space characters are excluded
    // from the content. Each continuation line must therefore contain
    // at least one non-space character. Empty lines, if any, are
    // consumed as part of the line folding.

    _grow_filter_arena(s.len + 2u * s.count('\\'));
    substr r = s;
    size_t pos = 0; // the filtered size
    bool filtered_chars = false;
    for(size_t i = 0; i < r.len; ++i)
    {
        const char curr = r[i];
        _c4dbgfdq("[{}]: '{}'", i, _c4prc(curr));
        if(curr == ' ' || curr == '\t')
        {
            _filter_ws</*keep_trailing_ws*/true>(r, &i, &pos);
        }
        else if(curr == '\n')
        {
            // accumulate: never clear a flag set by an earlier escape
            if(_filter_nl</*backslash_is_escape*/true, /*keep_trailing_ws*/true>(r, &i, &pos, /*indentation*/0))
                filtered_chars = true;
        }
        else if(curr == '\r') // skip \r --- https://stackoverflow.com/questions/1885900
        {
            ;
        }
        else if(curr == '\\')
        {
            char next = i+1 < r.len ? r[i+1] : '\0';
            _c4dbgfdq("[{}]: backslash, next='{}'", i, _c4prc(next));
            filtered_chars = true;
            if(next == '\r' && i+2 < r.len && r[i+2] == '\n')
            {
                ++i; // newline escaped with \ -- skip both (add only one as i is loop-incremented)
                next = '\n';
                _c4dbgfdq("[{}]: was \\r\\n, now next='\\n'", i);
            }
            // remember the loop will also increment i
            switch(next)
            {
            case '\n':
            {
                // escaped line break: skip it and the next line's leading whitespace
                size_t ii = i + 2;
                while(ii < r.len && (r.str[ii] == ' ' || r.str[ii] == '\t'))
                    ++ii;
                i = ii - 1;
                break;
            }
            case '"':  // escapes for json compatibility
            case '/':
            case ' ':
            case '\t':
                m_filter_arena.str[pos++] = next;
                ++i;
                break;
            case '\r': // a CR not followed by LF: only the backslash is dropped
                break;
            case 'n':
                m_filter_arena.str[pos++] = '\n';
                ++i;
                break;
            case 'r':
                m_filter_arena.str[pos++] = '\r';
                ++i;
                break;
            case 't':
                m_filter_arena.str[pos++] = '\t';
                ++i;
                break;
            case '\\':
                m_filter_arena.str[pos++] = '\\';
                ++i;
                break;
            case 'x': // UTF8
            {
                if(i + 1u + 2u >= r.len)
                    _c4err("\\x requires 2 hex digits");
                uint8_t byteval = {};
                if(!read_hex(r.sub(i + 2u, 2u), &byteval))
                    _c4err("failed to read \\x codepoint");
                m_filter_arena.str[pos++] = *(char*)&byteval;
                i += 1u + 2u; // the 'x' and its two digits
                break;
            }
            case 'u': // UTF16
            {
                if(i + 1u + 4u >= r.len)
                    _c4err("\\u requires 4 hex digits");
                char readbuf[8];
                csubstr codepoint = r.sub(i + 2u, 4u);
                uint32_t codepoint_val = {};
                if(!read_hex(codepoint, &codepoint_val))
                    _c4err("failed to parse \\u codepoint");
                size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val);
                C4_ASSERT(numbytes <= 4);
                memcpy(m_filter_arena.str + pos, readbuf, numbytes);
                pos += numbytes;
                i += 1u + 4u; // the 'u' and its four digits
                break;
            }
            case 'U': // UTF32
            {
                if(i + 1u + 8u >= r.len)
                    _c4err("\\U requires 8 hex digits");
                char readbuf[8];
                csubstr codepoint = r.sub(i + 2u, 8u);
                uint32_t codepoint_val = {};
                if(!read_hex(codepoint, &codepoint_val))
                    _c4err("failed to parse \\U codepoint");
                size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val);
                C4_ASSERT(numbytes <= 4);
                memcpy(m_filter_arena.str + pos, readbuf, numbytes);
                pos += numbytes;
                i += 1u + 8u; // the 'U' and its eight digits
                break;
            }
            // https://yaml.org/spec/1.2.2/#rule-c-ns-esc-char
            case '0':
                m_filter_arena.str[pos++] = '\0';
                ++i;
                break;
            case 'b': // backspace
                m_filter_arena.str[pos++] = '\b';
                ++i;
                break;
            case 'f': // form feed
                m_filter_arena.str[pos++] = '\f';
                ++i;
                break;
            case 'a': // bell character
                m_filter_arena.str[pos++] = '\a';
                ++i;
                break;
            case 'v': // vertical tab
                m_filter_arena.str[pos++] = '\v';
                ++i;
                break;
            case 'e': // escape character
                m_filter_arena.str[pos++] = '\x1b';
                ++i;
                break;
            case '_': // unicode non breaking space \u00a0
                // https://www.compart.com/en/unicode/U+00a0
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x3e, 0xc2);
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x60, 0xa0);
                ++i;
                break;
            case 'N': // unicode next line \u0085
                // https://www.compart.com/en/unicode/U+0085
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x3e, 0xc2);
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x7b, 0x85);
                ++i;
                break;
            case 'L': // unicode line separator \u2028
                // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x1e, 0xe2);
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x80, 0x80);
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x58, 0xa8);
                ++i;
                break;
            case 'P': // unicode paragraph separator \u2029
                // https://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=1024&names=-&utf8=0x&unicodeinhtml=hex
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x1e, 0xe2);
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x80, 0x80);
                m_filter_arena.str[pos++] = _RYML_CHCONST(-0x57, 0xa9);
                ++i;
                break;
            default: // not handled: drop the backslash, keep the next char for the loop
                break;
            }
            _c4dbgfdq("[{}]: backslash...sofar=[{}]~~~{}~~~", i, pos, m_filter_arena.first(pos));
        }
        else
        {
            m_filter_arena.str[pos++] = curr;
        }
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
    // same length does not imply same content, hence the flag
    if(pos < r.len || filtered_chars)
    {
        r = _finish_filter_arena(r, pos);
    }

    _RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= r.len);
    _c4dbgpf(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r);

    #undef _c4dbgfdq

    return r;
}
4967
4968
4969
//-----------------------------------------------------------------------------
4970
/** Apply a block-scalar chomping mode to the first *pos characters of buf.
 *
 *  - CHOMP_KEEP: *pos is left unchanged; returns true when the text has
 *    no trailing newline (no newline is actually written).
 *  - CHOMP_CLIP: leaves exactly one trailing newline. When there is none,
 *    one is appended and true is returned; otherwise *pos is cut back to
 *    just after the first trailing newline.
 *  - CHOMP_STRIP: *pos is cut back to exclude every trailing newline.
 *  - anything else: reported through _c4err().
 *
 * The appended newline is written to m_filter_arena rather than through
 * buf; the caller visible in this file passes m_filter_arena as buf.
 *
 * @param buf the filtered text so far
 * @param pos in/out: length of the filtered text
 * @param chomp the chomping mode
 * @return true in the two "no trailing newline" cases described above */
bool Parser::_apply_chomp(substr buf, size_t *C4_RESTRICT pos, BlockChomp_e chomp)
{
    // length of the text once all of its trailing newlines are removed
    const size_t content_len = buf.first(*pos).trimr('\n').len;
    const bool lacks_newline = (content_len == *pos);

    if(chomp == CHOMP_KEEP)
    {
        if(lacks_newline)
        {
            _c4dbgpf("chomp=KEEP: add missing newline @{}", *pos);
            //m_filter_arena.str[(*pos)++] = '\n';
            return true;
        }
        return false;
    }

    if(chomp == CHOMP_CLIP)
    {
        if(lacks_newline)
        {
            _c4dbgpf("chomp=CLIP: add missing newline @{}", *pos);
            m_filter_arena.str[(*pos)++] = '\n';
            return true;
        }
        _c4dbgpf("chomp=CLIP: include single trailing newline @{}", content_len+1);
        *pos = content_len + 1;
        return false;
    }

    if(chomp == CHOMP_STRIP)
    {
        _c4dbgpf("chomp=STRIP: strip {}-{}-{} newlines", *pos, content_len, *pos-content_len);
        *pos = content_len;
        return false;
    }

    _c4err("unknown chomp style");
    return false;
}
5006
5007
5008
//-----------------------------------------------------------------------------
5009
csubstr Parser::_filter_block_scalar(substr s, BlockStyle_e style, BlockChomp_e chomp, size_t indentation)
5010
{
5011
// a debugging scaffold:
5012
#if 0
5013
#define _c4dbgfbl(fmt, ...) _c4dbgpf("filt_block" fmt, __VA_ARGS__)
5014
#else
5015
#define _c4dbgfbl(...)
5016
#endif
5017
5018
_c4dbgfbl(": indentation={} before=[{}]~~~{}~~~", indentation, s.len, s);
5019
5020
if(chomp != CHOMP_KEEP && s.trim(" \n\r").len == 0u)
5021
{
5022
_c4dbgp("filt_block: empty scalar");
5023
return s.first(0);
5024
}
5025
5026
substr r = s;
5027
5028
switch(style)
5029
{
5030
case BLOCK_LITERAL:
5031
{
5032
_c4dbgp("filt_block: style=literal");
5033
// trim leading whitespace up to indentation
5034
{
5035
size_t numws = r.first_not_of(' ');
5036
if(numws != npos)
5037
{
5038
if(numws > indentation)
5039
r = r.sub(indentation);
5040
else
5041
r = r.sub(numws);
5042
_c4dbgfbl(": after triml=[{}]~~~{}~~~", r.len, r);
5043
}
5044
else
5045
{
5046
if(chomp != CHOMP_KEEP || r.len == 0)
5047
{
5048
_c4dbgfbl(": all spaces {}, return empty", r.len);
5049
return r.first(0);
5050
}
5051
else
5052
{
5053
r[0] = '\n';
5054
return r.first(1);
5055
}
5056
}
5057
}
5058
_grow_filter_arena(s.len + 2u); // use s.len! because we may need to add a newline at the end, so the leading indentation will allow space for that newline
5059
size_t pos = 0; // the filtered size
5060
for(size_t i = 0; i < r.len; ++i)
5061
{
5062
const char curr = r.str[i];
5063
_c4dbgfbl("[{}]='{}' pos={}", i, _c4prc(curr), pos);
5064
if(curr == '\r')
5065
continue;
5066
m_filter_arena.str[pos++] = curr;
5067
if(curr == '\n')
5068
{
5069
_c4dbgfbl("[{}]: found newline", i);
5070
// skip indentation on the next line
5071
csubstr rem = r.sub(i+1);
5072
size_t first = rem.first_not_of(' ');
5073
if(first != npos)
5074
{
5075
_RYML_CB_ASSERT(m_stack.m_callbacks, first < rem.len);
5076
_RYML_CB_ASSERT(m_stack.m_callbacks, i+1+first < r.len);
5077
_c4dbgfbl("[{}]: {} spaces follow before next nonws character @ [{}]='{}'", i, first, i+1+first, rem.str[first]);
5078
if(first < indentation)
5079
{
5080
_c4dbgfbl("[{}]: skip {}<{} spaces from indentation", i, first, indentation);
5081
i += first;
5082
}
5083
else
5084
{
5085
_c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
5086
i += indentation;
5087
}
5088
}
5089
else
5090
{
5091
_RYML_CB_ASSERT(m_stack.m_callbacks, i+1 <= r.len);
5092
first = rem.len;
5093
_c4dbgfbl("[{}]: {} spaces to the end", i, first);
5094
if(first)
5095
{
5096
if(first < indentation)
5097
{
5098
_c4dbgfbl("[{}]: skip everything", i);
5099
--pos;
5100
break;
5101
}
5102
else
5103
{
5104
_c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
5105
i += indentation;
5106
}
5107
}
5108
else if(i+1 == r.len)
5109
{
5110
if(chomp == CHOMP_STRIP)
5111
--pos;
5112
break;
5113
}
5114
}
5115
}
5116
}
5117
_RYML_CB_ASSERT(m_stack.m_callbacks, s.len >= pos);
5118
_c4dbgfbl(": #filteredchars={} after=~~~{}~~~", s.len - r.len, r);
5119
bool changed = _apply_chomp(m_filter_arena, &pos, chomp);
5120
_RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
5121
_RYML_CB_ASSERT(m_stack.m_callbacks, pos <= s.len);
5122
if(pos < r.len || changed)
5123
{
5124
r = _finish_filter_arena(s, pos); // write into s
5125
}
5126
break;
5127
}
5128
case BLOCK_FOLD:
5129
{
5130
_c4dbgp("filt_block: style=fold");
5131
_grow_filter_arena(r.len + 2);
5132
size_t pos = 0; // the filtered size
5133
bool filtered_chars = false;
5134
bool started = false;
5135
bool is_indented = false;
5136
size_t i = r.first_not_of(' ');
5137
_c4dbgfbl(": first non space at {}", i);
5138
if(i > indentation)
5139
{
5140
is_indented = true;
5141
i = indentation;
5142
}
5143
_c4dbgfbl(": start folding at {}, is_indented={}", i, (int)is_indented);
5144
auto on_change_indentation = [&](size_t numnl_following, size_t last_newl, size_t first_non_whitespace){
5145
_c4dbgfbl("[{}]: add 1+{} newlines", i, numnl_following);
5146
for(size_t j = 0; j < 1 + numnl_following; ++j)
5147
m_filter_arena.str[pos++] = '\n';
5148
for(i = last_newl + 1 + indentation; i < first_non_whitespace; ++i)
5149
{
5150
if(r.str[i] == '\r')
5151
continue;
5152
_c4dbgfbl("[{}]: add '{}'", i, _c4prc(r.str[i]));
5153
m_filter_arena.str[pos++] = r.str[i];
5154
}
5155
--i;
5156
};
5157
for( ; i < r.len; ++i)
5158
{
5159
const char curr = r.str[i];
5160
_c4dbgfbl("[{}]='{}'", i, _c4prc(curr));
5161
if(curr == '\n')
5162
{
5163
filtered_chars = true;
5164
// skip indentation on the next line, and advance over the next non-indented blank lines as well
5165
size_t first_non_whitespace;
5166
size_t numnl_following = (size_t)-1;
5167
while(r[i] == '\n')
5168
{
5169
++numnl_following;
5170
csubstr rem = r.sub(i+1);
5171
size_t first = rem.first_not_of(' ');
5172
_c4dbgfbl("[{}]: found newline. first={} rem.len={}", i, first, rem.len);
5173
if(first != npos)
5174
{
5175
first_non_whitespace = first + i+1;
5176
while(first_non_whitespace < r.len && r[first_non_whitespace] == '\r')
5177
++first_non_whitespace;
5178
_RYML_CB_ASSERT(m_stack.m_callbacks, first < rem.len);
5179
_RYML_CB_ASSERT(m_stack.m_callbacks, i+1+first < r.len);
5180
_c4dbgfbl("[{}]: {} spaces follow before next nonws character @ [{}]='{}'", i, first, i+1+first, _c4prc(rem.str[first]));
5181
if(first < indentation)
5182
{
5183
_c4dbgfbl("[{}]: skip {}<{} spaces from indentation", i, first, indentation);
5184
i += first;
5185
}
5186
else
5187
{
5188
_c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
5189
i += indentation;
5190
if(first > indentation)
5191
{
5192
_c4dbgfbl("[{}]: {} further indented than {}, stop newlining", i, first, indentation);
5193
goto finished_counting_newlines;
5194
}
5195
}
5196
// prepare the next while loop iteration
5197
// by setting i at the next newline after
5198
// an empty line
5199
if(r[first_non_whitespace] == '\n')
5200
i = first_non_whitespace;
5201
else
5202
goto finished_counting_newlines;
5203
}
5204
else
5205
{
5206
_RYML_CB_ASSERT(m_stack.m_callbacks, i+1 <= r.len);
5207
first = rem.len;
5208
first_non_whitespace = first + i+1;
5209
if(first)
5210
{
5211
_c4dbgfbl("[{}]: {} spaces to the end", i, first);
5212
if(first < indentation)
5213
{
5214
_c4dbgfbl("[{}]: skip everything", i);
5215
i += first;
5216
}
5217
else
5218
{
5219
_c4dbgfbl("[{}]: skip {} spaces from indentation", i, indentation);
5220
i += indentation;
5221
if(first > indentation)
5222
{
5223
_c4dbgfbl("[{}]: {} spaces missing. not done yet", i, indentation - first);
5224
goto finished_counting_newlines;
5225
}
5226
}
5227
}
5228
else // if(i+1 == r.len)
5229
{
5230
_c4dbgfbl("[{}]: it's the final newline", i);
5231
_RYML_CB_ASSERT(m_stack.m_callbacks, i+1 == r.len);
5232
_RYML_CB_ASSERT(m_stack.m_callbacks, rem.len == 0);
5233
}
5234
goto end_of_scalar;
5235
}
5236
}
5237
end_of_scalar:
5238
// Write all the trailing newlines. Since we're
5239
// at the end no folding is needed, so write every
5240
// newline (add 1).
5241
_c4dbgfbl("[{}]: add {} trailing newlines", i, 1+numnl_following);
5242
for(size_t j = 0; j < 1 + numnl_following; ++j)
5243
m_filter_arena.str[pos++] = '\n';
5244
break;
5245
finished_counting_newlines:
5246
_c4dbgfbl("[{}]: #newlines={} firstnonws={}", i, numnl_following, first_non_whitespace);
5247
while(first_non_whitespace < r.len && r[first_non_whitespace] == '\t')
5248
++first_non_whitespace;
5249
_c4dbgfbl("[{}]: #newlines={} firstnonws={}", i, numnl_following, first_non_whitespace);
5250
_RYML_CB_ASSERT(m_stack.m_callbacks, first_non_whitespace <= r.len);
5251
size_t last_newl = r.last_of('\n', first_non_whitespace);
5252
size_t this_indentation = first_non_whitespace - last_newl - 1;
5253
_c4dbgfbl("[{}]: #newlines={} firstnonws={} lastnewl={} this_indentation={} vs indentation={}", i, numnl_following, first_non_whitespace, last_newl, this_indentation, indentation);
5254
_RYML_CB_ASSERT(m_stack.m_callbacks, first_non_whitespace >= last_newl + 1);
5255
_RYML_CB_ASSERT(m_stack.m_callbacks, this_indentation >= indentation);
5256
if(!started)
5257
{
5258
_c4dbgfbl("[{}]: #newlines={}. write all leading newlines", i, numnl_following);
5259
for(size_t j = 0; j < 1 + numnl_following; ++j)
5260
m_filter_arena.str[pos++] = '\n';
5261
if(this_indentation > indentation)
5262
{
5263
is_indented = true;
5264
_c4dbgfbl("[{}]: advance ->{}", i, last_newl + indentation);
5265
i = last_newl + indentation;
5266
}
5267
else
5268
{
5269
i = first_non_whitespace - 1;
5270
_c4dbgfbl("[{}]: advance ->{}", i, first_non_whitespace);
5271
}
5272
}
5273
else if(this_indentation == indentation)
5274
{
5275
_c4dbgfbl("[{}]: same indentation", i);
5276
if(!is_indented)
5277
{
5278
if(numnl_following == 0)
5279
{
5280
_c4dbgfbl("[{}]: fold!", i);
5281
m_filter_arena.str[pos++] = ' ';
5282
}
5283
else
5284
{
5285
_c4dbgfbl("[{}]: add {} newlines", i, 1 + numnl_following);
5286
for(size_t j = 0; j < numnl_following; ++j)
5287
m_filter_arena.str[pos++] = '\n';
5288
}
5289
i = first_non_whitespace - 1;
5290
_c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace);
5291
}
5292
else
5293
{
5294
_c4dbgfbl("[{}]: back to ref indentation", i);
5295
is_indented = false;
5296
on_change_indentation(numnl_following, last_newl, first_non_whitespace);
5297
_c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace);
5298
}
5299
}
5300
else
5301
{
5302
_c4dbgfbl("[{}]: increased indentation.", i);
5303
is_indented = true;
5304
_RYML_CB_ASSERT(m_stack.m_callbacks, this_indentation > indentation);
5305
on_change_indentation(numnl_following, last_newl, first_non_whitespace);
5306
_c4dbgfbl("[{}]: advance {}->{}", i, i, first_non_whitespace);
5307
}
5308
}
5309
else if(curr != '\r')
5310
{
5311
if(curr != '\t')
5312
started = true;
5313
m_filter_arena.str[pos++] = curr;
5314
}
5315
}
5316
_RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
5317
_c4dbgfbl(": #filteredchars={} after=[{}]~~~{}~~~", (int)s.len - (int)pos, pos, m_filter_arena.first(pos));
5318
bool changed = _apply_chomp(m_filter_arena, &pos, chomp);
5319
if(pos < r.len || filtered_chars || changed)
5320
{
5321
r = _finish_filter_arena(s, pos); // write into s
5322
}
5323
}
5324
break;
5325
default:
5326
_c4err("unknown block style");
5327
}
5328
5329
_c4dbgfbl(": final=[{}]~~~{}~~~", r.len, r);
5330
5331
#undef _c4dbgfbl
5332
5333
return r;
5334
}
5335
5336
//-----------------------------------------------------------------------------
5337
/** Count the lines spanned by @p src: one more than the number of
 * '\n' characters it contains (so a string without newlines counts
 * as a single line). */
size_t Parser::_count_nlines(csubstr src)
{
    const size_t num_newlines = src.count('\n');
    return num_newlines + 1;
}
5341
5342
//-----------------------------------------------------------------------------
5343
/** Process a directive line.
 *
 * A `%TAG` directive is split into its handle and prefix tokens (both
 * delimited by spaces) and registered with the tree, together with
 * the id of the node from which it applies. A `%YAML` directive is
 * ignored, apart from a debug message. Any other directive is
 * silently skipped.
 *
 * @param directive_ the full text of the directive, starting at '%' */
void Parser::_handle_directive(csubstr directive_)
{
    if(directive_.begins_with("%TAG"))
    {
        TagDirective td;
        _c4dbgpf("%TAG directive: {}", directive_);
        // drop the "%TAG" token; it must be followed by at least one space
        csubstr rest = directive_.sub(4);
        if(!rest.begins_with(' '))
            _c4err("malformed tag directive: {}", directive_);
        rest = rest.triml(' ');
        // the handle runs up to the next space, which must exist
        const size_t handle_end = rest.find(' ');
        if(handle_end == npos)
            _c4err("malformed tag directive: {}", directive_);
        td.handle = rest.first(handle_end);
        // the prefix is the next space-delimited token; anything after it is dropped
        rest = rest.sub(td.handle.len).triml(' ');
        const size_t prefix_end = rest.find(' ');
        td.prefix = (prefix_end != npos) ? rest.first(prefix_end) : rest;
        // the directive applies from the next node to be created; skip
        // one more id when the last node is a typed root that is not a stream
        const size_t num_nodes = m_tree->size();
        td.next_node_id = num_nodes;
        if(num_nodes > 0)
        {
            const size_t prev = num_nodes - 1;
            if(m_tree->is_root(prev) && m_tree->type(prev) != NOTYPE && !m_tree->is_stream(prev))
                ++td.next_node_id;
        }
        _c4dbgpf("%TAG: handle={} prefix={} next_node={}", td.handle, td.prefix, td.next_node_id);
        m_tree->add_tag_directive(td);
    }
    else if(directive_.begins_with("%YAML"))
    {
        _c4dbgpf("%YAML directive! ignoring...: {}", directive_);
    }
}
5378
5379
//-----------------------------------------------------------------------------
5380
/** Overwrite the flags of state @p s with @p f. In debug builds
 * (RYML_DBG) the requested and the previous flags are logged. */
void Parser::set_flags(flag_t f, State * s)
{
#ifdef RYML_DBG
    char requested_[64], before_[64];
    csubstr requested = _prfl(requested_, f);
    csubstr before = _prfl(before_, s->flags);
    _c4dbgpf("state[{}]: setting flags to {}: before={}", s-m_stack.begin(), requested, before);
#endif
    s->flags = f;
}
5390
5391
/** Turn on the bits of @p on in the flags of state @p s, leaving the
 * other bits untouched. In debug builds (RYML_DBG) the added flags and
 * the flags before/after the change are logged. */
void Parser::add_flags(flag_t on, State * s)
{
#ifdef RYML_DBG
    char added_[64], before_[64], after_[64];
    csubstr added = _prfl(added_, on);
    csubstr before = _prfl(before_, s->flags);
    csubstr after = _prfl(after_, s->flags|on);
    _c4dbgpf("state[{}]: adding flags {}: before={} after={}", s-m_stack.begin(), added, before, after);
#endif
    s->flags |= on;
}
5402
5403
/** Turn on the bits of @p on and then turn off the bits of @p off in
 * the flags of state @p s. A bit present in both ends up cleared. In
 * debug builds (RYML_DBG) the change is logged. */
void Parser::addrem_flags(flag_t on, flag_t off, State * s)
{
#ifdef RYML_DBG
    char added_[64], removed_[64], before_[64], after_[64];
    csubstr added = _prfl(added_, on);
    csubstr removed = _prfl(removed_, off);
    csubstr before = _prfl(before_, s->flags);
    csubstr after = _prfl(after_, ((s->flags|on)&(~off)));
    _c4dbgpf("state[{}]: adding flags {} / removing flags {}: before={} after={}", s-m_stack.begin(), added, removed, before, after);
#endif
    // set first, clear second: removal wins for bits given in both masks
    s->flags = (s->flags | on) & ~off;
}
5416
5417
/** Turn off the bits of @p off in the flags of state @p s, leaving the
 * other bits untouched. In debug builds (RYML_DBG) the removed flags
 * and the flags before/after the change are logged. */
void Parser::rem_flags(flag_t off, State * s)
{
#ifdef RYML_DBG
    char removed_[64], before_[64], after_[64];
    csubstr removed = _prfl(removed_, off);
    csubstr before = _prfl(before_, s->flags);
    csubstr after = _prfl(after_, s->flags&(~off));
    _c4dbgpf("state[{}]: removing flags {}: before={} after={}", s-m_stack.begin(), removed, before, after);
#endif
    s->flags &= ~off;
}
5428
5429
//-----------------------------------------------------------------------------
5430
5431
/** Write the names of the flags set in @p flags into @p buf, separated
 * by '|', and return the written portion of @p buf.
 *
 * Characters are only stored while they fit in @p buf, but the running
 * length is always advanced; the final assertion therefore fails when
 * @p buf is too small for the full text. */
csubstr Parser::_prfl(substr buf, flag_t flags)
{
    size_t written = 0;
    bool need_separator = false;

    // append the name of flag `fl` when all of its bits are set in `flags`
    #define _prflag(fl) \
    if((flags & fl) == (fl)) \
    { \
        const csubstr flagname = #fl; \
        if(need_separator) \
        { \
            if(written + 1 < buf.len) \
                buf[written] = '|'; \
            ++written; \
        } \
        if(written + flagname.len <= buf.len) \
            memcpy(buf.str + written, flagname.str, flagname.len); \
        written += flagname.len; \
        need_separator = true; \
    }

    _prflag(RTOP);
    _prflag(RUNK);
    _prflag(RMAP);
    _prflag(RSEQ);
    _prflag(FLOW);
    _prflag(QMRK);
    _prflag(RKEY);
    _prflag(RVAL);
    _prflag(RNXT);
    _prflag(SSCL);
    _prflag(QSCL);
    _prflag(RSET);
    _prflag(NDOC);
    _prflag(RSEQIMAP);

    #undef _prflag

    RYML_ASSERT(written <= buf.len);

    return buf.first(written);
}
5473
5474
5475
//-----------------------------------------------------------------------------
5476
//-----------------------------------------------------------------------------
5477
//-----------------------------------------------------------------------------
5478
5479
void Parser::_grow_filter_arena(size_t num_characters_needed)
5480
{
5481
_c4dbgpf("grow: arena={} numchars={}", m_filter_arena.len, num_characters_needed);
5482
if(num_characters_needed <= m_filter_arena.len)
5483
return;
5484
size_t sz = m_filter_arena.len << 1;
5485
_c4dbgpf("grow: sz={}", sz);
5486
sz = num_characters_needed > sz ? num_characters_needed : sz;
5487
_c4dbgpf("grow: sz={}", sz);
5488
sz = sz < 128u ? 128u : sz;
5489
_c4dbgpf("grow: sz={}", sz);
5490
_RYML_CB_ASSERT(m_stack.m_callbacks, sz >= num_characters_needed);
5491
_resize_filter_arena(sz);
5492
}
5493
5494
void Parser::_resize_filter_arena(size_t num_characters)
5495
{
5496
if(num_characters > m_filter_arena.len)
5497
{
5498
_c4dbgpf("resize: sz={}", num_characters);
5499
char *prev = m_filter_arena.str;
5500
if(m_filter_arena.str)
5501
{
5502
_RYML_CB_ASSERT(m_stack.m_callbacks, m_filter_arena.len > 0);
5503
_RYML_CB_FREE(m_stack.m_callbacks, m_filter_arena.str, char, m_filter_arena.len);
5504
}
5505
m_filter_arena.str = _RYML_CB_ALLOC_HINT(m_stack.m_callbacks, char, num_characters, prev);
5506
m_filter_arena.len = num_characters;
5507
}
5508
}
5509
5510
/** Copy the first @p pos characters of the filter arena to the start
 * of @p dst, and return that leading portion of @p dst.
 *
 * @p pos must not exceed the size of the arena nor of @p dst. */
substr Parser::_finish_filter_arena(substr dst, size_t pos)
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= m_filter_arena.len);
    _RYML_CB_ASSERT(m_stack.m_callbacks, pos <= dst.len);
    substr filtered = dst.first(pos);
    memcpy(filtered.str, m_filter_arena.str, filtered.len);
    return filtered;
}
5517
5518
5519
//-----------------------------------------------------------------------------
5520
//-----------------------------------------------------------------------------
5521
//-----------------------------------------------------------------------------
5522
5523
/** Get the part of the parsed buffer that starts at the offset of
 * @p loc and extends to the end of the buffer. The offset must lie
 * inside the buffer. */
csubstr Parser::location_contents(Location const& loc) const
{
    const size_t start = loc.offset;
    _RYML_CB_ASSERT(m_stack.m_callbacks, start < m_buf.len);
    return m_buf.sub(start);
}
5528
5529
/** Get the location of @p node, which must be valid. Forwards to the
 * overload taking the node's tree and id. */
Location Parser::location(ConstNodeRef node) const
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, node.valid());
    Tree const& tree = *node.tree();
    return location(tree, node.id());
}
5534
5535
/** Get the location of node @p node of @p tree. When no location can
 * be obtained from the node (or from its neighbours), the location of
 * the start of the parsed buffer is returned instead. */
Location Parser::location(Tree const& tree, size_t node) const
{
    // try hard to avoid getting the location from a null string.
    Location loc;
    const bool found = _location_from_node(tree, node, &loc, 0);
    if(!found)
        loc = val_location(m_buf.str);
    return loc;
}
5543
5544
/** Try to find a location for @p node, writing it to @p loc.
 *
 * The sources are tried in this order: the node's key, the node's
 * value, the node as a container, and then -- only at @p level 0 and
 * only for nodes with a type -- the previous sibling, the next
 * sibling and the parent, each searched at level+1 so that the search
 * does not spread further.
 *
 * @return true if @p loc was set, false if no location was found */
bool Parser::_location_from_node(Tree const& tree, size_t node, Location *C4_RESTRICT loc, size_t level) const
{
    // the key comes first in the source, so prefer it
    if(tree.has_key(node))
    {
        const csubstr key = tree.key(node);
        if(C4_LIKELY(key.str != nullptr))
        {
            _RYML_CB_ASSERT(m_stack.m_callbacks, key.is_sub(m_buf));
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(key));
            *loc = val_location(key.str);
            return true;
        }
    }

    // otherwise use the value
    if(tree.has_val(node))
    {
        const csubstr val = tree.val(node);
        if(C4_LIKELY(val.str != nullptr))
        {
            _RYML_CB_ASSERT(m_stack.m_callbacks, val.is_sub(m_buf));
            _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.is_super(val));
            *loc = val_location(val.str);
            return true;
        }
    }

    // otherwise use the container itself
    if(tree.is_container(node) && _location_from_cont(tree, node, loc))
        return true;

    // as a last resort, look at the neighbours; this is done only
    // from the node where the search started
    if(tree.type(node) != NOTYPE && level == 0)
    {
        // try the prev sibling
        const size_t prev = tree.prev_sibling(node);
        if(prev != NONE && _location_from_node(tree, prev, loc, level+1))
            return true;
        // try the next sibling
        const size_t next = tree.next_sibling(node);
        if(next != NONE && _location_from_node(tree, next, loc, level+1))
            return true;
        // try the parent
        const size_t parent = tree.parent(node);
        if(parent != NONE && _location_from_node(tree, parent, loc, level+1))
            return true;
    }

    return false;
}
5609
5610
/** Write to @p loc the location of the container @p node.
 *
 * For a stream this is the start of the parsed buffer. For any other
 * container it is the position stored in the container's value
 * scalar, or the key of its first child when that key is non-null
 * and comes earlier.
 *
 * @return always true */
bool Parser::_location_from_cont(Tree const& tree, size_t node, Location *C4_RESTRICT loc) const
{
    _RYML_CB_ASSERT(m_stack.m_callbacks, tree.is_container(node));
    if(tree.is_stream(node))
    {
        // it's a stream: just return the front of the buffer
        *loc = val_location(m_buf.str);
        return true;
    }
    const char *start = tree._p(node)->m_val.scalar.str; // this was stored in the container
    if(tree.has_children(node))
    {
        const size_t child = tree.first_child(node);
        if(tree.has_key(child))
        {
            // when a map starts, the container was set after the key
            const csubstr key = tree.key(child);
            if(key.str && start > key.str)
                start = key.str;
        }
    }
    *loc = val_location(start);
    return true;
}
5636
5637
5638
/** Get the location (offset, line and column; all zero-based) of the
 * character pointed at by @p val, which must point into the buffer
 * that was parsed (one past the end is accepted).
 *
 * A null @p val yields a location with all positions set to zero.
 * Otherwise the parser must have been created with locations enabled.
 *
 * The line is the index of the first recorded newline offset that is
 * not smaller than the offset of @p val; ie, the number of newlines
 * strictly before it. A pointer to a newline character is therefore
 * reported on the line which that newline terminates, and a pointer
 * to the end of the buffer is reported on the last line. */
Location Parser::val_location(const char *val) const
{
    if(C4_UNLIKELY(val == nullptr))
        return {m_file, 0, 0, 0};

    _RYML_CB_CHECK(m_stack.m_callbacks, m_options.locations());
    // NOTE: if any of these checks fails, the parser needs to be
    // instantiated with locations enabled.
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.str == m_newline_offsets_buf.str);
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_buf.len == m_newline_offsets_buf.len);
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_options.locations());
    _RYML_CB_ASSERT(m_stack.m_callbacks, !_locations_dirty());
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets != nullptr);
    _RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets_size > 0);
    // NOTE: the pointer needs to belong to the buffer that was used to parse.
    csubstr src = m_buf;
    _RYML_CB_CHECK(m_stack.m_callbacks, val != nullptr || src.str == nullptr);
    _RYML_CB_CHECK(m_stack.m_callbacks, (val >= src.begin() && val <= src.end()) || (src.str == nullptr && val == nullptr));
    // ok. search the first stored newline at or after the given ptr.
    // Both searches below must compute the same lower bound: the first
    // entry >= offset. The last entry is the buffer length, so there
    // is always a match for any offset inside the buffer.
    using lineptr_type = size_t const* C4_RESTRICT;
    lineptr_type lineptr = nullptr;
    size_t offset = (size_t)(val - src.begin());
    if(m_newline_offsets_size < 30) // TODO magic number
    {
        // just do a linear search if the size is small.
        for(lineptr_type curr = m_newline_offsets, last = m_newline_offsets + m_newline_offsets_size; curr < last; ++curr)
        {
            // use >= (not >): with > a pointer to a newline would be
            // placed on the following line with an underflowed column,
            // and a pointer to the end of the buffer would find no line.
            if(*curr >= offset)
            {
                lineptr = curr;
                break;
            }
        }
    }
    else
    {
        // do a bisection search if the size is not small.
        //
        // We could use std::lower_bound but this is simple enough and
        // spares the include of <algorithm>.
        size_t count = m_newline_offsets_size;
        size_t step;
        lineptr_type it;
        lineptr = m_newline_offsets;
        while(count)
        {
            step = count >> 1;
            it = lineptr + step;
            if(*it < offset)
            {
                lineptr = ++it;
                count -= step + 1;
            }
            else
            {
                count = step;
            }
        }
    }
    _RYML_CB_ASSERT(m_stack.m_callbacks, lineptr >= m_newline_offsets);
    // strictly inside the array: lineptr is dereferenced next
    _RYML_CB_ASSERT(m_stack.m_callbacks, lineptr < m_newline_offsets + m_newline_offsets_size);
    _RYML_CB_ASSERT(m_stack.m_callbacks, *lineptr >= offset);
    Location loc;
    loc.name = m_file;
    loc.offset = offset;
    loc.line = (size_t)(lineptr - m_newline_offsets);
    // the column is counted from the character after the previous
    // newline, or from the start of the buffer on the first line
    if(lineptr > m_newline_offsets)
        loc.col = (offset - *(lineptr-1) - 1u);
    else
        loc.col = offset;
    return loc;
}
5710
5711
void Parser::_prepare_locations()
5712
{
5713
m_newline_offsets_buf = m_buf;
5714
size_t numnewlines = 1u + m_buf.count('\n');
5715
_resize_locations(numnewlines);
5716
m_newline_offsets_size = 0;
5717
for(size_t i = 0; i < m_buf.len; i++)
5718
if(m_buf[i] == '\n')
5719
m_newline_offsets[m_newline_offsets_size++] = i;
5720
m_newline_offsets[m_newline_offsets_size++] = m_buf.len;
5721
_RYML_CB_ASSERT(m_stack.m_callbacks, m_newline_offsets_size == numnewlines);
5722
}
5723
5724
void Parser::_resize_locations(size_t numnewlines)
5725
{
5726
if(numnewlines > m_newline_offsets_capacity)
5727
{
5728
if(m_newline_offsets)
5729
_RYML_CB_FREE(m_stack.m_callbacks, m_newline_offsets, size_t, m_newline_offsets_capacity);
5730
m_newline_offsets = _RYML_CB_ALLOC_HINT(m_stack.m_callbacks, size_t, numnewlines, m_newline_offsets);
5731
m_newline_offsets_capacity = numnewlines;
5732
}
5733
}
5734
5735
bool Parser::_locations_dirty() const
5736
{
5737
return !m_newline_offsets_size;
5738
}
5739
5740
} // namespace yml
5741
} // namespace c4
5742
5743
5744
#if defined(_MSC_VER)
5745
# pragma warning(pop)
5746
#elif defined(__clang__)
5747
# pragma clang diagnostic pop
5748
#elif defined(__GNUC__)
5749
# pragma GCC diagnostic pop
5750
#endif
5751
5752