Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/libdiff/lib/diff_main.c
35083 views
1
/* Generic infrastructure to implement various diff algorithms (implementation). */
2
/*
3
* Copyright (c) 2020 Neels Hofmeyr <[email protected]>
4
*
5
* Permission to use, copy, modify, and distribute this software for any
6
* purpose with or without fee is hereby granted, provided that the above
7
* copyright notice and this permission notice appear in all copies.
8
*
9
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
*/
17
18
19
#include <sys/queue.h>
20
#include <ctype.h>
21
#include <errno.h>
22
#include <stdint.h>
23
#include <stdlib.h>
24
#include <stdbool.h>
25
#include <stdio.h>
26
#include <string.h>
27
#include <limits.h>
28
#include <unistd.h>
29
30
#include <assert.h>
31
32
#include <arraylist.h>
33
#include <diff_main.h>
34
35
#include "diff_internal.h"
36
#include "diff_debug.h"
37
38
inline enum diff_chunk_type
39
diff_chunk_type(const struct diff_chunk *chunk)
40
{
41
if (!chunk->left_count && !chunk->right_count)
42
return CHUNK_EMPTY;
43
if (!chunk->solved)
44
return CHUNK_ERROR;
45
if (!chunk->right_count)
46
return CHUNK_MINUS;
47
if (!chunk->left_count)
48
return CHUNK_PLUS;
49
if (chunk->left_count != chunk->right_count)
50
return CHUNK_ERROR;
51
return CHUNK_SAME;
52
}
53
54
static int
55
read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
56
{
57
ssize_t r;
58
59
if (fseeko(f, at_pos, SEEK_SET) == -1)
60
return errno;
61
r = fread(buf, sizeof(char), len, f);
62
if ((r == 0 || r < len) && ferror(f))
63
return EIO;
64
if (r != len)
65
return EIO;
66
return 0;
67
}
68
69
static int
70
buf_cmp(const unsigned char *left, size_t left_len,
71
const unsigned char *right, size_t right_len,
72
bool ignore_whitespace)
73
{
74
int cmp;
75
76
if (ignore_whitespace) {
77
int il = 0, ir = 0;
78
while (il < left_len && ir < right_len) {
79
unsigned char cl = left[il];
80
unsigned char cr = right[ir];
81
82
if (isspace((unsigned char)cl) && il < left_len) {
83
il++;
84
continue;
85
}
86
if (isspace((unsigned char)cr) && ir < right_len) {
87
ir++;
88
continue;
89
}
90
91
if (cl > cr)
92
return 1;
93
if (cr > cl)
94
return -1;
95
il++;
96
ir++;
97
}
98
while (il < left_len) {
99
unsigned char cl = left[il++];
100
if (!isspace((unsigned char)cl))
101
return 1;
102
}
103
while (ir < right_len) {
104
unsigned char cr = right[ir++];
105
if (!isspace((unsigned char)cr))
106
return -1;
107
}
108
109
return 0;
110
}
111
112
cmp = memcmp(left, right, MIN(left_len, right_len));
113
if (cmp)
114
return cmp;
115
if (left_len == right_len)
116
return 0;
117
return (left_len > right_len) ? 1 : -1;
118
}
119
120
int
121
diff_atom_cmp(int *cmp,
122
const struct diff_atom *left,
123
const struct diff_atom *right)
124
{
125
off_t remain_left, remain_right;
126
int flags = (left->root->diff_flags | right->root->diff_flags);
127
bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
128
129
if (!left->len && !right->len) {
130
*cmp = 0;
131
return 0;
132
}
133
if (!ignore_whitespace) {
134
if (!right->len) {
135
*cmp = 1;
136
return 0;
137
}
138
if (!left->len) {
139
*cmp = -1;
140
return 0;
141
}
142
}
143
144
if (left->at != NULL && right->at != NULL) {
145
*cmp = buf_cmp(left->at, left->len, right->at, right->len,
146
ignore_whitespace);
147
return 0;
148
}
149
150
remain_left = left->len;
151
remain_right = right->len;
152
while (remain_left > 0 || remain_right > 0) {
153
const size_t chunksz = 8192;
154
unsigned char buf_left[chunksz], buf_right[chunksz];
155
const uint8_t *p_left, *p_right;
156
off_t n_left, n_right;
157
int r;
158
159
if (!remain_right) {
160
*cmp = 1;
161
return 0;
162
}
163
if (!remain_left) {
164
*cmp = -1;
165
return 0;
166
}
167
168
n_left = MIN(chunksz, remain_left);
169
n_right = MIN(chunksz, remain_right);
170
171
if (left->at == NULL) {
172
r = read_at(left->root->f,
173
left->pos + (left->len - remain_left),
174
buf_left, n_left);
175
if (r) {
176
*cmp = 0;
177
return r;
178
}
179
p_left = buf_left;
180
} else {
181
p_left = left->at + (left->len - remain_left);
182
}
183
184
if (right->at == NULL) {
185
r = read_at(right->root->f,
186
right->pos + (right->len - remain_right),
187
buf_right, n_right);
188
if (r) {
189
*cmp = 0;
190
return r;
191
}
192
p_right = buf_right;
193
} else {
194
p_right = right->at + (right->len - remain_right);
195
}
196
197
r = buf_cmp(p_left, n_left, p_right, n_right,
198
ignore_whitespace);
199
if (r) {
200
*cmp = r;
201
return 0;
202
}
203
204
remain_left -= n_left;
205
remain_right -= n_right;
206
}
207
208
*cmp = 0;
209
return 0;
210
}
211
212
int
213
diff_atom_same(bool *same,
214
const struct diff_atom *left,
215
const struct diff_atom *right)
216
{
217
int cmp;
218
int r;
219
if (left->hash != right->hash) {
220
*same = false;
221
return 0;
222
}
223
r = diff_atom_cmp(&cmp, left, right);
224
if (r) {
225
*same = true;
226
return r;
227
}
228
*same = (cmp == 0);
229
return 0;
230
}
231
232
static struct diff_chunk *
233
diff_state_add_solved_chunk(struct diff_state *state,
234
const struct diff_chunk *chunk)
235
{
236
diff_chunk_arraylist_t *result;
237
struct diff_chunk *new_chunk;
238
enum diff_chunk_type last_t;
239
enum diff_chunk_type new_t;
240
struct diff_chunk *last;
241
242
/* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
243
* never directly follows a plus chunk. */
244
result = &state->result->chunks;
245
246
last_t = result->len ? diff_chunk_type(&result->head[result->len - 1])
247
: CHUNK_EMPTY;
248
new_t = diff_chunk_type(chunk);
249
250
debug("ADD %s chunk #%u:\n", chunk->solved ? "solved" : "UNSOLVED",
251
result->len);
252
debug("L\n");
253
debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
254
debug("R\n");
255
debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
256
257
if (result->len) {
258
last = &result->head[result->len - 1];
259
assert(chunk->left_start
260
== last->left_start + last->left_count);
261
assert(chunk->right_start
262
== last->right_start + last->right_count);
263
}
264
265
if (new_t == last_t) {
266
new_chunk = &result->head[result->len - 1];
267
new_chunk->left_count += chunk->left_count;
268
new_chunk->right_count += chunk->right_count;
269
debug(" - added chunk touches previous one of same type, joined:\n");
270
debug("L\n");
271
debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
272
debug("R\n");
273
debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
274
} else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
275
enum diff_chunk_type prev_last_t =
276
result->len > 1 ?
277
diff_chunk_type(&result->head[result->len - 2])
278
: CHUNK_EMPTY;
279
/* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
280
* Is the one before that also a minus? combine. */
281
if (prev_last_t == CHUNK_MINUS) {
282
new_chunk = &result->head[result->len - 2];
283
new_chunk->left_count += chunk->left_count;
284
new_chunk->right_count += chunk->right_count;
285
286
debug(" - added minus-chunk follows plus-chunk,"
287
" put before that plus-chunk and joined"
288
" with preceding minus-chunk:\n");
289
debug("L\n");
290
debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
291
debug("R\n");
292
debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
293
} else {
294
ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
295
if (!new_chunk)
296
return NULL;
297
*new_chunk = *chunk;
298
299
/* The new minus chunk indicates to which position on
300
* the right it corresponds, even though it doesn't add
301
* any lines on the right. By moving above a plus chunk,
302
* that position on the right has shifted. */
303
last = &result->head[result->len - 1];
304
new_chunk->right_start = last->right_start;
305
306
debug(" - added minus-chunk follows plus-chunk,"
307
" put before that plus-chunk\n");
308
}
309
310
/* That last_t == CHUNK_PLUS indicates to which position on the
311
* left it corresponds, even though it doesn't add any lines on
312
* the left. By inserting/extending the prev_last_t ==
313
* CHUNK_MINUS, that position on the left has shifted. */
314
last = &result->head[result->len - 1];
315
last->left_start = new_chunk->left_start
316
+ new_chunk->left_count;
317
318
} else {
319
ARRAYLIST_ADD(new_chunk, *result);
320
if (!new_chunk)
321
return NULL;
322
*new_chunk = *chunk;
323
}
324
return new_chunk;
325
}
326
327
/* Even if a left or right side is empty, diff output may need to know the
328
* position in that file.
329
* So left_start or right_start must never be NULL -- pass left_count or
330
* right_count as zero to indicate staying at that position without consuming
331
* any lines. */
332
struct diff_chunk *
333
diff_state_add_chunk(struct diff_state *state, bool solved,
334
struct diff_atom *left_start, unsigned int left_count,
335
struct diff_atom *right_start, unsigned int right_count)
336
{
337
struct diff_chunk *new_chunk;
338
struct diff_chunk chunk = {
339
.solved = solved,
340
.left_start = left_start,
341
.left_count = left_count,
342
.right_start = right_start,
343
.right_count = right_count,
344
};
345
346
/* An unsolved chunk means store as intermediate result for later
347
* re-iteration.
348
* If there already are intermediate results, that means even a
349
* following solved chunk needs to go to intermediate results, so that
350
* it is later put in the final correct position in solved chunks.
351
*/
352
if (!solved || state->temp_result.len) {
353
/* Append to temp_result */
354
debug("ADD %s chunk to temp result:\n",
355
chunk.solved ? "solved" : "UNSOLVED");
356
debug("L\n");
357
debug_dump_atoms(&state->left, left_start, left_count);
358
debug("R\n");
359
debug_dump_atoms(&state->right, right_start, right_count);
360
ARRAYLIST_ADD(new_chunk, state->temp_result);
361
if (!new_chunk)
362
return NULL;
363
*new_chunk = chunk;
364
return new_chunk;
365
}
366
367
return diff_state_add_solved_chunk(state, &chunk);
368
}
369
370
static void
371
diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
372
unsigned long long len, int diff_flags)
373
{
374
*d = (struct diff_data){
375
.f = f,
376
.pos = 0,
377
.data = data,
378
.len = len,
379
.root = d,
380
.diff_flags = diff_flags,
381
};
382
}
383
384
void
385
diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
386
struct diff_atom *from_atom, unsigned int atoms_count)
387
{
388
struct diff_atom *last_atom;
389
390
debug("diff_data %p parent %p from_atom %p atoms_count %u\n",
391
d, parent, from_atom, atoms_count);
392
debug(" from_atom ");
393
debug_dump_atom(parent, NULL, from_atom);
394
395
if (atoms_count == 0) {
396
*d = (struct diff_data){
397
.f = NULL,
398
.pos = 0,
399
.data = NULL,
400
.len = 0,
401
.root = parent->root,
402
.atoms.head = NULL,
403
.atoms.len = atoms_count,
404
};
405
406
return;
407
}
408
409
last_atom = from_atom + atoms_count - 1;
410
*d = (struct diff_data){
411
.f = NULL,
412
.pos = from_atom->pos,
413
.data = from_atom->at,
414
.len = (last_atom->pos + last_atom->len) - from_atom->pos,
415
.root = parent->root,
416
.atoms.head = from_atom,
417
.atoms.len = atoms_count,
418
};
419
420
debug("subsection:\n");
421
debug_dump(d);
422
}
423
424
void
425
diff_data_free(struct diff_data *diff_data)
426
{
427
if (!diff_data)
428
return;
429
if (diff_data->atoms.allocated)
430
ARRAYLIST_FREE(diff_data->atoms);
431
}
432
433
int
434
diff_algo_none(const struct diff_algo_config *algo_config,
435
struct diff_state *state)
436
{
437
debug("\n** %s\n", __func__);
438
debug("left:\n");
439
debug_dump(&state->left);
440
debug("right:\n");
441
debug_dump(&state->right);
442
debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
443
0);
444
445
/* Add a chunk of equal lines, if any */
446
struct diff_atom *l = state->left.atoms.head;
447
unsigned int l_len = state->left.atoms.len;
448
struct diff_atom *r = state->right.atoms.head;
449
unsigned int r_len = state->right.atoms.len;
450
unsigned int equal_atoms_start = 0;
451
unsigned int equal_atoms_end = 0;
452
unsigned int l_idx = 0;
453
unsigned int r_idx = 0;
454
455
while (equal_atoms_start < l_len
456
&& equal_atoms_start < r_len) {
457
int err;
458
bool same;
459
err = diff_atom_same(&same, &l[equal_atoms_start],
460
&r[equal_atoms_start]);
461
if (err)
462
return err;
463
if (!same)
464
break;
465
equal_atoms_start++;
466
}
467
while (equal_atoms_end < (l_len - equal_atoms_start)
468
&& equal_atoms_end < (r_len - equal_atoms_start)) {
469
int err;
470
bool same;
471
err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
472
&r[r_len - 1 - equal_atoms_end]);
473
if (err)
474
return err;
475
if (!same)
476
break;
477
equal_atoms_end++;
478
}
479
480
/* Add a chunk of equal lines at the start */
481
if (equal_atoms_start) {
482
if (!diff_state_add_chunk(state, true,
483
l, equal_atoms_start,
484
r, equal_atoms_start))
485
return ENOMEM;
486
l_idx += equal_atoms_start;
487
r_idx += equal_atoms_start;
488
}
489
490
/* Add a "minus" chunk with all lines from the left. */
491
if (equal_atoms_start + equal_atoms_end < l_len) {
492
unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
493
if (!diff_state_add_chunk(state, true,
494
&l[l_idx], add_len,
495
&r[r_idx], 0))
496
return ENOMEM;
497
l_idx += add_len;
498
}
499
500
/* Add a "plus" chunk with all lines from the right. */
501
if (equal_atoms_start + equal_atoms_end < r_len) {
502
unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
503
if (!diff_state_add_chunk(state, true,
504
&l[l_idx], 0,
505
&r[r_idx], add_len))
506
return ENOMEM;
507
r_idx += add_len;
508
}
509
510
/* Add a chunk of equal lines at the end */
511
if (equal_atoms_end) {
512
if (!diff_state_add_chunk(state, true,
513
&l[l_idx], equal_atoms_end,
514
&r[r_idx], equal_atoms_end))
515
return ENOMEM;
516
}
517
518
return DIFF_RC_OK;
519
}
520
521
static int
522
diff_run_algo(const struct diff_algo_config *algo_config,
523
struct diff_state *state)
524
{
525
int rc;
526
527
if (!algo_config || !algo_config->impl
528
|| !state->recursion_depth_left
529
|| !state->left.atoms.len || !state->right.atoms.len) {
530
debug("Fall back to diff_algo_none():%s%s%s\n",
531
(!algo_config || !algo_config->impl) ? " no-cfg" : "",
532
(!state->recursion_depth_left) ? " max-depth" : "",
533
(!state->left.atoms.len || !state->right.atoms.len)?
534
" trivial" : "");
535
return diff_algo_none(algo_config, state);
536
}
537
538
ARRAYLIST_FREE(state->temp_result);
539
ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
540
rc = algo_config->impl(algo_config, state);
541
switch (rc) {
542
case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
543
debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
544
algo_config->fallback_algo);
545
rc = diff_run_algo(algo_config->fallback_algo, state);
546
goto return_rc;
547
548
case DIFF_RC_OK:
549
/* continue below */
550
break;
551
552
default:
553
/* some error happened */
554
goto return_rc;
555
}
556
557
/* Pick up any diff chunks that are still unsolved and feed to
558
* inner_algo. inner_algo will solve unsolved chunks and append to
559
* result, and subsequent solved chunks on this level are then appended
560
* to result afterwards. */
561
int i;
562
for (i = 0; i < state->temp_result.len; i++) {
563
struct diff_chunk *c = &state->temp_result.head[i];
564
if (c->solved) {
565
diff_state_add_solved_chunk(state, c);
566
continue;
567
}
568
569
/* c is an unsolved chunk, feed to inner_algo */
570
struct diff_state inner_state = {
571
.result = state->result,
572
.recursion_depth_left = state->recursion_depth_left - 1,
573
.kd_buf = state->kd_buf,
574
.kd_buf_size = state->kd_buf_size,
575
};
576
diff_data_init_subsection(&inner_state.left, &state->left,
577
c->left_start, c->left_count);
578
diff_data_init_subsection(&inner_state.right, &state->right,
579
c->right_start, c->right_count);
580
581
rc = diff_run_algo(algo_config->inner_algo, &inner_state);
582
state->kd_buf = inner_state.kd_buf;
583
state->kd_buf_size = inner_state.kd_buf_size;
584
if (rc != DIFF_RC_OK)
585
goto return_rc;
586
}
587
588
rc = DIFF_RC_OK;
589
return_rc:
590
ARRAYLIST_FREE(state->temp_result);
591
return rc;
592
}
593
594
int
595
diff_atomize_file(struct diff_data *d,
596
const struct diff_config *config,
597
FILE *f, const uint8_t *data, off_t len, int diff_flags)
598
{
599
if (!config->atomize_func)
600
return EINVAL;
601
602
diff_data_init_root(d, f, data, len, diff_flags);
603
604
return config->atomize_func(config->atomize_func_data, d);
605
606
}
607
608
struct diff_result *
609
diff_main(const struct diff_config *config, struct diff_data *left,
610
struct diff_data *right)
611
{
612
struct diff_result *result = malloc(sizeof(struct diff_result));
613
if (!result)
614
return NULL;
615
616
*result = (struct diff_result){};
617
result->left = left;
618
result->right = right;
619
620
struct diff_state state = {
621
.result = result,
622
.recursion_depth_left = config->max_recursion_depth ?
623
config->max_recursion_depth : UINT_MAX,
624
.kd_buf = NULL,
625
.kd_buf_size = 0,
626
};
627
diff_data_init_subsection(&state.left, left,
628
left->atoms.head,
629
left->atoms.len);
630
diff_data_init_subsection(&state.right, right,
631
right->atoms.head,
632
right->atoms.len);
633
634
result->rc = diff_run_algo(config->algo, &state);
635
free(state.kd_buf);
636
637
return result;
638
}
639
640
void
641
diff_result_free(struct diff_result *result)
642
{
643
if (!result)
644
return;
645
ARRAYLIST_FREE(result->chunks);
646
free(result);
647
}
648
649
int
650
diff_result_contains_printable_chunks(struct diff_result *result)
651
{
652
struct diff_chunk *c;
653
enum diff_chunk_type t;
654
int i;
655
656
for (i = 0; i < result->chunks.len; i++) {
657
c = &result->chunks.head[i];
658
t = diff_chunk_type(c);
659
if (t == CHUNK_MINUS || t == CHUNK_PLUS)
660
return 1;
661
}
662
663
return 0;
664
}
665
666