CoCalc -- analyze.c

GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/diff/src/analyze.c
39530 views
1
/* Analyze file differences for GNU DIFF.
2

3
   Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
4
   2004 Free Software Foundation, Inc.
5

6
   This file is part of GNU DIFF.
7

8
   GNU DIFF is free software; you can redistribute it and/or modify
9
   it under the terms of the GNU General Public License as published by
10
   the Free Software Foundation; either version 2, or (at your option)
11
   any later version.
12

13
   GNU DIFF is distributed in the hope that it will be useful,
14
   but WITHOUT ANY WARRANTY; without even the implied warranty of
15
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
   GNU General Public License for more details.
17

18
   You should have received a copy of the GNU General Public License
19
   along with this program; see the file COPYING.
20
   If not, write to the Free Software Foundation,
21
   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
22

23
/* The basic algorithm is described in:
24
   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
25
   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
26
   see especially section 4.2, which describes the variation used below.
27
   Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
28
   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
29
   at the price of producing suboptimal output for large inputs with
30
   many differences.
31

32
   The basic algorithm was independently discovered as described in:
33
   "Algorithms for Approximate String Matching", E. Ukkonen,
34
   Information and Control Vol. 64, 1985, pp. 100-118.  */
35

36
#include "diff.h"
37
#include <cmpbuf.h>
38
#include <error.h>
39
#include <file-type.h>
40
#include <xalloc.h>
41

42
/* The two vectors being compared.  */
static lin *xvec;
static lin *yvec;

/* Vector, indexed by diagonal, holding 1 + the X coordinate of the
   point furthest along that diagonal in the forward search of the
   edit matrix.  */
static lin *fdiag;

/* Vector, indexed by diagonal, holding the X coordinate of the point
   furthest along that diagonal in the backward search of the edit
   matrix.  */
static lin *bdiag;

/* Edit scripts longer than this are too expensive to compute.  */
static lin too_expensive;

/* Snakes bigger than this are considered `big'.  */
#define SNAKE_LIMIT 20
55

56
struct partition
57
{
58
  lin xmid, ymid;	/* Midpoints of this partition.  */
59
  bool lo_minimal;	/* Nonzero if low half will be analyzed minimally.  */
60
  bool hi_minimal;	/* Likewise for high half.  */
61
};
62

63
/* Find the midpoint of the shortest edit script for a specified
64
   portion of the two files.
65

66
   Scan from the beginnings of the files, and simultaneously from the ends,
67
   doing a breadth-first search through the space of edit-sequence.
68
   When the two searches meet, we have found the midpoint of the shortest
69
   edit sequence.
70

71
   If FIND_MINIMAL is nonzero, find the minimal edit script regardless
72
   of expense.  Otherwise, if the search is too expensive, use
73
   heuristics to stop the search and report a suboptimal answer.
74

75
   Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
76
   XMID - YMID equals the number of inserted lines minus the number
77
   of deleted lines (counting only lines before the midpoint).
78

79
   Set PART->lo_minimal to true iff the minimal edit script for the
80
   left half of the partition is known; similarly for PART->hi_minimal.
81

82
   This function assumes that the first lines of the specified portions
83
   of the two files do not match, and likewise that the last lines do not
84
   match.  The caller must trim matching lines from the beginning and end
85
   of the portions it is going to specify.
86

87
   If we return the "wrong" partitions,
88
   the worst this can do is cause suboptimal diff output.
89
   It cannot cause incorrect diff output.  */
90

91
/* Arguments: [XOFF, XLIM) and [YOFF, YLIM) are the half-open index
   ranges of xvec and yvec that delimit the subproblem.  FIND_MINIMAL
   suppresses both cost-limiting heuristics at the bottom of the loop.
   Nothing is returned; every exit path stores xmid, ymid, lo_minimal
   and hi_minimal through PART.  */
static void
92
diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal,
93
      struct partition *part)
94
{
95
  lin *const fd = fdiag;	/* Local alias of the forward vector. */
96
  lin *const bd = bdiag;	/* Local alias of the backward vector. */
97
  lin const *const xv = xvec;	/* Local alias of the first sequence. */
98
  lin const *const yv = yvec;	/* Local alias of the second sequence. */
99
  lin const dmin = xoff - ylim;	/* Minimum valid diagonal. */
100
  lin const dmax = xlim - yoff;	/* Maximum valid diagonal. */
101
  lin const fmid = xoff - yoff;	/* Center diagonal of top-down search. */
102
  lin const bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
103
  lin fmin = fmid, fmax = fmid;	/* Limits of top-down search. */
104
  lin bmin = bmid, bmax = bmid;	/* Limits of bottom-up search. */
105
  lin c;			/* Cost. */
106
  bool odd = (fmid - bmid) & 1;	/* True if southeast corner is on an odd
				   diagonal with respect to the northwest. */
108

109
  /* Seed both searches: the forward one starts at X == XOFF on its
     center diagonal, the backward one at X == XLIM on its own.  */
  fd[fmid] = xoff;
110
  bd[bmid] = xlim;
111

112
  /* Each iteration raises the cost C by one and advances both searches
     by one edit step; the loop is left only through a return.  */
  for (c = 1;; ++c)
113
    {
114
      lin d;			/* Active diagonal. */
115
      bool big_snake = false;	/* Set when this round saw a snake longer
				   than SNAKE_LIMIT. */
116

117
      /* Extend the top-down search by an edit step in each diagonal.
	 The active range grows by one diagonal on each side, and -1 is
	 stored just outside it so the neighbor reads below find a
	 value; a side that has already reached the valid limit is
	 pulled in by one instead.  */
118
      fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
119
      fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
120
      for (d = fmax; d >= fmin; d -= 2)
121
	{
122
	  lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
123

124
	  /* Start from the better neighbor: coming from diagonal D - 1
	     advances X by one, coming from D + 1 keeps X.  */
	  if (tlo >= thi)
125
	    x = tlo + 1;
126
	  else
127
	    x = thi;
128
	  oldx = x;
129
	  y = x - d;
130
	  /* Follow the snake: advance while the elements match.  */
	  while (x < xlim && y < ylim && xv[x] == yv[y])
131
	    ++x, ++y;
132
	  if (x - oldx > SNAKE_LIMIT)
133
	    big_snake = true;
134
	  fd[d] = x;
135
	  /* The forward point has reached or passed the backward point
	     on this diagonal: report it as the midpoint.  */
	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
136
	    {
137
	      part->xmid = x;
138
	      part->ymid = y;
139
	      part->lo_minimal = part->hi_minimal = true;
140
	      return;
141
	    }
142
	}
143

144
      /* Similarly extend the bottom-up search; LIN_MAX plays the role
	 of the out-of-range value here.  */
145
      bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin;
146
      bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax;
147
      for (d = bmax; d >= bmin; d -= 2)
148
	{
149
	  lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
150

151
	  if (tlo < thi)
152
	    x = tlo;
153
	  else
154
	    x = thi - 1;
155
	  oldx = x;
156
	  y = x - d;
157
	  /* Follow the snake backwards while the elements match.  */
	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
158
	    --x, --y;
159
	  if (oldx - x > SNAKE_LIMIT)
160
	    big_snake = true;
161
	  bd[d] = x;
162
	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
163
	    {
164
	      part->xmid = x;
165
	      part->ymid = y;
166
	      part->lo_minimal = part->hi_minimal = true;
167
	      return;
168
	    }
169
	}
170

171
      /* A minimal script was requested: skip both heuristics.  */
      if (find_minimal)
172
	continue;
173

174
      /* Heuristic: check occasionally for a diagonal that has made
	 lots of progress compared with the edit distance.
	 If we have any such, find the one that has made the most
	 progress and return it as if it had succeeded.

	 With this heuristic, for files with a constant small density
	 of changes, the algorithm is linear in the file size.

	 It is tried only when the cost exceeds 200, this round saw a
	 big snake, and speed_large_files is set.  */
181

182
      if (200 < c && big_snake && speed_large_files)
183
	{
184
	  lin best = 0;
185

186
	  for (d = fmax; d >= fmin; d -= 2)
187
	    {
188
	      lin dd = d - fmid;
189
	      lin x = fd[d];
190
	      lin y = x - d;
191
	      lin v = (x - xoff) * 2 - dd;
192
	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
193
		{
194
		  if (v > best
195
		      && xoff + SNAKE_LIMIT <= x && x < xlim
196
		      && yoff + SNAKE_LIMIT <= y && y < ylim)
197
		    {
198
		      /* We have a good enough best diagonal;
			 now insist that it end with a significant snake.  */
200
		      int k;
201

202
		      for (k = 1; xv[x - k] == yv[y - k]; k++)
203
			if (k == SNAKE_LIMIT)
204
			  {
205
			    best = v;
206
			    part->xmid = x;
207
			    part->ymid = y;
208
			    break;
209
			  }
210
		    }
211
		}
212
	    }
213
	  if (best > 0)
214
	    {
215
	      part->lo_minimal = true;
216
	      part->hi_minimal = false;
217
	      return;
218
	    }
219

220
	  /* No forward candidate: try the same test on the backward
	     diagonals.  */
	  best = 0;
221
	  for (d = bmax; d >= bmin; d -= 2)
222
	    {
223
	      lin dd = d - bmid;
224
	      lin x = bd[d];
225
	      lin y = x - d;
226
	      lin v = (xlim - x) * 2 + dd;
227
	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
228
		{
229
		  if (v > best
230
		      && xoff < x && x <= xlim - SNAKE_LIMIT
231
		      && yoff < y && y <= ylim - SNAKE_LIMIT)
232
		    {
233
		      /* We have a good enough best diagonal;
			 now insist that it end with a significant snake.  */
235
		      int k;
236

237
		      for (k = 0; xv[x + k] == yv[y + k]; k++)
238
			if (k == SNAKE_LIMIT - 1)
239
			  {
240
			    best = v;
241
			    part->xmid = x;
242
			    part->ymid = y;
243
			    break;
244
			  }
245
		    }
246
		}
247
	    }
248
	  if (best > 0)
249
	    {
250
	      part->lo_minimal = false;
251
	      part->hi_minimal = true;
252
	      return;
253
	    }
254
	}
255

256
      /* Heuristic: if we've gone well beyond the call of duty,
	 give up and report halfway between our best results so far.  */
258
      if (c >= too_expensive)
259
	{
260
	  lin fxybest, fxbest;
261
	  lin bxybest, bxbest;
262

263
	  fxbest = bxbest = 0;  /* Pacify `gcc -Wall'.  */
264

265
	  /* Find forward diagonal that maximizes X + Y.
	     X and Y are first clamped to XLIM and YLIM.  */
266
	  fxybest = -1;
267
	  for (d = fmax; d >= fmin; d -= 2)
268
	    {
269
	      lin x = MIN (fd[d], xlim);
270
	      lin y = x - d;
271
	      if (ylim < y)
272
		x = ylim + d, y = ylim;
273
	      if (fxybest < x + y)
274
		{
275
		  fxybest = x + y;
276
		  fxbest = x;
277
		}
278
	    }
279

280
	  /* Find backward diagonal that minimizes X + Y.
	     X and Y are first clamped to XOFF and YOFF.  */
281
	  bxybest = LIN_MAX;
282
	  for (d = bmax; d >= bmin; d -= 2)
283
	    {
284
	      lin x = MAX (xoff, bd[d]);
285
	      lin y = x - d;
286
	      if (y < yoff)
287
		x = yoff + d, y = yoff;
288
	      if (x + y < bxybest)
289
		{
290
		  bxybest = x + y;
291
		  bxbest = x;
292
		}
293
	    }
294

295
	  /* Use the better of the two diagonals: the forward one when it
	     has covered more distance from its corner than the backward
	     one has from its own.  */
296
	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
297
	    {
298
	      part->xmid = fxbest;
299
	      part->ymid = fxybest - fxbest;
300
	      part->lo_minimal = true;
301
	      part->hi_minimal = false;
302
	    }
303
	  else
304
	    {
305
	      part->xmid = bxbest;
306
	      part->ymid = bxybest - bxbest;
307
	      part->lo_minimal = false;
308
	      part->hi_minimal = true;
309
	    }
310
	  return;
311
	}
312
    }
313
}
314

315
/* Compare in detail contiguous subsequences of the two files
316
   which are known, as a whole, to match each other.
317

318
   The results are recorded in the vectors files[N].changed, by
319
   storing 1 in the element for each line that is an insertion or deletion.
320

321
   The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
322

323
   Note that XLIM, YLIM are exclusive bounds.
324
   All line numbers are origin-0 and discarded lines are not counted.
325

326
   If FIND_MINIMAL, find a minimal difference no matter how
327
   expensive it is.  */
328

329
static void
330
compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal)
331
{
332
  lin const *xv = xvec; /* Help the compiler.  */
333
  lin const *yv = yvec;
334

335
  /* Slide down the bottom initial diagonal. */
336
  while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
337
    ++xoff, ++yoff;
338
  /* Slide up the top initial diagonal. */
339
  while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
340
    --xlim, --ylim;
341

342
  /* Handle simple cases. */
343
  if (xoff == xlim)
344
    while (yoff < ylim)
345
      files[1].changed[files[1].realindexes[yoff++]] = 1;
346
  else if (yoff == ylim)
347
    while (xoff < xlim)
348
      files[0].changed[files[0].realindexes[xoff++]] = 1;
349
  else
350
    {
351
      struct partition part;
352

353
      /* Find a point of correspondence in the middle of the files.  */
354
      diag (xoff, xlim, yoff, ylim, find_minimal, &part);
355

356
      /* Use the partitions to split this problem into subproblems.  */
357
      compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
358
      compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
359
    }
360
}
361

362
/* Discard lines from one file that have no matches in the other file.
363

364
   A line which is discarded will not be considered by the actual
365
   comparison algorithm; it will be as if that line were not in the file.
366
   The file's `realindexes' table maps virtual line numbers
367
   (which don't count the discarded lines) into real line numbers;
368
   this is how the actual comparison algorithm produces results
369
   that are comprehensible when the discarded lines are counted.
370

371
   When we discard a line, we also mark it as a deletion or insertion
372
   so that it will be printed in the output.  */
373

374
/* FILEVEC is the pair of files being compared.  On return, for each
   file F, filevec[F].undiscarded holds the equivalence classes of the
   lines that were kept, filevec[F].realindexes their original line
   numbers, filevec[F].nondiscarded_lines their count, and
   filevec[F].changed[I] has been set to 1 for every discarded line I.  */
static void
375
discard_confusing_lines (struct file_data filevec[])
376
{
377
  int f;
378
  lin i;
379
  char *discarded[2];
380
  lin *equiv_count[2];
381
  lin *p;
382

383
  /* Allocate our results.  One block serves all four arrays, laid out
     as undiscarded[0], realindexes[0], undiscarded[1], realindexes[1],
     so freeing filevec[0].undiscarded releases all of them.  */
384
  p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
385
	       * (2 * sizeof *p));
386
  for (f = 0; f < 2; f++)
387
    {
388
      filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
389
      filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
390
    }
391

392
  /* Set up equiv_count[F][I] as the number of lines in file F
     that fall in equivalence class I.  Both tables live in one
     zero-filled block and are sized by filevec[0].equiv_max.  */
394

395
  p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
396
  equiv_count[0] = p;
397
  equiv_count[1] = p + filevec[0].equiv_max;
398

399
  for (i = 0; i < filevec[0].buffered_lines; ++i)
400
    ++equiv_count[0][filevec[0].equivs[i]];
401
  for (i = 0; i < filevec[1].buffered_lines; ++i)
402
    ++equiv_count[1][filevec[1].equivs[i]];
403

404
  /* Set up tables of which lines are going to be discarded.
     Each entry is 0 (keep), 1 (discard) or 2 (provisionally discard);
     the tables for both files share one zero-filled block.  */
405

406
  discarded[0] = zalloc (filevec[0].buffered_lines
407
			 + filevec[1].buffered_lines);
408
  discarded[1] = discarded[0] + filevec[0].buffered_lines;
409

410
  /* Mark to be discarded each line that matches no line of the other file.
     If a line matches many lines, mark it as provisionally discardable.  */
412

413
  for (f = 0; f < 2; f++)
414
    {
415
      size_t end = filevec[f].buffered_lines;
416
      char *discards = discarded[f];
417
      lin *counts = equiv_count[1 - f];
418
      lin *equivs = filevec[f].equivs;
419
      size_t many = 5;
420
      size_t tem = end / 64;
421

422
      /* Multiply MANY by approximate square root of number of lines.
	 That is the threshold for provisionally discardable lines.  */
424
      while ((tem = tem >> 2) > 0)
425
	many *= 2;
426

427
      for (i = 0; i < end; i++)
428
	{
429
	  lin nmatch;
430
	  /* Lines in equivalence class 0 are never marked.
	     NOTE(review): presumably class 0 is reserved by the code
	     that fills in EQUIVS -- confirm.  */
	  if (equivs[i] == 0)
431
	    continue;
432
	  /* NMATCH is how many lines of the other file share this
	     line's equivalence class.  */
	  nmatch = counts[equivs[i]];
433
	  if (nmatch == 0)
434
	    discards[i] = 1;
435
	  else if (nmatch > many)
436
	    discards[i] = 2;
437
	}
438
    }
439

440
  /* Don't really discard the provisional lines except when they occur
     in a run of discardables, with nonprovisionals at the beginning
     and end.  */
443

444
  for (f = 0; f < 2; f++)
445
    {
446
      lin end = filevec[f].buffered_lines;
447
      register char *discards = discarded[f];
448

449
      for (i = 0; i < end; i++)
450
	{
451
	  /* Cancel provisional discards not in middle of run of discards.  */
452
	  if (discards[i] == 2)
453
	    discards[i] = 0;
454
	  else if (discards[i] != 0)
455
	    {
456
	      /* We have found a nonprovisional discard.  */
457
	      register lin j;
458
	      lin length;
459
	      lin provisional = 0;
460

461
	      /* Find end of this run of discardable lines.
		 Count how many are provisionally discardable.  */
463
	      for (j = i; j < end; j++)
464
		{
465
		  if (discards[j] == 0)
466
		    break;
467
		  if (discards[j] == 2)
468
		    ++provisional;
469
		}
470

471
	      /* Cancel provisional discards at end, and shrink the run.  */
472
	      while (j > i && discards[j - 1] == 2)
473
		discards[--j] = 0, --provisional;
474

475
	      /* Now we have the length of a run of discardable lines
		 whose first and last are not provisional.  */
477
	      length = j - i;
478

479
	      /* If more than 1/4 of the lines in the run are provisional,
		 cancel discarding of all provisional lines in the run.  */
481
	      if (provisional * 4 > length)
482
		{
483
		  while (j > i)
484
		    if (discards[--j] == 2)
485
		      discards[j] = 0;
486
		}
487
	      else
488
		{
489
		  register lin consec;
490
		  lin minimum = 1;
491
		  lin tem = length >> 2;
492

493
		  /* MINIMUM is approximate square root of LENGTH/4.
		     A subrun of two or more provisionals can stand
		     when LENGTH is at least 16.
		     A subrun of 4 or more can stand when LENGTH >= 64.  */
497
		  while (0 < (tem >>= 2))
498
		    minimum <<= 1;
499
		  minimum++;
500

501
		  /* Cancel any subrun of MINIMUM or more provisionals
		     within the larger run.  */
503
		  for (j = 0, consec = 0; j < length; j++)
504
		    if (discards[i + j] != 2)
505
		      consec = 0;
506
		    else if (minimum == ++consec)
507
		      /* Back up to start of subrun, to cancel it all.  */
508
		      j -= consec;
509
		    else if (minimum < consec)
510
		      discards[i + j] = 0;
511

512
		  /* Scan from beginning of run
		     until we find 3 or more nonprovisionals in a row
		     or until the first nonprovisional at least 8 lines in.
		     Until that point, cancel any provisionals.  */
516
		  for (j = 0, consec = 0; j < length; j++)
517
		    {
518
		      if (j >= 8 && discards[i + j] == 1)
519
			break;
520
		      if (discards[i + j] == 2)
521
			consec = 0, discards[i + j] = 0;
522
		      else if (discards[i + j] == 0)
523
			consec = 0;
524
		      else
525
			consec++;
526
		      if (consec == 3)
527
			break;
528
		    }
529

530
		  /* I advances to the last line of the run.  */
531
		  i += length - 1;
532

533
		  /* Same thing, from end.  */
534
		  for (j = 0, consec = 0; j < length; j++)
535
		    {
536
		      if (j >= 8 && discards[i - j] == 1)
537
			break;
538
		      if (discards[i - j] == 2)
539
			consec = 0, discards[i - j] = 0;
540
		      else if (discards[i - j] == 0)
541
			consec = 0;
542
		      else
543
			consec++;
544
		      if (consec == 3)
545
			break;
546
		    }
547
		}
548
	    }
549
	}
550
    }
551

552
  /* Actually discard the lines.  When the global MINIMAL is set every
     line is kept, whatever its mark.  J counts the lines kept so far
     and so is the next free slot in the two result arrays.  */
553
  for (f = 0; f < 2; f++)
554
    {
555
      char *discards = discarded[f];
556
      lin end = filevec[f].buffered_lines;
557
      lin j = 0;
558
      for (i = 0; i < end; ++i)
559
	if (minimal || discards[i] == 0)
560
	  {
561
	    filevec[f].undiscarded[j] = filevec[f].equivs[i];
562
	    filevec[f].realindexes[j++] = i;
563
	  }
564
	else
565
	  filevec[f].changed[i] = 1;
566
      filevec[f].nondiscarded_lines = j;
567
    }
568

569
  /* Each of these pointers is the base of a block shared by both
     files, so one free per block suffices.  */
  free (discarded[0]);
570
  free (equiv_count[0]);
571
}
572

573
/* Adjust inserts/deletes of identical lines to join changes
574
   as much as possible.
575

576
   We do something when a run of changed lines include a
577
   line at one end and have an excluded, identical line at the other.
578
   We are free to choose which identical line is included.
579
   `compareseq' usually chooses the one at the beginning,
580
   but usually it is cleaner to consider the following identical line
581
   to be the "change".  */
582

583
/* FILEVEC is the pair of files; the `changed' flags of both are read
   and those of the file currently being processed are rewritten in
   place.  Several scans below have no explicit bound and stop only on
   a zero flag.  NOTE(review): this relies on the zero elements the
   caller allocates at each end of each `changed' vector -- confirm.  */
static void
584
shift_boundaries (struct file_data filevec[])
585
{
586
  int f;
587

588
  /* Process each file in turn, using the other one only to track the
     corresponding position.  */
  for (f = 0; f < 2; f++)
589
    {
590
      char *changed = filevec[f].changed;
591
      char *other_changed = filevec[1 - f].changed;
592
      lin const *equivs = filevec[f].equivs;
593
      lin i = 0;	/* Current line in this file.  */
594
      lin j = 0;	/* Corresponding line in the other file.  */
595
      lin i_end = filevec[f].buffered_lines;
596

597
      while (1)
598
	{
599
	  lin runlength, start, corresponding;
600

601
	  /* Scan forwards to find beginning of another run of changes.
	     Also keep track of the corresponding point in the other file.  */
603

604
	  while (i < i_end && !changed[i])
605
	    {
606
	      while (other_changed[j++])
607
		continue;
608
	      i++;
609
	    }
610

611
	  /* No further run of changes in this file.  */
	  if (i == i_end)
612
	    break;
613

614
	  start = i;
615

616
	  /* Find the end of this run of changes.
	     Afterwards the run occupies [START, I).  */
617

618
	  while (changed[++i])
619
	    continue;
620
	  while (other_changed[j])
621
	    j++;
622

623
	  /* Repeat the backward and forward shifts until a full pass
	     leaves the length of the run unchanged.  */
	  do
624
	    {
625
	      /* Record the length of this run of changes, so that
		 we can later determine whether the run has grown.  */
627
	      runlength = i - start;
628

629
	      /* Move the changed region back, so long as the
		 previous unchanged line matches the last changed one.
		 This merges with previous changed regions.  */
632

633
	      while (start && equivs[start - 1] == equivs[i - 1])
634
		{
635
		  changed[--start] = 1;
636
		  changed[--i] = 0;
637
		  while (changed[start - 1])
638
		    start--;
639
		  while (other_changed[--j])
640
		    continue;
641
		}
642

643
	      /* Set CORRESPONDING to the end of the changed run, at the last
		 point where it corresponds to a changed run in the other file.
		 CORRESPONDING == I_END means no such point has been found.  */
646
	      corresponding = other_changed[j - 1] ? i : i_end;
647

648
	      /* Move the changed region forward, so long as the
		 first changed line matches the following unchanged one.
		 This merges with following changed regions.
		 Do this second, so that if there are no merges,
		 the changed region is moved forward as far as possible.  */
653

654
	      while (i != i_end && equivs[start] == equivs[i])
655
		{
656
		  changed[start++] = 0;
657
		  changed[i++] = 1;
658
		  while (changed[i])
659
		    i++;
660
		  while (other_changed[++j])
661
		    corresponding = i;
662
		}
663
	    }
664
	  while (runlength != i - start);
665

666
	  /* If possible, move the fully-merged run of changes
	     back to a corresponding run in the other file.  */
668

669
	  while (corresponding < i)
670
	    {
671
	      changed[--start] = 1;
672
	      changed[--i] = 0;
673
	      while (other_changed[--j])
674
		continue;
675
	    }
676
	}
677
    }
678
}
679

680
/* Cons an additional entry onto the front of an edit script OLD.
681
   LINE0 and LINE1 are the first affected lines in the two files (origin 0).
682
   DELETED is the number of lines deleted here from file 0.
683
   INSERTED is the number of lines inserted here in file 1.
684

685
   If DELETED is 0 then LINE0 is the number of the line before
686
   which the insertion was done; vice versa for INSERTED and LINE1.  */
687

688
static struct change *
689
add_change (lin line0, lin line1, lin deleted, lin inserted,
690
	    struct change *old)
691
{
692
  struct change *new = xmalloc (sizeof *new);
693

694
  new->line0 = line0;
695
  new->line1 = line1;
696
  new->inserted = inserted;
697
  new->deleted = deleted;
698
  new->link = old;
699
  return new;
700
}
701

702
/* Scan the tables of which lines are inserted and deleted,
703
   producing an edit script in reverse order.  */
704

705
static struct change *
706
build_reverse_script (struct file_data const filevec[])
707
{
708
  struct change *script = 0;
709
  char *changed0 = filevec[0].changed;
710
  char *changed1 = filevec[1].changed;
711
  lin len0 = filevec[0].buffered_lines;
712
  lin len1 = filevec[1].buffered_lines;
713

714
  /* Note that changedN[len0] does exist, and is 0.  */
715

716
  lin i0 = 0, i1 = 0;
717

718
  while (i0 < len0 || i1 < len1)
719
    {
720
      if (changed0[i0] | changed1[i1])
721
	{
722
	  lin line0 = i0, line1 = i1;
723

724
	  /* Find # lines changed here in each file.  */
725
	  while (changed0[i0]) ++i0;
726
	  while (changed1[i1]) ++i1;
727

728
	  /* Record this change.  */
729
	  script = add_change (line0, line1, i0 - line0, i1 - line1, script);
730
	}
731

732
      /* We have reached lines in the two files that match each other.  */
733
      i0++, i1++;
734
    }
735

736
  return script;
737
}
738

739
/* Scan the tables of which lines are inserted and deleted,
740
   producing an edit script in forward order.  */
741

742
static struct change *
743
build_script (struct file_data const filevec[])
744
{
745
  struct change *script = 0;
746
  char *changed0 = filevec[0].changed;
747
  char *changed1 = filevec[1].changed;
748
  lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
749

750
  /* Note that changedN[-1] does exist, and is 0.  */
751

752
  while (i0 >= 0 || i1 >= 0)
753
    {
754
      if (changed0[i0 - 1] | changed1[i1 - 1])
755
	{
756
	  lin line0 = i0, line1 = i1;
757

758
	  /* Find # lines changed here in each file.  */
759
	  while (changed0[i0 - 1]) --i0;
760
	  while (changed1[i1 - 1]) --i1;
761

762
	  /* Record this change.  */
763
	  script = add_change (i0, i1, line0 - i0, line1 - i1, script);
764
	}
765

766
      /* We have reached lines in the two files that match each other.  */
767
      i0--, i1--;
768
    }
769

770
  return script;
771
}
772

773
/* If CHANGES, briefly report that two files differed.
774
   Return 2 if trouble, CHANGES otherwise.  */
775
static int
776
briefly_report (int changes, struct file_data const filevec[])
777
{
778
  if (changes)
779
    {
780
      char const *label0 = file_label[0] ? file_label[0] : filevec[0].name;
781
      char const *label1 = file_label[1] ? file_label[1] : filevec[1].name;
782
      message ("Files %s and %s differ\n", label0, label1);
783
      if (! brief)
784
	changes = 2;
785
    }
786

787
  return changes;
788
}
789

790
/* Report the differences of two files.  */
791
/* CMP describes the two files to compare.  The value returned is 0
   when no difference was found and 1 when the files differ; it is 2
   when briefly_report turns a difference into 2, or when a file lacks
   a final newline and ROBUST_OUTPUT_STYLE is false for the output
   style.  The buffers hanging off CMP->file are freed before
   returning.  */
int
792
diff_2_files (struct comparison *cmp)
793
{
794
  lin diags;
795
  int f;
796
  struct change *e, *p;
797
  struct change *script;
798
  int changes;
799

800

801
  /* If we have detected that either file is binary,
     compare the two files as binary.  This can happen
     only when the first chunk is read.
     Also, --brief without any --ignore-* options means
     we can speed things up by treating the files as binary.  */
806

807
  if (read_files (cmp->file, files_can_be_treated_as_binary))
808
    {
809
      /* Files with different lengths must be different.  */
810
      if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
811
	  && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
812
	  && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
813
	changes = 1;
814

815
      /* Standard input equals itself.  */
816
      else if (cmp->file[0].desc == cmp->file[1].desc)
817
	changes = 0;
818

819
      else
820
	/* Scan both files, a buffer at a time, looking for a difference.  */
821
	{
822
	  /* Allocate same-sized buffers for both files.  */
823
	  size_t lcm_max = PTRDIFF_MAX - 1;
824
	  size_t buffer_size =
825
	    buffer_lcm (sizeof (word),
826
			buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
827
				    STAT_BLOCKSIZE (cmp->file[1].stat),
828
				    lcm_max),
829
			lcm_max);
830
	  for (f = 0; f < 2; f++)
831
	    cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
832

833
	  /* The first pass compares what is already buffered; each
	     later pass starts with both buffers reset to empty.  */
	  for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
834
	    {
835
	      /* Read a buffer's worth from both files.  */
836
	      for (f = 0; f < 2; f++)
837
		if (0 <= cmp->file[f].desc)
838
		  file_block_read (&cmp->file[f],
839
				   buffer_size - cmp->file[f].buffered);
840

841
	      /* If the buffers differ, the files differ.  */
842
	      if (cmp->file[0].buffered != cmp->file[1].buffered
843
		  || memcmp (cmp->file[0].buffer,
844
			     cmp->file[1].buffer,
845
			     cmp->file[0].buffered))
846
		{
847
		  changes = 1;
848
		  break;
849
		}
850

851
	      /* If we reach end of file, the files are the same.  */
852
	      if (cmp->file[0].buffered != buffer_size)
853
		{
854
		  changes = 0;
855
		  break;
856
		}
857
	    }
858
	}
859

860
      changes = briefly_report (changes, cmp->file);
861
    }
862
  else
863
    {
864
      /* Allocate vectors for the results of comparison:
	 a flag for each line of each file, saying whether that line
	 is an insertion or deletion.
	 Allocate an extra element, always 0, at each end of each vector.
	 Both vectors are carved out of the single zero-filled block
	 FLAG_SPACE.  */
868

869
      size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
870
      char *flag_space = zalloc (s);
871
      cmp->file[0].changed = flag_space + 1;
872
      cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
873

874
      /* Some lines are obviously insertions or deletions
	 because they don't match anything.  Detect them now, and
	 avoid even thinking about them in the main comparison algorithm.  */
877

878
      discard_confusing_lines (cmp->file);
879

880
      /* Now do the main comparison algorithm, considering just the
	 undiscarded lines.  */
882

883
      xvec = cmp->file[0].undiscarded;
884
      yvec = cmp->file[1].undiscarded;
885
      diags = (cmp->file[0].nondiscarded_lines
886
	       + cmp->file[1].nondiscarded_lines + 3);
887
      /* One allocation holds both diagonal vectors, DIAGS elements
	 each.  Diagonal numbers can be negative, so both base pointers
	 are advanced by file 1's line count plus one; the free below
	 undoes that offset.  */
      fdiag = xmalloc (diags * (2 * sizeof *fdiag));
888
      bdiag = fdiag + diags;
889
      fdiag += cmp->file[1].nondiscarded_lines + 1;
890
      bdiag += cmp->file[1].nondiscarded_lines + 1;
891

892
      /* Set TOO_EXPENSIVE to be approximate square root of input size,
	 bounded below by 256.  */
894
      too_expensive = 1;
895
      for (;  diags != 0;  diags >>= 2)
896
	too_expensive <<= 1;
897
      too_expensive = MAX (256, too_expensive);
898

899
      /* compareseq records its results through the global FILES.  */
      files[0] = cmp->file[0];
900
      files[1] = cmp->file[1];
901

902
      compareseq (0, cmp->file[0].nondiscarded_lines,
903
		  0, cmp->file[1].nondiscarded_lines, minimal);
904

905
      free (fdiag - (cmp->file[1].nondiscarded_lines + 1));
906

907
      /* Modify the results slightly to make them prettier
	 in cases where that can validly be done.  */
909

910
      shift_boundaries (cmp->file);
911

912
      /* Get the results of comparison in the form of a chain
	 of `struct change's -- an edit script.  */
914

915
      if (output_style == OUTPUT_ED)
916
	script = build_reverse_script (cmp->file);
917
      else
918
	script = build_script (cmp->file);
919

920
      /* Set CHANGES if we had any diffs.
	 If some changes are ignored, we must scan the script to decide.  */
922
      if (ignore_blank_lines || ignore_regexp.fastmap)
923
	{
924
	  struct change *next = script;
925
	  changes = 0;
926

927
	  while (next && changes == 0)
928
	    {
929
	      struct change *this, *end;
930
	      lin first0, last0, first1, last1;
931

932
	      /* Find a set of changes that belong together.  */
933
	      this = next;
934
	      end = find_change (next);
935

936
	      /* Disconnect them from the rest of the changes, making them
		 a hunk, and remember the rest for next iteration.  */
938
	      next = end->link;
939
	      end->link = 0;
940

941
	      /* Determine whether this hunk is really a difference.  */
942
	      if (analyze_hunk (this, &first0, &last0, &first1, &last1))
943
		changes = 1;
944

945
	      /* Reconnect the script so it will all be freed properly.  */
946
	      end->link = next;
947
	    }
948
	}
949
      else
950
	changes = (script != 0);
951

952
      if (brief)
953
	changes = briefly_report (changes, cmp->file);
954
      else
955
	{
956
	  if (changes | !no_diff_means_no_output)
957
	    {
958
	      /* Record info for starting up output,
		 to be used if and when we have some output to print.  */
960
	      setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
961
			    file_label[1] ? file_label[1] : cmp->file[1].name,
962
			    cmp->parent != 0);
963

964
	      /* Hand the script to the printer for the selected output
		 style; an unrecognized style aborts.  */
	      switch (output_style)
965
		{
966
		case OUTPUT_CONTEXT:
967
		  print_context_script (script, false);
968
		  break;
969

970
		case OUTPUT_UNIFIED:
971
		  print_context_script (script, true);
972
		  break;
973

974
		case OUTPUT_ED:
975
		  print_ed_script (script);
976
		  break;
977

978
		case OUTPUT_FORWARD_ED:
979
		  pr_forward_ed_script (script);
980
		  break;
981

982
		case OUTPUT_RCS:
983
		  print_rcs_script (script);
984
		  break;
985

986
		case OUTPUT_NORMAL:
987
		  print_normal_script (script);
988
		  break;
989

990
		case OUTPUT_IFDEF:
991
		  print_ifdef_script (script);
992
		  break;
993

994
		case OUTPUT_SDIFF:
995
		  print_sdiff_script (script);
996
		  break;
997

998
		default:
999
		  abort ();
1000
		}
1001

1002
	      finish_output ();
1003
	    }
1004
	}
1005

1006
      /* discard_confusing_lines put the undiscarded and realindexes
	 arrays of both files in one block based at this pointer.  */
      free (cmp->file[0].undiscarded);
1007

1008
      free (flag_space);
1009

1010
      for (f = 0; f < 2; f++)
1011
	{
1012
	  free (cmp->file[f].equivs);
1013
	  free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
1014
	}
1015

1016
      /* Free the edit script, saving each link before its node goes.  */
      for (e = script; e; e = p)
1017
	{
1018
	  p = e->link;
1019
	  free (e);
1020
	}
1021

1022
      if (! ROBUST_OUTPUT_STYLE (output_style))
1023
	for (f = 0; f < 2; ++f)
1024
	  if (cmp->file[f].missing_newline)
1025
	    {
1026
	      error (0, 0, "%s: %s\n",
1027
		     file_label[f] ? file_label[f] : cmp->file[f].name,
1028
		     _("No newline at end of file"));
1029
	      changes = 2;
1030
	    }
1031
    }
1032

1033
  /* The two files may share one buffer; free it only once then.  */
  if (cmp->file[0].buffer != cmp->file[1].buffer)
1034
    free (cmp->file[0].buffer);
1035
  free (cmp->file[1].buffer);
1036

1037
  return changes;
1038
}
1039

1040
Product

Resources

Company