CoCalc -- diff.c

GitHub Repository: freebsd/pkg
Path: blob/main/libpkg/diff.c
²⁰⁶⁵ views
1
/*
2
** Copyright (c) 2007 D. Richard Hipp
3
**
4
** This program is free software; you can redistribute it and/or
5
** modify it under the terms of the Simplified BSD License (also
6
** known as the "2-Clause License" or "FreeBSD License".)
7

8
** This program is distributed in the hope that it will be useful,
9
** but without any warranty; without even the implied warranty of
10
** merchantability or fitness for a particular purpose.
11
**
12
** Author contact information:
13
**   [email protected]
14
**   http://www.hwaci.com/drh/
15
**
16
*******************************************************************************
17
**
18
** This file contains code used to compute a "diff" between two
19
** text files.
20
*/
21
#include <sys/types.h>
22

23
#include <string.h>
24
#include <stdlib.h>
25

26
#include "private/utils.h"
27
#include "xmalloc.h"
28

29
/*
30
** Maximum length of a line in a text file, in bytes.  (2**13 = 8192 bytes)
31
*/
32
#define LENGTH_MASK_SZ  13
33
#define LENGTH_MASK     ((1<<LENGTH_MASK_SZ)-1)
34

35
/*
36
** Information about each line of a file being diffed.
37
**
38
** The lower LENGTH_MASK_SZ bits of the hash (DLine.h) are the length
39
** of the line.  If any line is longer than LENGTH_MASK characters,
40
** the file is considered binary.
41
*/
42
typedef struct DLine DLine;
43
struct DLine {
44
  const char *z;        /* The text of the line */
45
  unsigned int h;       /* Hash of the line */
46
  unsigned short indent;  /* Indent of the line. Only !=0 with -w/-Z option */
47
  unsigned short n;     /* number of bytes */
48
  unsigned int iNext;   /* 1+(Index of next line with same the same hash) */
49

50
  /* an array of DLine elements serves two purposes.  The fields
51
  ** above are one per line of input text.  But each entry is also
52
  ** a bucket in a hash table, as follows: */
53
  unsigned int iHash;   /* 1+(first entry in the hash chain) */
54
};
55

56
/*
57
** Length of a dline
58
*/
59
#define LENGTH(X)   ((X)->n)
60

61
/*
62
** A context for running a raw diff.
63
**
64
** The aEdit[] array describes the raw diff.  Each triple of integers in
65
** aEdit[] means:
66
**
67
**   (1) COPY:   Number of lines aFrom and aTo have in common
68
**   (2) DELETE: Number of lines found only in aFrom
69
**   (3) INSERT: Number of lines found only in aTo
70
**
71
** The triples repeat until all lines of both aFrom and aTo are accounted
72
** for.
73
*/
74
typedef struct DContext DContext;
75
struct DContext {
76
  int *aEdit;        /* Array of copy/delete/insert triples */
77
  int nEdit;         /* Number of integers (3x num of triples) in aEdit[] */
78
  int nEditAlloc;    /* Space allocated for aEdit[] */
79
  DLine *aFrom;      /* File on left side of the diff */
80
  int nFrom;         /* Number of lines in aFrom[] */
81
  DLine *aTo;        /* File on right side of the diff */
82
  int nTo;           /* Number of lines in aTo[] */
83
  int (*same_fn)(const DLine *, const DLine *); /* Function to be used for comparing */
84
};
85

86
/*
87
** Return an array of DLine objects containing a pointer to the
88
** start of each line and a hash of that line.  The lower
89
** bits of the hash store the length of each line.
90
**
91
** Trailing whitespace is removed from each line.  2010-08-20:  Not any
92
** more.  If trailing whitespace is ignored, the "patch" command gets
93
** confused by the diff output.  Ticket [a9f7b23c2e376af5b0e5b]
94
**
95
** Return 0 if the file is binary or contains a line that is
96
** too long.
97
**
98
** Profiling show that in most cases this routine consumes the bulk of
99
** the CPU time on a diff.
100
*/
101
static DLine *break_into_lines(char *z, int *pnLine){
102
  int nLine, i, j, k, s, x;
103
  unsigned int h, h2;
104
  DLine *a;
105

106
  int n = strlen(z);
107
  /* Count the number of lines.  Allocate space to hold
108
  ** the returned array.
109
  */
110
  for(i=j=0, nLine=1; i<n; i++, j++){
111
    int c = z[i];
112
    if( c==0 ){
113
      return 0;
114
    }
115
    if( c=='\n' && z[i+1]!=0 ){
116
      nLine++;
117
      if( j>LENGTH_MASK ){
118
        return 0;
119
      }
120
      j = 0;
121
    }
122
  }
123
  if( j>LENGTH_MASK ){
124
    return 0;
125
  }
126
  a = xcalloc(nLine, sizeof(a[0]) );
127
  if( n==0 ){
128
    *pnLine = 0;
129
    return a;
130
  }
131

132
  /* Fill in the array */
133
  for(i=0; i<nLine; i++){
134
    for(j=0; z[j] && z[j]!='\n'; j++){}
135
    a[i].z = z;
136
    k = j;
137
    a[i].n = k;
138
    s = 0;
139
    for(h=0, x=s; x<k; x++){
140
      h = h ^ (h<<2) ^ z[x];
141
    }
142
    a[i].indent = s;
143
    a[i].h = h = (h<<LENGTH_MASK_SZ) | (k-s);
144
    h2 = h % nLine;
145
    a[i].iNext = a[h2].iHash;
146
    a[h2].iHash = i+1;
147
    z += j+1;
148
  }
149

150
  /* Return results */
151
  *pnLine = nLine;
152
  return a;
153
}
154

155
/*
156
** Return true if two DLine elements are identical.
157
*/
158
static int same_dline(const DLine *pA, const DLine *pB){
159
  return pA->h==pB->h && memcmp(pA->z,pB->z, pA->h&LENGTH_MASK)==0;
160
}
161

162

163
/*
164
** Minimum of two values
165
*/
166
static int minInt(int a, int b){ return a<b ? a : b; }
167

168
/*
169
** Compute the optimal longest common subsequence (LCS) using an
170
** exhaustive search.  This version of the LCS is only used for
171
** shorter input strings since runtime is O(N*N) where N is the
172
** input string length.
173
*/
174
static void optimalLCS(
175
  DContext *p,               /* Two files being compared */
176
  int iS1, int iE1,          /* Range of lines in p->aFrom[] */
177
  int iS2, int iE2,          /* Range of lines in p->aTo[] */
178
  int *piSX, int *piEX,      /* Write p->aFrom[] common segment here */
179
  int *piSY, int *piEY       /* Write p->aTo[] common segment here */
180
){
181
  int mxLength = 0;          /* Length of longest common subsequence */
182
  int i, j;                  /* Loop counters */
183
  int k;                     /* Length of a candidate subsequence */
184
  int iSXb = iS1;            /* Best match so far */
185
  int iSYb = iS2;            /* Best match so far */
186

187
  for(i=iS1; i<iE1-mxLength; i++){
188
    for(j=iS2; j<iE2-mxLength; j++){
189
      if( !p->same_fn(&p->aFrom[i], &p->aTo[j]) ) continue;
190
      if( mxLength && !p->same_fn(&p->aFrom[i+mxLength], &p->aTo[j+mxLength]) ){
191
        continue;
192
      }
193
      k = 1;
194
      while( i+k<iE1 && j+k<iE2 && p->same_fn(&p->aFrom[i+k],&p->aTo[j+k]) ){
195
        k++;
196
      }
197
      if( k>mxLength ){
198
        iSXb = i;
199
        iSYb = j;
200
        mxLength = k;
201
      }
202
    }
203
  }
204
  *piSX = iSXb;
205
  *piEX = iSXb + mxLength;
206
  *piSY = iSYb;
207
  *piEY = iSYb + mxLength;
208
}
209

210
/*
211
** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]
212
** file and lines iS2 through iE2-1 of the aTo[] file.  Locate a sequence
213
** of lines in these two blocks that are exactly the same.  Return
214
** the bounds of the matching sequence.
215
**
216
** If there are two or more possible answers of the same length, the
217
** returned sequence should be the one closest to the center of the
218
** input range.
219
**
220
** Ideally, the common sequence should be the longest possible common
221
** sequence.  However, an exact computation of LCS is O(N*N) which is
222
** way too slow for larger files.  So this routine uses an O(N)
223
** heuristic approximation based on hashing that usually works about
224
** as well.  But if the O(N) algorithm doesn't get a good solution
225
** and N is not too large, we fall back to an exact solution by
226
** calling optimalLCS().
227
*/
228
static void longestCommonSequence(
229
  DContext *p,               /* Two files being compared */
230
  int iS1, int iE1,          /* Range of lines in p->aFrom[] */
231
  int iS2, int iE2,          /* Range of lines in p->aTo[] */
232
  int *piSX, int *piEX,      /* Write p->aFrom[] common segment here */
233
  int *piSY, int *piEY       /* Write p->aTo[] common segment here */
234
){
235
  int i, j, k;               /* Loop counters */
236
  int n;                     /* Loop limit */
237
  DLine *pA, *pB;            /* Pointers to lines */
238
  int iSX, iSY, iEX, iEY;    /* Current match */
239
  int skew = 0;              /* How lopsided is the match */
240
  int dist = 0;              /* Distance of match from center */
241
  int mid;                   /* Center of the span */
242
  int iSXb, iSYb, iEXb, iEYb;   /* Best match so far */
243
  int iSXp, iSYp, iEXp, iEYp;   /* Previous match */
244
  int64_t bestScore;      /* Best score so far */
245
  int64_t score;          /* Score for current candidate LCS */
246
  int span;                     /* combined width of the input sequences */
247

248
  span = (iE1 - iS1) + (iE2 - iS2);
249
  bestScore = -10000;
250
  iSXb = iSXp = iS1;
251
  iEXb = iEXp = iS1;
252
  iSYb = iSYp = iS2;
253
  iEYb = iEYp = iS2;
254
  mid = (iE1 + iS1)/2;
255
  for(i=iS1; i<iE1; i++){
256
    int limit = 0;
257
    j = p->aTo[p->aFrom[i].h % p->nTo].iHash;
258
    while( j>0
259
      && (j-1<iS2 || j>=iE2 || !p->same_fn(&p->aFrom[i], &p->aTo[j-1]))
260
    ){
261
      if( limit++ > 10 ){
262
        j = 0;
263
        break;
264
      }
265
      j = p->aTo[j-1].iNext;
266
    }
267
    if( j==0 ) continue;
268
    if( i<iEXb && j>=iSYb && j<iEYb ) continue;
269
    if( i<iEXp && j>=iSYp && j<iEYp ) continue;
270
    iSX = i;
271
    iSY = j-1;
272
    pA = &p->aFrom[iSX-1];
273
    pB = &p->aTo[iSY-1];
274
    n = minInt(iSX-iS1, iSY-iS2);
275
    for(k=0; k<n && p->same_fn(pA,pB); k++, pA--, pB--){}
276
    iSX -= k;
277
    iSY -= k;
278
    iEX = i+1;
279
    iEY = j;
280
    pA = &p->aFrom[iEX];
281
    pB = &p->aTo[iEY];
282
    n = minInt(iE1-iEX, iE2-iEY);
283
    for(k=0; k<n && p->same_fn(pA,pB); k++, pA++, pB++){}
284
    iEX += k;
285
    iEY += k;
286
    skew = (iSX-iS1) - (iSY-iS2);
287
    if( skew<0 ) skew = -skew;
288
    dist = (iSX+iEX)/2 - mid;
289
    if( dist<0 ) dist = -dist;
290
    score = (iEX - iSX)*(int64_t)span - (skew + dist);
291
    if( score>bestScore ){
292
      bestScore = score;
293
      iSXb = iSX;
294
      iSYb = iSY;
295
      iEXb = iEX;
296
      iEYb = iEY;
297
    }else if( iEX>iEXp ){
298
      iSXp = iSX;
299
      iSYp = iSY;
300
      iEXp = iEX;
301
      iEYp = iEY;
302
    }
303
  }
304
  if( iSXb==iEXb && (iE1-iS1)*(iE2-iS2)<400 ){
305
    /* If no common sequence is found using the hashing heuristic and
306
    ** the input is not too big, use the expensive exact solution */
307
    optimalLCS(p, iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY);
308
  }else{
309
    *piSX = iSXb;
310
    *piSY = iSYb;
311
    *piEX = iEXb;
312
    *piEY = iEYb;
313
  }
314
}
315

316
/*
317
** Expand the size of aEdit[] array to hold at least nEdit elements.
318
*/
319
static void expandEdit(DContext *p, int nEdit){
320
  p->aEdit = xrealloc(p->aEdit, nEdit*sizeof(int));
321
  p->nEditAlloc = nEdit;
322
}
323

324
/*
325
** Append a new COPY/DELETE/INSERT triple.
326
*/
327
static void appendTriple(DContext *p, int nCopy, int nDel, int nIns){
328
  /* printf("APPEND %d/%d/%d\n", nCopy, nDel, nIns); */
329
  if( p->nEdit>=3 ){
330
    if( p->aEdit[p->nEdit-1]==0 ){
331
      if( p->aEdit[p->nEdit-2]==0 ){
332
        p->aEdit[p->nEdit-3] += nCopy;
333
        p->aEdit[p->nEdit-2] += nDel;
334
        p->aEdit[p->nEdit-1] += nIns;
335
        return;
336
      }
337
      if( nCopy==0 ){
338
        p->aEdit[p->nEdit-2] += nDel;
339
        p->aEdit[p->nEdit-1] += nIns;
340
        return;
341
      }
342
    }
343
    if( nCopy==0 && nDel==0 ){
344
      p->aEdit[p->nEdit-1] += nIns;
345
      return;
346
    }
347
  }
348
  if( p->nEdit+3>p->nEditAlloc ){
349
    expandEdit(p, p->nEdit*2 + 15);
350
    if( p->aEdit==0 ) return;
351
  }
352
  p->aEdit[p->nEdit++] = nCopy;
353
  p->aEdit[p->nEdit++] = nDel;
354
  p->aEdit[p->nEdit++] = nIns;
355
}
356

357
/*
358
** Do a single step in the difference.  Compute a sequence of
359
** copy/delete/insert steps that will convert lines iS1 through iE1-1 of
360
** the input into lines iS2 through iE2-1 of the output and write
361
** that sequence into the difference context.
362
**
363
** The algorithm is to find a block of common text near the middle
364
** of the two segments being diffed.  Then recursively compute
365
** differences on the blocks before and after that common segment.
366
** Special cases apply if either input segment is empty or if the
367
** two segments have no text in common.
368
*/
369
static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){
370
  int iSX, iEX, iSY, iEY;
371

372
  if( iE1<=iS1 ){
373
    /* The first segment is empty */
374
    if( iE2>iS2 ){
375
      appendTriple(p, 0, 0, iE2-iS2);
376
    }
377
    return;
378
  }
379
  if( iE2<=iS2 ){
380
    /* The second segment is empty */
381
    appendTriple(p, 0, iE1-iS1, 0);
382
    return;
383
  }
384

385
  /* Find the longest matching segment between the two sequences */
386
  longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);
387

388
  if( iEX>iSX ){
389
    /* A common segment has been found.
390
    ** Recursively diff either side of the matching segment */
391
    diff_step(p, iS1, iSX, iS2, iSY);
392
    if( iEX>iSX ){
393
      appendTriple(p, iEX - iSX, 0, 0);
394
    }
395
    diff_step(p, iEX, iE1, iEY, iE2);
396
  }else{
397
    /* The two segments have nothing in common.  Delete the first then
398
    ** insert the second. */
399
    appendTriple(p, 0, iE1-iS1, iE2-iS2);
400
  }
401
}
402

403
/*
404
** Compute the differences between two files already loaded into
405
** the DContext structure.
406
**
407
** A divide and conquer technique is used.  We look for a large
408
** block of common text that is in the middle of both files.  Then
409
** compute the difference on those parts of the file before and
410
** after the common block.  This technique is fast, but it does
411
** not necessarily generate the minimum difference set.  On the
412
** other hand, we do not need a minimum difference set, only one
413
** that makes sense to human readers, which this algorithm does.
414
**
415
** Any common text at the beginning and end of the two files is
416
** removed before starting the divide-and-conquer algorithm.
417
*/
418
static void diff_all(DContext *p){
419
  int mnE, iS, iE1, iE2;
420

421
  /* Carve off the common header and footer */
422
  iE1 = p->nFrom;
423
  iE2 = p->nTo;
424
  while( iE1>0 && iE2>0 && p->same_fn(&p->aFrom[iE1-1], &p->aTo[iE2-1]) ){
425
    iE1--;
426
    iE2--;
427
  }
428
  mnE = iE1<iE2 ? iE1 : iE2;
429
  for(iS=0; iS<mnE && p->same_fn(&p->aFrom[iS],&p->aTo[iS]); iS++){}
430

431
  /* do the difference */
432
  if( iS>0 ){
433
    appendTriple(p, iS, 0, 0);
434
  }
435
  diff_step(p, iS, iE1, iS, iE2);
436
  if( iE1<p->nFrom ){
437
    appendTriple(p, p->nFrom - iE1, 0, 0);
438
  }
439

440
  /* Terminate the COPY/DELETE/INSERT triples with three zeros */
441
  expandEdit(p, p->nEdit+3);
442
  if( p->aEdit ){
443
    p->aEdit[p->nEdit++] = 0;
444
    p->aEdit[p->nEdit++] = 0;
445
    p->aEdit[p->nEdit++] = 0;
446
  }
447
}
448

449
/*
450
** Generate a report of the differences between files pA and pB.
451
** If pOut is not NULL then a unified diff is appended there.  It
452
** is assumed that pOut has already been initialized.  If pOut is
453
** NULL, then a pointer to an array of integers is returned.
454
** The integers come in triples.  For each triple,
455
** the elements are the number of lines copied, the number of
456
** lines deleted, and the number of lines inserted.  The vector
457
** is terminated by a triple of all zeros.
458
**
459
** This diff utility does not work on binary files.  If a binary
460
** file is encountered, 0 is returned and pOut is written with
461
** text "cannot compute difference between binary files".
462
*/
463
int *
464
text_diff(
465
  char *pA,   /* FROM file */
466
  char *pB   /* TO file */
467
){
468
  DContext c;
469

470
  /* Prepare the input files */
471
  memset(&c, 0, sizeof(c));
472
  c.same_fn = same_dline;
473
  c.aFrom = break_into_lines(pA, &c.nFrom);
474
  c.aTo = break_into_lines(pB, &c.nTo);
475
  if( c.aFrom==0 || c.aTo==0 ){
476
    free(c.aFrom);
477
    free(c.aTo);
478
    return 0;
479
  }
480

481
  /* Compute the difference */
482
  diff_all(&c);
483
    /* If a context diff is not requested, then return the
484
    ** array of COPY/DELETE/INSERT triples.
485
    */
486
    free(c.aFrom);
487
    free(c.aTo);
488
    return c.aEdit;
489
}
490

491
Product

Resources

Company