/*1** Copyright (c) 2007 D. Richard Hipp2**3** This program is free software; you can redistribute it and/or4** modify it under the terms of the Simplified BSD License (also5** known as the "2-Clause License" or "FreeBSD License".)67** This program is distributed in the hope that it will be useful,8** but without any warranty; without even the implied warranty of9** merchantability or fitness for a particular purpose.10**11** Author contact information:12** [email protected]13** http://www.hwaci.com/drh/14**15*******************************************************************************16**17** This file contains code used to compute a "diff" between two18** text files.19*/20#include <sys/types.h>2122#include <string.h>23#include <stdlib.h>2425#include "private/utils.h"26#include "xmalloc.h"2728/*29** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)30*/31#define LENGTH_MASK_SZ 1332#define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1)3334/*35** Information about each line of a file being diffed.36**37** The lower LENGTH_MASK_SZ bits of the hash (DLine.h) are the length38** of the line. If any line is longer than LENGTH_MASK characters,39** the file is considered binary.40*/41typedef struct DLine DLine;42struct DLine {43const char *z; /* The text of the line */44unsigned int h; /* Hash of the line */45unsigned short indent; /* Indent of the line. Only !=0 with -w/-Z option */46unsigned short n; /* number of bytes */47unsigned int iNext; /* 1+(Index of next line with same the same hash) */4849/* an array of DLine elements serves two purposes. The fields50** above are one per line of input text. But each entry is also51** a bucket in a hash table, as follows: */52unsigned int iHash; /* 1+(first entry in the hash chain) */53};5455/*56** Length of a dline57*/58#define LENGTH(X) ((X)->n)5960/*61** A context for running a raw diff.62**63** The aEdit[] array describes the raw diff. Each triple of integers in64** aEdit[] means:65**66** (1) COPY: Number of lines aFrom and aTo have in common67** (2) DELETE: Number of lines found only in aFrom68** (3) INSERT: Number of lines found only in aTo69**70** The triples repeat until all lines of both aFrom and aTo are accounted71** for.72*/73typedef struct DContext DContext;74struct DContext {75int *aEdit; /* Array of copy/delete/insert triples */76int nEdit; /* Number of integers (3x num of triples) in aEdit[] */77int nEditAlloc; /* Space allocated for aEdit[] */78DLine *aFrom; /* File on left side of the diff */79int nFrom; /* Number of lines in aFrom[] */80DLine *aTo; /* File on right side of the diff */81int nTo; /* Number of lines in aTo[] */82int (*same_fn)(const DLine *, const DLine *); /* Function to be used for comparing */83};8485/*86** Return an array of DLine objects containing a pointer to the87** start of each line and a hash of that line. The lower88** bits of the hash store the length of each line.89**90** Trailing whitespace is removed from each line. 2010-08-20: Not any91** more. If trailing whitespace is ignored, the "patch" command gets92** confused by the diff output. Ticket [a9f7b23c2e376af5b0e5b]93**94** Return 0 if the file is binary or contains a line that is95** too long.96**97** Profiling show that in most cases this routine consumes the bulk of98** the CPU time on a diff.99*/100static DLine *break_into_lines(char *z, int *pnLine){101int nLine, i, j, k, s, x;102unsigned int h, h2;103DLine *a;104105int n = strlen(z);106/* Count the number of lines. Allocate space to hold107** the returned array.108*/109for(i=j=0, nLine=1; i<n; i++, j++){110int c = z[i];111if( c==0 ){112return 0;113}114if( c=='\n' && z[i+1]!=0 ){115nLine++;116if( j>LENGTH_MASK ){117return 0;118}119j = 0;120}121}122if( j>LENGTH_MASK ){123return 0;124}125a = xcalloc(nLine, sizeof(a[0]) );126if( n==0 ){127*pnLine = 0;128return a;129}130131/* Fill in the array */132for(i=0; i<nLine; i++){133for(j=0; z[j] && z[j]!='\n'; j++){}134a[i].z = z;135k = j;136a[i].n = k;137s = 0;138for(h=0, x=s; x<k; x++){139h = h ^ (h<<2) ^ z[x];140}141a[i].indent = s;142a[i].h = h = (h<<LENGTH_MASK_SZ) | (k-s);143h2 = h % nLine;144a[i].iNext = a[h2].iHash;145a[h2].iHash = i+1;146z += j+1;147}148149/* Return results */150*pnLine = nLine;151return a;152}153154/*155** Return true if two DLine elements are identical.156*/157static int same_dline(const DLine *pA, const DLine *pB){158return pA->h==pB->h && memcmp(pA->z,pB->z, pA->h&LENGTH_MASK)==0;159}160161162/*163** Minimum of two values164*/165static int minInt(int a, int b){ return a<b ? a : b; }166167/*168** Compute the optimal longest common subsequence (LCS) using an169** exhaustive search. This version of the LCS is only used for170** shorter input strings since runtime is O(N*N) where N is the171** input string length.172*/173static void optimalLCS(174DContext *p, /* Two files being compared */175int iS1, int iE1, /* Range of lines in p->aFrom[] */176int iS2, int iE2, /* Range of lines in p->aTo[] */177int *piSX, int *piEX, /* Write p->aFrom[] common segment here */178int *piSY, int *piEY /* Write p->aTo[] common segment here */179){180int mxLength = 0; /* Length of longest common subsequence */181int i, j; /* Loop counters */182int k; /* Length of a candidate subsequence */183int iSXb = iS1; /* Best match so far */184int iSYb = iS2; /* Best match so far */185186for(i=iS1; i<iE1-mxLength; i++){187for(j=iS2; j<iE2-mxLength; j++){188if( !p->same_fn(&p->aFrom[i], &p->aTo[j]) ) continue;189if( mxLength && !p->same_fn(&p->aFrom[i+mxLength], &p->aTo[j+mxLength]) ){190continue;191}192k = 1;193while( i+k<iE1 && j+k<iE2 && p->same_fn(&p->aFrom[i+k],&p->aTo[j+k]) ){194k++;195}196if( k>mxLength ){197iSXb = i;198iSYb = j;199mxLength = k;200}201}202}203*piSX = iSXb;204*piEX = iSXb + mxLength;205*piSY = iSYb;206*piEY = iSYb + mxLength;207}208209/*210** Compare two blocks of text on lines iS1 through iE1-1 of the aFrom[]211** file and lines iS2 through iE2-1 of the aTo[] file. Locate a sequence212** of lines in these two blocks that are exactly the same. Return213** the bounds of the matching sequence.214**215** If there are two or more possible answers of the same length, the216** returned sequence should be the one closest to the center of the217** input range.218**219** Ideally, the common sequence should be the longest possible common220** sequence. However, an exact computation of LCS is O(N*N) which is221** way too slow for larger files. So this routine uses an O(N)222** heuristic approximation based on hashing that usually works about223** as well. But if the O(N) algorithm doesn't get a good solution224** and N is not too large, we fall back to an exact solution by225** calling optimalLCS().226*/227static void longestCommonSequence(228DContext *p, /* Two files being compared */229int iS1, int iE1, /* Range of lines in p->aFrom[] */230int iS2, int iE2, /* Range of lines in p->aTo[] */231int *piSX, int *piEX, /* Write p->aFrom[] common segment here */232int *piSY, int *piEY /* Write p->aTo[] common segment here */233){234int i, j, k; /* Loop counters */235int n; /* Loop limit */236DLine *pA, *pB; /* Pointers to lines */237int iSX, iSY, iEX, iEY; /* Current match */238int skew = 0; /* How lopsided is the match */239int dist = 0; /* Distance of match from center */240int mid; /* Center of the span */241int iSXb, iSYb, iEXb, iEYb; /* Best match so far */242int iSXp, iSYp, iEXp, iEYp; /* Previous match */243int64_t bestScore; /* Best score so far */244int64_t score; /* Score for current candidate LCS */245int span; /* combined width of the input sequences */246247span = (iE1 - iS1) + (iE2 - iS2);248bestScore = -10000;249iSXb = iSXp = iS1;250iEXb = iEXp = iS1;251iSYb = iSYp = iS2;252iEYb = iEYp = iS2;253mid = (iE1 + iS1)/2;254for(i=iS1; i<iE1; i++){255int limit = 0;256j = p->aTo[p->aFrom[i].h % p->nTo].iHash;257while( j>0258&& (j-1<iS2 || j>=iE2 || !p->same_fn(&p->aFrom[i], &p->aTo[j-1]))259){260if( limit++ > 10 ){261j = 0;262break;263}264j = p->aTo[j-1].iNext;265}266if( j==0 ) continue;267if( i<iEXb && j>=iSYb && j<iEYb ) continue;268if( i<iEXp && j>=iSYp && j<iEYp ) continue;269iSX = i;270iSY = j-1;271pA = &p->aFrom[iSX-1];272pB = &p->aTo[iSY-1];273n = minInt(iSX-iS1, iSY-iS2);274for(k=0; k<n && p->same_fn(pA,pB); k++, pA--, pB--){}275iSX -= k;276iSY -= k;277iEX = i+1;278iEY = j;279pA = &p->aFrom[iEX];280pB = &p->aTo[iEY];281n = minInt(iE1-iEX, iE2-iEY);282for(k=0; k<n && p->same_fn(pA,pB); k++, pA++, pB++){}283iEX += k;284iEY += k;285skew = (iSX-iS1) - (iSY-iS2);286if( skew<0 ) skew = -skew;287dist = (iSX+iEX)/2 - mid;288if( dist<0 ) dist = -dist;289score = (iEX - iSX)*(int64_t)span - (skew + dist);290if( score>bestScore ){291bestScore = score;292iSXb = iSX;293iSYb = iSY;294iEXb = iEX;295iEYb = iEY;296}else if( iEX>iEXp ){297iSXp = iSX;298iSYp = iSY;299iEXp = iEX;300iEYp = iEY;301}302}303if( iSXb==iEXb && (iE1-iS1)*(iE2-iS2)<400 ){304/* If no common sequence is found using the hashing heuristic and305** the input is not too big, use the expensive exact solution */306optimalLCS(p, iS1, iE1, iS2, iE2, piSX, piEX, piSY, piEY);307}else{308*piSX = iSXb;309*piSY = iSYb;310*piEX = iEXb;311*piEY = iEYb;312}313}314315/*316** Expand the size of aEdit[] array to hold at least nEdit elements.317*/318static void expandEdit(DContext *p, int nEdit){319p->aEdit = xrealloc(p->aEdit, nEdit*sizeof(int));320p->nEditAlloc = nEdit;321}322323/*324** Append a new COPY/DELETE/INSERT triple.325*/326static void appendTriple(DContext *p, int nCopy, int nDel, int nIns){327/* printf("APPEND %d/%d/%d\n", nCopy, nDel, nIns); */328if( p->nEdit>=3 ){329if( p->aEdit[p->nEdit-1]==0 ){330if( p->aEdit[p->nEdit-2]==0 ){331p->aEdit[p->nEdit-3] += nCopy;332p->aEdit[p->nEdit-2] += nDel;333p->aEdit[p->nEdit-1] += nIns;334return;335}336if( nCopy==0 ){337p->aEdit[p->nEdit-2] += nDel;338p->aEdit[p->nEdit-1] += nIns;339return;340}341}342if( nCopy==0 && nDel==0 ){343p->aEdit[p->nEdit-1] += nIns;344return;345}346}347if( p->nEdit+3>p->nEditAlloc ){348expandEdit(p, p->nEdit*2 + 15);349if( p->aEdit==0 ) return;350}351p->aEdit[p->nEdit++] = nCopy;352p->aEdit[p->nEdit++] = nDel;353p->aEdit[p->nEdit++] = nIns;354}355356/*357** Do a single step in the difference. Compute a sequence of358** copy/delete/insert steps that will convert lines iS1 through iE1-1 of359** the input into lines iS2 through iE2-1 of the output and write360** that sequence into the difference context.361**362** The algorithm is to find a block of common text near the middle363** of the two segments being diffed. Then recursively compute364** differences on the blocks before and after that common segment.365** Special cases apply if either input segment is empty or if the366** two segments have no text in common.367*/368static void diff_step(DContext *p, int iS1, int iE1, int iS2, int iE2){369int iSX, iEX, iSY, iEY;370371if( iE1<=iS1 ){372/* The first segment is empty */373if( iE2>iS2 ){374appendTriple(p, 0, 0, iE2-iS2);375}376return;377}378if( iE2<=iS2 ){379/* The second segment is empty */380appendTriple(p, 0, iE1-iS1, 0);381return;382}383384/* Find the longest matching segment between the two sequences */385longestCommonSequence(p, iS1, iE1, iS2, iE2, &iSX, &iEX, &iSY, &iEY);386387if( iEX>iSX ){388/* A common segment has been found.389** Recursively diff either side of the matching segment */390diff_step(p, iS1, iSX, iS2, iSY);391if( iEX>iSX ){392appendTriple(p, iEX - iSX, 0, 0);393}394diff_step(p, iEX, iE1, iEY, iE2);395}else{396/* The two segments have nothing in common. Delete the first then397** insert the second. */398appendTriple(p, 0, iE1-iS1, iE2-iS2);399}400}401402/*403** Compute the differences between two files already loaded into404** the DContext structure.405**406** A divide and conquer technique is used. We look for a large407** block of common text that is in the middle of both files. Then408** compute the difference on those parts of the file before and409** after the common block. This technique is fast, but it does410** not necessarily generate the minimum difference set. On the411** other hand, we do not need a minimum difference set, only one412** that makes sense to human readers, which this algorithm does.413**414** Any common text at the beginning and end of the two files is415** removed before starting the divide-and-conquer algorithm.416*/417static void diff_all(DContext *p){418int mnE, iS, iE1, iE2;419420/* Carve off the common header and footer */421iE1 = p->nFrom;422iE2 = p->nTo;423while( iE1>0 && iE2>0 && p->same_fn(&p->aFrom[iE1-1], &p->aTo[iE2-1]) ){424iE1--;425iE2--;426}427mnE = iE1<iE2 ? iE1 : iE2;428for(iS=0; iS<mnE && p->same_fn(&p->aFrom[iS],&p->aTo[iS]); iS++){}429430/* do the difference */431if( iS>0 ){432appendTriple(p, iS, 0, 0);433}434diff_step(p, iS, iE1, iS, iE2);435if( iE1<p->nFrom ){436appendTriple(p, p->nFrom - iE1, 0, 0);437}438439/* Terminate the COPY/DELETE/INSERT triples with three zeros */440expandEdit(p, p->nEdit+3);441if( p->aEdit ){442p->aEdit[p->nEdit++] = 0;443p->aEdit[p->nEdit++] = 0;444p->aEdit[p->nEdit++] = 0;445}446}447448/*449** Generate a report of the differences between files pA and pB.450** If pOut is not NULL then a unified diff is appended there. It451** is assumed that pOut has already been initialized. If pOut is452** NULL, then a pointer to an array of integers is returned.453** The integers come in triples. For each triple,454** the elements are the number of lines copied, the number of455** lines deleted, and the number of lines inserted. The vector456** is terminated by a triple of all zeros.457**458** This diff utility does not work on binary files. If a binary459** file is encountered, 0 is returned and pOut is written with460** text "cannot compute difference between binary files".461*/462int *463text_diff(464char *pA, /* FROM file */465char *pB /* TO file */466){467DContext c;468469/* Prepare the input files */470memset(&c, 0, sizeof(c));471c.same_fn = same_dline;472c.aFrom = break_into_lines(pA, &c.nFrom);473c.aTo = break_into_lines(pB, &c.nTo);474if( c.aFrom==0 || c.aTo==0 ){475free(c.aFrom);476free(c.aTo);477return 0;478}479480/* Compute the difference */481diff_all(&c);482/* If a context diff is not requested, then return the483** array of COPY/DELETE/INSERT triples.484*/485free(c.aFrom);486free(c.aTo);487return c.aEdit;488}489490491