/* Myers diff algorithm implementation, invented by Eugene W. Myers [1].1* Implementations of both the Myers Divide Et Impera (using linear space)2* and the canonical Myers algorithm (using quadratic space). */3/*4* Copyright (c) 2020 Neels Hofmeyr <[email protected]>5*6* Permission to use, copy, modify, and distribute this software for any7* purpose with or without fee is hereby granted, provided that the above8* copyright notice and this permission notice appear in all copies.9*10* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES11* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF12* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR13* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES14* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN15* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF16* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.17*/1819#include <stdbool.h>20#include <stdint.h>21#include <stdlib.h>22#include <string.h>23#include <stdio.h>24#include <errno.h>2526#include <arraylist.h>27#include <diff_main.h>2829#include "diff_internal.h"30#include "diff_debug.h"3132/* Myers' diff algorithm [1] is nicely explained in [2].33* [1] http://www.xmailserver.org/diff2.pdf34* [2] https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff.35*36* Myers approaches finding the smallest diff as a graph problem.37* The crux is that the original algorithm requires quadratic amount of memory:38* both sides' lengths added, and that squared. So if we're diffing lines of39* text, two files with 1000 lines each would blow up to a matrix of about40* 2000 * 2000 ints of state, about 16 Mb of RAM to figure out 2 kb of text.41* The solution is using Myers' "divide and conquer" extension algorithm, which42* does the original traversal from both ends of the files to reach a middle43* where these "snakes" touch, hence does not need to backtrace the traversal,44* and so gets away with only keeping a single column of that huge state matrix45* in memory.46*/4748struct diff_box {49unsigned int left_start;50unsigned int left_end;51unsigned int right_start;52unsigned int right_end;53};5455/* If the two contents of a file are A B C D E and X B C Y,56* the Myers diff graph looks like:57*58* k0 k159* \ \60* k-1 0 1 2 3 4 561* \ A B C D E62* 0 o-o-o-o-o-o63* X | | | | | |64* 1 o-o-o-o-o-o65* B | |\| | | |66* 2 o-o-o-o-o-o67* C | | |\| | |68* 3 o-o-o-o-o-o69* Y | | | | | |\70* 4 o-o-o-o-o-o c171* \ \72* c-1 c073*74* Moving right means delete an atom from the left-hand-side,75* Moving down means add an atom from the right-hand-side.76* Diagonals indicate identical atoms on both sides, the challenge is to use as77* many diagonals as possible.78*79* The original Myers algorithm walks all the way from the top left to the80* bottom right, remembers all steps, and then backtraces to find the shortest81* path. However, that requires keeping the entire graph in memory, which needs82* quadratic space.83*84* Myers adds a variant that uses linear space -- note, not linear time, only85* linear space: walk forward and backward, find a meeting point in the middle,86* and recurse on the two separate sections. This is called "divide and87* conquer".88*89* d: the step number, starting with 0, a.k.a. the distance from the starting90* point.91* k: relative index in the state array for the forward scan, indicating on92* which diagonal through the diff graph we currently are.93* c: relative index in the state array for the backward scan, indicating the94* diagonal number from the bottom up.95*96* The "divide and conquer" traversal through the Myers graph looks like this:97*98* | d= 0 1 2 3 2 1 099* ----+--------------------------------------------100* k= | c=101* 4 | 3102* |103* 3 | 3,0 5,2 2104* | / \105* 2 | 2,0 5,3 1106* | / \107* 1 | 1,0 4,3 >= 4,3 5,4<-- 0108* | / / \ /109* 0 | -->0,0 3,3 4,4 -1110* | \ / /111* -1 | 0,1 1,2 3,4 -2112* | \ /113* -2 | 0,2 -3114* | \115* | 0,3116* | forward-> <-backward117*118* x,y pairs here are the coordinates in the Myers graph:119* x = atom index in left-side source, y = atom index in the right-side source.120*121* Only one forward column and one backward column are kept in mem, each need at122* most left.len + 1 + right.len items. Note that each d step occupies either123* the even or the odd items of a column: if e.g. the previous column is in the124* odd items, the next column is formed in the even items, without overwriting125* the previous column's results.126*127* Also note that from the diagonal index k and the x coordinate, the y128* coordinate can be derived:129* y = x - k130* Hence the state array only needs to keep the x coordinate, i.e. the position131* in the left-hand file, and the y coordinate, i.e. position in the right-hand132* file, is derived from the index in the state array.133*134* The two traces meet at 4,3, the first step (here found in the forward135* traversal) where a forward position is on or past a backward traced position136* on the same diagonal.137*138* This divides the problem space into:139*140* 0 1 2 3 4 5141* A B C D E142* 0 o-o-o-o-o143* X | | | | |144* 1 o-o-o-o-o145* B | |\| | |146* 2 o-o-o-o-o147* C | | |\| |148* 3 o-o-o-o-*-o *: forward and backward meet here149* Y | |150* 4 o-o151*152* Doing the same on each section lead to:153*154* 0 1 2 3 4 5155* A B C D E156* 0 o-o157* X | |158* 1 o-b b: backward d=1 first reaches here (sliding up the snake)159* B \ f: then forward d=2 reaches here (sliding down the snake)160* 2 o As result, the box from b to f is found to be identical;161* C \ leaving a top box from 0,0 to 1,1 and a bottom trivial162* 3 f-o tail 3,3 to 4,3.163*164* 3 o-*165* Y |166* 4 o *: forward and backward meet here167*168* and solving the last top left box gives:169*170* 0 1 2 3 4 5171* A B C D E -A172* 0 o-o +X173* X | B174* 1 o C175* B \ -D176* 2 o -E177* C \ +Y178* 3 o-o-o179* Y |180* 4 o181*182*/183184#define xk_to_y(X, K) ((X) - (K))185#define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA))186#define k_to_c(K, DELTA) ((K) + (DELTA))187#define c_to_k(C, DELTA) ((C) - (DELTA))188189/* Do one forwards step in the "divide and conquer" graph traversal.190* left: the left side to diff.191* right: the right side to diff against.192* kd_forward: the traversal state for forwards traversal, modified by this193* function.194* This is carried over between invocations with increasing d.195* kd_forward points at the center of the state array, allowing196* negative indexes.197* kd_backward: the traversal state for backwards traversal, to find a meeting198* point.199* Since forwards is done first, kd_backward will be valid for d -200* 1, not d.201* kd_backward points at the center of the state array, allowing202* negative indexes.203* d: Step or distance counter, indicating for what value of d the kd_forward204* should be populated.205* For d == 0, kd_forward[0] is initialized, i.e. the first invocation should206* be for d == 0.207* meeting_snake: resulting meeting point, if any.208* Return true when a meeting point has been identified.209*/210static int211diff_divide_myers_forward(bool *found_midpoint,212struct diff_data *left, struct diff_data *right,213int *kd_forward, int *kd_backward, int d,214struct diff_box *meeting_snake)215{216int delta = (int)right->atoms.len - (int)left->atoms.len;217int k;218int x;219int prev_x;220int prev_y;221int x_before_slide;222*found_midpoint = false;223224for (k = d; k >= -d; k -= 2) {225if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {226/* This diagonal is completely outside of the Myers227* graph, don't calculate it. */228if (k < 0) {229/* We are traversing negatively, and already230* below the entire graph, nothing will come of231* this. */232debug(" break\n");233break;234}235debug(" continue\n");236continue;237}238if (d == 0) {239/* This is the initializing step. There is no prev_k240* yet, get the initial x from the top left of the Myers241* graph. */242x = 0;243prev_x = x;244prev_y = xk_to_y(x, k);245}246/* Favoring "-" lines first means favoring moving rightwards in247* the Myers graph.248* For this, all k should derive from k - 1, only the bottom249* most k derive from k + 1:250*251* | d= 0 1 2252* ----+----------------253* k= |254* 2 | 2,0 <-- from prev_k = 2 - 1 = 1255* | /256* 1 | 1,0257* | /258* 0 | -->0,0 3,3259* | \\ /260* -1 | 0,1 <-- bottom most for d=1 from261* | \\ prev_k = -1 + 1 = 0262* -2 | 0,2 <-- bottom most for d=2 from263* prev_k = -2 + 1 = -1264*265* Except when a k + 1 from a previous run already means a266* further advancement in the graph.267* If k == d, there is no k + 1 and k - 1 is the only option.268* If k < d, use k + 1 in case that yields a larger x. Also use269* k + 1 if k - 1 is outside the graph.270*/271else if (k > -d272&& (k == d273|| (k - 1 >= -(int)right->atoms.len274&& kd_forward[k - 1] >= kd_forward[k + 1]))) {275/* Advance from k - 1.276* From position prev_k, step to the right in the Myers277* graph: x += 1.278*/279int prev_k = k - 1;280prev_x = kd_forward[prev_k];281prev_y = xk_to_y(prev_x, prev_k);282x = prev_x + 1;283} else {284/* The bottom most one.285* From position prev_k, step to the bottom in the Myers286* graph: y += 1.287* Incrementing y is achieved by decrementing k while288* keeping the same x.289* (since we're deriving y from y = x - k).290*/291int prev_k = k + 1;292prev_x = kd_forward[prev_k];293prev_y = xk_to_y(prev_x, prev_k);294x = prev_x;295}296297x_before_slide = x;298/* Slide down any snake that we might find here. */299while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len) {300bool same;301int r = diff_atom_same(&same,302&left->atoms.head[x],303&right->atoms.head[304xk_to_y(x, k)]);305if (r)306return r;307if (!same)308break;309x++;310}311kd_forward[k] = x;312#if 0313if (x_before_slide != x) {314debug(" down %d similar lines\n", x - x_before_slide);315}316317#if DEBUG318{319int fi;320for (fi = d; fi >= k; fi--) {321debug("kd_forward[%d] = (%d, %d)\n", fi,322kd_forward[fi], kd_forward[fi] - fi);323}324}325#endif326#endif327328if (x < 0 || x > left->atoms.len329|| xk_to_y(x, k) < 0 || xk_to_y(x, k) > right->atoms.len)330continue;331332/* Figured out a new forwards traversal, see if this has gone333* onto or even past a preceding backwards traversal.334*335* If the delta in length is odd, then d and backwards_d hit the336* same state indexes:337* | d= 0 1 2 1 0338* ----+---------------- ----------------339* k= | c=340* 4 | 3341* |342* 3 | 2343* | same344* 2 | 2,0====5,3 1345* | / \346* 1 | 1,0 5,4<-- 0347* | / /348* 0 | -->0,0 3,3====4,4 -1349* | \ /350* -1 | 0,1 -2351* | \352* -2 | 0,2 -3353* |354*355* If the delta is even, they end up off-by-one, i.e. on356* different diagonals:357*358* | d= 0 1 2 1 0359* ----+---------------- ----------------360* | c=361* 3 | 3362* |363* 2 | 2,0 off 2364* | / \\365* 1 | 1,0 4,3 1366* | / // \367* 0 | -->0,0 3,3 4,4<-- 0368* | \ / /369* -1 | 0,1 3,4 -1370* | \ //371* -2 | 0,2 -2372* |373*374* So in the forward path, we can only match up diagonals when375* the delta is odd.376*/377if ((delta & 1) == 0)378continue;379/* Forwards is done first, so the backwards one was still at380* d - 1. Can't do this for d == 0. */381int backwards_d = d - 1;382if (backwards_d < 0)383continue;384385/* If both sides have the same length, forward and backward386* start on the same diagonal, meaning the backwards state index387* c == k.388* As soon as the lengths are not the same, the backwards389* traversal starts on a different diagonal, and c = k shifted390* by the difference in length.391*/392int c = k_to_c(k, delta);393394/* When the file sizes are very different, the traversal trees395* start on far distant diagonals.396* They don't necessarily meet straight on. See whether this397* forward value is on a diagonal that is also valid in398* kd_backward[], and match them if so. */399if (c >= -backwards_d && c <= backwards_d) {400/* Current k is on a diagonal that exists in401* kd_backward[]. If the two x positions have met or402* passed (forward walked onto or past backward), then403* we've found a midpoint / a mid-box.404*405* When forwards and backwards traversals meet, the406* endpoints of the mid-snake are not the two points in407* kd_forward and kd_backward, but rather the section408* that was slid (if any) of the current409* forward/backward traversal only.410*411* For example:412*413* o414* \415* o416* \417* o418* \419* o420* \421* X o o422* | | |423* o-o-o o424* \|425* M426* \427* o428* \429* A o430* | |431* o-o-o432*433* The forward traversal reached M from the top and slid434* downwards to A. The backward traversal already435* reached X, which is not a straight line from M436* anymore, so picking a mid-snake from M to X would437* yield a mistake.438*439* The correct mid-snake is between M and A. M is where440* the forward traversal hit the diagonal that the441* backward traversal has already passed, and A is what442* it reaches when sliding down identical lines.443*/444int backward_x = kd_backward[c];445if (x >= backward_x) {446if (x_before_slide != x) {447/* met after sliding up a mid-snake */448*meeting_snake = (struct diff_box){449.left_start = x_before_slide,450.left_end = x,451.right_start = xc_to_y(x_before_slide,452c, delta),453.right_end = xk_to_y(x, k),454};455} else {456/* met after a side step, non-identical457* line. Mark that as box divider458* instead. This makes sure that459* myers_divide never returns the same460* box that came as input, avoiding461* "infinite" looping. */462*meeting_snake = (struct diff_box){463.left_start = prev_x,464.left_end = x,465.right_start = prev_y,466.right_end = xk_to_y(x, k),467};468}469debug("HIT x=(%u,%u) - y=(%u,%u)\n",470meeting_snake->left_start,471meeting_snake->right_start,472meeting_snake->left_end,473meeting_snake->right_end);474debug_dump_myers_graph(left, right, NULL,475kd_forward, d,476kd_backward, d-1);477*found_midpoint = true;478return 0;479}480}481}482483return 0;484}485486/* Do one backwards step in the "divide and conquer" graph traversal.487* left: the left side to diff.488* right: the right side to diff against.489* kd_forward: the traversal state for forwards traversal, to find a meeting490* point.491* Since forwards is done first, after this, both kd_forward and492* kd_backward will be valid for d.493* kd_forward points at the center of the state array, allowing494* negative indexes.495* kd_backward: the traversal state for backwards traversal, to find a meeting496* point.497* This is carried over between invocations with increasing d.498* kd_backward points at the center of the state array, allowing499* negative indexes.500* d: Step or distance counter, indicating for what value of d the kd_backward501* should be populated.502* Before the first invocation, kd_backward[0] shall point at the bottom503* right of the Myers graph (left.len, right.len).504* The first invocation will be for d == 1.505* meeting_snake: resulting meeting point, if any.506* Return true when a meeting point has been identified.507*/508static int509diff_divide_myers_backward(bool *found_midpoint,510struct diff_data *left, struct diff_data *right,511int *kd_forward, int *kd_backward, int d,512struct diff_box *meeting_snake)513{514int delta = (int)right->atoms.len - (int)left->atoms.len;515int c;516int x;517int prev_x;518int prev_y;519int x_before_slide;520521*found_midpoint = false;522523for (c = d; c >= -d; c -= 2) {524if (c < -(int)left->atoms.len || c > (int)right->atoms.len) {525/* This diagonal is completely outside of the Myers526* graph, don't calculate it. */527if (c < 0) {528/* We are traversing negatively, and already529* below the entire graph, nothing will come of530* this. */531break;532}533continue;534}535if (d == 0) {536/* This is the initializing step. There is no prev_c537* yet, get the initial x from the bottom right of the538* Myers graph. */539x = left->atoms.len;540prev_x = x;541prev_y = xc_to_y(x, c, delta);542}543/* Favoring "-" lines first means favoring moving rightwards in544* the Myers graph.545* For this, all c should derive from c - 1, only the bottom546* most c derive from c + 1:547*548* 2 1 0549* ---------------------------------------------------550* c=551* 3552*553* from prev_c = c - 1 --> 5,2 2554* \555* 5,3 1556* \557* 4,3 5,4<-- 0558* \ /559* bottom most for d=1 from c + 1 --> 4,4 -1560* /561* bottom most for d=2 --> 3,4 -2562*563* Except when a c + 1 from a previous run already means a564* further advancement in the graph.565* If c == d, there is no c + 1 and c - 1 is the only option.566* If c < d, use c + 1 in case that yields a larger x.567* Also use c + 1 if c - 1 is outside the graph.568*/569else if (c > -d && (c == d570|| (c - 1 >= -(int)right->atoms.len571&& kd_backward[c - 1] <= kd_backward[c + 1]))) {572/* A top one.573* From position prev_c, step upwards in the Myers574* graph: y -= 1.575* Decrementing y is achieved by incrementing c while576* keeping the same x. (since we're deriving y from577* y = x - c + delta).578*/579int prev_c = c - 1;580prev_x = kd_backward[prev_c];581prev_y = xc_to_y(prev_x, prev_c, delta);582x = prev_x;583} else {584/* The bottom most one.585* From position prev_c, step to the left in the Myers586* graph: x -= 1.587*/588int prev_c = c + 1;589prev_x = kd_backward[prev_c];590prev_y = xc_to_y(prev_x, prev_c, delta);591x = prev_x - 1;592}593594/* Slide up any snake that we might find here (sections of595* identical lines on both sides). */596#if 0597debug("c=%d x-1=%d Yb-1=%d-1=%d\n", c, x-1, xc_to_y(x, c,598delta),599xc_to_y(x, c, delta)-1);600if (x > 0) {601debug(" l=");602debug_dump_atom(left, right, &left->atoms.head[x-1]);603}604if (xc_to_y(x, c, delta) > 0) {605debug(" r=");606debug_dump_atom(right, left,607&right->atoms.head[xc_to_y(x, c, delta)-1]);608}609#endif610x_before_slide = x;611while (x > 0 && xc_to_y(x, c, delta) > 0) {612bool same;613int r = diff_atom_same(&same,614&left->atoms.head[x-1],615&right->atoms.head[616xc_to_y(x, c, delta)-1]);617if (r)618return r;619if (!same)620break;621x--;622}623kd_backward[c] = x;624#if 0625if (x_before_slide != x) {626debug(" up %d similar lines\n", x_before_slide - x);627}628629if (DEBUG) {630int fi;631for (fi = d; fi >= c; fi--) {632debug("kd_backward[%d] = (%d, %d)\n",633fi,634kd_backward[fi],635kd_backward[fi] - fi + delta);636}637}638#endif639640if (x < 0 || x > left->atoms.len641|| xc_to_y(x, c, delta) < 0642|| xc_to_y(x, c, delta) > right->atoms.len)643continue;644645/* Figured out a new backwards traversal, see if this has gone646* onto or even past a preceding forwards traversal.647*648* If the delta in length is even, then d and backwards_d hit649* the same state indexes -- note how this is different from in650* the forwards traversal, because now both d are the same:651*652* | d= 0 1 2 2 1 0653* ----+---------------- --------------------654* k= | c=655* 4 |656* |657* 3 | 3658* | same659* 2 | 2,0====5,2 2660* | / \661* 1 | 1,0 5,3 1662* | / / \663* 0 | -->0,0 3,3====4,3 5,4<-- 0664* | \ / /665* -1 | 0,1 4,4 -1666* | \667* -2 | 0,2 -2668* |669* -3670* If the delta is odd, they end up off-by-one, i.e. on671* different diagonals.672* So in the backward path, we can only match up diagonals when673* the delta is even.674*/675if ((delta & 1) != 0)676continue;677/* Forwards was done first, now both d are the same. */678int forwards_d = d;679680/* As soon as the lengths are not the same, the681* backwards traversal starts on a different diagonal,682* and c = k shifted by the difference in length.683*/684int k = c_to_k(c, delta);685686/* When the file sizes are very different, the traversal trees687* start on far distant diagonals.688* They don't necessarily meet straight on. See whether this689* backward value is also on a valid diagonal in kd_forward[],690* and match them if so. */691if (k >= -forwards_d && k <= forwards_d) {692/* Current c is on a diagonal that exists in693* kd_forward[]. If the two x positions have met or694* passed (backward walked onto or past forward), then695* we've found a midpoint / a mid-box.696*697* When forwards and backwards traversals meet, the698* endpoints of the mid-snake are not the two points in699* kd_forward and kd_backward, but rather the section700* that was slid (if any) of the current701* forward/backward traversal only.702*703* For example:704*705* o-o-o706* | |707* o A708* | \709* o o710* \711* M712* |\713* o o-o-o714* | | |715* o o X716* \717* o718* \719* o720* \721* o722*723* The backward traversal reached M from the bottom and724* slid upwards. The forward traversal already reached725* X, which is not a straight line from M anymore, so726* picking a mid-snake from M to X would yield a727* mistake.728*729* The correct mid-snake is between M and A. M is where730* the backward traversal hit the diagonal that the731* forwards traversal has already passed, and A is what732* it reaches when sliding up identical lines.733*/734735int forward_x = kd_forward[k];736if (forward_x >= x) {737if (x_before_slide != x) {738/* met after sliding down a mid-snake */739*meeting_snake = (struct diff_box){740.left_start = x,741.left_end = x_before_slide,742.right_start = xc_to_y(x, c, delta),743.right_end = xk_to_y(x_before_slide, k),744};745} else {746/* met after a side step, non-identical747* line. Mark that as box divider748* instead. This makes sure that749* myers_divide never returns the same750* box that came as input, avoiding751* "infinite" looping. */752*meeting_snake = (struct diff_box){753.left_start = x,754.left_end = prev_x,755.right_start = xc_to_y(x, c, delta),756.right_end = prev_y,757};758}759debug("HIT x=%u,%u - y=%u,%u\n",760meeting_snake->left_start,761meeting_snake->right_start,762meeting_snake->left_end,763meeting_snake->right_end);764debug_dump_myers_graph(left, right, NULL,765kd_forward, d,766kd_backward, d);767*found_midpoint = true;768return 0;769}770}771}772return 0;773}774775/* Integer square root approximation */776static int777shift_sqrt(int val)778{779int i;780for (i = 1; val > 0; val >>= 2)781i <<= 1;782return i;783}784785#define DIFF_EFFORT_MIN 1024786787/* Myers "Divide et Impera": tracing forwards from the start and backwards from788* the end to find a midpoint that divides the problem into smaller chunks.789* Requires only linear amounts of memory. */790int791diff_algo_myers_divide(const struct diff_algo_config *algo_config,792struct diff_state *state)793{794int rc = ENOMEM;795struct diff_data *left = &state->left;796struct diff_data *right = &state->right;797int *kd_buf;798799debug("\n** %s\n", __func__);800debug("left:\n");801debug_dump(left);802debug("right:\n");803debug_dump(right);804805/* Allocate two columns of a Myers graph, one for the forward and one806* for the backward traversal. */807unsigned int max = left->atoms.len + right->atoms.len;808size_t kd_len = max + 1;809size_t kd_buf_size = kd_len << 1;810811if (state->kd_buf_size < kd_buf_size) {812kd_buf = reallocarray(state->kd_buf, kd_buf_size,813sizeof(int));814if (!kd_buf)815return ENOMEM;816state->kd_buf = kd_buf;817state->kd_buf_size = kd_buf_size;818} else819kd_buf = state->kd_buf;820int i;821for (i = 0; i < kd_buf_size; i++)822kd_buf[i] = -1;823int *kd_forward = kd_buf;824int *kd_backward = kd_buf + kd_len;825int max_effort = shift_sqrt(max/2);826827if (max_effort < DIFF_EFFORT_MIN)828max_effort = DIFF_EFFORT_MIN;829830/* The 'k' axis in Myers spans positive and negative indexes, so point831* the kd to the middle.832* It is then possible to index from -max/2 .. max/2. */833kd_forward += max/2;834kd_backward += max/2;835836int d;837struct diff_box mid_snake = {};838bool found_midpoint = false;839for (d = 0; d <= (max/2); d++) {840int r;841r = diff_divide_myers_forward(&found_midpoint, left, right,842kd_forward, kd_backward, d,843&mid_snake);844if (r)845return r;846if (found_midpoint)847break;848r = diff_divide_myers_backward(&found_midpoint, left, right,849kd_forward, kd_backward, d,850&mid_snake);851if (r)852return r;853if (found_midpoint)854break;855856/* Limit the effort spent looking for a mid snake. If files have857* very few lines in common, the effort spent to find nice mid858* snakes is just not worth it, the diff result will still be859* essentially minus everything on the left, plus everything on860* the right, with a few useless matches here and there. */861if (d > max_effort) {862/* pick the furthest reaching point from863* kd_forward and kd_backward, and use that as a864* midpoint, to not step into another diff algo865* recursion with unchanged box. */866int delta = (int)right->atoms.len - (int)left->atoms.len;867int x = 0;868int y;869int i;870int best_forward_i = 0;871int best_forward_distance = 0;872int best_backward_i = 0;873int best_backward_distance = 0;874int distance;875int best_forward_x;876int best_forward_y;877int best_backward_x;878int best_backward_y;879880debug("~~~ HIT d = %d > max_effort = %d\n", d, max_effort);881debug_dump_myers_graph(left, right, NULL,882kd_forward, d,883kd_backward, d);884885for (i = d; i >= -d; i -= 2) {886if (i >= -(int)right->atoms.len && i <= (int)left->atoms.len) {887x = kd_forward[i];888y = xk_to_y(x, i);889distance = x + y;890if (distance > best_forward_distance) {891best_forward_distance = distance;892best_forward_i = i;893}894}895896if (i >= -(int)left->atoms.len && i <= (int)right->atoms.len) {897x = kd_backward[i];898y = xc_to_y(x, i, delta);899distance = (right->atoms.len - x)900+ (left->atoms.len - y);901if (distance >= best_backward_distance) {902best_backward_distance = distance;903best_backward_i = i;904}905}906}907908/* The myers-divide didn't meet in the middle. We just909* figured out the places where the forward path910* advanced the most, and the backward path advanced the911* most. Just divide at whichever one of those two is better.912*913* o-o914* |915* o916* \917* o918* \919* F <-- cut here920*921*922*923* or here --> B924* \925* o926* \927* o928* |929* o-o930*/931best_forward_x = kd_forward[best_forward_i];932best_forward_y = xk_to_y(best_forward_x, best_forward_i);933best_backward_x = kd_backward[best_backward_i];934best_backward_y = xc_to_y(best_backward_x, best_backward_i, delta);935936if (best_forward_distance >= best_backward_distance) {937x = best_forward_x;938y = best_forward_y;939} else {940x = best_backward_x;941y = best_backward_y;942}943944debug("max_effort cut at x=%d y=%d\n", x, y);945if (x < 0 || y < 0946|| x > left->atoms.len || y > right->atoms.len)947break;948949found_midpoint = true;950mid_snake = (struct diff_box){951.left_start = x,952.left_end = x,953.right_start = y,954.right_end = y,955};956break;957}958}959960if (!found_midpoint) {961/* Divide and conquer failed to find a meeting point. Use the962* fallback_algo defined in the algo_config (leave this to the963* caller). This is just paranoia/sanity, we normally should964* always find a midpoint.965*/966debug(" no midpoint \n");967rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;968goto return_rc;969} else {970debug(" mid snake L: %u to %u of %u R: %u to %u of %u\n",971mid_snake.left_start, mid_snake.left_end, left->atoms.len,972mid_snake.right_start, mid_snake.right_end,973right->atoms.len);974975/* Section before the mid-snake. */976debug("Section before the mid-snake\n");977978struct diff_atom *left_atom = &left->atoms.head[0];979unsigned int left_section_len = mid_snake.left_start;980struct diff_atom *right_atom = &right->atoms.head[0];981unsigned int right_section_len = mid_snake.right_start;982983if (left_section_len && right_section_len) {984/* Record an unsolved chunk, the caller will apply985* inner_algo() on this chunk. */986if (!diff_state_add_chunk(state, false,987left_atom, left_section_len,988right_atom,989right_section_len))990goto return_rc;991} else if (left_section_len && !right_section_len) {992/* Only left atoms and none on the right, they form a993* "minus" chunk, then. */994if (!diff_state_add_chunk(state, true,995left_atom, left_section_len,996right_atom, 0))997goto return_rc;998} else if (!left_section_len && right_section_len) {999/* No left atoms, only atoms on the right, they form a1000* "plus" chunk, then. */1001if (!diff_state_add_chunk(state, true,1002left_atom, 0,1003right_atom,1004right_section_len))1005goto return_rc;1006}1007/* else: left_section_len == 0 and right_section_len == 0, i.e.1008* nothing before the mid-snake. */10091010if (mid_snake.left_end > mid_snake.left_start1011|| mid_snake.right_end > mid_snake.right_start) {1012/* The midpoint is a section of identical data on both1013* sides, or a certain differing line: that section1014* immediately becomes a solved chunk. */1015debug("the mid-snake\n");1016if (!diff_state_add_chunk(state, true,1017&left->atoms.head[mid_snake.left_start],1018mid_snake.left_end - mid_snake.left_start,1019&right->atoms.head[mid_snake.right_start],1020mid_snake.right_end - mid_snake.right_start))1021goto return_rc;1022}10231024/* Section after the mid-snake. */1025debug("Section after the mid-snake\n");1026debug(" left_end %u right_end %u\n",1027mid_snake.left_end, mid_snake.right_end);1028debug(" left_count %u right_count %u\n",1029left->atoms.len, right->atoms.len);1030left_atom = &left->atoms.head[mid_snake.left_end];1031left_section_len = left->atoms.len - mid_snake.left_end;1032right_atom = &right->atoms.head[mid_snake.right_end];1033right_section_len = right->atoms.len - mid_snake.right_end;10341035if (left_section_len && right_section_len) {1036/* Record an unsolved chunk, the caller will apply1037* inner_algo() on this chunk. */1038if (!diff_state_add_chunk(state, false,1039left_atom, left_section_len,1040right_atom,1041right_section_len))1042goto return_rc;1043} else if (left_section_len && !right_section_len) {1044/* Only left atoms and none on the right, they form a1045* "minus" chunk, then. */1046if (!diff_state_add_chunk(state, true,1047left_atom, left_section_len,1048right_atom, 0))1049goto return_rc;1050} else if (!left_section_len && right_section_len) {1051/* No left atoms, only atoms on the right, they form a1052* "plus" chunk, then. */1053if (!diff_state_add_chunk(state, true,1054left_atom, 0,1055right_atom,1056right_section_len))1057goto return_rc;1058}1059/* else: left_section_len == 0 and right_section_len == 0, i.e.1060* nothing after the mid-snake. */1061}10621063rc = DIFF_RC_OK;10641065return_rc:1066debug("** END %s\n", __func__);1067return rc;1068}10691070/* Myers Diff tracing from the start all the way through to the end, requiring1071* quadratic amounts of memory. This can fail if the required space surpasses1072* algo_config->permitted_state_size. */1073int1074diff_algo_myers(const struct diff_algo_config *algo_config,1075struct diff_state *state)1076{1077/* do a diff_divide_myers_forward() without a _backward(), so that it1078* walks forward across the entire files to reach the end. Keep each1079* run's state, and do a final backtrace. */1080int rc = ENOMEM;1081struct diff_data *left = &state->left;1082struct diff_data *right = &state->right;1083int *kd_buf;10841085debug("\n** %s\n", __func__);1086debug("left:\n");1087debug_dump(left);1088debug("right:\n");1089debug_dump(right);1090debug_dump_myers_graph(left, right, NULL, NULL, 0, NULL, 0);10911092/* Allocate two columns of a Myers graph, one for the forward and one1093* for the backward traversal. */1094unsigned int max = left->atoms.len + right->atoms.len;1095size_t kd_len = max + 1 + max;1096size_t kd_buf_size = kd_len * kd_len;1097size_t kd_state_size = kd_buf_size * sizeof(int);1098debug("state size: %zu\n", kd_state_size);1099if (kd_buf_size < kd_len /* overflow? */1100|| (SIZE_MAX / kd_len ) < kd_len1101|| kd_state_size > algo_config->permitted_state_size) {1102debug("state size %zu > permitted_state_size %zu, use fallback_algo\n",1103kd_state_size, algo_config->permitted_state_size);1104return DIFF_RC_USE_DIFF_ALGO_FALLBACK;1105}11061107if (state->kd_buf_size < kd_buf_size) {1108kd_buf = reallocarray(state->kd_buf, kd_buf_size,1109sizeof(int));1110if (!kd_buf)1111return ENOMEM;1112state->kd_buf = kd_buf;1113state->kd_buf_size = kd_buf_size;1114} else1115kd_buf = state->kd_buf;11161117int i;1118for (i = 0; i < kd_buf_size; i++)1119kd_buf[i] = -1;11201121/* The 'k' axis in Myers spans positive and negative indexes, so point1122* the kd to the middle.1123* It is then possible to index from -max .. max. */1124int *kd_origin = kd_buf + max;1125int *kd_column = kd_origin;11261127int d;1128int backtrack_d = -1;1129int backtrack_k = 0;1130int k;1131int x, y;1132for (d = 0; d <= max; d++, kd_column += kd_len) {1133debug("-- %s d=%d\n", __func__, d);11341135for (k = d; k >= -d; k -= 2) {1136if (k < -(int)right->atoms.len1137|| k > (int)left->atoms.len) {1138/* This diagonal is completely outside of the1139* Myers graph, don't calculate it. */1140if (k < -(int)right->atoms.len)1141debug(" %d k <"1142" -(int)right->atoms.len %d\n",1143k, -(int)right->atoms.len);1144else1145debug(" %d k > left->atoms.len %d\n", k,1146left->atoms.len);1147if (k < 0) {1148/* We are traversing negatively, and1149* already below the entire graph,1150* nothing will come of this. */1151debug(" break\n");1152break;1153}1154debug(" continue\n");1155continue;1156}11571158if (d == 0) {1159/* This is the initializing step. There is no1160* prev_k yet, get the initial x from the top1161* left of the Myers graph. */1162x = 0;1163} else {1164int *kd_prev_column = kd_column - kd_len;11651166/* Favoring "-" lines first means favoring1167* moving rightwards in the Myers graph.1168* For this, all k should derive from k - 1,1169* only the bottom most k derive from k + 1:1170*1171* | d= 0 1 21172* ----+----------------1173* k= |1174* 2 | 2,0 <-- from1175* | / prev_k = 2 - 1 = 11176* 1 | 1,01177* | /1178* 0 | -->0,0 3,31179* | \\ /1180* -1 | 0,1 <-- bottom most for d=11181* | \\ from prev_k = -1+1 = 01182* -2 | 0,2 <-- bottom most for1183* d=2 from1184* prev_k = -2+1 = -11185*1186* Except when a k + 1 from a previous run1187* already means a further advancement in the1188* graph.1189* If k == d, there is no k + 1 and k - 1 is the1190* only option.1191* If k < d, use k + 1 in case that yields a1192* larger x. Also use k + 1 if k - 1 is outside1193* the graph.1194*/1195if (k > -d1196&& (k == d1197|| (k - 1 >= -(int)right->atoms.len1198&& kd_prev_column[k - 1]1199>= kd_prev_column[k + 1]))) {1200/* Advance from k - 1.1201* From position prev_k, step to the1202* right in the Myers graph: x += 1.1203*/1204int prev_k = k - 1;1205int prev_x = kd_prev_column[prev_k];1206x = prev_x + 1;1207} else {1208/* The bottom most one.1209* From position prev_k, step to the1210* bottom in the Myers graph: y += 1.1211* Incrementing y is achieved by1212* decrementing k while keeping the same1213* x. (since we're deriving y from y =1214* x - k).1215*/1216int prev_k = k + 1;1217int prev_x = kd_prev_column[prev_k];1218x = prev_x;1219}1220}12211222/* Slide down any snake that we might find here. */1223while (x < left->atoms.len1224&& xk_to_y(x, k) < right->atoms.len) {1225bool same;1226int r = diff_atom_same(&same,1227&left->atoms.head[x],1228&right->atoms.head[1229xk_to_y(x, k)]);1230if (r)1231return r;1232if (!same)1233break;1234x++;1235}1236kd_column[k] = x;12371238if (x == left->atoms.len1239&& xk_to_y(x, k) == right->atoms.len) {1240/* Found a path */1241backtrack_d = d;1242backtrack_k = k;1243debug("Reached the end at d = %d, k = %d\n",1244backtrack_d, backtrack_k);1245break;1246}1247}12481249if (backtrack_d >= 0)1250break;1251}12521253debug_dump_myers_graph(left, right, kd_origin, NULL, 0, NULL, 0);12541255/* backtrack. A matrix spanning from start to end of the file is ready:1256*1257* | d= 0 1 2 3 41258* ----+---------------------------------1259* k= |1260* 3 |1261* |1262* 2 | 2,01263* | /1264* 1 | 1,0 4,31265* | / / \1266* 0 | -->0,0 3,3 4,4 --> backtrack_d = 4, backtrack_k = 01267* | \ / \1268* -1 | 0,1 3,41269* | \1270* -2 | 0,21271* |1272*1273* From (4,4) backwards, find the previous position that is the largest, and remember it.1274*1275*/1276for (d = backtrack_d, k = backtrack_k; d >= 0; d--) {1277x = kd_column[k];1278y = xk_to_y(x, k);12791280/* When the best position is identified, remember it for that1281* kd_column.1282* That kd_column is no longer needed otherwise, so just1283* re-purpose kd_column[0] = x and kd_column[1] = y,1284* so that there is no need to allocate more memory.1285*/1286kd_column[0] = x;1287kd_column[1] = y;1288debug("Backtrack d=%d: xy=(%d, %d)\n",1289d, kd_column[0], kd_column[1]);12901291/* Don't access memory before kd_buf */1292if (d == 0)1293break;1294int *kd_prev_column = kd_column - kd_len;12951296/* When y == 0, backtracking downwards (k-1) is the only way.1297* When x == 0, backtracking upwards (k+1) is the only way.1298*1299* | d= 0 1 2 3 41300* ----+---------------------------------1301* k= |1302* 3 |1303* | ..y == 01304* 2 | 2,01305* | /1306* 1 | 1,0 4,31307* | / / \1308* 0 | -->0,0 3,3 4,4 --> backtrack_d = 4,1309* | \ / \ backtrack_k = 01310* -1 | 0,1 3,41311* | \1312* -2 | 0,2__1313* | x == 01314*/1315if (y == 01316|| (x > 01317&& kd_prev_column[k - 1] >= kd_prev_column[k + 1])) {1318k = k - 1;1319debug("prev k=k-1=%d x=%d y=%d\n",1320k, kd_prev_column[k],1321xk_to_y(kd_prev_column[k], k));1322} else {1323k = k + 1;1324debug("prev k=k+1=%d x=%d y=%d\n",1325k, kd_prev_column[k],1326xk_to_y(kd_prev_column[k], k));1327}1328kd_column = kd_prev_column;1329}13301331/* Forwards again, this time recording the diff chunks.1332* Definitely start from 0,0. kd_column[0] may actually point to the1333* bottom of a snake starting at 0,0 */1334x = 0;1335y = 0;13361337kd_column = kd_origin;1338for (d = 0; d <= backtrack_d; d++, kd_column += kd_len) {1339int next_x = kd_column[0];1340int next_y = kd_column[1];1341debug("Forward track from xy(%d,%d) to xy(%d,%d)\n",1342x, y, next_x, next_y);13431344struct diff_atom *left_atom = &left->atoms.head[x];1345int left_section_len = next_x - x;1346struct diff_atom *right_atom = &right->atoms.head[y];1347int right_section_len = next_y - y;13481349rc = ENOMEM;1350if (left_section_len && right_section_len) {1351/* This must be a snake slide.1352* Snake slides have a straight line leading into them1353* (except when starting at (0,0)). Find out whether the1354* lead-in is horizontal or vertical:1355*1356* left1357* ---------->1358* |1359* r| o-o o1360* i| \ |1361* g| o o1362* h| \ \1363* t| o o1364* v1365*1366* If left_section_len > right_section_len, the lead-in1367* is horizontal, meaning first remove one atom from the1368* left before sliding down the snake.1369* If right_section_len > left_section_len, the lead-in1370* is vetical, so add one atom from the right before1371* sliding down the snake. */1372if (left_section_len == right_section_len + 1) {1373if (!diff_state_add_chunk(state, true,1374left_atom, 1,1375right_atom, 0))1376goto return_rc;1377left_atom++;1378left_section_len--;1379} else if (right_section_len == left_section_len + 1) {1380if (!diff_state_add_chunk(state, true,1381left_atom, 0,1382right_atom, 1))1383goto return_rc;1384right_atom++;1385right_section_len--;1386} else if (left_section_len != right_section_len) {1387/* The numbers are making no sense. Should never1388* happen. */1389rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;1390goto return_rc;1391}13921393if (!diff_state_add_chunk(state, true,1394left_atom, left_section_len,1395right_atom,1396right_section_len))1397goto return_rc;1398} else if (left_section_len && !right_section_len) {1399/* Only left atoms and none on the right, they form a1400* "minus" chunk, then. */1401if (!diff_state_add_chunk(state, true,1402left_atom, left_section_len,1403right_atom, 0))1404goto return_rc;1405} else if (!left_section_len && right_section_len) {1406/* No left atoms, only atoms on the right, they form a1407* "plus" chunk, then. */1408if (!diff_state_add_chunk(state, true,1409left_atom, 0,1410right_atom,1411right_section_len))1412goto return_rc;1413}14141415x = next_x;1416y = next_y;1417}14181419rc = DIFF_RC_OK;14201421return_rc:1422debug("** END %s rc=%d\n", __func__, rc);1423return rc;1424}142514261427