Path: blob/main/contrib/libdiff/include/diff_main.h
35065 views
/* Generic infrastructure to implement various diff algorithms. */1/*2* Copyright (c) 2020 Neels Hofmeyr <[email protected]>3*4* Permission to use, copy, modify, and distribute this software for any5* purpose with or without fee is hereby granted, provided that the above6* copyright notice and this permission notice appear in all copies.7*8* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES9* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF10* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR11* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES12* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN13* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF14* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.15*/1617struct diff_range {18int start;19int end;20};2122/* List of all possible return codes of a diff invocation. */23#define DIFF_RC_USE_DIFF_ALGO_FALLBACK -124#define DIFF_RC_OK 025/* Any positive return values are errno values from sys/errno.h */2627struct diff_atom {28struct diff_data *root; /* back pointer to root diff data */2930off_t pos; /* set whether memory-mapped or not */31const uint8_t *at; /* only set if memory-mapped */32off_t len;3334/* This hash is just a very cheap speed up for finding *mismatching*35* atoms. When hashes match, we still need to compare entire atoms to36* find out whether they are indeed identical or not.37* Calculated over all atom bytes with diff_atom_hash_update(). */38unsigned int hash;39};4041/* Mix another atom_byte into the provided hash value and return the result.42* The hash value passed in for the first byte of the atom must be zero. */43unsigned int44diff_atom_hash_update(unsigned int hash, unsigned char atom_byte);4546/* Compare two atoms for equality. Return 0 on success, or errno on failure.47* Set cmp to -1, 0, or 1, just like strcmp(). */48int49diff_atom_cmp(int *cmp,50const struct diff_atom *left,51const struct diff_atom *right);525354/* The atom's index in the entire file. For atoms divided by lines of text, this55* yields the line number (starting with 0). Also works for diff_data that56* reference only a subsection of a file, always reflecting the global position57* in the file (and not the relative position within the subsection). */58#define diff_atom_root_idx(DIFF_DATA, ATOM) \59((ATOM) && ((ATOM) >= (DIFF_DATA)->root->atoms.head) \60? (unsigned int)((ATOM) - ((DIFF_DATA)->root->atoms.head)) \61: (DIFF_DATA)->root->atoms.len)6263/* The atom's index within DIFF_DATA. For atoms divided by lines of text, this64* yields the line number (starting with 0). */65#define diff_atom_idx(DIFF_DATA, ATOM) \66((ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) \67? (unsigned int)((ATOM) - ((DIFF_DATA)->atoms.head)) \68: (DIFF_DATA)->atoms.len)6970#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \71for ((ATOM) = (FIRST_ATOM); \72(ATOM) \73&& ((ATOM) >= (FIRST_ATOM)) \74&& ((ATOM) - (FIRST_ATOM) < (COUNT)); \75(ATOM)++)7677#define diff_data_foreach_atom(ATOM, DIFF_DATA) \78foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len)7980#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \81for ((ATOM) = (FROM); \82(ATOM) \83&& ((ATOM) >= (DIFF_DATA)->atoms.head) \84&& ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \85(ATOM)++)8687#define diff_data_foreach_atom_backwards_from(FROM, ATOM, DIFF_DATA) \88for ((ATOM) = (FROM); \89(ATOM) \90&& ((ATOM) >= (DIFF_DATA)->atoms.head) \91&& ((ATOM) - (DIFF_DATA)->atoms.head >= 0); \92(ATOM)--)9394/* For each file, there is a "root" struct diff_data referencing the entire95* file, which the atoms are parsed from. In recursion of diff algorithm, there96* may be "child" struct diff_data only referencing a subsection of the file,97* re-using the atoms parsing. For "root" structs, atoms_allocated will be98* nonzero, indicating that the array of atoms is owned by that struct. For99* "child" structs, atoms_allocated == 0, to indicate that the struct is100* referencing a subset of atoms. */101struct diff_data {102FILE *f; /* if root diff_data and not memory-mapped */103off_t pos; /* if not memory-mapped */104const uint8_t *data; /* if memory-mapped */105off_t len;106107int atomizer_flags;108ARRAYLIST(struct diff_atom) atoms;109struct diff_data *root;110struct diff_data *current;111void *algo_data;112113int diff_flags;114115int err;116};117118/* Flags set by file atomizer. */119#define DIFF_ATOMIZER_FOUND_BINARY_DATA 0x00000001120#define DIFF_ATOMIZER_FILE_TRUNCATED 0x00000002121122/* Flags set by caller of diff_main(). */123#define DIFF_FLAG_IGNORE_WHITESPACE 0x00000001124#define DIFF_FLAG_SHOW_PROTOTYPES 0x00000002125#define DIFF_FLAG_FORCE_TEXT_DATA 0x00000004126127void diff_data_free(struct diff_data *diff_data);128129struct diff_chunk;130typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t;131132struct diff_result {133int rc;134135/*136* Pointers to diff data passed in via diff_main.137* Do not free these diff_data before freeing the diff_result struct.138*/139struct diff_data *left;140struct diff_data *right;141142diff_chunk_arraylist_t chunks;143};144145enum diff_chunk_type {146CHUNK_EMPTY,147CHUNK_PLUS,148CHUNK_MINUS,149CHUNK_SAME,150CHUNK_ERROR,151};152153enum diff_chunk_type diff_chunk_type(const struct diff_chunk *c);154155struct diff_state;156157/* Signature of a utility function to divide a file into diff atoms.158* An example is diff_atomize_text_by_line() in diff_atomize_text.c.159*160* func_data: context pointer (free to be used by implementation).161* d: struct diff_data with d->data and d->len already set up, and162* d->atoms to be created and d->atomizer_flags to be set up.163*/164typedef int (*diff_atomize_func_t)(void *func_data, struct diff_data *d);165166extern int diff_atomize_text_by_line(void *func_data, struct diff_data *d);167168struct diff_algo_config;169typedef int (*diff_algo_impl_t)(170const struct diff_algo_config *algo_config, struct diff_state *state);171172/* Form a result with all left-side removed and all right-side added, i.e. no173* actual diff algorithm involved. */174int diff_algo_none(const struct diff_algo_config *algo_config,175struct diff_state *state);176177/* Myers Diff tracing from the start all the way through to the end, requiring178* quadratic amounts of memory. This can fail if the required space surpasses179* algo_config->permitted_state_size. */180extern int diff_algo_myers(const struct diff_algo_config *algo_config,181struct diff_state *state);182183/* Myers "Divide et Impera": tracing forwards from the start and backwards from184* the end to find a midpoint that divides the problem into smaller chunks.185* Requires only linear amounts of memory. */186extern int diff_algo_myers_divide(187const struct diff_algo_config *algo_config, struct diff_state *state);188189/* Patience Diff algorithm, which divides a larger diff into smaller chunks. For190* very specific scenarios, it may lead to a complete diff result by itself, but191* needs a fallback algo to solve chunks that don't have common-unique atoms. */192extern int diff_algo_patience(193const struct diff_algo_config *algo_config, struct diff_state *state);194195/* Diff algorithms to use, possibly nested. For example:196*197* struct diff_algo_config myers, patience, myers_divide;198*199* myers = (struct diff_algo_config){200* .impl = diff_algo_myers,201* .permitted_state_size = 32 * 1024 * 1024,202* // When too large, do diff_algo_patience:203* .fallback_algo = &patience,204* };205*206* const struct diff_algo_config patience = (struct diff_algo_config){207* .impl = diff_algo_patience,208* // After subdivision, do Patience again:209* .inner_algo = &patience,210* // If subdivision failed, do Myers Divide et Impera:211* .fallback_algo = &myers_then_myers_divide,212* };213*214* const struct diff_algo_config myers_divide = (struct diff_algo_config){215* .impl = diff_algo_myers_divide,216* // When division succeeded, start from the top:217* .inner_algo = &myers_then_myers_divide,218* // (fallback_algo = NULL implies diff_algo_none).219* };220* struct diff_config config = {221* .algo = &myers,222* ...223* };224* diff_main(&config, ...);225*/226struct diff_algo_config {227diff_algo_impl_t impl;228229/* Fail this algo if it would use more than this amount of memory, and230* instead use fallback_algo (diff_algo_myers). permitted_state_size ==231* 0 means no limitation. */232size_t permitted_state_size;233234/* For algorithms that divide into smaller chunks, use this algorithm to235* solve the divided chunks. */236const struct diff_algo_config *inner_algo;237238/* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large239* state, or diff_algo_patience can't find any common-unique atoms),240* then use this algorithm instead. */241const struct diff_algo_config *fallback_algo;242};243244struct diff_config {245diff_atomize_func_t atomize_func;246void *atomize_func_data;247248const struct diff_algo_config *algo;249250/* How deep to step into subdivisions of a source file, a paranoia /251* safety measure to guard against infinite loops through diff252* algorithms. When the maximum recursion is reached, employ253* diff_algo_none (i.e. remove all left atoms and add all right atoms).254*/255unsigned int max_recursion_depth;256};257258int diff_atomize_file(struct diff_data *d, const struct diff_config *config,259FILE *f, const uint8_t *data, off_t len, int diff_flags);260struct diff_result *diff_main(const struct diff_config *config,261struct diff_data *left,262struct diff_data *right);263void diff_result_free(struct diff_result *result);264int diff_result_contains_printable_chunks(struct diff_result *result);265266267