Path: blob/main/contrib/libdiff/lib/diff_atomize_text.c
35065 views
/* Split source by line breaks, and calculate a simplistic checksum. */1/*2* Copyright (c) 2020 Neels Hofmeyr <[email protected]>3*4* Permission to use, copy, modify, and distribute this software for any5* purpose with or without fee is hereby granted, provided that the above6* copyright notice and this permission notice appear in all copies.7*8* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES9* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF10* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR11* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES12* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN13* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF14* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.15*/1617#include <errno.h>18#include <setjmp.h>19#include <signal.h>20#include <stdbool.h>21#include <stdint.h>22#include <stdio.h>23#include <stdlib.h>24#include <unistd.h>25#include <ctype.h>2627#include <arraylist.h>28#include <diff_main.h>2930#include "diff_internal.h"31#include "diff_debug.h"3233unsigned int34diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)35{36return hash * 23 + atom_byte;37}3839static int40diff_data_atomize_text_lines_fd(struct diff_data *d)41{42off_t pos = 0;43const off_t end = pos + d->len;44unsigned int array_size_estimate = d->len / 50;45unsigned int pow2 = 1;46bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);47bool embedded_nul = false;4849while (array_size_estimate >>= 1)50pow2++;5152ARRAYLIST_INIT(d->atoms, 1 << pow2);5354if (fseek(d->root->f, 0L, SEEK_SET) == -1)55return errno;5657while (pos < end) {58off_t line_end = pos;59unsigned int hash = 0;60unsigned char buf[512];61size_t r, i;62struct diff_atom *atom;63int eol = 0;6465while (eol == 0 && line_end < end) {66r = fread(buf, sizeof(char), sizeof(buf), d->root->f);67if (r == 0 && ferror(d->root->f))68return EIO;69i = 0;70while (eol == 0 && i < r) {71if (buf[i] != '\r' && buf[i] != '\n') {72if (!ignore_whitespace73|| !isspace((unsigned char)buf[i]))74hash = diff_atom_hash_update(75hash, buf[i]);76if (buf[i] == '\0')77embedded_nul = true;78line_end++;79} else80eol = buf[i];81i++;82}83}8485/* When not at the end of data, the line ending char ('\r' or86* '\n') must follow */87if (line_end < end)88line_end++;89/* If that was an '\r', also pull in any following '\n' */90if (line_end < end && eol == '\r') {91if (fseeko(d->root->f, line_end, SEEK_SET) == -1)92return errno;93r = fread(buf, sizeof(char), sizeof(buf), d->root->f);94if (r == 0 && ferror(d->root->f))95return EIO;96if (r > 0 && buf[0] == '\n')97line_end++;98}99100/* Record the found line as diff atom */101ARRAYLIST_ADD(atom, d->atoms);102if (!atom)103return ENOMEM;104105*atom = (struct diff_atom){106.root = d,107.pos = pos,108.at = NULL, /* atom data is not memory-mapped */109.len = line_end - pos,110.hash = hash,111};112113/* Starting point for next line: */114pos = line_end;115if (fseeko(d->root->f, pos, SEEK_SET) == -1)116return errno;117}118119/* File are considered binary if they contain embedded '\0' bytes. */120if (embedded_nul)121d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;122123return DIFF_RC_OK;124}125126static sigjmp_buf diff_data_signal_env;127static void128diff_data_signal_handler(int sig)129{130siglongjmp(diff_data_signal_env, sig);131}132133static int134diff_data_atomize_text_lines_mmap(struct diff_data *d)135{136struct sigaction act, oact;137const uint8_t *volatile pos = d->data;138const uint8_t *end = pos + d->len;139bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);140bool embedded_nul = false;141unsigned int array_size_estimate = d->len / 50;142unsigned int pow2 = 1;143while (array_size_estimate >>= 1)144pow2++;145146ARRAYLIST_INIT(d->atoms, 1 << pow2);147148sigemptyset(&act.sa_mask);149act.sa_flags = 0;150act.sa_handler = diff_data_signal_handler;151sigaction(SIGBUS, &act, &oact);152if (sigsetjmp(diff_data_signal_env, 0) > 0) {153/*154* The file was truncated while we were reading it. Set155* the end pointer to the beginning of the line we were156* trying to read, adjust the file length, and set a flag.157*/158end = pos;159d->len = end - d->data;160d->atomizer_flags |= DIFF_ATOMIZER_FILE_TRUNCATED;161}162while (pos < end) {163const uint8_t *line_start = pos, *line_end = pos;164unsigned int hash = 0;165166while (line_end < end && *line_end != '\r' && *line_end != '\n') {167if (!ignore_whitespace168|| !isspace((unsigned char)*line_end))169hash = diff_atom_hash_update(hash, *line_end);170if (*line_end == '\0')171embedded_nul = true;172line_end++;173}174175/* When not at the end of data, the line ending char ('\r' or176* '\n') must follow */177if (line_end < end && *line_end == '\r')178line_end++;179if (line_end < end && *line_end == '\n')180line_end++;181182/* Record the found line as diff atom */183struct diff_atom *atom;184ARRAYLIST_ADD(atom, d->atoms);185if (!atom)186return ENOMEM;187188*atom = (struct diff_atom){189.root = d,190.pos = (off_t)(line_start - d->data),191.at = line_start,192.len = line_end - line_start,193.hash = hash,194};195196/* Starting point for next line: */197pos = line_end;198}199sigaction(SIGBUS, &oact, NULL);200201/* File are considered binary if they contain embedded '\0' bytes. */202if (embedded_nul)203d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;204205return DIFF_RC_OK;206}207208static int209diff_data_atomize_text_lines(struct diff_data *d)210{211if (d->data == NULL)212return diff_data_atomize_text_lines_fd(d);213else214return diff_data_atomize_text_lines_mmap(d);215}216217int218diff_atomize_text_by_line(void *func_data, struct diff_data *d)219{220return diff_data_atomize_text_lines(d);221}222223224