Book a Demo!
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Policies | Sign Up | Sign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/libdiff/lib/diff_atomize_text.c
35065 views
1
/* Split source by line breaks, and calculate a simplistic checksum. */
2
/*
3
* Copyright (c) 2020 Neels Hofmeyr <[email protected]>
4
*
5
* Permission to use, copy, modify, and distribute this software for any
6
* purpose with or without fee is hereby granted, provided that the above
7
* copyright notice and this permission notice appear in all copies.
8
*
9
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
*/
17
18
#include <errno.h>
19
#include <setjmp.h>
20
#include <signal.h>
21
#include <stdbool.h>
22
#include <stdint.h>
23
#include <stdio.h>
24
#include <stdlib.h>
25
#include <unistd.h>
26
#include <ctype.h>
27
28
#include <arraylist.h>
29
#include <diff_main.h>
30
31
#include "diff_internal.h"
32
#include "diff_debug.h"
33
34
/*
 * Fold one more byte of an atom into its running checksum.
 *
 * hash is the checksum accumulated so far and atom_byte the next byte of
 * the atom.  The result is hash * 23 + atom_byte, evaluated entirely in
 * unsigned arithmetic so that overflow wraps around instead of being
 * undefined.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	return hash * 23u + (unsigned int)atom_byte;
}
39
40
static int
41
diff_data_atomize_text_lines_fd(struct diff_data *d)
42
{
43
off_t pos = 0;
44
const off_t end = pos + d->len;
45
unsigned int array_size_estimate = d->len / 50;
46
unsigned int pow2 = 1;
47
bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
48
bool embedded_nul = false;
49
50
while (array_size_estimate >>= 1)
51
pow2++;
52
53
ARRAYLIST_INIT(d->atoms, 1 << pow2);
54
55
if (fseek(d->root->f, 0L, SEEK_SET) == -1)
56
return errno;
57
58
while (pos < end) {
59
off_t line_end = pos;
60
unsigned int hash = 0;
61
unsigned char buf[512];
62
size_t r, i;
63
struct diff_atom *atom;
64
int eol = 0;
65
66
while (eol == 0 && line_end < end) {
67
r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
68
if (r == 0 && ferror(d->root->f))
69
return EIO;
70
i = 0;
71
while (eol == 0 && i < r) {
72
if (buf[i] != '\r' && buf[i] != '\n') {
73
if (!ignore_whitespace
74
|| !isspace((unsigned char)buf[i]))
75
hash = diff_atom_hash_update(
76
hash, buf[i]);
77
if (buf[i] == '\0')
78
embedded_nul = true;
79
line_end++;
80
} else
81
eol = buf[i];
82
i++;
83
}
84
}
85
86
/* When not at the end of data, the line ending char ('\r' or
87
* '\n') must follow */
88
if (line_end < end)
89
line_end++;
90
/* If that was an '\r', also pull in any following '\n' */
91
if (line_end < end && eol == '\r') {
92
if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
93
return errno;
94
r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
95
if (r == 0 && ferror(d->root->f))
96
return EIO;
97
if (r > 0 && buf[0] == '\n')
98
line_end++;
99
}
100
101
/* Record the found line as diff atom */
102
ARRAYLIST_ADD(atom, d->atoms);
103
if (!atom)
104
return ENOMEM;
105
106
*atom = (struct diff_atom){
107
.root = d,
108
.pos = pos,
109
.at = NULL, /* atom data is not memory-mapped */
110
.len = line_end - pos,
111
.hash = hash,
112
};
113
114
/* Starting point for next line: */
115
pos = line_end;
116
if (fseeko(d->root->f, pos, SEEK_SET) == -1)
117
return errno;
118
}
119
120
/* File are considered binary if they contain embedded '\0' bytes. */
121
if (embedded_nul)
122
d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
123
124
return DIFF_RC_OK;
125
}
126
127
/* Jump target used by diff_data_signal_handler(); set with sigsetjmp(). */
static sigjmp_buf diff_data_signal_env;

/*
 * Signal handler that abandons the interrupted code and resumes at the
 * sigsetjmp() point stored in diff_data_signal_env, passing the signal
 * number on as the value sigsetjmp() appears to return.
 */
static void
diff_data_signal_handler(int sig)
{
	siglongjmp(diff_data_signal_env, sig);
}
133
134
static int
135
diff_data_atomize_text_lines_mmap(struct diff_data *d)
136
{
137
struct sigaction act, oact;
138
const uint8_t *volatile pos = d->data;
139
const uint8_t *end = pos + d->len;
140
bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
141
bool embedded_nul = false;
142
unsigned int array_size_estimate = d->len / 50;
143
unsigned int pow2 = 1;
144
while (array_size_estimate >>= 1)
145
pow2++;
146
147
ARRAYLIST_INIT(d->atoms, 1 << pow2);
148
149
sigemptyset(&act.sa_mask);
150
act.sa_flags = 0;
151
act.sa_handler = diff_data_signal_handler;
152
sigaction(SIGBUS, &act, &oact);
153
if (sigsetjmp(diff_data_signal_env, 0) > 0) {
154
/*
155
* The file was truncated while we were reading it. Set
156
* the end pointer to the beginning of the line we were
157
* trying to read, adjust the file length, and set a flag.
158
*/
159
end = pos;
160
d->len = end - d->data;
161
d->atomizer_flags |= DIFF_ATOMIZER_FILE_TRUNCATED;
162
}
163
while (pos < end) {
164
const uint8_t *line_start = pos, *line_end = pos;
165
unsigned int hash = 0;
166
167
while (line_end < end && *line_end != '\r' && *line_end != '\n') {
168
if (!ignore_whitespace
169
|| !isspace((unsigned char)*line_end))
170
hash = diff_atom_hash_update(hash, *line_end);
171
if (*line_end == '\0')
172
embedded_nul = true;
173
line_end++;
174
}
175
176
/* When not at the end of data, the line ending char ('\r' or
177
* '\n') must follow */
178
if (line_end < end && *line_end == '\r')
179
line_end++;
180
if (line_end < end && *line_end == '\n')
181
line_end++;
182
183
/* Record the found line as diff atom */
184
struct diff_atom *atom;
185
ARRAYLIST_ADD(atom, d->atoms);
186
if (!atom)
187
return ENOMEM;
188
189
*atom = (struct diff_atom){
190
.root = d,
191
.pos = (off_t)(line_start - d->data),
192
.at = line_start,
193
.len = line_end - line_start,
194
.hash = hash,
195
};
196
197
/* Starting point for next line: */
198
pos = line_end;
199
}
200
sigaction(SIGBUS, &oact, NULL);
201
202
/* File are considered binary if they contain embedded '\0' bytes. */
203
if (embedded_nul)
204
d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
205
206
return DIFF_RC_OK;
207
}
208
209
static int
210
diff_data_atomize_text_lines(struct diff_data *d)
211
{
212
if (d->data == NULL)
213
return diff_data_atomize_text_lines_fd(d);
214
else
215
return diff_data_atomize_text_lines_mmap(d);
216
}
217
218
/*
 * Atomizer entry point that splits d into one atom per text line.
 *
 * func_data is accepted but not used.  Returns whatever
 * diff_data_atomize_text_lines() returns for d.
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	(void)func_data;

	return diff_data_atomize_text_lines(d);
}
223
224