Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
7643 views
1
#include "mupdf/pdf.h"
2
3
/* Scan file for objects and reconstruct xref table */
4
5
/* Defined in PDF 1.7 as 8388607, but mupdf is more lenient. */
6
#define MAX_OBJECT_NUMBER (10 << 20)
7
8
/* One object found while scanning the file during repair. */
struct entry
{
	int num;	/* object number taken from the '<num> <gen> obj' header */
	int gen;	/* generation number taken from the same header */
	int ofs;	/* file offset recorded for the object number token */
	int stm_ofs;	/* offset of the stream data, or 0 if no stream was seen */
	int stm_len;	/* stream length found by scanning, or -1 if not measured */
};
16
17
int
18
pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int *tmpofs)
19
{
20
fz_stream *file = doc->file;
21
pdf_token tok;
22
int stm_len;
23
24
*stmofsp = 0;
25
if (stmlenp)
26
*stmlenp = -1;
27
28
stm_len = 0;
29
30
/* On entry to this function, we know that we've just seen
31
* '<int> <int> obj'. We expect the next thing we see to be a
32
* pdf object. Regardless of the type of thing we meet next
33
* we only need to fully parse it if it is a dictionary. */
34
tok = pdf_lex(ctx, file, buf);
35
36
if (tok == PDF_TOK_OPEN_DICT)
37
{
38
pdf_obj *dict, *obj;
39
40
/* Send NULL xref so we don't try to resolve references */
41
fz_try(ctx)
42
{
43
dict = pdf_parse_dict(ctx, doc, file, buf);
44
}
45
fz_catch(ctx)
46
{
47
fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
48
/* Don't let a broken object at EOF overwrite a good one */
49
if (file->eof)
50
fz_rethrow_message(ctx, "broken object at EOF ignored");
51
/* Silently swallow the error */
52
dict = pdf_new_dict(ctx, doc, 2);
53
}
54
55
if (encrypt && id)
56
{
57
obj = pdf_dict_get(ctx, dict, PDF_NAME_Type);
58
if (pdf_name_eq(ctx, obj, PDF_NAME_XRef))
59
{
60
obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt);
61
if (obj)
62
{
63
pdf_drop_obj(ctx, *encrypt);
64
*encrypt = pdf_keep_obj(ctx, obj);
65
}
66
67
obj = pdf_dict_get(ctx, dict, PDF_NAME_ID);
68
if (obj)
69
{
70
pdf_drop_obj(ctx, *id);
71
*id = pdf_keep_obj(ctx, obj);
72
}
73
}
74
}
75
76
obj = pdf_dict_get(ctx, dict, PDF_NAME_Length);
77
if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj))
78
stm_len = pdf_to_int(ctx, obj);
79
80
if (doc->file_reading_linearly && page)
81
{
82
obj = pdf_dict_get(ctx, dict, PDF_NAME_Type);
83
if (pdf_name_eq(ctx, obj, PDF_NAME_Page))
84
{
85
pdf_drop_obj(ctx, *page);
86
*page = pdf_keep_obj(ctx, dict);
87
}
88
}
89
90
pdf_drop_obj(ctx, dict);
91
}
92
93
while ( tok != PDF_TOK_STREAM &&
94
tok != PDF_TOK_ENDOBJ &&
95
tok != PDF_TOK_ERROR &&
96
tok != PDF_TOK_EOF &&
97
tok != PDF_TOK_INT )
98
{
99
*tmpofs = fz_tell(ctx, file);
100
if (*tmpofs < 0)
101
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
102
tok = pdf_lex(ctx, file, buf);
103
}
104
105
if (tok == PDF_TOK_STREAM)
106
{
107
int c = fz_read_byte(ctx, file);
108
if (c == '\r') {
109
c = fz_peek_byte(ctx, file);
110
if (c == '\n')
111
fz_read_byte(ctx, file);
112
}
113
114
*stmofsp = fz_tell(ctx, file);
115
if (*stmofsp < 0)
116
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot seek in file");
117
118
if (stm_len > 0)
119
{
120
fz_seek(ctx, file, *stmofsp + stm_len, 0);
121
fz_try(ctx)
122
{
123
tok = pdf_lex(ctx, file, buf);
124
}
125
fz_catch(ctx)
126
{
127
fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
128
fz_warn(ctx, "cannot find endstream token, falling back to scanning");
129
}
130
if (tok == PDF_TOK_ENDSTREAM)
131
goto atobjend;
132
fz_seek(ctx, file, *stmofsp, 0);
133
}
134
135
(void)fz_read(ctx, file, (unsigned char *) buf->scratch, 9);
136
137
while (memcmp(buf->scratch, "endstream", 9) != 0)
138
{
139
c = fz_read_byte(ctx, file);
140
if (c == EOF)
141
break;
142
memmove(&buf->scratch[0], &buf->scratch[1], 8);
143
buf->scratch[8] = c;
144
}
145
146
if (stmlenp)
147
*stmlenp = fz_tell(ctx, file) - *stmofsp - 9;
148
149
atobjend:
150
*tmpofs = fz_tell(ctx, file);
151
if (*tmpofs < 0)
152
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
153
tok = pdf_lex(ctx, file, buf);
154
if (tok != PDF_TOK_ENDOBJ)
155
fz_warn(ctx, "object missing 'endobj' token");
156
else
157
{
158
/* Read another token as we always return the next one */
159
*tmpofs = fz_tell(ctx, file);
160
if (*tmpofs < 0)
161
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
162
tok = pdf_lex(ctx, file, buf);
163
}
164
}
165
return tok;
166
}
167
168
static void
169
pdf_repair_obj_stm(fz_context *ctx, pdf_document *doc, int num, int gen)
170
{
171
pdf_obj *obj;
172
fz_stream *stm = NULL;
173
pdf_token tok;
174
int i, n, count;
175
pdf_lexbuf buf;
176
177
fz_var(stm);
178
179
pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
180
181
fz_try(ctx)
182
{
183
obj = pdf_load_object(ctx, doc, num, gen);
184
185
count = pdf_to_int(ctx, pdf_dict_get(ctx, obj, PDF_NAME_N));
186
187
pdf_drop_obj(ctx, obj);
188
189
stm = pdf_open_stream(ctx, doc, num, gen);
190
191
for (i = 0; i < count; i++)
192
{
193
pdf_xref_entry *entry;
194
195
tok = pdf_lex(ctx, stm, &buf);
196
if (tok != PDF_TOK_INT)
197
fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d %d R)", num, gen);
198
199
n = buf.i;
200
if (n < 0)
201
{
202
fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
203
continue;
204
}
205
else if (n >= pdf_xref_len(ctx, doc))
206
{
207
fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
208
continue;
209
}
210
211
entry = pdf_get_populating_xref_entry(ctx, doc, n);
212
entry->ofs = num;
213
entry->gen = i;
214
entry->stm_ofs = 0;
215
pdf_drop_obj(ctx, entry->obj);
216
entry->obj = NULL;
217
entry->type = 'o';
218
219
tok = pdf_lex(ctx, stm, &buf);
220
if (tok != PDF_TOK_INT)
221
fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d %d R)", num, gen);
222
}
223
}
224
fz_always(ctx)
225
{
226
fz_drop_stream(ctx, stm);
227
pdf_lexbuf_fin(ctx, &buf);
228
}
229
fz_catch(ctx)
230
{
231
fz_rethrow_message(ctx, "cannot load object stream object (%d %d R)", num, gen);
232
}
233
}
234
235
void
236
pdf_repair_xref(fz_context *ctx, pdf_document *doc)
237
{
238
pdf_obj *dict, *obj = NULL;
239
pdf_obj *length;
240
241
pdf_obj *encrypt = NULL;
242
pdf_obj *id = NULL;
243
pdf_obj **roots = NULL;
244
pdf_obj *info = NULL;
245
246
struct entry *list = NULL;
247
int listlen;
248
int listcap;
249
int maxnum = 0;
250
251
int num = 0;
252
int gen = 0;
253
int tmpofs, numofs = 0, genofs = 0;
254
int stm_len, stm_ofs;
255
pdf_token tok;
256
int next;
257
int i, n, c;
258
pdf_lexbuf *buf = &doc->lexbuf.base;
259
int num_roots = 0;
260
int max_roots = 0;
261
262
fz_var(encrypt);
263
fz_var(id);
264
fz_var(roots);
265
fz_var(num_roots);
266
fz_var(max_roots);
267
fz_var(info);
268
fz_var(list);
269
fz_var(obj);
270
271
if (doc->repair_attempted)
272
fz_throw(ctx, FZ_ERROR_GENERIC, "Repair failed already - not trying again");
273
doc->repair_attempted = 1;
274
275
doc->dirty = 1;
276
/* Can't support incremental update after repair */
277
doc->freeze_updates = 1;
278
279
fz_seek(ctx, doc->file, 0, 0);
280
281
fz_try(ctx)
282
{
283
pdf_xref_entry *entry;
284
listlen = 0;
285
listcap = 1024;
286
list = fz_malloc_array(ctx, listcap, sizeof(struct entry));
287
288
/* look for '%PDF' version marker within first kilobyte of file */
289
n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, fz_mini(buf->size, 1024));
290
291
fz_seek(ctx, doc->file, 0, 0);
292
for (i = 0; i < n - 4; i++)
293
{
294
if (memcmp(&buf->scratch[i], "%PDF", 4) == 0)
295
{
296
fz_seek(ctx, doc->file, i + 8, 0); /* skip "%PDF-X.Y" */
297
break;
298
}
299
}
300
301
/* skip comment line after version marker since some generators
302
* forget to terminate the comment with a newline */
303
c = fz_read_byte(ctx, doc->file);
304
while (c >= 0 && (c == ' ' || c == '%'))
305
c = fz_read_byte(ctx, doc->file);
306
fz_unread_byte(ctx, doc->file);
307
308
while (1)
309
{
310
tmpofs = fz_tell(ctx, doc->file);
311
if (tmpofs < 0)
312
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
313
314
fz_try(ctx)
315
{
316
tok = pdf_lex_no_string(ctx, doc->file, buf);
317
}
318
fz_catch(ctx)
319
{
320
fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
321
fz_warn(ctx, "ignoring the rest of the file");
322
break;
323
}
324
325
/* If we have the next token already, then we'll jump
326
* back here, rather than going through the top of
327
* the loop. */
328
have_next_token:
329
330
if (tok == PDF_TOK_INT)
331
{
332
if (buf->i < 0)
333
{
334
num = 0;
335
gen = 0;
336
continue;
337
}
338
numofs = genofs;
339
num = gen;
340
genofs = tmpofs;
341
gen = buf->i;
342
}
343
344
else if (tok == PDF_TOK_OBJ)
345
{
346
fz_try(ctx)
347
{
348
stm_len = 0;
349
stm_ofs = 0;
350
tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs);
351
}
352
fz_catch(ctx)
353
{
354
fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
355
/* If we haven't seen a root yet, there is nothing
356
* we can do, but give up. Otherwise, we'll make
357
* do. */
358
if (!roots)
359
fz_rethrow(ctx);
360
fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen);
361
break;
362
}
363
364
if (num <= 0 || num > MAX_OBJECT_NUMBER)
365
{
366
fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen);
367
goto have_next_token;
368
}
369
370
gen = fz_clampi(gen, 0, 65535);
371
372
if (listlen + 1 == listcap)
373
{
374
listcap = (listcap * 3) / 2;
375
list = fz_resize_array(ctx, list, listcap, sizeof(struct entry));
376
}
377
378
list[listlen].num = num;
379
list[listlen].gen = gen;
380
list[listlen].ofs = numofs;
381
list[listlen].stm_ofs = stm_ofs;
382
list[listlen].stm_len = stm_len;
383
listlen ++;
384
385
if (num > maxnum)
386
maxnum = num;
387
388
goto have_next_token;
389
}
390
391
/* If we find a dictionary it is probably the trailer,
392
* but could be a stream (or bogus) dictionary caused
393
* by a corrupt file. */
394
else if (tok == PDF_TOK_OPEN_DICT)
395
{
396
fz_try(ctx)
397
{
398
dict = pdf_parse_dict(ctx, doc, doc->file, buf);
399
}
400
fz_catch(ctx)
401
{
402
fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
403
/* If this was the real trailer dict
404
* it was broken, in which case we are
405
* in trouble. Keep going though in
406
* case this was just a bogus dict. */
407
continue;
408
}
409
410
obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt);
411
if (obj)
412
{
413
pdf_drop_obj(ctx, encrypt);
414
encrypt = pdf_keep_obj(ctx, obj);
415
}
416
417
obj = pdf_dict_get(ctx, dict, PDF_NAME_ID);
418
if (obj && (!id || !encrypt || pdf_dict_get(ctx, dict, PDF_NAME_Encrypt)))
419
{
420
pdf_drop_obj(ctx, id);
421
id = pdf_keep_obj(ctx, obj);
422
}
423
424
obj = pdf_dict_get(ctx, dict, PDF_NAME_Root);
425
if (obj)
426
{
427
if (num_roots == max_roots)
428
{
429
int new_max_roots = max_roots * 2;
430
if (new_max_roots == 0)
431
new_max_roots = 4;
432
roots = fz_resize_array(ctx, roots, new_max_roots, sizeof(*roots));
433
max_roots = new_max_roots;
434
}
435
roots[num_roots++] = pdf_keep_obj(ctx, obj);
436
}
437
438
obj = pdf_dict_get(ctx, dict, PDF_NAME_Info);
439
if (obj)
440
{
441
pdf_drop_obj(ctx, info);
442
info = pdf_keep_obj(ctx, obj);
443
}
444
445
pdf_drop_obj(ctx, dict);
446
obj = NULL;
447
}
448
449
else if (tok == PDF_TOK_EOF)
450
break;
451
else
452
{
453
if (tok == PDF_TOK_ERROR)
454
fz_read_byte(ctx, doc->file);
455
num = 0;
456
gen = 0;
457
}
458
459
}
460
461
/* make xref reasonable */
462
463
/*
464
Dummy access to entry to assure sufficient space in the xref table
465
and avoid repeated reallocs in the loop
466
*/
467
/* Ensure that the first xref table is a 'solid' one from
468
* 0 to maxnum. */
469
pdf_ensure_solid_xref(ctx, doc, maxnum);
470
471
for (i = 0; i < listlen; i++)
472
{
473
entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num);
474
entry->type = 'n';
475
entry->ofs = list[i].ofs;
476
entry->gen = list[i].gen;
477
478
entry->stm_ofs = list[i].stm_ofs;
479
480
/* correct stream length for unencrypted documents */
481
if (!encrypt && list[i].stm_len >= 0)
482
{
483
dict = pdf_load_object(ctx, doc, list[i].num, list[i].gen);
484
485
length = pdf_new_int(ctx, doc, list[i].stm_len);
486
pdf_dict_put(ctx, dict, PDF_NAME_Length, length);
487
pdf_drop_obj(ctx, length);
488
489
pdf_drop_obj(ctx, dict);
490
}
491
}
492
493
entry = pdf_get_populating_xref_entry(ctx, doc, 0);
494
entry->type = 'f';
495
entry->ofs = 0;
496
entry->gen = 65535;
497
entry->stm_ofs = 0;
498
499
next = 0;
500
for (i = pdf_xref_len(ctx, doc) - 1; i >= 0; i--)
501
{
502
entry = pdf_get_populating_xref_entry(ctx, doc, i);
503
if (entry->type == 'f')
504
{
505
entry->ofs = next;
506
if (entry->gen < 65535)
507
entry->gen ++;
508
next = i;
509
}
510
}
511
512
/* create a repaired trailer, Root will be added later */
513
514
obj = pdf_new_dict(ctx, doc, 5);
515
/* During repair there is only a single xref section */
516
pdf_set_populating_xref_trailer(ctx, doc, obj);
517
pdf_drop_obj(ctx, obj);
518
obj = NULL;
519
520
obj = pdf_new_int(ctx, doc, maxnum + 1);
521
pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME_Size, obj);
522
pdf_drop_obj(ctx, obj);
523
obj = NULL;
524
525
if (roots)
526
{
527
int i;
528
for (i = num_roots-1; i > 0; i--)
529
{
530
if (pdf_is_dict(ctx, roots[i]))
531
break;
532
}
533
if (i >= 0)
534
{
535
pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root, roots[i]);
536
}
537
}
538
if (info)
539
{
540
pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME_Info, info);
541
pdf_drop_obj(ctx, info);
542
info = NULL;
543
}
544
545
if (encrypt)
546
{
547
if (pdf_is_indirect(ctx, encrypt))
548
{
549
/* create new reference with non-NULL xref pointer */
550
obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, encrypt), pdf_to_gen(ctx, encrypt));
551
pdf_drop_obj(ctx, encrypt);
552
encrypt = obj;
553
obj = NULL;
554
}
555
pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME_Encrypt, encrypt);
556
pdf_drop_obj(ctx, encrypt);
557
encrypt = NULL;
558
}
559
560
if (id)
561
{
562
if (pdf_is_indirect(ctx, id))
563
{
564
/* create new reference with non-NULL xref pointer */
565
obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, id), pdf_to_gen(ctx, id));
566
pdf_drop_obj(ctx, id);
567
id = obj;
568
obj = NULL;
569
}
570
pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME_ID, id);
571
pdf_drop_obj(ctx, id);
572
id = NULL;
573
}
574
575
fz_free(ctx, list);
576
}
577
fz_always(ctx)
578
{
579
int i;
580
581
for (i = 0; i < num_roots; i++)
582
pdf_drop_obj(ctx, roots[i]);
583
fz_free(ctx, roots);
584
}
585
fz_catch(ctx)
586
{
587
pdf_drop_obj(ctx, encrypt);
588
pdf_drop_obj(ctx, id);
589
pdf_drop_obj(ctx, obj);
590
pdf_drop_obj(ctx, info);
591
fz_free(ctx, list);
592
fz_rethrow(ctx);
593
}
594
}
595
596
void
597
pdf_repair_obj_stms(fz_context *ctx, pdf_document *doc)
598
{
599
pdf_obj *dict;
600
int i;
601
int xref_len = pdf_xref_len(ctx, doc);
602
603
for (i = 0; i < xref_len; i++)
604
{
605
pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
606
607
if (entry->stm_ofs)
608
{
609
dict = pdf_load_object(ctx, doc, i, 0);
610
fz_try(ctx)
611
{
612
if (pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME_Type), PDF_NAME_ObjStm))
613
pdf_repair_obj_stm(ctx, doc, i, 0);
614
}
615
fz_catch(ctx)
616
{
617
fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i);
618
}
619
pdf_drop_obj(ctx, dict);
620
}
621
}
622
623
/* Ensure that streamed objects reside inside a known non-streamed object */
624
for (i = 0; i < xref_len; i++)
625
{
626
pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
627
628
if (entry->type == 'o' && pdf_get_populating_xref_entry(ctx, doc, entry->ofs)->type != 'n')
629
fz_throw(ctx, FZ_ERROR_GENERIC, "invalid reference to non-object-stream: %d (%d 0 R)", entry->ofs, i);
630
}
631
}
632
633