CoCalc -- pdf-repair.c

bin / mupdf / mupdf-1.7 / source / pdf / pdf-repair.c
⁷⁶⁴³ views
1
#include "mupdf/pdf.h"

#include <limits.h>
#include <string.h>
2

3
/* Scan file for objects and reconstruct xref table */
4

5
/* Defined in PDF 1.7 to be 8388607, but mupdf is more lenient. */
6
#define MAX_OBJECT_NUMBER (10 << 20)
7

8
/*
 * One object located while scanning the file during repair.
 * pdf_repair_xref collects these in a growable array and later copies
 * them into the rebuilt xref table.
 */
struct entry
{
	int num, gen;	/* object number and generation read ahead of the 'obj' keyword */
	int ofs;	/* file offset recorded for the object number token */
	int stm_ofs;	/* stream offset reported by pdf_repair_obj (0 if no stream was seen) */
	int stm_len;	/* stream length reported by pdf_repair_obj; negative means "not measured" */
};
16

17
int
18
pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, int *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int *tmpofs)
19
{
20
	fz_stream *file = doc->file;
21
	pdf_token tok;
22
	int stm_len;
23

24
	*stmofsp = 0;
25
	if (stmlenp)
26
		*stmlenp = -1;
27

28
	stm_len = 0;
29

30
	/* On entry to this function, we know that we've just seen
31
	 * '<int> <int> obj'. We expect the next thing we see to be a
32
	 * pdf object. Regardless of the type of thing we meet next
33
	 * we only need to fully parse it if it is a dictionary. */
34
	tok = pdf_lex(ctx, file, buf);
35

36
	if (tok == PDF_TOK_OPEN_DICT)
37
	{
38
		pdf_obj *dict, *obj;
39

40
		/* Send NULL xref so we don't try to resolve references */
41
		fz_try(ctx)
42
		{
43
			dict = pdf_parse_dict(ctx, doc, file, buf);
44
		}
45
		fz_catch(ctx)
46
		{
47
			fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
48
			/* Don't let a broken object at EOF overwrite a good one */
49
			if (file->eof)
50
				fz_rethrow_message(ctx, "broken object at EOF ignored");
51
			/* Silently swallow the error */
52
			dict = pdf_new_dict(ctx, doc, 2);
53
		}
54

55
		if (encrypt && id)
56
		{
57
			obj = pdf_dict_get(ctx, dict, PDF_NAME_Type);
58
			if (pdf_name_eq(ctx, obj, PDF_NAME_XRef))
59
			{
60
				obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt);
61
				if (obj)
62
				{
63
					pdf_drop_obj(ctx, *encrypt);
64
					*encrypt = pdf_keep_obj(ctx, obj);
65
				}
66

67
				obj = pdf_dict_get(ctx, dict, PDF_NAME_ID);
68
				if (obj)
69
				{
70
					pdf_drop_obj(ctx, *id);
71
					*id = pdf_keep_obj(ctx, obj);
72
				}
73
			}
74
		}
75

76
		obj = pdf_dict_get(ctx, dict, PDF_NAME_Length);
77
		if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj))
78
			stm_len = pdf_to_int(ctx, obj);
79

80
		if (doc->file_reading_linearly && page)
81
		{
82
			obj = pdf_dict_get(ctx, dict, PDF_NAME_Type);
83
			if (pdf_name_eq(ctx, obj, PDF_NAME_Page))
84
			{
85
				pdf_drop_obj(ctx, *page);
86
				*page = pdf_keep_obj(ctx, dict);
87
			}
88
		}
89

90
		pdf_drop_obj(ctx, dict);
91
	}
92

93
	while ( tok != PDF_TOK_STREAM &&
94
		tok != PDF_TOK_ENDOBJ &&
95
		tok != PDF_TOK_ERROR &&
96
		tok != PDF_TOK_EOF &&
97
		tok != PDF_TOK_INT )
98
	{
99
		*tmpofs = fz_tell(ctx, file);
100
		if (*tmpofs < 0)
101
			fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
102
		tok = pdf_lex(ctx, file, buf);
103
	}
104

105
	if (tok == PDF_TOK_STREAM)
106
	{
107
		int c = fz_read_byte(ctx, file);
108
		if (c == '\r') {
109
			c = fz_peek_byte(ctx, file);
110
			if (c == '\n')
111
				fz_read_byte(ctx, file);
112
		}
113

114
		*stmofsp = fz_tell(ctx, file);
115
		if (*stmofsp < 0)
116
			fz_throw(ctx, FZ_ERROR_GENERIC, "cannot seek in file");
117

118
		if (stm_len > 0)
119
		{
120
			fz_seek(ctx, file, *stmofsp + stm_len, 0);
121
			fz_try(ctx)
122
			{
123
				tok = pdf_lex(ctx, file, buf);
124
			}
125
			fz_catch(ctx)
126
			{
127
				fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
128
				fz_warn(ctx, "cannot find endstream token, falling back to scanning");
129
			}
130
			if (tok == PDF_TOK_ENDSTREAM)
131
				goto atobjend;
132
			fz_seek(ctx, file, *stmofsp, 0);
133
		}
134

135
		(void)fz_read(ctx, file, (unsigned char *) buf->scratch, 9);
136

137
		while (memcmp(buf->scratch, "endstream", 9) != 0)
138
		{
139
			c = fz_read_byte(ctx, file);
140
			if (c == EOF)
141
				break;
142
			memmove(&buf->scratch[0], &buf->scratch[1], 8);
143
			buf->scratch[8] = c;
144
		}
145

146
		if (stmlenp)
147
			*stmlenp = fz_tell(ctx, file) - *stmofsp - 9;
148

149
atobjend:
150
		*tmpofs = fz_tell(ctx, file);
151
		if (*tmpofs < 0)
152
			fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
153
		tok = pdf_lex(ctx, file, buf);
154
		if (tok != PDF_TOK_ENDOBJ)
155
			fz_warn(ctx, "object missing 'endobj' token");
156
		else
157
		{
158
			/* Read another token as we always return the next one */
159
			*tmpofs = fz_tell(ctx, file);
160
			if (*tmpofs < 0)
161
				fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
162
			tok = pdf_lex(ctx, file, buf);
163
		}
164
	}
165
	return tok;
166
}
167

168
static void
169
pdf_repair_obj_stm(fz_context *ctx, pdf_document *doc, int num, int gen)
170
{
171
	pdf_obj *obj;
172
	fz_stream *stm = NULL;
173
	pdf_token tok;
174
	int i, n, count;
175
	pdf_lexbuf buf;
176

177
	fz_var(stm);
178

179
	pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
180

181
	fz_try(ctx)
182
	{
183
		obj = pdf_load_object(ctx, doc, num, gen);
184

185
		count = pdf_to_int(ctx, pdf_dict_get(ctx, obj, PDF_NAME_N));
186

187
		pdf_drop_obj(ctx, obj);
188

189
		stm = pdf_open_stream(ctx, doc, num, gen);
190

191
		for (i = 0; i < count; i++)
192
		{
193
			pdf_xref_entry *entry;
194

195
			tok = pdf_lex(ctx, stm, &buf);
196
			if (tok != PDF_TOK_INT)
197
				fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d %d R)", num, gen);
198

199
			n = buf.i;
200
			if (n < 0)
201
			{
202
				fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
203
				continue;
204
			}
205
			else if (n >= pdf_xref_len(ctx, doc))
206
			{
207
				fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
208
				continue;
209
			}
210

211
			entry = pdf_get_populating_xref_entry(ctx, doc, n);
212
			entry->ofs = num;
213
			entry->gen = i;
214
			entry->stm_ofs = 0;
215
			pdf_drop_obj(ctx, entry->obj);
216
			entry->obj = NULL;
217
			entry->type = 'o';
218

219
			tok = pdf_lex(ctx, stm, &buf);
220
			if (tok != PDF_TOK_INT)
221
				fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d %d R)", num, gen);
222
		}
223
	}
224
	fz_always(ctx)
225
	{
226
		fz_drop_stream(ctx, stm);
227
		pdf_lexbuf_fin(ctx, &buf);
228
	}
229
	fz_catch(ctx)
230
	{
231
		fz_rethrow_message(ctx, "cannot load object stream object (%d %d R)", num, gen);
232
	}
233
}
234

235
void
236
pdf_repair_xref(fz_context *ctx, pdf_document *doc)
237
{
238
	pdf_obj *dict, *obj = NULL;
239
	pdf_obj *length;
240

241
	pdf_obj *encrypt = NULL;
242
	pdf_obj *id = NULL;
243
	pdf_obj **roots = NULL;
244
	pdf_obj *info = NULL;
245

246
	struct entry *list = NULL;
247
	int listlen;
248
	int listcap;
249
	int maxnum = 0;
250

251
	int num = 0;
252
	int gen = 0;
253
	int tmpofs, numofs = 0, genofs = 0;
254
	int stm_len, stm_ofs;
255
	pdf_token tok;
256
	int next;
257
	int i, n, c;
258
	pdf_lexbuf *buf = &doc->lexbuf.base;
259
	int num_roots = 0;
260
	int max_roots = 0;
261

262
	fz_var(encrypt);
263
	fz_var(id);
264
	fz_var(roots);
265
	fz_var(num_roots);
266
	fz_var(max_roots);
267
	fz_var(info);
268
	fz_var(list);
269
	fz_var(obj);
270

271
	if (doc->repair_attempted)
272
		fz_throw(ctx, FZ_ERROR_GENERIC, "Repair failed already - not trying again");
273
	doc->repair_attempted = 1;
274

275
	doc->dirty = 1;
276
	/* Can't support incremental update after repair */
277
	doc->freeze_updates = 1;
278

279
	fz_seek(ctx, doc->file, 0, 0);
280

281
	fz_try(ctx)
282
	{
283
		pdf_xref_entry *entry;
284
		listlen = 0;
285
		listcap = 1024;
286
		list = fz_malloc_array(ctx, listcap, sizeof(struct entry));
287

288
		/* look for '%PDF' version marker within first kilobyte of file */
289
		n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, fz_mini(buf->size, 1024));
290

291
		fz_seek(ctx, doc->file, 0, 0);
292
		for (i = 0; i < n - 4; i++)
293
		{
294
			if (memcmp(&buf->scratch[i], "%PDF", 4) == 0)
295
			{
296
				fz_seek(ctx, doc->file, i + 8, 0); /* skip "%PDF-X.Y" */
297
				break;
298
			}
299
		}
300

301
		/* skip comment line after version marker since some generators
302
		 * forget to terminate the comment with a newline */
303
		c = fz_read_byte(ctx, doc->file);
304
		while (c >= 0 && (c == ' ' || c == '%'))
305
			c = fz_read_byte(ctx, doc->file);
306
		fz_unread_byte(ctx, doc->file);
307

308
		while (1)
309
		{
310
			tmpofs = fz_tell(ctx, doc->file);
311
			if (tmpofs < 0)
312
				fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
313

314
			fz_try(ctx)
315
			{
316
				tok = pdf_lex_no_string(ctx, doc->file, buf);
317
			}
318
			fz_catch(ctx)
319
			{
320
				fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
321
				fz_warn(ctx, "ignoring the rest of the file");
322
				break;
323
			}
324

325
			/* If we have the next token already, then we'll jump
326
			 * back here, rather than going through the top of
327
			 * the loop. */
328
		have_next_token:
329

330
			if (tok == PDF_TOK_INT)
331
			{
332
				if (buf->i < 0)
333
				{
334
					num = 0;
335
					gen = 0;
336
					continue;
337
				}
338
				numofs = genofs;
339
				num = gen;
340
				genofs = tmpofs;
341
				gen = buf->i;
342
			}
343

344
			else if (tok == PDF_TOK_OBJ)
345
			{
346
				fz_try(ctx)
347
				{
348
					stm_len = 0;
349
					stm_ofs = 0;
350
					tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs);
351
				}
352
				fz_catch(ctx)
353
				{
354
					fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
355
					/* If we haven't seen a root yet, there is nothing
356
					 * we can do, but give up. Otherwise, we'll make
357
					 * do. */
358
					if (!roots)
359
						fz_rethrow(ctx);
360
					fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen);
361
					break;
362
				}
363

364
				if (num <= 0 || num > MAX_OBJECT_NUMBER)
365
				{
366
					fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen);
367
					goto have_next_token;
368
				}
369

370
				gen = fz_clampi(gen, 0, 65535);
371

372
				if (listlen + 1 == listcap)
373
				{
374
					listcap = (listcap * 3) / 2;
375
					list = fz_resize_array(ctx, list, listcap, sizeof(struct entry));
376
				}
377

378
				list[listlen].num = num;
379
				list[listlen].gen = gen;
380
				list[listlen].ofs = numofs;
381
				list[listlen].stm_ofs = stm_ofs;
382
				list[listlen].stm_len = stm_len;
383
				listlen ++;
384

385
				if (num > maxnum)
386
					maxnum = num;
387

388
				goto have_next_token;
389
			}
390

391
			/* If we find a dictionary it is probably the trailer,
392
			 * but could be a stream (or bogus) dictionary caused
393
			 * by a corrupt file. */
394
			else if (tok == PDF_TOK_OPEN_DICT)
395
			{
396
				fz_try(ctx)
397
				{
398
					dict = pdf_parse_dict(ctx, doc, doc->file, buf);
399
				}
400
				fz_catch(ctx)
401
				{
402
					fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
403
					/* If this was the real trailer dict
404
					 * it was broken, in which case we are
405
					 * in trouble. Keep going though in
406
					 * case this was just a bogus dict. */
407
					continue;
408
				}
409

410
				obj = pdf_dict_get(ctx, dict, PDF_NAME_Encrypt);
411
				if (obj)
412
				{
413
					pdf_drop_obj(ctx, encrypt);
414
					encrypt = pdf_keep_obj(ctx, obj);
415
				}
416

417
				obj = pdf_dict_get(ctx, dict, PDF_NAME_ID);
418
				if (obj && (!id || !encrypt || pdf_dict_get(ctx, dict, PDF_NAME_Encrypt)))
419
				{
420
					pdf_drop_obj(ctx, id);
421
					id = pdf_keep_obj(ctx, obj);
422
				}
423

424
				obj = pdf_dict_get(ctx, dict, PDF_NAME_Root);
425
				if (obj)
426
				{
427
					if (num_roots == max_roots)
428
					{
429
						int new_max_roots = max_roots * 2;
430
						if (new_max_roots == 0)
431
							new_max_roots = 4;
432
						roots = fz_resize_array(ctx, roots, new_max_roots, sizeof(*roots));
433
						max_roots = new_max_roots;
434
					}
435
					roots[num_roots++] = pdf_keep_obj(ctx, obj);
436
				}
437

438
				obj = pdf_dict_get(ctx, dict, PDF_NAME_Info);
439
				if (obj)
440
				{
441
					pdf_drop_obj(ctx, info);
442
					info = pdf_keep_obj(ctx, obj);
443
				}
444

445
				pdf_drop_obj(ctx, dict);
446
				obj = NULL;
447
			}
448

449
			else if (tok == PDF_TOK_EOF)
450
				break;
451
			else
452
			{
453
				if (tok == PDF_TOK_ERROR)
454
					fz_read_byte(ctx, doc->file);
455
				num = 0;
456
				gen = 0;
457
			}
458

459
		}
460

461
		/* make xref reasonable */
462

463
		/*
464
			Dummy access to entry to assure sufficient space in the xref table
465
			and avoid repeated reallocs in the loop
466
		*/
467
		/* Ensure that the first xref table is a 'solid' one from
468
		 * 0 to maxnum. */
469
		pdf_ensure_solid_xref(ctx, doc, maxnum);
470

471
		for (i = 0; i < listlen; i++)
472
		{
473
			entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num);
474
			entry->type = 'n';
475
			entry->ofs = list[i].ofs;
476
			entry->gen = list[i].gen;
477

478
			entry->stm_ofs = list[i].stm_ofs;
479

480
			/* correct stream length for unencrypted documents */
481
			if (!encrypt && list[i].stm_len >= 0)
482
			{
483
				dict = pdf_load_object(ctx, doc, list[i].num, list[i].gen);
484

485
				length = pdf_new_int(ctx, doc, list[i].stm_len);
486
				pdf_dict_put(ctx, dict, PDF_NAME_Length, length);
487
				pdf_drop_obj(ctx, length);
488

489
				pdf_drop_obj(ctx, dict);
490
			}
491
		}
492

493
		entry = pdf_get_populating_xref_entry(ctx, doc, 0);
494
		entry->type = 'f';
495
		entry->ofs = 0;
496
		entry->gen = 65535;
497
		entry->stm_ofs = 0;
498

499
		next = 0;
500
		for (i = pdf_xref_len(ctx, doc) - 1; i >= 0; i--)
501
		{
502
			entry = pdf_get_populating_xref_entry(ctx, doc, i);
503
			if (entry->type == 'f')
504
			{
505
				entry->ofs = next;
506
				if (entry->gen < 65535)
507
					entry->gen ++;
508
				next = i;
509
			}
510
		}
511

512
		/* create a repaired trailer, Root will be added later */
513

514
		obj = pdf_new_dict(ctx, doc, 5);
515
		/* During repair there is only a single xref section */
516
		pdf_set_populating_xref_trailer(ctx, doc, obj);
517
		pdf_drop_obj(ctx, obj);
518
		obj = NULL;
519

520
		obj = pdf_new_int(ctx, doc, maxnum + 1);
521
		pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME_Size, obj);
522
		pdf_drop_obj(ctx, obj);
523
		obj = NULL;
524

525
		if (roots)
526
		{
527
			int i;
528
			for (i = num_roots-1; i > 0; i--)
529
			{
530
				if (pdf_is_dict(ctx, roots[i]))
531
					break;
532
			}
533
			if (i >= 0)
534
			{
535
				pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root, roots[i]);
536
			}
537
		}
538
		if (info)
539
		{
540
			pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME_Info, info);
541
			pdf_drop_obj(ctx, info);
542
			info = NULL;
543
		}
544

545
		if (encrypt)
546
		{
547
			if (pdf_is_indirect(ctx, encrypt))
548
			{
549
				/* create new reference with non-NULL xref pointer */
550
				obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, encrypt), pdf_to_gen(ctx, encrypt));
551
				pdf_drop_obj(ctx, encrypt);
552
				encrypt = obj;
553
				obj = NULL;
554
			}
555
			pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME_Encrypt, encrypt);
556
			pdf_drop_obj(ctx, encrypt);
557
			encrypt = NULL;
558
		}
559

560
		if (id)
561
		{
562
			if (pdf_is_indirect(ctx, id))
563
			{
564
				/* create new reference with non-NULL xref pointer */
565
				obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, id), pdf_to_gen(ctx, id));
566
				pdf_drop_obj(ctx, id);
567
				id = obj;
568
				obj = NULL;
569
			}
570
			pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME_ID, id);
571
			pdf_drop_obj(ctx, id);
572
			id = NULL;
573
		}
574

575
		fz_free(ctx, list);
576
	}
577
	fz_always(ctx)
578
	{
579
		int i;
580

581
		for (i = 0; i < num_roots; i++)
582
			pdf_drop_obj(ctx, roots[i]);
583
		fz_free(ctx, roots);
584
	}
585
	fz_catch(ctx)
586
	{
587
		pdf_drop_obj(ctx, encrypt);
588
		pdf_drop_obj(ctx, id);
589
		pdf_drop_obj(ctx, obj);
590
		pdf_drop_obj(ctx, info);
591
		fz_free(ctx, list);
592
		fz_rethrow(ctx);
593
	}
594
}
595

596
void
597
pdf_repair_obj_stms(fz_context *ctx, pdf_document *doc)
598
{
599
	pdf_obj *dict;
600
	int i;
601
	int xref_len = pdf_xref_len(ctx, doc);
602

603
	for (i = 0; i < xref_len; i++)
604
	{
605
		pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
606

607
		if (entry->stm_ofs)
608
		{
609
			dict = pdf_load_object(ctx, doc, i, 0);
610
			fz_try(ctx)
611
			{
612
				if (pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME_Type), PDF_NAME_ObjStm))
613
					pdf_repair_obj_stm(ctx, doc, i, 0);
614
			}
615
			fz_catch(ctx)
616
			{
617
				fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i);
618
			}
619
			pdf_drop_obj(ctx, dict);
620
		}
621
	}
622

623
	/* Ensure that streamed objects reside inside a known non-streamed object */
624
	for (i = 0; i < xref_len; i++)
625
	{
626
		pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
627

628
		if (entry->type == 'o' && pdf_get_populating_xref_entry(ctx, doc, entry->ofs)->type != 'n')
629
			fz_throw(ctx, FZ_ERROR_GENERIC, "invalid reference to non-object-stream: %d (%d 0 R)", entry->ofs, i);
630
	}
631
}
632

633
Product

Resources

Company