/* pdf-parse.c -- mupdf-1.7/source/pdf/pdf-parse.c (as displayed on CoCalc) */
#include "mupdf/pdf.h"
2

3
fz_rect *
4
pdf_to_rect(fz_context *ctx, pdf_obj *array, fz_rect *r)
5
{
6
	float a = pdf_to_real(ctx, pdf_array_get(ctx, array, 0));
7
	float b = pdf_to_real(ctx, pdf_array_get(ctx, array, 1));
8
	float c = pdf_to_real(ctx, pdf_array_get(ctx, array, 2));
9
	float d = pdf_to_real(ctx, pdf_array_get(ctx, array, 3));
10
	r->x0 = fz_min(a, c);
11
	r->y0 = fz_min(b, d);
12
	r->x1 = fz_max(a, c);
13
	r->y1 = fz_max(b, d);
14
	return r;
15
}
16

17
fz_matrix *
18
pdf_to_matrix(fz_context *ctx, pdf_obj *array, fz_matrix *m)
19
{
20
	m->a = pdf_to_real(ctx, pdf_array_get(ctx, array, 0));
21
	m->b = pdf_to_real(ctx, pdf_array_get(ctx, array, 1));
22
	m->c = pdf_to_real(ctx, pdf_array_get(ctx, array, 2));
23
	m->d = pdf_to_real(ctx, pdf_array_get(ctx, array, 3));
24
	m->e = pdf_to_real(ctx, pdf_array_get(ctx, array, 4));
25
	m->f = pdf_to_real(ctx, pdf_array_get(ctx, array, 5));
26
	return m;
27
}
28

29
/* Convert Unicode/PdfDocEncoding string into utf-8 */
30
char *
31
pdf_to_utf8(fz_context *ctx, pdf_document *doc, pdf_obj *src)
32
{
33
	fz_buffer *strmbuf = NULL;
34
	unsigned char *srcptr;
35
	char *dstptr, *dst;
36
	int srclen;
37
	int dstlen = 0;
38
	int ucs;
39
	int i;
40

41
	fz_var(strmbuf);
42
	fz_try(ctx)
43
	{
44
		if (pdf_is_string(ctx, src))
45
		{
46
			srcptr = (unsigned char *) pdf_to_str_buf(ctx, src);
47
			srclen = pdf_to_str_len(ctx, src);
48
		}
49
		else if (pdf_is_stream(ctx, doc, pdf_to_num(ctx, src), pdf_to_gen(ctx, src)))
50
		{
51
			strmbuf = pdf_load_stream(ctx, doc, pdf_to_num(ctx, src), pdf_to_gen(ctx, src));
52
			srclen = fz_buffer_storage(ctx, strmbuf, (unsigned char **)&srcptr);
53
		}
54
		else
55
		{
56
			srclen = 0;
57
		}
58

59
		if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
60
		{
61
			for (i = 2; i + 1 < srclen; i += 2)
62
			{
63
				ucs = srcptr[i] << 8 | srcptr[i+1];
64
				dstlen += fz_runelen(ucs);
65
			}
66

67
			dstptr = dst = fz_malloc(ctx, dstlen + 1);
68

69
			for (i = 2; i + 1 < srclen; i += 2)
70
			{
71
				ucs = srcptr[i] << 8 | srcptr[i+1];
72
				dstptr += fz_runetochar(dstptr, ucs);
73
			}
74
		}
75
		else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
76
		{
77
			for (i = 2; i + 1 < srclen; i += 2)
78
			{
79
				ucs = srcptr[i] | srcptr[i+1] << 8;
80
				dstlen += fz_runelen(ucs);
81
			}
82

83
			dstptr = dst = fz_malloc(ctx, dstlen + 1);
84

85
			for (i = 2; i + 1 < srclen; i += 2)
86
			{
87
				ucs = srcptr[i] | srcptr[i+1] << 8;
88
				dstptr += fz_runetochar(dstptr, ucs);
89
			}
90
		}
91
		else
92
		{
93
			for (i = 0; i < srclen; i++)
94
				dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]);
95

96
			dstptr = dst = fz_malloc(ctx, dstlen + 1);
97

98
			for (i = 0; i < srclen; i++)
99
			{
100
				ucs = pdf_doc_encoding[srcptr[i]];
101
				dstptr += fz_runetochar(dstptr, ucs);
102
			}
103
		}
104
	}
105
	fz_always(ctx)
106
	{
107
		fz_drop_buffer(ctx, strmbuf);
108
	}
109
	fz_catch(ctx)
110
	{
111
		fz_rethrow(ctx);
112
	}
113

114
	*dstptr = '\0';
115
	return dst;
116
}
117

118
/* Convert Unicode/PdfDocEncoding string into ucs-2 */
119
unsigned short *
120
pdf_to_ucs2(fz_context *ctx, pdf_document *doc, pdf_obj *src)
121
{
122
	unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(ctx, src);
123
	unsigned short *dstptr, *dst;
124
	int srclen = pdf_to_str_len(ctx, src);
125
	int i;
126

127
	if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
128
	{
129
		dstptr = dst = fz_malloc_array(ctx, (srclen - 2) / 2 + 1, sizeof(short));
130
		for (i = 2; i + 1 < srclen; i += 2)
131
			*dstptr++ = srcptr[i] << 8 | srcptr[i+1];
132
	}
133
	else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
134
	{
135
		dstptr = dst = fz_malloc_array(ctx, (srclen - 2) / 2 + 1, sizeof(short));
136
		for (i = 2; i + 1 < srclen; i += 2)
137
			*dstptr++ = srcptr[i] | srcptr[i+1] << 8;
138
	}
139
	else
140
	{
141
		dstptr = dst = fz_malloc_array(ctx, srclen + 1, sizeof(short));
142
		for (i = 0; i < srclen; i++)
143
			*dstptr++ = pdf_doc_encoding[srcptr[i]];
144
	}
145

146
	*dstptr = '\0';
147
	return dst;
148
}
149

150
/* allow to convert to UCS-2 without the need for an fz_context */
151
/* (buffer must be at least (fz_to_str_len(src) + 1) * 2 bytes in size) */
152
void
153
pdf_to_ucs2_buf(fz_context *ctx, unsigned short *buffer, pdf_obj *src)
154
{
155
	unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(ctx, src);
156
	unsigned short *dstptr = buffer;
157
	int srclen = pdf_to_str_len(ctx, src);
158
	int i;
159

160
	if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
161
	{
162
		for (i = 2; i + 1 < srclen; i += 2)
163
			*dstptr++ = srcptr[i] << 8 | srcptr[i+1];
164
	}
165
	else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
166
	{
167
		for (i = 2; i + 1 < srclen; i += 2)
168
			*dstptr++ = srcptr[i] | srcptr[i+1] << 8;
169
	}
170
	else
171
	{
172
		for (i = 0; i < srclen; i++)
173
			*dstptr++ = pdf_doc_encoding[srcptr[i]];
174
	}
175

176
	*dstptr = '\0';
177
}
178

179
/* Convert UCS-2 string into PdfDocEncoding for authentication */
180
char *
181
pdf_from_ucs2(fz_context *ctx, pdf_document *doc, unsigned short *src)
182
{
183
	int i, j, len;
184
	char *docstr;
185

186
	len = 0;
187
	while (src[len])
188
		len++;
189

190
	docstr = fz_malloc(ctx, len + 1);
191

192
	for (i = 0; i < len; i++)
193
	{
194
		/* shortcut: check if the character has the same code point in both encodings */
195
		if (0 < src[i] && src[i] < 256 && pdf_doc_encoding[src[i]] == src[i]) {
196
			docstr[i] = src[i];
197
			continue;
198
		}
199

200
		/* search through pdf_docencoding for the character's code point */
201
		for (j = 0; j < 256; j++)
202
			if (pdf_doc_encoding[j] == src[i])
203
				break;
204
		docstr[i] = j;
205

206
		/* fail, if a character can't be encoded */
207
		if (!docstr[i])
208
		{
209
			fz_free(ctx, docstr);
210
			return NULL;
211
		}
212
	}
213
	docstr[len] = '\0';
214

215
	return docstr;
216
}
217

218
pdf_obj *
219
pdf_to_utf8_name(fz_context *ctx, pdf_document *doc, pdf_obj *src)
220
{
221
	char *buf = pdf_to_utf8(ctx, doc, src);
222
	pdf_obj *dst = pdf_new_name(ctx, doc, buf);
223
	fz_free(ctx, buf);
224
	return dst;
225
}
226

227
pdf_obj *
228
pdf_parse_array(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
229
{
230
	pdf_obj *ary = NULL;
231
	pdf_obj *obj = NULL;
232
	int a = 0, b = 0, n = 0;
233
	pdf_token tok;
234
	pdf_obj *op = NULL;
235

236
	fz_var(obj);
237

238
	ary = pdf_new_array(ctx, doc, 4);
239

240
	fz_try(ctx)
241
	{
242
		while (1)
243
		{
244
			tok = pdf_lex(ctx, file, buf);
245

246
			if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
247
			{
248
				if (n > 0)
249
				{
250
					obj = pdf_new_int(ctx, doc, a);
251
					pdf_array_push(ctx, ary, obj);
252
					pdf_drop_obj(ctx, obj);
253
					obj = NULL;
254
				}
255
				if (n > 1)
256
				{
257
					obj = pdf_new_int(ctx, doc, b);
258
					pdf_array_push(ctx, ary, obj);
259
					pdf_drop_obj(ctx, obj);
260
					obj = NULL;
261
				}
262
				n = 0;
263
			}
264

265
			if (tok == PDF_TOK_INT && n == 2)
266
			{
267
				obj = pdf_new_int(ctx, doc, a);
268
				pdf_array_push(ctx, ary, obj);
269
				pdf_drop_obj(ctx, obj);
270
				obj = NULL;
271
				a = b;
272
				n --;
273
			}
274

275
			switch (tok)
276
			{
277
			case PDF_TOK_CLOSE_ARRAY:
278
				op = ary;
279
				goto end;
280

281
			case PDF_TOK_INT:
282
				if (n == 0)
283
					a = buf->i;
284
				if (n == 1)
285
					b = buf->i;
286
				n ++;
287
				break;
288

289
			case PDF_TOK_R:
290
				if (n != 2)
291
					fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse indirect reference in array");
292
				obj = pdf_new_indirect(ctx, doc, a, b);
293
				pdf_array_push(ctx, ary, obj);
294
				pdf_drop_obj(ctx, obj);
295
				obj = NULL;
296
				n = 0;
297
				break;
298

299
			case PDF_TOK_OPEN_ARRAY:
300
				obj = pdf_parse_array(ctx, doc, file, buf);
301
				pdf_array_push(ctx, ary, obj);
302
				pdf_drop_obj(ctx, obj);
303
				obj = NULL;
304
				break;
305

306
			case PDF_TOK_OPEN_DICT:
307
				obj = pdf_parse_dict(ctx, doc, file, buf);
308
				pdf_array_push(ctx, ary, obj);
309
				pdf_drop_obj(ctx, obj);
310
				obj = NULL;
311
				break;
312

313
			case PDF_TOK_NAME:
314
				obj = pdf_new_name(ctx, doc, buf->scratch);
315
				pdf_array_push(ctx, ary, obj);
316
				pdf_drop_obj(ctx, obj);
317
				obj = NULL;
318
				break;
319
			case PDF_TOK_REAL:
320
				obj = pdf_new_real(ctx, doc, buf->f);
321
				pdf_array_push(ctx, ary, obj);
322
				pdf_drop_obj(ctx, obj);
323
				obj = NULL;
324
				break;
325
			case PDF_TOK_STRING:
326
				obj = pdf_new_string(ctx, doc, buf->scratch, buf->len);
327
				pdf_array_push(ctx, ary, obj);
328
				pdf_drop_obj(ctx, obj);
329
				obj = NULL;
330
				break;
331
			case PDF_TOK_TRUE:
332
				obj = pdf_new_bool(ctx, doc, 1);
333
				pdf_array_push(ctx, ary, obj);
334
				pdf_drop_obj(ctx, obj);
335
				obj = NULL;
336
				break;
337
			case PDF_TOK_FALSE:
338
				obj = pdf_new_bool(ctx, doc, 0);
339
				pdf_array_push(ctx, ary, obj);
340
				pdf_drop_obj(ctx, obj);
341
				obj = NULL;
342
				break;
343
			case PDF_TOK_NULL:
344
				obj = pdf_new_null(ctx, doc);
345
				pdf_array_push(ctx, ary, obj);
346
				pdf_drop_obj(ctx, obj);
347
				obj = NULL;
348
				break;
349

350
			default:
351
				fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse token in array");
352
			}
353
		}
354
end:
355
		{}
356
	}
357
	fz_catch(ctx)
358
	{
359
		pdf_drop_obj(ctx, obj);
360
		pdf_drop_obj(ctx, ary);
361
		fz_rethrow_message(ctx, "cannot parse array");
362
	}
363
	return op;
364
}
365

366
pdf_obj *
367
pdf_parse_dict(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
368
{
369
	pdf_obj *dict;
370
	pdf_obj *key = NULL;
371
	pdf_obj *val = NULL;
372
	pdf_token tok;
373
	int a, b;
374

375
	dict = pdf_new_dict(ctx, doc, 8);
376

377
	fz_var(key);
378
	fz_var(val);
379

380
	fz_try(ctx)
381
	{
382
		while (1)
383
		{
384
			tok = pdf_lex(ctx, file, buf);
385
	skip:
386
			if (tok == PDF_TOK_CLOSE_DICT)
387
				break;
388

389
			/* for BI .. ID .. EI in content streams */
390
			if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
391
				break;
392

393
			if (tok != PDF_TOK_NAME)
394
				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid key in dict");
395

396
			key = pdf_new_name(ctx, doc, buf->scratch);
397

398
			tok = pdf_lex(ctx, file, buf);
399

400
			switch (tok)
401
			{
402
			case PDF_TOK_OPEN_ARRAY:
403
				val = pdf_parse_array(ctx, doc, file, buf);
404
				break;
405

406
			case PDF_TOK_OPEN_DICT:
407
				val = pdf_parse_dict(ctx, doc, file, buf);
408
				break;
409

410
			case PDF_TOK_NAME: val = pdf_new_name(ctx, doc, buf->scratch); break;
411
			case PDF_TOK_REAL: val = pdf_new_real(ctx, doc, buf->f); break;
412
			case PDF_TOK_STRING: val = pdf_new_string(ctx, doc, buf->scratch, buf->len); break;
413
			case PDF_TOK_TRUE: val = pdf_new_bool(ctx, doc, 1); break;
414
			case PDF_TOK_FALSE: val = pdf_new_bool(ctx, doc, 0); break;
415
			case PDF_TOK_NULL: val = pdf_new_null(ctx, doc); break;
416

417
			case PDF_TOK_INT:
418
				/* 64-bit to allow for numbers > INT_MAX and overflow */
419
				a = buf->i;
420
				tok = pdf_lex(ctx, file, buf);
421
				if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
422
					(tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
423
				{
424
					val = pdf_new_int(ctx, doc, a);
425
					pdf_dict_put(ctx, dict, key, val);
426
					pdf_drop_obj(ctx, val);
427
					val = NULL;
428
					pdf_drop_obj(ctx, key);
429
					key = NULL;
430
					goto skip;
431
				}
432
				if (tok == PDF_TOK_INT)
433
				{
434
					b = buf->i;
435
					tok = pdf_lex(ctx, file, buf);
436
					if (tok == PDF_TOK_R)
437
					{
438
						val = pdf_new_indirect(ctx, doc, a, b);
439
						break;
440
					}
441
				}
442
				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid indirect reference in dict");
443

444
			default:
445
				fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in dict");
446
			}
447

448
			pdf_dict_put(ctx, dict, key, val);
449
			pdf_drop_obj(ctx, val);
450
			val = NULL;
451
			pdf_drop_obj(ctx, key);
452
			key = NULL;
453
		}
454
	}
455
	fz_catch(ctx)
456
	{
457
		pdf_drop_obj(ctx, dict);
458
		pdf_drop_obj(ctx, key);
459
		pdf_drop_obj(ctx, val);
460
		fz_rethrow_message(ctx, "cannot parse dict");
461
	}
462
	return dict;
463
}
464

465
pdf_obj *
466
pdf_parse_stm_obj(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
467
{
468
	pdf_token tok;
469

470
	tok = pdf_lex(ctx, file, buf);
471

472
	switch (tok)
473
	{
474
	case PDF_TOK_OPEN_ARRAY:
475
		return pdf_parse_array(ctx, doc, file, buf);
476
	case PDF_TOK_OPEN_DICT:
477
		return pdf_parse_dict(ctx, doc, file, buf);
478
	case PDF_TOK_NAME: return pdf_new_name(ctx, doc, buf->scratch); break;
479
	case PDF_TOK_REAL: return pdf_new_real(ctx, doc, buf->f); break;
480
	case PDF_TOK_STRING: return pdf_new_string(ctx, doc, buf->scratch, buf->len); break;
481
	case PDF_TOK_TRUE: return pdf_new_bool(ctx, doc, 1); break;
482
	case PDF_TOK_FALSE: return pdf_new_bool(ctx, doc, 0); break;
483
	case PDF_TOK_NULL: return pdf_new_null(ctx, doc); break;
484
	case PDF_TOK_INT: return pdf_new_int(ctx, doc, buf->i); break;
485
	default: fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in object stream");
486
	}
487
}
488

489
pdf_obj *
490
pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc,
491
	fz_stream *file, pdf_lexbuf *buf,
492
	int *onum, int *ogen, int *ostmofs, int *try_repair)
493
{
494
	pdf_obj *obj = NULL;
495
	int num = 0, gen = 0, stm_ofs;
496
	pdf_token tok;
497
	int a, b;
498

499
	fz_var(obj);
500

501
	tok = pdf_lex(ctx, file, buf);
502
	if (tok != PDF_TOK_INT)
503
	{
504
		if (try_repair)
505
			*try_repair = 1;
506
		fz_throw(ctx, FZ_ERROR_GENERIC, "expected object number");
507
	}
508
	num = buf->i;
509

510
	tok = pdf_lex(ctx, file, buf);
511
	if (tok != PDF_TOK_INT)
512
	{
513
		if (try_repair)
514
			*try_repair = 1;
515
		fz_throw(ctx, FZ_ERROR_GENERIC, "expected generation number (%d ? obj)", num);
516
	}
517
	gen = buf->i;
518

519
	tok = pdf_lex(ctx, file, buf);
520
	if (tok != PDF_TOK_OBJ)
521
	{
522
		if (try_repair)
523
			*try_repair = 1;
524
		fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'obj' keyword (%d %d ?)", num, gen);
525
	}
526

527
	tok = pdf_lex(ctx, file, buf);
528

529
	switch (tok)
530
	{
531
	case PDF_TOK_OPEN_ARRAY:
532
		obj = pdf_parse_array(ctx, doc, file, buf);
533
		break;
534

535
	case PDF_TOK_OPEN_DICT:
536
		obj = pdf_parse_dict(ctx, doc, file, buf);
537
		break;
538

539
	case PDF_TOK_NAME: obj = pdf_new_name(ctx, doc, buf->scratch); break;
540
	case PDF_TOK_REAL: obj = pdf_new_real(ctx, doc, buf->f); break;
541
	case PDF_TOK_STRING: obj = pdf_new_string(ctx, doc, buf->scratch, buf->len); break;
542
	case PDF_TOK_TRUE: obj = pdf_new_bool(ctx, doc, 1); break;
543
	case PDF_TOK_FALSE: obj = pdf_new_bool(ctx, doc, 0); break;
544
	case PDF_TOK_NULL: obj = pdf_new_null(ctx, doc); break;
545

546
	case PDF_TOK_INT:
547
		a = buf->i;
548
		tok = pdf_lex(ctx, file, buf);
549

550
		if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
551
		{
552
			obj = pdf_new_int(ctx, doc, a);
553
			goto skip;
554
		}
555
		if (tok == PDF_TOK_INT)
556
		{
557
			b = buf->i;
558
			tok = pdf_lex(ctx, file, buf);
559
			if (tok == PDF_TOK_R)
560
			{
561
				obj = pdf_new_indirect(ctx, doc, a, b);
562
				break;
563
			}
564
		}
565
		fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'R' keyword (%d %d R)", num, gen);
566

567
	case PDF_TOK_ENDOBJ:
568
		obj = pdf_new_null(ctx, doc);
569
		goto skip;
570

571
	default:
572
		fz_throw(ctx, FZ_ERROR_GENERIC, "syntax error in object (%d %d R)", num, gen);
573
	}
574

575
	fz_try(ctx)
576
	{
577
		tok = pdf_lex(ctx, file, buf);
578
	}
579
	fz_catch(ctx)
580
	{
581
		pdf_drop_obj(ctx, obj);
582
		fz_rethrow_message(ctx, "cannot parse indirect object (%d %d R)", num, gen);
583
	}
584

585
skip:
586
	if (tok == PDF_TOK_STREAM)
587
	{
588
		int c = fz_read_byte(ctx, file);
589
		while (c == ' ')
590
			c = fz_read_byte(ctx, file);
591
		if (c == '\r')
592
		{
593
			c = fz_peek_byte(ctx, file);
594
			if (c != '\n')
595
				fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
596
			else
597
				fz_read_byte(ctx, file);
598
		}
599
		stm_ofs = fz_tell(ctx, file);
600
	}
601
	else if (tok == PDF_TOK_ENDOBJ)
602
	{
603
		stm_ofs = 0;
604
	}
605
	else
606
	{
607
		fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
608
		stm_ofs = 0;
609
	}
610

611
	if (onum) *onum = num;
612
	if (ogen) *ogen = gen;
613
	if (ostmofs) *ostmofs = stm_ofs;
614
	return obj;
615
}
616

617
Product

Resources

Company