Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
7639 views
1
#include "mupdf/pdf.h"
2
3
fz_rect *
4
pdf_to_rect(fz_context *ctx, pdf_obj *array, fz_rect *r)
5
{
6
float a = pdf_to_real(ctx, pdf_array_get(ctx, array, 0));
7
float b = pdf_to_real(ctx, pdf_array_get(ctx, array, 1));
8
float c = pdf_to_real(ctx, pdf_array_get(ctx, array, 2));
9
float d = pdf_to_real(ctx, pdf_array_get(ctx, array, 3));
10
r->x0 = fz_min(a, c);
11
r->y0 = fz_min(b, d);
12
r->x1 = fz_max(a, c);
13
r->y1 = fz_max(b, d);
14
return r;
15
}
16
17
fz_matrix *
18
pdf_to_matrix(fz_context *ctx, pdf_obj *array, fz_matrix *m)
19
{
20
m->a = pdf_to_real(ctx, pdf_array_get(ctx, array, 0));
21
m->b = pdf_to_real(ctx, pdf_array_get(ctx, array, 1));
22
m->c = pdf_to_real(ctx, pdf_array_get(ctx, array, 2));
23
m->d = pdf_to_real(ctx, pdf_array_get(ctx, array, 3));
24
m->e = pdf_to_real(ctx, pdf_array_get(ctx, array, 4));
25
m->f = pdf_to_real(ctx, pdf_array_get(ctx, array, 5));
26
return m;
27
}
28
29
/* Convert Unicode/PdfDocEncoding string into utf-8 */
30
char *
31
pdf_to_utf8(fz_context *ctx, pdf_document *doc, pdf_obj *src)
32
{
33
fz_buffer *strmbuf = NULL;
34
unsigned char *srcptr;
35
char *dstptr, *dst;
36
int srclen;
37
int dstlen = 0;
38
int ucs;
39
int i;
40
41
fz_var(strmbuf);
42
fz_try(ctx)
43
{
44
if (pdf_is_string(ctx, src))
45
{
46
srcptr = (unsigned char *) pdf_to_str_buf(ctx, src);
47
srclen = pdf_to_str_len(ctx, src);
48
}
49
else if (pdf_is_stream(ctx, doc, pdf_to_num(ctx, src), pdf_to_gen(ctx, src)))
50
{
51
strmbuf = pdf_load_stream(ctx, doc, pdf_to_num(ctx, src), pdf_to_gen(ctx, src));
52
srclen = fz_buffer_storage(ctx, strmbuf, (unsigned char **)&srcptr);
53
}
54
else
55
{
56
srclen = 0;
57
}
58
59
if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
60
{
61
for (i = 2; i + 1 < srclen; i += 2)
62
{
63
ucs = srcptr[i] << 8 | srcptr[i+1];
64
dstlen += fz_runelen(ucs);
65
}
66
67
dstptr = dst = fz_malloc(ctx, dstlen + 1);
68
69
for (i = 2; i + 1 < srclen; i += 2)
70
{
71
ucs = srcptr[i] << 8 | srcptr[i+1];
72
dstptr += fz_runetochar(dstptr, ucs);
73
}
74
}
75
else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
76
{
77
for (i = 2; i + 1 < srclen; i += 2)
78
{
79
ucs = srcptr[i] | srcptr[i+1] << 8;
80
dstlen += fz_runelen(ucs);
81
}
82
83
dstptr = dst = fz_malloc(ctx, dstlen + 1);
84
85
for (i = 2; i + 1 < srclen; i += 2)
86
{
87
ucs = srcptr[i] | srcptr[i+1] << 8;
88
dstptr += fz_runetochar(dstptr, ucs);
89
}
90
}
91
else
92
{
93
for (i = 0; i < srclen; i++)
94
dstlen += fz_runelen(pdf_doc_encoding[srcptr[i]]);
95
96
dstptr = dst = fz_malloc(ctx, dstlen + 1);
97
98
for (i = 0; i < srclen; i++)
99
{
100
ucs = pdf_doc_encoding[srcptr[i]];
101
dstptr += fz_runetochar(dstptr, ucs);
102
}
103
}
104
}
105
fz_always(ctx)
106
{
107
fz_drop_buffer(ctx, strmbuf);
108
}
109
fz_catch(ctx)
110
{
111
fz_rethrow(ctx);
112
}
113
114
*dstptr = '\0';
115
return dst;
116
}
117
118
/* Convert Unicode/PdfDocEncoding string into ucs-2 */
119
unsigned short *
120
pdf_to_ucs2(fz_context *ctx, pdf_document *doc, pdf_obj *src)
121
{
122
unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(ctx, src);
123
unsigned short *dstptr, *dst;
124
int srclen = pdf_to_str_len(ctx, src);
125
int i;
126
127
if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
128
{
129
dstptr = dst = fz_malloc_array(ctx, (srclen - 2) / 2 + 1, sizeof(short));
130
for (i = 2; i + 1 < srclen; i += 2)
131
*dstptr++ = srcptr[i] << 8 | srcptr[i+1];
132
}
133
else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
134
{
135
dstptr = dst = fz_malloc_array(ctx, (srclen - 2) / 2 + 1, sizeof(short));
136
for (i = 2; i + 1 < srclen; i += 2)
137
*dstptr++ = srcptr[i] | srcptr[i+1] << 8;
138
}
139
else
140
{
141
dstptr = dst = fz_malloc_array(ctx, srclen + 1, sizeof(short));
142
for (i = 0; i < srclen; i++)
143
*dstptr++ = pdf_doc_encoding[srcptr[i]];
144
}
145
146
*dstptr = '\0';
147
return dst;
148
}
149
150
/* allow to convert to UCS-2 without the need for an fz_context */
151
/* (buffer must be at least (fz_to_str_len(src) + 1) * 2 bytes in size) */
152
void
153
pdf_to_ucs2_buf(fz_context *ctx, unsigned short *buffer, pdf_obj *src)
154
{
155
unsigned char *srcptr = (unsigned char *) pdf_to_str_buf(ctx, src);
156
unsigned short *dstptr = buffer;
157
int srclen = pdf_to_str_len(ctx, src);
158
int i;
159
160
if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
161
{
162
for (i = 2; i + 1 < srclen; i += 2)
163
*dstptr++ = srcptr[i] << 8 | srcptr[i+1];
164
}
165
else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
166
{
167
for (i = 2; i + 1 < srclen; i += 2)
168
*dstptr++ = srcptr[i] | srcptr[i+1] << 8;
169
}
170
else
171
{
172
for (i = 0; i < srclen; i++)
173
*dstptr++ = pdf_doc_encoding[srcptr[i]];
174
}
175
176
*dstptr = '\0';
177
}
178
179
/* Convert UCS-2 string into PdfDocEncoding for authentication */
180
char *
181
pdf_from_ucs2(fz_context *ctx, pdf_document *doc, unsigned short *src)
182
{
183
int i, j, len;
184
char *docstr;
185
186
len = 0;
187
while (src[len])
188
len++;
189
190
docstr = fz_malloc(ctx, len + 1);
191
192
for (i = 0; i < len; i++)
193
{
194
/* shortcut: check if the character has the same code point in both encodings */
195
if (0 < src[i] && src[i] < 256 && pdf_doc_encoding[src[i]] == src[i]) {
196
docstr[i] = src[i];
197
continue;
198
}
199
200
/* search through pdf_docencoding for the character's code point */
201
for (j = 0; j < 256; j++)
202
if (pdf_doc_encoding[j] == src[i])
203
break;
204
docstr[i] = j;
205
206
/* fail, if a character can't be encoded */
207
if (!docstr[i])
208
{
209
fz_free(ctx, docstr);
210
return NULL;
211
}
212
}
213
docstr[len] = '\0';
214
215
return docstr;
216
}
217
218
pdf_obj *
219
pdf_to_utf8_name(fz_context *ctx, pdf_document *doc, pdf_obj *src)
220
{
221
char *buf = pdf_to_utf8(ctx, doc, src);
222
pdf_obj *dst = pdf_new_name(ctx, doc, buf);
223
fz_free(ctx, buf);
224
return dst;
225
}
226
227
pdf_obj *
228
pdf_parse_array(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
229
{
230
pdf_obj *ary = NULL;
231
pdf_obj *obj = NULL;
232
int a = 0, b = 0, n = 0;
233
pdf_token tok;
234
pdf_obj *op = NULL;
235
236
fz_var(obj);
237
238
ary = pdf_new_array(ctx, doc, 4);
239
240
fz_try(ctx)
241
{
242
while (1)
243
{
244
tok = pdf_lex(ctx, file, buf);
245
246
if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
247
{
248
if (n > 0)
249
{
250
obj = pdf_new_int(ctx, doc, a);
251
pdf_array_push(ctx, ary, obj);
252
pdf_drop_obj(ctx, obj);
253
obj = NULL;
254
}
255
if (n > 1)
256
{
257
obj = pdf_new_int(ctx, doc, b);
258
pdf_array_push(ctx, ary, obj);
259
pdf_drop_obj(ctx, obj);
260
obj = NULL;
261
}
262
n = 0;
263
}
264
265
if (tok == PDF_TOK_INT && n == 2)
266
{
267
obj = pdf_new_int(ctx, doc, a);
268
pdf_array_push(ctx, ary, obj);
269
pdf_drop_obj(ctx, obj);
270
obj = NULL;
271
a = b;
272
n --;
273
}
274
275
switch (tok)
276
{
277
case PDF_TOK_CLOSE_ARRAY:
278
op = ary;
279
goto end;
280
281
case PDF_TOK_INT:
282
if (n == 0)
283
a = buf->i;
284
if (n == 1)
285
b = buf->i;
286
n ++;
287
break;
288
289
case PDF_TOK_R:
290
if (n != 2)
291
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse indirect reference in array");
292
obj = pdf_new_indirect(ctx, doc, a, b);
293
pdf_array_push(ctx, ary, obj);
294
pdf_drop_obj(ctx, obj);
295
obj = NULL;
296
n = 0;
297
break;
298
299
case PDF_TOK_OPEN_ARRAY:
300
obj = pdf_parse_array(ctx, doc, file, buf);
301
pdf_array_push(ctx, ary, obj);
302
pdf_drop_obj(ctx, obj);
303
obj = NULL;
304
break;
305
306
case PDF_TOK_OPEN_DICT:
307
obj = pdf_parse_dict(ctx, doc, file, buf);
308
pdf_array_push(ctx, ary, obj);
309
pdf_drop_obj(ctx, obj);
310
obj = NULL;
311
break;
312
313
case PDF_TOK_NAME:
314
obj = pdf_new_name(ctx, doc, buf->scratch);
315
pdf_array_push(ctx, ary, obj);
316
pdf_drop_obj(ctx, obj);
317
obj = NULL;
318
break;
319
case PDF_TOK_REAL:
320
obj = pdf_new_real(ctx, doc, buf->f);
321
pdf_array_push(ctx, ary, obj);
322
pdf_drop_obj(ctx, obj);
323
obj = NULL;
324
break;
325
case PDF_TOK_STRING:
326
obj = pdf_new_string(ctx, doc, buf->scratch, buf->len);
327
pdf_array_push(ctx, ary, obj);
328
pdf_drop_obj(ctx, obj);
329
obj = NULL;
330
break;
331
case PDF_TOK_TRUE:
332
obj = pdf_new_bool(ctx, doc, 1);
333
pdf_array_push(ctx, ary, obj);
334
pdf_drop_obj(ctx, obj);
335
obj = NULL;
336
break;
337
case PDF_TOK_FALSE:
338
obj = pdf_new_bool(ctx, doc, 0);
339
pdf_array_push(ctx, ary, obj);
340
pdf_drop_obj(ctx, obj);
341
obj = NULL;
342
break;
343
case PDF_TOK_NULL:
344
obj = pdf_new_null(ctx, doc);
345
pdf_array_push(ctx, ary, obj);
346
pdf_drop_obj(ctx, obj);
347
obj = NULL;
348
break;
349
350
default:
351
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse token in array");
352
}
353
}
354
end:
355
{}
356
}
357
fz_catch(ctx)
358
{
359
pdf_drop_obj(ctx, obj);
360
pdf_drop_obj(ctx, ary);
361
fz_rethrow_message(ctx, "cannot parse array");
362
}
363
return op;
364
}
365
366
pdf_obj *
367
pdf_parse_dict(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
368
{
369
pdf_obj *dict;
370
pdf_obj *key = NULL;
371
pdf_obj *val = NULL;
372
pdf_token tok;
373
int a, b;
374
375
dict = pdf_new_dict(ctx, doc, 8);
376
377
fz_var(key);
378
fz_var(val);
379
380
fz_try(ctx)
381
{
382
while (1)
383
{
384
tok = pdf_lex(ctx, file, buf);
385
skip:
386
if (tok == PDF_TOK_CLOSE_DICT)
387
break;
388
389
/* for BI .. ID .. EI in content streams */
390
if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
391
break;
392
393
if (tok != PDF_TOK_NAME)
394
fz_throw(ctx, FZ_ERROR_GENERIC, "invalid key in dict");
395
396
key = pdf_new_name(ctx, doc, buf->scratch);
397
398
tok = pdf_lex(ctx, file, buf);
399
400
switch (tok)
401
{
402
case PDF_TOK_OPEN_ARRAY:
403
val = pdf_parse_array(ctx, doc, file, buf);
404
break;
405
406
case PDF_TOK_OPEN_DICT:
407
val = pdf_parse_dict(ctx, doc, file, buf);
408
break;
409
410
case PDF_TOK_NAME: val = pdf_new_name(ctx, doc, buf->scratch); break;
411
case PDF_TOK_REAL: val = pdf_new_real(ctx, doc, buf->f); break;
412
case PDF_TOK_STRING: val = pdf_new_string(ctx, doc, buf->scratch, buf->len); break;
413
case PDF_TOK_TRUE: val = pdf_new_bool(ctx, doc, 1); break;
414
case PDF_TOK_FALSE: val = pdf_new_bool(ctx, doc, 0); break;
415
case PDF_TOK_NULL: val = pdf_new_null(ctx, doc); break;
416
417
case PDF_TOK_INT:
418
/* 64-bit to allow for numbers > INT_MAX and overflow */
419
a = buf->i;
420
tok = pdf_lex(ctx, file, buf);
421
if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
422
(tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
423
{
424
val = pdf_new_int(ctx, doc, a);
425
pdf_dict_put(ctx, dict, key, val);
426
pdf_drop_obj(ctx, val);
427
val = NULL;
428
pdf_drop_obj(ctx, key);
429
key = NULL;
430
goto skip;
431
}
432
if (tok == PDF_TOK_INT)
433
{
434
b = buf->i;
435
tok = pdf_lex(ctx, file, buf);
436
if (tok == PDF_TOK_R)
437
{
438
val = pdf_new_indirect(ctx, doc, a, b);
439
break;
440
}
441
}
442
fz_throw(ctx, FZ_ERROR_GENERIC, "invalid indirect reference in dict");
443
444
default:
445
fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in dict");
446
}
447
448
pdf_dict_put(ctx, dict, key, val);
449
pdf_drop_obj(ctx, val);
450
val = NULL;
451
pdf_drop_obj(ctx, key);
452
key = NULL;
453
}
454
}
455
fz_catch(ctx)
456
{
457
pdf_drop_obj(ctx, dict);
458
pdf_drop_obj(ctx, key);
459
pdf_drop_obj(ctx, val);
460
fz_rethrow_message(ctx, "cannot parse dict");
461
}
462
return dict;
463
}
464
465
pdf_obj *
466
pdf_parse_stm_obj(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
467
{
468
pdf_token tok;
469
470
tok = pdf_lex(ctx, file, buf);
471
472
switch (tok)
473
{
474
case PDF_TOK_OPEN_ARRAY:
475
return pdf_parse_array(ctx, doc, file, buf);
476
case PDF_TOK_OPEN_DICT:
477
return pdf_parse_dict(ctx, doc, file, buf);
478
case PDF_TOK_NAME: return pdf_new_name(ctx, doc, buf->scratch); break;
479
case PDF_TOK_REAL: return pdf_new_real(ctx, doc, buf->f); break;
480
case PDF_TOK_STRING: return pdf_new_string(ctx, doc, buf->scratch, buf->len); break;
481
case PDF_TOK_TRUE: return pdf_new_bool(ctx, doc, 1); break;
482
case PDF_TOK_FALSE: return pdf_new_bool(ctx, doc, 0); break;
483
case PDF_TOK_NULL: return pdf_new_null(ctx, doc); break;
484
case PDF_TOK_INT: return pdf_new_int(ctx, doc, buf->i); break;
485
default: fz_throw(ctx, FZ_ERROR_GENERIC, "unknown token in object stream");
486
}
487
}
488
489
pdf_obj *
490
pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc,
491
fz_stream *file, pdf_lexbuf *buf,
492
int *onum, int *ogen, int *ostmofs, int *try_repair)
493
{
494
pdf_obj *obj = NULL;
495
int num = 0, gen = 0, stm_ofs;
496
pdf_token tok;
497
int a, b;
498
499
fz_var(obj);
500
501
tok = pdf_lex(ctx, file, buf);
502
if (tok != PDF_TOK_INT)
503
{
504
if (try_repair)
505
*try_repair = 1;
506
fz_throw(ctx, FZ_ERROR_GENERIC, "expected object number");
507
}
508
num = buf->i;
509
510
tok = pdf_lex(ctx, file, buf);
511
if (tok != PDF_TOK_INT)
512
{
513
if (try_repair)
514
*try_repair = 1;
515
fz_throw(ctx, FZ_ERROR_GENERIC, "expected generation number (%d ? obj)", num);
516
}
517
gen = buf->i;
518
519
tok = pdf_lex(ctx, file, buf);
520
if (tok != PDF_TOK_OBJ)
521
{
522
if (try_repair)
523
*try_repair = 1;
524
fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'obj' keyword (%d %d ?)", num, gen);
525
}
526
527
tok = pdf_lex(ctx, file, buf);
528
529
switch (tok)
530
{
531
case PDF_TOK_OPEN_ARRAY:
532
obj = pdf_parse_array(ctx, doc, file, buf);
533
break;
534
535
case PDF_TOK_OPEN_DICT:
536
obj = pdf_parse_dict(ctx, doc, file, buf);
537
break;
538
539
case PDF_TOK_NAME: obj = pdf_new_name(ctx, doc, buf->scratch); break;
540
case PDF_TOK_REAL: obj = pdf_new_real(ctx, doc, buf->f); break;
541
case PDF_TOK_STRING: obj = pdf_new_string(ctx, doc, buf->scratch, buf->len); break;
542
case PDF_TOK_TRUE: obj = pdf_new_bool(ctx, doc, 1); break;
543
case PDF_TOK_FALSE: obj = pdf_new_bool(ctx, doc, 0); break;
544
case PDF_TOK_NULL: obj = pdf_new_null(ctx, doc); break;
545
546
case PDF_TOK_INT:
547
a = buf->i;
548
tok = pdf_lex(ctx, file, buf);
549
550
if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
551
{
552
obj = pdf_new_int(ctx, doc, a);
553
goto skip;
554
}
555
if (tok == PDF_TOK_INT)
556
{
557
b = buf->i;
558
tok = pdf_lex(ctx, file, buf);
559
if (tok == PDF_TOK_R)
560
{
561
obj = pdf_new_indirect(ctx, doc, a, b);
562
break;
563
}
564
}
565
fz_throw(ctx, FZ_ERROR_GENERIC, "expected 'R' keyword (%d %d R)", num, gen);
566
567
case PDF_TOK_ENDOBJ:
568
obj = pdf_new_null(ctx, doc);
569
goto skip;
570
571
default:
572
fz_throw(ctx, FZ_ERROR_GENERIC, "syntax error in object (%d %d R)", num, gen);
573
}
574
575
fz_try(ctx)
576
{
577
tok = pdf_lex(ctx, file, buf);
578
}
579
fz_catch(ctx)
580
{
581
pdf_drop_obj(ctx, obj);
582
fz_rethrow_message(ctx, "cannot parse indirect object (%d %d R)", num, gen);
583
}
584
585
skip:
586
if (tok == PDF_TOK_STREAM)
587
{
588
int c = fz_read_byte(ctx, file);
589
while (c == ' ')
590
c = fz_read_byte(ctx, file);
591
if (c == '\r')
592
{
593
c = fz_peek_byte(ctx, file);
594
if (c != '\n')
595
fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
596
else
597
fz_read_byte(ctx, file);
598
}
599
stm_ofs = fz_tell(ctx, file);
600
}
601
else if (tok == PDF_TOK_ENDOBJ)
602
{
603
stm_ofs = 0;
604
}
605
else
606
{
607
fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
608
stm_ofs = 0;
609
}
610
611
if (onum) *onum = num;
612
if (ogen) *ogen = gen;
613
if (ostmofs) *ostmofs = stm_ofs;
614
return obj;
615
}
616
617