Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
7639 views
1
#include "mupdf/pdf.h"
2
3
/*
 * Character-class helpers for use as switch labels.
 *
 * Each macro expands to a list of character constants joined by ":case",
 * so writing "case IS_WHITE:" yields one case label per member of the
 * class. They must only be used directly after the "case" keyword.
 */

/* Sign, decimal point or decimal digit. */
#define IS_NUMBER \
	'+':case'-':case'.':case'0':case'1':case'2':case'3':\
	case'4':case'5':case'6':case'7':case'8':case'9'
/* NUL, tab, line feed, form feed, carriage return, space. */
#define IS_WHITE \
	'\000':case'\011':case'\012':case'\014':case'\015':case'\040'
/* Hexadecimal digit, either case. */
#define IS_HEX \
	'0':case'1':case'2':case'3':case'4':case'5':case'6':\
	case'7':case'8':case'9':case'A':case'B':case'C':\
	case'D':case'E':case'F':case'a':case'b':case'c':\
	case'd':case'e':case'f'
/* Delimiter characters: brackets of every kind, '/' and '%'. */
#define IS_DELIM \
	'(':case')':case'<':case'>':case'[':case']':case'{':\
	case'}':case'/':case'%'

/* Contiguous character ranges, spelled out one label at a time. */
#define RANGE_0_9 \
	'0':case'1':case'2':case'3':case'4':case'5':\
	case'6':case'7':case'8':case'9'
#define RANGE_a_f \
	'a':case'b':case'c':case'd':case'e':case'f'
#define RANGE_A_F \
	'A':case'B':case'C':case'D':case'E':case'F'
#define RANGE_0_7 \
	'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'
26
27
/*
 * Return non-zero if ch is one of the six white-space bytes recognised by
 * the lexer: NUL, tab, line feed, form feed, carriage return or space.
 * Any other value, including EOF, yields 0.
 */
static inline int iswhite(int ch)
{
	return
		ch == '\000' ||
		ch == '\011' ||
		ch == '\012' ||
		ch == '\014' ||
		ch == '\015' ||
		ch == '\040';
}
37
38
/*
 * Convert a hexadecimal digit character (either case) to its value 0..15.
 * Any character that is not a hex digit maps to 0.
 */
static inline int unhex(int ch)
{
	if (ch >= '0' && ch <= '9') return ch - '0';
	if (ch >= 'A' && ch <= 'F') return ch - 'A' + 0xA;
	if (ch >= 'a' && ch <= 'f') return ch - 'a' + 0xA;
	return 0;
}
45
46
static void
47
lex_white(fz_context *ctx, fz_stream *f)
48
{
49
int c;
50
do {
51
c = fz_read_byte(ctx, f);
52
} while ((c <= 32) && (iswhite(c)));
53
if (c != EOF)
54
fz_unread_byte(ctx, f);
55
}
56
57
static void
58
lex_comment(fz_context *ctx, fz_stream *f)
59
{
60
int c;
61
do {
62
c = fz_read_byte(ctx, f);
63
} while ((c != '\012') && (c != '\015') && (c != EOF));
64
}
65
66
static int
67
lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
68
{
69
int neg = 0;
70
int i = 0;
71
int n;
72
int d;
73
float v;
74
75
/* Initially we might have +, -, . or a digit */
76
switch (c)
77
{
78
case '.':
79
goto loop_after_dot;
80
case '-':
81
neg = 1;
82
break;
83
case '+':
84
break;
85
default: /* Must be a digit */
86
i = c - '0';
87
break;
88
}
89
90
while (1)
91
{
92
c = fz_read_byte(ctx, f);
93
switch (c)
94
{
95
case '.':
96
goto loop_after_dot;
97
case RANGE_0_9:
98
i = 10*i + c - '0';
99
/* FIXME: Need overflow check here; do we care? */
100
break;
101
default:
102
fz_unread_byte(ctx, f);
103
/* Fallthrough */
104
case EOF:
105
if (neg)
106
i = -i;
107
buf->i = i;
108
return PDF_TOK_INT;
109
}
110
}
111
112
/* In here, we've seen a dot, so can accept just digits */
113
loop_after_dot:
114
n = 0;
115
d = 1;
116
while (1)
117
{
118
c = fz_read_byte(ctx, f);
119
switch (c)
120
{
121
case RANGE_0_9:
122
if (d >= INT_MAX/10)
123
goto underflow;
124
n = n*10 + (c - '0');
125
d *= 10;
126
break;
127
default:
128
fz_unread_byte(ctx, f);
129
/* Fallthrough */
130
case EOF:
131
v = (float)i + ((float)n / (float)d);
132
if (neg)
133
v = -v;
134
buf->f = v;
135
return PDF_TOK_REAL;
136
}
137
}
138
139
underflow:
140
/* Ignore any digits after here, because they are too small */
141
while (1)
142
{
143
c = fz_read_byte(ctx, f);
144
switch (c)
145
{
146
case RANGE_0_9:
147
break;
148
default:
149
fz_unread_byte(ctx, f);
150
/* Fallthrough */
151
case EOF:
152
v = (float)i + ((float)n / (float)d);
153
if (neg)
154
v = -v;
155
buf->f = v;
156
return PDF_TOK_REAL;
157
}
158
}
159
}
160
161
static void
162
lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
163
{
164
char *s = buf->scratch;
165
int n = buf->size;
166
167
while (n > 1)
168
{
169
int c = fz_read_byte(ctx, f);
170
switch (c)
171
{
172
case IS_WHITE:
173
case IS_DELIM:
174
fz_unread_byte(ctx, f);
175
goto end;
176
case EOF:
177
goto end;
178
case '#':
179
{
180
int d;
181
c = fz_read_byte(ctx, f);
182
switch (c)
183
{
184
case RANGE_0_9:
185
d = (c - '0') << 4;
186
break;
187
case RANGE_a_f:
188
d = (c - 'a' + 10) << 4;
189
break;
190
case RANGE_A_F:
191
d = (c - 'A' + 10) << 4;
192
break;
193
default:
194
fz_unread_byte(ctx, f);
195
/* fallthrough */
196
case EOF:
197
goto end;
198
}
199
c = fz_read_byte(ctx, f);
200
switch (c)
201
{
202
case RANGE_0_9:
203
c -= '0';
204
break;
205
case RANGE_a_f:
206
c -= 'a' - 10;
207
break;
208
case RANGE_A_F:
209
c -= 'A' - 10;
210
break;
211
default:
212
fz_unread_byte(ctx, f);
213
/* fallthrough */
214
case EOF:
215
*s++ = d;
216
n--;
217
goto end;
218
}
219
*s++ = d + c;
220
n--;
221
break;
222
}
223
default:
224
*s++ = c;
225
n--;
226
break;
227
}
228
}
229
end:
230
*s = '\0';
231
buf->len = s - buf->scratch;
232
}
233
234
static int
235
lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
236
{
237
char *s = lb->scratch;
238
char *e = s + lb->size;
239
int bal = 1;
240
int oct;
241
int c;
242
243
while (1)
244
{
245
if (s == e)
246
{
247
s += pdf_lexbuf_grow(ctx, lb);
248
e = lb->scratch + lb->size;
249
}
250
c = fz_read_byte(ctx, f);
251
switch (c)
252
{
253
case EOF:
254
goto end;
255
case '(':
256
bal++;
257
*s++ = c;
258
break;
259
case ')':
260
bal --;
261
if (bal == 0)
262
goto end;
263
*s++ = c;
264
break;
265
case '\\':
266
c = fz_read_byte(ctx, f);
267
switch (c)
268
{
269
case EOF:
270
goto end;
271
case 'n':
272
*s++ = '\n';
273
break;
274
case 'r':
275
*s++ = '\r';
276
break;
277
case 't':
278
*s++ = '\t';
279
break;
280
case 'b':
281
*s++ = '\b';
282
break;
283
case 'f':
284
*s++ = '\f';
285
break;
286
case '(':
287
*s++ = '(';
288
break;
289
case ')':
290
*s++ = ')';
291
break;
292
case '\\':
293
*s++ = '\\';
294
break;
295
case RANGE_0_7:
296
oct = c - '0';
297
c = fz_read_byte(ctx, f);
298
if (c >= '0' && c <= '7')
299
{
300
oct = oct * 8 + (c - '0');
301
c = fz_read_byte(ctx, f);
302
if (c >= '0' && c <= '7')
303
oct = oct * 8 + (c - '0');
304
else if (c != EOF)
305
fz_unread_byte(ctx, f);
306
}
307
else if (c != EOF)
308
fz_unread_byte(ctx, f);
309
*s++ = oct;
310
break;
311
case '\n':
312
break;
313
case '\r':
314
c = fz_read_byte(ctx, f);
315
if ((c != '\n') && (c != EOF))
316
fz_unread_byte(ctx, f);
317
break;
318
default:
319
*s++ = c;
320
}
321
break;
322
default:
323
*s++ = c;
324
break;
325
}
326
}
327
end:
328
lb->len = s - lb->scratch;
329
return PDF_TOK_STRING;
330
}
331
332
static int
333
lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
334
{
335
char *s = lb->scratch;
336
char *e = s + lb->size;
337
int a = 0, x = 0;
338
int c;
339
340
while (1)
341
{
342
if (s == e)
343
{
344
s += pdf_lexbuf_grow(ctx, lb);
345
e = lb->scratch + lb->size;
346
}
347
c = fz_read_byte(ctx, f);
348
switch (c)
349
{
350
case IS_WHITE:
351
break;
352
case IS_HEX:
353
if (x)
354
{
355
*s++ = a * 16 + unhex(c);
356
x = !x;
357
}
358
else
359
{
360
a = unhex(c);
361
x = !x;
362
}
363
break;
364
case '>':
365
case EOF:
366
goto end;
367
default:
368
fz_warn(ctx, "ignoring invalid character in hex string");
369
}
370
}
371
end:
372
lb->len = s - lb->scratch;
373
return PDF_TOK_STRING;
374
}
375
376
static pdf_token
377
pdf_token_from_keyword(char *key)
378
{
379
switch (*key)
380
{
381
case 'R':
382
if (!strcmp(key, "R")) return PDF_TOK_R;
383
break;
384
case 't':
385
if (!strcmp(key, "true")) return PDF_TOK_TRUE;
386
if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER;
387
break;
388
case 'f':
389
if (!strcmp(key, "false")) return PDF_TOK_FALSE;
390
break;
391
case 'n':
392
if (!strcmp(key, "null")) return PDF_TOK_NULL;
393
break;
394
case 'o':
395
if (!strcmp(key, "obj")) return PDF_TOK_OBJ;
396
break;
397
case 'e':
398
if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ;
399
if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM;
400
break;
401
case 's':
402
if (!strcmp(key, "stream")) return PDF_TOK_STREAM;
403
if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF;
404
break;
405
case 'x':
406
if (!strcmp(key, "xref")) return PDF_TOK_XREF;
407
break;
408
default:
409
break;
410
}
411
412
return PDF_TOK_KEYWORD;
413
}
414
415
/*
 * Initialise a lex buffer so that it uses its own embedded storage.
 *
 * size is recorded both as the current size and as the base size; the
 * scratch pointer is aimed at lb->buffer and the length is reset to 0.
 * ctx is not used.
 */
void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
{
	lb->size = lb->base_size = size;
	lb->len = 0;
	lb->scratch = &lb->buffer[0];
}
421
422
/*
 * Release any storage owned by a lex buffer. Accepts a NULL lb.
 *
 * The scratch area is freed only when the size differs from the base
 * size, i.e. when pdf_lexbuf_grow has replaced the embedded buffer with
 * an allocated one.
 */
void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb)
{
	if (lb && lb->size != lb->base_size)
		fz_free(ctx, lb->scratch);
}
427
428
/*
 * Double the capacity of a lex buffer, preserving its contents.
 *
 * On the first growth (size still equals base_size) a new block is
 * allocated and the embedded buffer is copied into it; afterwards the
 * existing allocation is resized.
 *
 * Returns the offset from the old scratch pointer to the new one, so
 * callers can rebase pointers into the buffer with "p += result".
 */
ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb)
{
	char *old = lb->scratch;
	int newsize = lb->size * 2;
	if (lb->size == lb->base_size)
	{
		/* Still on the embedded buffer: move to a separate allocation. */
		lb->scratch = fz_malloc(ctx, newsize);
		memcpy(lb->scratch, lb->buffer, lb->size);
	}
	else
	{
		lb->scratch = fz_resize_array(ctx, lb->scratch, newsize, 1);
	}
	lb->size = newsize;
	return lb->scratch - old;
}
444
445
pdf_token
446
pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
447
{
448
while (1)
449
{
450
int c = fz_read_byte(ctx, f);
451
switch (c)
452
{
453
case EOF:
454
return PDF_TOK_EOF;
455
case IS_WHITE:
456
lex_white(ctx, f);
457
break;
458
case '%':
459
lex_comment(ctx, f);
460
break;
461
case '/':
462
lex_name(ctx, f, buf);
463
return PDF_TOK_NAME;
464
case '(':
465
return lex_string(ctx, f, buf);
466
case ')':
467
fz_warn(ctx, "lexical error (unexpected ')')");
468
continue;
469
case '<':
470
c = fz_read_byte(ctx, f);
471
if (c == '<')
472
{
473
return PDF_TOK_OPEN_DICT;
474
}
475
else
476
{
477
fz_unread_byte(ctx, f);
478
return lex_hex_string(ctx, f, buf);
479
}
480
case '>':
481
c = fz_read_byte(ctx, f);
482
if (c == '>')
483
{
484
return PDF_TOK_CLOSE_DICT;
485
}
486
fz_warn(ctx, "lexical error (unexpected '>')");
487
if (c == EOF)
488
{
489
return PDF_TOK_EOF;
490
}
491
fz_unread_byte(ctx, f);
492
continue;
493
case '[':
494
return PDF_TOK_OPEN_ARRAY;
495
case ']':
496
return PDF_TOK_CLOSE_ARRAY;
497
case '{':
498
return PDF_TOK_OPEN_BRACE;
499
case '}':
500
return PDF_TOK_CLOSE_BRACE;
501
case IS_NUMBER:
502
return lex_number(ctx, f, buf, c);
503
default: /* isregular: !isdelim && !iswhite && c != EOF */
504
fz_unread_byte(ctx, f);
505
lex_name(ctx, f, buf);
506
return pdf_token_from_keyword(buf->scratch);
507
}
508
}
509
}
510
511
pdf_token
512
pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
513
{
514
while (1)
515
{
516
int c = fz_read_byte(ctx, f);
517
switch (c)
518
{
519
case EOF:
520
return PDF_TOK_EOF;
521
case IS_WHITE:
522
lex_white(ctx, f);
523
break;
524
case '%':
525
lex_comment(ctx, f);
526
break;
527
case '/':
528
lex_name(ctx, f, buf);
529
return PDF_TOK_NAME;
530
case '(':
531
continue;
532
case ')':
533
continue;
534
case '<':
535
c = fz_read_byte(ctx, f);
536
if (c == '<')
537
{
538
return PDF_TOK_OPEN_DICT;
539
}
540
else
541
{
542
continue;
543
}
544
case '>':
545
c = fz_read_byte(ctx, f);
546
if (c == '>')
547
{
548
return PDF_TOK_CLOSE_DICT;
549
}
550
if (c == EOF)
551
{
552
return PDF_TOK_EOF;
553
}
554
fz_unread_byte(ctx, f);
555
continue;
556
case '[':
557
return PDF_TOK_OPEN_ARRAY;
558
case ']':
559
return PDF_TOK_CLOSE_ARRAY;
560
case '{':
561
return PDF_TOK_OPEN_BRACE;
562
case '}':
563
return PDF_TOK_CLOSE_BRACE;
564
case IS_NUMBER:
565
return lex_number(ctx, f, buf, c);
566
default: /* isregular: !isdelim && !iswhite && c != EOF */
567
fz_unread_byte(ctx, f);
568
lex_name(ctx, f, buf);
569
return pdf_token_from_keyword(buf->scratch);
570
}
571
}
572
}
573
574
/*
 * Append the textual form of a token to fzbuf.
 *
 * tok selects the format; buf supplies the payload (scratch text for
 * names, strings and keywords, buf->i for integers, buf->f for reals).
 * For strings the scratch buffer is NUL-terminated in place first,
 * growing it if there is no room for the terminator. Any token without
 * a dedicated case is printed as the text in buf->scratch.
 */
void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
{
	switch (tok)
	{
	case PDF_TOK_NAME:
		fz_buffer_printf(ctx, fzbuf, "/%s", buf->scratch);
		break;
	case PDF_TOK_STRING:
		/* String tokens are stored without a terminator; add one. */
		if (buf->len >= buf->size)
			pdf_lexbuf_grow(ctx, buf);
		buf->scratch[buf->len] = 0;
		fz_buffer_cat_pdf_string(ctx, fzbuf, buf->scratch);
		break;
	case PDF_TOK_OPEN_DICT:
		fz_buffer_printf(ctx, fzbuf, "<<");
		break;
	case PDF_TOK_CLOSE_DICT:
		fz_buffer_printf(ctx, fzbuf, ">>");
		break;
	case PDF_TOK_OPEN_ARRAY:
		fz_buffer_printf(ctx, fzbuf, "[");
		break;
	case PDF_TOK_CLOSE_ARRAY:
		fz_buffer_printf(ctx, fzbuf, "]");
		break;
	case PDF_TOK_OPEN_BRACE:
		fz_buffer_printf(ctx, fzbuf, "{");
		break;
	case PDF_TOK_CLOSE_BRACE:
		fz_buffer_printf(ctx, fzbuf, "}");
		break;
	case PDF_TOK_INT:
		fz_buffer_printf(ctx, fzbuf, "%d", buf->i);
		break;
	case PDF_TOK_REAL:
		{
			fz_buffer_printf(ctx, fzbuf, "%g", buf->f);
		}
		break;
	default:
		fz_buffer_printf(ctx, fzbuf, "%s", buf->scratch);
		break;
	}
}
618
619