#include "mupdf/pdf.h"
/*
 * Case-label lists.
 *
 * Each macro below expands to a character constant followed by a chain of
 * "case X" labels, deliberately WITHOUT the leading "case" keyword and
 * WITHOUT the trailing colon.  They are meant to be used as
 *
 *     switch (c) { case IS_WHITE: ... }
 *
 * so that a whole character class can be matched by a single switch arm.
 */

/* Characters that may start a number: sign, decimal point or decimal digit. */
#define IS_NUMBER \
'+':case'-':case'.':case'0':case'1':case'2':case'3':\
case'4':case'5':case'6':case'7':case'8':case'9'
/* White space (octal): NUL, HT (011), LF (012), FF (014), CR (015), SP (040). */
#define IS_WHITE \
'\000':case'\011':case'\012':case'\014':case'\015':case'\040'
/* Hexadecimal digits, both upper and lower case. */
#define IS_HEX \
'0':case'1':case'2':case'3':case'4':case'5':case'6':\
case'7':case'8':case'9':case'A':case'B':case'C':\
case'D':case'E':case'F':case'a':case'b':case'c':\
case'd':case'e':case'f'
/* Delimiter characters: brackets of every kind, '/' and '%'. */
#define IS_DELIM \
'(':case')':case'<':case'>':case'[':case']':case'{':\
case'}':case'/':case'%'
/* Decimal digits '0'..'9'. */
#define RANGE_0_9 \
'0':case'1':case'2':case'3':case'4':case'5':\
case'6':case'7':case'8':case'9'
/* Lower-case hexadecimal letters 'a'..'f'. */
#define RANGE_a_f \
'a':case'b':case'c':case'd':case'e':case'f'
/* Upper-case hexadecimal letters 'A'..'F'. */
#define RANGE_A_F \
'A':case'B':case'C':case'D':case'E':case'F'
/* Octal digits '0'..'7'. */
#define RANGE_0_7 \
'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'
/*
 * Return 1 if ch is one of the six white-space bytes recognised by the
 * lexer (NUL, HT, LF, FF, CR, SP), otherwise 0.
 */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case '\000':	/* NUL */
	case '\011':	/* horizontal tab */
	case '\012':	/* line feed */
	case '\014':	/* form feed */
	case '\015':	/* carriage return */
	case '\040':	/* space */
		return 1;
	default:
		return 0;
	}
}
/*
 * Convert one hexadecimal digit character to its value (0..15).
 * Any character that is not a hexadecimal digit yields 0.
 */
static inline int unhex(int ch)
{
	int value = 0;

	if ('0' <= ch && ch <= '9')
		value = ch - '0';
	else if ('A' <= ch && ch <= 'F')
		value = 10 + (ch - 'A');
	else if ('a' <= ch && ch <= 'f')
		value = 10 + (ch - 'a');

	return value;
}
/*
 * Consume a run of white-space bytes from the stream.
 *
 * Reading stops at the first byte that is not white space; that byte is
 * pushed back so the caller sees it next.  Nothing is pushed back at EOF.
 */
static void
lex_white(fz_context *ctx, fz_stream *f)
{
	int ch = fz_read_byte(ctx, f);

	/* The "<= 32" test is a cheap filter before the exact check. */
	while (ch <= 32 && iswhite(ch))
		ch = fz_read_byte(ctx, f);

	if (ch != EOF)
		fz_unread_byte(ctx, f);
}
/*
 * Discard the remainder of a comment: bytes are read and dropped up to
 * and including the first LF or CR, or until EOF.
 */
static void
lex_comment(fz_context *ctx, fz_stream *f)
{
	for (;;)
	{
		int ch = fz_read_byte(ctx, f);

		if (ch == '\012' || ch == '\015' || ch == EOF)
			return;
	}
}
/*
 * Lex a numeric token whose first character, c, has already been read.
 *
 * c is '+', '-', '.' or a decimal digit.  Digits are accumulated until a
 * non-digit is met (which is pushed back) or EOF is reached.
 *
 * Returns PDF_TOK_INT with the value in buf->i when no '.' was seen,
 * otherwise PDF_TOK_REAL with the value in buf->f.  A lone sign therefore
 * yields integer 0 and a lone '.' yields real 0.
 *
 * The integer part is accumulated in unsigned arithmetic so that an
 * over-long run of digits wraps around with defined behaviour instead of
 * triggering signed-overflow undefined behaviour; the wrapped value is the
 * same one the previous signed code produced on two's complement targets.
 */
static int
lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
{
	int neg = 0;
	unsigned int i = 0;	/* integer part, modulo UINT_MAX+1 */
	int whole;
	int n = 0;		/* fractional digits as an integer */
	int d = 1;		/* 10^(number of fractional digits kept) */
	float v;

	/* Initially we might have '+', '-', '.' or a digit. */
	switch (c)
	{
	case '.':
		goto loop_after_dot;
	case '-':
		neg = 1;
		break;
	case '+':
		break;
	default: /* must be a digit */
		i = (unsigned int)(c - '0');
		break;
	}

	/* Integer part. */
	while (1)
	{
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case '.':
			goto loop_after_dot;
		case RANGE_0_9:
			i = 10*i + (unsigned int)(c - '0');
			break;
		default:
			fz_unread_byte(ctx, f);
			/* fallthrough */
		case EOF:
			whole = (int)i;
			/* -INT_MIN is not representable; leave it as is. */
			if (neg && whole != INT_MIN)
				whole = -whole;
			buf->i = whole;
			return PDF_TOK_INT;
		}
	}

	/* Fractional part: collect digits while d can still grow safely. */
loop_after_dot:
	n = 0;
	d = 1;
	while (1)
	{
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case RANGE_0_9:
			/* Further digits would overflow d; drop this and later ones. */
			if (d >= INT_MAX/10)
				goto underflow;
			n = n*10 + (c - '0');
			d *= 10;
			break;
		default:
			fz_unread_byte(ctx, f);
			/* fallthrough */
		case EOF:
			goto emit_real;
		}
	}

	/* Too many fractional digits: skip the rest without using them. */
underflow:
	while (1)
	{
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case RANGE_0_9:
			break;
		default:
			fz_unread_byte(ctx, f);
			/* fallthrough */
		case EOF:
			goto emit_real;
		}
	}

emit_real:
	v = (float)(int)i + ((float)n / (float)d);
	if (neg)
		v = -v;
	buf->f = v;
	return PDF_TOK_REAL;
}
/*
 * Lex the characters of a name (or bare keyword) into buf->scratch.
 *
 * Characters are copied until white space or a delimiter is met (pushed
 * back), EOF is reached, or only one byte of room is left in the buffer.
 * "#hh" escapes are decoded to the byte they denote.  A '#' that is not
 * followed by a hex digit ends the name; a '#' followed by exactly one hex
 * digit stores that digit as the high nibble and ends the name.
 *
 * The result is NUL-terminated and its length stored in buf->len.
 */
static void
lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
	char *out = buf->scratch;
	int room = buf->size;

	/* Always keep one byte back for the terminator. */
	while (room > 1)
	{
		int c = fz_read_byte(ctx, f);

		switch (c)
		{
		case EOF:
			goto done;

		case IS_WHITE:
		case IS_DELIM:
			fz_unread_byte(ctx, f);
			goto done;

		case '#':
		{
			int hi;
			int lo;

			/* First (high) nibble. */
			c = fz_read_byte(ctx, f);
			if (c >= '0' && c <= '9')
				hi = c - '0';
			else if (c >= 'a' && c <= 'f')
				hi = c - 'a' + 10;
			else if (c >= 'A' && c <= 'F')
				hi = c - 'A' + 10;
			else
			{
				if (c != EOF)
					fz_unread_byte(ctx, f);
				goto done;
			}

			/* Second (low) nibble. */
			c = fz_read_byte(ctx, f);
			if (c >= '0' && c <= '9')
				lo = c - '0';
			else if (c >= 'a' && c <= 'f')
				lo = c - 'a' + 10;
			else if (c >= 'A' && c <= 'F')
				lo = c - 'A' + 10;
			else
			{
				if (c != EOF)
					fz_unread_byte(ctx, f);
				/* Only the high nibble was supplied. */
				*out++ = hi << 4;
				room--;
				goto done;
			}

			*out++ = (hi << 4) + lo;
			room--;
			break;
		}

		default:
			*out++ = c;
			room--;
			break;
		}
	}

done:
	*out = '\0';
	buf->len = out - buf->scratch;
}
/*
 * Lex a literal string; the opening '(' has already been consumed.
 *
 * Bytes are copied into lb->scratch (grown on demand) until the matching
 * unescaped ')' or EOF.  Nested, balanced parentheses are kept as part of
 * the string.  Backslash escapes are decoded: \n \r \t \b \f \( \) \\,
 * one to three octal digits, and backslash-newline (LF, CR or CR LF) as a
 * line continuation that produces no output.  Any other escaped character
 * is copied as is.
 *
 * The result is NOT NUL-terminated; its length is stored in lb->len.
 * Always returns PDF_TOK_STRING.
 */
static int
lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
	char *s = lb->scratch;
	char *e = s + lb->size;
	int bal = 1;	/* parenthesis nesting depth */
	int oct;
	int c;

	while (1)
	{
		if (s == e)
		{
			/*
			 * Growing may move the buffer.  Remember the write
			 * position as an offset and rebuild the cursor from the
			 * new base, rather than adjusting the old pointer by a
			 * difference taken between two separate allocations.
			 */
			ptrdiff_t used = s - lb->scratch;

			pdf_lexbuf_grow(ctx, lb);
			s = lb->scratch + used;
			e = lb->scratch + lb->size;
		}
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case EOF:
			goto end;
		case '(':
			bal++;
			*s++ = c;
			break;
		case ')':
			bal--;
			if (bal == 0)
				goto end;
			*s++ = c;
			break;
		case '\\':
			c = fz_read_byte(ctx, f);
			switch (c)
			{
			case EOF:
				goto end;
			case 'n':
				*s++ = '\n';
				break;
			case 'r':
				*s++ = '\r';
				break;
			case 't':
				*s++ = '\t';
				break;
			case 'b':
				*s++ = '\b';
				break;
			case 'f':
				*s++ = '\f';
				break;
			case '(':
				*s++ = '(';
				break;
			case ')':
				*s++ = ')';
				break;
			case '\\':
				*s++ = '\\';
				break;
			case RANGE_0_7:
				/* Up to three octal digits form one byte. */
				oct = c - '0';
				c = fz_read_byte(ctx, f);
				if (c >= '0' && c <= '7')
				{
					oct = oct * 8 + (c - '0');
					c = fz_read_byte(ctx, f);
					if (c >= '0' && c <= '7')
						oct = oct * 8 + (c - '0');
					else if (c != EOF)
						fz_unread_byte(ctx, f);
				}
				else if (c != EOF)
					fz_unread_byte(ctx, f);
				*s++ = oct;
				break;
			case '\n':
				/* Line continuation: emit nothing. */
				break;
			case '\r':
				/* Line continuation; swallow an LF that follows the CR. */
				c = fz_read_byte(ctx, f);
				if ((c != '\n') && (c != EOF))
					fz_unread_byte(ctx, f);
				break;
			default:
				/* Unknown escape: keep the character, drop the backslash. */
				*s++ = c;
				break;
			}
			break;
		default:
			*s++ = c;
			break;
		}
	}
end:
	lb->len = s - lb->scratch;
	return PDF_TOK_STRING;
}
/*
 * Lex a hexadecimal string; the opening '<' has already been consumed.
 *
 * Pairs of hex digits are decoded into bytes in lb->scratch (grown on
 * demand) until '>' or EOF.  White space between digits is skipped and any
 * other character is ignored with a warning.  If the closing '>' is reached
 * with an odd number of digits, the missing final digit is taken to be 0,
 * as the PDF specification requires.
 *
 * The result is NOT NUL-terminated; its length is stored in lb->len.
 * Always returns PDF_TOK_STRING.
 */
static int
lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
	char *s = lb->scratch;
	char *e = s + lb->size;
	int a = 0;	/* pending high nibble */
	int x = 0;	/* non-zero while a high nibble is pending */
	int c;

	while (1)
	{
		/* Guarantees room for the (at most one) byte written below. */
		if (s == e)
		{
			/*
			 * Growing may move the buffer.  Remember the write
			 * position as an offset and rebuild the cursor from the
			 * new base, rather than adjusting the old pointer by a
			 * difference taken between two separate allocations.
			 */
			ptrdiff_t used = s - lb->scratch;

			pdf_lexbuf_grow(ctx, lb);
			s = lb->scratch + used;
			e = lb->scratch + lb->size;
		}
		c = fz_read_byte(ctx, f);
		switch (c)
		{
		case IS_WHITE:
			break;
		case IS_HEX:
			if (x)
				*s++ = a * 16 + unhex(c);
			else
				a = unhex(c);
			x = !x;
			break;
		case '>':
			/* Odd digit count: pad the last byte with a zero nibble. */
			if (x)
				*s++ = a * 16;
			goto end;
		case EOF:
			goto end;
		default:
			fz_warn(ctx, "ignoring invalid character in hex string");
			break;
		}
	}
end:
	lb->len = s - lb->scratch;
	return PDF_TOK_STRING;
}
/*
 * Map a NUL-terminated keyword to its token.
 *
 * Returns the dedicated token for the reserved words listed below
 * (case-sensitive, exact match) and PDF_TOK_KEYWORD for anything else.
 */
static pdf_token
pdf_token_from_keyword(char *key)
{
	static const struct
	{
		const char *word;
		pdf_token tok;
	} reserved[] = {
		{ "R", PDF_TOK_R },
		{ "true", PDF_TOK_TRUE },
		{ "trailer", PDF_TOK_TRAILER },
		{ "false", PDF_TOK_FALSE },
		{ "null", PDF_TOK_NULL },
		{ "obj", PDF_TOK_OBJ },
		{ "endobj", PDF_TOK_ENDOBJ },
		{ "endstream", PDF_TOK_ENDSTREAM },
		{ "stream", PDF_TOK_STREAM },
		{ "startxref", PDF_TOK_STARTXREF },
		{ "xref", PDF_TOK_XREF },
	};
	size_t k;

	for (k = 0; k < sizeof reserved / sizeof reserved[0]; k++)
	{
		/* Compare the first character before paying for strcmp. */
		if (reserved[k].word[0] == *key && strcmp(key, reserved[k].word) == 0)
			return reserved[k].tok;
	}

	return PDF_TOK_KEYWORD;
}
/*
 * Prepare a lexer buffer for use.
 *
 * size is recorded both as the current size and as the base size, the
 * scratch pointer is aimed at the start of the embedded buffer and the
 * stored length is reset to zero.
 */
void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
{
	lb->base_size = size;
	lb->size = size;
	lb->scratch = lb->buffer;
	lb->len = 0;
}
/*
 * Release whatever a lexer buffer owns.
 *
 * The scratch area is freed only when the buffer's size differs from its
 * base size, i.e. after it has been grown.  A NULL lb is accepted and
 * ignored.
 */
void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb)
{
	if (!lb)
		return;

	/* Same size as at init: nothing to free. */
	if (lb->size == lb->base_size)
		return;

	fz_free(ctx, lb->scratch);
}
/*
 * Double the capacity of a lexer buffer.
 *
 * While the size still equals the base size, a fresh block is allocated
 * and the contents of the embedded buffer are copied into it; after that
 * the existing block is resized.
 *
 * Returns the distance between the new and the old scratch address, which
 * callers add to pointers they hold into the old scratch area.
 */
ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb)
{
	char *before = lb->scratch;
	int doubled = lb->size * 2;
	int already_grown = (lb->size != lb->base_size);

	if (already_grown)
	{
		lb->scratch = fz_resize_array(ctx, lb->scratch, doubled, 1);
	}
	else
	{
		/* First growth: move off the embedded buffer. */
		lb->scratch = fz_malloc(ctx, doubled);
		memcpy(lb->scratch, lb->buffer, lb->size);
	}

	lb->size = doubled;
	return lb->scratch - before;
}
/*
 * Read the next token from the stream.
 *
 * White space and comments are skipped.  Names, strings and hex strings
 * are delivered in buf->scratch, integers in buf->i and reals in buf->f.
 * Anything that is not white space, a delimiter or the start of a number
 * is lexed as a keyword and mapped through pdf_token_from_keyword().
 *
 * A stray ')' or a single '>' is reported with a warning and skipped.
 * Returns PDF_TOK_EOF at end of input.
 */
pdf_token
pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
	while (1)
	{
		int c = fz_read_byte(ctx, f);
		switch (c)
		{
		case EOF:
			return PDF_TOK_EOF;
		case IS_WHITE:
			lex_white(ctx, f);
			break;
		case '%':
			lex_comment(ctx, f);
			break;
		case '/':
			lex_name(ctx, f, buf);
			return PDF_TOK_NAME;
		case '(':
			return lex_string(ctx, f, buf);
		case ')':
			fz_warn(ctx, "lexical error (unexpected ')')");
			continue;
		case '<':
			c = fz_read_byte(ctx, f);
			if (c == '<')
				return PDF_TOK_OPEN_DICT;
			/*
			 * Only push back a byte that was actually read; at EOF
			 * there is nothing to un-read, matching every other
			 * push-back in this lexer.
			 */
			if (c != EOF)
				fz_unread_byte(ctx, f);
			return lex_hex_string(ctx, f, buf);
		case '>':
			c = fz_read_byte(ctx, f);
			if (c == '>')
				return PDF_TOK_CLOSE_DICT;
			fz_warn(ctx, "lexical error (unexpected '>')");
			if (c == EOF)
				return PDF_TOK_EOF;
			fz_unread_byte(ctx, f);
			continue;
		case '[':
			return PDF_TOK_OPEN_ARRAY;
		case ']':
			return PDF_TOK_CLOSE_ARRAY;
		case '{':
			return PDF_TOK_OPEN_BRACE;
		case '}':
			return PDF_TOK_CLOSE_BRACE;
		case IS_NUMBER:
			return lex_number(ctx, f, buf, c);
		default: /* isregular: !isdelim && !iswhite && c != EOF */
			fz_unread_byte(ctx, f);
			lex_name(ctx, f, buf);
			return pdf_token_from_keyword(buf->scratch);
		}
	}
}
/*
 * Read the next token from the stream without ever lexing a string.
 *
 * Works like pdf_lex() except that '(' and ')' are silently skipped, and a
 * '<' that does not begin "<<" is discarded together with the byte that
 * follows it.  A single '>' is skipped without a warning.
 * Returns PDF_TOK_EOF at end of input.
 */
pdf_token
pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
{
	for (;;)
	{
		int ch = fz_read_byte(ctx, f);

		switch (ch)
		{
		case EOF:
			return PDF_TOK_EOF;

		case IS_WHITE:
			lex_white(ctx, f);
			continue;

		case '%':
			lex_comment(ctx, f);
			continue;

		case '/':
			lex_name(ctx, f, buf);
			return PDF_TOK_NAME;

		case '(':
		case ')':
			/* String delimiters are ignored in this mode. */
			continue;

		case '<':
			/* "<<" opens a dictionary; otherwise both bytes are dropped. */
			if (fz_read_byte(ctx, f) == '<')
				return PDF_TOK_OPEN_DICT;
			continue;

		case '>':
			switch (fz_read_byte(ctx, f))
			{
			case '>':
				return PDF_TOK_CLOSE_DICT;
			case EOF:
				return PDF_TOK_EOF;
			default:
				/* Lone '>': skip it and re-lex the byte after it. */
				fz_unread_byte(ctx, f);
				continue;
			}

		case '[':
			return PDF_TOK_OPEN_ARRAY;
		case ']':
			return PDF_TOK_CLOSE_ARRAY;
		case '{':
			return PDF_TOK_OPEN_BRACE;
		case '}':
			return PDF_TOK_CLOSE_BRACE;

		case IS_NUMBER:
			return lex_number(ctx, f, buf, ch);

		default:
			/* Anything else starts a keyword. */
			fz_unread_byte(ctx, f);
			lex_name(ctx, f, buf);
			return pdf_token_from_keyword(buf->scratch);
		}
	}
}
/*
 * Append the textual form of a token to fzbuf.
 *
 * tok selects the format; buf supplies the payload (scratch text for names,
 * strings and keywords, buf->i for integers, buf->f for reals).  For a
 * string token the scratch buffer is NUL-terminated in place first, growing
 * it if the string fills it completely.  Any token without a dedicated
 * format is printed as the text currently held in buf->scratch.
 */
void pdf_print_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
{
	/* Tokens that carry a payload. */
	if (tok == PDF_TOK_NAME)
	{
		fz_buffer_printf(ctx, fzbuf, "/%s", buf->scratch);
		return;
	}
	if (tok == PDF_TOK_STRING)
	{
		/* Make room for, then write, the terminating NUL. */
		if (buf->len >= buf->size)
			pdf_lexbuf_grow(ctx, buf);
		buf->scratch[buf->len] = 0;
		fz_buffer_cat_pdf_string(ctx, fzbuf, buf->scratch);
		return;
	}
	if (tok == PDF_TOK_INT)
	{
		fz_buffer_printf(ctx, fzbuf, "%d", buf->i);
		return;
	}
	if (tok == PDF_TOK_REAL)
	{
		fz_buffer_printf(ctx, fzbuf, "%g", buf->f);
		return;
	}

	/* Fixed punctuation tokens. */
	if (tok == PDF_TOK_OPEN_DICT)
	{
		fz_buffer_printf(ctx, fzbuf, "<<");
		return;
	}
	if (tok == PDF_TOK_CLOSE_DICT)
	{
		fz_buffer_printf(ctx, fzbuf, ">>");
		return;
	}
	if (tok == PDF_TOK_OPEN_ARRAY)
	{
		fz_buffer_printf(ctx, fzbuf, "[");
		return;
	}
	if (tok == PDF_TOK_CLOSE_ARRAY)
	{
		fz_buffer_printf(ctx, fzbuf, "]");
		return;
	}
	if (tok == PDF_TOK_OPEN_BRACE)
	{
		fz_buffer_printf(ctx, fzbuf, "{");
		return;
	}
	if (tok == PDF_TOK_CLOSE_BRACE)
	{
		fz_buffer_printf(ctx, fzbuf, "}");
		return;
	}

	/* Everything else: emit the scratch text verbatim. */
	fz_buffer_printf(ctx, fzbuf, "%s", buf->scratch);
}