Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
7639 views
1
/*
 * pdfextract -- the ultimate way to extract images and fonts from pdfs
 */
4
5
#include "mupdf/pdf.h"
6
7
/* Document being processed; assigned in pdfextract_main and read by the helpers in this file. */
static pdf_document *doc = NULL;
8
/* Context shared by the helpers in this file; created in pdfextract_main. */
static fz_context *ctx = NULL;
9
/* Incremented once per -r option; passed to writepixmap as its rgb flag. */
static int dorgb = 0;
10
11
/* Print the command-line synopsis to stderr and terminate with exit status 1. */
static void usage(void)
{
	fputs(
		"usage: mutool extract [options] file.pdf [object numbers]\n"
		"\t-p\tpassword\n"
		"\t-r\tconvert images to rgb\n",
		stderr);
	exit(1);
}
18
19
/*
 * Test whether the Subtype entry of dictionary obj is the name Image.
 * Returns the result of pdf_name_eq for that comparison.
 */
static int isimage(pdf_obj *obj)
{
	return pdf_name_eq(ctx, pdf_dict_get(ctx, obj, PDF_NAME_Subtype), PDF_NAME_Image);
}
24
25
/*
 * Test whether the Type entry of dictionary obj is the name FontDescriptor.
 * Returns the result of pdf_name_eq for that comparison.
 */
static int isfontdesc(pdf_obj *obj)
{
	return pdf_name_eq(ctx, pdf_dict_get(ctx, obj, PDF_NAME_Type), PDF_NAME_FontDescriptor);
}
30
31
/*
 * Write a pixmap to "<file>.png" or "<file>.pam".
 *
 * ctx:  context used for allocation, conversion and error reporting.
 * pix:  pixmap to write; NULL is accepted and nothing is written.
 * file: output path without extension.
 * rgb:  when non-zero and pix has a colorspace other than device RGB,
 *       a device-RGB copy of pix is written instead of pix itself.
 *
 * A pixmap whose n field is at most 4 is written as PNG, anything else
 * as PAM. The temporary RGB copy is released on every path, including
 * when conversion or writing throws; the exception is then rethrown to
 * the caller.
 */
static void writepixmap(fz_context *ctx, fz_pixmap *pix, char *file, int rgb)
{
	char buf[1024];
	fz_pixmap *converted = NULL;

	if (!pix)
		return;

	/* converted is assigned inside fz_try and read in fz_always. */
	fz_var(converted);

	fz_try(ctx)
	{
		/* Pixmap that actually gets written: pix itself or its RGB copy. */
		fz_pixmap *out = pix;

		if (rgb && pix->colorspace && pix->colorspace != fz_device_rgb(ctx))
		{
			fz_irect bbox;
			converted = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), fz_pixmap_bbox(ctx, pix, &bbox));
			fz_convert_pixmap(ctx, converted, pix);
			out = converted;
		}

		if (out->n <= 4)
		{
			snprintf(buf, sizeof(buf), "%s.png", file);
			printf("extracting image %s\n", buf);
			fz_write_png(ctx, out, buf, 0);
		}
		else
		{
			snprintf(buf, sizeof(buf), "%s.pam", file);
			printf("extracting image %s\n", buf);
			fz_write_pam(ctx, out, buf, 0);
		}
	}
	fz_always(ctx)
	{
		/* No-op when no conversion took place (converted is still NULL). */
		fz_drop_pixmap(ctx, converted);
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}
}
62
63
static void saveimage(int num)
64
{
65
fz_image *image;
66
fz_pixmap *pix;
67
pdf_obj *ref;
68
char buf[32];
69
70
ref = pdf_new_indirect(ctx, doc, num, 0);
71
72
/* TODO: detect DCTD and save as jpeg */
73
74
image = pdf_load_image(ctx, doc, ref);
75
pix = fz_new_pixmap_from_image(ctx, image, 0, 0);
76
fz_drop_image(ctx, image);
77
78
snprintf(buf, sizeof(buf), "img-%04d", num);
79
writepixmap(ctx, pix, buf, dorgb);
80
81
fz_drop_pixmap(ctx, pix);
82
pdf_drop_obj(ctx, ref);
83
}
84
85
/*
 * Write the embedded font program referenced by a font descriptor to
 * "<FontName>-NNNN.<ext>" in the current directory.
 *
 * dict: font descriptor dictionary.
 * num:  object number, used for the NNNN part of the file name.
 *
 * The extension is chosen from the descriptor entry that holds the stream:
 *   FontFile  -> "pfa"
 *   FontFile2 -> "ttf"
 *   FontFile3 -> "cff", "cid" or "otf" depending on the stream's Subtype
 * When several entries are present the last one in that order wins.
 * If none is present a warning is issued and nothing is written.
 *
 * The font name comes from the PDF, so '/', '\\' and ':' are replaced by
 * '_' before it is used in the output path; otherwise a crafted name could
 * make the file land outside the current directory.
 *
 * Throws on an invalid or unhandled FontFile3 subtype, and when the output
 * file cannot be created, written or closed. The stream buffer and the
 * output file are released on every path.
 */
static void savefont(pdf_obj *dict, int num)
{
	char namebuf[1024];
	char safename[1024];
	fz_buffer *buf = NULL;
	pdf_obj *stream = NULL;
	pdf_obj *obj;
	char *ext = "";
	FILE *f = NULL;
	char *fontname = "font";
	size_t i;

	obj = pdf_dict_get(ctx, dict, PDF_NAME_FontName);
	if (obj)
		fontname = pdf_to_name(ctx, obj);

	/* Copy the name, neutralising characters that act as path separators. */
	for (i = 0; fontname[i] != '\0' && i + 1 < sizeof(safename); i++)
	{
		char c = fontname[i];
		safename[i] = (c == '/' || c == '\\' || c == ':') ? '_' : c;
	}
	safename[i] = '\0';

	obj = pdf_dict_get(ctx, dict, PDF_NAME_FontFile);
	if (obj)
	{
		stream = obj;
		ext = "pfa";
	}

	obj = pdf_dict_get(ctx, dict, PDF_NAME_FontFile2);
	if (obj)
	{
		stream = obj;
		ext = "ttf";
	}

	obj = pdf_dict_get(ctx, dict, PDF_NAME_FontFile3);
	if (obj)
	{
		stream = obj;

		obj = pdf_dict_get(ctx, obj, PDF_NAME_Subtype);
		if (obj && !pdf_is_name(ctx, obj))
			fz_throw(ctx, FZ_ERROR_GENERIC, "invalid font descriptor subtype");

		if (pdf_name_eq(ctx, obj, PDF_NAME_Type1C))
			ext = "cff";
		else if (pdf_name_eq(ctx, obj, PDF_NAME_CIDFontType0C))
			ext = "cid";
		else if (pdf_name_eq(ctx, obj, PDF_NAME_OpenType))
			ext = "otf";
		else
			fz_throw(ctx, FZ_ERROR_GENERIC, "unhandled font type '%s'", pdf_to_name(ctx, obj));
	}

	if (!stream)
	{
		fz_warn(ctx, "unhandled font type");
		return;
	}

	/* Both are assigned inside fz_try and read in fz_always. */
	fz_var(buf);
	fz_var(f);

	fz_try(ctx)
	{
		unsigned char *data;
		size_t written;
		int len;
		int rc;

		buf = pdf_load_stream(ctx, doc, pdf_to_num(ctx, stream), pdf_to_gen(ctx, stream));

		snprintf(namebuf, sizeof(namebuf), "%s-%04d.%s", safename, num, ext);
		printf("extracting font %s\n", namebuf);

		f = fopen(namebuf, "wb");
		if (!f)
			fz_throw(ctx, FZ_ERROR_GENERIC, "cannot create font file");

		len = fz_buffer_storage(ctx, buf, &data);
		written = fwrite(data, 1, (size_t)len, f);
		if (written < (size_t)len)
			fz_throw(ctx, FZ_ERROR_GENERIC, "cannot write font file");

		/* Clear f before checking the result so fz_always never closes it twice. */
		rc = fclose(f);
		f = NULL;
		if (rc < 0)
			fz_throw(ctx, FZ_ERROR_GENERIC, "cannot close font file");
	}
	fz_always(ctx)
	{
		if (f)
			fclose(f);
		fz_drop_buffer(ctx, buf);
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}
}
159
160
static void showobject(int num)
161
{
162
pdf_obj *obj;
163
164
if (!doc)
165
fz_throw(ctx, FZ_ERROR_GENERIC, "no file specified");
166
167
fz_try(ctx)
168
{
169
obj = pdf_load_object(ctx, doc, num, 0);
170
171
if (isimage(obj))
172
saveimage(num);
173
else if (isfontdesc(obj))
174
savefont(obj, num);
175
176
pdf_drop_obj(ctx, obj);
177
}
178
fz_catch(ctx)
179
{
180
fz_warn(ctx, "ignoring object %d", num);
181
}
182
}
183
184
int pdfextract_main(int argc, char **argv)
185
{
186
char *infile;
187
char *password = "";
188
int c, o;
189
190
while ((c = fz_getopt(argc, argv, "p:r")) != -1)
191
{
192
switch (c)
193
{
194
case 'p': password = fz_optarg; break;
195
case 'r': dorgb++; break;
196
default: usage(); break;
197
}
198
}
199
200
if (fz_optind == argc)
201
usage();
202
203
infile = argv[fz_optind++];
204
205
ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
206
if (!ctx)
207
{
208
fprintf(stderr, "cannot initialise context\n");
209
exit(1);
210
}
211
212
doc = pdf_open_document(ctx, infile);
213
if (pdf_needs_password(ctx, doc))
214
if (!pdf_authenticate_password(ctx, doc, password))
215
fz_throw(ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile);
216
217
if (fz_optind == argc)
218
{
219
int len = pdf_count_objects(ctx, doc);
220
for (o = 1; o < len; o++)
221
showobject(o);
222
}
223
else
224
{
225
while (fz_optind < argc)
226
{
227
showobject(atoi(argv[fz_optind]));
228
fz_optind++;
229
}
230
}
231
232
pdf_close_document(ctx, doc);
233
fz_flush_warnings(ctx);
234
fz_drop_context(ctx);
235
return 0;
236
}
237
238