Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
7639 views
1
#include "mupdf/pdf.h"
2
3
typedef struct globals_s
4
{
5
pdf_document *doc;
6
fz_context *ctx;
7
} globals;
8
9
static int
10
string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list)
11
{
12
int n = pdf_array_len(ctx, names_list);
13
int i;
14
char *str = pdf_to_str_buf(ctx, p);
15
16
for (i = 0; i < n ; i += 2)
17
{
18
if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str))
19
return 1;
20
}
21
return 0;
22
}
23
24
/*
 * Recreate page tree to only retain specified pages.
 */
27
28
/*
 * Attach one page to the new page list.
 *
 * page is 1-based (it is looked up as page-1). The page object's Parent
 * entry is pointed at parent, and the page reference is appended to kids.
 */
static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_obj *kids, int page)
{
	pdf_obj *ref = pdf_lookup_page_obj(ctx, doc, page-1);

	/* Re-parent the resolved page dictionary. */
	pdf_dict_put(ctx, pdf_resolve_indirect(ctx, ref), PDF_NAME_Parent, parent);

	/* The kids array stores the reference, not the resolved object. */
	pdf_array_push(ctx, kids, ref);
}
38
39
/*
 * Reduce the document to the pages selected by argv[0..argc-1].
 *
 * Each argument is a comma-separated list of page specs. A spec is a
 * single page number ("N") or a range ("N-M"). An omitted end ("N-")
 * means the last page; a spec starting with '-' ("-M") starts at the
 * last page. Numbers are parsed with atoi() and clamped to
 * [1, page count]. A range whose start is greater than its end is
 * emitted in descending order.
 *
 * Steps, in order:
 *  1. Replace the catalog with a new dictionary holding only Type and
 *     Pages.
 *  2. Replace the Kids array of the page tree root with the selected
 *     pages (each re-parented to that root) and update Count.
 *  3. If a Dests name tree was loaded, rebuild Names/Dests with only
 *     the entries whose destination page is among the retained Kids.
 *  4. On every retained page, delete Link annotations whose GoTo
 *     action destination is not a key of that rebuilt list.
 *
 * No error handling is done here; anything thrown propagates to the
 * caller.
 */
static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv)
{
	pdf_obj *oldroot, *root, *pages, *kids, *countobj, *parent, *olddests;
	pdf_document *doc = glo->doc;
	int argidx = 0;
	pdf_obj *names_list = NULL;
	int pagecount;
	int i;

	/* Keep only pages/type and (reduced) dest entries to avoid
	 * references to unretained pages */
	oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root);
	pages = pdf_dict_get(ctx, oldroot, PDF_NAME_Pages);
	olddests = pdf_load_name_tree(ctx, doc, PDF_NAME_Dests);

	root = pdf_new_dict(ctx, doc, 2);
	pdf_dict_put(ctx, root, PDF_NAME_Type, pdf_dict_get(ctx, oldroot, PDF_NAME_Type));
	pdf_dict_put(ctx, root, PDF_NAME_Pages, pdf_dict_get(ctx, oldroot, PDF_NAME_Pages));

	/* The new catalog takes over the old catalog's object number. */
	pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root);

	pdf_drop_obj(ctx, root);

	/* Create a new kids array with only the pages we want to keep */
	parent = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, pages), pdf_to_gen(ctx, pages));
	kids = pdf_new_array(ctx, doc, 1);

	/* Retain pages specified */
	while (argc - argidx)
	{
		int page, spage, epage;
		char *spec, *dash;
		char *pagelist = argv[argidx];

		pagecount = pdf_count_pages(ctx, doc);
		spec = fz_strsep(&pagelist, ",");
		while (spec)
		{
			dash = strchr(spec, '-');

			/* A leading dash means "start from the last page". */
			if (dash == spec)
			{
				spage = epage = pagecount;
			}
			else
			{
				spage = epage = atoi(spec);
			}

			if (dash)
			{
				/* "N-" (nothing after the dash) runs to the last page. */
				if (strlen(dash) > 1)
				{
					epage = atoi(dash + 1);
				}
				else
				{
					epage = pagecount;
				}
			}

			spage = fz_clampi(spage, 1, pagecount);
			epage = fz_clampi(epage, 1, pagecount);

			if (spage < epage)
			{
				for (page = spage; page <= epage; ++page)
				{
					retainpage(ctx, doc, parent, kids, page);
				}
			}
			else
			{
				/* Also handles spage == epage: one iteration. */
				for (page = spage; page >= epage; --page)
				{
					retainpage(ctx, doc, parent, kids, page);
				}
			}

			spec = fz_strsep(&pagelist, ",");
		}

		argidx++;
	}

	pdf_drop_obj(ctx, parent);

	/* Update page count and kids array */
	countobj = pdf_new_int(ctx, doc, pdf_array_len(ctx, kids));
	pdf_dict_put(ctx, pages, PDF_NAME_Count, countobj);
	pdf_drop_obj(ctx, countobj);
	pdf_dict_put(ctx, pages, PDF_NAME_Kids, kids);
	pdf_drop_obj(ctx, kids);

	/* Also preserve the (partial) Dests name tree */
	if (olddests)
	{
		pdf_obj *names = pdf_new_dict(ctx, doc, 1);
		pdf_obj *dests = pdf_new_dict(ctx, doc, 1);
		int len = pdf_dict_len(ctx, olddests);
		/* Our own reference to kids was dropped above; read the array
		 * back from the page tree root once instead of on every
		 * iteration. */
		pdf_obj *retained_kids = pdf_dict_get(ctx, pages, PDF_NAME_Kids);

		names_list = pdf_new_array(ctx, doc, 32);

		for (i = 0; i < len; i++)
		{
			pdf_obj *key = pdf_dict_get_key(ctx, olddests, i);
			pdf_obj *val = pdf_dict_get_val(ctx, olddests, i);
			pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME_D);

			/* The destination array is either val's D entry or val
			 * itself; its first element is the target page. */
			dest = pdf_array_get(ctx, dest ? dest : val, 0);
			if (pdf_array_contains(ctx, retained_kids, dest))
			{
				pdf_obj *key_str = pdf_new_string(ctx, doc, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key)));
				/* Flat key/value layout: string_in_names_list
				 * relies on keys sitting at even indices. */
				pdf_array_push(ctx, names_list, key_str);
				pdf_array_push(ctx, names_list, val);
				pdf_drop_obj(ctx, key_str);
			}
		}

		root = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME_Root);
		pdf_dict_put(ctx, dests, PDF_NAME_Names, names_list);
		pdf_dict_put(ctx, names, PDF_NAME_Dests, dests);
		pdf_dict_put(ctx, root, PDF_NAME_Names, names);

		pdf_drop_obj(ctx, names);
		pdf_drop_obj(ctx, dests);
		/* NOTE(review): names_list is still read by the annotation
		 * pass below after this drop. That relies on the puts above
		 * having kept it reachable through root -> Names -> Dests. */
		pdf_drop_obj(ctx, names_list);
		pdf_drop_obj(ctx, olddests);
	}

	/* Force the next call to pdf_count_pages to recount */
	glo->doc->page_count = 0;

	/* Edit each pages /Annot list to remove any links that point to
	 * nowhere. */
	pagecount = pdf_count_pages(ctx, doc);
	for (i = 0; i < pagecount; i++)
	{
		pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
		pdf_obj *pageobj = pdf_resolve_indirect(ctx, pageref);

		pdf_obj *annots = pdf_dict_get(ctx, pageobj, PDF_NAME_Annots);

		int len = pdf_array_len(ctx, annots);
		int j;

		for (j = 0; j < len; j++)
		{
			pdf_obj *o = pdf_array_get(ctx, annots, j);
			pdf_obj *p;

			/* Only Link annotations are candidates for removal. */
			if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME_Subtype), PDF_NAME_Link))
			{
				continue;
			}

			/* ... and only those whose action is a GoTo. */
			p = pdf_dict_get(ctx, o, PDF_NAME_A);
			if (!pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME_S), PDF_NAME_GoTo))
			{
				continue;
			}

			/* Keep links whose destination survived the pruning. */
			if (string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME_D), names_list))
			{
				continue;
			}

			/* FIXME: Should probably look at Next too */

			/* Remove this annotation. The array is now one shorter,
			 * so shrink the bound and revisit index j, which holds
			 * the element that followed the deleted one. */
			pdf_array_delete(ctx, annots, j);
			len--;
			j--;
		}
	}
}
192
193
/*
 * Open infile, optionally reduce it to a subset of pages, and write the
 * result to outfile.
 *
 * password is only used if the document reports that it needs one.
 * opts is passed through to pdf_write_document().
 * argv/argc hold the page specs; when argc is zero the page subset step
 * is skipped.
 *
 * Errors are not rethrown: the catch block increments *opts->errors when
 * both opts and opts->errors are non-NULL, and otherwise does nothing.
 * The document is closed on both the success and the failure path.
 */
void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, fz_write_options *opts, char *argv[], int argc)
{
	globals glo = { 0 };

	glo.ctx = ctx;

	fz_try(ctx)
	{
		glo.doc = pdf_open_document(ctx, infile);

		/* Authentication is attempted only when the document asks for it. */
		if (pdf_needs_password(ctx, glo.doc) && !pdf_authenticate_password(ctx, glo.doc, password))
		{
			fz_throw(glo.ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile);
		}

		/* Only retain the specified subset of the pages */
		if (argc)
		{
			retainpages(ctx, &glo, argc, argv);
		}

		pdf_write_document(ctx, glo.doc, outfile, opts);
	}
	fz_always(ctx)
	{
		/* glo.doc is still zero-initialised if opening failed. */
		pdf_close_document(ctx, glo.doc);
	}
	fz_catch(ctx)
	{
		/* Count the failure for the caller instead of propagating it. */
		if (opts && opts->errors)
		{
			*opts->errors += 1;
		}
	}
}
222
223