Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
quarto-dev
GitHub Repository: quarto-dev/quarto-cli
Path: blob/main/tools/pdf-tag-tree.ts
6446 views
1
#!/usr/bin/env -S deno run --allow-read --allow-env
2
/**
3
* pdf-tag-tree.ts
4
*
5
* Extracts and displays the PDF structure tree (tag hierarchy) with MCIDs.
6
* Useful for debugging ensurePdfTextPositions issues.
7
*
8
* Usage: quarto run tools/pdf-tag-tree.ts <pdf-file> <search-text>
9
*
10
* The search-text is required and determines which page's structure tree to display.
11
*/
12
13
import * as pdfjsLib from "npm:[email protected]/legacy/build/pdf.mjs";
14
15
/**
 * Leaf entry of a structure tree: a reference to a marked-content
 * sequence (an MCID reference) rather than a nested structure node.
 */
interface StructTreeContent {
  /** Discriminator; always the literal `"content"` for this leaf kind. */
  type: "content";
  /** Identifier of the referenced marked-content sequence. */
  id: string;
}
19
20
/**
 * Structure-tree element (a "tag") as consumed by this tool.
 */
interface StructTreeNode {
  /** Tag name of the element; printed as `<role>` and used as a path segment. */
  role: string;
  /** Nested elements and/or marked-content references, in tree order. */
  children?: (StructTreeNode | StructTreeContent)[];
  /** Optional `alt` attribute of the element, reported when non-empty. */
  alt?: string;
  /** Optional `lang` attribute of the element, reported when non-empty. */
  lang?: string;
}
26
27
/**
 * Marked-content marker that appears between text runs when text content
 * is requested with `includeMarkedContent: true`.
 */
interface TextMarkedContent {
  /** Marker kind; this tool reacts to `"beginMarkedContentProps"` and `"endMarkedContent"`. */
  type: string;
  /** Identifier of the marked-content sequence, when the marker carries one. */
  id?: string;
  /** Tag name attached to the marker, when present. */
  tag?: string;
}
32
33
/**
 * A run of text taken from a page's text content.
 */
interface TextItem {
  /** The text of the run; searched for the user's search text. */
  str: string;
  /** Transform array; indices 4 and 5 are reported as the x and y position. */
  transform: number[];
  /** Width of the run. */
  width: number;
  /** Height of the run. */
  height: number;
}
39
40
/**
 * Type guard for structure-tree leaves that reference marked content.
 *
 * @param child - Any value, typically an entry of a node's `children` array.
 * @returns `true` only for a non-null object whose `type` is `"content"`;
 *   `false` for everything else. The result is always a real boolean, so a
 *   falsy input such as `null` or `0` is never handed back to the caller.
 */
function isStructTreeContent(child: unknown): child is StructTreeContent {
  return (
    typeof child === "object" &&
    child !== null &&
    (child as { type?: unknown }).type === "content"
  );
}
43
44
/**
 * Type guard separating marked-content markers from text runs.
 *
 * @param item - Any value, typically an entry of a text-content `items` array.
 * @returns `true` when `item` is a non-null object with a string `type`
 *   property; `false` otherwise. `null`, `undefined` and primitives yield
 *   `false` instead of raising a TypeError from the `in` operator.
 */
function isTextMarkedContent(item: unknown): item is TextMarkedContent {
  return (
    typeof item === "object" &&
    item !== null &&
    typeof (item as { type?: unknown }).type === "string"
  );
}
47
48
/**
 * Indexes a structure tree by MCID.
 *
 * Walks `node` depth-first. For every marked-content reference found among a
 * node's children, records the chain of roles from the starting node down to
 * (and including) the node that owns the reference, that owner's role, and
 * its non-empty `alt` / `lang` attributes.
 *
 * @param node - Structure node to index.
 * @param path - Roles of the ancestors of `node`; left at its default by outside callers.
 * @param result - Map that receives the entries; it is filled in place and returned.
 * @returns `result`, keyed by MCID.
 */
function buildMcidPaths(
  node: StructTreeNode,
  path: string[] = [],
  result: Map<string, { path: string[]; role: string; attrs: Record<string, any> }> = new Map()
): Map<string, { path: string[]; role: string; attrs: Record<string, any> }> {
  // Every reference owned by this node shares the same role trail.
  const trail = [...path, node.role];
  const kids = node.children ?? [];

  kids.forEach((kid) => {
    if (!isStructTreeContent(kid)) {
      // A nested structure element: descend, extending the trail.
      buildMcidPaths(kid, trail, result);
      return;
    }

    // A marked-content reference: attach the owning node's attributes.
    const attrs: Record<string, any> = {
      ...(node.alt ? { alt: node.alt } : {}),
      ...(node.lang ? { lang: node.lang } : {}),
    };
    result.set(kid.id, { path: trail, role: node.role, attrs });
  });

  return result;
}
76
77
/**
 * Writes `node` and its descendants to the console as an indented outline.
 *
 * Each node becomes one line of the form `<role> [attrs] (MCIDs: ...)`,
 * indented two spaces per level. A node that directly owns at least one MCID
 * from `highlightMcids` gets a trailing ` # <-- found` marker.
 *
 * @param node - Structure node to print.
 * @param indent - Nesting level of `node`; also the depth counter.
 * @param maxDepth - Deepest level printed; anything deeper is replaced by a
 *   single `...(truncated)` line.
 * @param highlightMcids - MCIDs whose owning nodes should be marked.
 */
function printStructTree(
  node: StructTreeNode,
  indent: number = 0,
  maxDepth: number = 10,
  highlightMcids: Set<string> = new Set()
): void {
  const pad = " ".repeat(indent * 2);

  if (indent > maxDepth) {
    console.log(pad + "...(truncated)");
    return;
  }

  // Split the children into marked-content references and nested elements.
  const leafIds: string[] = [];
  const subtrees: StructTreeNode[] = [];
  (node.children ?? []).forEach((kid) => {
    if (isStructTreeContent(kid)) {
      leafIds.push(kid.id);
    } else {
      subtrees.push(kid);
    }
  });

  // Only non-empty attributes are shown, always in alt-then-lang order.
  const attrList = [
    node.alt ? `alt="${node.alt}"` : "",
    node.lang ? `lang="${node.lang}"` : "",
  ].filter((entry) => entry !== "");

  let line = `${pad}<${node.role}>`;
  if (attrList.length > 0) {
    line += ` [${attrList.join(", ")}]`;
  }
  if (leafIds.length > 0) {
    line += ` (MCIDs: ${leafIds.join(", ")})`;
  }
  if (leafIds.some((id) => highlightMcids.has(id))) {
    line += " # <-- found";
  }
  console.log(line);

  subtrees.forEach((sub) => {
    printStructTree(sub, indent + 1, maxDepth, highlightMcids);
  });
}
115
116
/** A text run from a page's text content, paired with the MCID enclosing it (`null` when none). */
interface TaggedTextRun {
  text: TextItem;
  mcid: string | null;
}

/**
 * Pairs every text run in `items` with the id of the innermost open
 * marked-content sequence that carries one.
 *
 * Open sequences are tracked on a stack, so the end of a nested sequence
 * restores the enclosing MCID instead of discarding it. Sequences opened
 * without an id are pushed as `null` so that begin/end markers stay balanced.
 */
function tagTextRunsWithMcids(items: readonly unknown[]): TaggedTextRun[] {
  const openIds: (string | null)[] = [];
  const runs: TaggedTextRun[] = [];

  for (const item of items) {
    if (!isTextMarkedContent(item)) {
      // Text run: attribute it to the nearest enclosing sequence that has an id.
      let mcid: string | null = null;
      for (let i = openIds.length - 1; i >= 0 && mcid === null; i--) {
        mcid = openIds[i] ?? null;
      }
      runs.push({ text: item as TextItem, mcid });
    } else if (
      item.type === "beginMarkedContent" ||
      item.type === "beginMarkedContentProps"
    ) {
      openIds.push(item.id ?? null);
    } else if (item.type === "endMarkedContent") {
      openIds.pop();
    }
  }

  return runs;
}

/**
 * Entry point: `<pdf-file> <search-text>`.
 *
 * Finds the first page with a text run containing the search text, prints
 * that page's structure tree (marking the nodes that own a matching MCID),
 * then lists each matching text run with its MCID, position and tag path.
 *
 * Exits with status 1 when an argument is missing or the text is not found.
 * The search is per text run: text split across several runs is not matched.
 */
async function main() {
  const [file, searchText] = Deno.args;

  if (!file || !searchText) {
    console.error("Usage: quarto run tools/pdf-tag-tree.ts <pdf-file> <search-text>");
    Deno.exit(1);
  }

  const data = await Deno.readFile(file);
  const pdf = await pdfjsLib.getDocument({
    data,
    useWorkerFetch: false,
    isEvalSupported: false,
    useSystemFonts: true,
  }).promise;

  // Scan pages in order and stop at the first one with a matching text run.
  // The tagged runs of that page are kept so its text content is fetched once.
  let foundPage: number | null = null;
  let runs: TaggedTextRun[] = [];

  for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
    const candidate = await pdf.getPage(pageNum);
    const content = await candidate.getTextContent({ includeMarkedContent: true });
    const candidateRuns = tagTextRunsWithMcids(content.items);

    if (candidateRuns.some((run) => run.text.str.includes(searchText))) {
      foundPage = pageNum;
      runs = candidateRuns;
      break;
    }
  }

  if (foundPage === null) {
    console.error(`Error: "${searchText}" not found in PDF`);
    Deno.exit(1);
  }

  console.log(`Found "${searchText}" on page ${foundPage}\n`);

  const page = await pdf.getPage(foundPage);
  const structTree = await page.getStructTree();

  // MCID -> tag path lookup for this page (empty when the page is untagged).
  const mcidPaths = structTree
    ? buildMcidPaths(structTree as StructTreeNode)
    : new Map<string, { path: string[]; role: string; attrs: Record<string, any> }>();

  const matches = runs.filter((run) => run.text.str.includes(searchText));

  // MCIDs that enclose at least one matching run; used to mark tree nodes.
  const matchingMcids = new Set<string>();
  for (const { mcid } of matches) {
    if (mcid !== null) {
      matchingMcids.add(mcid);
    }
  }

  console.log(`=== STRUCTURE TREE (Page ${foundPage}) ===\n`);
  if (structTree) {
    printStructTree(structTree as StructTreeNode, 0, 15, matchingMcids);
  } else {
    console.log("No structure tree found (PDF may not be tagged)");
  }
  console.log("\n");

  console.log(`=== TEXT ITEMS CONTAINING "${searchText}" ===\n`);

  for (const { text, mcid } of matches) {
    const x = text.transform[4];
    const y = text.transform[5];
    const pathInfo = mcid ? mcidPaths.get(mcid) : undefined;

    console.log(`Text: "${text.str}"`);
    console.log(` MCID: ${mcid}`);
    console.log(` Position: x=${x.toFixed(1)}, y=${y.toFixed(1)}`);
    if (pathInfo) {
      console.log(` Tag path: ${pathInfo.path.join(" > ")}`);
      if (Object.keys(pathInfo.attrs).length > 0) {
        console.log(` Attrs: ${JSON.stringify(pathInfo.attrs)}`);
      }
    }
    console.log();
  }
}
230
231
// Run the tool. On failure, report the error and exit non-zero so that
// shells and scripts invoking it can detect that it did not succeed.
main().catch((err) => {
  console.error(err);
  Deno.exit(1);
});
232
233