Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
quarto-dev
GitHub Repository: quarto-dev/quarto-cli
Path: blob/main/tests/verify-pdf-text-position.ts
6446 views
1
/*
2
* verify-pdf-text-position.ts
3
*
4
* PDF text position verification using semantic structure tree.
5
* Uses pdfjs-dist directly to access MCIDs and structure tree.
6
*
7
* REQUIREMENTS:
8
* This module requires tagged PDFs with PDF 1.4+ structure tree support.
9
* Tagged PDFs contain Marked Content Identifiers (MCIDs) that link text
10
* content to semantic structure elements (P, H1, Figure, Table, etc.).
11
*
12
* Currently confirmed working:
13
* - Typst: Produces tagged PDFs by default
14
*
15
* Not yet working:
16
* - LaTeX: Requires \DocumentMetadata{} before \documentclass for tagging,
17
* which Quarto doesn't currently support. When LaTeX tagged PDF support
18
* is available, this module should work with minimal changes since we
19
* use only basic PDF 1.4 tagged structure features.
20
* - ConTeXt: Pandoc supports +tagging extension, but Quarto's context
21
* format doesn't compile to PDF.
22
*
23
* SPECIAL ROLES:
24
* - role: "Decoration" - Use for untagged page elements like headers, footers,
25
* page numbers, and other decorations. These use text item bounds directly
26
* instead of requiring MCID/structure tree support.
27
* - role: "Page" - Use for the entire page bounds. Requires `page` field to
28
* specify which page number (1-indexed). The `text` field is ignored.
29
* Useful for NOT assertions since Page intersects all content on that page.
30
*
31
* Copyright (C) 2020-2025 Posit Software, PBC
32
*/
33
34
import { assert } from "testing/asserts";
35
import { z } from "zod";
36
import { ExecuteOutput, Verify } from "./test.ts";
37
38
// ============================================================================
39
// Zod Schemas and Type Definitions
40
// ============================================================================
41
42
/** The four bounding-box edges a selector may pin a comparison to. */
export const EdgeSchema = z.enum([
  "left",
  "right",
  "top",
  "bottom",
]);
export type Edge = z.infer<typeof EdgeSchema>;

/** Relations that place the subject on one side of the object. */
export const DirectionalRelationSchema = z.enum([
  "leftOf",
  "rightOf",
  "above",
  "below",
]);
export type DirectionalRelation = z.infer<typeof DirectionalRelationSchema>;

/** Relations that compare one edge of the subject with one edge of the object. */
export const AlignmentRelationSchema = z.enum([
  "leftAligned",
  "rightAligned",
  "topAligned",
  "bottomAligned",
]);
export type AlignmentRelation = z.infer<typeof AlignmentRelationSchema>;

/** Any supported relation: directional or alignment. */
export const RelationSchema = z.union([
  DirectionalRelationSchema,
  AlignmentRelationSchema,
]);
export type Relation = z.infer<typeof RelationSchema>;
54
55
// Text selector schema
56
// Note: Label/ID checking is not supported because:
57
// 1. Typst does not write labels to PDF StructElem /ID attributes (labels become
58
// named destinations for links, but not structure element identifiers)
59
// 2. Even if IDs were present, pdf.js doesn't expose /ID through getStructTree()
60
/**
 * Selector describing which piece of PDF content an assertion refers to.
 * Every field is optional at the schema level.
 */
export const TextSelectorSchema = z.object({
  /** Text to search for (ignored for role: "Page"). */
  text: z.string().optional(),
  /** PDF 1.4 structure role: P, H1, H2, Figure, Table, Span, etc. */
  role: z.string().optional(),
  /** Page number (1-indexed), required for role: "Page". */
  page: z.number().optional(),
  /** Which edge to use for comparison (overrides the relation default). */
  edge: EdgeSchema.optional(),
  /** Aggregate bbox to the ancestor with this role (e.g., "Div", "P"). */
  granularity: z.string().optional(),
});
export type TextSelector = z.infer<typeof TextSelectorSchema>;

/** A subject/object is either a bare search string or a full TextSelector. */
const SubjectObjectSchema = z.union([
  z.string(),
  TextSelectorSchema,
]);
71
72
/**
 * Tag-only assertion: validates the subject's semantic role without any
 * position comparison. Strict, so any key other than `subject` is rejected.
 */
export const TagOnlyAssertionSchema = z
  .object({
    subject: SubjectObjectSchema,
  })
  .strict();
export type TagOnlyAssertion = z.infer<typeof TagOnlyAssertionSchema>;
77
78
/**
 * Directional assertion: leftOf, rightOf, above, below with optional distance
 * constraints.
 *
 * The object schema is strict, matching the alignment and tag-only schemas.
 * Without it, unknown keys (a misspelled `bymin`, or a `tolerance` that only
 * applies to alignment relations) are silently stripped and the assertion is
 * evaluated without the constraint the author intended.
 *
 * The refinement rejects contradictory bounds (byMin greater than byMax).
 */
export const DirectionalAssertionSchema = z
  .object({
    subject: SubjectObjectSchema,
    relation: DirectionalRelationSchema,
    object: SubjectObjectSchema,
    /** Minimum distance between the compared edges. */
    byMin: z.number().optional(),
    /** Maximum distance between the compared edges. */
    byMax: z.number().optional(),
  })
  .strict()
  .refine(
    (data) =>
      data.byMin === undefined ||
      data.byMax === undefined ||
      data.byMin <= data.byMax,
    { message: "byMin must be <= byMax" },
  );
export type DirectionalAssertion = z.infer<typeof DirectionalAssertionSchema>;
90
91
/**
 * Alignment assertion: leftAligned, rightAligned, topAligned, bottomAligned,
 * with an optional tolerance. Strict, so keys that belong to other assertion
 * kinds (such as byMin/byMax) are rejected.
 */
export const AlignmentAssertionSchema = z
  .object({
    subject: SubjectObjectSchema,
    relation: AlignmentRelationSchema,
    object: SubjectObjectSchema,
    /** Allowed edge difference. Default: 2pt. */
    tolerance: z.number().optional(),
  })
  .strict();
export type AlignmentAssertion = z.infer<typeof AlignmentAssertionSchema>;
99
100
/** Union of all assertion kinds accepted by the verifier. */
export const PdfTextPositionAssertionSchema = z.union([
  DirectionalAssertionSchema,
  AlignmentAssertionSchema,
  TagOnlyAssertionSchema,
]);
export type PdfTextPositionAssertion = z.infer<
  typeof PdfTextPositionAssertionSchema
>;
107
108
// Type guards for assertion discrimination (using Zod safeParse)
109
/** Type guard: true when `a` validates against DirectionalAssertionSchema. */
export function isDirectionalAssertion(a: unknown): a is DirectionalAssertion {
  const { success } = DirectionalAssertionSchema.safeParse(a);
  return success;
}
112
113
/** Type guard: true when `a` validates against AlignmentAssertionSchema. */
export function isAlignmentAssertion(a: unknown): a is AlignmentAssertion {
  const { success } = AlignmentAssertionSchema.safeParse(a);
  return success;
}
116
117
/** Type guard: true when `a` validates against TagOnlyAssertionSchema. */
export function isTagOnlyAssertion(a: unknown): a is TagOnlyAssertion {
  const { success } = TagOnlyAssertionSchema.safeParse(a);
  return success;
}
120
121
// Computed bounding box
122
/**
 * Axis-aligned bounding box on a single page.
 * Edges are derived as: left = x, right = x + width, top = y, bottom = y + height.
 */
interface BBox {
  /** Left edge. */
  x: number;
  /** Top edge. */
  y: number;
  /** Horizontal extent. */
  width: number;
  /** Vertical extent. */
  height: number;
  /** Page the box belongs to. */
  page: number;
}
129
130
// Internal: text item with MCID tracking
131
/** Internal: one extracted text run together with the MCID it was found under. */
interface MarkedTextItem {
  /** The text of the run. */
  str: string;
  /** Horizontal position of the run. */
  x: number;
  /** Vertical position of the run. */
  y: number;
  /** Width of the run. */
  width: number;
  /** Height of the run. */
  height: number;
  /** Marked-content id, e.g. "p2R_mc0"; null when the run is not inside one. */
  mcid: string | null;
  /** Page the run was extracted from. */
  page: number;
}
140
141
// Structure tree node (from pdfjs-dist)
142
/** Structure tree node (from pdfjs-dist). */
interface StructTreeNode {
  /** Structure role of the element (e.g. P, H1, Figure). */
  role: string;
  /** Child elements and/or content references. */
  children?: Array<StructTreeNode | StructTreeContent>;
  alt?: string;
  lang?: string;
}
148
149
/** Leaf of the structure tree: a reference to content, an object, or an annotation. */
interface StructTreeContent {
  /** Kind of reference; only "content" entries are treated as MCIDs in this module. */
  type: "content" | "object" | "annotation";
  /** Identifier of the referenced item. */
  id: string;
}
153
154
// Text content item types from pdfjs-dist
155
/** Text content item as returned by pdfjs-dist. */
interface TextItem {
  /** The text of the item. */
  str: string;
  dir: string;
  /** Transform: [scaleX, skewX, skewY, scaleY, translateX, translateY]. */
  transform: number[];
  width: number;
  height: number;
  fontName: string;
  hasEOL: boolean;
}
164
165
/** Marked-content boundary item interleaved with text items by pdfjs-dist. */
interface TextMarkedContent {
  /** Whether this item opens (with or without properties) or closes a sequence. */
  type: "beginMarkedContent" | "beginMarkedContentProps" | "endMarkedContent";
  /** Marked-content id, when present. */
  id?: string;
  tag?: string;
}
170
171
// Internal: resolved selector with computed bounds
172
/** Internal: a selector after resolution, together with its computed bounds. */
interface ResolvedSelector {
  /** The selector that was resolved. */
  selector: TextSelector;
  /** The text item the selector matched. */
  textItem: MarkedTextItem;
  /** Structure node associated with the match, or null when there is none. */
  structNode: StructTreeNode | null;
  /** Bounding box used for position comparisons. */
  bbox: BBox;
}
178
179
// ============================================================================
180
// Constants
181
// ============================================================================
182
183
/** Tolerance (in pt) applied to alignment assertions that do not specify one. */
const DEFAULT_ALIGNMENT_TOLERANCE = 2;
184
185
// ============================================================================
186
// Relation Predicates and Edge Logic
187
// ============================================================================
188
189
// Coordinate system: origin at top-left, y increases downward
190
191
// Relation membership sets, derived from the Zod enums so they cannot drift
// from the schemas.
const directionalRelations = new Set<Relation>(
  DirectionalRelationSchema.options,
);
const alignmentRelations = new Set<Relation>(
  AlignmentRelationSchema.options,
);
194
195
/**
 * Default edges compared for each relation (from spec table).
 * Directional relations compare the facing edges of subject and object;
 * alignment relations compare the same edge on both.
 */
const relationDefaults: Record<Relation, { subject: Edge; object: Edge }> = {
  // Directional
  leftOf: { subject: "right", object: "left" },
  rightOf: { subject: "left", object: "right" },
  above: { subject: "bottom", object: "top" },
  below: { subject: "top", object: "bottom" },
  // Alignment
  leftAligned: { subject: "left", object: "left" },
  rightAligned: { subject: "right", object: "right" },
  topAligned: { subject: "top", object: "top" },
  bottomAligned: { subject: "bottom", object: "bottom" },
};
206
207
/**
 * Return the coordinate of one edge of a bounding box.
 *
 * @param bbox Box to read from.
 * @param edge Which edge to return.
 * @returns x for "left", x + width for "right", y for "top", y + height for "bottom".
 */
function getEdgeValue(bbox: BBox, edge: Edge): number {
  const { x: left, y: top, width, height } = bbox;
  switch (edge) {
    case "left":
      return left;
    case "right":
      return left + width;
    case "top":
      return top;
    case "bottom":
      return top + height;
  }
}
220
221
/** Outcome of evaluating a directional relation. */
interface DirectionalResult {
  passed: boolean;
  /** Edge of the subject that was compared (override or relation default). */
  subjectEdge: Edge;
  /** Edge of the object that was compared (override or relation default). */
  objectEdge: Edge;
  subjectValue: number;
  objectValue: number;
  /** Signed gap between the compared edges; positive when the relation holds. */
  distance: number;
  /** Present only when `passed` is false. */
  failureReason?: string;
}

/**
 * Evaluate a directional relation (leftOf, rightOf, above, below) between two
 * bounding boxes, honouring edge overrides and distance constraints.
 *
 * Checks are applied in order and the first failure wins: direction, then
 * byMin, then byMax.
 *
 * @param relation Directional relation to test.
 * @param subjectBBox Bounding box of the subject.
 * @param objectBBox Bounding box of the object.
 * @param subjectEdgeOverride Subject edge to use instead of the relation default.
 * @param objectEdgeOverride Object edge to use instead of the relation default.
 * @param byMin Minimum allowed distance, if any.
 * @param byMax Maximum allowed distance, if any.
 * @returns The compared edges and values, the distance, and pass/fail.
 */
function evaluateDirectionalRelation(
  relation: DirectionalRelation,
  subjectBBox: BBox,
  objectBBox: BBox,
  subjectEdgeOverride?: Edge,
  objectEdgeOverride?: Edge,
  byMin?: number,
  byMax?: number,
): DirectionalResult {
  const { subject: defaultSubjectEdge, object: defaultObjectEdge } =
    relationDefaults[relation];
  const subjectEdge = subjectEdgeOverride ?? defaultSubjectEdge;
  const objectEdge = objectEdgeOverride ?? defaultObjectEdge;

  const subjectValue = getEdgeValue(subjectBBox, subjectEdge);
  const objectValue = getEdgeValue(objectBBox, objectEdge);

  // leftOf/above: the subject must have the smaller coordinate.
  // rightOf/below: the subject must have the larger coordinate.
  // Either way the distance is oriented to be positive when the relation holds.
  const subjectComesFirst = relation === "leftOf" || relation === "above";
  const distance = subjectComesFirst
    ? objectValue - subjectValue
    : subjectValue - objectValue;
  const directionHolds = subjectComesFirst
    ? subjectValue < objectValue
    : subjectValue > objectValue;

  const measured = {
    subjectEdge,
    objectEdge,
    subjectValue,
    objectValue,
    distance,
  };
  const fail = (failureReason: string): DirectionalResult => ({
    passed: false,
    ...measured,
    failureReason,
  });

  if (!directionHolds) {
    return fail("directional constraint not satisfied");
  }
  if (byMin !== undefined && distance < byMin) {
    return fail(`distance ${distance.toFixed(1)}pt < byMin ${byMin}pt`);
  }
  if (byMax !== undefined && distance > byMax) {
    return fail(`distance ${distance.toFixed(1)}pt > byMax ${byMax}pt`);
  }

  return { passed: true, ...measured };
}
295
296
/** Outcome of evaluating an alignment relation. */
interface AlignmentResult {
  passed: boolean;
  /** Edge of the subject that was compared (override or relation default). */
  subjectEdge: Edge;
  /** Edge of the object that was compared (override or relation default). */
  objectEdge: Edge;
  subjectValue: number;
  objectValue: number;
  /** Absolute difference between the two edge values. */
  difference: number;
}

/**
 * Evaluate an alignment relation between two bounding boxes: the compared
 * edges must differ by no more than `tolerance`.
 *
 * @param relation Alignment relation to test.
 * @param subjectBBox Bounding box of the subject.
 * @param objectBBox Bounding box of the object.
 * @param tolerance Maximum allowed absolute difference (inclusive).
 * @param subjectEdgeOverride Subject edge to use instead of the relation default.
 * @param objectEdgeOverride Object edge to use instead of the relation default.
 * @returns The compared edges and values, their difference, and pass/fail.
 */
function evaluateAlignmentRelation(
  relation: AlignmentRelation,
  subjectBBox: BBox,
  objectBBox: BBox,
  tolerance: number,
  subjectEdgeOverride?: Edge,
  objectEdgeOverride?: Edge,
): AlignmentResult {
  const fallback = relationDefaults[relation];
  const subjectEdge = subjectEdgeOverride ?? fallback.subject;
  const objectEdge = objectEdgeOverride ?? fallback.object;

  const subjectValue = getEdgeValue(subjectBBox, subjectEdge);
  const objectValue = getEdgeValue(objectBBox, objectEdge);
  const difference = Math.abs(subjectValue - objectValue);
  const passed = difference <= tolerance;

  return {
    passed,
    subjectEdge,
    objectEdge,
    subjectValue,
    objectValue,
    difference,
  };
}
331
332
// ============================================================================
333
// Helper Functions
334
// ============================================================================
335
336
/**
 * Turn the shorthand string form of a selector into a TextSelector.
 * A selector that is already an object is returned as-is (same reference).
 */
function normalizeSelector(s: string | TextSelector): TextSelector {
  return typeof s === "string" ? { text: s } : s;
}
342
343
/**
 * Type guard distinguishing structure-tree leaves (content/object/annotation
 * references) from structure elements.
 */
function isStructTreeContent(node: StructTreeNode | StructTreeContent): node is StructTreeContent {
  if (!("type" in node)) {
    return false;
  }
  switch (node.type) {
    case "content":
    case "object":
    case "annotation":
      return true;
    default:
      return false;
  }
}
346
347
/** Type guard: the item carries a string `str`, i.e. it is a text run. */
function isTextItem(item: TextItem | TextMarkedContent): item is TextItem {
  return "str" in item && typeof item.str === "string";
}

/** Type guard: the item carries a string `type`, i.e. it is a marked-content boundary. */
function isTextMarkedContent(item: TextItem | TextMarkedContent): item is TextMarkedContent {
  return "type" in item && typeof item.type === "string";
}

/**
 * Extract MarkedTextItem[] from a pdfjs getTextContent result.
 *
 * Text items and marked-content boundaries arrive interleaved. Each text item
 * is tagged with the MCID of the innermost enclosing marked-content sequence
 * that has an id. Marked-content sequences can nest, so the MCID in effect
 * before each open sequence is kept on a stack and restored when that
 * sequence ends. Simply clearing the MCID on every "endMarkedContent" would
 * strip the MCID from text that follows a nested sequence but is still inside
 * the tagged one.
 *
 * @param items Interleaved text and marked-content items for one page.
 * @param pageNum Page number recorded on every returned item.
 * @param pageHeight Page height, used to flip the y axis to a top-left origin.
 * @returns One MarkedTextItem per text item, in input order.
 */
function extractMarkedTextItems(
  items: (TextItem | TextMarkedContent)[],
  pageNum: number,
  pageHeight: number,
): MarkedTextItem[] {
  const result: MarkedTextItem[] = [];
  // For each currently open sequence: the MCID that was in effect before it opened.
  const enclosingMcids: (string | null)[] = [];
  let currentMcid: string | null = null;

  for (const item of items) {
    if (isTextMarkedContent(item)) {
      if (
        item.type === "beginMarkedContent" ||
        item.type === "beginMarkedContentProps"
      ) {
        enclosingMcids.push(currentMcid);
        // A sequence without an id inherits the MCID of its enclosing sequence.
        if (item.type === "beginMarkedContentProps" && item.id) {
          currentMcid = item.id;
        }
      } else if (item.type === "endMarkedContent") {
        // An unbalanced end (empty stack) falls back to "no MCID".
        currentMcid = enclosingMcids.pop() ?? null;
      }
    } else if (isTextItem(item)) {
      // Transform: [scaleX, skewX, skewY, scaleY, translateX, translateY]
      const tm = item.transform;
      const x = tm[4];
      // Convert from PDF coordinates (bottom-left origin) to top-left origin.
      // NOTE(review): translateY is presumably the text baseline, so `y` is
      // the flipped baseline rather than the glyph top - confirm this is the
      // intended reference before changing it, as existing assertions may be
      // calibrated against it.
      const y = pageHeight - tm[5];
      const height = Math.sqrt(tm[2] * tm[2] + tm[3] * tm[3]);

      result.push({
        str: item.str,
        x,
        y,
        width: item.width,
        height,
        mcid: currentMcid,
        page: pageNum,
      });
    }
  }

  return result;
}
396
397
/**
 * Walk a structure tree and fill two lookup tables:
 *  - mcidMap:   MCID -> the struct node that directly contains that content
 *  - parentMap: struct node -> its parent struct node (for upward traversal)
 *
 * The maps passed in are mutated and also returned, so recursive calls share
 * the same accumulators.
 *
 * @param tree Node whose children are visited; null yields the maps unchanged.
 * @param mcidMap Accumulator for MCID ownership.
 * @param parentMap Accumulator for parent links.
 * @param parentNode Node recorded as owner/parent of `tree`'s children; when
 *   null, `tree` itself is used.
 */
function buildMcidStructMap(
  tree: StructTreeNode | null,
  mcidMap: Map<string, StructTreeNode> = new Map(),
  parentMap: Map<StructTreeNode, StructTreeNode> = new Map(),
  parentNode: StructTreeNode | null = null,
): { mcidMap: Map<string, StructTreeNode>; parentMap: Map<StructTreeNode, StructTreeNode> } {
  const maps = { mcidMap, parentMap };
  if (!tree) {
    return maps;
  }

  // Node that owns the children visited below.
  const owner = parentNode ?? tree;

  for (const child of tree.children ?? []) {
    if (!isStructTreeContent(child)) {
      // Struct element: remember its parent, then descend with the child as
      // the owner of its own children.
      parentMap.set(child, owner);
      buildMcidStructMap(child, mcidMap, parentMap, child);
      continue;
    }
    // Leaf: only "content" references with an id are MCIDs.
    if (child.type === "content" && child.id) {
      mcidMap.set(child.id, owner);
    }
  }

  return maps;
}
431
432
/**
 * List the MCIDs that are immediate children of a structure node.
 * Child structure nodes are skipped, not descended into.
 */
function collectDirectMcids(node: StructTreeNode): string[] {
  return (node.children ?? []).flatMap((child) =>
    isStructTreeContent(child) && child.type === "content" && child.id
      ? [child.id]
      : []
  );
}
450
451
/**
 * List every MCID found under a structure node, including those of all
 * descendant structure nodes, in document (depth-first) order.
 * Used for granularity aggregation to compute the bbox of an entire subtree.
 */
function collectAllMcids(node: StructTreeNode): string[] {
  return (node.children ?? []).flatMap((child) => {
    if (!isStructTreeContent(child)) {
      // Child struct node: gather its subtree.
      return collectAllMcids(child);
    }
    return child.type === "content" && child.id ? [child.id] : [];
  });
}
471
472
/**
 * Starting at `node` itself, climb the parent links until a node with the
 * requested role is found.
 *
 * @param node Node to start from (it is checked first).
 * @param targetRole Role to look for.
 * @param parentMap Child -> parent links.
 * @returns The nearest matching node, or null when the chain ends without one.
 */
function findAncestorWithRole(
  node: StructTreeNode,
  targetRole: string,
  parentMap: Map<StructTreeNode, StructTreeNode>,
): StructTreeNode | null {
  for (
    let candidate: StructTreeNode | undefined = node;
    candidate;
    candidate = parentMap.get(candidate)
  ) {
    if (candidate.role === targetRole) {
      return candidate;
    }
  }
  return null;
}
490
491
/**
 * True when the string contains no non-whitespace character (an empty string
 * counts). Used to filter out horizontal skip spaces in PDF content.
 */
function isWhitespaceOnly(str: string): boolean {
  return !/\S/.test(str);
}
498
499
/**
 * Compute the smallest box enclosing all non-whitespace items.
 * Whitespace-only items are ignored because they are often horizontal skips.
 *
 * @param items Text items to enclose.
 * @returns The enclosing box, carrying the page of the first non-whitespace
 *   item, or null when no non-whitespace item exists.
 */
function unionBBox(items: MarkedTextItem[]): BBox | null {
  const visible = items.filter((item) => !isWhitespaceOnly(item.str));
  if (visible.length === 0) {
    return null;
  }

  const extent = visible.reduce(
    (acc, item) => ({
      left: Math.min(acc.left, item.x),
      top: Math.min(acc.top, item.y),
      right: Math.max(acc.right, item.x + item.width),
      bottom: Math.max(acc.bottom, item.y + item.height),
    }),
    { left: Infinity, top: Infinity, right: -Infinity, bottom: -Infinity },
  );

  return {
    x: extent.left,
    y: extent.top,
    width: extent.right - extent.left,
    height: extent.bottom - extent.top,
    page: visible[0].page,
  };
}
529
530
/**
 * Compute the bounding box of a structure node from the text items of its
 * direct MCIDs only. Nested structure nodes are deliberately excluded so that
 * content such as margin notes nested inside a body paragraph does not
 * inflate the paragraph's box.
 *
 * @param node Structure node to measure.
 * @param mcidToTextItems Lookup from MCID to the text items found under it.
 * @returns The union box, or null when there is no non-whitespace content.
 */
function computeStructBBox(
  node: StructTreeNode,
  mcidToTextItems: Map<string, MarkedTextItem[]>,
): BBox | null {
  const items: MarkedTextItem[] = [];
  for (const mcid of collectDirectMcids(node)) {
    // MCIDs without extracted text contribute nothing.
    items.push(...(mcidToTextItems.get(mcid) ?? []));
  }
  return unionBBox(items);
}
543
544
// ============================================================================
545
// Main Predicate
546
// ============================================================================
547
548
/**
 * Verify spatial positions of text in a rendered PDF using semantic structure.
 * Uses pdfjs-dist to access MCIDs and the structure tree.
 *
 * Processing stages:
 *  1. Validate and normalize every assertion with Zod.
 *  2. Load the PDF.
 *  3/4. Per page: extract text items (with MCIDs) and the structure tree.
 *  5. Locate each search text (must be unique unless the role is "Decoration").
 *  6/7. Resolve selectors to structure nodes and bounding boxes, then validate
 *     requested roles.
 *  8. Evaluate positive and negative position assertions.
 *  9. Fail once, listing every collected problem.
 *
 * @param file Path of the PDF to inspect.
 * @param assertions Assertions that must hold.
 * @param noMatchAssertions Assertions that must NOT hold.
 * @returns A Verify whose `verify` fails with an aggregated message when any
 *   assertion is invalid, cannot be resolved, or is violated.
 */
export const ensurePdfTextPositions = (
  file: string,
  assertions: PdfTextPositionAssertion[],
  noMatchAssertions?: PdfTextPositionAssertion[],
): Verify => {
  return {
    name: `Inspecting ${file} for text position assertions`,
    verify: async (_output: ExecuteOutput[]) => {
      const errors: string[] = [];

      // Internal normalized assertion type for processing
      type NormalizedAssertion = {
        subject: TextSelector;
        relation?: Relation;
        object?: TextSelector;
        tolerance: number;
        byMin?: number;
        byMax?: number;
      };

      // Validate and normalize an assertion using Zod.
      // Returns null (after recording an error) when no schema matches.
      const normalizeAssertion = (a: unknown, index: number): NormalizedAssertion | null => {
        // Try parsing as each type in order of specificity
        const directionalResult = DirectionalAssertionSchema.safeParse(a);
        if (directionalResult.success) {
          const d = directionalResult.data;
          return {
            subject: normalizeSelector(d.subject),
            relation: d.relation,
            object: normalizeSelector(d.object),
            tolerance: DEFAULT_ALIGNMENT_TOLERANCE,
            byMin: d.byMin,
            byMax: d.byMax,
          };
        }

        const alignmentResult = AlignmentAssertionSchema.safeParse(a);
        if (alignmentResult.success) {
          const al = alignmentResult.data;
          return {
            subject: normalizeSelector(al.subject),
            relation: al.relation,
            object: normalizeSelector(al.object),
            tolerance: al.tolerance ?? DEFAULT_ALIGNMENT_TOLERANCE,
          };
        }

        const tagOnlyResult = TagOnlyAssertionSchema.safeParse(a);
        if (tagOnlyResult.success) {
          return {
            subject: normalizeSelector(tagOnlyResult.data.subject),
            tolerance: DEFAULT_ALIGNMENT_TOLERANCE,
          };
        }

        // None of the schemas matched - report validation error.
        // `issues` is the canonical ZodError field; `errors` is only an alias
        // in some Zod versions.
        const fullResult = PdfTextPositionAssertionSchema.safeParse(a);
        if (!fullResult.success) {
          const zodErrors = fullResult.error.issues
            // deno-lint-ignore no-explicit-any
            .map((issue: any) => `${issue.path.join(".")}: ${issue.message}`)
            .join("; ");
          errors.push(`Assertion ${index + 1} is invalid: ${zodErrors}`);
        }
        return null;
      };

      // Stage 1: Parse and validate assertions
      const normalizedAssertions = assertions
        .map((a, i) => normalizeAssertion(a, i))
        .filter((a): a is NormalizedAssertion => a !== null);

      // Negative assertions are numbered after the positive ones in messages.
      const normalizedNoMatch = (noMatchAssertions ?? [])
        .map((a, i) => normalizeAssertion(a, i + assertions.length))
        .filter((a): a is NormalizedAssertion => a !== null);

      // Track search texts and their selectors (to know if Decoration role is requested)
      // Page role selectors are tracked separately since they don't need text search
      const searchTexts = new Set<string>();
      const textToSelectors = new Map<string, TextSelector[]>();
      const pageSelectors = new Map<number, TextSelector>(); // page number -> selector

      // Helper: check if selector is a Page role (no text search needed)
      const isPageRole = (sel: TextSelector): boolean => sel.role === "Page";

      // Helper: get unique key for a selector (for resolvedSelectors map)
      // Includes granularity since different granularity settings need different bbox computation
      const selectorKey = (sel: TextSelector): string => {
        if (isPageRole(sel)) {
          return `Page:${sel.page}`;
        }
        const base = sel.text ?? "";
        if (sel.granularity) {
          return `${base}@${sel.granularity}`;
        }
        return base;
      };

      // Track unique selectors by their full key (including granularity)
      const uniqueSelectors = new Map<string, TextSelector>();

      const addSelector = (sel: TextSelector) => {
        if (isPageRole(sel)) {
          if (sel.page === undefined) {
            errors.push(`Page role requires 'page' field to specify page number`);
            return;
          }
          pageSelectors.set(sel.page, sel);
        } else {
          if (!sel.text) {
            errors.push(`Selector requires 'text' field (unless role is "Page")`);
            return;
          }
          searchTexts.add(sel.text);
          const existing = textToSelectors.get(sel.text) ?? [];
          existing.push(sel);
          textToSelectors.set(sel.text, existing);
          // Also track by full key for resolution
          uniqueSelectors.set(selectorKey(sel), sel);
        }
      };

      for (const a of [...normalizedAssertions, ...normalizedNoMatch]) {
        addSelector(a.subject);
        if (a.object) addSelector(a.object);
      }

      // Helper: check if any selector for this text is a Decoration (untagged content)
      const isDecoration = (text: string): boolean => {
        const selectors = textToSelectors.get(text) ?? [];
        return selectors.some((s) => s.role === "Decoration");
      };

      // Stage 2: Load PDF with pdfjs-dist
      // deno-lint-ignore no-explicit-any
      const pdfjsLib = await import("pdfjs-dist") as any;
      const buffer = await Deno.readFile(file);
      const doc = await pdfjsLib.getDocument({ data: buffer }).promise;

      // Stage 3 & 4: Extract content and structure tree per page
      const allTextItems: MarkedTextItem[] = [];
      const mcidToTextItems = new Map<string, MarkedTextItem[]>();
      const mcidToStructNode = new Map<string, StructTreeNode>();
      const structNodeToParent = new Map<StructTreeNode, StructTreeNode>();
      const pageDimensions = new Map<number, { width: number; height: number }>();

      try {
        for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
          const page = await doc.getPage(pageNum);
          const viewport = page.getViewport({ scale: 1.0 });

          // Store page dimensions for Page role
          pageDimensions.set(pageNum, { width: viewport.width, height: viewport.height });

          // Get text content with marked content
          const textContent = await page.getTextContent({
            includeMarkedContent: true,
          });

          const pageItems = extractMarkedTextItems(
            textContent.items,
            pageNum,
            viewport.height,
          );
          allTextItems.push(...pageItems);

          // Build MCID -> text items map
          for (const item of pageItems) {
            if (item.mcid) {
              const existing = mcidToTextItems.get(item.mcid) ?? [];
              existing.push(item);
              mcidToTextItems.set(item.mcid, existing);
            }
          }

          // Get structure tree and build MCID -> struct node map + parent map
          const structTree = await page.getStructTree();
          if (structTree) {
            const { mcidMap, parentMap } = buildMcidStructMap(structTree);
            for (const [k, v] of mcidMap) {
              mcidToStructNode.set(k, v);
            }
            for (const [k, v] of parentMap) {
              structNodeToParent.set(k, v);
            }
          }
        }
      } finally {
        // Everything needed has been copied into plain objects above, so the
        // document (and its worker) can be released even if extraction threw.
        await doc.destroy();
      }

      // Stage 5: Find text items for each search text (must be unique, unless Decoration)
      const foundTexts = new Map<string, MarkedTextItem>();
      const ambiguousTexts = new Set<string>();
      for (const searchText of searchTexts) {
        const matches = allTextItems.filter((t) => t.str.includes(searchText));
        if (matches.length === 1) {
          foundTexts.set(searchText, matches[0]);
        } else if (matches.length > 1) {
          // Decoration role (headers, footers) naturally repeat on each page - allow first match
          if (isDecoration(searchText)) {
            foundTexts.set(searchText, matches[0]);
          } else {
            ambiguousTexts.add(searchText);
            errors.push(
              `Text "${searchText}" is ambiguous - found ${matches.length} matches. Use a more specific search string.`,
            );
          }
        }
        // If matches.length === 0, we'll report "not found" later
      }

      // Stage 6 & 7: Resolve selectors to structure nodes and compute bboxes
      const resolvedSelectors = new Map<string, ResolvedSelector>();

      // First, resolve Page role selectors (no text search needed)
      for (const [pageNum, sel] of pageSelectors) {
        const dims = pageDimensions.get(pageNum);
        if (!dims) {
          errors.push(`Page ${pageNum} does not exist in PDF (has ${pageDimensions.size} pages)`);
          continue;
        }
        resolvedSelectors.set(selectorKey(sel), {
          selector: sel,
          textItem: { str: "", x: 0, y: 0, width: 0, height: 0, mcid: null, page: pageNum },
          structNode: null,
          bbox: {
            x: 0,
            y: 0,
            width: dims.width,
            height: dims.height,
            page: pageNum,
          },
        });
      }

      // Compute the bbox for tagged text. Records an error and returns null
      // when no bbox can be produced.
      const resolveTaggedBBox = (
        selector: TextSelector,
        searchText: string,
        mcid: string,
        structNode: StructTreeNode | null,
      ): BBox | null => {
        if (selector.granularity) {
          // Granularity needs a structure node to climb from; report the
          // problem instead of silently falling back to the MCID bbox.
          if (!structNode) {
            errors.push(
              `No structure element found for "${searchText}" (MCID ${mcid}) - cannot apply granularity "${selector.granularity}"`,
            );
            return null;
          }
          const ancestor = findAncestorWithRole(structNode, selector.granularity, structNodeToParent);
          if (!ancestor) {
            errors.push(
              `No ancestor with role "${selector.granularity}" found for "${searchText}"`,
            );
            return null;
          }
          // Collect ALL MCIDs recursively under that ancestor
          const allItems = collectAllMcids(ancestor).flatMap((id) => mcidToTextItems.get(id) ?? []);
          const ancestorBBox = unionBBox(allItems);
          if (!ancestorBBox) {
            errors.push(
              `Could not compute bbox for "${searchText}" with granularity "${selector.granularity}" - no content items found`,
            );
          }
          return ancestorBBox;
        }

        // Same-MCID approach: compute bbox from all text items sharing this MCID
        const mcidItems = mcidToTextItems.get(mcid);
        if (!mcidItems || mcidItems.length === 0) {
          errors.push(
            `No text items found for MCID ${mcid} containing "${searchText}"`,
          );
          return null;
        }
        const mcidBBox = unionBBox(mcidItems);
        if (!mcidBBox) {
          errors.push(
            `Could not compute bbox for "${searchText}" - all text items in MCID are whitespace-only`,
          );
        }
        return mcidBBox;
      };

      // Then, resolve text-based selectors (iterate by unique selector key to handle granularity)
      for (const [key, selector] of uniqueSelectors) {
        // addSelector only registers selectors with non-empty text.
        const searchText = selector.text;
        if (!searchText) continue;

        const textItem = foundTexts.get(searchText);
        if (!textItem) {
          // Don't report "not found" if we already reported "ambiguous"
          if (!ambiguousTexts.has(searchText)) {
            errors.push(`Text not found in PDF: "${searchText}"`);
          }
          continue;
        }

        // Decoration role: use text item bounds directly (for headers, footers, page decorations)
        if (isDecoration(searchText)) {
          resolvedSelectors.set(key, {
            selector,
            textItem,
            structNode: null,
            bbox: {
              x: textItem.x,
              y: textItem.y,
              width: textItem.width,
              height: textItem.height,
              page: textItem.page,
            },
          });
          continue;
        }

        if (!textItem.mcid) {
          errors.push(
            `Text "${searchText}" has no MCID - PDF may not be tagged. Use role: "Decoration" for untagged page elements like headers/footers.`,
          );
          continue;
        }

        const structNode = mcidToStructNode.get(textItem.mcid) ?? null;
        const bbox = resolveTaggedBBox(selector, searchText, textItem.mcid, structNode);
        if (!bbox) continue; // Error already recorded

        resolvedSelectors.set(key, { selector, textItem, structNode, bbox });
      }

      // Validate the role requested by one selector against its structure node.
      const validateRole = (sel: TextSelector) => {
        // Page is a virtual selector and Decoration bypasses the structure
        // tree, so neither has a struct node to validate.
        if (!sel.role || isPageRole(sel) || sel.role === "Decoration") return;

        const resolved = resolvedSelectors.get(selectorKey(sel));
        if (!resolved) return; // Error already recorded

        if (!resolved.structNode) {
          // Text that some other selector marks as Decoration is resolved
          // without a struct node on purpose; anything else means the role
          // cannot be checked, which must not count as a pass.
          if (!isDecoration(sel.text ?? "")) {
            errors.push(
              `Cannot verify role for "${sel.text}": expected ${sel.role}, but no structure element was found`,
            );
          }
          return;
        }

        if (resolved.structNode.role !== sel.role) {
          errors.push(
            `Role mismatch for "${sel.text}": expected ${sel.role}, got ${resolved.structNode.role}`,
          );
        }
      };

      // Validate role assertions. Subject and object are checked
      // independently so that a Page or unresolved subject does not hide a
      // role mismatch on the object.
      for (const a of normalizedAssertions) {
        validateRole(a.subject);
        if (a.object) validateRole(a.object);
      }

      // Stage 8: Evaluate position assertions
      // Note: Zod validation in Stage 1 already handles:
      // - Unknown relations
      // - byMin/byMax with alignment relations (via .strict())
      // - byMin > byMax (via .refine())
      for (const a of normalizedAssertions) {
        // Tag-only assertions (no relation/object)
        if (!a.relation || !a.object) {
          continue; // Already validated in stage 6
        }

        const subjectKey = selectorKey(a.subject);
        const objectKey = selectorKey(a.object);
        const subjectResolved = resolvedSelectors.get(subjectKey);
        const objectResolved = resolvedSelectors.get(objectKey);

        if (!subjectResolved || !objectResolved) {
          continue; // Error already recorded
        }

        // Check same page
        if (subjectResolved.bbox.page !== objectResolved.bbox.page) {
          errors.push(
            `Cannot compare positions: "${subjectKey}" is on page ${subjectResolved.bbox.page}, ` +
              `"${objectKey}" is on page ${objectResolved.bbox.page}`,
          );
          continue;
        }

        // Evaluate relation based on type (Zod guarantees valid relation type)
        if (directionalRelations.has(a.relation)) {
          const result = evaluateDirectionalRelation(
            a.relation as DirectionalRelation,
            subjectResolved.bbox,
            objectResolved.bbox,
            a.subject.edge,
            a.object.edge,
            a.byMin,
            a.byMax,
          );

          if (!result.passed) {
            const distanceInfo = a.byMin !== undefined || a.byMax !== undefined
              ? ` Distance: ${result.distance.toFixed(1)}pt` +
                (a.byMin !== undefined ? ` (required >= ${a.byMin}pt)` : "") +
                (a.byMax !== undefined ? ` (required <= ${a.byMax}pt)` : "")
              : "";
            errors.push(
              `Position assertion failed (page ${subjectResolved.bbox.page}): "${subjectKey}" is NOT ${a.relation} "${objectKey}".` +
                ` Subject.${result.subjectEdge}=${result.subjectValue.toFixed(1)},` +
                ` Object.${result.objectEdge}=${result.objectValue.toFixed(1)}.${distanceInfo}` +
                (result.failureReason ? ` (${result.failureReason})` : ""),
            );
          }
        } else {
          // Alignment relation
          const result = evaluateAlignmentRelation(
            a.relation as AlignmentRelation,
            subjectResolved.bbox,
            objectResolved.bbox,
            a.tolerance,
            a.subject.edge,
            a.object.edge,
          );

          if (!result.passed) {
            errors.push(
              `Position assertion failed (page ${subjectResolved.bbox.page}): "${subjectKey}" is NOT ${a.relation} "${objectKey}".` +
                ` Subject.${result.subjectEdge}=${result.subjectValue.toFixed(1)},` +
                ` Object.${result.objectEdge}=${result.objectValue.toFixed(1)}.` +
                ` Difference: ${result.difference.toFixed(1)}pt (tolerance: ${a.tolerance}pt)`,
            );
          }
        }
      }

      // Evaluate negative assertions
      // Note: Zod validation already handled in Stage 1
      for (const a of normalizedNoMatch) {
        if (!a.relation || !a.object) continue;

        const subjectKey = selectorKey(a.subject);
        const objectKey = selectorKey(a.object);
        const subjectResolved = resolvedSelectors.get(subjectKey);
        const objectResolved = resolvedSelectors.get(objectKey);

        if (!subjectResolved || !objectResolved) {
          continue; // Assertion trivially doesn't hold
        }

        if (subjectResolved.bbox.page !== objectResolved.bbox.page) {
          continue; // Assertion trivially doesn't hold
        }

        // Evaluate relation based on type (Zod guarantees valid relation type)
        let passed: boolean;
        let resultInfo: string;

        if (directionalRelations.has(a.relation)) {
          const result = evaluateDirectionalRelation(
            a.relation as DirectionalRelation,
            subjectResolved.bbox,
            objectResolved.bbox,
            a.subject.edge,
            a.object.edge,
            a.byMin,
            a.byMax,
          );
          passed = result.passed;
          resultInfo = `Subject.${result.subjectEdge}=${result.subjectValue.toFixed(1)}, ` +
            `Object.${result.objectEdge}=${result.objectValue.toFixed(1)}, ` +
            `distance=${result.distance.toFixed(1)}pt`;
        } else {
          const result = evaluateAlignmentRelation(
            a.relation as AlignmentRelation,
            subjectResolved.bbox,
            objectResolved.bbox,
            a.tolerance,
            a.subject.edge,
            a.object.edge,
          );
          passed = result.passed;
          resultInfo = `Subject.${result.subjectEdge}=${result.subjectValue.toFixed(1)}, ` +
            `Object.${result.objectEdge}=${result.objectValue.toFixed(1)}, ` +
            `difference=${result.difference.toFixed(1)}pt`;
        }

        // A negative assertion fails when the relation DOES hold.
        if (passed) {
          errors.push(
            `Negative assertion failed (page ${subjectResolved.bbox.page}): "${subjectKey}" IS ${a.relation} "${objectKey}" (expected NOT to be). ` +
              resultInfo,
          );
        }
      }

      // Stage 9: Aggregate errors
      if (errors.length > 0) {
        assert(
          false,
          `PDF position assertions failed in ${file}:\n${errors.map((e, i) => ` ${i + 1}. ${e}`).join("\n")}`,
        );
      }
    },
  };
};
1046
1047