Path: blob/main/tests/verify-pdf-text-position.ts
6446 views
/*
 * verify-pdf-text-position.ts
 *
 * PDF text position verification using semantic structure tree.
 * Uses pdfjs-dist directly to access MCIDs and structure tree.
 *
 * REQUIREMENTS:
 * This module requires tagged PDFs with PDF 1.4+ structure tree support.
 * Tagged PDFs contain Marked Content Identifiers (MCIDs) that link text
 * content to semantic structure elements (P, H1, Figure, Table, etc.).
 *
 * Currently confirmed working:
 * - Typst: Produces tagged PDFs by default
 *
 * Not yet working:
 * - LaTeX: Requires \DocumentMetadata{} before \documentclass for tagging,
 *   which Quarto doesn't currently support. When LaTeX tagged PDF support
 *   is available, this module should work with minimal changes since we
 *   use only basic PDF 1.4 tagged structure features.
 * - ConTeXt: Pandoc supports +tagging extension, but Quarto's context
 *   format doesn't compile to PDF.
 *
 * SPECIAL ROLES:
 * - role: "Decoration" - Use for untagged page elements like headers, footers,
 *   page numbers, and other decorations. These use text item bounds directly
 *   instead of requiring MCID/structure tree support.
 * - role: "Page" - Use for the entire page bounds. Requires `page` field to
 *   specify which page number (1-indexed). The `text` field is ignored.
 *   Useful for NOT assertions since Page intersects all content on that page.
 *
 * Copyright (C) 2020-2025 Posit Software, PBC
 */

import { assert } from "testing/asserts";
import { z } from "zod";
import { ExecuteOutput, Verify } from "./test.ts";

// ============================================================================
// Zod Schemas and Type Definitions
// ============================================================================

// Edge schema for precise bbox edge selection
export const EdgeSchema = z.enum(["left", "right", "top", "bottom"]);
export type Edge = z.infer<typeof EdgeSchema>;

// Relation schemas
export const DirectionalRelationSchema = z.enum(["leftOf", "rightOf", "above", "below"]);
export const AlignmentRelationSchema = z.enum(["leftAligned", "rightAligned", "topAligned", "bottomAligned"]);
export const RelationSchema = z.union([DirectionalRelationSchema, AlignmentRelationSchema]);

export type DirectionalRelation = z.infer<typeof DirectionalRelationSchema>;
export type AlignmentRelation = z.infer<typeof AlignmentRelationSchema>;
export type Relation = z.infer<typeof RelationSchema>;

// Text selector schema
// Note: Label/ID checking is not supported because:
// 1. Typst does not write labels to PDF StructElem /ID attributes (labels become
//    named destinations for links, but not structure element identifiers)
// 2. Even if IDs were present, pdf.js doesn't expose /ID through getStructTree()
export const TextSelectorSchema = z.object({
  text: z.string().optional(), // Text to search for (ignored for role: "Page")
  role: z.string().optional(), // PDF 1.4 structure role: P, H1, H2, Figure, Table, Span, etc.
  page: z.number().optional(), // Page number (1-indexed), required for role: "Page"
  edge: EdgeSchema.optional(), // Which edge to use for comparison (overrides relation default)
  granularity: z.string().optional(), // Aggregate bbox to ancestor with this role (e.g., "Div", "P")
});
export type TextSelector = z.infer<typeof TextSelectorSchema>;

// Subject/object can be a string or a TextSelector
const SubjectObjectSchema = z.union([z.string(), TextSelectorSchema]);

// Tag-only assertion: validates semantic role without position comparison
export const TagOnlyAssertionSchema = z.object({
  subject: SubjectObjectSchema,
}).strict();
export type TagOnlyAssertion = z.infer<typeof TagOnlyAssertionSchema>;

// Directional assertion: leftOf, rightOf, above, below with optional distance constraints
export const DirectionalAssertionSchema = z.object({
  subject: SubjectObjectSchema,
  relation: DirectionalRelationSchema,
  object: SubjectObjectSchema,
  byMin: z.number().optional(), // Minimum distance between edges
  byMax: z.number().optional(), // Maximum distance between edges
}).refine(
  (data) => data.byMin === undefined || data.byMax === undefined || data.byMin <= data.byMax,
  { message: "byMin must be <= byMax" },
);
export type DirectionalAssertion = z.infer<typeof DirectionalAssertionSchema>;

// Alignment assertion: leftAligned, rightAligned, topAligned, bottomAligned with tolerance
export const AlignmentAssertionSchema = z.object({
  subject: SubjectObjectSchema,
  relation: AlignmentRelationSchema,
  object: SubjectObjectSchema,
  tolerance: z.number().optional(), // Default: 2pt
}).strict();
export type AlignmentAssertion = z.infer<typeof AlignmentAssertionSchema>;

// Union of all assertion types
export const PdfTextPositionAssertionSchema = z.union([
  DirectionalAssertionSchema,
  AlignmentAssertionSchema,
  TagOnlyAssertionSchema,
]);
export type PdfTextPositionAssertion = z.infer<typeof PdfTextPositionAssertionSchema>;

// Type guards for assertion discrimination (using Zod safeParse)

/** Returns true when `a` parses as a directional assertion (leftOf/rightOf/above/below). */
export function isDirectionalAssertion(a: unknown): a is DirectionalAssertion {
  return DirectionalAssertionSchema.safeParse(a).success;
}

/** Returns true when `a` parses as an alignment assertion (left/right/top/bottomAligned). */
export function isAlignmentAssertion(a: unknown): a is AlignmentAssertion {
  return AlignmentAssertionSchema.safeParse(a).success;
}

/** Returns true when `a` parses as a tag-only assertion (a `subject` and nothing else). */
export function isTagOnlyAssertion(a: unknown): a is TagOnlyAssertion {
  return TagOnlyAssertionSchema.safeParse(a).success;
}

// Computed bounding box
interface BBox {
  x: number;
  y: number;
  width: number;
  height: number;
  page: number;
}

// Internal: text item with MCID tracking
interface MarkedTextItem {
  str: string;
  x: number;
  y: number;
  width: number;
  height: number;
  mcid: string | null; // e.g., "p2R_mc0"
  page: number;
}

// Structure tree node (from pdfjs-dist)
interface StructTreeNode {
  role: string;
  children?: (StructTreeNode | StructTreeContent)[];
  alt?: string;
  lang?: string;
}

interface StructTreeContent {
  type: "content" | "object" | "annotation";
  id: string;
}

// Text content item types from pdfjs-dist
interface TextItem {
  str: string;
  dir: string;
  transform: number[];
  width: number;
  height: number;
  fontName: string;
  hasEOL: boolean;
}

interface TextMarkedContent {
  type: "beginMarkedContent" | "beginMarkedContentProps" | "endMarkedContent";
  id?: string;
  tag?: string;
}

// Internal: resolved selector with computed bounds
interface ResolvedSelector {
  selector: TextSelector;
  textItem: MarkedTextItem;
  structNode: StructTreeNode | null;
  bbox: BBox;
}

// Internal: everything extracted from the PDF that the assertions need
interface PdfContent {
  allTextItems: MarkedTextItem[];
  mcidToTextItems: Map<string, MarkedTextItem[]>;
  mcidToStructNode: Map<string, StructTreeNode>;
  structNodeToParent: Map<StructTreeNode, StructTreeNode>;
  pageDimensions: Map<number, { width: number; height: number }>;
}

// ============================================================================
// Constants
// ============================================================================

const DEFAULT_ALIGNMENT_TOLERANCE = 2;

// ============================================================================
// Relation Predicates and Edge Logic
// ============================================================================

// Coordinate system: origin at top-left, y increases downward

// Derive relation sets from Zod schemas
const directionalRelations: Set<Relation> = new Set(DirectionalRelationSchema.options);
const alignmentRelations: Set<Relation> = new Set(AlignmentRelationSchema.options);

// Default edges for each relation (from spec table)
const relationDefaults: Record<Relation, { subject: Edge; object: Edge }> = {
  leftOf: { subject: "right", object: "left" },
  rightOf: { subject: "left", object: "right" },
  above: { subject: "bottom", object: "top" },
  below: { subject: "top", object: "bottom" },
  leftAligned: { subject: "left", object: "left" },
  rightAligned: { subject: "right", object: "right" },
  topAligned: { subject: "top", object: "top" },
  bottomAligned: { subject: "bottom", object: "bottom" },
};

// Extract edge value from bbox
function getEdgeValue(bbox: BBox, edge: Edge): number {
  switch (edge) {
    case "left":
      return bbox.x;
    case "right":
      return bbox.x + bbox.width;
    case "top":
      return bbox.y;
    case "bottom":
      return bbox.y + bbox.height;
  }
}

// Evaluate directional relation with edge overrides and distance constraints
interface DirectionalResult {
  passed: boolean;
  subjectEdge: Edge;
  objectEdge: Edge;
  subjectValue: number;
  objectValue: number;
  distance: number;
  failureReason?: string;
}

/**
 * Evaluate a directional relation between two bounding boxes.
 *
 * The compared edges default to `relationDefaults[relation]` and can be
 * overridden per side. The relation holds only when the edges are strictly
 * ordered; `byMin`/`byMax` then bound the (positive) gap between them.
 */
function evaluateDirectionalRelation(
  relation: DirectionalRelation,
  subjectBBox: BBox,
  objectBBox: BBox,
  subjectEdgeOverride?: Edge,
  objectEdgeOverride?: Edge,
  byMin?: number,
  byMax?: number,
): DirectionalResult {
  const defaults = relationDefaults[relation];
  const subjectEdge = subjectEdgeOverride ?? defaults.subject;
  const objectEdge = objectEdgeOverride ?? defaults.object;

  const subjectValue = getEdgeValue(subjectBBox, subjectEdge);
  const objectValue = getEdgeValue(objectBBox, objectEdge);

  // Distance calculation depends on relation direction
  // For leftOf/above: distance = objectEdge - subjectEdge (positive when relation holds)
  // For rightOf/below: distance = subjectEdge - objectEdge (positive when relation holds)
  let distance: number;
  let directionPassed: boolean;

  if (relation === "leftOf" || relation === "above") {
    distance = objectValue - subjectValue;
    directionPassed = subjectValue < objectValue;
  } else {
    // rightOf or below
    distance = subjectValue - objectValue;
    directionPassed = subjectValue > objectValue;
  }

  const result: DirectionalResult = {
    passed: true,
    subjectEdge,
    objectEdge,
    subjectValue,
    objectValue,
    distance,
  };

  // Check directional constraint
  if (!directionPassed) {
    result.passed = false;
    result.failureReason = "directional constraint not satisfied";
    return result;
  }

  // Check byMin constraint
  if (byMin !== undefined && distance < byMin) {
    result.passed = false;
    result.failureReason = `distance ${distance.toFixed(1)}pt < byMin ${byMin}pt`;
    return result;
  }

  // Check byMax constraint
  if (byMax !== undefined && distance > byMax) {
    result.passed = false;
    result.failureReason = `distance ${distance.toFixed(1)}pt > byMax ${byMax}pt`;
    return result;
  }

  return result;
}

// Evaluate alignment relation with edge overrides
interface AlignmentResult {
  passed: boolean;
  subjectEdge: Edge;
  objectEdge: Edge;
  subjectValue: number;
  objectValue: number;
  difference: number;
}

/**
 * Evaluate an alignment relation: the chosen edges of both boxes must be
 * within `tolerance` of each other (absolute difference, inclusive).
 */
function evaluateAlignmentRelation(
  relation: AlignmentRelation,
  subjectBBox: BBox,
  objectBBox: BBox,
  tolerance: number,
  subjectEdgeOverride?: Edge,
  objectEdgeOverride?: Edge,
): AlignmentResult {
  const defaults = relationDefaults[relation];
  const subjectEdge = subjectEdgeOverride ?? defaults.subject;
  const objectEdge = objectEdgeOverride ?? defaults.object;

  const subjectValue = getEdgeValue(subjectBBox, subjectEdge);
  const objectValue = getEdgeValue(objectBBox, objectEdge);
  const difference = Math.abs(subjectValue - objectValue);

  return {
    passed: difference <= tolerance,
    subjectEdge,
    objectEdge,
    subjectValue,
    objectValue,
    difference,
  };
}

// ============================================================================
// Helper Functions
// ============================================================================

/** Promote the string shorthand to a `{ text }` selector; pass selectors through. */
function normalizeSelector(s: string | TextSelector): TextSelector {
  if (typeof s === "string") {
    return { text: s };
  }
  return s;
}

function isStructTreeContent(node: StructTreeNode | StructTreeContent): node is StructTreeContent {
  return "type" in node && (node.type === "content" || node.type === "object" || node.type === "annotation");
}

function isTextItem(item: TextItem | TextMarkedContent): item is TextItem {
  return "str" in item && typeof item.str === "string";
}

function isTextMarkedContent(item: TextItem | TextMarkedContent): item is TextMarkedContent {
  return "type" in item && typeof item.type === "string";
}

/**
 * Render Zod issues as "path: message; path: message".
 * Reads `issues` (the canonical ZodError property) and omits the "path: "
 * prefix for top-level issues so messages don't start with a stray colon.
 */
function formatZodIssues(
  error: { issues: ReadonlyArray<{ path: ReadonlyArray<PropertyKey>; message: string }> },
): string {
  return error.issues
    .map((issue) => {
      const path = issue.path.map(String).join(".");
      return path ? `${path}: ${issue.message}` : issue.message;
    })
    .join("; ");
}

/**
 * Explain why `a` matches none of the assertion schemas.
 *
 * Validating against the union only yields a generic "Invalid input", which
 * hides useful messages such as "byMin must be <= byMax". Instead the schema
 * the author most likely intended is picked from the `relation` field and its
 * issues are reported.
 */
function describeInvalidAssertion(a: unknown): string {
  const relation = typeof a === "object" && a !== null
    ? (a as Record<string, unknown>).relation
    : undefined;

  if (relation !== undefined && !RelationSchema.safeParse(relation).success) {
    const known = [...DirectionalRelationSchema.options, ...AlignmentRelationSchema.options].join(", ");
    return `relation: unknown relation ${JSON.stringify(relation)} (expected one of: ${known})`;
  }

  const result = relation === undefined
    ? TagOnlyAssertionSchema.safeParse(a)
    : DirectionalRelationSchema.safeParse(relation).success
    ? DirectionalAssertionSchema.safeParse(a)
    : AlignmentAssertionSchema.safeParse(a);

  return result.success ? "does not match any assertion schema" : formatZodIssues(result.error);
}

/**
 * Extract MarkedTextItem[] from pdfjs getTextContent result.
 * Tracks current MCID as we iterate through interleaved items.
 *
 * Marked-content sequences can nest, so the effective MCID is kept on a
 * stack: closing an inner sequence restores the enclosing sequence's MCID
 * instead of dropping to `null`. A sequence without its own id inherits the
 * MCID of the sequence that encloses it.
 */
function extractMarkedTextItems(
  items: (TextItem | TextMarkedContent)[],
  pageNum: number,
  pageHeight: number,
): MarkedTextItem[] {
  const result: MarkedTextItem[] = [];
  // One entry per currently open marked-content sequence (its effective MCID).
  const mcidStack: (string | null)[] = [];
  let currentMcid: string | null = null;

  for (const item of items) {
    if (isTextMarkedContent(item)) {
      if (item.type === "beginMarkedContent" || item.type === "beginMarkedContentProps") {
        if (item.type === "beginMarkedContentProps" && item.id) {
          currentMcid = item.id;
        }
        mcidStack.push(currentMcid);
      } else if (item.type === "endMarkedContent") {
        mcidStack.pop();
        // Fall back to the enclosing sequence (or null at top level / when unbalanced)
        currentMcid = mcidStack[mcidStack.length - 1] ?? null;
      }
    } else if (isTextItem(item)) {
      // Transform: [scaleX, skewX, skewY, scaleY, translateX, translateY]
      const tm = item.transform;
      const x = tm[4];
      // Convert from PDF coordinates (bottom-left origin) to top-left origin
      // NOTE(review): tm[5] looks like the text baseline, so `y` would be the
      // baseline rather than the glyph top and the box extends `height` below
      // it. Left as-is because existing distances may be calibrated against it.
      const y = pageHeight - tm[5];
      const height = Math.sqrt(tm[2] * tm[2] + tm[3] * tm[3]);

      result.push({
        str: item.str,
        x,
        y,
        width: item.width,
        height,
        mcid: currentMcid,
        page: pageNum,
      });
    }
  }

  return result;
}

/**
 * Recursively build MCID -> StructNode map and parent map from structure tree.
 * Returns the struct node that directly contains the MCID content, plus a map
 * from each struct node to its parent for tree traversal.
 */
function buildMcidStructMap(
  tree: StructTreeNode | null,
  mcidMap: Map<string, StructTreeNode> = new Map(),
  parentMap: Map<StructTreeNode, StructTreeNode> = new Map(),
  parentNode: StructTreeNode | null = null,
): { mcidMap: Map<string, StructTreeNode>; parentMap: Map<StructTreeNode, StructTreeNode> } {
  if (!tree) return { mcidMap, parentMap };

  // Node that owns this level's children; root-level children have `tree` as parent
  const owner = parentNode ?? tree;

  for (const child of tree.children ?? []) {
    if (isStructTreeContent(child)) {
      if (child.type === "content" && child.id) {
        // Map MCID to the parent struct node (the semantic element)
        mcidMap.set(child.id, owner);
      }
    } else {
      // Record parent for tree traversal
      parentMap.set(child, owner);
      // Recurse into child struct nodes
      buildMcidStructMap(child, mcidMap, parentMap, child);
    }
  }

  return { mcidMap, parentMap };
}

/**
 * Collect only direct MCIDs under a structure node (non-recursive).
 * Does not descend into child structure nodes.
 */
function collectDirectMcids(node: StructTreeNode): string[] {
  const mcids: string[] = [];

  for (const child of node.children ?? []) {
    if (isStructTreeContent(child)) {
      if (child.type === "content" && child.id) {
        mcids.push(child.id);
      }
    }
    // Do NOT recurse into child struct nodes
  }

  return mcids;
}

/**
 * Recursively collect ALL MCIDs under a structure node and its descendants.
 * Used for granularity aggregation to compute bbox of an entire subtree.
 */
function collectAllMcids(node: StructTreeNode): string[] {
  const mcids: string[] = [];

  for (const child of node.children ?? []) {
    if (isStructTreeContent(child)) {
      if (child.type === "content" && child.id) {
        mcids.push(child.id);
      }
    } else {
      // Recurse into child struct nodes
      mcids.push(...collectAllMcids(child));
    }
  }

  return mcids;
}

/**
 * Walk up the structure tree to find the nearest ancestor with a matching role.
 * The starting node itself is considered first.
 * Returns null if no ancestor with the target role is found.
 */
function findAncestorWithRole(
  node: StructTreeNode,
  targetRole: string,
  parentMap: Map<StructTreeNode, StructTreeNode>,
): StructTreeNode | null {
  let current: StructTreeNode | undefined = node;
  while (current) {
    if (current.role === targetRole) {
      return current;
    }
    current = parentMap.get(current);
  }
  return null;
}

/**
 * Check if a string is whitespace-only (including empty).
 * Used to filter out horizontal skip spaces in PDF content.
 */
function isWhitespaceOnly(str: string): boolean {
  return str.trim().length === 0;
}

/**
 * Compute union bounding box from multiple items.
 * Filters out whitespace-only text items to avoid including horizontal skips.
 * Returns null when no non-whitespace item remains. The page is taken from
 * the first remaining item.
 */
function unionBBox(items: MarkedTextItem[]): BBox | null {
  // Filter out whitespace-only items (these are often horizontal skips)
  const contentItems = items.filter((item) => !isWhitespaceOnly(item.str));
  if (contentItems.length === 0) return null;

  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  const page = contentItems[0].page;

  for (const item of contentItems) {
    minX = Math.min(minX, item.x);
    minY = Math.min(minY, item.y);
    maxX = Math.max(maxX, item.x + item.width);
    maxY = Math.max(maxY, item.y + item.height);
  }

  return {
    x: minX,
    y: minY,
    width: maxX - minX,
    height: maxY - minY,
    page,
  };
}

/**
 * Compute semantic bounding box for a structure node.
 * Uses only direct MCIDs (non-recursive) to avoid including nested elements
 * like margin content that may be children of body paragraphs.
 */
function computeStructBBox(
  node: StructTreeNode,
  mcidToTextItems: Map<string, MarkedTextItem[]>,
): BBox | null {
  const mcids = collectDirectMcids(node);
  const items = mcids.flatMap((id) => mcidToTextItems.get(id) ?? []);
  return unionBBox(items);
}

/**
 * Load the PDF with pdfjs-dist and extract, for every page, the text items
 * (with their MCIDs), the MCID -> structure node map, the structure parent
 * map, and the page dimensions.
 *
 * The pdf.js document is destroyed before returning (also on failure) so the
 * loaded document does not outlive the verification.
 */
async function loadPdfContent(file: string): Promise<PdfContent> {
  // deno-lint-ignore no-explicit-any
  const pdfjsLib = await import("pdfjs-dist") as any;
  const buffer = await Deno.readFile(file);
  const doc = await pdfjsLib.getDocument({ data: buffer }).promise;

  const content: PdfContent = {
    allTextItems: [],
    mcidToTextItems: new Map(),
    mcidToStructNode: new Map(),
    structNodeToParent: new Map(),
    pageDimensions: new Map(),
  };

  try {
    for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
      const page = await doc.getPage(pageNum);
      const viewport = page.getViewport({ scale: 1.0 });

      // Store page dimensions for Page role
      content.pageDimensions.set(pageNum, { width: viewport.width, height: viewport.height });

      // Get text content with marked content
      const textContent = await page.getTextContent({
        includeMarkedContent: true,
      });

      const pageItems = extractMarkedTextItems(
        textContent.items,
        pageNum,
        viewport.height,
      );
      content.allTextItems.push(...pageItems);

      // Build MCID -> text items map
      for (const item of pageItems) {
        if (item.mcid) {
          const existing = content.mcidToTextItems.get(item.mcid) ?? [];
          existing.push(item);
          content.mcidToTextItems.set(item.mcid, existing);
        }
      }

      // Get structure tree and build MCID -> struct node map + parent map
      const structTree = await page.getStructTree();
      if (structTree) {
        const { mcidMap, parentMap } = buildMcidStructMap(structTree);
        for (const [k, v] of mcidMap) {
          content.mcidToStructNode.set(k, v);
        }
        for (const [k, v] of parentMap) {
          content.structNodeToParent.set(k, v);
        }
      }
    }
  } finally {
    // Everything needed has been copied into plain objects above
    await doc.destroy();
  }

  return content;
}

// ============================================================================
// Main Predicate
// ============================================================================

/**
 * Verify spatial positions of text in a rendered PDF using semantic structure.
 * Uses pdfjs-dist to access MCIDs and structure tree.
 *
 * @param file Path of the PDF to inspect.
 * @param assertions Assertions that must hold (directional, alignment, or tag-only).
 * @param noMatchAssertions Assertions that must NOT hold.
 * @returns A `Verify` whose `verify` collects every problem found and fails
 *   once, via `assert`, with a numbered list of all of them.
 */
export const ensurePdfTextPositions = (
  file: string,
  assertions: PdfTextPositionAssertion[],
  noMatchAssertions?: PdfTextPositionAssertion[],
): Verify => {
  return {
    name: `Inspecting ${file} for text position assertions`,
    verify: async (_output: ExecuteOutput[]) => {
      const errors: string[] = [];

      // Internal normalized assertion type for processing
      type NormalizedAssertion = {
        subject: TextSelector;
        relation?: Relation;
        object?: TextSelector;
        tolerance: number;
        byMin?: number;
        byMax?: number;
      };

      // Validate and normalize an assertion using Zod.
      // `label` identifies the assertion in error messages (positive and
      // noMatch assertions are numbered independently).
      const normalizeAssertion = (a: unknown, label: string): NormalizedAssertion | null => {
        // Try parsing as each type in order of specificity
        const directionalResult = DirectionalAssertionSchema.safeParse(a);
        if (directionalResult.success) {
          const d = directionalResult.data;
          return {
            subject: normalizeSelector(d.subject),
            relation: d.relation,
            object: normalizeSelector(d.object),
            tolerance: DEFAULT_ALIGNMENT_TOLERANCE,
            byMin: d.byMin,
            byMax: d.byMax,
          };
        }

        const alignmentResult = AlignmentAssertionSchema.safeParse(a);
        if (alignmentResult.success) {
          const al = alignmentResult.data;
          return {
            subject: normalizeSelector(al.subject),
            relation: al.relation,
            object: normalizeSelector(al.object),
            tolerance: al.tolerance ?? DEFAULT_ALIGNMENT_TOLERANCE,
          };
        }

        const tagOnlyResult = TagOnlyAssertionSchema.safeParse(a);
        if (tagOnlyResult.success) {
          return {
            subject: normalizeSelector(tagOnlyResult.data.subject),
            tolerance: DEFAULT_ALIGNMENT_TOLERANCE,
          };
        }

        // None of the schemas matched - report validation error
        errors.push(`${label} is invalid: ${describeInvalidAssertion(a)}`);
        return null;
      };

      // Stage 1: Parse and validate assertions
      const normalizedAssertions = assertions
        .map((a, i) => normalizeAssertion(a, `Assertion ${i + 1}`))
        .filter((a): a is NormalizedAssertion => a !== null);

      const normalizedNoMatch = (noMatchAssertions ?? [])
        .map((a, i) => normalizeAssertion(a, `noMatch assertion ${i + 1}`))
        .filter((a): a is NormalizedAssertion => a !== null);

      // Track search texts and their selectors (to know if Decoration role is requested)
      // Page role selectors are tracked separately since they don't need text search
      const searchTexts = new Set<string>();
      const textToSelectors = new Map<string, TextSelector[]>();
      const pageSelectors = new Map<number, TextSelector>(); // page number -> selector

      // Helper: check if selector is a Page role (no text search needed)
      const isPageRole = (sel: TextSelector): boolean => sel.role === "Page";

      // Helper: get unique key for a selector (for resolvedSelectors map)
      // Includes granularity since different granularity settings need different bbox computation.
      // Role and edge are not part of the key; they are read from the selector at use time.
      const selectorKey = (sel: TextSelector): string => {
        if (isPageRole(sel)) {
          return `Page:${sel.page}`;
        }
        const base = sel.text ?? "";
        if (sel.granularity) {
          return `${base}@${sel.granularity}`;
        }
        return base;
      };

      // Track unique selectors by their full key (including granularity)
      const uniqueSelectors = new Map<string, TextSelector>();

      const addSelector = (sel: TextSelector) => {
        if (isPageRole(sel)) {
          if (sel.page === undefined) {
            errors.push(`Page role requires 'page' field to specify page number`);
            return;
          }
          pageSelectors.set(sel.page, sel);
        } else {
          if (!sel.text) {
            errors.push(`Selector requires 'text' field (unless role is "Page")`);
            return;
          }
          searchTexts.add(sel.text);
          const existing = textToSelectors.get(sel.text) ?? [];
          existing.push(sel);
          textToSelectors.set(sel.text, existing);
          // Also track by full key for resolution
          uniqueSelectors.set(selectorKey(sel), sel);
        }
      };

      for (const a of normalizedAssertions) {
        addSelector(a.subject);
        if (a.object) addSelector(a.object);
      }
      for (const a of normalizedNoMatch) {
        addSelector(a.subject);
        if (a.object) addSelector(a.object);
      }

      // Helper: check if any selector for this text is a Decoration (untagged content)
      const isDecoration = (text: string): boolean => {
        const selectors = textToSelectors.get(text) ?? [];
        return selectors.some((s) => s.role === "Decoration");
      };

      // Stages 2-4: Load PDF with pdfjs-dist, extract content and structure tree per page
      const {
        allTextItems,
        mcidToTextItems,
        mcidToStructNode,
        structNodeToParent,
        pageDimensions,
      } = await loadPdfContent(file);

      // Stage 5: Find text items for each search text (must be unique, unless Decoration)
      const foundTexts = new Map<string, MarkedTextItem>();
      const ambiguousTexts = new Set<string>();
      for (const searchText of searchTexts) {
        const matches = allTextItems.filter((t) => t.str.includes(searchText));
        if (matches.length === 1) {
          foundTexts.set(searchText, matches[0]);
        } else if (matches.length > 1) {
          // Decoration role (headers, footers) naturally repeat on each page - allow first match
          if (isDecoration(searchText)) {
            foundTexts.set(searchText, matches[0]);
          } else {
            ambiguousTexts.add(searchText);
            errors.push(
              `Text "${searchText}" is ambiguous - found ${matches.length} matches. Use a more specific search string.`,
            );
          }
        }
        // If matches.length === 0, we'll report "not found" later
      }

      // Stage 6 & 7: Resolve selectors to structure nodes and compute bboxes
      const resolvedSelectors = new Map<string, ResolvedSelector>();

      // First, resolve Page role selectors (no text search needed)
      for (const [pageNum, sel] of pageSelectors) {
        const dims = pageDimensions.get(pageNum);
        if (!dims) {
          errors.push(`Page ${pageNum} does not exist in PDF (has ${pageDimensions.size} pages)`);
          continue;
        }
        const key = selectorKey(sel);
        resolvedSelectors.set(key, {
          selector: sel,
          textItem: { str: "", x: 0, y: 0, width: 0, height: 0, mcid: null, page: pageNum },
          structNode: null,
          bbox: {
            x: 0,
            y: 0,
            width: dims.width,
            height: dims.height,
            page: pageNum,
          },
        });
      }

      // Then, resolve text-based selectors (iterate by unique selector key to handle granularity)
      for (const [key, selector] of uniqueSelectors) {
        const searchText = selector.text!;
        const textItem = foundTexts.get(searchText);
        if (!textItem) {
          // Don't report "not found" if we already reported "ambiguous"
          if (!ambiguousTexts.has(searchText)) {
            errors.push(`Text not found in PDF: "${searchText}"`);
          }
          continue;
        }

        let structNode: StructTreeNode | null = null;
        let bbox: BBox;

        // Decoration role: use text item bounds directly (for headers, footers, page decorations)
        if (isDecoration(searchText)) {
          bbox = {
            x: textItem.x,
            y: textItem.y,
            width: textItem.width,
            height: textItem.height,
            page: textItem.page,
          };
        } else if (!textItem.mcid) {
          errors.push(
            `Text "${searchText}" has no MCID - PDF may not be tagged. Use role: "Decoration" for untagged page elements like headers/footers.`,
          );
          continue;
        } else {
          structNode = mcidToStructNode.get(textItem.mcid) ?? null;

          // Check for granularity: aggregate bbox to ancestor with target role
          if (selector.granularity && structNode) {
            const ancestor = findAncestorWithRole(structNode, selector.granularity, structNodeToParent);
            if (ancestor) {
              // Collect ALL MCIDs recursively under that ancestor
              const allMcids = collectAllMcids(ancestor);
              const allItems = allMcids.flatMap((id) => mcidToTextItems.get(id) ?? []);
              const ancestorBBox = unionBBox(allItems);
              if (ancestorBBox) {
                bbox = ancestorBBox;
              } else {
                errors.push(
                  `Could not compute bbox for "${searchText}" with granularity "${selector.granularity}" - no content items found`,
                );
                continue;
              }
            } else {
              errors.push(
                `No ancestor with role "${selector.granularity}" found for "${searchText}"`,
              );
              continue;
            }
          } else {
            // Same-MCID approach: compute bbox from all text items sharing this MCID
            const mcidItems = mcidToTextItems.get(textItem.mcid);
            if (mcidItems && mcidItems.length > 0) {
              const mcidBBox = unionBBox(mcidItems);
              if (mcidBBox) {
                bbox = mcidBBox;
              } else {
                errors.push(
                  `Could not compute bbox for "${searchText}" - all text items in MCID are whitespace-only`,
                );
                continue;
              }
            } else {
              errors.push(
                `No text items found for MCID ${textItem.mcid} containing "${searchText}"`,
              );
              continue;
            }
          }
        }

        resolvedSelectors.set(key, {
          selector,
          textItem,
          structNode,
          bbox,
        });
      }

      // Validate the requested role of one selector against its structure node.
      // Subject and object are checked independently, so an unresolved or
      // Page-role subject never hides a role mismatch on the object.
      const validateRole = (sel: TextSelector) => {
        // Page role is a virtual selector with no struct node to validate
        if (isPageRole(sel) || !sel.role) return;
        // Decoration text is resolved from raw text bounds and has no struct node by design
        if (sel.text !== undefined && isDecoration(sel.text)) return;

        const resolved = resolvedSelectors.get(selectorKey(sel));
        if (!resolved) return; // Resolution error already recorded

        if (!resolved.structNode) {
          // A requested role that cannot be checked must not pass silently
          errors.push(
            `Cannot verify role for "${sel.text}": expected ${sel.role}, but no structure node is associated with its marked content`,
          );
          return;
        }

        if (resolved.structNode.role !== sel.role) {
          errors.push(
            `Role mismatch for "${sel.text}": expected ${sel.role}, got ${resolved.structNode.role}`,
          );
        }
      };

      // Validate role assertions
      for (const a of normalizedAssertions) {
        validateRole(a.subject);
        if (a.object) validateRole(a.object);
      }

      // Stage 8: Evaluate position assertions
      // Note: Zod validation in Stage 1 already handles:
      // - Unknown relations
      // - byMin/byMax with alignment relations (via .strict())
      // - byMin > byMax (via .refine())
      for (const a of normalizedAssertions) {
        // Tag-only assertions (no relation/object)
        if (!a.relation || !a.object) {
          continue; // Role already validated above
        }

        const subjectKey = selectorKey(a.subject);
        const objectKey = selectorKey(a.object);
        const subjectResolved = resolvedSelectors.get(subjectKey);
        const objectResolved = resolvedSelectors.get(objectKey);

        if (!subjectResolved || !objectResolved) {
          continue; // Error already recorded
        }

        // Check same page
        if (subjectResolved.bbox.page !== objectResolved.bbox.page) {
          errors.push(
            `Cannot compare positions: "${subjectKey}" is on page ${subjectResolved.bbox.page}, ` +
              `"${objectKey}" is on page ${objectResolved.bbox.page}`,
          );
          continue;
        }

        // Evaluate relation based on type (Zod guarantees valid relation type)
        const isDirectional = directionalRelations.has(a.relation);
        if (isDirectional) {
          const result = evaluateDirectionalRelation(
            a.relation as DirectionalRelation,
            subjectResolved.bbox,
            objectResolved.bbox,
            a.subject.edge,
            a.object.edge,
            a.byMin,
            a.byMax,
          );

          if (!result.passed) {
            const distanceInfo = a.byMin !== undefined || a.byMax !== undefined
              ? ` Distance: ${result.distance.toFixed(1)}pt` +
                (a.byMin !== undefined ? ` (required >= ${a.byMin}pt)` : "") +
                (a.byMax !== undefined ? ` (required <= ${a.byMax}pt)` : "")
              : "";
            errors.push(
              `Position assertion failed (page ${subjectResolved.bbox.page}): "${subjectKey}" is NOT ${a.relation} "${objectKey}".` +
                ` Subject.${result.subjectEdge}=${result.subjectValue.toFixed(1)},` +
                ` Object.${result.objectEdge}=${result.objectValue.toFixed(1)}.${distanceInfo}` +
                (result.failureReason ? ` (${result.failureReason})` : ""),
            );
          }
        } else {
          // Alignment relation
          const result = evaluateAlignmentRelation(
            a.relation as AlignmentRelation,
            subjectResolved.bbox,
            objectResolved.bbox,
            a.tolerance,
            a.subject.edge,
            a.object.edge,
          );

          if (!result.passed) {
            errors.push(
              `Position assertion failed (page ${subjectResolved.bbox.page}): "${subjectKey}" is NOT ${a.relation} "${objectKey}".` +
                ` Subject.${result.subjectEdge}=${result.subjectValue.toFixed(1)},` +
                ` Object.${result.objectEdge}=${result.objectValue.toFixed(1)}.` +
                ` Difference: ${result.difference.toFixed(1)}pt (tolerance: ${a.tolerance}pt)`,
            );
          }
        }
      }

      // Evaluate negative assertions
      // Note: Zod validation already handled in Stage 1
      for (const a of normalizedNoMatch) {
        if (!a.relation || !a.object) continue;

        const subjectKey = selectorKey(a.subject);
        const objectKey = selectorKey(a.object);
        const subjectResolved = resolvedSelectors.get(subjectKey);
        const objectResolved = resolvedSelectors.get(objectKey);

        if (!subjectResolved || !objectResolved) {
          continue; // Assertion trivially doesn't hold
        }

        if (subjectResolved.bbox.page !== objectResolved.bbox.page) {
          continue; // Assertion trivially doesn't hold
        }

        // Evaluate relation based on type (Zod guarantees valid relation type)
        const isDirectional = directionalRelations.has(a.relation);
        let passed: boolean;
        let resultInfo: string;

        if (isDirectional) {
          const result = evaluateDirectionalRelation(
            a.relation as DirectionalRelation,
            subjectResolved.bbox,
            objectResolved.bbox,
            a.subject.edge,
            a.object.edge,
            a.byMin,
            a.byMax,
          );
          passed = result.passed;
          resultInfo = `Subject.${result.subjectEdge}=${result.subjectValue.toFixed(1)}, ` +
            `Object.${result.objectEdge}=${result.objectValue.toFixed(1)}, ` +
            `distance=${result.distance.toFixed(1)}pt`;
        } else {
          const result = evaluateAlignmentRelation(
            a.relation as AlignmentRelation,
            subjectResolved.bbox,
            objectResolved.bbox,
            a.tolerance,
            a.subject.edge,
            a.object.edge,
          );
          passed = result.passed;
          resultInfo = `Subject.${result.subjectEdge}=${result.subjectValue.toFixed(1)}, ` +
            `Object.${result.objectEdge}=${result.objectValue.toFixed(1)}, ` +
            `difference=${result.difference.toFixed(1)}pt`;
        }

        if (passed) {
          errors.push(
            `Negative assertion failed (page ${subjectResolved.bbox.page}): "${subjectKey}" IS ${a.relation} "${objectKey}" (expected NOT to be). ` +
              resultInfo,
          );
        }
      }

      // Stage 9: Aggregate errors
      if (errors.length > 0) {
        assert(
          false,
          `PDF position assertions failed in ${file}:\n${errors.map((e, i) => ` ${i + 1}. ${e}`).join("\n")}`,
        );
      }
    },
  };
};