// Source: GitHub repository quarto-dev/quarto-cli, path tests/verify-pdf-text-position.ts (blob/main)
/*
 * verify-pdf-text-position.ts
 *
 * PDF text position verification using semantic structure tree.
 * Uses pdfjs-dist directly to access MCIDs and structure tree.
 *
 * REQUIREMENTS:
 * This module requires tagged PDFs with PDF 1.4+ structure tree support.
 * Tagged PDFs contain Marked Content Identifiers (MCIDs) that link text
 * content to semantic structure elements (P, H1, Figure, Table, etc.).
 *
 * Currently confirmed working:
 * - Typst: Produces tagged PDFs by default
 *
 * Not yet working:
 * - LaTeX: Requires \DocumentMetadata{} before \documentclass for tagging,
 *   which Quarto doesn't currently support. When LaTeX tagged PDF support
 *   is available, this module should work with minimal changes since we
 *   use only basic PDF 1.4 tagged structure features.
 * - ConTeXt: Pandoc supports +tagging extension, but Quarto's context
 *   format doesn't compile to PDF.
 *
 * SPECIAL ROLES:
 * - role: "Decoration" - Use for untagged page elements like headers, footers,
 *   page numbers, and other decorations. These use text item bounds directly
 *   instead of requiring MCID/structure tree support.
 * - role: "Page" - Use for the entire page bounds. Requires `page` field to
 *   specify which page number (1-indexed). The `text` field is ignored.
 *   Useful for NOT assertions since Page intersects all content on that page.
 *
 * Copyright (C) 2020-2025 Posit Software, PBC
 */
33
34
import { assert } from "testing/asserts";
35
import { z } from "zod";
36
import { ExecuteOutput, Verify } from "./test.ts";
37
38
// ============================================================================
39
// Zod Schemas and Type Definitions
40
// ============================================================================
41
42
// Bounding-box edges (plus the two centre lines) a selector may pick for comparison.
export const EdgeSchema = z.enum([
  "left",
  "right",
  "top",
  "bottom",
  "centerX",
  "centerY",
]);
export type Edge = z.infer<typeof EdgeSchema>;

// Supported relations, split into a directional family (ordering of two edges)
// and an alignment family (two edges coincide within a tolerance).
export const DirectionalRelationSchema = z.enum([
  "leftOf",
  "rightOf",
  "above",
  "below",
]);
export const AlignmentRelationSchema = z.enum([
  "leftAligned",
  "rightAligned",
  "topAligned",
  "bottomAligned",
]);
export const RelationSchema = z.union([
  DirectionalRelationSchema,
  AlignmentRelationSchema,
]);

export type DirectionalRelation = z.infer<typeof DirectionalRelationSchema>;
export type AlignmentRelation = z.infer<typeof AlignmentRelationSchema>;
export type Relation = z.infer<typeof RelationSchema>;
54
55
// Text selector schema
// Note: Label/ID checking is not supported because:
// 1. Typst does not write labels to PDF StructElem /ID attributes (labels become
//    named destinations for links, but not structure element identifiers)
// 2. Even if IDs were present, pdf.js doesn't expose /ID through getStructTree()
export const TextSelectorSchema = z.object({
  // Text to search for (ignored for role: "Page")
  text: z.string().optional(),
  // PDF 1.4 structure role: P, H1, H2, Figure, Table, Span, etc.
  role: z.string().optional(),
  // Page number (1-indexed), required for role: "Page"
  page: z.number().optional(),
  // Which edge to use for comparison (overrides the relation's default edge)
  edge: EdgeSchema.optional(),
  // Aggregate the bbox to the nearest ancestor with this role (e.g., "Div", "P")
  granularity: z.string().optional(),
});
export type TextSelector = z.infer<typeof TextSelectorSchema>;
68
69
// Subject/object can be given either as a bare search string or as a full TextSelector.
const SubjectObjectSchema = z.union([z.string(), TextSelectorSchema]);

// Tag-only assertion: validates semantic role without position comparison.
// `.strict()` rejects any key other than `subject`.
export const TagOnlyAssertionSchema = z.object({
  subject: SubjectObjectSchema,
}).strict();
export type TagOnlyAssertion = z.infer<typeof TagOnlyAssertionSchema>;
77
78
// Directional assertion: leftOf, rightOf, above, below with optional distance constraints.
//
// The object schema is `.strict()` (like the alignment and tag-only schemas) so that
// misspelled or unsupported keys (e.g. `byMn`, `tolerance`) are reported as invalid
// instead of being silently stripped, which would drop the intended constraint.
// `.strict()` has to be applied before `.refine()`, which wraps the object schema.
export const DirectionalAssertionSchema = z.object({
  subject: SubjectObjectSchema,
  relation: DirectionalRelationSchema,
  object: SubjectObjectSchema,
  byMin: z.number().optional(), // Minimum distance between edges
  byMax: z.number().optional(), // Maximum distance between edges
}).strict().refine(
  // When both bounds are present they must describe a non-empty range.
  (data) => data.byMin === undefined || data.byMax === undefined || data.byMin <= data.byMax,
  { message: "byMin must be <= byMax" },
);
export type DirectionalAssertion = z.infer<typeof DirectionalAssertionSchema>;
90
91
// Alignment assertion: leftAligned, rightAligned, topAligned, bottomAligned with tolerance.
// `.strict()` rejects unknown keys, so byMin/byMax are not accepted here.
export const AlignmentAssertionSchema = z.object({
  subject: SubjectObjectSchema,
  relation: AlignmentRelationSchema,
  object: SubjectObjectSchema,
  tolerance: z.number().optional(), // Default: 2pt
}).strict();
export type AlignmentAssertion = z.infer<typeof AlignmentAssertionSchema>;
99
100
// Union of all assertion types (directional, alignment, tag-only).
export const PdfTextPositionAssertionSchema = z.union([
  DirectionalAssertionSchema,
  AlignmentAssertionSchema,
  TagOnlyAssertionSchema,
]);
export type PdfTextPositionAssertion = z.infer<typeof PdfTextPositionAssertionSchema>;
107
108
// Type guards for assertion discrimination (each delegates to Zod's safeParse)

/** True when `a` validates against DirectionalAssertionSchema. */
export function isDirectionalAssertion(a: unknown): a is DirectionalAssertion {
  const { success } = DirectionalAssertionSchema.safeParse(a);
  return success;
}
112
113
/** True when `a` validates against AlignmentAssertionSchema. */
export function isAlignmentAssertion(a: unknown): a is AlignmentAssertion {
  const { success } = AlignmentAssertionSchema.safeParse(a);
  return success;
}
116
117
/** True when `a` validates against TagOnlyAssertionSchema. */
export function isTagOnlyAssertion(a: unknown): a is TagOnlyAssertion {
  const { success } = TagOnlyAssertionSchema.safeParse(a);
  return success;
}
120
121
// Computed bounding box, anchored at (x, y) and extending by width/height.
interface BBox {
  x: number;
  y: number;
  width: number;
  height: number;
  // Page the box belongs to (position assertions compare boxes on the same page only)
  page: number;
}
129
130
// Internal: text item with MCID tracking
interface MarkedTextItem {
  // Text of the item
  str: string;
  x: number;
  y: number;
  width: number;
  height: number;
  // Marked-content id the text was emitted under, e.g., "p2R_mc0"; null when untagged
  mcid: string | null;
  // Page the item was extracted from
  page: number;
}
140
141
// Structure tree node (from pdfjs-dist)
interface StructTreeNode {
  // Structure role, e.g. P, H1, Figure
  role: string;
  // Child structure nodes and/or content leaves
  children?: (StructTreeNode | StructTreeContent)[];
  alt?: string;
  lang?: string;
}
148
149
// Leaf of the structure tree. Leaves of type "content" carry the marked-content
// id (MCID) that the helpers in this file use to link text items to nodes.
interface StructTreeContent {
  type: "content" | "object" | "annotation";
  id: string;
}
153
154
// Text content item types from pdfjs-dist
interface TextItem {
  str: string;
  dir: string;
  // Six-element transform; this file reads indices 2-5 (scale/skew for height, translation for position)
  transform: number[];
  width: number;
  height: number;
  fontName: string;
  hasEOL: boolean;
}
164
165
// Marked-content boundary item interleaved with text items in the text content stream.
interface TextMarkedContent {
  type: "beginMarkedContent" | "beginMarkedContentProps" | "endMarkedContent";
  // Marked-content id; only read for "beginMarkedContentProps" items in this file
  id?: string;
  tag?: string;
}
170
171
// Internal: resolved selector with computed bounds
interface ResolvedSelector {
  // The selector as written in the assertion
  selector: TextSelector;
  // The text item that matched the selector's text
  textItem: MarkedTextItem;
  // Structure node owning the text item's MCID; null when none was resolved
  structNode: StructTreeNode | null;
  // Bounding box used when evaluating relations
  bbox: BBox;
}
178
179
// ============================================================================
180
// Constants
181
// ============================================================================
182
183
/** Tolerance (in pt) applied to alignment assertions that do not specify `tolerance`. */
const DEFAULT_ALIGNMENT_TOLERANCE = 2;
184
185
// ============================================================================
186
// Relation Predicates and Edge Logic
187
// ============================================================================
188
189
// Coordinate system: origin at top-left, y increases downward
190
191
// Membership sets built from the Zod enum options, so the relation lists live in one place.
const directionalRelations = new Set<Relation>(DirectionalRelationSchema.options);
const alignmentRelations = new Set<Relation>(AlignmentRelationSchema.options);
194
195
// Default edges for each relation (from spec table).
// Directional relations compare the facing edges of subject and object
// (e.g. subject.right vs object.left for leftOf); alignment relations compare
// the same edge on both boxes.
const relationDefaults: Record<Relation, { subject: Edge; object: Edge }> = {
  leftOf: { subject: "right", object: "left" },
  rightOf: { subject: "left", object: "right" },
  above: { subject: "bottom", object: "top" },
  below: { subject: "top", object: "bottom" },
  leftAligned: { subject: "left", object: "left" },
  rightAligned: { subject: "right", object: "right" },
  topAligned: { subject: "top", object: "top" },
  bottomAligned: { subject: "bottom", object: "bottom" },
};
206
207
/**
 * Extract the coordinate of one edge (or centre line) of a bounding box.
 *
 * @param bbox Box to read from.
 * @param edge Which edge to return: left/right/centerX yield an x coordinate,
 *   top/bottom/centerY yield a y coordinate.
 * @returns The coordinate of the requested edge.
 */
function getEdgeValue(bbox: BBox, edge: Edge): number {
  switch (edge) {
    case "left":
      return bbox.x;
    case "right":
      return bbox.x + bbox.width;
    case "top":
      return bbox.y;
    case "bottom":
      return bbox.y + bbox.height;
    case "centerX":
      return bbox.x + bbox.width / 2;
    case "centerY":
      return bbox.y + bbox.height / 2;
  }
}
224
225
// Outcome of evaluating a directional relation (with edge overrides and distance constraints)
interface DirectionalResult {
  passed: boolean;
  // Edges that were actually compared (override or relation default)
  subjectEdge: Edge;
  objectEdge: Edge;
  // Coordinates of those edges
  subjectValue: number;
  objectValue: number;
  // Signed gap between the edges; positive when the relation holds
  distance: number;
  // Set only when `passed` is false
  failureReason?: string;
}
235
236
/**
 * Evaluate a directional relation (leftOf, rightOf, above, below) between two boxes.
 *
 * @param relation Relation to test.
 * @param subjectBBox Box of the subject.
 * @param objectBBox Box of the object.
 * @param subjectEdgeOverride Subject edge to compare instead of the relation default.
 * @param objectEdgeOverride Object edge to compare instead of the relation default.
 * @param byMin Optional minimum distance between the compared edges.
 * @param byMax Optional maximum distance between the compared edges.
 * @returns The compared edges and values, the signed distance, and whether the
 *   relation (and any distance bounds) hold. `failureReason` names the first
 *   check that failed.
 */
function evaluateDirectionalRelation(
  relation: DirectionalRelation,
  subjectBBox: BBox,
  objectBBox: BBox,
  subjectEdgeOverride?: Edge,
  objectEdgeOverride?: Edge,
  byMin?: number,
  byMax?: number,
): DirectionalResult {
  const defaults = relationDefaults[relation];
  const subjectEdge = subjectEdgeOverride ?? defaults.subject;
  const objectEdge = objectEdgeOverride ?? defaults.object;

  const subjectValue = getEdgeValue(subjectBBox, subjectEdge);
  const objectValue = getEdgeValue(objectBBox, objectEdge);

  // Distance calculation depends on relation direction
  // For leftOf/above: distance = objectEdge - subjectEdge (positive when relation holds)
  // For rightOf/below: distance = subjectEdge - objectEdge (positive when relation holds)
  let distance: number;
  let directionPassed: boolean;

  if (relation === "leftOf" || relation === "above") {
    distance = objectValue - subjectValue;
    directionPassed = subjectValue < objectValue;
  } else {
    // rightOf or below
    distance = subjectValue - objectValue;
    directionPassed = subjectValue > objectValue;
  }

  const result: DirectionalResult = {
    passed: true,
    subjectEdge,
    objectEdge,
    subjectValue,
    objectValue,
    distance,
  };

  // Check directional constraint (strict: coinciding edges do not satisfy it)
  if (!directionPassed) {
    result.passed = false;
    result.failureReason = "directional constraint not satisfied";
    return result;
  }

  // Check byMin constraint
  if (byMin !== undefined && distance < byMin) {
    result.passed = false;
    result.failureReason = `distance ${distance.toFixed(1)}pt < byMin ${byMin}pt`;
    return result;
  }

  // Check byMax constraint
  if (byMax !== undefined && distance > byMax) {
    result.passed = false;
    result.failureReason = `distance ${distance.toFixed(1)}pt > byMax ${byMax}pt`;
    return result;
  }

  return result;
}
299
300
// Outcome of evaluating an alignment relation (with edge overrides)
interface AlignmentResult {
  passed: boolean;
  // Edges that were actually compared (override or relation default)
  subjectEdge: Edge;
  objectEdge: Edge;
  // Coordinates of those edges
  subjectValue: number;
  objectValue: number;
  // Absolute difference between the two edge coordinates
  difference: number;
}
309
310
/**
 * Evaluate an alignment relation (leftAligned, rightAligned, topAligned,
 * bottomAligned) between two boxes.
 *
 * @param relation Relation to test; selects the default edges.
 * @param subjectBBox Box of the subject.
 * @param objectBBox Box of the object.
 * @param tolerance Largest absolute difference between the edges that still passes.
 * @param subjectEdgeOverride Subject edge to compare instead of the relation default.
 * @param objectEdgeOverride Object edge to compare instead of the relation default.
 * @returns The compared edges and values, their absolute difference, and
 *   whether it is within `tolerance` (inclusive).
 */
function evaluateAlignmentRelation(
  relation: AlignmentRelation,
  subjectBBox: BBox,
  objectBBox: BBox,
  tolerance: number,
  subjectEdgeOverride?: Edge,
  objectEdgeOverride?: Edge,
): AlignmentResult {
  const defaults = relationDefaults[relation];
  const subjectEdge = subjectEdgeOverride ?? defaults.subject;
  const objectEdge = objectEdgeOverride ?? defaults.object;

  const subjectValue = getEdgeValue(subjectBBox, subjectEdge);
  const objectValue = getEdgeValue(objectBBox, objectEdge);
  const difference = Math.abs(subjectValue - objectValue);

  return {
    passed: difference <= tolerance,
    subjectEdge,
    objectEdge,
    subjectValue,
    objectValue,
    difference,
  };
}
335
336
// ============================================================================
337
// Helper Functions
338
// ============================================================================
339
340
/**
 * Turn the shorthand string form of a subject/object into a TextSelector.
 * A selector object is returned unchanged (same reference).
 */
function normalizeSelector(s: string | TextSelector): TextSelector {
  return typeof s === "string" ? { text: s } : s;
}
346
347
/**
 * Distinguish a content leaf from a structure node: a leaf has a `type`
 * property holding one of the three known leaf kinds.
 */
function isStructTreeContent(node: StructTreeNode | StructTreeContent): node is StructTreeContent {
  if (!("type" in node)) {
    return false;
  }
  const kind = node.type;
  return kind === "content" || kind === "object" || kind === "annotation";
}
350
351
/** A text-content entry is a text item when it has a string `str` property. */
function isTextItem(item: TextItem | TextMarkedContent): item is TextItem {
  if (!("str" in item)) {
    return false;
  }
  return typeof item.str === "string";
}
354
355
/** A text-content entry is a marked-content boundary when it has a string `type` property. */
function isTextMarkedContent(item: TextItem | TextMarkedContent): item is TextMarkedContent {
  if (!("type" in item)) {
    return false;
  }
  return typeof item.type === "string";
}
358
359
/**
 * Extract MarkedTextItem[] from a pdfjs getTextContent result.
 *
 * Walks the interleaved stream of marked-content boundaries and text items,
 * tagging every text item with the MCID that is in effect where it appears.
 * Marked-content sections can nest, so the enclosing MCIDs are kept on a stack:
 * closing an inner section restores the MCID of the section around it instead
 * of dropping to "untagged".
 *
 * @param items Entries as returned in `textContent.items` (with marked content included).
 * @param pageNum Page number recorded on every produced item.
 * @param pageHeight Page height, used to flip the y axis to a top-left origin.
 * @returns One MarkedTextItem per text item, in stream order.
 */
function extractMarkedTextItems(
  items: (TextItem | TextMarkedContent)[],
  pageNum: number,
  pageHeight: number,
): MarkedTextItem[] {
  const result: MarkedTextItem[] = [];
  let currentMcid: string | null = null;
  // MCIDs of the sections enclosing the current one (innermost last)
  const enclosingMcids: (string | null)[] = [];

  for (const item of items) {
    if (isTextMarkedContent(item)) {
      if (item.type === "beginMarkedContent" || item.type === "beginMarkedContentProps") {
        enclosingMcids.push(currentMcid);
        // Only sections that carry an id change the effective MCID;
        // id-less sections inherit the MCID of their enclosing section.
        if (item.type === "beginMarkedContentProps" && item.id) {
          currentMcid = item.id;
        }
      } else if (item.type === "endMarkedContent") {
        // An unbalanced end (empty stack) falls back to "untagged"
        currentMcid = enclosingMcids.pop() ?? null;
      }
    } else if (isTextItem(item)) {
      // Transform: [scaleX, skewX, skewY, scaleY, translateX, translateY]
      const tm = item.transform;
      const x = tm[4];
      // Convert from PDF coordinates (bottom-left origin) to top-left origin
      const y = pageHeight - tm[5];
      const height = Math.sqrt(tm[2] * tm[2] + tm[3] * tm[3]);

      result.push({
        str: item.str,
        x,
        y,
        width: item.width,
        height,
        mcid: currentMcid,
        page: pageNum,
      });
    }
  }

  return result;
}
400
401
/**
 * Recursively build the MCID -> struct node map and the child -> parent map
 * for a structure tree.
 *
 * Each "content" leaf with an id is mapped to the struct node that owns it:
 * `parentNode` when given, otherwise `tree`. The recursion passes the child as
 * both `tree` and `parentNode`, so below the root this is the directly
 * enclosing node. Every child struct node is recorded against the same owner
 * in `parentMap`, which allows walking upwards later.
 *
 * @param tree Node to scan; null yields the maps unchanged.
 * @param mcidMap Accumulator for MCID -> owning struct node (mutated and returned).
 * @param parentMap Accumulator for struct node -> parent (mutated and returned).
 * @param parentNode Node to treat as the owner of `tree`'s children; defaults to `tree`.
 * @returns The two accumulator maps.
 */
function buildMcidStructMap(
  tree: StructTreeNode | null,
  mcidMap: Map<string, StructTreeNode> = new Map(),
  parentMap: Map<StructTreeNode, StructTreeNode> = new Map(),
  parentNode: StructTreeNode | null = null,
): { mcidMap: Map<string, StructTreeNode>; parentMap: Map<StructTreeNode, StructTreeNode> } {
  if (!tree) return { mcidMap, parentMap };

  // Root-level children have the tree itself as owner/parent
  const owner = parentNode ?? tree;

  for (const child of tree.children ?? []) {
    if (isStructTreeContent(child)) {
      if (child.type === "content" && child.id) {
        // Map MCID to the owning struct node (the semantic element)
        mcidMap.set(child.id, owner);
      }
    } else {
      // Record parent for tree traversal
      parentMap.set(child, owner);
      // Recurse into child struct nodes
      buildMcidStructMap(child, mcidMap, parentMap, child);
    }
  }

  return { mcidMap, parentMap };
}
435
436
/**
 * List the MCIDs that sit directly under a structure node.
 * Child structure nodes are skipped entirely (no recursion), so content
 * nested in sub-elements is not included.
 */
function collectDirectMcids(node: StructTreeNode): string[] {
  const children = node.children ?? [];
  return children
    .filter(isStructTreeContent)
    .filter((leaf) => leaf.type === "content" && Boolean(leaf.id))
    .map((leaf) => leaf.id);
}
454
455
/**
 * List every MCID found under a structure node, descending into all child
 * structure nodes (document order is preserved).
 * Used for granularity aggregation to compute the bbox of an entire subtree.
 */
function collectAllMcids(node: StructTreeNode): string[] {
  return (node.children ?? []).flatMap((child) => {
    if (!isStructTreeContent(child)) {
      // Child struct node: gather its subtree
      return collectAllMcids(child);
    }
    return child.type === "content" && child.id ? [child.id] : [];
  });
}
475
476
/**
 * Walk up the structure tree to find the nearest node with a matching role.
 * The search starts at `node` itself, so `node` is returned when its own role
 * matches; otherwise parents are followed through `parentMap`.
 *
 * @param node Node to start from.
 * @param targetRole Role to look for (exact string match).
 * @param parentMap Child -> parent links for the tree.
 * @returns The first matching node on the way up, or null if none matches.
 */
function findAncestorWithRole(
  node: StructTreeNode,
  targetRole: string,
  parentMap: Map<StructTreeNode, StructTreeNode>,
): StructTreeNode | null {
  let current: StructTreeNode | undefined = node;
  while (current) {
    if (current.role === targetRole) {
      return current;
    }
    current = parentMap.get(current);
  }
  return null;
}
494
495
/**
 * True for strings that contain nothing but whitespace, the empty string included.
 * Used to filter out horizontal skip spaces in PDF content.
 */
function isWhitespaceOnly(str: string): boolean {
  return str.trim() === "";
}
502
503
/**
 * Compute the union bounding box of multiple text items.
 * Whitespace-only items are ignored so that horizontal skips do not stretch the box.
 *
 * @param items Text items to enclose.
 * @returns The smallest box containing every non-whitespace item, or null when
 *   there is none. The page is taken from the first non-whitespace item.
 */
function unionBBox(items: MarkedTextItem[]): BBox | null {
  // Filter out whitespace-only items (these are often horizontal skips)
  const contentItems = items.filter((item) => !isWhitespaceOnly(item.str));
  if (contentItems.length === 0) return null;

  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  const page = contentItems[0].page;

  for (const item of contentItems) {
    minX = Math.min(minX, item.x);
    minY = Math.min(minY, item.y);
    maxX = Math.max(maxX, item.x + item.width);
    maxY = Math.max(maxY, item.y + item.height);
  }

  return {
    x: minX,
    y: minY,
    width: maxX - minX,
    height: maxY - minY,
    page,
  };
}
533
534
/**
 * Compute the semantic bounding box for a structure node.
 * Uses only direct MCIDs (non-recursive) to avoid including nested elements
 * like margin content that may be children of body paragraphs.
 *
 * @param node Structure node whose direct content should be enclosed.
 * @param mcidToTextItems Lookup from MCID to the text items emitted under it;
 *   MCIDs missing from the map contribute nothing.
 * @returns The union box of the node's direct text items, or null when there
 *   is no non-whitespace text.
 */
function computeStructBBox(
  node: StructTreeNode,
  mcidToTextItems: Map<string, MarkedTextItem[]>,
): BBox | null {
  const mcids = collectDirectMcids(node);
  const items = mcids.flatMap((id) => mcidToTextItems.get(id) ?? []);
  return unionBBox(items);
}
547
548
// ============================================================================
549
// Main Predicate
550
// ============================================================================
551
552
/**
 * Verify spatial positions of text in a rendered PDF using semantic structure.
 * Uses pdfjs-dist to access MCIDs and structure tree.
 *
 * The returned verifier works in stages: validate the assertions, load the PDF
 * and extract text items and structure trees for every page, resolve each
 * selector to a bounding box, then evaluate the relations. All problems are
 * collected and reported together in a single failed assertion.
 *
 * @param file Path of the PDF to inspect.
 * @param assertions Relations (or tag-only checks) that must hold.
 * @param noMatchAssertions Relations that must NOT hold.
 * @returns A `Verify` whose `verify` step fails when any problem was recorded.
 */
export const ensurePdfTextPositions = (
  file: string,
  assertions: PdfTextPositionAssertion[],
  noMatchAssertions?: PdfTextPositionAssertion[],
): Verify => {
  return {
    name: `Inspecting ${file} for text position assertions`,
    verify: async (_output: ExecuteOutput[]) => {
      const errors: string[] = [];

      // Internal normalized assertion type for processing
      type NormalizedAssertion = {
        subject: TextSelector;
        relation?: Relation;
        object?: TextSelector;
        tolerance: number;
        byMin?: number;
        byMax?: number;
      };

      // Validate and normalize an assertion using Zod.
      // Returns null (after recording an error) when no schema accepts it.
      const normalizeAssertion = (a: unknown, index: number): NormalizedAssertion | null => {
        // Try parsing as each type in order of specificity
        const directionalResult = DirectionalAssertionSchema.safeParse(a);
        if (directionalResult.success) {
          const d = directionalResult.data;
          return {
            subject: normalizeSelector(d.subject),
            relation: d.relation,
            object: normalizeSelector(d.object),
            tolerance: DEFAULT_ALIGNMENT_TOLERANCE,
            byMin: d.byMin,
            byMax: d.byMax,
          };
        }

        const alignmentResult = AlignmentAssertionSchema.safeParse(a);
        if (alignmentResult.success) {
          const al = alignmentResult.data;
          return {
            subject: normalizeSelector(al.subject),
            relation: al.relation,
            object: normalizeSelector(al.object),
            tolerance: al.tolerance ?? DEFAULT_ALIGNMENT_TOLERANCE,
          };
        }

        const tagOnlyResult = TagOnlyAssertionSchema.safeParse(a);
        if (tagOnlyResult.success) {
          return {
            subject: normalizeSelector(tagOnlyResult.data.subject),
            tolerance: DEFAULT_ALIGNMENT_TOLERANCE,
          };
        }

        // None of the schemas matched - report validation error
        const fullResult = PdfTextPositionAssertionSchema.safeParse(a);
        if (!fullResult.success) {
          const zodErrors = fullResult.error.errors.map((e) => `${e.path.join(".")}: ${e.message}`).join("; ");
          errors.push(`Assertion ${index + 1} is invalid: ${zodErrors}`);
        }
        return null;
      };

      // Stage 1: Parse and validate assertions
      const normalizedAssertions = assertions
        .map((a, i) => normalizeAssertion(a, i))
        .filter((a): a is NormalizedAssertion => a !== null);

      // Negative assertions are numbered after the positive ones in error messages
      const normalizedNoMatch = noMatchAssertions
        ?.map((a, i) => normalizeAssertion(a, i + assertions.length))
        .filter((a): a is NormalizedAssertion => a !== null);

      // Track search texts and their selectors (to know if Decoration role is requested)
      // Page role selectors are tracked separately since they don't need text search
      const searchTexts = new Set<string>();
      const textToSelectors = new Map<string, TextSelector[]>();
      const pageSelectors = new Map<number, TextSelector>(); // page number -> selector

      // Helper: check if selector is a Page role (no text search needed)
      const isPageRole = (sel: TextSelector): boolean => sel.role === "Page";

      // Helper: get unique key for a selector (for resolvedSelectors map)
      // Includes granularity since different granularity settings need different bbox computation
      const selectorKey = (sel: TextSelector): string => {
        if (isPageRole(sel)) {
          return `Page:${sel.page}`;
        }
        const base = sel.text ?? "";
        if (sel.granularity) {
          return `${base}@${sel.granularity}`;
        }
        return base;
      };

      // Track unique selectors by their full key (including granularity)
      const uniqueSelectors = new Map<string, TextSelector>();

      const addSelector = (sel: TextSelector) => {
        if (isPageRole(sel)) {
          if (sel.page === undefined) {
            errors.push(`Page role requires 'page' field to specify page number`);
            return;
          }
          pageSelectors.set(sel.page, sel);
        } else {
          if (!sel.text) {
            errors.push(`Selector requires 'text' field (unless role is "Page")`);
            return;
          }
          searchTexts.add(sel.text);
          const existing = textToSelectors.get(sel.text) ?? [];
          existing.push(sel);
          textToSelectors.set(sel.text, existing);
          // Also track by full key for resolution
          uniqueSelectors.set(selectorKey(sel), sel);
        }
      };

      for (const a of normalizedAssertions) {
        addSelector(a.subject);
        if (a.object) addSelector(a.object);
      }
      for (const a of normalizedNoMatch ?? []) {
        addSelector(a.subject);
        if (a.object) addSelector(a.object);
      }

      // Helper: check if any selector for this text is a Decoration (untagged content)
      const isDecoration = (text: string): boolean => {
        const selectors = textToSelectors.get(text) ?? [];
        return selectors.some((s) => s.role === "Decoration");
      };

      // Stage 2: Load PDF with pdfjs-dist
      // deno-lint-ignore no-explicit-any
      const pdfjsLib = await import("pdfjs-dist") as any;
      const buffer = await Deno.readFile(file);
      const loadingTask = pdfjsLib.getDocument({ data: buffer });

      // Stage 3 & 4: Extract content and structure tree per page
      const allTextItems: MarkedTextItem[] = [];
      const mcidToTextItems = new Map<string, MarkedTextItem[]>();
      const mcidToStructNode = new Map<string, StructTreeNode>();
      const structNodeToParent = new Map<StructTreeNode, StructTreeNode>();
      const pageDimensions = new Map<number, { width: number; height: number }>();

      try {
        const doc = await loadingTask.promise;

        for (let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
          const page = await doc.getPage(pageNum);
          const viewport = page.getViewport({ scale: 1.0 });

          // Store page dimensions for Page role
          pageDimensions.set(pageNum, { width: viewport.width, height: viewport.height });

          // Get text content with marked content
          const textContent = await page.getTextContent({
            includeMarkedContent: true,
          });

          const pageItems = extractMarkedTextItems(
            textContent.items,
            pageNum,
            viewport.height,
          );
          allTextItems.push(...pageItems);

          // Build MCID -> text items map
          for (const item of pageItems) {
            if (item.mcid) {
              const existing = mcidToTextItems.get(item.mcid) ?? [];
              existing.push(item);
              mcidToTextItems.set(item.mcid, existing);
            }
          }

          // Get structure tree and build MCID -> struct node map + parent map
          const structTree = await page.getStructTree();
          if (structTree) {
            const { mcidMap, parentMap } = buildMcidStructMap(structTree);
            for (const [k, v] of mcidMap) {
              mcidToStructNode.set(k, v);
            }
            for (const [k, v] of parentMap) {
              structNodeToParent.set(k, v);
            }
          }
        }
      } finally {
        // Everything needed below has been copied into the maps above, so the
        // document can be released here, also when loading or extraction threw.
        await loadingTask.destroy();
      }

      // Stage 5: Find text items for each search text (must be unique, unless Decoration)
      const foundTexts = new Map<string, MarkedTextItem>();
      const ambiguousTexts = new Set<string>();
      for (const searchText of searchTexts) {
        const matches = allTextItems.filter((t) => t.str.includes(searchText));
        if (matches.length === 1) {
          foundTexts.set(searchText, matches[0]);
        } else if (matches.length > 1) {
          // Decoration role (headers, footers) naturally repeat on each page - allow first match
          if (isDecoration(searchText)) {
            foundTexts.set(searchText, matches[0]);
          } else {
            ambiguousTexts.add(searchText);
            errors.push(
              `Text "${searchText}" is ambiguous - found ${matches.length} matches. Use a more specific search string.`,
            );
          }
        }
        // If matches.length === 0, we'll report "not found" later
      }

      // Stage 6 & 7: Resolve selectors to structure nodes and compute bboxes
      const resolvedSelectors = new Map<string, ResolvedSelector>();

      // First, resolve Page role selectors (no text search needed)
      for (const [pageNum, sel] of pageSelectors) {
        const dims = pageDimensions.get(pageNum);
        if (!dims) {
          errors.push(`Page ${pageNum} does not exist in PDF (has ${pageDimensions.size} pages)`);
          continue;
        }
        const key = selectorKey(sel);
        resolvedSelectors.set(key, {
          selector: sel,
          // Placeholder text item: a Page selector has no matching text
          textItem: { str: "", x: 0, y: 0, width: 0, height: 0, mcid: null, page: pageNum },
          structNode: null,
          bbox: {
            x: 0,
            y: 0,
            width: dims.width,
            height: dims.height,
            page: pageNum,
          },
        });
      }

      // Then, resolve text-based selectors (iterate by unique selector key to handle granularity)
      for (const [key, selector] of uniqueSelectors) {
        // addSelector only registers text-based selectors with a non-empty text
        const searchText = selector.text!;
        const textItem = foundTexts.get(searchText);
        if (!textItem) {
          // Don't report "not found" if we already reported "ambiguous"
          if (!ambiguousTexts.has(searchText)) {
            errors.push(`Text not found in PDF: "${searchText}"`);
          }
          continue;
        }

        let structNode: StructTreeNode | null = null;
        let bbox: BBox;

        // Decoration role: use text item bounds directly (for headers, footers, page decorations)
        if (isDecoration(searchText)) {
          bbox = {
            x: textItem.x,
            y: textItem.y,
            width: textItem.width,
            height: textItem.height,
            page: textItem.page,
          };
        } else if (!textItem.mcid) {
          errors.push(
            `Text "${searchText}" has no MCID - PDF may not be tagged. Use role: "Decoration" for untagged page elements like headers/footers.`,
          );
          continue;
        } else {
          structNode = mcidToStructNode.get(textItem.mcid) ?? null;

          // Check for granularity: aggregate bbox to ancestor with target role
          if (selector.granularity && structNode) {
            const ancestor = findAncestorWithRole(structNode, selector.granularity, structNodeToParent);
            if (ancestor) {
              // Collect ALL MCIDs recursively under that ancestor
              const allMcids = collectAllMcids(ancestor);
              const allItems = allMcids.flatMap((id) => mcidToTextItems.get(id) ?? []);
              const ancestorBBox = unionBBox(allItems);
              if (ancestorBBox) {
                bbox = ancestorBBox;
              } else {
                errors.push(
                  `Could not compute bbox for "${searchText}" with granularity "${selector.granularity}" - no content items found`,
                );
                continue;
              }
            } else {
              errors.push(
                `No ancestor with role "${selector.granularity}" found for "${searchText}"`,
              );
              continue;
            }
          } else {
            // Same-MCID approach: compute bbox from all text items sharing this MCID
            const mcidItems = mcidToTextItems.get(textItem.mcid);
            if (mcidItems && mcidItems.length > 0) {
              const mcidBBox = unionBBox(mcidItems);
              if (mcidBBox) {
                bbox = mcidBBox;
              } else {
                errors.push(
                  `Could not compute bbox for "${searchText}" - all text items in MCID are whitespace-only`,
                );
                continue;
              }
            } else {
              errors.push(
                `No text items found for MCID ${textItem.mcid} containing "${searchText}"`,
              );
              continue;
            }
          }
        }

        resolvedSelectors.set(key, {
          selector,
          textItem,
          structNode,
          bbox,
        });
      }

      // Validate role assertions (skip Page role since it's a virtual selector)
      // Roles are only compared when a structure node was resolved for the selector.
      for (const a of normalizedAssertions) {
        if (isPageRole(a.subject)) continue; // Page role has no struct node to validate

        const resolved = resolvedSelectors.get(selectorKey(a.subject));
        if (!resolved) continue;

        if (a.subject.role && resolved.structNode) {
          if (resolved.structNode.role !== a.subject.role) {
            errors.push(
              `Role mismatch for "${a.subject.text}": expected ${a.subject.role}, got ${resolved.structNode.role}`,
            );
          }
        }

        if (a.object && !isPageRole(a.object)) {
          const resolvedObj = resolvedSelectors.get(selectorKey(a.object));
          if (!resolvedObj) continue;

          if (a.object.role && resolvedObj.structNode) {
            if (resolvedObj.structNode.role !== a.object.role) {
              errors.push(
                `Role mismatch for "${a.object.text}": expected ${a.object.role}, got ${resolvedObj.structNode.role}`,
              );
            }
          }
        }
      }

      // Stage 8: Evaluate position assertions
      // Note: Zod validation in Stage 1 already handles:
      // - Unknown relations
      // - byMin/byMax with alignment relations (via .strict())
      // - byMin > byMax (via .refine())
      for (const a of normalizedAssertions) {
        // Tag-only assertions (no relation/object)
        if (!a.relation || !a.object) {
          continue; // Already validated in stage 6
        }

        const subjectKey = selectorKey(a.subject);
        const objectKey = selectorKey(a.object);
        const subjectResolved = resolvedSelectors.get(subjectKey);
        const objectResolved = resolvedSelectors.get(objectKey);

        if (!subjectResolved || !objectResolved) {
          continue; // Error already recorded
        }

        // Check same page
        if (subjectResolved.bbox.page !== objectResolved.bbox.page) {
          errors.push(
            `Cannot compare positions: "${subjectKey}" is on page ${subjectResolved.bbox.page}, ` +
              `"${objectKey}" is on page ${objectResolved.bbox.page}`,
          );
          continue;
        }

        // Evaluate relation based on type (Zod guarantees valid relation type)
        const isDirectional = directionalRelations.has(a.relation);
        if (isDirectional) {
          const result = evaluateDirectionalRelation(
            a.relation as DirectionalRelation,
            subjectResolved.bbox,
            objectResolved.bbox,
            a.subject.edge,
            a.object.edge,
            a.byMin,
            a.byMax,
          );

          if (!result.passed) {
            const distanceInfo = a.byMin !== undefined || a.byMax !== undefined
              ? ` Distance: ${result.distance.toFixed(1)}pt` +
                (a.byMin !== undefined ? ` (required >= ${a.byMin}pt)` : "") +
                (a.byMax !== undefined ? ` (required <= ${a.byMax}pt)` : "")
              : "";
            errors.push(
              `Position assertion failed (page ${subjectResolved.bbox.page}): "${subjectKey}" is NOT ${a.relation} "${objectKey}".` +
                ` Subject.${result.subjectEdge}=${result.subjectValue.toFixed(1)},` +
                ` Object.${result.objectEdge}=${result.objectValue.toFixed(1)}.${distanceInfo}` +
                (result.failureReason ? ` (${result.failureReason})` : ""),
            );
          }
        } else {
          // Alignment relation
          const result = evaluateAlignmentRelation(
            a.relation as AlignmentRelation,
            subjectResolved.bbox,
            objectResolved.bbox,
            a.tolerance,
            a.subject.edge,
            a.object.edge,
          );

          if (!result.passed) {
            errors.push(
              `Position assertion failed (page ${subjectResolved.bbox.page}): "${subjectKey}" is NOT ${a.relation} "${objectKey}".` +
                ` Subject.${result.subjectEdge}=${result.subjectValue.toFixed(1)},` +
                ` Object.${result.objectEdge}=${result.objectValue.toFixed(1)}.` +
                ` Difference: ${result.difference.toFixed(1)}pt (tolerance: ${a.tolerance}pt)`,
            );
          }
        }
      }

      // Evaluate negative assertions
      // Note: Zod validation already handled in Stage 1
      for (const a of normalizedNoMatch ?? []) {
        if (!a.relation || !a.object) continue;

        const subjectKey = selectorKey(a.subject);
        const objectKey = selectorKey(a.object);
        const subjectResolved = resolvedSelectors.get(subjectKey);
        const objectResolved = resolvedSelectors.get(objectKey);

        if (!subjectResolved || !objectResolved) {
          // Nothing to evaluate here; selector resolution above has already
          // recorded an error for any selector it could not resolve.
          continue;
        }

        if (subjectResolved.bbox.page !== objectResolved.bbox.page) {
          continue; // Assertion trivially doesn't hold
        }

        // Evaluate relation based on type (Zod guarantees valid relation type)
        const isDirectional = directionalRelations.has(a.relation);
        let passed: boolean;
        let resultInfo: string;

        if (isDirectional) {
          const result = evaluateDirectionalRelation(
            a.relation as DirectionalRelation,
            subjectResolved.bbox,
            objectResolved.bbox,
            a.subject.edge,
            a.object.edge,
            a.byMin,
            a.byMax,
          );
          passed = result.passed;
          resultInfo = `Subject.${result.subjectEdge}=${result.subjectValue.toFixed(1)}, ` +
            `Object.${result.objectEdge}=${result.objectValue.toFixed(1)}, ` +
            `distance=${result.distance.toFixed(1)}pt`;
        } else {
          const result = evaluateAlignmentRelation(
            a.relation as AlignmentRelation,
            subjectResolved.bbox,
            objectResolved.bbox,
            a.tolerance,
            a.subject.edge,
            a.object.edge,
          );
          passed = result.passed;
          resultInfo = `Subject.${result.subjectEdge}=${result.subjectValue.toFixed(1)}, ` +
            `Object.${result.objectEdge}=${result.objectValue.toFixed(1)}, ` +
            `difference=${result.difference.toFixed(1)}pt`;
        }

        // A negative assertion fails exactly when the relation holds
        if (passed) {
          errors.push(
            `Negative assertion failed (page ${subjectResolved.bbox.page}): "${subjectKey}" IS ${a.relation} "${objectKey}" (expected NOT to be). ` +
              resultInfo,
          );
        }
      }

      // Stage 9: Aggregate errors
      if (errors.length > 0) {
        assert(
          false,
          `PDF position assertions failed in ${file}:\n${errors.map((e, i) => ` ${i + 1}. ${e}`).join("\n")}`,
        );
      }
    },
  };
};
1050
1051