From 6b18677d806b557e5253370a647b2248b0bdc136 Mon Sep 17 00:00:00 2001 From: Yury Fedoseev Date: Wed, 13 May 2026 18:36:43 -0700 Subject: [PATCH 01/18] =?UTF-8?q?release:=20v0.1.2=20=E2=80=94=20round-tri?= =?UTF-8?q?p=20fidelity,=20IR=20layout,=20perf,=20embedded=20fonts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major work for the v0.1.2 release branch. Consolidates 32 commits of round-trip fidelity fixes, IR enrichment, performance, and test coverage into a single release commit. # Performance - xlsx: O(1) cell-style lookups via HashMap (replaces linear Vec scan in the hot per-cell formatting path) # Round-trip fidelity (PDF → office → PDF) - DOCX/PPTX/XLSX: preserve images, fonts, columns end-to-end - Alignment, spacing, footers, rules survive both directions - PPTX ThematicBreak encoded as a 30-char U+2500 marker run that downstream PDF renderers detect and re-emit as a real
- DOCX on an empty paragraph recovers as Element::ThematicBreak in IR # DOCX - Parse into IR FramePosition (layout-preserving paths like pdf_oxide's to_docx_bytes_layout) - Heading carries frame_position - Parse floating drawings and vector shapes (line/rect with stroke/fill RGB) - Preserve per-section page sizes; emit per-section on multi-section IR - Preserve through to IR's font_size_half_pt - Include header/footer text in to_markdown and to_ir - Embedded fonts under /word/fonts/ are parsed and exposed on DocxDocument.embedded_fonts; strip_embedded_font_filename recovers the original face name from font__. (fixes greedy alphabetic-trim regression) - parse_drawing decomposed into focused recursive helpers - Plumb paragraph alignment + inline image collection # PPTX - Real Title+Body slide layout instead of blank - Paginate slides (~250 cap) + synthesize Slide N heading on to_ir - Wrap shapes in positioned TextBox + parse slide background - Don't wrap zero-size shape positions in TextBox - Propagate slide size to per-section page_setup - Preserve run font sizes (sz attribute → font_size_hundredths_pt) - Parse paragraph algn + spcBef → IR alignment + space_before - Picture shapes carry embed_rid + bytes + format resolved via pre-built media map - PPTX font embedding under /ppt/fonts/ - Structured chart text extraction (parses nodes into per-chart text blocks rendered as ## Chart N in markdown) # XLSX - Per-worksheet page_setup round-trip via / with inch/mm/cm/paperSize parsing - Preserve font sizes through IR; emit prose XLSX as paragraphs when a 1-column sheet has long-text cells - Unique worksheet names in ir_to_xlsx - New numfmt module: built-in IDs 0-44 (general, fixed, commas, percent, currency, scientific, accounting) + custom format strings (multi-section, [Red] color directive, currency prefix, quoted literal suffix, scale-by-thousand) - Worksheet drawings: WorksheetPicture + WorksheetTextShape parsed from xl/drawings/, anchor coords in EMU - Embedded fonts 
under /xl/fonts/ # IR enrichment - New types: Shape, ShapeGeom, FramePosition, ParagraphAlignment variants (Distribute), block_default centralisation (ThematicBreak → "---" / "<hr>", PageBreak/Shape invisible in flow, TextBox recursively renders children) - New helpers: first_inline_font_size_pt, inline_to_element_block, build_nested_list (flat / 2-level / 3-level nesting) - Heading carries frame_position + alignment - Section.background_rgb propagated from PPTX slide background # Writers - DOCX: wire fontTable, heading styles, embed fonts, core props, dedup runs - PPTX: cap slides at ~250 (PowerPoint hard limit), autoFit, set_title_aligned - XLSX: split long paragraphs across cells; unique sheet names # Refactors - core: unified font embedding helper + cross-format font-size invariant (HalfPoint::from_word_sz / from_drawingml_sz) - ir: consolidate inline_to_element / build_nested_list / first_inline_font_size_pt (used by all 3 IR converters) - ir_render: extract block_default to centralise no-flow defaults (compiler-enforced exhaustiveness on new Element variants) # Tests - 98 new unit tests across the touched modules (core, xlsx/numfmt, xlsx/worksheet, docx/formatting, docx/mod, pptx/slide, ir, ir_render). All in-module #[cfg(test)] blocks; no new integration files. 
- Final state: 535/535 tests pass on default, --features parallel, --features mmap, --features parallel+mmap # Cleanup - cargo fmt clean - cargo clippy --workspace --all-targets -- -D warnings clean - 0 build warnings - maturin build (python feature) and wasm-pack build (wasm feature) both produce working packages; Python smoke verifies Document / EditableDocument / XlsxWriter / PptxWriter / create_from_markdown all functional --- CHANGELOG.md | 155 +++++ Cargo.lock | 6 +- Cargo.toml | 2 +- bench_rust/Cargo.toml | 2 +- crates/office_oxide_cli/Cargo.toml | 2 +- crates/office_oxide_mcp/Cargo.toml | 2 +- csharp/OfficeOxide/OfficeOxide.csproj | 2 +- go/cmd/install/main.go | 2 +- js/package-lock.json | 4 +- js/package.json | 2 +- pyproject.toml | 2 +- src/convert_doc.rs | 1 + src/convert_docx.rs | 494 +++++++++++++--- src/convert_ppt.rs | 1 + src/convert_pptx.rs | 221 ++++--- src/convert_xlsx.rs | 333 ++++++++++- src/core/core_properties.rs | 162 ++++++ src/core/embedded_fonts.rs | 119 ++++ src/core/mod.rs | 5 + src/core/opc.rs | 14 + src/core/relationships.rs | 10 + src/core/units.rs | 75 +++ src/create.rs | 790 +++++++++++++++++++++++--- src/docx/document.rs | 11 + src/docx/formatting.rs | 193 +++++++ src/docx/image.rs | 71 ++- src/docx/mod.rs | 757 ++++++++++++++++++++++-- src/docx/text.rs | 83 +++ src/docx/write.rs | 284 ++++++++- src/ir.rs | 363 ++++++++++++ src/ir_from_markdown.rs | 1 + src/ir_render.rs | 247 ++++++-- src/pptx/mod.rs | 97 +++- src/pptx/shape.rs | 25 +- src/pptx/slide.rs | 648 ++++++++++++++++++++- src/pptx/text.rs | 36 ++ src/pptx/write.rs | 320 ++++++++++- src/xlsx/mod.rs | 698 ++++++++++++++++++++++- src/xlsx/numfmt.rs | 478 ++++++++++++++++ src/xlsx/styles.rs | 46 +- src/xlsx/text.rs | 121 ++++ src/xlsx/worksheet.rs | 465 +++++++++++++++ src/xlsx/write.rs | 744 ++++++++++++++++++++++-- tests/office_integration.rs | 37 +- tests/write_integration.rs | 1 + wasm-pkg/package.json | 2 +- 46 files changed, 7601 insertions(+), 533 deletions(-) create 
mode 100644 src/core/core_properties.rs create mode 100644 src/core/embedded_fonts.rs create mode 100644 src/xlsx/numfmt.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 45b7aee..a98f0f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,161 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.1.2] - 2026-05-13 + +> Round-trip fidelity, IR layout features, embedded fonts, XLSX number formatting, and an O(1) style-lookup perf win. + +### Performance + +- **XLSX styles**: cell-format lookups now use a `HashMap`, replacing + the linear `Vec` scan in `format_cell_value` / `is_date_cell`. + Per-cell formatting becomes O(1); large styled workbooks parse + noticeably faster with no API change. + +### Round-trip fidelity (PDF → office → PDF) + +- **Alignment, spacing, footers, and horizontal rules** preserved end-to-end + through both `to_docx` and `to_pptx` writers. +- **Images, fonts, and column layouts** preserved across DOCX, PPTX, and + XLSX. Source-PDF font programs that previously registered as empty + subsets now embed correctly. +- **`Element::ThematicBreak`** encoded in PPTX as a centered 30-char run + of `U+2500 BOX DRAWINGS LIGHT HORIZONTAL`. Downstream PDF renderers + detect the all-U+2500 content and re-emit a real horizontal rule. +- **DOCX horizontal rules** recovered from the conventional encoding + (empty paragraph + ``) back into `Element::ThematicBreak`. + +### DOCX + +- **`` parsed into IR** as `FramePosition` (twips, page-anchored) + on both `Paragraph` and `Heading`. Used by layout-preserving paths + (e.g. pdf_oxide's `to_docx_bytes_layout`). +- **Floating drawings and vector shapes**: `` images plus + `` preset shapes (line, rect) with stroke/fill RGB and + stroke width round-trip through `DrawingInfo`. 
+- **Per-section page sizes** preserved through `to_ir`; multi-section IR + emits per-section ``. +- **`` preserved** through to IR's `font_size_half_pt`. +- **Headers and footers** now included in `to_markdown` and `to_ir` + (previously silently dropped). +- **Embedded fonts** under `/word/fonts/` exposed on + `DocxDocument.embedded_fonts`. `strip_embedded_font_filename` recovers + the original face name from `font__.` (fixes greedy + alphabetic-trim regression where `TeXGyreTermesX-` was returned + instead of `TeXGyreTermesX-Regular`). +- **`parse_drawing` decomposed** into focused recursive helpers + (`parse_inline_or_anchor_body`, `parse_anchor_position`, + `parse_shape_properties`, etc.) for readability. +- **Run-level `` plumbed into `TextSpan.font_name`**; + `` propagated to `Section.columns`. + +### PPTX + +- **Pagination**: each slide forces a `SectionBreakType::NextPage` so two + slides never share a rendered page. +- **Real Title+Body slide layout** emitted by the writer instead of a blank + layout, so PowerPoint shows placeholder hints in edit mode. +- **Slide background**: `` + parsed into `Slide.background_rgb` and propagated to `Section.background_rgb`. +- **Positioned text boxes**: shapes with explicit `` coordinates + wrap their content in `Element::TextBox` so downstream renderers can + place them at absolute EMU coordinates. Zero-size shapes skip the wrapper. +- **Slide size → page setup**: `` propagated to each + section's `PageSetup`. +- **Run font sizes preserved** via new `TextRun.font_size_hundredths_pt` + (parsed from ``). +- **Paragraph alignment** parsed from `` (all five + variants: `l` / `ctr` / `r` / `just` / `dist`) into + `TextParagraph.alignment`. **Space-before** parsed from + ``. +- **Title alignment propagation**: `find_title` returns text + first + paragraph's alignment, seeding both `Section.title` and the synthesised + level-2 Heading's alignment. 
+- **Picture shapes** now carry `embed_rid`, `data`, and `format` + (resolved via a pre-built media map at parse time, so the parallel + slide parser doesn't need the OPC reader). +- **Font embedding** under `/ppt/fonts/`. +- **Structured chart text extraction**: `` parts parsed into + per-chart text blocks rendered as `## Chart N` in markdown / search / + PDF without needing a graphical chart renderer. +- **Compaction**: consecutive H1/H2 cover-page headings fold into one + slide instead of fragmenting; long XLSX paragraphs split across cells + to respect ~32k char-per-cell limits. +- **Slide cap**: writer caps at ~250 slides (PowerPoint's hard limit). + +### XLSX + +- **Per-worksheet `page_setup`** round-trips via `` (inches) + and `` (paperWidth/paperHeight with mm/cm/in suffix or + `paperSize` enum 1–13). New `Worksheet.page_setup`. +- **`numfmt` module** (`crate::xlsx::numfmt`): built-in IDs 0–44 (general, + fixed, commas, percent, currency, scientific, accounting) and a + simplified custom format-string parser (multi-section, `[Red]` color + directives stripped, currency prefix from `[$€-407]`, quoted literal + suffix, percent and thousands separators). Applied to numeric cells + during `format_cell_value` and `write_cell_value_fast`. +- **Font sizes** preserved through IR; long-text single-column sheets + emit as paragraphs instead of a tall 1-column GFM table. +- **Unique worksheet names** in `ir_to_xlsx` (duplicates suffixed with + `_2`, `_3`, …). +- **Drawings**: `xl/drawings/drawingN.xml` parsed into + `Worksheet.images` (`WorksheetPicture` with EMU coords + bytes) and + `Worksheet.text_shapes` (`WorksheetTextShape` for layout-mode text + boxes from `to_xlsx_bytes_layout`). +- **Embedded fonts** under `/xl/fonts/`. + +### IR enrichment + +- **New types**: `Shape` (vector shape anchored at absolute EMU coords), + `ShapeGeom` (`Line`, `Rect`), `FramePosition` (twip-anchored frame). +- **`Heading`** gains `frame_position` + `alignment`. 
+- **`Section`** gains `background_rgb`. +- **`ParagraphAlignment`** gains the `Distribute` variant. +- **`Element::Shape(Shape)`** variant for vector shapes. +- **New helpers**: `first_inline_font_size_pt`, `inline_to_element_block`, + `build_nested_list` (flat / 2-level / 3-level recursion). +- **Centralized defaults** in `ir_render::block_default`: ThematicBreak + renders as `"---"` / `<hr>`; PageBreak / ColumnBreak / Shape are + invisible in flow; TextBox / Footnote / Endnote recursively render + children. Adding a new `Element` variant forces a compile error + in `block_default::default_plain` instead of silent fallthrough. + +### Core + +- **`crate::core::core_properties`**: shared `docProps/core.xml` generator + used by all three writers. Emits `dc:title`, `dc:creator`, `dc:subject`, + `dc:description`, `cp:keywords`, `dcterms:created`, `dcterms:modified` + from the IR's `Metadata`. Empty fields are omitted entirely. +- **`crate::core::embedded_fonts`**: unified font-embedding helper + (`write_embedded_fonts`, `sanitize_font_filename`). All three formats + share the layout `font__.ttf`. +- **`HalfPoint::from_word_sz` / `from_drawingml_sz` / `to_drawingml_sz` / + `from_points_rounded`**: cross-format font-size invariants + (DrawingML hundredths-of-a-point vs WML half-points). + +### Tests + +- **+98 unit tests** across the modules touched in this release: + `core::embedded_fonts`, `core::core_properties`, `core::units`, + `xlsx::numfmt`, `xlsx::worksheet`, `docx::formatting`, `docx::mod`, + `pptx::slide`, `ir`, `ir_render`. +- **535 / 535 tests pass** across default, `--features parallel`, + `--features mmap`, and `--features parallel,mmap` builds. +- `cargo fmt` clean. `cargo clippy --workspace --all-targets -- -D warnings` + clean. + +### Bindings + +- **Python wheel** (maturin, PyO3 0.28) builds cleanly and exposes + `Document`, `EditableDocument`, `XlsxWriter`, `PptxWriter`, + `OfficeOxideError`, `create_from_markdown`, `extract_text`, + `to_markdown`, `to_html`, `version`. +- **WASM** package (`wasm-pack build --target web/node/bundler`) builds + cleanly with `--features wasm`. +- **C#** package bumped to 0.1.2 (csproj only — no API changes). 
+ +[0.1.2]: https://github.com/yfedoseev/office_oxide/compare/v0.1.1...v0.1.2 + ## [0.1.1] - 2026-04-30 > Richer IR type system, DOCX writer output, improved PPTX/XLSX IR renderers, and writer APIs in all language bindings diff --git a/Cargo.lock b/Cargo.lock index a965793..bc0b74b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -286,7 +286,7 @@ dependencies = [ [[package]] name = "office_oxide" -version = "0.1.1" +version = "0.1.2" dependencies = [ "atoi_simd", "encoding_rs", @@ -307,7 +307,7 @@ dependencies = [ [[package]] name = "office_oxide_cli" -version = "0.1.1" +version = "0.1.2" dependencies = [ "clap", "office_oxide", @@ -316,7 +316,7 @@ dependencies = [ [[package]] name = "office_oxide_mcp" -version = "0.1.1" +version = "0.1.2" dependencies = [ "office_oxide", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 0f79594..a6f0723 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ match_like_matches_macro = "allow" manual_find = "allow" [workspace.package] -version = "0.1.1" +version = "0.1.2" edition = "2024" license = "MIT OR Apache-2.0" repository = "https://github.com/yfedoseev/office_oxide" diff --git a/bench_rust/Cargo.toml b/bench_rust/Cargo.toml index 9d5cc8c..6cbedb5 100644 --- a/bench_rust/Cargo.toml +++ b/bench_rust/Cargo.toml @@ -5,7 +5,7 @@ [package] name = "bench_rust" -version = "0.1.1" +version = "0.1.2" edition = "2021" [dependencies] diff --git a/crates/office_oxide_cli/Cargo.toml b/crates/office_oxide_cli/Cargo.toml index a2be3e8..d62f4a9 100644 --- a/crates/office_oxide_cli/Cargo.toml +++ b/crates/office_oxide_cli/Cargo.toml @@ -21,7 +21,7 @@ name = "office-oxide" path = "src/main.rs" [dependencies] -office_oxide = { version = "0.1.1", path = "../.." } +office_oxide = { version = "0.1.2", path = "../.." 
} clap = { version = "4", features = ["derive"] } serde_json = "1" diff --git a/crates/office_oxide_mcp/Cargo.toml b/crates/office_oxide_mcp/Cargo.toml index 8254454..1334937 100644 --- a/crates/office_oxide_mcp/Cargo.toml +++ b/crates/office_oxide_mcp/Cargo.toml @@ -21,7 +21,7 @@ name = "office-oxide-mcp" path = "src/main.rs" [dependencies] -office_oxide = { version = "0.1.1", path = "../.." } +office_oxide = { version = "0.1.2", path = "../.." } serde_json = "1" [package.metadata.binstall] diff --git a/csharp/OfficeOxide/OfficeOxide.csproj b/csharp/OfficeOxide/OfficeOxide.csproj index 302f567..21060fa 100644 --- a/csharp/OfficeOxide/OfficeOxide.csproj +++ b/csharp/OfficeOxide/OfficeOxide.csproj @@ -12,7 +12,7 @@ true OfficeOxide - 0.1.1 + 0.1.2 OfficeOxide Yury Fedoseev office_oxide diff --git a/go/cmd/install/main.go b/go/cmd/install/main.go index 4b790c0..7565f2e 100644 --- a/go/cmd/install/main.go +++ b/go/cmd/install/main.go @@ -32,7 +32,7 @@ import ( ) // Bumped in lockstep with the Rust crate. 
-const defaultVersion = "0.1.1" +const defaultVersion = "0.1.2" const releaseBase = "https://github.com/yfedoseev/office_oxide/releases/download" diff --git a/js/package-lock.json b/js/package-lock.json index 236fd78..95e1dc8 100644 --- a/js/package-lock.json +++ b/js/package-lock.json @@ -1,12 +1,12 @@ { "name": "office-oxide", - "version": "0.1.1", + "version": "0.1.2", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "office-oxide", - "version": "0.1.1", + "version": "0.1.2", "hasInstallScript": true, "license": "MIT OR Apache-2.0", "dependencies": { diff --git a/js/package.json b/js/package.json index e4d6389..e849d8b 100644 --- a/js/package.json +++ b/js/package.json @@ -1,6 +1,6 @@ { "name": "office-oxide", - "version": "0.1.1", + "version": "0.1.2", "description": "Fast Office document processing (DOCX/XLSX/PPTX/DOC/XLS/PPT) for Node.js — native bindings backed by the Rust office_oxide library.", "license": "MIT OR Apache-2.0", "author": "Yury Fedoseev", diff --git a/pyproject.toml b/pyproject.toml index 80d9b37..17d1dd6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "office-oxide" -version = "0.1.1" +version = "0.1.2" description = "The fastest Office document processing library for Python — DOCX, XLSX, PPTX, DOC, XLS, PPT" requires-python = ">=3.8" license = { text = "MIT OR Apache-2.0" } diff --git a/src/convert_doc.rs b/src/convert_doc.rs index dc72ce4..1661f4a 100644 --- a/src/convert_doc.rs +++ b/src/convert_doc.rs @@ -30,6 +30,7 @@ pub(crate) fn doc_to_ir(doc: &crate::doc::DocDocument) -> DocumentIR { bold: true, ..TextSpan::plain(trimmed) })], + ..Default::default() })); } else { elements.push(Element::Paragraph(Paragraph { diff --git a/src/convert_docx.rs b/src/convert_docx.rs index aee035a..6963279 100644 --- a/src/convert_docx.rs +++ b/src/convert_docx.rs @@ -2,38 +2,156 @@ use crate::format::DocumentFormat; use crate::ir::*; pub(crate) fn docx_to_ir(doc: 
&crate::docx::DocxDocument) -> DocumentIR { - let mut elements = Vec::new(); - convert_block_elements(&doc.body.elements, &mut elements, doc); - - // Extract title from first heading - let title = elements.iter().find_map(|e| { - if let Element::Heading(h) = e { - Some( - h.content - .iter() - .filter_map(|c| match c { - InlineContent::Text(span) => Some(span.text.as_str()), - _ => None, - }) - .collect::(), - ) + // Build per-section block-element windows from `body.section_breaks`. + // Each break index is the exclusive end of one section. Trailing + // elements after the last break go into a final section described + // by the body-level ``. + let breaks = &doc.body.section_breaks; + let total = doc.body.elements.len(); + + let mut windows: Vec<(usize, usize)> = Vec::new(); + let mut prev = 0; + for &b in breaks { + let end = b.min(total); + if end > prev { + windows.push((prev, end)); + } + prev = end; + } + if prev < total || windows.is_empty() { + windows.push((prev, total)); + } + + // Bring page-level headers and footers into the IR. Without this any + // downstream renderer (PDF, search, plain-text) loses non-body content + // like "My header" / "My footer" / page numbers / running titles. The + // split between header and footer uses the section ref counts (same + // approach as `to_markdown`). 
+ let n_header_refs: usize = doc.sections.iter().map(|s| s.header_refs.len()).sum(); + let mut header_blocks: Vec = Vec::new(); + let mut footer_blocks: Vec = Vec::new(); + for (idx, hf) in doc.headers_footers.iter().enumerate() { + let mut tmp: Vec = Vec::new(); + convert_block_elements(&hf.content, &mut tmp, doc); + if tmp + .iter() + .all(|e| matches!(e, Element::Paragraph(p) if p.content.is_empty())) + { + continue; + } + if idx < n_header_refs { + header_blocks.extend(tmp); } else { - None + footer_blocks.extend(tmp); } - }); + } + let header = if header_blocks.is_empty() { + None + } else { + Some(HeaderFooter { + content: header_blocks, + }) + }; + let footer = if footer_blocks.is_empty() { + None + } else { + Some(HeaderFooter { + content: footer_blocks, + }) + }; + + let mut ir_sections: Vec
= Vec::with_capacity(windows.len()); + let mut doc_title: Option = None; + + for (idx, (start, end)) in windows.iter().copied().enumerate() { + let mut elements = Vec::new(); + convert_block_elements(&doc.body.elements[start..end], &mut elements, doc); + + let title = elements.iter().find_map(|e| { + if let Element::Heading(h) = e { + Some( + h.content + .iter() + .filter_map(|c| match c { + InlineContent::Text(span) => Some(span.text.as_str()), + _ => None, + }) + .collect::(), + ) + } else { + None + } + }); + if doc_title.is_none() { + doc_title = title.clone(); + } + + let page_setup = doc.sections.get(idx).map(section_props_to_page_setup); + // Propagate the multi-column layout out of the source DOCX so + // the IR carries `Section.columns` for the renderer. Without + // this, a PDF→DOCX→PDF round-trip of a 2-column source paper + // (arxiv preprints etc.) collapsed back to a single column on + // read because the column count was dropped at this hop. + let columns = doc + .sections + .get(idx) + .and_then(|sp| sp.columns) + .filter(|n| *n >= 2) + .map(|n| ColumnLayout { + count: n, + ..Default::default() + }); + + let break_type = if idx == 0 { + SectionBreakType::Continuous + } else { + SectionBreakType::NextPage + }; + + ir_sections.push(Section { + title, + elements, + page_setup, + break_type, + columns, + header: header.clone(), + footer: footer.clone(), + ..Default::default() + }); + } DocumentIR { metadata: Metadata { format: DocumentFormat::Docx, - title: title.clone(), + title: doc_title, ..Default::default() }, - sections: vec![Section { - title, - elements, - ..Default::default() - }], + sections: ir_sections, + } +} + +fn section_props_to_page_setup(sp: &crate::docx::SectionProperties) -> PageSetup { + let mut ps = PageSetup::default(); + if let Some(size) = &sp.page_size { + ps.width_twips = size.width.0.max(0) as u32; + ps.height_twips = size.height.0.max(0) as u32; + if let Some(crate::docx::PageOrientation::Landscape) = size.orient { + 
ps.landscape = true; + } + } + if let Some(m) = &sp.margins { + ps.margin_top_twips = m.top.0.max(0) as u32; + ps.margin_bottom_twips = m.bottom.0.max(0) as u32; + ps.margin_left_twips = m.left.0.max(0) as u32; + ps.margin_right_twips = m.right.0.max(0) as u32; + if let Some(h) = m.header { + ps.header_distance_twips = h.0.max(0) as u32; + } + if let Some(f) = m.footer { + ps.footer_distance_twips = f.0.max(0) as u32; + } } + ps } fn convert_block_elements( @@ -58,15 +176,39 @@ fn convert_block_elements( // Check for heading let heading_level = resolve_heading_level(p, doc); + let alignment = paragraph_alignment(p); + + // Detect "horizontal rule" encoding: empty paragraph + // with a single bottom border. pdf_to_ir round-trips + // ThematicBreak through DOCX as exactly this shape; + // recover it here so the renderer draws a rule. + let inline = convert_paragraph_inline(p, doc); + let is_empty_para = inline.iter().all(|ic| { + matches!(ic, + crate::ir::InlineContent::Text(s) if s.text.is_empty() + ) + }); + let has_bottom_border = p + .properties + .as_ref() + .is_some_and(|pp| pp.has_bottom_border); + if is_empty_para && has_bottom_border { + elements.push(Element::ThematicBreak); + i += 1; + continue; + } if let Some(level) = heading_level { elements.push(Element::Heading(Heading { level: (level + 1).min(6), content: convert_paragraph_inline(p, doc), + frame_position: paragraph_frame_position(p), + alignment, })); } else { // Check for page break in runs let (before_break, has_break) = split_at_page_break(p, doc); + let frame_pos = paragraph_frame_position(p); if !before_break.is_empty() || !has_break { elements.push(Element::Paragraph(Paragraph { content: if before_break.is_empty() && !has_break { @@ -74,6 +216,8 @@ fn convert_block_elements( } else { before_break }, + frame_position: frame_pos, + alignment, ..Default::default() })); } @@ -81,6 +225,19 @@ fn convert_block_elements( elements.push(Element::ThematicBreak); } } + // Promote any floating drawings 
(anchored images, vector + // shapes) embedded in this paragraph to paragraph-sibling + // IR elements so the positional renderer can lay them out + // alongside the text frame. + collect_paragraph_floats(p, doc, elements); + // Promote inline drawings (`` wrapper) to + // paragraph-sibling Image elements as well. Without this + // every embedded raster image (e.g. logos, figures, the + // CFR federal seal) lost its bytes on the way through + // the IR — the inline-content model has no Image + // variant, so hoisting to a sibling Element is the + // only way to carry the bitmap forward. + collect_paragraph_inline_images(p, doc, elements); i += 1; }, crate::docx::BlockElement::Table(t) => { @@ -91,6 +248,191 @@ fn convert_block_elements( } } +/// Pull `` data out of a paragraph's properties into the IR +/// position type. Returns `None` if the paragraph isn't absolutely +/// positioned (the common case). +/// Walk a paragraph's runs and emit one IR `Element` for every +/// floating (anchored) drawing — both raster pictures and vector +/// `` shapes. Inline drawings are left for the inline-content +/// path. Promoting floats to paragraph siblings keeps the positional +/// renderer simple: it can iterate a flat element list and place each +/// one at its absolute coordinates. +/// Walk a paragraph's runs and emit one IR `Element::Image` for every +/// inline drawing (`` wrapper). Counterpart to +/// `collect_paragraph_floats` which handles ``-anchored +/// drawings. The IR's `InlineContent` enum has no Image variant so +/// inline drawings can't ride along with the rest of a paragraph's +/// runs; instead we hoist them as paragraph-sibling Element::Image +/// nodes right after the surrounding text paragraph. 
+fn collect_paragraph_inline_images( + p: &crate::docx::Paragraph, + doc: &crate::docx::DocxDocument, + out: &mut Vec, +) { + for pc in &p.content { + let runs: &[crate::docx::Run] = match pc { + crate::docx::ParagraphContent::Run(r) => std::slice::from_ref(r), + crate::docx::ParagraphContent::Hyperlink(hl) => &hl.runs, + }; + for run in runs { + for rc in &run.content { + if let crate::docx::RunContent::Drawing(d) = rc { + if !d.inline { + continue; + } + if d.relationship_id.is_empty() { + continue; + } + let (data, ext) = match doc.images.get(&d.relationship_id).cloned() { + Some(v) => v, + None => continue, + }; + let format = + ext.as_deref() + .and_then(|e| match e.to_ascii_lowercase().as_str() { + "png" => Some(ImageFormat::Png), + "jpg" | "jpeg" => Some(ImageFormat::Jpeg), + "gif" => Some(ImageFormat::Gif), + _ => None, + }); + out.push(Element::Image(Image { + alt_text: d.description.clone(), + data: Some(data), + format, + display_width_emu: Some(d.width.0.max(0) as u64), + display_height_emu: Some(d.height.0.max(0) as u64), + positioning: ImagePositioning::Inline, + ..Default::default() + })); + } + } + } + } +} + +fn collect_paragraph_floats( + p: &crate::docx::Paragraph, + doc: &crate::docx::DocxDocument, + out: &mut Vec, +) { + for pc in &p.content { + let runs: &[crate::docx::Run] = match pc { + crate::docx::ParagraphContent::Run(r) => std::slice::from_ref(r), + crate::docx::ParagraphContent::Hyperlink(hl) => &hl.runs, + }; + for run in runs { + for rc in &run.content { + if let crate::docx::RunContent::Drawing(d) = rc { + if d.inline { + continue; + } + if let Some(el) = drawing_to_float_element(d, doc) { + out.push(el); + } + } + } + } + } +} + +fn drawing_to_float_element( + d: &crate::docx::DrawingInfo, + doc: &crate::docx::DocxDocument, +) -> Option { + use crate::docx::{AnchorFrame, ShapeKind}; + + let pos = d.anchor_position?; + let to_ir_anchor = |f: AnchorFrame| match f { + AnchorFrame::Page => FloatAnchor::Page, + AnchorFrame::Margin => 
FloatAnchor::Margin, + AnchorFrame::Column => FloatAnchor::Column, + AnchorFrame::Paragraph => FloatAnchor::Paragraph, + AnchorFrame::Line | AnchorFrame::Character => FloatAnchor::Page, + }; + let h_anchor = to_ir_anchor(pos.h_relative_from); + let v_anchor = to_ir_anchor(pos.v_relative_from); + let width_emu = d.width.0.max(0) as u64; + let height_emu = d.height.0.max(0) as u64; + + // Vector shape takes precedence: a `` with `prstGeom` + // never carries a ``, so the relationship_id is empty. + if let Some(shape) = &d.shape { + let kind = match shape.kind { + ShapeKind::Line => ShapeGeom::Line, + ShapeKind::Rect => ShapeGeom::Rect, + }; + return Some(Element::Shape(Shape { + kind, + x_emu: pos.x_emu, + y_emu: pos.y_emu, + width_emu, + height_emu, + h_anchor, + v_anchor, + stroke_rgb: shape.stroke_rgb.map(|(r, g, b)| [r, g, b]), + fill_rgb: shape.fill_rgb.map(|(r, g, b)| [r, g, b]), + stroke_w_emu: shape.stroke_w_emu, + })); + } + + if d.relationship_id.is_empty() { + return None; + } + let (data, ext) = doc.images.get(&d.relationship_id).cloned()?; + let format = ext.as_deref().and_then(|e| match e { + "png" => Some(ImageFormat::Png), + "jpg" | "jpeg" => Some(ImageFormat::Jpeg), + _ => None, + }); + Some(Element::Image(Image { + alt_text: d.description.clone(), + data: Some(data), + format, + display_width_emu: Some(width_emu), + display_height_emu: Some(height_emu), + positioning: ImagePositioning::Floating(FloatingImage { + x_emu: pos.x_emu, + y_emu: pos.y_emu, + width_emu, + height_emu, + h_anchor, + v_anchor, + text_wrap: TextWrap::default(), + allow_overlap: true, + }), + ..Default::default() + })) +} + +/// Translate a paragraph's `` justification into the IR's +/// `ParagraphAlignment`. `Left` (and `Both`/`Distribute`) collapse +/// to `None` so the renderer uses default left-alignment without +/// emitting an explicit override. 
+fn paragraph_alignment(p: &crate::docx::Paragraph) -> Option { + let jc = p + .properties + .as_ref() + .and_then(|pp| pp.justification.as_ref())?; + match jc { + crate::docx::Justification::Center => Some(ParagraphAlignment::Center), + crate::docx::Justification::Right => Some(ParagraphAlignment::Right), + crate::docx::Justification::Both => Some(ParagraphAlignment::Justify), + crate::docx::Justification::Distribute => Some(ParagraphAlignment::Distribute), + crate::docx::Justification::Left => None, + } +} + +fn paragraph_frame_position(p: &crate::docx::Paragraph) -> Option { + p.properties.as_ref().and_then(|props| { + props.frame_position.as_ref().map(|f| FramePosition { + x_twips: f.x_twips, + y_twips: f.y_twips, + width_twips: f.width_twips, + height_twips: f.height_twips, + }) + }) +} + fn resolve_heading_level( p: &crate::docx::Paragraph, doc: &crate::docx::DocxDocument, @@ -150,6 +492,35 @@ fn convert_run( .as_ref() .and_then(|rp| rp.strike.or(rp.dstrike)) .unwrap_or(false); + // `` is already in half-points; IR uses the + // same encoding. See `crate::core::units::HalfPoint::from_word_sz` + // for the cross-format invariant (also: PPTX hundredths-pt, + // XLSX points-as-f32 must convert here). + let font_size_half_pt = run.properties.as_ref().and_then(|rp| { + rp.font_size + .map(|hp| crate::core::units::HalfPoint::from_word_sz(hp.0).0) + }); + // `` carries the run's face name. Without + // forwarding it onto `TextSpan.font_name`, the IR→PDF renderer + // falls back to the page builder's default font (Helvetica) and + // every PDF→DOCX→PDF round-trip loses every typeface — even when + // the DOCX writer correctly embedded the source-PDF font program + // under `word/fonts/`. + let font_name = run.properties.as_ref().and_then(|rp| rp.font_name.clone()); + // Propagate `` so PDF→DOCX→PDF round-trips + // preserve coloured text (red "0" in `pdfs_pdfium/text_color.pdf` + // and the like). 
Theme / system / auto colours fall through to + // the renderer default for now — resolving them properly needs the + // document's `theme.xml`, which the current convert path doesn't + // thread in. + let text_color = run + .properties + .as_ref() + .and_then(|rp| rp.color.as_ref()) + .and_then(|c| match c { + crate::core::theme::ColorRef::Rgb(rgb) => Some(rgb.0), + _ => None, + }); for rc in &run.content { match rc { @@ -160,6 +531,9 @@ fn convert_run( italic, strikethrough: strike, hyperlink: hyperlink_url.map(|s| s.to_string()), + font_size_half_pt, + font_name: font_name.clone(), + color: text_color, ..Default::default() })); }, @@ -175,10 +549,16 @@ fn convert_run( content.push(InlineContent::Text(TextSpan::plain("\t"))); }, crate::docx::RunContent::Drawing(drawing) => { - // Emit as a separate image element — but we're in inline context, - // so we just note the alt text inline - if drawing.description.is_some() { - content.push(InlineContent::Text(TextSpan::plain(""))); + // Inline drawings handled at the paragraph level via + // `collect_paragraph_inline_images`. The inline-content + // model has no Image variant; hoisting here would + // require splitting paragraphs around each drawing, + // which loses spans. Just record alt text so the + // run's surrounding text doesn't lose semantic continuity. 
+ if let Some(alt) = drawing.description.clone() { + if !alt.is_empty() { + content.push(InlineContent::Text(TextSpan::plain(alt))); + } } }, } @@ -263,65 +643,7 @@ fn convert_list_group( } // Build nested list structure from flat (ilvl, content) pairs - Element::List(build_nested_list(is_ordered, &items, 0)) -} - -fn inline_to_element(content: Vec) -> Vec { - if content.is_empty() { - Vec::new() - } else { - vec![Element::Paragraph(Paragraph { - content, - ..Default::default() - })] - } -} - -fn build_nested_list(ordered: bool, items: &[(u8, Vec)], base_level: u8) -> List { - let mut list_items = Vec::new(); - let mut idx = 0; - - while idx < items.len() { - let (ilvl, content) = &items[idx]; - if *ilvl == base_level { - // Collect any nested items immediately following at deeper levels - let mut nested = None; - let nested_start = idx + 1; - let mut nested_end = nested_start; - while nested_end < items.len() && items[nested_end].0 > base_level { - nested_end += 1; - } - if nested_end > nested_start { - nested = Some(build_nested_list( - ordered, - &items[nested_start..nested_end], - base_level + 1, - )); - } - list_items.push(ListItem { - content: inline_to_element(content.clone()), - nested, - }); - idx = if nested_end > nested_start { - nested_end - } else { - idx + 1 - }; - } else { - // Item at unexpected level — just add it flat - list_items.push(ListItem { - content: inline_to_element(content.clone()), - nested: None, - }); - idx += 1; - } - } - - List { - ordered, - items: list_items, - ..Default::default() - } + Element::List(crate::ir::build_nested_list(is_ordered, &items, 0)) } // --------------------------------------------------------------------------- diff --git a/src/convert_ppt.rs b/src/convert_ppt.rs index 77d432d..83fec2d 100644 --- a/src/convert_ppt.rs +++ b/src/convert_ppt.rs @@ -26,6 +26,7 @@ pub(crate) fn ppt_to_ir(doc: &crate::ppt::PptDocument) -> DocumentIR { bold: true, ..TextSpan::plain(text) })], + ..Default::default() })); }, 
TextType::Body | TextType::HalfBody | TextType::QuarterBody => { diff --git a/src/convert_pptx.rs b/src/convert_pptx.rs index 937408e..96ec625 100644 --- a/src/convert_pptx.rs +++ b/src/convert_pptx.rs @@ -2,12 +2,42 @@ use crate::format::DocumentFormat; use crate::ir::*; pub(crate) fn pptx_to_ir(doc: &crate::pptx::PptxDocument) -> DocumentIR { + // Slide size sits at presentation level — every slide in the + // deck shares it. EMU → twips is /635 (914400 EMU per inch, + // 1440 twips per inch → 914400/1440 = 635). + let page_setup = doc.presentation.slide_size.as_ref().map(|sz| PageSetup { + width_twips: (sz.cx.max(0) / 635) as u32, + height_twips: (sz.cy.max(0) / 635) as u32, + landscape: sz.cx > sz.cy, + ..Default::default() + }); + let mut sections = Vec::new(); - for slide in &doc.slides { - let title = find_title_text(&slide.shapes); + for slide in doc.slides.iter() { + let title_with_algn = find_title(&slide.shapes); + let title = title_with_algn.as_ref().map(|(t, _)| t.clone()); + let title_alignment = title_with_algn.as_ref().and_then(|(_, a)| a.clone()); let mut elements = Vec::new(); + // Lead each slide with the title placeholder text as a + // heading so it has visible demarcation in the rendered + // PDF/HTML output. When the slide has no title we used to + // synthesise "Slide N" — that was useful for markdown anchors + // but pure visual noise in paginated output, where every + // slide already starts on its own page via the NextPage break. + // Worse, the synthesised heading rendered as 20 pt bold and + // contributed ~50 pt of fixed vertical overhead per section, + // which inflated PDF→PPTX→PDF round-trip page counts. 
+ if let Some(ref t) = title { + elements.push(Element::Heading(Heading { + level: 2, + content: vec![InlineContent::Text(TextSpan::plain(t.clone()))], + alignment: title_alignment.clone(), + ..Default::default() + })); + } + // Sort shapes spatially let mut shape_entries: Vec<(Option<&crate::pptx::ShapePosition>, &crate::pptx::Shape)> = Vec::new(); @@ -18,6 +48,11 @@ pub(crate) fn pptx_to_ir(doc: &crate::pptx::PptxDocument) -> DocumentIR { convert_shape(shape, &mut elements); } + // Propagate slide background colour to the section so the + // PDF renderer can paint a full-slide rectangle before laying + // down shapes. + let background_rgb = slide.background_rgb; + // Add notes as paragraphs at end if let Some(ref notes) = slide.notes { if !notes.is_empty() { @@ -28,9 +63,21 @@ pub(crate) fn pptx_to_ir(doc: &crate::pptx::PptxDocument) -> DocumentIR { } } + // Each PPTX slide is its own page when rendered to PDF or + // any paginated format. Default `Continuous` would let two + // slides share a page, which is wrong for slide content. + let break_type = if sections.is_empty() { + SectionBreakType::Continuous + } else { + SectionBreakType::NextPage + }; + sections.push(Section { title: title.clone(), elements, + break_type, + page_setup: page_setup.clone(), + background_rgb, ..Default::default() }); } @@ -86,7 +133,10 @@ fn is_title_placeholder(ph_type: Option<&str>) -> bool { matches!(ph_type, Some("title" | "ctrTitle")) } -fn find_title_text(shapes: &[crate::pptx::Shape]) -> Option { +/// Locate the title placeholder and return its text together with the +/// alignment of the first paragraph. Used by `pptx_to_ir` to seed both +/// `Section.title` and the synthesised level-2 Heading's alignment. 
+fn find_title(shapes: &[crate::pptx::Shape]) -> Option<(String, Option)> { for shape in shapes { match shape { crate::pptx::Shape::AutoShape(auto) @@ -98,13 +148,14 @@ fn find_title_text(shapes: &[crate::pptx::Shape]) -> Option { if let Some(ref tb) = auto.text_body { let text = plain_text_from_body(tb); if !text.is_empty() { - return Some(text); + let algn = tb.paragraphs.first().and_then(|p| p.alignment.clone()); + return Some((text, algn)); } } }, crate::pptx::Shape::Group(grp) => { - if let Some(title) = find_title_text(&grp.children) { - return Some(title); + if let Some(t) = find_title(&grp.children) { + return Some(t); } }, _ => {}, @@ -142,14 +193,36 @@ fn convert_shape(shape: &crate::pptx::Shape, elements: &mut Vec) { } if let Some(ref tb) = auto.text_body { - convert_text_body(tb, elements); + let mut inner = Vec::new(); + convert_text_body(tb, &mut inner); + if inner.is_empty() { + return; + } + push_positional_textbox(elements, inner, auto.position.as_ref()); } }, crate::pptx::Shape::Picture(pic) => { - elements.push(Element::Image(Image { + // Carry the resolved media bytes through so the PDF renderer + // (`render_pptx_textbox_content`) can paint the actual + // picture at its shape rectangle. `embed_rid` is preserved + // as alt-text fallback only when the relationship couldn't + // be resolved — we still want a placeholder element so the + // shape's position survives in plain-text / markdown output. 
+ let format = pic.format.as_deref().and_then(image_format_from_ext); + let (display_w, display_h) = pic + .position + .as_ref() + .map(|p| (Some(p.cx.max(0) as u64), Some(p.cy.max(0) as u64))) + .unwrap_or((None, None)); + let img_el = Element::Image(Image { alt_text: pic.alt_text.clone(), + data: pic.data.clone(), + format, + display_width_emu: display_w, + display_height_emu: display_h, ..Default::default() - })); + }); + push_positional_textbox(elements, vec![img_el], pic.position.as_ref()); }, crate::pptx::Shape::Group(grp) => { for child in &grp.children { @@ -158,13 +231,49 @@ fn convert_shape(shape: &crate::pptx::Shape, elements: &mut Vec) { }, crate::pptx::Shape::GraphicFrame(gf) => { if let crate::pptx::GraphicContent::Table(ref tbl) = gf.content { - elements.push(convert_pptx_table(tbl)); + let table_el = convert_pptx_table(tbl); + push_positional_textbox(elements, vec![table_el], gf.position.as_ref()); } }, crate::pptx::Shape::Connector(_) => {}, } } +/// Wrap a shape's converted IR content in a `TextBox` carrying its +/// absolute `(x, y, cx, cy)` EMU rectangle. The PPTX renderer uses +/// these coordinates to paint each shape at its source position +/// instead of flowing them as a single long page. +/// +/// When the source shape has no `` (rare — placeholders that +/// inherit geometry from a slide layout), the inner content is pushed +/// as flow elements so plain-text / markdown rendering still sees it. +fn push_positional_textbox( + elements: &mut Vec, + content: Vec, + position: Option<&crate::pptx::ShapePosition>, +) { + // Wrap in `Element::TextBox` only when the source shape carried a + // *real* ``. Placeholders that inherit geometry from the + // slide layout parse as `ShapePosition { x: 0, y: 0, cx: 0, cy: 0 }` + // — wrapping those in TextBox tells the renderer "place this 0×0 + // rectangle at (0, 0)" which collapses every paragraph onto the + // top-left corner. 
Treat any zero-size rectangle (cx or cy not positive) as "no position" so the + // content flows normally instead. + let real_position = position.filter(|p| p.cx > 0 && p.cy > 0); + if let Some(pos) = real_position { + elements.push(Element::TextBox(TextBox { + content, + x_emu: Some(pos.x), + y_emu: Some(pos.y), + width_emu: Some(pos.cx.max(0) as u64), + height_emu: Some(pos.cy.max(0) as u64), + ..Default::default() + })); + } else { + elements.extend(content); + } +} + fn convert_text_body(body: &crate::pptx::TextBody, elements: &mut Vec) { // Check if any paragraph has level > 0 — treat as list let has_levels = body.paragraphs.iter().any(|p| p.level > 0); @@ -175,13 +284,23 @@ fn convert_text_body(body: &crate::pptx::TextBody, elements: &mut Vec) for para in &body.paragraphs { items.push((para.level as u8, convert_text_paragraph_inline(para))); } - elements.push(Element::List(build_nested_list(false, &items, 0))); + elements.push(Element::List(crate::ir::build_nested_list(false, &items, 0))); } else { for para in &body.paragraphs { let content = convert_text_paragraph_inline(para); - if !content.is_empty() { + // Honour space_before from PPTX so spacer paragraphs + // emitted by pdf_to_ir round-trip with their full vertical + // gap. Convert hundredths-of-pt → twips: hundredths * 0.2 + // (1pt = 20 twips, so pt*100 → twips = (pt*100)/5). + let space_before_twips = para.space_before_hundredths_pt.map(|h| h.div_ceil(5)); + // Empty paragraphs serve as vertical spacers — keep them + // in the IR even when content is empty so the renderer + // can advance the cursor by the requested amount. 
+ if !content.is_empty() || space_before_twips.is_some() { elements.push(Element::Paragraph(Paragraph { content, + alignment: para.alignment.clone(), + space_before_twips, ..Default::default() })); } @@ -199,12 +318,19 @@ fn convert_text_paragraph_inline(para: &crate::pptx::TextParagraph) -> Vec Some(url.clone()), crate::pptx::HyperlinkTarget::Internal(_) => None, }); + let font_size_half_pt = run.font_size_hundredths_pt.map(|hp| { + crate::core::units::HalfPoint::from_drawingml_sz(hp) + .0 + .max(1) + }); content.push(InlineContent::Text(TextSpan { text: run.text.clone(), bold: run.bold.unwrap_or(false), italic: run.italic.unwrap_or(false), strikethrough: run.strikethrough, hyperlink, + font_size_half_pt, + color: run.color_rgb, ..Default::default() })); } @@ -222,59 +348,6 @@ fn convert_text_paragraph_inline(para: &crate::pptx::TextParagraph) -> Vec) -> Vec { - if content.is_empty() { - Vec::new() - } else { - vec![Element::Paragraph(Paragraph { - content, - ..Default::default() - })] - } -} - -fn build_nested_list(ordered: bool, items: &[(u8, Vec)], base_level: u8) -> List { - let mut list_items = Vec::new(); - let mut idx = 0; - - while idx < items.len() { - let (level, content) = &items[idx]; - if *level <= base_level { - let nested_start = idx + 1; - let mut nested_end = nested_start; - while nested_end < items.len() && items[nested_end].0 > base_level { - nested_end += 1; - } - let nested = if nested_end > nested_start { - Some(build_nested_list(ordered, &items[nested_start..nested_end], base_level + 1)) - } else { - None - }; - list_items.push(ListItem { - content: inline_to_element(content.clone()), - nested, - }); - idx = if nested_end > nested_start { - nested_end - } else { - idx + 1 - }; - } else { - list_items.push(ListItem { - content: inline_to_element(content.clone()), - nested: None, - }); - idx += 1; - } - } - - List { - ordered, - items: list_items, - ..Default::default() - } -} - fn convert_pptx_table(table: &crate::pptx::Table) -> Element { 
let mut ir_rows = Vec::new(); @@ -294,6 +367,7 @@ fn convert_pptx_table(table: &crate::pptx::Table) -> Element { if !content.is_empty() { cell_elements.push(Element::Paragraph(Paragraph { content, + alignment: para.alignment.clone(), ..Default::default() })); } @@ -320,3 +394,20 @@ fn convert_pptx_table(table: &crate::pptx::Table) -> Element { ..Default::default() }) } + +/// Map a lowercase file extension (`"png"`, `"jpeg"`, `"emf"`, …) to +/// the matching `ImageFormat` variant. Used by `convert_shape` when +/// converting a parsed PPTX `` whose underlying media part the +/// PPTX reader resolved into bytes + extension. +fn image_format_from_ext(ext: &str) -> Option { + match ext { + "png" => Some(ImageFormat::Png), + "jpg" | "jpeg" => Some(ImageFormat::Jpeg), + "gif" => Some(ImageFormat::Gif), + "tif" | "tiff" => Some(ImageFormat::Tiff), + "bmp" => Some(ImageFormat::Bmp), + "emf" => Some(ImageFormat::Emf), + "wmf" => Some(ImageFormat::Wmf), + _ => None, + } +} diff --git a/src/convert_xlsx.rs b/src/convert_xlsx.rs index 3c95d9e..5869963 100644 --- a/src/convert_xlsx.rs +++ b/src/convert_xlsx.rs @@ -1,50 +1,296 @@ use crate::format::DocumentFormat; use crate::ir::*; +/// Parse a 6-char hex colour like `"FFA500"` into `[r, g, b]`. +fn parse_hex_rgb(s: &str) -> Option<[u8; 3]> { + let s = s.trim_start_matches('#'); + if s.len() != 6 { + return None; + } + let r = u8::from_str_radix(&s[0..2], 16).ok()?; + let g = u8::from_str_radix(&s[2..4], 16).ok()?; + let b = u8::from_str_radix(&s[4..6], 16).ok()?; + Some([r, g, b]) +} + pub(crate) fn xlsx_to_ir(doc: &crate::xlsx::XlsxDocument) -> DocumentIR { - let mut sections = Vec::new(); + // Pre-compute date style indices once — avoids re-scanning format strings per cell. 
+ let date_indices = doc.date_style_indices(); - for ws in &doc.worksheets { - let mut rows = Vec::new(); + // Single String buffer reused across all cells — clear() keeps the heap + // allocation; std::mem::take() moves it into TextSpan for non-empty cells. + let mut buf = String::new(); - for (row_idx, row) in ws.rows.iter().enumerate() { - let mut cells = Vec::new(); + let mut sections = Vec::new(); + + for (ws_idx, ws) in doc.worksheets.iter().enumerate() { + // First pass: parse all rows into (cells, style-indices). + // Each entry is a Vec of (text, style_index_for_first_non_empty_cell) + // — we keep style index for the first non-empty cell in each row + // because that's what carries the font info we want to recover. + let mut parsed_rows: Vec)>> = Vec::with_capacity(ws.rows.len()); + for row in &ws.rows { + let mut cells: Vec<(String, Option)> = Vec::with_capacity(row.cells.len()); for cell in &row.cells { - let text = doc.format_cell_value(cell); - cells.push(TableCell { - content: vec![Element::Paragraph(Paragraph { - content: if text.is_empty() { - Vec::new() - } else { - vec![InlineContent::Text(TextSpan::plain(text))] - }, - ..Default::default() - })], - col_span: 1, - row_span: 1, - ..Default::default() - }); + buf.clear(); + doc.write_cell_value_fast(cell, &mut buf, &date_indices); + let text = if buf.is_empty() { + String::new() + } else { + std::mem::take(&mut buf) + }; + cells.push((text, cell.style_index)); } + // Drop trailing empty cells. + while cells.last().is_some_and(|(t, _)| t.is_empty()) { + cells.pop(); + } + parsed_rows.push(cells); + } - rows.push(TableRow { - cells, - is_header: row_idx == 0, + // Decide row layout: a worksheet whose rows mostly have at most one + // non-empty cell is "document style" — flowing text laid out one + // paragraph per row. Render those rows as Paragraphs (not as a + // 1-column Table) so the downstream PDF renderer flows them like + // body text and honours per-paragraph font sizes. 
+ // + // We choose Paragraph mode when ≥80 % of non-empty rows have ≤1 + // non-empty cell. That's permissive enough to handle real + // worksheets that mostly hold prose but still emit a Table when a + // genuine grid is present. + let mut prose_score = 0usize; + let mut nonempty_rows = 0usize; + for cells in &parsed_rows { + let nc = cells.iter().filter(|(t, _)| !t.is_empty()).count(); + if nc == 0 { + continue; + } + nonempty_rows += 1; + if nc <= 1 { + prose_score += 1; + } + } + let prose_mode = nonempty_rows >= 3 && prose_score * 100 >= nonempty_rows * 80; + + // Materialise any pictures or text shapes anchored on the + // worksheet as positional IR elements so they survive the + // round-trip back to PDF. Pictures wrap an `Element::Image` + // in an `Element::TextBox`; text shapes wrap a styled + // paragraph the same way. The flow renderer then paints + // both at their absolute EMU rectangle (see + // `render_text_box`). + let mut image_elements: Vec = + Vec::with_capacity(ws.images.len() + ws.text_shapes.len()); + for ts in &ws.text_shapes { + let mut span = TextSpan::plain(ts.text.clone()); + if let Some(sz) = ts.font_size_pt { + span.font_size_half_pt = + Some(crate::core::units::HalfPoint::from_points_rounded(sz as f64).0); + } + if ts.bold { + span.bold = true; + } + if ts.italic { + span.italic = true; + } + if let Some(ref hex) = ts.color_hex { + if let Some(rgb) = parse_hex_rgb(hex) { + span.color = Some(rgb); + } + } + if let Some(ref f) = ts.font_name { + span.font_name = Some(f.clone()); + } + let para = Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(span)], ..Default::default() }); + image_elements.push(Element::TextBox(TextBox { + content: vec![para], + x_emu: Some(ts.x_emu), + y_emu: Some(ts.y_emu), + width_emu: Some(ts.cx_emu.max(0) as u64), + height_emu: Some(ts.cy_emu.max(0) as u64), + ..Default::default() + })); } + for pic in &ws.images { + let format = image_format_from_ext(&pic.format); + let img = Image { + 
alt_text: pic.alt_text.clone(), + data: Some(pic.data.clone()), + format, + display_width_emu: Some(pic.cx_emu.max(0) as u64), + display_height_emu: Some(pic.cy_emu.max(0) as u64), + ..Default::default() + }; + // Wrap in TextBox so downstream renderers can paint at the + // exact (x_emu, y_emu) anchor instead of inline-after-text. + // When the source drawing was a cell-anchor and we + // couldn't resolve EMU coords (cx == 0), drop the wrap so + // the image flows inline at the section start. + if pic.cx_emu > 0 && pic.cy_emu > 0 { + image_elements.push(Element::TextBox(TextBox { + content: vec![Element::Image(img)], + x_emu: Some(pic.x_emu), + y_emu: Some(pic.y_emu), + width_emu: Some(pic.cx_emu.max(0) as u64), + height_emu: Some(pic.cy_emu.max(0) as u64), + ..Default::default() + })); + } else { + image_elements.push(Element::Image(img)); + } + } + + let elements = if prose_mode { + // Each row → one Paragraph. Pull font size from cell style if the + // worksheet's stylesheet is loaded. Skip empty rows entirely + // (they were just visual separators). + let mut out: Vec = Vec::new(); + for cells in &parsed_rows { + // Find the first non-empty cell. + let Some((text, style_idx)) = cells.iter().find(|(t, _)| !t.is_empty()).cloned() + else { + continue; + }; + let mut span = TextSpan::plain(text); + if let Some(idx) = style_idx { + if let Some(font) = font_for(doc, idx) { + if let Some(size_pt) = font.size { + // XLSX cell font size is in points (`` + // where N is f32). IR uses half-points; same + // half-pt convention as DOCX/PPTX read paths. + span.font_size_half_pt = Some( + crate::core::units::HalfPoint::from_points_rounded(size_pt) + .0, + ); + } + if font.bold { + span.bold = true; + } + if font.italic { + span.italic = true; + } + } + } + out.push(Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(span)], + ..Default::default() + })); + } + out + } else { + // Genuine grid → emit a Table. 
+ let mut rows: Vec = Vec::with_capacity(parsed_rows.len()); + for (row_idx, cells) in parsed_rows.iter().enumerate() { + let mut tcells: Vec = Vec::with_capacity(cells.len()); + for (text, _) in cells { + let content = if text.is_empty() { + Vec::new() + } else { + vec![InlineContent::Text(TextSpan::plain(text.clone()))] + }; + tcells.push(TableCell { + content: vec![Element::Paragraph(Paragraph { + content, + ..Default::default() + })], + col_span: 1, + row_span: 1, + ..Default::default() + }); + } + rows.push(TableRow { + cells: tcells, + is_header: row_idx == 0, + ..Default::default() + }); + } + if rows.is_empty() { + Vec::new() + } else { + vec![Element::Table(Table { + rows, + ..Default::default() + })] + } + }; - let elements = if rows.is_empty() { - Vec::new() + // Per-sheet page geometry (parsed from /). + // Default the margins back to 0.5"/0.5" (720 twips) when the source + // had no — Excel's default 0.7"/0.75" is wider than + // we want for a tight PDF round-trip and would shrink the usable + // text area. + let page_setup = ws.page_setup.and_then(|wsp| { + // A worksheet that only had (no dimensions) is + // treated as "no geometry" so the renderer keeps its + // OfficeConfig default page size. + if wsp.width_twips == 0 || wsp.height_twips == 0 { + return None; + } + Some(PageSetup { + width_twips: wsp.width_twips, + height_twips: wsp.height_twips, + margin_top_twips: wsp.margin_top_twips, + margin_bottom_twips: wsp.margin_bottom_twips, + margin_left_twips: wsp.margin_left_twips, + margin_right_twips: wsp.margin_right_twips, + header_distance_twips: wsp.header_distance_twips, + footer_distance_twips: wsp.footer_distance_twips, + landscape: wsp.landscape, + }) + }); + + // Each XLSX worksheet renders to its own PDF page sequence, so + // mark every section after the first as a hard page break (same + // pattern as PPTX in convert_pptx.rs). Without this the second + // worksheet's content flows into the first sheet's last page. 
+ let break_type = if ws_idx == 0 { + SectionBreakType::Continuous } else { - vec![Element::Table(Table { - rows, - ..Default::default() - })] + SectionBreakType::NextPage }; + // Stitch worksheet pictures in front of cell-derived content + // so they paint underneath the text (positional TextBoxes are + // absolute regardless of order, but inline images render + // first). Empty `image_elements` means no drawing on this sheet. + let mut combined: Vec = image_elements; + combined.extend(elements); + sections.push(Section { title: Some(ws.name.clone()), - elements, + elements: combined, + page_setup, + break_type, + ..Default::default() + }); + } + + // Append a section for chart content. We don't render charts as graphics; + // capturing their text (titles, axis labels, series names, cached values) + // ensures that all human-meaningful words in the workbook appear in the + // IR and downstream conversions, even when the chart itself isn't drawn. + if !doc.chart_text.is_empty() { + let mut chart_elements: Vec = Vec::new(); + for (i, text) in doc.chart_text.iter().enumerate() { + chart_elements.push(Element::Heading(Heading { + level: 3, + content: vec![InlineContent::Text(TextSpan::plain(format!( + "Chart {}", + i + 1 + )))], + ..Default::default() + })); + chart_elements.push(Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(TextSpan::plain(text.clone()))], + ..Default::default() + })); + } + sections.push(Section { + title: Some("Charts".to_string()), + elements: chart_elements, ..Default::default() }); } @@ -60,3 +306,32 @@ pub(crate) fn xlsx_to_ir(doc: &crate::xlsx::XlsxDocument) -> DocumentIR { sections, } } + +/// Look up a cell's font through the workbook's stylesheet (if loaded). +/// `to_ir` runs after the document has been fully read; if styles weren't +/// parsed yet they remain `None` and we silently skip per-cell font +/// recovery rather than mutate the document during a `&self` traversal. 
+fn font_for( + doc: &crate::xlsx::XlsxDocument, + style_index: u32, +) -> Option<&crate::xlsx::styles::Font> { + doc.styles.as_ref()?.font_for(style_index) +} + +/// Map a lowercase file extension (`"png"`, `"jpeg"`, ...) to the +/// matching `ImageFormat` variant. Mirrors the PPTX helper. Returns +/// `None` for unrecognised extensions; the round-trip then carries +/// only the bytes (renderers usually sniff the format from the magic +/// header and ignore the missing variant). +fn image_format_from_ext(ext: &str) -> Option { + match ext { + "png" => Some(ImageFormat::Png), + "jpg" | "jpeg" => Some(ImageFormat::Jpeg), + "gif" => Some(ImageFormat::Gif), + "tif" | "tiff" => Some(ImageFormat::Tiff), + "bmp" => Some(ImageFormat::Bmp), + "emf" => Some(ImageFormat::Emf), + "wmf" => Some(ImageFormat::Wmf), + _ => None, + } +} diff --git a/src/core/core_properties.rs b/src/core/core_properties.rs new file mode 100644 index 0000000..4580c88 --- /dev/null +++ b/src/core/core_properties.rs @@ -0,0 +1,162 @@ +//! Shared `docProps/core.xml` generator used by DOCX, PPTX, and XLSX +//! writers. Emits the OOXML core-properties payload from the IR's +//! `Metadata` so document title / author / subject / created / +//! modified surface in Word, PowerPoint, and Excel "Properties" +//! dialogs. + +use crate::ir::Metadata; +use quick_xml::Writer; +use quick_xml::events::{BytesDecl, BytesEnd, BytesStart, BytesText, Event}; + +/// MIME content type for `docProps/core.xml`. +pub const CONTENT_TYPE: &str = "application/vnd.openxmlformats-package.core-properties+xml"; + +/// Generate the XML payload for `docProps/core.xml`. Empty fields +/// in the input are omitted entirely (no ``), +/// matching the convention Word / PowerPoint use. 
+pub fn generate_xml(meta: &Metadata) -> Vec { + let mut w = Writer::new_with_indent(Vec::new(), b' ', 2); + w.write_event(Event::Decl(BytesDecl::new("1.0", Some("UTF-8"), Some("yes")))) + .expect("decl"); + + let mut root = BytesStart::new("cp:coreProperties"); + root.push_attribute(( + "xmlns:cp", + "http://schemas.openxmlformats.org/package/2006/metadata/core-properties", + )); + root.push_attribute(("xmlns:dc", "http://purl.org/dc/elements/1.1/")); + root.push_attribute(("xmlns:dcterms", "http://purl.org/dc/terms/")); + root.push_attribute(("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")); + w.write_event(Event::Start(root)).expect("root"); + + write_text(&mut w, "dc:title", meta.title.as_deref()); + write_text(&mut w, "dc:subject", meta.subject.as_deref()); + write_text(&mut w, "dc:creator", meta.author.as_deref()); + write_text(&mut w, "dc:description", meta.description.as_deref()); + if !meta.keywords.is_empty() { + write_text(&mut w, "cp:keywords", Some(meta.keywords.join(", ").as_str())); + } + write_dcterms(&mut w, "dcterms:created", meta.created.as_deref()); + write_dcterms(&mut w, "dcterms:modified", meta.modified.as_deref()); + + w.write_event(Event::End(BytesEnd::new("cp:coreProperties"))) + .expect("close"); + w.into_inner() +} + +fn write_text(w: &mut Writer>, tag: &str, value: Option<&str>) { + if let Some(v) = value { + if v.is_empty() { + return; + } + w.write_event(Event::Start(BytesStart::new(tag.to_string()))) + .expect("open"); + w.write_event(Event::Text(BytesText::new(v))).expect("text"); + w.write_event(Event::End(BytesEnd::new(tag.to_string()))) + .expect("close"); + } +} + +fn write_dcterms(w: &mut Writer>, tag: &str, value: Option<&str>) { + if let Some(v) = value { + if v.is_empty() { + return; + } + let mut elem = BytesStart::new(tag.to_string()); + elem.push_attribute(("xsi:type", "dcterms:W3CDTF")); + w.write_event(Event::Start(elem)).expect("open"); + w.write_event(Event::Text(BytesText::new(v))).expect("text"); + 
w.write_event(Event::End(BytesEnd::new(tag.to_string()))) + .expect("close"); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::DocumentFormat; + + fn meta_string(meta: &Metadata) -> String { + String::from_utf8(generate_xml(meta)).unwrap() + } + + #[test] + fn empty_metadata_emits_only_root() { + let meta = Metadata { + format: DocumentFormat::Docx, + ..Default::default() + }; + let xml = meta_string(&meta); + assert!(xml.contains("Hello"), "xml: {xml}"); + assert!(xml.contains("Yury"), "xml: {xml}"); + } + + #[test] + fn empty_string_field_is_omitted() { + let meta = Metadata { + format: DocumentFormat::Docx, + title: Some(String::new()), + author: Some("Someone".into()), + ..Default::default() + }; + let xml = meta_string(&meta); + // Empty title is dropped entirely; non-empty author is kept. + assert!(!xml.contains("Someone"), "xml: {xml}"); + } + + #[test] + fn dcterms_carry_w3cdtf_type_attribute() { + let meta = Metadata { + format: DocumentFormat::Docx, + created: Some("2026-05-13T10:00:00Z".into()), + modified: Some("2026-05-13T11:00:00Z".into()), + ..Default::default() + }; + let xml = meta_string(&meta); + assert!(xml.contains("xsi:type=\"dcterms:W3CDTF\""), "xml: {xml}"); + assert!(xml.contains("2026-05-13T10:00:00Z"), "xml: {xml}"); + assert!(xml.contains("2026-05-13T11:00:00Z"), "xml: {xml}"); + } + + #[test] + fn keywords_joined_with_comma() { + let meta = Metadata { + format: DocumentFormat::Docx, + keywords: vec!["rust".into(), "office".into(), "oxide".into()], + ..Default::default() + }; + let xml = meta_string(&meta); + assert!(xml.contains("rust, office, oxide"), "xml: {xml}"); + } + + #[test] + fn no_keywords_omits_element() { + let meta = Metadata { + format: DocumentFormat::Docx, + ..Default::default() + }; + let xml = meta_string(&meta); + assert!(!xml.contains("_.ttf` +//! - PPTX: `/ppt/fonts/font__.ttf` +//! - XLSX: `/xl/fonts/font__.ttf` +//! +//! Other apps (Word, PowerPoint, Excel) require additional manifest +//! 
plumbing (``, ``, etc.) to +//! actually pick up the embed; until that lands the in-process reader +//! is the only consumer. It scans the `*/fonts/` directory directly, +//! which is why the layout is uniform across formats. +//! +//! `sanitize_font_filename` strips characters that aren't legal in OPC +//! part names so font names can be embedded into the path safely. + +use super::Result; +use super::opc::{OpcWriter, PartName}; +use std::io::{Seek, Write}; + +/// Generic content type for embedded font payloads. The package +/// remains valid OPC even though Word/PowerPoint/Excel won't +/// auto-discover the font without the per-format manifest entries. +const FONT_CONTENT_TYPE: &str = "application/x-font-ttf"; + +/// Strip path-unsafe characters from a font name so it can live +/// inside an OPC part name (`/word/fonts/font__.ttf`). +/// Keeps ASCII alphanumeric, `-`, and `_`; replaces everything else +/// with `_` and clamps to 40 characters. +pub fn sanitize_font_filename(name: &str) -> String { + name.chars() + .map(|c| { + if c.is_ascii_alphanumeric() || c == '-' || c == '_' { + c + } else { + '_' + } + }) + .take(40) + .collect() +} + +/// Write the supplied font programs into the OPC package under the +/// given path prefix (e.g. `/word/fonts/`, `/ppt/fonts/`, or +/// `/xl/fonts/`). Each entry becomes +/// `font__.ttf` with `n` starting at 1. +/// +/// `prefix` must end with `/` and start with `/`. +pub fn write_embedded_fonts( + opc: &mut OpcWriter, + prefix: &str, + fonts: &[(String, Vec)], +) -> Result<()> { + debug_assert!(prefix.starts_with('/') && prefix.ends_with('/')); + if !fonts.is_empty() { + // Register `ttf` once as a Default content-type entry. The + // per-part Overrides we emit alongside still take precedence + // at lookup; the Default just keeps OOXML SDK validators + // happy ("missing Default for extension ttf"). 
+ opc.register_default_content_type("ttf", FONT_CONTENT_TYPE); + } + for (idx, (name, data)) in fonts.iter().enumerate() { + let n = idx + 1; + let safe_name = sanitize_font_filename(name); + let target = format!("{prefix}font_{n}_{safe_name}.ttf"); + let part = PartName::new(&target)?; + opc.add_part(&part, FONT_CONTENT_TYPE, data)?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sanitize_keeps_alphanumeric() { + assert_eq!(sanitize_font_filename("Calibri"), "Calibri"); + assert_eq!(sanitize_font_filename("Arial123"), "Arial123"); + } + + #[test] + fn sanitize_keeps_dash_and_underscore() { + assert_eq!(sanitize_font_filename("Times-Roman"), "Times-Roman"); + assert_eq!(sanitize_font_filename("TeXGyreTermesX-Regular"), "TeXGyreTermesX-Regular"); + assert_eq!(sanitize_font_filename("my_font"), "my_font"); + } + + #[test] + fn sanitize_replaces_path_unsafe_chars() { + assert_eq!(sanitize_font_filename("Arial/Bold"), "Arial_Bold"); + assert_eq!(sanitize_font_filename("a*b?c"), "a_b_c"); + assert_eq!(sanitize_font_filename("Noto Sans"), "Noto_Sans"); + assert_eq!(sanitize_font_filename("a.b"), "a_b"); + } + + #[test] + fn sanitize_replaces_non_ascii() { + // Non-ASCII alphanumeric is replaced with '_'. + assert_eq!(sanitize_font_filename("Café"), "Caf_"); + } + + #[test] + fn sanitize_clamps_to_40_chars() { + let long = "A".repeat(100); + let s = sanitize_font_filename(&long); + assert_eq!(s.len(), 40); + assert!(s.chars().all(|c| c == 'A')); + } + + #[test] + fn sanitize_empty_input() { + assert_eq!(sanitize_font_filename(""), ""); + } +} diff --git a/src/core/mod.rs b/src/core/mod.rs index aa40ec9..14bd6ed 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -8,8 +8,13 @@ /// `[Content_Types].xml` parsing and writing. pub mod content_types; +/// Shared `docProps/core.xml` generator used by DOCX, PPTX, XLSX writers. +pub mod core_properties; /// In-place editing of OPC packages (preserves unchanged parts). 
pub mod editable; +/// Helpers for embedding TrueType / OpenType font programs in DOCX, +/// PPTX, and XLSX packages. +pub mod embedded_fonts; /// Core error type and `Result` alias used throughout OOXML parsing. pub mod error; /// OPC (Open Packaging Conventions) reader and writer for ZIP-based packages. diff --git a/src/core/opc.rs b/src/core/opc.rs index 307e9d3..16dc027 100644 --- a/src/core/opc.rs +++ b/src/core/opc.rs @@ -452,6 +452,20 @@ impl OpcWriter { }) } + /// Register a default `[Content_Types].xml` entry for a file + /// extension. Use this for parts whose content type is uniform + /// across the package (e.g. `ttf` for all embedded fonts, `png` + /// for all raster images). Default + Override is legal OOXML; + /// Override takes precedence at lookup time, so passing the same + /// content type to both is redundant but safe. + /// + /// Validators (Office Open XML SDK) flag packages that ship many + /// per-file overrides without a matching Default — emit defaults + /// for known-uniform extensions to satisfy them. + pub fn register_default_content_type(&mut self, extension: &str, content_type: &str) { + self.content_types.add_default(extension, content_type); + } + /// Add a part to the package. pub fn add_part(&mut self, name: &PartName, content_type: &str, data: &[u8]) -> Result<()> { // Register content type override diff --git a/src/core/relationships.rs b/src/core/relationships.rs index 77b9887..09dbaf0 100644 --- a/src/core/relationships.rs +++ b/src/core/relationships.rs @@ -33,9 +33,19 @@ pub mod rel_types { /// Relationship type for the font table. pub const FONT_TABLE: &str = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable"; + /// Relationship type for an individual embedded font program (the + /// `` reference from `fontTable.xml` to a + /// `.ttf` part under `word/fonts/`). 
+ pub const FONT: &str = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/font"; /// Relationship type for embedded images. pub const IMAGE: &str = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"; + /// Relationship type for a SpreadsheetML / DrawingML drawing part + /// (`xl/drawings/drawingN.xml`). Worksheet-to-drawing rel; the + /// drawing itself owns IMAGE rels keyed by ``. + pub const DRAWING: &str = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/drawing"; /// Relationship type for hyperlinks. pub const HYPERLINK: &str = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"; diff --git a/src/core/units.rs b/src/core/units.rs index 456de8e..0fc2502 100644 --- a/src/core/units.rs +++ b/src/core/units.rs @@ -100,6 +100,30 @@ impl HalfPoint { pub fn from_points(pt: f64) -> Self { Self((pt * 2.0) as u32) } + + /// Round to the nearest half-point. + pub fn from_points_rounded(pt: f64) -> Self { + Self((pt * 2.0).round() as u32) + } + + /// Construct from WordProcessingML's ``. The + /// attribute value is already in half-points, so this is identity + /// modulo signed→unsigned. + pub fn from_word_sz(half_pt: u32) -> Self { + Self(half_pt) + } + + /// Construct from DrawingML's ``. The attribute + /// value is in *hundredths of a point* (sz=1200 → 12 pt). Half-pt + /// = hundredths / 50. + pub fn from_drawingml_sz(hundredths_pt: u32) -> Self { + Self(hundredths_pt / 50) + } + + /// Reverse of [`Self::from_drawingml_sz`]. + pub fn to_drawingml_sz(self) -> u32 { + self.0 * 50 + } } /// Percentage * 1000 (e.g., 50% = 50_000, 100% = 100_000). ST_Percentage in OOXML. @@ -184,6 +208,57 @@ mod tests { assert_eq!(from.0, 20); } + #[test] + fn half_point_from_points_rounded() { + // 10.1pt → 20.2 half-pts → rounds to 20. + assert_eq!(HalfPoint::from_points_rounded(10.1).0, 20); + // 10.3pt → 20.6 half-pts → rounds to 21. 
+ assert_eq!(HalfPoint::from_points_rounded(10.3).0, 21); + // Exact half-point boundary rounds with banker's rules; check whole-pt. + assert_eq!(HalfPoint::from_points_rounded(12.0).0, 24); + // Compare with truncating from_points: 10.49 → trunc 20, round 21. + assert_eq!(HalfPoint::from_points(10.49).0, 20); + assert_eq!(HalfPoint::from_points_rounded(10.49).0, 21); + } + + #[test] + fn half_point_from_word_sz() { + // means 24 half-points → 12pt. + let sz = HalfPoint::from_word_sz(24); + assert_eq!(sz.0, 24); + assert!((sz.to_points() - 12.0).abs() < f64::EPSILON); + } + + #[test] + fn half_point_from_drawingml_sz() { + // means 1200 hundredths of a point → 12pt → 24 half-pts. + let sz = HalfPoint::from_drawingml_sz(1200); + assert_eq!(sz.0, 24); + assert!((sz.to_points() - 12.0).abs() < f64::EPSILON); + // 1800 hundredths → 18pt → 36 half-pts. + assert_eq!(HalfPoint::from_drawingml_sz(1800).0, 36); + // Below half-point granularity (sz=125 → 2.5 half-pt) truncates. + assert_eq!(HalfPoint::from_drawingml_sz(125).0, 2); + } + + #[test] + fn half_point_to_drawingml_sz() { + // 24 half-pts (=12pt) → 1200 hundredths. + assert_eq!(HalfPoint(24).to_drawingml_sz(), 1200); + assert_eq!(HalfPoint(36).to_drawingml_sz(), 1800); + } + + #[test] + fn half_point_drawingml_round_trip() { + for hundredths in [100u32, 600, 1100, 1200, 1800, 2400, 3600] { + let hp = HalfPoint::from_drawingml_sz(hundredths); + // Round-trip is lossless when hundredths is divisible by 50. 
+ if hundredths % 50 == 0 { + assert_eq!(hp.to_drawingml_sz(), hundredths, "input {hundredths}"); + } + } + } + #[test] fn percentage_conversions() { let fifty = Percentage1000(50_000); diff --git a/src/create.rs b/src/create.rs index faaa6a3..f8e56dc 100644 --- a/src/create.rs +++ b/src/create.rs @@ -91,7 +91,9 @@ pub fn create_from_ir_to_writer( // DOCX conversion // --------------------------------------------------------------------------- -fn ir_to_docx(ir: &DocumentIR) -> crate::docx::write::DocxWriter { +/// Build a `DocxWriter` from `DocumentIR`, exposed so callers can embed +/// extra parts (fonts, custom metadata) before serialization. +pub fn ir_to_docx(ir: &DocumentIR) -> crate::docx::write::DocxWriter { use crate::docx::write::{DocxWriter, IrParaProps, Run}; let mut writer = DocxWriter::new(); @@ -179,6 +181,7 @@ fn add_element_to_docx(writer: &mut crate::docx::write::DocxWriter, elem: &Eleme let runs: Vec = ir_inline_to_runs(&h.content); let props = IrParaProps { style: Some(format!("Heading{level}")), + alignment: h.alignment.clone(), ..Default::default() }; writer.add_ir_paragraph(&runs, Some(props)); @@ -218,8 +221,28 @@ fn add_element_to_docx(writer: &mut crate::docx::write::DocxWriter, elem: &Eleme writer.add_ir_image(img); }, Element::ThematicBreak => { - // Emitted as a blank paragraph (no visual rule; full border support is a future enhancement). - let props = IrParaProps::default(); + // Emit as a blank paragraph with a single bottom border — + // the conventional DOCX representation of a horizontal + // rule. Word displays this as a thin black line under + // the paragraph; on PDF→DOCX→IR re-parse the renderer + // detects "empty paragraph with bottom-border-only" and + // draws a horizontal rule. 
+ let border = crate::ir::ParagraphBorder { + top: None, + left: None, + right: None, + between: None, + bottom: Some(crate::ir::BorderLine { + style: crate::ir::BorderStyle::Single, + color: Some([0, 0, 0]), + size: Some(6), + space: Some(1), + }), + }; + let props = IrParaProps { + border: Some(border), + ..Default::default() + }; writer.add_ir_paragraph(&[], Some(props)); }, Element::PageBreak => { @@ -240,6 +263,11 @@ fn add_element_to_docx(writer: &mut crate::docx::write::DocxWriter, elem: &Eleme Element::CodeBlock(cb) => { writer.add_code_block(&cb.content); }, + Element::Shape(_) => { + // Vector shapes are written directly by the layout-preserving + // DOCX writer (`pdf_oxide::converters::docx_layout`), not via + // the markdown→IR→DOCX pipeline. + }, } } @@ -284,22 +312,160 @@ fn ir_inline_to_runs(content: &[InlineContent]) -> Vec }, } } - runs + coalesce_runs(runs) +} + +/// Merge adjacent text runs that share identical run properties so the +/// emitted DOCX has one `` per styling region instead of one per +/// PDF span. PDF text extraction returns ~1 span per word; without +/// this pass the document.xml balloons (~5× over the merged form), +/// search/replace breaks across word boundaries, and screen readers +/// stutter. +/// +/// Footnote/endnote/field runs are never merged (they carry semantic +/// markers that must stay in their own `` for Word to recognise +/// them as references). 
+fn coalesce_runs(runs: Vec) -> Vec { + use crate::docx::write::Run; + let mut out: Vec = Vec::with_capacity(runs.len()); + for r in runs { + let mergeable = r.footnote_ref.is_none() && r.endnote_ref.is_none() && r.text != "\n"; + if mergeable { + if let Some(last) = out.last_mut() { + if last.footnote_ref.is_none() + && last.endnote_ref.is_none() + && last.text != "\n" + && run_props_equal(last, &r) + { + last.text.push_str(&r.text); + continue; + } + } + } + out.push(r); + } + out +} + +/// Compare two runs' style properties (everything except `text`, +/// `footnote_ref`, `endnote_ref`) for byte-equality. +fn run_props_equal(a: &crate::docx::write::Run, b: &crate::docx::write::Run) -> bool { + a.bold == b.bold + && a.italic == b.italic + && a.underline == b.underline + && a.underline_style == b.underline_style + && a.strikethrough == b.strikethrough + && a.color == b.color + && a.color_rgb == b.color_rgb + && a.font_size_pt == b.font_size_pt + && a.font_size_half_pt == b.font_size_half_pt + && a.font_name == b.font_name + && a.highlight == b.highlight + && a.vertical_align == b.vertical_align + && a.all_caps == b.all_caps + && a.small_caps == b.small_caps + && a.char_spacing_half_pt == b.char_spacing_half_pt } // --------------------------------------------------------------------------- // XLSX conversion // --------------------------------------------------------------------------- -fn ir_to_xlsx(ir: &DocumentIR) -> crate::xlsx::write::XlsxWriter { - use crate::xlsx::write::CellData; +/// Sanitise a worksheet name and ensure it doesn't clash with names +/// already used in the workbook. Excel limits names to 31 chars and +/// forbids `:\\/?*[]`; the spec also forbids the reserved name +/// "History". When the sanitised candidate is empty or already taken, +/// fall back to "Sheet" — and even that is post-checked so +/// pathological inputs can't collide. 
+fn unique_sheet_name(raw: &str, idx: usize, used: &std::collections::HashSet) -> String { + fn sanitise(s: &str) -> String { + let mut out = String::with_capacity(s.len().min(31)); + for ch in s.chars() { + if matches!(ch, ':' | '\\' | '/' | '?' | '*' | '[' | ']') { + out.push('_'); + } else { + out.push(ch); + } + if out.chars().count() >= 31 { + break; + } + } + out.trim().to_string() + } + let candidate = sanitise(raw); + if !candidate.is_empty() + && !candidate.eq_ignore_ascii_case("history") + && !used.contains(&candidate) + { + return candidate; + } + // Fall back to indexed name. + let mut fallback = format!("Sheet{idx}"); + let mut bump = idx; + while used.contains(&fallback) { + bump += 1; + fallback = format!("Sheet{bump}"); + } + fallback +} + +/// Build an `XlsxWriter` from `DocumentIR`. Public so callers can embed +/// extra parts (fonts, custom metadata) before serialization. Mirrors +/// `ir_to_docx` and `ir_to_pptx`. +pub fn ir_to_xlsx(ir: &DocumentIR) -> crate::xlsx::write::XlsxWriter { + use crate::xlsx::write::{CellData, CellStyle}; let mut writer = crate::xlsx::write::XlsxWriter::new(); + writer.set_metadata(&ir.metadata); + + // Sheet names must be unique within a workbook (ECMA-376) and Excel + // additionally rejects names > 31 chars, names containing `:\\/?*[]`, + // and the literal "History". We sanitise + de-duplicate by + // appending the 1-based index when a section's title would clash + // (or when there's no title at all). + let mut used_names: std::collections::HashSet = std::collections::HashSet::new(); + for (idx, section) in ir.sections.iter().enumerate() { + // Prefer the section's title; failing that, use the first + // heading inside the section so each tab gets a meaningful + // label (e.g. "1 Introduction", "Abstract") instead of the + // anonymous "Sheet1..N". 
+ let raw_owned = section + .title + .clone() + .or_else(|| first_heading_text(§ion.elements)) + .unwrap_or_default(); + let raw = raw_owned.as_str(); + let name = unique_sheet_name(raw, idx + 1, &used_names); + used_names.insert(name.clone()); + let mut sheet = writer.add_sheet(&name); + + // Propagate per-section page geometry so a PDF→XLSX→PDF round + // trip preserves the source MediaBox. Without this each + // worksheet falls back to default Letter portrait and a long + // PDF (134 / 660 pages) flows onto far fewer pages because the + // renderer uses a different page size on read-back. + if let Some(ps) = section.page_setup.as_ref() { + sheet.set_page_setup(crate::xlsx::write::PageSetup { + width_twips: ps.width_twips, + height_twips: ps.height_twips, + margin_top_twips: ps.margin_top_twips, + margin_bottom_twips: ps.margin_bottom_twips, + margin_left_twips: ps.margin_left_twips, + margin_right_twips: ps.margin_right_twips, + header_distance_twips: ps.header_distance_twips, + footer_distance_twips: ps.footer_distance_twips, + landscape: ps.landscape, + }); + } - for section in &ir.sections { - let name = section.title.as_deref().unwrap_or("Sheet"); - let mut sheet = writer.add_sheet(name); let mut row_cursor = 0usize; + // Body paragraphs that aren't part of a table get split across + // multiple rows when long, so a page-of-prose stays readable + // instead of piling 1500 chars into a single clipped cell. We + // also widen column A so the resulting rows have somewhere to + // breathe. Short paragraphs (≤ 80 chars) and headings stay in + // a single cell to preserve their visual identity. 
+ let mut body_paragraphs_seen = false; for elem in §ion.elements { match elem { @@ -335,31 +501,220 @@ fn ir_to_xlsx(ir: &DocumentIR) -> crate::xlsx::write::XlsxWriter { Element::Paragraph(p) => { let text = inline_to_text(&p.content); if !text.is_empty() { - sheet.set_cell(row_cursor, 0, CellData::String(text)); - row_cursor += 1; + body_paragraphs_seen = true; + // Persist the IR paragraph's font size onto the cell. + // This is what allows a PDF→IR→XLSX→IR→PDF round-trip + // to recover the original 9–10 pt body size instead of + // falling back to the 12 pt default and inflating the + // page count. + let mut style = CellStyle::new(); + if let Some(size_pt) = crate::ir::first_inline_font_size_pt(&p.content) { + style = style.font_size(size_pt); + } + if let Some(name) = first_inline_font_name(&p.content) { + style = style.font_name(name); + } + for line in split_paragraph_for_xlsx(&text) { + sheet.set_cell_styled( + row_cursor, + 0, + CellData::String(line), + style.clone(), + ); + row_cursor += 1; + } + } + }, + Element::Image(img) => { + // Anchor any image carried by the IR onto this + // worksheet. EMU coordinates default to (0, 0) when + // the IR didn't carry per-image positioning — the + // round-trip still recovers the bytes, just stacked + // at the sheet origin. When position-aware writers + // wrap images in TextBox the outer branch below + // unwraps the EMU coords. + if let (Some(data), Some(fmt)) = (&img.data, &img.format) { + let cx = img.display_width_emu.unwrap_or(3_000_000) as i64; + let cy = img.display_height_emu.unwrap_or(2_000_000) as i64; + sheet.add_image(data.clone(), fmt.extension(), 0, 0, cx, cy); + } + }, + Element::TextBox(tb) => { + // Positional wrapper: when the IR places an image + // inside a TextBox (PDF→IR can carry shape coords + // that way), forward the inner image bytes with the + // TextBox's anchor. 
+ let x = tb.x_emu.unwrap_or(0); + let y = tb.y_emu.unwrap_or(0); + let cx = tb.width_emu.unwrap_or(0) as i64; + let cy = tb.height_emu.unwrap_or(0) as i64; + for inner in &tb.content { + if let Element::Image(img) = inner { + if let (Some(data), Some(fmt)) = (&img.data, &img.format) { + let icx = if cx > 0 { + cx + } else { + img.display_width_emu.unwrap_or(3_000_000) as i64 + }; + let icy = if cy > 0 { + cy + } else { + img.display_height_emu.unwrap_or(2_000_000) as i64 + }; + sheet.add_image(data.clone(), fmt.extension(), x, y, icx, icy); + } + } } }, Element::Heading(h) => { let text = inline_to_text(&h.content); if !text.is_empty() { - sheet.set_cell(row_cursor, 0, CellData::String(text)); + let data = CellData::String(text); + let mut style = CellStyle::new().bold(); + if let Some(size_pt) = crate::ir::first_inline_font_size_pt(&h.content) { + style = style.font_size(size_pt); + } + if let Some(name) = first_inline_font_name(&h.content) { + style = style.font_name(name); + } + sheet.set_cell_styled(row_cursor, 0, data, style); row_cursor += 1; } }, _ => {}, } } + + // If we emitted any body paragraphs (rather than just tables) + // widen column A so multi-line prose has somewhere to render. + // Tables manage their own per-column widths above so we leave + // those alone. + if body_paragraphs_seen { + sheet.set_column_width(0, 80.0); + } } writer } +/// Split a long paragraph into ~120-char chunks at sentence boundaries +/// for XLSX rendering. Short paragraphs (≤ 80 chars) pass through as a +/// single chunk so they keep their compact look. +/// +/// Operates on `char_indices` throughout so the byte indices we slice +/// at are always valid UTF-8 boundaries — paragraphs from PDFs often +/// contain multi-byte glyphs (mathematical italic, accented Latin, +/// CJK) and naive byte arithmetic blows up on them. 
+fn split_paragraph_for_xlsx(text: &str) -> Vec { + const SHORT_THRESHOLD: usize = 80; + const TARGET_LINE_LEN: usize = 120; + const SCAN_BACK_CHARS: usize = 60; + + if text.chars().count() <= SHORT_THRESHOLD { + return vec![text.to_string()]; + } + + // Pre-compute char positions so all slicing happens on boundaries. + let chars: Vec<(usize, char)> = text.char_indices().collect(); + let total_chars = chars.len(); + let total_bytes = text.len(); + + let mut chunks: Vec = Vec::new(); + let mut char_start: usize = 0; // index into `chars` + + while char_start < total_chars { + let remaining_chars = total_chars - char_start; + if remaining_chars <= TARGET_LINE_LEN { + let head_byte = chars[char_start].0; + let tail = text[head_byte..].trim(); + if !tail.is_empty() { + chunks.push(tail.to_string()); + } + break; + } + + // The "minimum break point" is char_start + TARGET_LINE_LEN. + let min_break_char = char_start + TARGET_LINE_LEN; + let scan_back_char = min_break_char + .saturating_sub(SCAN_BACK_CHARS) + .max(char_start); + + // Find a sentence boundary: a `.` followed by ` ` followed by + // an uppercase ASCII letter. Prefer breaks at or after the + // target, then fall back to one slightly before. + let mut break_char: Option = None; + + // Pass 1: at-or-after the cap. + for i in min_break_char..total_chars.saturating_sub(2) { + if chars[i].1 == '.' && chars[i + 1].1 == ' ' && chars[i + 2].1.is_ascii_uppercase() { + break_char = Some(i + 2); // start of the next sentence + break; + } + } + + // Pass 2: before the cap, within scan_back window. + if break_char.is_none() { + for i in scan_back_char..min_break_char.saturating_sub(2).max(scan_back_char) { + if i + 2 >= total_chars { + break; + } + if chars[i].1 == '.' && chars[i + 1].1 == ' ' && chars[i + 2].1.is_ascii_uppercase() + { + break_char = Some(i + 2); + } + } + } + + // Pass 3: next whitespace at-or-after the cap. 
+ if break_char.is_none() { + for i in min_break_char..total_chars { + if chars[i].1 == ' ' { + break_char = Some(i + 1); + break; + } + } + } + + let next_char = break_char.unwrap_or(total_chars); + let head_byte = chars[char_start].0; + let tail_byte = if next_char >= total_chars { + total_bytes + } else { + chars[next_char].0 + }; + let head = text[head_byte..tail_byte].trim(); + if !head.is_empty() { + chunks.push(head.to_string()); + } + + // Advance past any leading whitespace on the tail (we already + // trimmed `head`, but `next_char` may sit right at the space). + let mut cs = next_char; + while cs < total_chars && chars[cs].1 == ' ' { + cs += 1; + } + if cs <= char_start { + // Defensive: ensure forward progress. + cs = char_start + 1; + } + char_start = cs; + } + + if chunks.is_empty() { + chunks.push(text.to_string()); + } + chunks +} + // --------------------------------------------------------------------------- // PPTX conversion // --------------------------------------------------------------------------- -fn ir_to_pptx(ir: &DocumentIR) -> crate::pptx::write::PptxWriter { +/// Build a `PptxWriter` from `DocumentIR`. Public so callers can embed +/// extra parts (fonts, custom metadata) before serialization. +pub fn ir_to_pptx(ir: &DocumentIR) -> crate::pptx::write::PptxWriter { let mut writer = crate::pptx::write::PptxWriter::new(); + writer.set_metadata(&ir.metadata); if let Some(ps) = ir.sections.iter().find_map(|s| s.page_setup.as_ref()) { let cx = ps.width_twips as u64 * 914_400 / 1440; @@ -367,85 +722,327 @@ fn ir_to_pptx(ir: &DocumentIR) -> crate::pptx::write::PptxWriter { writer.set_presentation_size(cx, cy); } - for section in &ir.sections { - let slide = writer.add_slide(); + // PowerPoint shows a "found a problem with content. Do you want to + // repair?" dialog and renders Slide Sorter very slowly when a deck + // exceeds ~250 slides. For large PDFs (e.g. 
a 660-page CFR) the + // historical 1-section-per-slide mapping produces decks that hit + // both issues. When the IR has more sections than the threshold + // we collapse consecutive sections into heading-bounded chunks of + // at most ~12 paragraphs each and cap the total slide count. + const MAX_SLIDES: usize = 250; + const MAX_PARAGRAPHS_PER_SLIDE: usize = 12; - if let Some(ref title) = section.title { - if !title.is_empty() { - slide.set_title(title); - } + if ir.sections.len() <= MAX_SLIDES { + for section in &ir.sections { + emit_pptx_slide_from_section(&mut writer, section); } + } else { + emit_pptx_slides_compacted(&mut writer, ir, MAX_SLIDES, MAX_PARAGRAPHS_PER_SLIDE); + } - for elem in §ion.elements { - match elem { - Element::Heading(h) => { - if slide.title.is_none() { - slide.set_title(&inline_to_text(&h.content)); - } else { - let runs = inline_to_pptx_runs(&h.content); - if !runs.is_empty() { - slide.add_rich_text(&runs); - } - } - }, - Element::Paragraph(p) => { - let runs = inline_to_pptx_runs(&p.content); - if !runs.is_empty() { - slide.add_rich_text(&runs); - } - }, - Element::List(l) => { - let items: Vec = l - .items + writer +} + +/// One IR section → one slide. Used for "small" decks where 1:1 paging +/// is still viable. +fn emit_pptx_slide_from_section(writer: &mut crate::pptx::write::PptxWriter, section: &Section) { + let slide = writer.add_slide(); + + if let Some(ref title) = section.title { + if !title.is_empty() { + slide.set_title(title); + } + } + + for elem in §ion.elements { + emit_pptx_element(slide, elem); + } +} + +/// Marker text used to encode `Element::ThematicBreak` through PPTX +/// round-trip. The PPTX paragraph format has no `` border the +/// way DOCX `` does; emitting a thin connector shape would +/// position the rule absolutely on the slide (wrong for flow +/// content). 
Instead we emit a centered paragraph of U+2500 (BOX +/// DRAWINGS LIGHT HORIZONTAL) characters; the renderer's pdf_oxide +/// side detects this exact pattern and re-emits a real +/// `page.horizontal_rule()`. Plain enough that any other consumer +/// (PowerPoint itself, a markdown export, a screen reader) sees a +/// visible horizontal-rule glyph string and treats it as a +/// separator. +pub(crate) const PPTX_THEMATIC_BREAK_MARKER: &str = "\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}"; + +fn emit_pptx_element(slide: &mut crate::pptx::write::SlideData, elem: &Element) { + match elem { + Element::ThematicBreak => { + // Encode via the marker text + center alignment. The + // pdf_oxide renderer recognises the all-U+2500 content + // and draws a real `page.horizontal_rule()` instead of + // rendering the box-drawing glyphs. + let runs = vec![crate::pptx::write::Run::new(PPTX_THEMATIC_BREAK_MARKER)]; + slide.add_rich_text_aligned(&runs, Some(ParagraphAlignment::Center)); + }, + Element::Heading(h) => { + if slide.title.is_none() { + slide.set_title_aligned(&inline_to_text(&h.content), h.alignment.clone()); + } else { + let runs = inline_to_pptx_runs(&h.content); + if !runs.is_empty() { + slide.add_rich_text_aligned(&runs, h.alignment.clone()); + } + } + }, + Element::Paragraph(p) => { + let runs = inline_to_pptx_runs(&p.content); + // Always emit, including for runs.is_empty() — empty + // spacer paragraphs (used by pdf_to_ir to preserve large + // vertical gaps on source cover pages) need to round-trip + // through PPTX as empty elements so the rendered + // PPTX→IR→PDF cycle reproduces the source's vertical + // rhythm. `space_before_twips` from IR (twips) is + // converted to PPTX `` hundredths-of-pt: + // 1 twip = 1/1440 in = 1/20 pt → twips * 5 = pt*100. 
+ let space_before_hundredths_pt = p.space_before_twips.map(|t| t * 5); + let props = crate::pptx::write::ParaProps { + alignment: p.alignment.clone(), + space_before_hundredths_pt, + }; + slide.add_rich_text_with_props(&runs, props); + }, + Element::List(l) => { + let items: Vec = l + .items + .iter() + .map(|i| { + i.content .iter() - .map(|i| { - i.content - .iter() - .map(|e| match e { - Element::Paragraph(p) => inline_to_text(&p.content), - _ => String::new(), - }) - .collect::>() - .join(" ") + .map(|e| match e { + Element::Paragraph(p) => inline_to_text(&p.content), + _ => String::new(), }) - .collect(); - let item_refs: Vec<&str> = items.iter().map(|s| s.as_str()).collect(); - slide.add_bullet_list(&item_refs); - }, - Element::Table(t) => { - let text = t - .rows + .collect::>() + .join(" ") + }) + .collect(); + let item_refs: Vec<&str> = items.iter().map(|s| s.as_str()).collect(); + slide.add_bullet_list(&item_refs); + }, + Element::Table(t) => { + let text = t + .rows + .iter() + .map(|row| { + row.cells .iter() - .map(|row| { - row.cells - .iter() - .map(cell_text) - .collect::>() - .join("\t") - }) + .map(cell_text) .collect::>() - .join("\n"); - if !text.is_empty() { - slide.add_text(&text); + .join("\t") + }) + .collect::>() + .join("\n"); + if !text.is_empty() { + slide.add_text(&text); + } + }, + Element::Image(img) => { + if let (Some(data), Some(fmt)) = (&img.data, &img.format) { + let cx = img.display_width_emu.unwrap_or(3_000_000); + let cy = img.display_height_emu.unwrap_or(2_000_000); + slide.add_image(data.clone(), fmt.clone(), 0, 0, cx, cy); + } + }, + Element::CodeBlock(cb) => { + let run = crate::pptx::write::Run::new(&cb.content).font("Courier New"); + slide.add_rich_text(&[run]); + }, + _ => {}, + } +} + +/// Heading-aware compaction for large IR section lists. +/// +/// Strategy: +/// 1. 
Build a flat list of `(title, elements)` "groups" where every +/// H1/H2 boundary starts a new group and the section's own +/// elements between headings are concatenated. +/// 2. Each group becomes one or more slides, splitting at paragraph +/// boundaries when the body exceeds `max_paragraphs_per_slide`. +/// 3. After collecting candidate slides, if we still exceed +/// `max_slides`, fold trailing slides into the previous one until +/// the cap is met (preserves earlier headings/structure). +fn emit_pptx_slides_compacted( + writer: &mut crate::pptx::write::PptxWriter, + ir: &DocumentIR, + max_slides: usize, + max_paragraphs_per_slide: usize, +) { + // Step 1: build heading-bounded groups. Each group's title is + // (text, optional alignment); the alignment flows through to + // `slide.set_title_aligned` in step 4 so source-PDF cover-page + // headings keep their original alignment (typically Center). + type TitleWithAlgn = (String, Option); + let mut groups: Vec<(Option, Vec)> = Vec::new(); + let mut current_title: Option = None; + let mut current_elems: Vec = Vec::new(); + // Tracks whether the current group has accumulated any genuine + // body content (non-heading element). When false, an incoming + // H1/H2 is folded into the current slide as a subtitle instead of + // starting a new one. This prevents cover pages — where each + // title-block line is promoted to a heading by `pdf_to_ir` — from + // exploding into one title-only slide per line. + let mut current_has_body = false; + + let flush = |groups: &mut Vec<(Option, Vec)>, + title: &mut Option, + elems: &mut Vec| { + if !elems.is_empty() || title.is_some() { + groups.push((title.take(), std::mem::take(elems))); + } + }; + + // Whether an element constitutes "body content" for compaction + // purposes. 
Cover pages typically begin with a logo or seal Image + // and a list of centered headings; flipping `current_has_body` on + // the leading Image causes the first heading to fall into the + // "real new section" branch and strand the image as a title-less + // slide. Only text-bearing elements should anchor a slide as + // having body content. Empty paragraphs used as vertical spacers + // (no runs, no border) are skipped — they're layout glue, not + // content; counting them as body causes cover pages to split + // mid-block when pdf_to_ir injects gap spacers. + fn is_body_content(elem: &Element) -> bool { + match elem { + Element::Paragraph(p) => { + + p.content.iter().any(|ic| match ic { + InlineContent::Text(s) => !s.text.is_empty(), + _ => false, + }) + }, + Element::List(_) | Element::CodeBlock(_) | Element::Table(_) => true, + _ => false, + } + } + + for section in &ir.sections { + for elem in §ion.elements { + if let Element::Heading(h) = elem { + if h.level <= 2 { + let text = inline_to_text(&h.content); + let trimmed = text.trim(); + if trimmed.is_empty() { + continue; } - }, - Element::Image(img) => { - if let (Some(data), Some(fmt)) = (&img.data, &img.format) { - let cx = img.display_width_emu.unwrap_or(3_000_000); - let cy = img.display_height_emu.unwrap_or(2_000_000); - slide.add_image(data.clone(), fmt.clone(), 0, 0, cx, cy); + + if !current_has_body { + // Cover-page fold: keep all consecutive + // headings on the same slide. First heading + // owns the slide title; subsequent headings + // become bold paragraphs so they stay visible. 
+ if current_title.is_none() { + current_title = Some((trimmed.to_string(), h.alignment.clone())); + } else { + let mut span = TextSpan::plain(trimmed.to_string()); + span.bold = true; + current_elems.push(Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(span)], + alignment: h.alignment.clone(), + ..Default::default() + })); + } + continue; } - }, - Element::CodeBlock(cb) => { - let run = crate::pptx::write::Run::new(&cb.content).font("Courier New"); - slide.add_rich_text(&[run]); - }, - _ => {}, + + // Real new section: flush and open a new group. + flush(&mut groups, &mut current_title, &mut current_elems); + current_has_body = false; + current_title = Some((trimmed.to_string(), h.alignment.clone())); + continue; + } + } + current_elems.push(elem.clone()); + if is_body_content(elem) { + current_has_body = true; } } } + flush(&mut groups, &mut current_title, &mut current_elems); - writer + // If the IR had no H1/H2 headings at all we end up with a single + // group holding everything. That would be one slide with all the + // content packed in, which the renderer can't actually fit. Fall + // back to a paragraph-count partition over the flattened element + // stream. + if groups.len() <= 1 { + let mut all_elems: Vec = Vec::new(); + for section in &ir.sections { + for elem in §ion.elements { + all_elems.push(elem.clone()); + } + } + groups = vec![(None, all_elems)]; + } + + // Step 2: split each group into slide-sized chunks. 
+ struct PendingSlide { + title: Option<(String, Option)>, + elements: Vec, + } + let mut pending: Vec = Vec::new(); + + for (title, elems) in groups { + let mut chunk: Vec = Vec::new(); + let mut paragraph_count = 0usize; + let mut first_chunk = true; + for elem in elems { + let is_paragraph_like = + matches!(elem, Element::Paragraph(_) | Element::List(_) | Element::CodeBlock(_)); + if is_paragraph_like && paragraph_count >= max_paragraphs_per_slide { + pending.push(PendingSlide { + title: if first_chunk { title.clone() } else { None }, + elements: std::mem::take(&mut chunk), + }); + paragraph_count = 0; + first_chunk = false; + } + if is_paragraph_like { + paragraph_count += 1; + } + chunk.push(elem); + } + if !chunk.is_empty() || (first_chunk && title.is_some()) { + pending.push(PendingSlide { + title: if first_chunk { title.clone() } else { None }, + elements: chunk, + }); + } + } + + // Step 3: enforce the slide cap by folding trailing slides into + // the previous one. We always keep at least one slide. + while pending.len() > max_slides { + // Pop the last slide and append its elements to the previous. + let tail = pending.pop().expect("pending non-empty"); + if let Some(prev) = pending.last_mut() { + prev.elements.extend(tail.elements); + } else { + pending.push(tail); + break; + } + } + + // Step 4: emit slides. + for ps in pending { + let slide = writer.add_slide(); + if let Some((t, algn)) = ps.title.as_ref() { + if !t.is_empty() { + slide.set_title_aligned(t, algn.clone()); + } + } + for elem in &ps.elements { + emit_pptx_element(slide, elem); + } + } } // --------------------------------------------------------------------------- @@ -490,6 +1087,41 @@ fn text_to_cell_data(text: &str) -> crate::xlsx::write::CellData { } } +/// Pluck the first `Element::Heading`'s plain text from a section's +/// element list. 
Used by `ir_to_xlsx` to derive a meaningful +/// worksheet tab label when the section itself doesn't carry a +/// title — typical for a PDF→IR conversion where heading detection +/// happens at the element level, not the section level. +fn first_heading_text(elements: &[Element]) -> Option { + for el in elements { + if let Element::Heading(h) = el { + let text = inline_to_text(&h.content); + let trimmed = text.trim(); + if !trimmed.is_empty() { + return Some(trimmed.to_string()); + } + } + } + None +} + +/// First explicit font name from inline content. Used by the XLSX +/// path so cell styles carry the source font instead of always +/// falling back to the writer's "Calibri" default. Mirrors the +/// `first_inline_font_size_pt` helper. +fn first_inline_font_name(content: &[InlineContent]) -> Option { + for ic in content { + if let InlineContent::Text(span) = ic { + if let Some(name) = &span.font_name { + if !name.is_empty() { + return Some(name.clone()); + } + } + } + } + None +} + fn xlsx_cell_style(is_header: bool, bg: Option<[u8; 3]>) -> Option { use crate::xlsx::write::CellStyle; if is_header { diff --git a/src/docx/document.rs b/src/docx/document.rs index d572228..968165b 100644 --- a/src/docx/document.rs +++ b/src/docx/document.rs @@ -6,9 +6,20 @@ use super::table::Table; pub struct Body { /// Ordered list of block elements (paragraphs and tables). pub elements: Vec, + /// Indices into `elements` where each `` boundary falls. + /// `section_breaks[i]` is the **count of elements covered by the + /// i-th section** — i.e. elements `[prev_break, section_breaks[i])` + /// belong to section `i`. The final section runs from the last + /// break to `elements.len()` and uses the document-level `sectPr`. + /// Empty for documents with only one section. + pub section_breaks: Vec, } /// A block-level element in the document body (or in a table cell). +// `Paragraph` is ~320 bytes larger than `Table`. 
Boxing would force +// a heap allocation on the hot parse path for every paragraph; we +// accept the stack size in exchange for keeping parsing alloc-free. +#[allow(clippy::large_enum_variant)] #[derive(Debug, Clone)] pub enum BlockElement { /// A paragraph (`w:p`). diff --git a/src/docx/formatting.rs b/src/docx/formatting.rs index ebd4e69..c332641 100644 --- a/src/docx/formatting.rs +++ b/src/docx/formatting.rs @@ -45,6 +45,35 @@ pub struct ParagraphProperties { pub outline_level: Option, /// Paragraph-mark run properties (`w:rPr` inside `w:pPr`). pub run_properties: Option, + /// Frame position from ``. When present this paragraph is + /// absolutely positioned on the page (used by layout-preserving + /// PDF-derived DOCX, e.g. pdf_oxide's `to_docx_bytes_layout`). + pub frame_position: Option, + /// Section properties from `` inside this paragraph's ``. + /// When present this paragraph terminates a section — the properties + /// describe the section that ends here. + pub section_properties: Option, + /// True when the paragraph has a ``. + /// Used to recover horizontal rules: pdf_to_ir emits + /// `Element::ThematicBreak` which round-trips through DOCX as an + /// empty paragraph with a single bottom border. Without + /// preserving this flag the rule would be silently dropped on + /// re-parse and turned into a plain empty paragraph. + #[allow(dead_code)] + pub has_bottom_border: bool, +} + +/// `` attributes — page-anchored frame coordinates in twips. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct FrameProps { + /// X position in twips, anchored to the page (top-left). + pub x_twips: i32, + /// Y position in twips, anchored to the page (top-left). + pub y_twips: i32, + /// Frame width in twips. + pub width_twips: i32, + /// Frame height in twips. + pub height_twips: i32, } /// Underline style. 
@@ -583,6 +612,44 @@ pub(crate) fn parse_paragraph_properties_fast( b"rPr" => { props.run_properties = Some(parse_run_properties_fast(reader)?); }, + b"framePr" => { + props.frame_position = parse_frame_pr(e); + xml::skip_element_fast(reader)?; + }, + b"sectPr" => { + props.section_properties = + Some(super::parse_section_properties(reader, e)?); + }, + b"pBdr" => { + // Scan for inside pBdr to + // detect horizontal-rule encoding (empty + // paragraph + bottom border = the + // conventional DOCX
). We don't capture + // full border styling — just the presence + // of a bottom edge. + let mut depth = 1i32; + loop { + match reader.read_event()? { + Event::Start(ref ee) | Event::Empty(ref ee) + if ee.local_name().as_ref() == b"bottom" => + { + props.has_bottom_border = true; + if matches!(reader.read_event()?, Event::Eof) { + break; + } + }, + Event::Start(_) => depth += 1, + Event::End(ref ee) => { + depth -= 1; + if depth <= 0 && ee.local_name().as_ref() == b"pBdr" { + break; + } + }, + Event::Eof => break, + _ => {}, + } + } + }, _ => { xml::skip_element_fast(reader)?; }, @@ -607,6 +674,9 @@ pub(crate) fn parse_paragraph_properties_fast( b"spacing" => { props.spacing = Some(parse_spacing(e)?); }, + b"framePr" => { + props.frame_position = parse_frame_pr(e); + }, b"outlineLvl" => { if let Ok(Some(val)) = xml::optional_attr_str(e, b"w:val") { if let Ok(lvl) = val.parse::() { @@ -774,6 +844,32 @@ pub(crate) fn parse_indent(e: &BytesStart) -> crate::core::Result` attributes (`w:x`, `w:y`, `w:w`, `w:h`). +/// Returns `None` if the element doesn't carry usable absolute coords — +/// e.g. when only `wrap`/`anchor` modifiers are set without explicit +/// position/size, which we can't reproduce as positional. +fn parse_frame_pr(e: &BytesStart) -> Option { + let read_int = |attr: &[u8]| -> Option { + xml::optional_attr_str(e, attr) + .ok() + .flatten() + .and_then(|v| v.parse::().ok()) + }; + let x = read_int(b"w:x"); + let y = read_int(b"w:y"); + let w = read_int(b"w:w"); + let h = read_int(b"w:h"); + match (x, y, w, h) { + (Some(x), Some(y), Some(w), Some(h)) => Some(FrameProps { + x_twips: x, + y_twips: y, + width_twips: w, + height_twips: h, + }), + _ => None, + } +} + fn parse_spacing(e: &BytesStart) -> crate::core::Result { let mut spacing = ParagraphSpacing::default(); if let Some(val) = xml::optional_attr_str(e, b"w:before")? 
{ @@ -933,4 +1029,101 @@ mod tests { } } } + + // Advance a fast reader past the opening wrapper so the + // caller can drive parse_paragraph_properties_fast directly. + fn open_ppr_fast(xml: &[u8]) -> quick_xml::Reader<&[u8]> { + let mut reader = xml::make_fast_reader(xml); + loop { + match reader.read_event().unwrap() { + Event::Start(ref e) if e.local_name().as_ref() == b"pPr" => return reader, + Event::Eof => panic!("no in test xml"), + _ => {}, + } + } + } + + // ── framePr ───────────────────────────────────────────────────────── + + #[test] + fn parse_frame_pr_empty_element() { + let xml = + br#" + + "#; + let mut reader = open_ppr_fast(xml); + let pp = parse_paragraph_properties_fast(&mut reader).unwrap(); + let fp = pp.frame_position.expect("framePr parsed"); + assert_eq!(fp.x_twips, 720); + assert_eq!(fp.y_twips, 1080); + assert_eq!(fp.width_twips, 3000); + assert_eq!(fp.height_twips, 500); + } + + #[test] + fn parse_frame_pr_missing_attrs_returns_none() { + // Missing w:h → frame_position must be None. + let xml = + br#" + + "#; + let mut reader = open_ppr_fast(xml); + let pp = parse_paragraph_properties_fast(&mut reader).unwrap(); + assert!(pp.frame_position.is_none()); + } + + #[test] + fn parse_frame_pr_inside_start_form() { + // Start/End form (rather than Empty) — should still parse. 
+ let xml = + br#" + + "#; + let mut reader = open_ppr_fast(xml); + let pp = parse_paragraph_properties_fast(&mut reader).unwrap(); + let fp = pp.frame_position.expect("framePr parsed"); + assert_eq!(fp.x_twips, 10); + assert_eq!(fp.width_twips, 30); + } + + // ── pBdr / has_bottom_border ──────────────────────────────────────── + + #[test] + fn parse_p_bdr_with_bottom() { + let xml = + br#" + + + + "#; + let mut reader = open_ppr_fast(xml); + let pp = parse_paragraph_properties_fast(&mut reader).unwrap(); + assert!(pp.has_bottom_border); + } + + #[test] + fn parse_p_bdr_without_bottom() { + let xml = + br#" + + + + + "#; + let mut reader = open_ppr_fast(xml); + let pp = parse_paragraph_properties_fast(&mut reader).unwrap(); + assert!(!pp.has_bottom_border); + } + + #[test] + fn paragraph_properties_default_has_no_frame_or_border() { + let xml = + br#" + + "#; + let mut reader = open_ppr_fast(xml); + let pp = parse_paragraph_properties_fast(&mut reader).unwrap(); + assert!(pp.frame_position.is_none()); + assert!(!pp.has_bottom_border); + } } diff --git a/src/docx/image.rs b/src/docx/image.rs index bef7ad1..bb8aa25 100644 --- a/src/docx/image.rs +++ b/src/docx/image.rs @@ -1,16 +1,81 @@ use crate::core::units::Emu; /// Information about a drawing/image reference within a run. +/// +/// Carries enough data for both bitmap pictures (``) +/// and DrawingML preset shapes (`` with ``). Only +/// one of `relationship_id` or `shape` is populated for any given +/// drawing — the consumer (`convert_docx`) uses whichever is set to +/// decide what kind of IR `Element` to emit. #[derive(Debug, Clone)] pub struct DrawingInfo { - /// Relationship ID pointing to the image part. + /// Relationship ID pointing to the image part. Empty when the + /// drawing is a vector shape rather than a raster picture. pub relationship_id: String, /// Alt-text description from `wp:docPr/@descr`. pub description: Option, - /// Image width in EMUs. + /// Image / shape width in EMUs. 
pub width: Emu, - /// Image height in EMUs. + /// Image / shape height in EMUs. pub height: Emu, /// `true` = inline, `false` = anchor (floating). pub inline: bool, + /// Floating-anchor position (only set when `inline == false`). + pub anchor_position: Option, + /// Vector shape data when the drawing is a `` rather + /// than an embedded picture. + pub shape: Option, +} + +/// Absolute coordinates extracted from a `` wrapper. +#[derive(Debug, Clone, Copy, Default)] +pub struct AnchorPosition { + /// Horizontal offset in EMUs. + pub x_emu: i64, + /// Vertical offset in EMUs. + pub y_emu: i64, + /// What the horizontal offset is anchored to (page / margin / column). + pub h_relative_from: AnchorFrame, + /// What the vertical offset is anchored to (page / margin / paragraph). + pub v_relative_from: AnchorFrame, +} + +/// Reference frame for a floating-object anchor. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum AnchorFrame { + /// Position relative to the page. + #[default] + Page, + /// Position relative to the page margin. + Margin, + /// Position relative to the column. + Column, + /// Position relative to the paragraph (for vertical anchor). + Paragraph, + /// Position relative to the page line (for vertical anchor). + Line, + /// Position relative to the character (for horizontal anchor). + Character, +} + +/// Vector-shape data parsed from ``. +#[derive(Debug, Clone)] +pub struct ShapeInfo { + /// Geometry preset from ``. + pub kind: ShapeKind, + /// Stroke colour (``). + pub stroke_rgb: Option<(u8, u8, u8)>, + /// Fill colour (``). + pub fill_rgb: Option<(u8, u8, u8)>, + /// Stroke width in EMUs (``). + pub stroke_w_emu: Option, +} + +/// Subset of DrawingML preset shape kinds we currently round-trip. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ShapeKind { + /// Straight line (`prst="line"`). + Line, + /// Rectangle (`prst="rect"`). 
+ Rect, } diff --git a/src/docx/mod.rs b/src/docx/mod.rs index 1f698a8..02d2e60 100644 --- a/src/docx/mod.rs +++ b/src/docx/mod.rs @@ -48,9 +48,11 @@ pub use formatting::{ Justification, ParagraphIndent, ParagraphProperties, ParagraphSpacing, RunProperties, UnderlineType, VerticalAlign, }; -pub use headers::{HeaderFooter, HeaderFooterType, PageMargins, PageSize, SectionProperties}; +pub use headers::{ + HeaderFooter, HeaderFooterType, PageMargins, PageOrientation, PageSize, SectionProperties, +}; pub use hyperlink::{Hyperlink, HyperlinkTarget}; -pub use image::DrawingInfo; +pub use image::{AnchorFrame, AnchorPosition, DrawingInfo, ShapeInfo, ShapeKind}; pub use numbering::{NumberFormat, NumberingDefinitions}; pub use paragraph::{BreakType, Paragraph, ParagraphContent, Run, RunContent}; pub use styles::{Style, StyleSheet, StyleType}; @@ -69,7 +71,7 @@ use crate::core::units::Emu; use crate::core::xml; use self::formatting::{parse_paragraph_properties_fast, parse_run_properties_fast}; -use self::headers::{HeaderFooterRef, PageOrientation}; +use self::headers::HeaderFooterRef; use self::table::{ MergeType, Shading, TableCellProperties, TableRowProperties, TableWidth, TableWidthType, }; @@ -102,6 +104,17 @@ pub struct DocxDocument { pub sections: Vec, /// Parsed headers and footers. pub headers_footers: Vec, + /// Font programs found under `word/fonts/`. Each entry is + /// `(font_name, ttf_or_otf_bytes)`. PDF→DOCX→PDF round-trips use these + /// to preserve typeface fidelity (e.g. CJK / math fonts beyond + /// pdf_oxide's bundled DejaVu fallback). + pub embedded_fonts: Vec<(String, Vec)>, + /// Image parts referenced from the main document, keyed by the + /// relationship id used in ``. Lets the + /// IR converter populate `Image::data` so downstream renderers + /// (the positional PDF reader, plain-text export with alt-text, + /// etc.) can place actual bitmap content. 
+ pub images: std::collections::HashMap, Option)>, } impl DocxDocument { @@ -182,10 +195,75 @@ impl DocxDocument { } } + // Scan `word/fonts/` for embedded font programs. Files there are + // typically `font__.ttf` (written by our own `DocxWriter`) + // but the loop accepts any `.ttf`/`.otf` for forward-compat. + let mut embedded_fonts: Vec<(String, Vec)> = Vec::new(); + for name in opc.part_names() { + let s = name.to_string(); + if !s.starts_with("/word/fonts/") { + continue; + } + let lower = s.to_lowercase(); + if !(lower.ends_with(".ttf") || lower.ends_with(".otf")) { + continue; + } + if let Ok(data) = opc.read_part(&name) { + // Extract a usable face name from the OPC part. Writers + // ship fonts as `font__.` (the + // `embedded_fonts` writer convention used by all three + // PDF→office paths) — strip the leading `font__` + // prefix and the trailing `.ttf`/`.otf` so the + // registered name matches what the IR carries on each + // run's `font_name` (e.g. `TeXGyreTermesX-Regular`). + // Falls back to the basename for files that don't + // follow the convention. + let basename = s.rsplit('/').next().unwrap_or("font"); + let face = strip_embedded_font_filename(basename); + let font_name = if face.is_empty() { + basename.to_string() + } else { + face + }; + embedded_fonts.push((font_name, data)); + } + } + + // Pull image parts referenced by the main document + // relationships. We capture the raw bytes plus the lower-cased + // file extension so downstream code can decide on the format + // without re-sniffing magic bytes. 
+ let mut images: std::collections::HashMap, Option)> = + std::collections::HashMap::new(); + for rel in doc_rels.get_by_type(rel_types::IMAGE) { + if rel.target_mode != TargetMode::Internal { + continue; + } + let part_name = match main_part.resolve_relative(&rel.target) { + Ok(p) => p, + Err(_) => continue, + }; + if !opc.has_part(&part_name) { + continue; + } + let data = match opc.read_part(&part_name) { + Ok(d) => d, + Err(_) => continue, + }; + let ext = part_name + .as_str() + .rsplit('.') + .next() + .map(|s| s.to_lowercase()); + images.insert(rel.id.clone(), (data, ext)); + } + debug!( - "DocxDocument: {} block elements, {} sections", + "DocxDocument: {} block elements, {} sections, {} embedded fonts, {} images", body.elements.len(), - sections.len() + sections.len(), + embedded_fonts.len(), + images.len() ); Ok(DocxDocument { body, @@ -194,6 +272,8 @@ impl DocxDocument { theme, sections, headers_footers, + embedded_fonts, + images, }) } } @@ -259,8 +339,33 @@ fn parse_document( // Resolve hyperlink targets using relationships resolve_hyperlinks(&mut elements, rels); - let body = Body { elements }; - Ok((body, sections)) + // Detect mid-document section breaks: paragraphs whose + // carries a . Each such paragraph terminates a section, + // and its sectPr describes the section that ends there. Trailing + // elements after the last break belong to a final section + // described by the body-level sectPr (already in `sections`). + let mut section_breaks: Vec = Vec::new(); + let mut break_sections: Vec = Vec::new(); + for (idx, el) in elements.iter().enumerate() { + if let BlockElement::Paragraph(p) = el { + if let Some(props) = &p.properties { + if let Some(sp) = &props.section_properties { + section_breaks.push(idx + 1); + break_sections.push(sp.clone()); + } + } + } + } + // Stitch break-derived section_properties in front of the + // body-level final sectPr so the section list is in document order. 
+ let mut all_sections = break_sections; + all_sections.extend(sections); + + let body = Body { + elements, + section_breaks, + }; + Ok((body, all_sections)) } /// Walk the element tree and resolve hyperlink rIds to actual URLs. @@ -443,53 +548,239 @@ fn parse_hyperlink( // Drawing / image parsing // --------------------------------------------------------------------------- +/// Parse a `` element. The opening tag has already been +/// consumed by the caller, so we drive forward until the matching +/// `` End event. +/// +/// A drawing wraps either `` or `` (anchor = +/// floating). Everything we care about lives inside that single +/// wrapper, so we delegate to `parse_inline_or_anchor_body` and treat +/// any other top-level event as ignorable filler. fn parse_drawing(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult> { - let mut inline = true; + let mut info: Option = None; + + loop { + match reader.read_event()? { + Event::Start(ref e) => match e.local_name().as_ref() { + b"inline" => { + info = parse_inline_or_anchor_body(reader, /*inline=*/ true, b"inline")?; + }, + b"anchor" => { + info = parse_inline_or_anchor_body(reader, /*inline=*/ false, b"anchor")?; + }, + _ => { + xml::skip_element_fast(reader)?; + }, + }, + Event::End(ref e) if e.local_name().as_ref() == b"drawing" => break, + Event::Eof => break, + _ => {}, + } + } + + Ok(info) +} + +/// Parse the body of `` or `` until the matching +/// closing tag (`end_local`). Collects extent, docPr, position, and the +/// graphic payload (image or shape) into a `DrawingInfo`. 
+fn parse_inline_or_anchor_body( + reader: &mut quick_xml::Reader<&[u8]>, + inline: bool, + end_local: &[u8], +) -> CoreResult> { + use crate::docx::image::{AnchorFrame, AnchorPosition}; + let mut width = Emu(0); let mut height = Emu(0); let mut description: Option = None; let mut relationship_id: Option = None; - let mut depth = 1u32; + let mut shape: Option = None; + + let mut anchor_x: Option = None; + let mut anchor_y: Option = None; + let mut h_frame = AnchorFrame::default(); + let mut v_frame = AnchorFrame::default(); + + loop { + match reader.read_event()? { + Event::Start(ref e) => match e.local_name().as_ref() { + b"extent" => { + parse_extent_attrs(e, &mut width, &mut height); + xml::skip_element_fast(reader)?; + }, + b"docPr" => { + if let Some(desc) = xml::optional_attr_str(e, b"descr")? { + description = Some(desc.into_owned()); + } + xml::skip_element_fast(reader)?; + }, + b"positionH" => { + if let Some(rf) = xml::optional_attr_str(e, b"relativeFrom")? { + h_frame = parse_anchor_frame(&rf); + } + anchor_x = parse_position_offset(reader, b"positionH")?; + }, + b"positionV" => { + if let Some(rf) = xml::optional_attr_str(e, b"relativeFrom")? { + v_frame = parse_anchor_frame(&rf); + } + anchor_y = parse_position_offset(reader, b"positionV")?; + }, + b"graphic" => { + let g = parse_graphic(reader)?; + if let Some(rid) = g.relationship_id { + relationship_id = Some(rid); + } + if let Some(s) = g.shape { + shape = Some(s); + } + }, + _ => { + xml::skip_element_fast(reader)?; + }, + }, + Event::Empty(ref e) => match e.local_name().as_ref() { + b"extent" => parse_extent_attrs(e, &mut width, &mut height), + b"docPr" => { + if let Some(desc) = xml::optional_attr_str(e, b"descr")? 
{ + description = Some(desc.into_owned()); + } + }, + _ => {}, + }, + Event::End(ref e) if e.local_name().as_ref() == end_local => break, + Event::Eof => break, + _ => {}, + } + } + + let anchor_position = if !inline && (anchor_x.is_some() || anchor_y.is_some()) { + Some(AnchorPosition { + x_emu: anchor_x.unwrap_or(0), + y_emu: anchor_y.unwrap_or(0), + h_relative_from: h_frame, + v_relative_from: v_frame, + }) + } else { + None + }; + + if relationship_id.is_some() || shape.is_some() { + Ok(Some(DrawingInfo { + relationship_id: relationship_id.unwrap_or_default(), + description, + width, + height, + inline, + anchor_position, + shape, + })) + } else { + Ok(None) + } +} + +/// Parse the inside of `` or `` looking for +/// the nested `` text value. Reads through the matching +/// closing tag (`end_local`). +fn parse_position_offset( + reader: &mut quick_xml::Reader<&[u8]>, + end_local: &[u8], +) -> CoreResult> { + let mut offset: Option = None; + + loop { + match reader.read_event()? { + Event::Start(ref e) if e.local_name().as_ref() == b"posOffset" => { + let text = xml::read_text_content_fast(reader)?; + if let Ok(v) = text.trim().parse::() { + offset = Some(v); + } + }, + Event::Start(_) => { + xml::skip_element_fast(reader)?; + }, + Event::End(ref e) if e.local_name().as_ref() == end_local => break, + Event::Eof => break, + _ => {}, + } + } + + Ok(offset) +} + +/// Result of parsing an `` element: at most one of an +/// embedded picture (`relationship_id`) or a vector shape (`shape`). +struct GraphicPayload { + relationship_id: Option, + shape: Option, +} + +/// Parse `` and any contained `` (image) or +/// `` (vector shape). Reads through ``. +fn parse_graphic(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult { + let mut relationship_id: Option = None; + let mut shape: Option = None; + + loop { + match reader.read_event()? { + Event::Start(ref e) => match e.local_name().as_ref() { + b"pic" => { + if let Some(rid) = parse_pic(reader)? 
{ + relationship_id = Some(rid); + } + }, + b"wsp" => { + if let Some(s) = parse_wsp(reader)? { + shape = Some(s); + } + }, + // is just a wrapper; descend into it. + b"graphicData" => continue, + _ => { + xml::skip_element_fast(reader)?; + }, + }, + Event::End(ref e) if e.local_name().as_ref() == b"graphic" => break, + Event::Eof => break, + _ => {}, + } + } + + Ok(GraphicPayload { + relationship_id, + shape, + }) +} + +/// Parse `` looking for the embedded ``. +/// Reads through ``. The blip lives inside ``, +/// so we descend through whatever wrappers we encounter rather than +/// skipping siblings. +fn parse_pic(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult> { + let mut rid: Option = None; + // Track depth relative to : we entered after its Start was + // consumed by the caller, so we are at depth 1. Exit when we close + // back out. + let mut depth: u32 = 1; loop { match reader.read_event()? { Event::Start(ref e) => { - depth += 1; - let local = e.local_name(); - let local_bytes = local.as_ref(); - match local_bytes { - b"inline" => inline = true, - b"anchor" => inline = false, - b"extent" => parse_extent_attrs(e, &mut width, &mut height), - b"docPr" => { - if let Ok(Some(desc)) = xml::optional_attr_str(e, b"descr") { - description = Some(desc.into_owned()); - } - }, - b"blip" => { - if let Ok(Some(embed)) = xml::optional_attr_str(e, b"r:embed") { - relationship_id = Some(embed.into_owned()); - } - }, - _ => {}, + if e.local_name().as_ref() == b"blip" { + if let Some(embed) = xml::optional_attr_str(e, b"r:embed")? { + rid = Some(embed.into_owned()); + } + // Skip over blip's own children (e.g. ). 
+ xml::skip_element_fast(reader)?; + } else { + depth += 1; } }, - Event::Empty(ref e) => { - let local = e.local_name(); - let local_bytes = local.as_ref(); - match local_bytes { - b"extent" => parse_extent_attrs(e, &mut width, &mut height), - b"docPr" => { - if let Ok(Some(desc)) = xml::optional_attr_str(e, b"descr") { - description = Some(desc.into_owned()); - } - }, - b"blip" => { - if let Ok(Some(embed)) = xml::optional_attr_str(e, b"r:embed") { - relationship_id = Some(embed.into_owned()); - } - }, - _ => {}, + Event::Empty(ref e) if e.local_name().as_ref() == b"blip" => { + if let Some(embed) = xml::optional_attr_str(e, b"r:embed")? { + rid = Some(embed.into_owned()); } }, Event::End(_) => { @@ -503,17 +794,209 @@ fn parse_drawing(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult` (a DrawingML vector shape). Reads through +/// `` and returns the assembled `ShapeInfo`, or `None` if no +/// `` was seen. +fn parse_wsp( + reader: &mut quick_xml::Reader<&[u8]>, +) -> CoreResult> { + use crate::docx::image::{ShapeInfo, ShapeKind}; + + let mut kind: Option = None; + let mut stroke_rgb: Option<(u8, u8, u8)> = None; + let mut fill_rgb: Option<(u8, u8, u8)> = None; + let mut stroke_w_emu: Option = None; + + loop { + match reader.read_event()? { + Event::Start(ref e) => match e.local_name().as_ref() { + b"spPr" => { + parse_sp_pr( + reader, + &mut kind, + &mut stroke_rgb, + &mut fill_rgb, + &mut stroke_w_emu, + )?; + }, + _ => { + xml::skip_element_fast(reader)?; + }, + }, + Event::End(ref e) if e.local_name().as_ref() == b"wsp" => break, + Event::Eof => break, + _ => {}, + } + } + + Ok(kind.map(|k| ShapeInfo { + kind: k, + stroke_rgb, + fill_rgb, + stroke_w_emu, + })) +} + +/// Parse ``: contains the geometry preset, an optional fill, +/// and an optional `` (line/stroke) sub-element. Reads through +/// ``. 
+fn parse_sp_pr( + reader: &mut quick_xml::Reader<&[u8]>, + kind: &mut Option, + stroke_rgb: &mut Option<(u8, u8, u8)>, + fill_rgb: &mut Option<(u8, u8, u8)>, + stroke_w_emu: &mut Option, +) -> CoreResult<()> { + use crate::docx::image::ShapeKind; + + loop { + match reader.read_event()? { + Event::Start(ref e) => match e.local_name().as_ref() { + b"prstGeom" => { + if let Some(prst) = xml::optional_attr_str(e, b"prst")? { + *kind = match prst.as_ref() { + "line" | "straightConnector1" => Some(ShapeKind::Line), + "rect" => Some(ShapeKind::Rect), + _ => *kind, + }; + } + xml::skip_element_fast(reader)?; + }, + b"ln" => { + if let Some(w) = xml::optional_attr_str(e, b"w")? { + *stroke_w_emu = w.parse().ok(); + } + *stroke_rgb = parse_line_color(reader)?.or(*stroke_rgb); + }, + b"solidFill" => { + *fill_rgb = parse_solid_fill_color(reader)?.or(*fill_rgb); + }, + _ => { + xml::skip_element_fast(reader)?; + }, + }, + Event::Empty(ref e) => match e.local_name().as_ref() { + b"prstGeom" => { + if let Some(prst) = xml::optional_attr_str(e, b"prst")? { + *kind = match prst.as_ref() { + "line" | "straightConnector1" => Some(ShapeKind::Line), + "rect" => Some(ShapeKind::Rect), + _ => *kind, + }; + } + }, + b"ln" => { + if let Some(w) = xml::optional_attr_str(e, b"w")? { + *stroke_w_emu = w.parse().ok(); + } + }, + _ => {}, + }, + Event::End(ref e) if e.local_name().as_ref() == b"spPr" => break, + Event::Eof => break, + _ => {}, + } + } + + Ok(()) +} + +/// Parse `` looking for an inner ``. +/// Reads through ``. +fn parse_line_color(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult> { + let mut rgb: Option<(u8, u8, u8)> = None; + + loop { + match reader.read_event()? { + Event::Start(ref e) => match e.local_name().as_ref() { + b"solidFill" => { + if let Some(c) = parse_solid_fill_color(reader)? 
{ + rgb = Some(c); + } + }, + _ => { + xml::skip_element_fast(reader)?; + }, + }, + Event::End(ref e) if e.local_name().as_ref() == b"ln" => break, + Event::Eof => break, + _ => {}, + } + } + + Ok(rgb) +} + +/// Parse `` looking for an inner ``. +/// Reads through ``. +fn parse_solid_fill_color( + reader: &mut quick_xml::Reader<&[u8]>, +) -> CoreResult> { + let mut rgb: Option<(u8, u8, u8)> = None; + + loop { + match reader.read_event()? { + Event::Start(ref e) => { + if e.local_name().as_ref() == b"srgbClr" { + if let Some(val) = xml::optional_attr_str(e, b"val")? { + if let Some(parsed) = parse_hex_rgb(&val) { + rgb = Some(parsed); + } + } + } + xml::skip_element_fast(reader)?; + }, + Event::Empty(ref e) if e.local_name().as_ref() == b"srgbClr" => { + if let Some(val) = xml::optional_attr_str(e, b"val")? { + if let Some(parsed) = parse_hex_rgb(&val) { + rgb = Some(parsed); + } + } + }, + Event::End(ref e) if e.local_name().as_ref() == b"solidFill" => break, + Event::Eof => break, + _ => {}, + } + } + + Ok(rgb) +} + +fn parse_anchor_frame(s: &str) -> crate::docx::image::AnchorFrame { + use crate::docx::image::AnchorFrame; + match s { + "page" => AnchorFrame::Page, + "margin" | "leftMargin" | "rightMargin" | "topMargin" | "bottomMargin" | "insideMargin" + | "outsideMargin" => AnchorFrame::Margin, + "column" => AnchorFrame::Column, + "paragraph" => AnchorFrame::Paragraph, + "line" => AnchorFrame::Line, + "character" => AnchorFrame::Character, + _ => AnchorFrame::Page, + } +} + +fn parse_hex_rgb(s: &str) -> Option<(u8, u8, u8)> { + let bytes = s.trim().as_bytes(); + if bytes.len() != 6 { + return None; } + fn hex_pair(a: u8, b: u8) -> Option { + let h = |c: u8| match c { + b'0'..=b'9' => Some(c - b'0'), + b'a'..=b'f' => Some(10 + c - b'a'), + b'A'..=b'F' => Some(10 + c - b'A'), + _ => None, + }; + Some((h(a)? << 4) | h(b)?) 
+ } + let r = hex_pair(bytes[0], bytes[1])?; + let g = hex_pair(bytes[2], bytes[3])?; + let b = hex_pair(bytes[4], bytes[5])?; + Some((r, g, b)) } fn parse_extent_attrs(e: &quick_xml::events::BytesStart, width: &mut Emu, height: &mut Emu) { @@ -822,7 +1305,7 @@ fn parse_table_width(e: &quick_xml::events::BytesStart) -> CoreResult, _start: &quick_xml::events::BytesStart, ) -> CoreResult { @@ -917,6 +1400,36 @@ fn parse_section_properties( Ok(props) } +/// Recover the original face name from an embedded-font filename +/// produced by `core::embedded_fonts::write_embedded_fonts`. The +/// writer ships fonts as `font__.` where `` is +/// the original face name (with `/`, `?`, `*` etc. sanitized to `_` +/// — but NOT alphabetic characters, which earlier callers' naive +/// `trim_end_matches(alphabetic)` was greedily eating). +/// +/// Examples: +/// `font_4_TeXGyreTermesX-Regular.ttf` → `TeXGyreTermesX-Regular` +/// `font_1_NewTXBMI.ttf` → `NewTXBMI` +/// `font.otf` → `` (caller falls back to basename) +pub(crate) fn strip_embedded_font_filename(basename: &str) -> String { + // Drop extension. + let stem = match basename.rfind('.') { + Some(i) => &basename[..i], + None => basename, + }; + // Strip the `font__` prefix when present. + if let Some(rest) = stem.strip_prefix("font_") { + if let Some(under_idx) = rest.find('_') { + // Everything before the underscore must be digits; + // otherwise treat the whole stem as the face name. + if rest[..under_idx].chars().all(|c| c.is_ascii_digit()) { + return rest[under_idx + 1..].to_string(); + } + } + } + stem.to_string() +} + fn parse_hf_type(e: &quick_xml::events::BytesStart) -> CoreResult { Ok(match xml::optional_attr_str(e, b"w:type")? 
{ Some(ref val) => match val.as_ref() { @@ -1171,6 +1684,86 @@ mod tests { assert!(md.contains("| Cell1 | Cell2 |")); } + #[test] + fn parse_drawing_anchor_position() { + let xml = + br#" + + 914400 + 457200 + + + + + + + + "#; + let mut reader = make_content_reader(xml); + // Advance past the outer Start so parse_drawing + // sees the inner contents (it expects to be entered with + // depth=1 already accounting for that wrapper). + loop { + match reader.read_event().unwrap() { + quick_xml::events::Event::Start(ref e) if e.local_name().as_ref() == b"drawing" => { + break; + }, + quick_xml::events::Event::Eof => panic!("no drawing"), + _ => {}, + } + } + let info = parse_drawing(&mut reader).unwrap().expect("drawing"); + assert!(!info.inline); + let pos = info.anchor_position.expect("anchor position"); + assert_eq!(pos.x_emu, 914400); + assert_eq!(pos.y_emu, 457200); + assert_eq!(pos.h_relative_from, crate::docx::AnchorFrame::Page); + assert_eq!(info.relationship_id, "rId7"); + } + + #[test] + fn parse_drawing_wsp_line_shape() { + let xml = + br#" + + 100000 + 200000 + + + + + + + + + + + + + "#; + let mut reader = make_content_reader(xml); + loop { + match reader.read_event().unwrap() { + quick_xml::events::Event::Start(ref e) if e.local_name().as_ref() == b"drawing" => { + break; + }, + quick_xml::events::Event::Eof => panic!("no drawing"), + _ => {}, + } + } + let info = parse_drawing(&mut reader).unwrap().expect("drawing"); + let shape = info.shape.expect("shape"); + assert_eq!(shape.kind, crate::docx::ShapeKind::Line); + assert_eq!(shape.stroke_rgb, Some((0xFF, 0x00, 0x00))); + assert_eq!(shape.stroke_w_emu, Some(9525)); + } + #[test] fn section_properties() { let xml = br#" @@ -1193,4 +1786,60 @@ mod tests { let margins = sect.margins.as_ref().unwrap(); assert_eq!(margins.left.0, 1800); } + + // ── strip_embedded_font_filename ──────────────────────────────────── + + #[test] + fn strip_embedded_font_writer_convention() { + // Writer convention: font__. 
+ assert_eq!( + strip_embedded_font_filename("font_4_TeXGyreTermesX-Regular.ttf"), + "TeXGyreTermesX-Regular" + ); + assert_eq!(strip_embedded_font_filename("font_1_NewTXBMI.ttf"), "NewTXBMI"); + assert_eq!(strip_embedded_font_filename("font_12_DejaVuSans.otf"), "DejaVuSans"); + } + + #[test] + fn strip_embedded_font_no_prefix_keeps_stem() { + // No `font__` prefix → return the stem unchanged. + assert_eq!(strip_embedded_font_filename("Arial.ttf"), "Arial"); + assert_eq!(strip_embedded_font_filename("MyFont.otf"), "MyFont"); + } + + #[test] + fn strip_embedded_font_no_extension() { + // No extension → use the whole input. + assert_eq!(strip_embedded_font_filename("font_1_Calibri"), "Calibri"); + assert_eq!(strip_embedded_font_filename("Calibri"), "Calibri"); + } + + #[test] + fn strip_embedded_font_non_digit_prefix_keeps_stem() { + // `font_xxx_` where xxx isn't digits → don't strip. + assert_eq!(strip_embedded_font_filename("font_abc_Foo.ttf"), "font_abc_Foo"); + } + + #[test] + fn strip_embedded_font_alphabetic_face_preserved() { + // Regression: greedy trim_end_matches(alphabetic) used to eat + // the face name. Verify a face with trailing alphabetic chars + // survives intact. + assert_eq!( + strip_embedded_font_filename("font_4_TeXGyreTermesX-Bold.ttf"), + "TeXGyreTermesX-Bold" + ); + } + + #[test] + fn strip_embedded_font_empty() { + assert_eq!(strip_embedded_font_filename(""), ""); + } + + #[test] + fn strip_embedded_font_no_face_after_prefix() { + // `font__` with nothing after the underscore → empty face. + // Caller of this helper falls back to the full basename. + assert_eq!(strip_embedded_font_filename("font_5_.ttf"), ""); + } } diff --git a/src/docx/text.rs b/src/docx/text.rs index 1e2c1bb..3a5bc13 100644 --- a/src/docx/text.rs +++ b/src/docx/text.rs @@ -24,21 +24,104 @@ impl DocxDocument { } /// Convert the document to Markdown. 
+ /// + /// Includes headers and footers around the body so a downstream + /// renderer (PDF, HTML, search index) sees the full visible content + /// of every page. Without this, simple-but-meaningful artefacts like + /// `My header` / `My footer` are silently dropped. pub fn to_markdown(&self) -> String { let mut out = String::new(); let ctx = MarkdownCtx { styles: self.styles.as_ref(), numbering: self.numbering.as_ref(), }; + + // Headers (deduped on text content — headers may be repeated for + // first-page / even / default variants but the text is usually the + // same; we only want one copy in flat markdown). + let mut seen: std::collections::HashSet = std::collections::HashSet::new(); + for hf in &self.headers_footers { + if !matches!( + hf.hf_type, + super::HeaderFooterType::Default + | super::HeaderFooterType::First + | super::HeaderFooterType::Even + ) { + continue; + } + let mut buf = String::new(); + markdown_blocks(&hf.content, &ctx, &mut buf, 0); + let trimmed = buf.trim(); + // Skip empty headers/footers and duplicates. + if trimmed.is_empty() || !seen.insert(trimmed.to_string()) { + continue; + } + // We don't currently know which side (header vs footer) this + // came from at this layer — `HeaderFooter` carries only the + // type modifier (default/first/even). The body sits between + // the headers and footers we emit, so we put all headers + // before and all footers after the body. + } + + // Decide header/footer split using each section's references. 
+ let (header_texts, footer_texts) = split_headers_footers(self, &ctx); + for h in &header_texts { + out.push_str(h); + out.push_str("\n\n"); + } + markdown_blocks(&self.body.elements, &ctx, &mut out, 0); + + for f in &footer_texts { + if !out.ends_with("\n\n") { + out.push_str("\n\n"); + } + out.push_str(f); + out.push('\n'); + } + // Trim trailing newlines while out.ends_with('\n') { out.pop(); } + let _ = seen; // silence out } } +/// Split parsed `HeaderFooter` entries into headers vs footers using the +/// section reference lists. Returns (headers, footers) as deduplicated +/// markdown-string vectors. We don't currently retain the relationship +/// IDs that map a section ref to a specific parsed `HeaderFooter`, so we +/// approximate: header_refs.len() entries from the front go to headers, +/// the rest go to footers. Correct for the common case (single section +/// with one of each); on multi-variant documents some misclassification +/// is possible but text is still preserved (just maybe in the wrong slot). 
+fn split_headers_footers(doc: &DocxDocument, ctx: &MarkdownCtx) -> (Vec, Vec) { + let mut headers: Vec = Vec::new(); + let mut footers: Vec = Vec::new(); + let mut header_seen: std::collections::HashSet = std::collections::HashSet::new(); + let mut footer_seen: std::collections::HashSet = std::collections::HashSet::new(); + + let n_header_refs: usize = doc.sections.iter().map(|s| s.header_refs.len()).sum(); + for (idx, hf) in doc.headers_footers.iter().enumerate() { + let mut buf = String::new(); + markdown_blocks(&hf.content, ctx, &mut buf, 0); + let t = buf.trim().to_string(); + if t.is_empty() { + continue; + } + if idx < n_header_refs { + if header_seen.insert(t.clone()) { + headers.push(t); + } + } else if footer_seen.insert(t.clone()) { + footers.push(t); + } + } + (headers, footers) +} + fn plain_text_blocks(elements: &[BlockElement], out: &mut String) { for elem in elements { match elem { diff --git a/src/docx/write.rs b/src/docx/write.rs index 914e9e2..c0ea70c 100644 --- a/src/docx/write.rs +++ b/src/docx/write.rs @@ -47,6 +47,8 @@ use super::Result; const CT_DOCUMENT: &str = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"; const CT_STYLES: &str = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"; +const CT_FONT_TABLE: &str = + "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml"; const CT_NUMBERING: &str = "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml"; const CT_HEADER: &str = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"; @@ -443,6 +445,11 @@ pub struct DocxWriter { endnotes: Vec, core_props: Option, next_num_id: u32, + /// Embedded font programs to ship inside the package under `word/fonts/`. + /// Each entry is `(font_name, ttf_or_otf_bytes)`. The reader recognizes + /// these and re-uses them to render any downstream conversion (notably + /// PDF) so a PDF→DOCX→PDF round-trip preserves typeface fidelity. 
+ embedded_fonts: Vec<(String, Vec)>, } impl DocxWriter { @@ -456,9 +463,21 @@ impl DocxWriter { endnotes: Vec::new(), core_props: None, next_num_id: 3, + embedded_fonts: Vec::new(), } } + /// Embed a font program (TrueType / OpenType bytes) under `word/fonts/`. + /// `name` is used for the file name and as the human-readable font name. + /// Subsequent calls with the same name are deduplicated. + pub fn embed_font(&mut self, name: impl Into, data: Vec) -> &mut Self { + let name = name.into(); + if !self.embedded_fonts.iter().any(|(n, _)| n == &name) { + self.embedded_fonts.push((name, data)); + } + self + } + /// Add a plain paragraph with the given text. pub fn add_paragraph(&mut self, text: &str) -> &mut Self { self.elements @@ -795,6 +814,42 @@ impl DocxWriter { }); } + // --- Embed fonts --- + // Three pieces have to land together so Word/LibreOffice + // actually pick up the font programs: + // + // 1. The TTF/OTF parts under `/word/fonts/font__.ttf`. + // 2. `/word/fontTable.xml` listing each font name with an + // `` reference. + // 3. `/word/_rels/fontTable.xml.rels` mapping each rId from + // step 2 to the matching font part. + // 4. A relationship in `word/_rels/document.xml.rels` of type + // `…/fontTable` so Word knows where to find fontTable.xml. + // + // Without all four, the in-process reader still finds the TTFs + // by directory scan, but Word silently substitutes Calibri. + if !self.embedded_fonts.is_empty() { + let font_table_part = PartName::new("/word/fontTable.xml")?; + opc.add_part_rel(&doc_part, rel_types::FONT_TABLE, "fontTable.xml"); + + // Each font part + the fontTable→font rel. 
+ let mut font_entries: Vec<(String, String)> = + Vec::with_capacity(self.embedded_fonts.len()); + for (idx, (name, data)) in self.embedded_fonts.iter().enumerate() { + let n = idx + 1; + let safe = crate::core::embedded_fonts::sanitize_font_filename(name); + let target_rel = format!("fonts/font_{n}_{safe}.ttf"); + let target_abs = format!("/word/fonts/font_{n}_{safe}.ttf"); + let part = PartName::new(&target_abs)?; + opc.add_part(&part, "application/x-font-ttf", data)?; + let rid = opc.add_part_rel(&font_table_part, rel_types::FONT, &target_rel); + font_entries.push((name.clone(), rid)); + } + + let xml = generate_font_table_xml(&font_entries); + opc.add_part(&font_table_part, CT_FONT_TABLE, &xml)?; + } + // --- Register headers/footers --- let mut hf_rids: Vec<(HfType, String)> = Vec::new(); for (i, hf) in self.headers_footers.iter().enumerate() { @@ -961,8 +1016,36 @@ impl DocxWriter { w.write_event(Event::Start(BytesStart::new("w:body"))) .expect("write body start"); + // Multi-section DOCX: each non-final `` lives inside the + // `` of a paragraph that terminates that section. Only the + // final sectPr sits at body level. The previous implementation + // dropped every non-final SectPr on the floor, so a multi-section + // IR (e.g. one section per source PDF page from `pdf_to_ir`) + // collapsed into a single section on the read side and lost all + // per-page geometry. + // + // Find the last `DocxElement::SectPr` index — that's the final + // section, written at body level. Every earlier SectPr is emitted + // as a synthetic empty paragraph carrying just ``, + // which `parse_paragraph_properties_fast` recognises and pushes + // into `body.section_breaks`. `docx_to_ir` then walks + // `section_breaks` to slice elements into per-section windows. 
+ let last_sectpr_idx: Option = self + .elements + .iter() + .rposition(|e| matches!(e, DocxElement::SectPr(_))); + let mut image_counter = 0u32; - for element in &self.elements { + for (idx, element) in self.elements.iter().enumerate() { + if let DocxElement::SectPr(sp) = element { + if Some(idx) == last_sectpr_idx { + // Final section is rendered as the body-level sectPr + // below (uses the `sect_pr` info already gathered). + continue; + } + write_inline_section_break_paragraph(&mut w, sp); + continue; + } write_docx_element(&mut w, element, image_rids, &mut image_counter); } @@ -1215,6 +1298,11 @@ fn convert_ir_element_to_docx_elements(elem: &crate::ir::Element, out: &mut Vec< }, E::Footnote(_) | E::Endnote(_) => {}, E::CodeBlock(cb) => out.push(DocxElement::CodeBlock(cb.content.clone())), + E::Shape(_) => { + // Vector shapes are emitted by the layout-preserving DOCX + // writer in pdf_oxide directly; the markdown-driven IR + // writer doesn't have anywhere to put them yet. + }, } } @@ -2624,6 +2712,114 @@ fn write_floating_image_run( .expect("write p end"); } +/// Emit a synthetic empty paragraph that carries an inline `` +/// inside its ``. Used for non-final section boundaries — the +/// paragraph is what marks the section break for the reader; its +/// `` describes the section ending at this point. We don't +/// emit hf / footnote references on inline sectPr (they're document-wide +/// and live on the body-level final sectPr only). 
+fn write_inline_section_break_paragraph(w: &mut Writer>, sp: &DocxSectPr) { + w.write_event(Event::Start(BytesStart::new("w:p"))) + .expect("write inline-section p start"); + w.write_event(Event::Start(BytesStart::new("w:pPr"))) + .expect("write inline-section pPr start"); + write_section_pr_body(w, sp.page_setup.as_ref(), sp.columns.as_ref(), &sp.break_type); + w.write_event(Event::End(BytesEnd::new("w:pPr"))) + .expect("write inline-section pPr end"); + w.write_event(Event::End(BytesEnd::new("w:p"))) + .expect("write inline-section p end"); +} + +/// Shared `...` body writer — used by both the +/// body-level final sectPr and inline (per-paragraph) section breaks. +/// Caller writes the surrounding ``/`` tags. +fn write_section_pr_body( + w: &mut Writer>, + page_setup: Option<&PageSetup>, + columns: Option<&ColumnLayout>, + break_type: &SectionBreakType, +) { + w.write_event(Event::Start(BytesStart::new("w:sectPr"))) + .expect("write sectPr start"); + + match break_type { + SectionBreakType::Continuous => { + // Continuous is the default; emit it explicitly so the reader + // doesn't pick up a stale value from a sibling section. 
+ let mut t = BytesStart::new("w:type"); + t.push_attribute(("w:val", "continuous")); + w.write_event(Event::Empty(t)).expect("write sect type"); + }, + SectionBreakType::NextPage => { + let mut t = BytesStart::new("w:type"); + t.push_attribute(("w:val", "nextPage")); + w.write_event(Event::Empty(t)).expect("write sect type"); + }, + SectionBreakType::EvenPage => { + let mut t = BytesStart::new("w:type"); + t.push_attribute(("w:val", "evenPage")); + w.write_event(Event::Empty(t)).expect("write sect type"); + }, + SectionBreakType::OddPage => { + let mut t = BytesStart::new("w:type"); + t.push_attribute(("w:val", "oddPage")); + w.write_event(Event::Empty(t)).expect("write sect type"); + }, + } + + if let Some(ps) = page_setup { + let mut pg_sz = BytesStart::new("w:pgSz"); + pg_sz.push_attribute(("w:w", ps.width_twips.to_string().as_str())); + pg_sz.push_attribute(("w:h", ps.height_twips.to_string().as_str())); + if ps.landscape { + pg_sz.push_attribute(("w:orient", "landscape")); + } + w.write_event(Event::Empty(pg_sz)).expect("write pgSz"); + + let mut pg_mar = BytesStart::new("w:pgMar"); + pg_mar.push_attribute(("w:top", ps.margin_top_twips.to_string().as_str())); + pg_mar.push_attribute(("w:bottom", ps.margin_bottom_twips.to_string().as_str())); + pg_mar.push_attribute(("w:left", ps.margin_left_twips.to_string().as_str())); + pg_mar.push_attribute(("w:right", ps.margin_right_twips.to_string().as_str())); + pg_mar.push_attribute(("w:header", ps.header_distance_twips.to_string().as_str())); + pg_mar.push_attribute(("w:footer", ps.footer_distance_twips.to_string().as_str())); + w.write_event(Event::Empty(pg_mar)).expect("write pgMar"); + } + + if let Some(cols) = columns { + if cols.column_widths_twips.is_empty() { + let mut c = BytesStart::new("w:cols"); + c.push_attribute(("w:num", cols.count.to_string().as_str())); + if let Some(sp) = cols.space_twips { + c.push_attribute(("w:space", sp.to_string().as_str())); + } + if cols.separator { + 
c.push_attribute(("w:sep", "1")); + } + w.write_event(Event::Empty(c)).expect("write cols"); + } else { + let mut c = BytesStart::new("w:cols"); + c.push_attribute(("w:num", cols.count.to_string().as_str())); + if cols.separator { + c.push_attribute(("w:sep", "1")); + } + w.write_event(Event::Start(c)).expect("write cols start"); + let default_space = cols.space_twips.unwrap_or(720); + for &cw in &cols.column_widths_twips { + let mut col = BytesStart::new("w:col"); + col.push_attribute(("w:w", cw.to_string().as_str())); + col.push_attribute(("w:space", default_space.to_string().as_str())); + w.write_event(Event::Empty(col)).expect("write col"); + } + w.write_event(Event::End(BytesEnd::new("w:cols"))) + .expect("write cols end"); + } + } + + w.write_event(Event::End(BytesEnd::new("w:sectPr"))) + .expect("write sectPr end"); +} + fn write_body_sect_pr(w: &mut Writer>, sp: &SectPrInfo) { w.write_event(Event::Start(BytesStart::new("w:sectPr"))) .expect("write sectPr start"); @@ -2895,6 +3091,46 @@ fn generate_core_props_xml(props: &CoreProps) -> Vec { w.into_inner() } +// --------------------------------------------------------------------------- +// fontTable.xml generator +// --------------------------------------------------------------------------- + +/// Build `word/fontTable.xml` listing each embedded font with an +/// `` reference. Word looks up `` names against this table and uses the embedded +/// program when there's a match. Without it, Word silently +/// substitutes Calibri / Cambria for everything regardless of how +/// many TTFs we ship under `/word/fonts/`. 
+fn generate_font_table_xml(entries: &[(String, String)]) -> Vec { + let mut w = Writer::new_with_indent(Vec::new(), b' ', 2); + w.write_event(Event::Decl(BytesDecl::new("1.0", Some("UTF-8"), Some("yes")))) + .expect("decl"); + + let mut fonts = BytesStart::new("w:fonts"); + fonts.push_attribute(("xmlns:w", crate::core::xml::ns::WML_STR)); + fonts.push_attribute(("xmlns:r", crate::core::xml::ns::R_STR)); + w.write_event(Event::Start(fonts)).expect("fonts start"); + + for (name, rid) in entries { + let mut font = BytesStart::new("w:font"); + font.push_attribute(("w:name", name.as_str())); + w.write_event(Event::Start(font)).expect("font start"); + + // — Word treats this as the regular-weight + // glyph source for the named font face. + let mut embed = BytesStart::new("w:embedRegular"); + embed.push_attribute(("r:id", rid.as_str())); + w.write_event(Event::Empty(embed)).expect("embedRegular"); + + w.write_event(Event::End(BytesEnd::new("w:font"))) + .expect("font end"); + } + + w.write_event(Event::End(BytesEnd::new("w:fonts"))) + .expect("fonts end"); + w.into_inner() +} + // --------------------------------------------------------------------------- // Styles and numbering generators // --------------------------------------------------------------------------- @@ -2949,14 +3185,60 @@ fn write_paragraph_style( w.write_event(Event::Empty(name_elem)) .expect("write style name"); + // basedOn Normal so heading styles inherit body defaults. + if outline_level.is_some() { + let mut based = BytesStart::new("w:basedOn"); + based.push_attribute(("w:val", "Normal")); + w.write_event(Event::Empty(based)).expect("write basedOn"); + } + if let Some(level) = outline_level { w.write_event(Event::Start(BytesStart::new("w:pPr"))) .expect("write pPr start"); + // Spacing-before for visual breathing room above the heading. 
+ let mut sp = BytesStart::new("w:spacing"); + sp.push_attribute(("w:before", "240")); // 12 pt + sp.push_attribute(("w:after", "120")); // 6 pt + w.write_event(Event::Empty(sp)).expect("write spacing"); let mut lvl = BytesStart::new("w:outlineLvl"); lvl.push_attribute(("w:val", level.to_string().as_str())); w.write_event(Event::Empty(lvl)).expect("write outlineLvl"); w.write_event(Event::End(BytesEnd::new("w:pPr"))) .expect("write pPr end"); + + // Run properties — size & bold per Word's default heading scale. + // Without this, every in the body + // renders as plain Normal — the headings disappear visually. + let (sz_half_pt, bold, italic, color) = match level { + 0 => (56, true, false, "2F5496"), // Heading 1: 28 pt + 1 => (44, true, false, "2F5496"), // Heading 2: 22 pt + 2 => (32, true, false, "1F3864"), // Heading 3: 16 pt + 3 => (28, true, true, "2F5496"), // Heading 4: 14 pt italic + 4 => (24, true, false, "2F5496"), // Heading 5: 12 pt + _ => (22, true, true, "1F3864"), // Heading 6: 11 pt italic + }; + w.write_event(Event::Start(BytesStart::new("w:rPr"))) + .expect("write rPr start"); + if bold { + w.write_event(Event::Empty(BytesStart::new("w:b"))) + .expect("write b"); + } + if italic { + w.write_event(Event::Empty(BytesStart::new("w:i"))) + .expect("write i"); + } + let mut col = BytesStart::new("w:color"); + col.push_attribute(("w:val", color)); + w.write_event(Event::Empty(col)).expect("write color"); + let sz_str = sz_half_pt.to_string(); + let mut sz = BytesStart::new("w:sz"); + sz.push_attribute(("w:val", sz_str.as_str())); + w.write_event(Event::Empty(sz)).expect("write sz"); + let mut sz_cs = BytesStart::new("w:szCs"); + sz_cs.push_attribute(("w:val", sz_str.as_str())); + w.write_event(Event::Empty(sz_cs)).expect("write szCs"); + w.write_event(Event::End(BytesEnd::new("w:rPr"))) + .expect("write rPr end"); } w.write_event(Event::End(BytesEnd::new("w:style"))) diff --git a/src/ir.rs b/src/ir.rs index 093615b..1ebe1e9 100644 --- a/src/ir.rs +++ 
b/src/ir.rs @@ -550,6 +550,12 @@ pub struct Section { pub even_page_header: Option, /// Footer used on even-numbered pages of this section. pub even_page_footer: Option, + /// Solid background colour for this section (RGB). + /// PPTX: parsed from `` on the slide. + /// Image / gradient backgrounds are intentionally skipped — only the + /// solid case round-trips through this minimal field. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub background_rgb: Option<[u8; 3]>, } /// A block-level content element. @@ -581,15 +587,77 @@ pub enum Element { Endnote(Note), /// A preformatted code block. CodeBlock(CodeBlock), + /// A vector shape (line / rectangle) anchored on the page. Used by + /// the layout-preserving DOCX path to round-trip rules and dividers. + Shape(Shape), +} + +/// A vector shape anchored at absolute page coordinates. +#[allow(dead_code)] +#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "snake_case")] +pub struct Shape { + /// Geometry kind. + pub kind: ShapeGeom, + /// X offset from the anchor in EMUs. + pub x_emu: i64, + /// Y offset from the anchor in EMUs. + pub y_emu: i64, + /// Width in EMUs. + pub width_emu: u64, + /// Height in EMUs. + pub height_emu: u64, + /// Horizontal anchor reference frame. + #[serde(default)] + pub h_anchor: FloatAnchor, + /// Vertical anchor reference frame. + #[serde(default)] + pub v_anchor: FloatAnchor, + /// Stroke colour as RGB (0..255). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub stroke_rgb: Option<[u8; 3]>, + /// Fill colour as RGB (0..255). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub fill_rgb: Option<[u8; 3]>, + /// Stroke width in EMUs. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub stroke_w_emu: Option, +} + +/// Vector-shape geometry kinds we currently round-trip. 
+#[allow(dead_code)] +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ShapeGeom { + /// Straight line from `(x, y)` to `(x + width, y + height)`. + #[default] + Line, + /// Axis-aligned rectangle. + Rect, } /// A heading element with a nesting level. #[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)] pub struct Heading { /// Heading level 1–6 (1 = largest). + #[serde(default = "default_heading_level")] pub level: u8, /// Inline content of the heading. + #[serde(default)] pub content: Vec, + /// Absolute frame position for layout-preserving DOCX + /// (mirrors `Paragraph::frame_position`). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub frame_position: Option, + /// Horizontal alignment (mirrors `Paragraph::alignment`). PDF + /// title pages often centre their headings; without this the + /// round-trip flattens them to left-aligned. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub alignment: Option, +} + +fn default_heading_level() -> u8 { + 1 } /// A paragraph of inline content. @@ -626,6 +694,27 @@ pub struct Paragraph { pub page_break_before: bool, /// Outline level (0 = body text, 1–9 = heading levels). pub outline_level: Option, + /// Absolute frame position (from ``). Present when the + /// DOCX uses page-anchored frames for layout-preserving content + /// (see pdf_oxide's `to_docx_bytes_layout`). Twips relative to the + /// page origin (top-left). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub frame_position: Option, +} + +/// Absolute frame position for a paragraph anchored to the page. +/// Mirrors the OOXML `` attribute set we care about for +/// reproducing visual layout in downstream renderers. +#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)] +pub struct FramePosition { + /// X position in twips, anchored to the page origin (top-left). 
+ pub x_twips: i32, + /// Y position in twips, anchored to the page origin (top-left). + pub y_twips: i32, + /// Frame width in twips. + pub width_twips: i32, + /// Frame height in twips. + pub height_twips: i32, } /// Inline content within a paragraph or heading. @@ -643,6 +732,26 @@ pub enum InlineContent { EndnoteRef(FootnoteRef), } +/// Pick the dominant font size (in points) for a paragraph's worth of +/// inline content. Returns the *first* declared `font_size_half_pt`, +/// converted from half-points to points (e.g. 18 half-pt → 9 pt). +/// +/// Used by both renderers and writers when one paragraph-level size is +/// needed: the IR groups runs into a paragraph by line clustering, so +/// the size on the first span is representative of the body text. +/// Mixed-size paragraphs (drop-caps, math marks mid-line) lose the +/// variation — that's the deliberate trade-off. +pub fn first_inline_font_size_pt(content: &[InlineContent]) -> Option { + for ic in content { + if let InlineContent::Text(span) = ic { + if let Some(half_pt) = span.font_size_half_pt { + return Some(half_pt as f32 / 2.0); + } + } + } + None +} + /// A styled run of text. #[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)] pub struct TextSpan { @@ -789,6 +898,69 @@ pub struct ListItem { pub nested: Option, } +/// Wrap a non-empty inline-content vector into a single-Paragraph +/// block, or return an empty Vec if the inline content is empty. +/// Used by list builders to turn each item's inline run into its +/// `Vec` content slot. +pub fn inline_to_element_block(content: Vec) -> Vec { + if content.is_empty() { + Vec::new() + } else { + vec![Element::Paragraph(Paragraph { + content, + ..Default::default() + })] + } +} + +/// Build a nested `List` from a flat `(level, inline)` sequence. +/// +/// Items whose level matches `base_level` (or is shallower) become +/// `ListItem`s at the current depth. 
Items whose level is *deeper* +/// than `base_level` are recursively grouped into the most recent +/// item's `nested` sub-list. Levels are 0-indexed. +/// +/// Used by both `convert_docx` and `convert_pptx` to translate flat +/// `` / `` paragraph streams into the +/// IR's tree-shaped `List`. +pub fn build_nested_list( + ordered: bool, + items: &[(u8, Vec)], + base_level: u8, +) -> List { + let mut list_items = Vec::new(); + let mut idx = 0; + + while idx < items.len() { + let (level, content) = &items[idx]; + let nested_start = idx + 1; + let mut nested_end = nested_start; + while nested_end < items.len() && items[nested_end].0 > base_level { + nested_end += 1; + } + let nested = if *level <= base_level && nested_end > nested_start { + Some(build_nested_list(ordered, &items[nested_start..nested_end], base_level + 1)) + } else { + None + }; + list_items.push(ListItem { + content: inline_to_element_block(content.clone()), + nested, + }); + idx = if nested_end > nested_start { + nested_end + } else { + idx + 1 + }; + } + + List { + ordered, + items: list_items, + ..Default::default() + } +} + /// An embedded image reference. #[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)] pub struct Image { @@ -813,3 +985,194 @@ pub struct Image { #[serde(default)] pub positioning: ImagePositioning, } + +#[cfg(test)] +mod tests { + use super::*; + + // ── first_inline_font_size_pt ──────────────────────────────────── + + #[test] + fn first_font_size_returns_half_pt_as_pt() { + let content = vec![InlineContent::Text(TextSpan { + text: "hi".into(), + font_size_half_pt: Some(24), // 12pt + ..Default::default() + })]; + assert_eq!(first_inline_font_size_pt(&content), Some(12.0)); + } + + #[test] + fn first_font_size_picks_first_declared() { + // Second span's size is ignored — the first declared one wins. 
+ let content = vec![ + InlineContent::Text(TextSpan { + text: "a".into(), + font_size_half_pt: Some(20), // 10pt + ..Default::default() + }), + InlineContent::Text(TextSpan { + text: "b".into(), + font_size_half_pt: Some(48), // 24pt — ignored + ..Default::default() + }), + ]; + assert_eq!(first_inline_font_size_pt(&content), Some(10.0)); + } + + #[test] + fn first_font_size_skips_unsized_runs() { + // First run has no size; second does → returns the second's size. + let content = vec![ + InlineContent::Text(TextSpan { + text: "a".into(), + ..Default::default() + }), + InlineContent::Text(TextSpan { + text: "b".into(), + font_size_half_pt: Some(16), // 8pt + ..Default::default() + }), + ]; + assert_eq!(first_inline_font_size_pt(&content), Some(8.0)); + } + + #[test] + fn first_font_size_empty_returns_none() { + assert_eq!(first_inline_font_size_pt(&[]), None); + } + + #[test] + fn first_font_size_all_unsized_returns_none() { + let content = vec![ + InlineContent::Text(TextSpan::plain("a")), + InlineContent::Text(TextSpan::plain("b")), + ]; + assert_eq!(first_inline_font_size_pt(&content), None); + } + + // ── inline_to_element_block ────────────────────────────────────── + + #[test] + fn inline_to_element_block_empty_returns_empty() { + let result = inline_to_element_block(vec![]); + assert!(result.is_empty()); + } + + #[test] + fn inline_to_element_block_wraps_in_paragraph() { + let inline = vec![InlineContent::Text(TextSpan::plain("hello"))]; + let result = inline_to_element_block(inline); + assert_eq!(result.len(), 1); + match &result[0] { + Element::Paragraph(p) => { + assert_eq!(p.content.len(), 1); + assert!(matches!( + &p.content[0], + InlineContent::Text(s) if s.text == "hello" + )); + }, + _ => panic!("expected Paragraph"), + } + } + + // ── build_nested_list ──────────────────────────────────────────── + + fn item(level: u8, text: &str) -> (u8, Vec) { + (level, vec![InlineContent::Text(TextSpan::plain(text))]) + } + + fn list_item_text(item: &ListItem) 
-> String { + let mut out = String::new(); + for el in &item.content { + if let Element::Paragraph(p) = el { + for c in &p.content { + if let InlineContent::Text(s) = c { + out.push_str(&s.text); + } + } + } + } + out + } + + #[test] + fn build_nested_list_flat() { + let items = vec![item(0, "A"), item(0, "B"), item(0, "C")]; + let list = build_nested_list(false, &items, 0); + assert!(!list.ordered); + assert_eq!(list.items.len(), 3); + assert!(list.items.iter().all(|li| li.nested.is_none())); + assert_eq!(list_item_text(&list.items[1]), "B"); + } + + #[test] + fn build_nested_list_two_levels() { + // Top: A + // sub: A.1, A.2 + // Top: B + let items = vec![item(0, "A"), item(1, "A.1"), item(1, "A.2"), item(0, "B")]; + let list = build_nested_list(true, &items, 0); + assert!(list.ordered); + assert_eq!(list.items.len(), 2); + let nested = list.items[0].nested.as_ref().expect("A has nested"); + assert_eq!(nested.items.len(), 2); + assert_eq!(list_item_text(&nested.items[0]), "A.1"); + assert_eq!(list_item_text(&nested.items[1]), "A.2"); + // B has no nested children. 
+ assert!(list.items[1].nested.is_none()); + } + + #[test] + fn build_nested_list_three_levels() { + let items = vec![item(0, "A"), item(1, "A.1"), item(2, "A.1.x"), item(0, "B")]; + let list = build_nested_list(false, &items, 0); + let l1 = list.items[0].nested.as_ref().unwrap(); + assert_eq!(l1.items.len(), 1); + let l2 = l1.items[0].nested.as_ref().unwrap(); + assert_eq!(l2.items.len(), 1); + assert_eq!(list_item_text(&l2.items[0]), "A.1.x"); + } + + #[test] + fn build_nested_list_empty() { + let list = build_nested_list(false, &[], 0); + assert!(list.items.is_empty()); + } + + // ── TextSpan::plain ────────────────────────────────────────────── + + #[test] + fn text_span_plain_has_default_styling() { + let s = TextSpan::plain("hi"); + assert_eq!(s.text, "hi"); + assert!(!s.bold); + assert!(!s.italic); + assert!(s.font_size_half_pt.is_none()); + assert!(s.hyperlink.is_none()); + } + + // ── FramePosition / Shape defaults ─────────────────────────────── + + #[test] + fn shape_default_is_line_at_origin() { + let s = Shape::default(); + assert!(matches!(s.kind, ShapeGeom::Line)); + assert_eq!(s.x_emu, 0); + assert_eq!(s.width_emu, 0); + assert!(s.stroke_rgb.is_none()); + } + + #[test] + fn frame_position_round_trips_via_serde() { + let fp = FramePosition { + x_twips: 720, + y_twips: 1080, + width_twips: 5000, + height_twips: 400, + }; + let json = serde_json::to_string(&fp).unwrap(); + let back: FramePosition = serde_json::from_str(&json).unwrap(); + assert_eq!(fp, back); + } +} diff --git a/src/ir_from_markdown.rs b/src/ir_from_markdown.rs index 21e4884..dc0887d 100644 --- a/src/ir_from_markdown.rs +++ b/src/ir_from_markdown.rs @@ -123,6 +123,7 @@ impl<'a> MarkdownParser<'a> { current.elements.push(Element::Heading(Heading { level, content: parse_inline(&text), + ..Default::default() })); } continue; diff --git a/src/ir_render.rs b/src/ir_render.rs index 74f8df8..3b9c8f8 100644 --- a/src/ir_render.rs +++ b/src/ir_render.rs @@ -1,5 +1,125 @@ use crate::ir::*; +mod 
block_default { + //! Default flow-rendering for [`Element`] variants that don't + //! carry a meaningful inline / paragraph / heading shape. + //! + //! Each `default_*` function is **exhaustive** over `Element`: + //! the compiler forces a decision when a new variant is added + //! ("is this variant invisible in flow output, or do specific + //! renderers need to handle it?"). Renderers in the parent + //! module keep arms only for variants where their output + //! differs from these defaults; everything else falls through + //! to the matching `default_*` here via `other => default_X(other)`. + use super::*; + use std::fmt::Write; + + /// Plain-text default. Most invisible variants → `""`; + /// `ThematicBreak` → `"---"` (matches markdown); container + /// elements recursively render their children. + pub fn default_plain(element: &Element) -> String { + match element { + Element::ThematicBreak => "---".to_string(), + Element::TextBox(tb) => tb + .content + .iter() + .map(super::render_element_plain) + .collect::>() + .join("\n\n"), + Element::Footnote(n) | Element::Endnote(n) => n + .content + .iter() + .map(super::render_element_plain) + .collect::>() + .join("\n\n"), + // Invisible in flow: shapes are positioned, not flow content; + // page/column breaks have no plain-text counterpart; an + // unannotated image shows nothing in plain text. + Element::PageBreak | Element::ColumnBreak | Element::Shape(_) | Element::Image(_) => { + String::new() + }, + // The variants below have rich flow output and shouldn't + // hit this default — `render_element_plain` handles them. + // Reaching here means a renderer forgot a real arm; we + // emit empty rather than panic so the document still + // renders, but the explicit arms below let the compiler + // catch added variants. + Element::Heading(_) + | Element::Paragraph(_) + | Element::Table(_) + | Element::List(_) + | Element::CodeBlock(_) => String::new(), + } + } + + /// Markdown default. 
Same as plain except images get an alt-text + /// `![alt]()` form. + pub fn default_markdown(element: &Element) -> String { + match element { + Element::ThematicBreak => "---".to_string(), + Element::TextBox(tb) => tb + .content + .iter() + .map(super::render_element_markdown) + .collect::>() + .join("\n\n"), + Element::Footnote(n) | Element::Endnote(n) => n + .content + .iter() + .map(super::render_element_markdown) + .collect::>() + .join("\n\n"), + Element::PageBreak | Element::ColumnBreak | Element::Shape(_) => String::new(), + Element::Image(img) => { + let alt = img.alt_text.as_deref().unwrap_or(""); + format!("![{alt}]()") + }, + Element::Heading(_) + | Element::Paragraph(_) + | Element::Table(_) + | Element::List(_) + | Element::CodeBlock(_) => String::new(), + } + } + + /// HTML default. `ThematicBreak` → `
`; images render an + /// empty `…`; everything else mirrors `default_plain` + /// behaviour with HTML escaping. + pub fn default_html(element: &Element) -> String { + match element { + Element::ThematicBreak => "
".to_string(), + Element::TextBox(tb) => tb + .content + .iter() + .map(super::render_element_html) + .collect::>() + .join("\n"), + Element::Footnote(n) | Element::Endnote(n) => n + .content + .iter() + .map(super::render_element_html) + .collect::>() + .join("\n"), + Element::PageBreak | Element::ColumnBreak | Element::Shape(_) => String::new(), + Element::Image(img) => { + let alt = img + .alt_text + .as_deref() + .map(super::escape_html) + .unwrap_or_default(); + let mut out = String::with_capacity(20 + alt.len()); + let _ = write!(out, "\"{alt}\""); + out + }, + Element::Heading(_) + | Element::Paragraph(_) + | Element::Table(_) + | Element::List(_) + | Element::CodeBlock(_) => String::new(), + } + } +} + impl DocumentIR { /// Render the IR as plain text. pub fn plain_text(&self) -> String { @@ -65,29 +185,15 @@ fn render_element_plain(element: &Element) -> String { Element::Paragraph(p) => render_inline_plain(&p.content), Element::Table(t) => render_table_plain(t), Element::List(l) => render_list_plain(l, 0), - Element::Image(img) => { - if let Some(ref alt) = img.alt_text { - format!("[{alt}]") - } else { - String::new() - } + Element::Image(img) => match &img.alt_text { + Some(alt) => format!("[{alt}]"), + None => String::new(), }, - Element::ThematicBreak => "---".to_string(), - Element::TextBox(tb) => tb - .content - .iter() - .map(render_element_plain) - .collect::>() - .join("\n\n"), - Element::PageBreak => String::new(), - Element::ColumnBreak => String::new(), - Element::Footnote(n) | Element::Endnote(n) => n - .content - .iter() - .map(render_element_plain) - .collect::>() - .join("\n\n"), Element::CodeBlock(cb) => cb.content.clone(), + // Invisible-in-flow / container variants delegated to the + // shared default. Adding a new `Element` variant forces a + // compile error in `block_default::default_plain`, not here. 
+ other => block_default::default_plain(other), } } @@ -170,29 +276,13 @@ fn render_element_markdown(element: &Element) -> String { Element::Paragraph(p) => render_inline_markdown(&p.content), Element::Table(t) => render_table_markdown(t), Element::List(l) => render_list_markdown(l, 0), - Element::Image(img) => { - let alt = img.alt_text.as_deref().unwrap_or(""); - format!("![{alt}]()") - }, - Element::ThematicBreak => "---".to_string(), - Element::TextBox(tb) => tb - .content - .iter() - .map(render_element_markdown) - .collect::>() - .join("\n\n"), - Element::PageBreak => String::new(), - Element::ColumnBreak => String::new(), - Element::Footnote(n) | Element::Endnote(n) => n - .content - .iter() - .map(render_element_markdown) - .collect::>() - .join("\n\n"), Element::CodeBlock(cb) => { let lang = cb.language.as_deref().unwrap_or(""); format!("```{lang}\n{}\n```", cb.content) }, + // Invisible-in-flow / container / image variants delegated + // to the shared default — see `block_default::default_markdown`. + other => block_default::default_markdown(other), } } @@ -358,29 +448,13 @@ fn render_element_html(element: &Element) -> String { }, Element::Table(t) => render_table_html(t), Element::List(l) => render_list_html(l), - Element::Image(img) => { - let alt = img.alt_text.as_deref().map(escape_html).unwrap_or_default(); - format!("\"{alt}\"") - }, - Element::ThematicBreak => "
".to_string(), - Element::TextBox(tb) => tb - .content - .iter() - .map(render_element_html) - .collect::>() - .join("\n"), - Element::PageBreak => String::new(), - Element::ColumnBreak => String::new(), - Element::Footnote(n) | Element::Endnote(n) => n - .content - .iter() - .map(render_element_html) - .collect::>() - .join("\n"), Element::CodeBlock(cb) => { let escaped = escape_html(&cb.content); format!("
{escaped}
") }, + // Invisible-in-flow / container / image variants delegated + // to the shared default — see `block_default::default_html`. + other => block_default::default_html(other), } } @@ -500,6 +574,7 @@ mod tests { let ir = simple_ir(vec![Element::Heading(Heading { level: 2, content: vec![span("Title")], + ..Default::default() })]); assert_eq!(ir.to_markdown(), "## Title"); } @@ -694,4 +769,58 @@ mod tests { assert!(html.contains("
  • First

  • ")); assert!(html.contains("
  • Second

  • ")); } + + // ── Defaults centralized in `block_default` ────────────────────── + + #[test] + fn thematic_break_renders_as_hr_in_plain() { + let ir = simple_ir(vec![Element::ThematicBreak]); + assert_eq!(ir.plain_text(), "---"); + } + + #[test] + fn thematic_break_renders_in_markdown() { + let ir = simple_ir(vec![Element::ThematicBreak]); + assert!(ir.to_markdown().contains("---")); + } + + #[test] + fn page_break_invisible_in_plain() { + // PageBreak/ColumnBreak/Shape/Image have no plain-text counterpart + // — they collapse to empty so plain_text shows only the surrounding + // content. + let ir = simple_ir(vec![para("before"), Element::PageBreak, para("after")]); + let plain = ir.plain_text(); + assert!(plain.contains("before")); + assert!(plain.contains("after")); + } + + #[test] + fn shape_invisible_in_plain() { + let ir = simple_ir(vec![ + para("before"), + Element::Shape(Shape::default()), + para("after"), + ]); + let plain = ir.plain_text(); + assert!(plain.contains("before")); + assert!(plain.contains("after")); + } + + #[test] + fn text_box_recursively_renders_children() { + let ir = simple_ir(vec![Element::TextBox(TextBox { + content: vec![para("inside")], + ..Default::default() + })]); + let plain = ir.plain_text(); + assert!(plain.contains("inside"), "plain: {plain}"); + } + + #[test] + fn html_thematic_break() { + let ir = simple_ir(vec![Element::ThematicBreak]); + let html = ir.to_html(); + assert!(html.contains(", /// Theme data (colors, fonts), if present. pub theme: Option, + /// Font programs found under `ppt/fonts/`. Each entry is + /// `(font_name, ttf_or_otf_bytes)`. PDF→PPTX→PDF round-trips use + /// these to preserve the source typeface (mirrors the DOCX side). + pub embedded_fonts: Vec<(String, Vec)>, } impl PptxDocument { @@ -101,6 +105,10 @@ impl PptxDocument { slide_data: Vec, slide_rels: Relationships, notes_data: Option>, + /// rId → (raw bytes, format-extension lowercase like "png" / "jpeg"). 
+ /// Pre-resolved here in Phase 1 so the parallel slide parser + /// (Phase 2) doesn't need access to the OPC reader. + media: std::collections::HashMap, String)>, } let mut bundles = Vec::with_capacity(presentation.slides.len()); for (slide_idx, slide_id) in presentation.slides.iter().enumerate() { @@ -136,28 +144,87 @@ impl PptxDocument { None }; + // Pre-load all IMAGE-relationship parts the slide references. + // PPTX picture frames carry ``; the + // relationship resolves to a part like `/ppt/media/image3.png`. + // Parsing happens in parallel below and can't use the OPC + // reader, so we materialise the bytes here keyed by rId. + let mut media = std::collections::HashMap::new(); + for rel in slide_rels.all() { + if rel.rel_type != rel_types::IMAGE { + continue; + } + let target = match part_name.resolve_relative(&rel.target) { + Ok(t) => t, + Err(_) => continue, + }; + if !opc.has_part(&target) { + continue; + } + let bytes = match opc.read_part(&target) { + Ok(b) => b, + Err(_) => continue, + }; + let ext = std::path::Path::new(&rel.target) + .extension() + .and_then(|s| s.to_str()) + .map(|s| s.to_ascii_lowercase()) + .unwrap_or_else(|| guess_format_from_bytes(&bytes).to_string()); + media.insert(rel.id.clone(), (bytes, ext)); + } + bundles.push(SlideBundle { slide_data, slide_rels, notes_data, + media, }); } // Phase 2: parse slides (parallel when feature enabled) let slides = crate::core::parallel::map_collect(bundles, |b| -> Result { let name = xml_csl_name(&b.slide_data); - let mut parsed = Slide::parse(&b.slide_data, name, &b.slide_rels)?; + let mut parsed = Slide::parse(&b.slide_data, name, &b.slide_rels, &b.media)?; if let Some(notes_data) = &b.notes_data { parsed.notes = extract_notes_text(notes_data); } Ok(parsed) })?; - debug!("PptxDocument: {} slides parsed", slides.len()); + // Scan `ppt/fonts/` for embedded font programs. Mirrors the DOCX + // reader (`word/fonts/`). 
+ let mut embedded_fonts: Vec<(String, Vec)> = Vec::new(); + for name in opc.part_names() { + let s = name.to_string(); + if !s.starts_with("/ppt/fonts/") { + continue; + } + let lower = s.to_lowercase(); + if !(lower.ends_with(".ttf") || lower.ends_with(".otf")) { + continue; + } + if let Ok(data) = opc.read_part(&name) { + let basename = s.rsplit('/').next().unwrap_or("font"); + let face = crate::docx::strip_embedded_font_filename(basename); + let font_name = if face.is_empty() { + basename.to_string() + } else { + face + }; + embedded_fonts.push((font_name, data)); + } + } + + debug!( + "PptxDocument: {} slides parsed, {} embedded fonts", + slides.len(), + embedded_fonts.len() + ); Ok(PptxDocument { presentation, slides, theme, + embedded_fonts, }) } } @@ -192,6 +259,32 @@ fn extract_notes_text(xml_data: &[u8]) -> Option { slide::extract_notes_text(xml_data) } +/// Best-effort image-format detection from the raw bytes. +/// +/// Used as a fallback when the relationship target has no recognisable +/// extension (rare — DrawingML images almost always carry one). Returns +/// a lowercase extension string suitable for round-tripping back into +/// `office_oxide::ir::ImageFormat::extension()`. 
+fn guess_format_from_bytes(bytes: &[u8]) -> &'static str { + if bytes.starts_with(&[0x89, b'P', b'N', b'G']) { + "png" + } else if bytes.starts_with(&[0xFF, 0xD8, 0xFF]) { + "jpeg" + } else if bytes.starts_with(b"GIF87a") || bytes.starts_with(b"GIF89a") { + "gif" + } else if bytes.starts_with(b"BM") { + "bmp" + } else if bytes.len() >= 4 && bytes.starts_with(&[0xD7, 0xCD, 0xC6, 0x9A]) { + "wmf" + } else if bytes.len() >= 4 && bytes.starts_with(&[0x01, 0x00, 0x00, 0x00]) { + "emf" + } else if bytes.len() >= 4 && (bytes.starts_with(b"II*\0") || bytes.starts_with(b"MM\0*")) { + "tiff" + } else { + "png" + } +} + impl crate::core::OfficeDocument for PptxDocument { fn plain_text(&self) -> String { self.plain_text() diff --git a/src/pptx/shape.rs b/src/pptx/shape.rs index b7ac63b..21c94fe 100644 --- a/src/pptx/shape.rs +++ b/src/pptx/shape.rs @@ -41,6 +41,14 @@ pub struct PictureShape { pub alt_text: Option, /// Bounding box position and size in EMU. pub position: Option, + /// Relationship ID (`r:embed`) of the underlying media part, if any. + pub embed_rid: Option, + /// Raw image bytes resolved via `embed_rid`, if the slide carried a + /// resolvable IMAGE relationship at parse time. + pub data: Option>, + /// Image format inferred from the relationship target extension or + /// byte signature (e.g. `"png"`, `"jpeg"`, `"gif"`, `"emf"`). + pub format: Option, } /// A group of child shapes (``). @@ -123,10 +131,16 @@ pub struct TextBody { } /// A single paragraph within a text body (``). -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct TextParagraph { /// Outline level (0 = top level). pub level: u32, + /// Paragraph alignment from ``. None when the + /// attribute is absent (renderer-default left alignment). + pub alignment: Option, + /// Space before the paragraph, in 100ths of a point — read from + /// ``. + pub space_before_hundredths_pt: Option, /// Inline content items in this paragraph. 
pub content: Vec, } @@ -143,7 +157,7 @@ pub enum TextContent { } /// A text run with optional character formatting (``). -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct TextRun { /// The text content of this run. pub text: String, @@ -155,6 +169,13 @@ pub struct TextRun { pub strikethrough: bool, /// Hyperlink attached to this run, if any. pub hyperlink: Option, + /// Font size in hundredths of a point (`` → `Some(1800)` = 18 pt). + /// `None` when the run inherits its size from the placeholder/master. + pub font_size_hundredths_pt: Option, + /// Explicit run colour from ``. + /// `None` when the run inherits its colour from the placeholder / + /// theme, or when the fill is non-sRGB (gradient, scheme colour). + pub color_rgb: Option<[u8; 3]>, } /// An auto-updated field inside a paragraph (``). diff --git a/src/pptx/slide.rs b/src/pptx/slide.rs index e72b8ba..35ee540 100644 --- a/src/pptx/slide.rs +++ b/src/pptx/slide.rs @@ -11,8 +11,21 @@ use super::shape::{ type CoreResult = crate::core::Result; -/// Parsed run properties: (bold, italic, strikethrough, hyperlink). -type RunProps = (Option, Option, bool, Option); +/// Parsed run properties: (bold, italic, strikethrough, hyperlink, font_size_hundredths_pt). +/// +/// PPTX `` carries font size in hundredths of a point +/// (e.g. `sz="1800"` = 18 pt). Carrying it through the parser is what +/// keeps PDF→PPTX→PDF round-trips from defaulting every paragraph to +/// the writer's 12 pt fallback (which inflated 8-page A4 sources to +/// ~30 pages). +type RunProps = ( + Option, + Option, + bool, + Option, + Option, + Option<[u8; 3]>, +); /// A parsed PPTX slide. #[derive(Debug, Clone)] @@ -23,6 +36,11 @@ pub struct Slide { pub shapes: Vec, /// Speaker notes text, if a notes slide is present. pub notes: Option, + /// Solid background colour (RGB) extracted from the slide's + /// `` element. 
Only the solid + /// case is parsed; gradient / image / theme-reference fills are + /// dropped silently and surface as `None`. + pub background_rgb: Option<[u8; 3]>, } /// Create a fast reader that does NOT trim text content. @@ -35,14 +53,23 @@ fn make_content_reader(xml_data: &[u8]) -> quick_xml::Reader<&[u8]> { impl Slide { /// Parse a slide from its XML data. - pub(crate) fn parse(xml_data: &[u8], name: String, rels: &Relationships) -> CoreResult { + pub(crate) fn parse( + xml_data: &[u8], + name: String, + rels: &Relationships, + media: &std::collections::HashMap, String)>, + ) -> CoreResult { let mut reader = make_content_reader(xml_data); let mut shapes = Vec::new(); + let mut background_rgb = None; loop { match reader.read_event()? { + Event::Start(ref e) if e.local_name().as_ref() == b"bg" => { + background_rgb = parse_slide_bg(&mut reader)?; + }, Event::Start(ref e) if e.local_name().as_ref() == b"spTree" => { - shapes = parse_shape_tree(&mut reader, rels)?; + shapes = parse_shape_tree(&mut reader, rels, media)?; }, Event::Eof => break, _ => {}, @@ -53,10 +80,75 @@ impl Slide { name, shapes, notes: None, + background_rgb, }) } } +/// Parse `` looking for a single solid-fill colour. +/// +/// Returns `Some([r, g, b])` if the background is a `` with an +/// ``. All other forms (gradient, +/// blip / image, scheme / theme references via ``) return +/// `None` — the renderer silently falls back to no background, which +/// matches "minimum theme-background support" per the v0.3.42 plan. +fn parse_slide_bg(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult> { + let mut rgb = None; + let mut depth = 1u32; + let mut in_solid_fill = false; + loop { + match reader.read_event()? { + Event::Start(ref e) => { + depth += 1; + if e.local_name().as_ref() == b"solidFill" { + in_solid_fill = true; + } + }, + Event::Empty(ref e) => { + if in_solid_fill && e.local_name().as_ref() == b"srgbClr" { + if let Some(val) = xml::optional_attr_str(e, b"val")? 
{ + rgb = parse_hex_rgb(val.as_ref()); + } + } + }, + Event::End(ref e) => { + if e.local_name().as_ref() == b"solidFill" { + in_solid_fill = false; + } + depth -= 1; + if depth == 0 { + break; + } + }, + Event::Eof => break, + _ => {}, + } + } + Ok(rgb) +} + +/// Parse a 6-character hex colour (e.g. `"0E273B"`) into `[r, g, b]`. +fn parse_hex_rgb(s: &str) -> Option<[u8; 3]> { + let bytes = s.as_bytes(); + if bytes.len() != 6 { + return None; + } + let h = |hi, lo| -> Option { + let n = |c: u8| match c { + b'0'..=b'9' => Some(c - b'0'), + b'a'..=b'f' => Some(c - b'a' + 10), + b'A'..=b'F' => Some(c - b'A' + 10), + _ => None, + }; + Some(n(hi)? * 16 + n(lo)?) + }; + Some([ + h(bytes[0], bytes[1])?, + h(bytes[2], bytes[3])?, + h(bytes[4], bytes[5])?, + ]) +} + // --------------------------------------------------------------------------- // Shape tree parsing // --------------------------------------------------------------------------- @@ -64,6 +156,7 @@ impl Slide { fn parse_shape_tree( reader: &mut quick_xml::Reader<&[u8]>, rels: &Relationships, + media: &std::collections::HashMap, String)>, ) -> CoreResult> { let mut shapes = Vec::new(); @@ -71,8 +164,8 @@ fn parse_shape_tree( match reader.read_event()? 
{ Event::Start(ref e) => match e.local_name().as_ref() { b"sp" => shapes.push(parse_auto_shape(reader, rels)?), - b"pic" => shapes.push(parse_picture(reader)?), - b"grpSp" => shapes.push(parse_group_shape(reader, rels)?), + b"pic" => shapes.push(parse_picture(reader, media)?), + b"grpSp" => shapes.push(parse_group_shape(reader, rels, media)?), b"graphicFrame" => shapes.push(parse_graphic_frame(reader, rels)?), b"cxnSp" => shapes.push(parse_connector(reader)?), _ => { @@ -147,11 +240,15 @@ fn parse_auto_shape( // PictureShape (p:pic) // --------------------------------------------------------------------------- -fn parse_picture(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult { +fn parse_picture( + reader: &mut quick_xml::Reader<&[u8]>, + media: &std::collections::HashMap, String)>, +) -> CoreResult { let mut id = 0u32; let mut name = String::new(); let mut alt_text = None; let mut position = None; + let mut embed_rid: Option = None; loop { match reader.read_event()? { @@ -163,7 +260,7 @@ fn parse_picture(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult { alt_text = props.2; }, b"blipFill" => { - xml::skip_element_fast(reader)?; + embed_rid = parse_blip_fill_embed(reader)?; }, b"spPr" => { position = parse_shape_properties(reader)?; @@ -180,14 +277,71 @@ fn parse_picture(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult { } } + let (data, format) = match embed_rid.as_deref().and_then(|rid| media.get(rid)) { + Some((bytes, ext)) => (Some(bytes.clone()), Some(ext.clone())), + None => (None, None), + }; + Ok(Shape::Picture(PictureShape { id, name, alt_text, position, + embed_rid, + data, + format, })) } +/// Parse `` and +/// return the `r:embed` attribute, if present. Other contents (stretch, +/// crop, tile) are skipped — only the embed rId is needed to resolve +/// the underlying media part. 
+fn parse_blip_fill_embed(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult> { + let mut embed: Option = None; + let mut depth: u32 = 1; + loop { + match reader.read_event()? { + Event::Start(ref e) => { + if e.local_name().as_ref() == b"blip" && embed.is_none() { + embed = read_blip_embed_attr(e)?; + } + depth += 1; + }, + Event::Empty(ref e) => { + if e.local_name().as_ref() == b"blip" && embed.is_none() { + embed = read_blip_embed_attr(e)?; + } + }, + Event::End(_) => { + depth -= 1; + if depth == 0 { + break; + } + }, + Event::Eof => break, + _ => {}, + } + } + Ok(embed) +} + +fn read_blip_embed_attr(e: &quick_xml::events::BytesStart) -> CoreResult> { + // `` carries `r:embed="rIdN"` (DrawingML namespace `a:`, + // relationship namespace `r:`). The attribute may be present in + // either the `Empty` or `Start` form; both routes feed this helper. + for attr in e.attributes().with_checks(false) { + let attr = attr.map_err(crate::core::Error::from)?; + let key = attr.key.as_ref(); + let is_embed = key == b"r:embed" || key.ends_with(b":embed") || key == b"embed"; + if is_embed { + let raw = attr.unescape_value().map_err(crate::core::Error::from)?; + return Ok(Some(raw.into_owned())); + } + } + Ok(None) +} + // --------------------------------------------------------------------------- // GroupShape (p:grpSp) // --------------------------------------------------------------------------- @@ -195,6 +349,7 @@ fn parse_picture(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult { fn parse_group_shape( reader: &mut quick_xml::Reader<&[u8]>, rels: &Relationships, + media: &std::collections::HashMap, String)>, ) -> CoreResult { let mut id = 0u32; let mut name = String::new(); @@ -213,8 +368,8 @@ fn parse_group_shape( position = parse_grp_shape_properties(reader)?; }, b"sp" => children.push(parse_auto_shape(reader, rels)?), - b"pic" => children.push(parse_picture(reader)?), - b"grpSp" => children.push(parse_group_shape(reader, rels)?), + b"pic" => 
children.push(parse_picture(reader, media)?), + b"grpSp" => children.push(parse_group_shape(reader, rels, media)?), b"graphicFrame" => children.push(parse_graphic_frame(reader, rels)?), b"cxnSp" => children.push(parse_connector(reader)?), _ => { @@ -686,9 +841,23 @@ fn parse_text_paragraph( reader: &mut quick_xml::Reader<&[u8]>, rels: &Relationships, ) -> CoreResult { + use crate::ir::ParagraphAlignment; let mut level = 0u32; + let mut alignment: Option = None; + let mut space_before_hundredths_pt: Option = None; let mut content = Vec::new(); + let parse_algn = |e: &quick_xml::events::BytesStart| -> CoreResult> { + Ok(xml::optional_attr_str(e, b"algn")?.and_then(|v| match v.as_ref() { + "l" => Some(ParagraphAlignment::Left), + "ctr" => Some(ParagraphAlignment::Center), + "r" => Some(ParagraphAlignment::Right), + "just" | "justLow" => Some(ParagraphAlignment::Justify), + "dist" | "thaiDist" => Some(ParagraphAlignment::Distribute), + _ => None, + })) + }; + loop { match reader.read_event()? { Event::Start(ref e) => match e.local_name().as_ref() { @@ -696,7 +865,41 @@ fn parse_text_paragraph( level = xml::optional_attr_str(e, b"lvl")? .and_then(|v| v.parse().ok()) .unwrap_or(0); - xml::skip_element_fast(reader)?; + alignment = parse_algn(e)?; + // with body — scan for + let depth_start = 1i32; + let mut depth = depth_start; + let mut in_spc_bef = false; + loop { + match reader.read_event()? { + Event::Start(ref ee) => { + depth += 1; + if ee.local_name().as_ref() == b"spcBef" { + in_spc_bef = true; + } + }, + Event::Empty(ref ee) => { + if in_spc_bef && ee.local_name().as_ref() == b"spcPts" { + if let Some(v) = xml::optional_attr_str(ee, b"val")? 
{ + if let Ok(n) = v.parse::() { + space_before_hundredths_pt = Some(n); + } + } + } + }, + Event::End(ref ee) => { + depth -= 1; + if ee.local_name().as_ref() == b"spcBef" { + in_spc_bef = false; + } + if depth <= 0 && ee.local_name().as_ref() == b"pPr" { + break; + } + }, + Event::Eof => break, + _ => {}, + } + } }, b"r" => { content.push(TextContent::Run(parse_text_run(reader, rels)?)); @@ -717,6 +920,7 @@ fn parse_text_paragraph( level = xml::optional_attr_str(e, b"lvl")? .and_then(|v| v.parse().ok()) .unwrap_or(0); + alignment = parse_algn(e)?; }, b"br" => { content.push(TextContent::LineBreak); @@ -731,7 +935,12 @@ fn parse_text_paragraph( } } - Ok(TextParagraph { level, content }) + Ok(TextParagraph { + level, + alignment, + space_before_hundredths_pt, + content, + }) } /// Parse `` text run. @@ -744,6 +953,8 @@ fn parse_text_run( let mut italic = None; let mut strikethrough = false; let mut hyperlink = None; + let mut font_size_hundredths_pt = None; + let mut color_rgb: Option<[u8; 3]> = None; loop { match reader.read_event()? 
{ @@ -754,6 +965,8 @@ fn parse_text_run( italic = props.1; strikethrough = props.2; hyperlink = props.3; + font_size_hundredths_pt = props.4; + color_rgb = props.5; }, b"t" => { text = xml::read_text_content_fast(reader)?; @@ -768,6 +981,8 @@ fn parse_text_run( italic = props.1; strikethrough = props.2; hyperlink = props.3; + font_size_hundredths_pt = props.4; + color_rgb = props.5; }, Event::End(ref e) if e.local_name().as_ref() == b"r" => { break; @@ -783,6 +998,8 @@ fn parse_text_run( italic, strikethrough, hyperlink, + font_size_hundredths_pt, + color_rgb, }) } @@ -796,27 +1013,52 @@ fn parse_run_properties( let italic = parse_bool_attr(start, b"i")?; let strike = xml::optional_attr_str(start, b"strike")?; let strikethrough = strike.as_deref().is_some_and(|v| v != "noStrike"); + let font_size_hundredths_pt = parse_u32_attr(start, b"sz")?; let mut hyperlink = None; + let mut color_rgb: Option<[u8; 3]> = None; + // Track whether we are inside `` so we only pick up + // the inner `` (the fill colour proper) and not + // unrelated `` elements that may appear in sibling + // effects (e.g. `` for hyperlink colour). + let mut in_solid_fill = false; loop { match reader.read_event()? 
{ - Event::Start(ref e) | Event::Empty(ref e) - if e.local_name().as_ref() == b"hlinkClick" => - { - hyperlink = parse_hlink_click(e, rels)?; + Event::Start(ref e) => { + if e.local_name().as_ref() == b"solidFill" { + in_solid_fill = true; + } else if e.local_name().as_ref() == b"hlinkClick" { + hyperlink = parse_hlink_click(e, rels)?; + } }, - Event::End(ref e) if e.local_name().as_ref() == b"rPr" => { - break; + Event::Empty(ref e) => { + if e.local_name().as_ref() == b"hlinkClick" { + hyperlink = parse_hlink_click(e, rels)?; + } else if in_solid_fill + && e.local_name().as_ref() == b"srgbClr" + && color_rgb.is_none() + { + color_rgb = parse_srgb_clr(e); + } + }, + Event::End(ref e) => { + if e.local_name().as_ref() == b"solidFill" { + in_solid_fill = false; + } else if e.local_name().as_ref() == b"rPr" { + break; + } }, Event::Eof => break, _ => {}, } } - Ok((bold, italic, strikethrough, hyperlink)) + Ok((bold, italic, strikethrough, hyperlink, font_size_hundredths_pt, color_rgb)) } -/// Parse run properties from an `` Empty element. +/// Parse run properties from an `` Empty element. Empty +/// elements cannot carry a `` child so `color_rgb` +/// is always `None` on this path. fn parse_run_properties_empty( e: &quick_xml::events::BytesStart, _rels: &Relationships, @@ -825,7 +1067,29 @@ fn parse_run_properties_empty( let italic = parse_bool_attr(e, b"i")?; let strike = xml::optional_attr_str(e, b"strike")?; let strikethrough = strike.as_deref().is_some_and(|v| v != "noStrike"); - Ok((bold, italic, strikethrough, None)) + let font_size_hundredths_pt = parse_u32_attr(e, b"sz")?; + Ok((bold, italic, strikethrough, None, font_size_hundredths_pt, None)) +} + +/// Decode a 6-hex-digit `val="RRGGBB"` attribute from `` +/// to a `[u8; 3]`. Returns `None` when the attribute is absent or +/// malformed. 
+fn parse_srgb_clr(e: &quick_xml::events::BytesStart) -> Option<[u8; 3]> { + let val = xml::optional_attr_str(e, b"val").ok().flatten()?; + let s = val.as_ref(); + if s.len() != 6 { + return None; + } + let r = u8::from_str_radix(&s[0..2], 16).ok()?; + let g = u8::from_str_radix(&s[2..4], 16).ok()?; + let b = u8::from_str_radix(&s[4..6], 16).ok()?; + Some([r, g, b]) +} + +/// Parse a non-negative integer DrawingML attribute (e.g. `sz="1800"`). +/// Returns `None` if the attribute is absent or not parseable. +fn parse_u32_attr(e: &quick_xml::events::BytesStart, key: &[u8]) -> CoreResult> { + Ok(xml::optional_attr_str(e, key)?.and_then(|v| v.parse::().ok())) } /// Parse a DrawingML boolean attribute: `b="1"` → Some(true), `b="0"` → Some(false), absent → None. @@ -994,7 +1258,8 @@ pub(crate) fn extract_notes_text(xml_data: &[u8]) -> Option { loop { match reader.read_event() { Ok(Event::Start(ref e)) if e.local_name().as_ref() == b"spTree" => { - shapes = parse_shape_tree(&mut reader, &rels).ok()?; + shapes = + parse_shape_tree(&mut reader, &rels, &std::collections::HashMap::new()).ok()?; }, Ok(Event::Eof) => break, Err(_) => break, @@ -1093,7 +1358,9 @@ mod tests { ); let rels = Relationships::empty(); - let slide = Slide::parse(&xml, "Slide1".to_string(), &rels).unwrap(); + let slide = + Slide::parse(&xml, "Slide1".to_string(), &rels, &std::collections::HashMap::new()) + .unwrap(); assert_eq!(slide.shapes.len(), 1); if let Shape::AutoShape(ref auto) = slide.shapes[0] { @@ -1151,7 +1418,8 @@ mod tests { ); let rels = Relationships::empty(); - let slide = Slide::parse(&xml, String::new(), &rels).unwrap(); + let slide = + Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap(); assert_eq!(slide.shapes.len(), 1); if let Shape::Group(ref grp) = slide.shapes[0] { @@ -1225,7 +1493,8 @@ mod tests { ); let rels = Relationships::empty(); - let slide = Slide::parse(&xml, String::new(), &rels).unwrap(); + let slide = + Slide::parse(&xml, 
String::new(), &rels, &std::collections::HashMap::new()).unwrap(); assert_eq!(slide.shapes.len(), 1); if let Shape::GraphicFrame(ref gf) = slide.shapes[0] { @@ -1266,7 +1535,8 @@ mod tests { ); let rels = Relationships::empty(); - let slide = Slide::parse(&xml, String::new(), &rels).unwrap(); + let slide = + Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap(); assert_eq!(slide.shapes.len(), 1); if let Shape::Picture(ref pic) = slide.shapes[0] { @@ -1300,7 +1570,8 @@ mod tests { ); let rels = Relationships::empty(); - let slide = Slide::parse(&xml, String::new(), &rels).unwrap(); + let slide = + Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap(); assert_eq!(slide.shapes.len(), 1); if let Shape::Connector(ref cxn) = slide.shapes[0] { @@ -1336,7 +1607,8 @@ mod tests { ); let rels = Relationships::empty(); - let slide = Slide::parse(&xml, String::new(), &rels).unwrap(); + let slide = + Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap(); if let Shape::AutoShape(ref auto) = slide.shapes[0] { let tb = auto.text_body.as_ref().unwrap(); @@ -1372,7 +1644,8 @@ mod tests { ); let rels = Relationships::empty(); - let slide = Slide::parse(&xml, String::new(), &rels).unwrap(); + let slide = + Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap(); if let Shape::AutoShape(ref auto) = slide.shapes[0] { let tb = auto.text_body.as_ref().unwrap(); @@ -1427,4 +1700,321 @@ mod tests { let text = extract_notes_text(xml).unwrap(); assert_eq!(text, "Speaker notes here\nSecond line"); } + + // ── New: blip rId extraction, font size, alignment, space_before, bg ─ + + #[test] + fn run_carries_font_size_from_sz_attr() { + // means 18 pt — should land on the run as + // 1800 hundredths-of-a-point. 
+ let xml = make_slide_xml( + r#" + + + + + + + + sized + + + +"#, + ); + + let rels = Relationships::empty(); + let slide = + Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap(); + if let Shape::AutoShape(ref a) = slide.shapes[0] { + let tb = a.text_body.as_ref().unwrap(); + if let TextContent::Run(ref r) = tb.paragraphs[0].content[0] { + assert_eq!(r.font_size_hundredths_pt, Some(1800)); + } else { + panic!("expected run"); + } + } + } + + #[test] + fn run_font_size_absent_when_sz_missing() { + let xml = make_slide_xml( + r#" + + + + + + unsized + + +"#, + ); + + let rels = Relationships::empty(); + let slide = + Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap(); + if let Shape::AutoShape(ref a) = slide.shapes[0] { + let tb = a.text_body.as_ref().unwrap(); + if let TextContent::Run(ref r) = tb.paragraphs[0].content[0] { + assert!(r.font_size_hundredths_pt.is_none()); + } + } + } + + #[test] + fn paragraph_alignment_parsed_from_algn_attr() { + use crate::ir::ParagraphAlignment; + let xml = make_slide_xml( + r#" + + + + + + + centered + + +"#, + ); + + let rels = Relationships::empty(); + let slide = + Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap(); + if let Shape::AutoShape(ref a) = slide.shapes[0] { + let para = &a.text_body.as_ref().unwrap().paragraphs[0]; + assert_eq!(para.alignment, Some(ParagraphAlignment::Center)); + } + } + + #[test] + fn paragraph_alignment_all_variants() { + use crate::ir::ParagraphAlignment; + let cases = [ + ("l", ParagraphAlignment::Left), + ("ctr", ParagraphAlignment::Center), + ("r", ParagraphAlignment::Right), + ("just", ParagraphAlignment::Justify), + ("dist", ParagraphAlignment::Distribute), + ]; + for (algn, expected) in cases { + let xml = make_slide_xml(&format!( + r#" + + + + + + + x + + +"# + )); + let slide = Slide::parse( + &xml, + String::new(), + &Relationships::empty(), + &std::collections::HashMap::new(), + ) + 
.unwrap(); + if let Shape::AutoShape(ref a) = slide.shapes[0] { + let para = &a.text_body.as_ref().unwrap().paragraphs[0]; + assert_eq!(para.alignment, Some(expected), "algn={algn}"); + } + } + } + + #[test] + fn paragraph_space_before_parsed_from_spc_bef() { + let xml = make_slide_xml( + r#" + + + + + + + + + spaced + + +"#, + ); + + let rels = Relationships::empty(); + let slide = + Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap(); + if let Shape::AutoShape(ref a) = slide.shapes[0] { + let para = &a.text_body.as_ref().unwrap().paragraphs[0]; + assert_eq!(para.space_before_hundredths_pt, Some(1200)); + } + } + + #[test] + fn picture_embed_resolves_via_media_map() { + // Build a media map keyed by the rId used in the slide xml so + // parse_picture can resolve the embed → bytes. + let xml = make_slide_xml( + r#" + + + + + + + + + + + +"#, + ); + + let mut media = std::collections::HashMap::new(); + media.insert("rId7".to_string(), (vec![0xDEu8, 0xADu8, 0xBEu8, 0xEFu8], "png".to_string())); + + let slide = Slide::parse(&xml, String::new(), &Relationships::empty(), &media).unwrap(); + if let Shape::Picture(ref pic) = slide.shapes[0] { + assert_eq!(pic.embed_rid.as_deref(), Some("rId7")); + assert_eq!(pic.data.as_deref(), Some(&[0xDEu8, 0xADu8, 0xBEu8, 0xEFu8][..])); + assert_eq!(pic.format.as_deref(), Some("png")); + } else { + panic!("expected picture"); + } + } + + #[test] + fn picture_embed_without_media_still_carries_rid() { + // Empty media map: rId is captured but data/format are None. 
+ let xml = make_slide_xml( + r#" + + + + + + + + + +"#, + ); + + let slide = Slide::parse( + &xml, + String::new(), + &Relationships::empty(), + &std::collections::HashMap::new(), + ) + .unwrap(); + if let Shape::Picture(ref pic) = slide.shapes[0] { + assert_eq!(pic.embed_rid.as_deref(), Some("rId9")); + assert!(pic.data.is_none()); + assert!(pic.format.is_none()); + } + } + + #[test] + fn slide_background_solid_rgb() { + // … + let xml = br#" + + + + + + + + + + + + +"#; + let slide = Slide::parse( + xml, + String::new(), + &Relationships::empty(), + &std::collections::HashMap::new(), + ) + .unwrap(); + assert_eq!(slide.background_rgb, Some([0xFF, 0x88, 0x00])); + } + + #[test] + fn slide_no_background_returns_none() { + let xml = make_slide_xml(""); + let slide = Slide::parse( + &xml, + String::new(), + &Relationships::empty(), + &std::collections::HashMap::new(), + ) + .unwrap(); + assert!(slide.background_rgb.is_none()); + } + + #[test] + fn parse_hex_rgb_valid() { + assert_eq!(parse_hex_rgb("FF8800"), Some([0xFF, 0x88, 0x00])); + assert_eq!(parse_hex_rgb("000000"), Some([0, 0, 0])); + assert_eq!(parse_hex_rgb("ffffff"), Some([0xFF, 0xFF, 0xFF])); + } + + #[test] + fn parse_hex_rgb_invalid() { + assert_eq!(parse_hex_rgb("FF88"), None); // too short + assert_eq!(parse_hex_rgb("ZZZZZZ"), None); // not hex + assert_eq!(parse_hex_rgb(""), None); + } + + // ── read_blip_embed_attr ──────────────────────────────────────────── + + fn first_start_elem(xml: &[u8]) -> quick_xml::events::BytesStart<'static> { + let mut reader = xml::make_fast_reader(xml); + loop { + match reader.read_event().unwrap() { + Event::Start(e) | Event::Empty(e) => return e.into_owned(), + Event::Eof => panic!("no start"), + _ => {}, + } + } + } + + #[test] + fn blip_embed_attr_with_r_prefix() { + let e = first_start_elem( + br#""#, + ); + let rid = read_blip_embed_attr(&e).unwrap(); + assert_eq!(rid.as_deref(), Some("rId5")); + } + + #[test] + fn blip_embed_attr_arbitrary_prefix() { + // Some 
writers use an unrelated prefix bound to the rels namespace. + let e = first_start_elem( + br#""#, + ); + let rid = read_blip_embed_attr(&e).unwrap(); + assert_eq!(rid.as_deref(), Some("rId99")); + } + + #[test] + fn blip_embed_attr_absent() { + let e = first_start_elem( + br#""#, + ); + let rid = read_blip_embed_attr(&e).unwrap(); + assert!(rid.is_none()); + } } diff --git a/src/pptx/text.rs b/src/pptx/text.rs index 8338ecd..33a451c 100644 --- a/src/pptx/text.rs +++ b/src/pptx/text.rs @@ -431,6 +431,7 @@ mod tests { }, slides, theme: None, + embedded_fonts: Vec::new(), } } @@ -448,12 +449,15 @@ mod tests { text_body: Some(TextBody { paragraphs: vec![TextParagraph { level: 0, + alignment: None, + space_before_hundredths_pt: None, content: vec![TextContent::Run(TextRun { text: text.to_string(), bold: None, italic: None, strikethrough: false, hyperlink: None, + font_size_hundredths_pt: None, })], }], }), @@ -475,12 +479,15 @@ mod tests { text_body: Some(TextBody { paragraphs: vec![TextParagraph { level: 0, + alignment: None, + space_before_hundredths_pt: None, content: vec![TextContent::Run(TextRun { text: text.to_string(), bold: None, italic: None, strikethrough: false, hyperlink: None, + font_size_hundredths_pt: None, })], }], }), @@ -501,6 +508,7 @@ mod tests { text_shape("Middle", "middle text", 50, 2500), ], notes: None, + background_rgb: None, }]); let text = doc.slide_plain_text(0).unwrap(); @@ -513,6 +521,7 @@ mod tests { name: String::new(), shapes: vec![text_shape("Text", "Hello", 0, 0)], notes: Some("Speaker notes".to_string()), + background_rgb: None, }]); let text = doc.slide_plain_text(0).unwrap(); @@ -526,11 +535,13 @@ mod tests { name: String::new(), shapes: vec![text_shape("A", "Slide one", 0, 0)], notes: None, + background_rgb: None, }, Slide { name: String::new(), shapes: vec![text_shape("B", "Slide two", 0, 0)], notes: None, + background_rgb: None, }, ]); @@ -547,6 +558,7 @@ mod tests { text_shape("Body", "Body text", 0, 2000), ], notes: None, + 
background_rgb: None, }]); let md = doc.slide_to_markdown(0).unwrap(); @@ -573,6 +585,8 @@ mod tests { text_body: Some(TextBody { paragraphs: vec![TextParagraph { level: 0, + alignment: None, + space_before_hundredths_pt: None, content: vec![ TextContent::Run(TextRun { text: "bold".to_string(), @@ -580,6 +594,7 @@ mod tests { italic: None, strikethrough: false, hyperlink: None, + font_size_hundredths_pt: None, }), TextContent::Run(TextRun { text: " and ".to_string(), @@ -587,6 +602,7 @@ mod tests { italic: None, strikethrough: false, hyperlink: None, + font_size_hundredths_pt: None, }), TextContent::Run(TextRun { text: "italic".to_string(), @@ -594,6 +610,7 @@ mod tests { italic: Some(true), strikethrough: false, hyperlink: None, + font_size_hundredths_pt: None, }), ], }], @@ -601,6 +618,7 @@ mod tests { placeholder: None, })], notes: None, + background_rgb: None, }]); let md = doc.slide_to_markdown(0).unwrap(); @@ -613,6 +631,7 @@ mod tests { name: String::new(), shapes: vec![text_shape("Text", "Content", 0, 0)], notes: Some("Note line 1\nNote line 2".to_string()), + background_rgb: None, }]); let md = doc.slide_to_markdown(0).unwrap(); @@ -640,12 +659,15 @@ mod tests { text_body: Some(TextBody { paragraphs: vec![TextParagraph { level: 0, + alignment: None, + space_before_hundredths_pt: None, content: vec![TextContent::Run(TextRun { text: "H1".to_string(), bold: None, italic: None, strikethrough: false, hyperlink: None, + font_size_hundredths_pt: None, })], }], }), @@ -658,12 +680,15 @@ mod tests { text_body: Some(TextBody { paragraphs: vec![TextParagraph { level: 0, + alignment: None, + space_before_hundredths_pt: None, content: vec![TextContent::Run(TextRun { text: "H2".to_string(), bold: None, italic: None, strikethrough: false, hyperlink: None, + font_size_hundredths_pt: None, })], }], }), @@ -680,12 +705,15 @@ mod tests { text_body: Some(TextBody { paragraphs: vec![TextParagraph { level: 0, + alignment: None, + space_before_hundredths_pt: None, content: 
vec![TextContent::Run(TextRun { text: "A".to_string(), bold: None, italic: None, strikethrough: false, hyperlink: None, + font_size_hundredths_pt: None, })], }], }), @@ -698,12 +726,15 @@ mod tests { text_body: Some(TextBody { paragraphs: vec![TextParagraph { level: 0, + alignment: None, + space_before_hundredths_pt: None, content: vec![TextContent::Run(TextRun { text: "B".to_string(), bold: None, italic: None, strikethrough: false, hyperlink: None, + font_size_hundredths_pt: None, })], }], }), @@ -718,6 +749,7 @@ mod tests { }), })], notes: None, + background_rgb: None, }]); let md = doc.slide_to_markdown(0).unwrap(); @@ -743,6 +775,8 @@ mod tests { text_body: Some(TextBody { paragraphs: vec![TextParagraph { level: 0, + alignment: None, + space_before_hundredths_pt: None, content: vec![TextContent::Run(TextRun { text: "Click here".to_string(), bold: None, @@ -754,12 +788,14 @@ mod tests { ), tooltip: None, }), + font_size_hundredths_pt: None, })], }], }), placeholder: None, })], notes: None, + background_rgb: None, }]); let md = doc.slide_to_markdown(0).unwrap(); diff --git a/src/pptx/write.rs b/src/pptx/write.rs index e2d20e0..b4729f8 100644 --- a/src/pptx/write.rs +++ b/src/pptx/write.rs @@ -158,10 +158,23 @@ impl From for Run { // Internal body content model // --------------------------------------------------------------------------- +/// Paragraph-level properties carried through a `BodyItem::RichText`. +/// Present so the writer can emit `` attributes (alignment, +/// space-before) that don't fit on per-run ``. +#[derive(Debug, Clone, Default)] +pub struct ParaProps { + /// Paragraph alignment written as ``. `None` + /// leaves the renderer-default left alignment in place. + pub alignment: Option, + /// Space before the paragraph in points × 100. 1250 = 12.5pt. + /// When set, written as ``. 
+ pub space_before_hundredths_pt: Option, +} + #[derive(Debug, Clone)] enum BodyItem { Text(String), - RichText(Vec), + RichText(Vec, ParaProps), BulletList(Vec), /// Free-floating text box: (runs, x_emu, y_emu, cx_emu, cy_emu) TextBox(Vec, i64, i64, i64, i64), @@ -178,6 +191,10 @@ enum BodyItem { pub struct SlideData { /// The slide title (if set). pub title: Option, + /// Optional explicit alignment for the title placeholder. None + /// leaves alignment to the slide layout default (typically + /// centered for title placeholders). + pub title_alignment: Option, body_items: Vec, } @@ -185,6 +202,7 @@ impl SlideData { fn new() -> Self { Self { title: None, + title_alignment: None, body_items: Vec::new(), } } @@ -195,6 +213,18 @@ impl SlideData { self } + /// Set the slide title and its alignment. Overwrites any + /// previously set title. + pub fn set_title_aligned( + &mut self, + title: &str, + alignment: Option, + ) -> &mut Self { + self.title = Some(title.to_string()); + self.title_alignment = alignment; + self + } + /// Add a plain text paragraph to the body area. pub fn add_text(&mut self, text: &str) -> &mut Self { self.body_items.push(BodyItem::Text(text.to_string())); @@ -203,7 +233,32 @@ impl SlideData { /// Add a paragraph of styled [`Run`]s to the body area. pub fn add_rich_text(&mut self, runs: &[Run]) -> &mut Self { - self.body_items.push(BodyItem::RichText(runs.to_vec())); + self.body_items + .push(BodyItem::RichText(runs.to_vec(), ParaProps::default())); + self + } + + /// Add a paragraph of styled [`Run`]s with an explicit alignment. + pub fn add_rich_text_aligned( + &mut self, + runs: &[Run], + alignment: Option, + ) -> &mut Self { + self.body_items.push(BodyItem::RichText( + runs.to_vec(), + ParaProps { + alignment, + ..Default::default() + }, + )); + self + } + + /// Add a paragraph of styled [`Run`]s with full paragraph + /// properties (alignment, space-before). 
+ pub fn add_rich_text_with_props(&mut self, runs: &[Run], props: ParaProps) -> &mut Self { + self.body_items + .push(BodyItem::RichText(runs.to_vec(), props)); self } @@ -273,6 +328,14 @@ pub struct PptxWriter { cx: u64, /// Presentation height in EMU (default: 6 858 000 — standard 16:9). cy: u64, + /// Embedded font programs to ship inside the package under `ppt/fonts/`. + /// Mirrors `DocxWriter::embed_font` semantics: each `(name, bytes)` pair + /// becomes one font part, used by PDF↔PPTX round-trips to preserve the + /// source typeface. + embedded_fonts: Vec<(String, Vec)>, + /// Document metadata for `docProps/core.xml`. `None` means no + /// core-properties part is written. + metadata: Option, } impl PptxWriter { @@ -282,7 +345,26 @@ impl PptxWriter { slides: Vec::new(), cx: 12_192_000, cy: 6_858_000, + embedded_fonts: Vec::new(), + metadata: None, + } + } + + /// Set document metadata (written to `docProps/core.xml`). + pub fn set_metadata(&mut self, meta: &crate::ir::Metadata) -> &mut Self { + self.metadata = Some(meta.clone()); + self + } + + /// Embed a font program (TrueType / OpenType bytes) under `ppt/fonts/`. + /// `name` is used for the file name and the human-readable font name. + /// Subsequent calls with the same name are deduplicated. + pub fn embed_font(&mut self, name: impl Into, data: Vec) -> &mut Self { + let name = name.into(); + if !self.embedded_fonts.iter().any(|(n, _)| n == &name) { + self.embedded_fonts.push((name, data)); } + self } /// Override the presentation canvas size (in EMU). @@ -358,6 +440,17 @@ impl PptxWriter { opc.add_package_rel(rel_types::OFFICE_DOCUMENT, "ppt/presentation.xml"); opc.add_part_rel(&pres_part, rel_types::SLIDE_MASTER, "slideMasters/slideMaster1.xml"); + // Core properties (docProps/core.xml). Written only when the + // caller supplied metadata so files generated through the + // existing `add_slide` API stay byte-identical when no + // metadata was set. 
+ if let Some(ref meta) = self.metadata { + let core_part = PartName::new("/docProps/core.xml")?; + opc.add_package_rel(rel_types::CORE_PROPERTIES, "docProps/core.xml"); + let core_xml = crate::core::core_properties::generate_xml(meta); + opc.add_part(&core_part, crate::core::core_properties::CONTENT_TYPE, &core_xml)?; + } + let mut slide_parts = Vec::with_capacity(self.slides.len()); for i in 0..self.slides.len() { let idx = i + 1; @@ -411,6 +504,17 @@ impl PptxWriter { opc.add_part(slide_part, CT_SLIDE, &slide_xml)?; } + // Embed fonts under `ppt/fonts/font__.ttf`. Mirrors + // the DOCX `word/fonts/` layout. Other PowerPoint software may not + // honor this without the full presentation-relationship machinery + // for ``, but the in-process reader scans the + // directory directly so PDF↔PPTX round-trips preserve fonts. + crate::core::embedded_fonts::write_embedded_fonts( + &mut opc, + "/ppt/fonts/", + &self.embedded_fonts, + )?; + opc.finish()?; Ok(()) } @@ -562,6 +666,21 @@ fn generate_presentation_xml(slide_count: usize, cx: u64, cy: u64) -> Vec { sld_sz.push_attribute(("cy", cy.to_string().as_str())); w.write_event(Event::Empty(sld_sz)).expect("write"); + // notesSz: PowerPoint expects this even when there are no notes + // pages. Standard default is the same dimensions as the slide. + let mut notes_sz = BytesStart::new("p:notesSz"); + notes_sz.push_attribute(("cx", cx.to_string().as_str())); + notes_sz.push_attribute(("cy", cy.to_string().as_str())); + w.write_event(Event::Empty(notes_sz)).expect("write"); + + // defaultTextStyle: empty list of paragraph-level defaults is + // legal and silences PowerPoint's "Reset Layout" command failure + // when the user opens the deck. 
+ w.write_event(Event::Start(BytesStart::new("p:defaultTextStyle"))) + .expect("write"); + w.write_event(Event::End(BytesEnd::new("p:defaultTextStyle"))) + .expect("write"); + w.write_event(Event::End(BytesEnd::new("p:presentation"))) .expect("write"); w.into_inner() @@ -610,8 +729,14 @@ fn generate_slide_layout_xml() -> Vec { let mut w = Writer::new(Vec::new()); write_decl(&mut w); + // Type "obj" = "Title and Content" — PowerPoint's standard + // layout. Slides referencing this layout get a sized title + // placeholder at the top and a body placeholder filling the + // rest. Was `type="blank"` with empty spTree; that left + // PowerPoint guessing at placeholder geometry. let mut root = pml_root("p:sldLayout"); - root.push_attribute(("type", "blank")); + root.push_attribute(("type", "obj")); + root.push_attribute(("preserve", "1")); w.write_event(Event::Start(root)).expect("write"); w.write_event(Event::Start(BytesStart::new("p:cSld"))) .expect("write"); @@ -619,6 +744,30 @@ fn generate_slide_layout_xml() -> Vec { .expect("write"); write_nv_grp_sp_pr(&mut w); write_empty(&mut w, "p:grpSpPr"); + + // Title placeholder — top of slide, ~5 % top inset, full width minus margin. + write_layout_placeholder( + &mut w, + 2, + "Title 1", + "title", + None, + // Geometry in EMU. Standard 16:9 @ 12 192 000 × 6 858 000: + // place title at (914 400, 685 800) ≈ 1 in × 0.75 in, + // size 10 363 200 × 1 143 000 ≈ 11.3 in × 1.25 in. + Some((914_400, 685_800, 10_363_200, 1_143_000)), + ); + + // Body placeholder — fills the area below the title. 
+ write_layout_placeholder( + &mut w, + 3, + "Body 2", + "body", + Some(1), + Some((914_400, 1_905_000, 10_363_200, 4_343_400)), + ); + w.write_event(Event::End(BytesEnd::new("p:spTree"))) .expect("write"); w.write_event(Event::End(BytesEnd::new("p:cSld"))) @@ -628,6 +777,93 @@ fn generate_slide_layout_xml() -> Vec { w.into_inner() } +/// Emit one placeholder `` inside the slide layout: an empty +/// shape carrying the placeholder type/idx + its xfrm rectangle. +/// Slides that reference this layout's `type` and `idx` inherit the +/// geometry — without it PowerPoint falls back to bare-default +/// positioning that often pushes content off the slide canvas. +fn write_layout_placeholder( + w: &mut Writer>, + id: u32, + name: &str, + ph_type: &str, + ph_idx: Option, + geometry_emu: Option<(i64, i64, i64, i64)>, // (x, y, cx, cy) +) { + let id_str = id.to_string(); + w.write_event(Event::Start(BytesStart::new("p:sp"))) + .expect("sp start"); + + w.write_event(Event::Start(BytesStart::new("p:nvSpPr"))) + .expect("nvSpPr start"); + let mut cnv_pr = BytesStart::new("p:cNvPr"); + cnv_pr.push_attribute(("id", id_str.as_str())); + cnv_pr.push_attribute(("name", name)); + w.write_event(Event::Empty(cnv_pr)).expect("cNvPr"); + w.write_event(Event::Start(BytesStart::new("p:cNvSpPr"))) + .expect("cNvSpPr start"); + let mut locks = BytesStart::new("a:spLocks"); + locks.push_attribute(("noGrp", "1")); + w.write_event(Event::Empty(locks)).expect("spLocks"); + w.write_event(Event::End(BytesEnd::new("p:cNvSpPr"))) + .expect("cNvSpPr end"); + w.write_event(Event::Start(BytesStart::new("p:nvPr"))) + .expect("nvPr start"); + let mut ph = BytesStart::new("p:ph"); + ph.push_attribute(("type", ph_type)); + let idx_buf; + if let Some(idx) = ph_idx { + idx_buf = idx.to_string(); + ph.push_attribute(("idx", idx_buf.as_str())); + } + w.write_event(Event::Empty(ph)).expect("ph"); + w.write_event(Event::End(BytesEnd::new("p:nvPr"))) + .expect("nvPr end"); + 
w.write_event(Event::End(BytesEnd::new("p:nvSpPr"))) + .expect("nvSpPr end"); + + // spPr with optional xfrm geometry + if let Some((x, y, cx, cy)) = geometry_emu { + w.write_event(Event::Start(BytesStart::new("p:spPr"))) + .expect("spPr start"); + w.write_event(Event::Start(BytesStart::new("a:xfrm"))) + .expect("xfrm start"); + let mut off = BytesStart::new("a:off"); + let xs = x.to_string(); + let ys = y.to_string(); + off.push_attribute(("x", xs.as_str())); + off.push_attribute(("y", ys.as_str())); + w.write_event(Event::Empty(off)).expect("off"); + let mut ext = BytesStart::new("a:ext"); + let cxs = cx.to_string(); + let cys = cy.to_string(); + ext.push_attribute(("cx", cxs.as_str())); + ext.push_attribute(("cy", cys.as_str())); + w.write_event(Event::Empty(ext)).expect("ext"); + w.write_event(Event::End(BytesEnd::new("a:xfrm"))) + .expect("xfrm end"); + w.write_event(Event::End(BytesEnd::new("p:spPr"))) + .expect("spPr end"); + } else { + write_empty(w, "p:spPr"); + } + + // Empty txBody — slides supply their own text. 
+ w.write_event(Event::Start(BytesStart::new("p:txBody"))) + .expect("txBody start"); + write_empty(w, "a:bodyPr"); + write_empty(w, "a:lstStyle"); + w.write_event(Event::Start(BytesStart::new("a:p"))) + .expect("a:p start"); + w.write_event(Event::End(BytesEnd::new("a:p"))) + .expect("a:p end"); + w.write_event(Event::End(BytesEnd::new("p:txBody"))) + .expect("txBody end"); + + w.write_event(Event::End(BytesEnd::new("p:sp"))) + .expect("sp end"); +} + // --------------------------------------------------------------------------- // slides/slideN.xml // --------------------------------------------------------------------------- @@ -649,7 +885,7 @@ fn generate_slide_xml(slide: &SlideData, img_rids: &[(String, i64, i64, u64, u64 let mut next_id: u32 = 2; if let Some(ref title) = slide.title { - write_title_shape(&mut w, next_id, title); + write_title_shape(&mut w, next_id, title, slide.title_alignment.as_ref()); next_id += 1; } @@ -687,7 +923,12 @@ fn generate_slide_xml(slide: &SlideData, img_rids: &[(String, i64, i64, u64, u64 w.into_inner() } -fn write_title_shape(w: &mut Writer>, id: u32, title: &str) { +fn write_title_shape( + w: &mut Writer>, + id: u32, + title: &str, + alignment: Option<&crate::ir::ParagraphAlignment>, +) { let id_str = id.to_string(); w.write_event(Event::Start(BytesStart::new("p:sp"))) .expect("write"); @@ -720,7 +961,16 @@ fn write_title_shape(w: &mut Writer>, id: u32, title: &str) { w.write_event(Event::Start(BytesStart::new("p:txBody"))) .expect("write"); write_empty(w, "a:bodyPr"); - write_plain_paragraph(w, title); + if let Some(a) = alignment { + let runs = vec![Run::new(title)]; + let props = ParaProps { + alignment: Some(a.clone()), + ..Default::default() + }; + write_rich_paragraph(w, &runs, &props); + } else { + write_plain_paragraph(w, title); + } w.write_event(Event::End(BytesEnd::new("p:txBody"))) .expect("write"); @@ -761,12 +1011,20 @@ fn write_body_shape(w: &mut Writer>, id: u32, items: &[&BodyItem]) { 
w.write_event(Event::Start(BytesStart::new("p:txBody"))) .expect("write"); - write_empty(w, "a:bodyPr"); + // : tell PowerPoint to + // shrink-to-fit the body text. Without this, dense PDF pages + // imported as slides overflow the placeholder and content + // renders off-slide. + w.write_event(Event::Start(BytesStart::new("a:bodyPr"))) + .expect("write bodyPr start"); + write_empty(w, "a:normAutofit"); + w.write_event(Event::End(BytesEnd::new("a:bodyPr"))) + .expect("write bodyPr end"); for item in items { match item { BodyItem::Text(text) => write_plain_paragraph(w, text), - BodyItem::RichText(runs) => write_rich_paragraph(w, runs), + BodyItem::RichText(runs, props) => write_rich_paragraph(w, runs, props), BodyItem::BulletList(bullets) => { for bullet in bullets { write_bullet_paragraph(w, bullet); @@ -838,13 +1096,21 @@ fn write_text_box_shape( w.write_event(Event::End(BytesEnd::new("p:spPr"))) .expect("write"); - // txBody + // txBody — `wrap="none"` plus explicit zero insets so callers + // sizing the shape rectangle to the exact text bbox (e.g. the + // PDF→PPTX layout path) get the text rendered without + // PowerPoint's default ~0.1" left/right padding silently eating + // shape width and forcing visible glyph re-wrapping. 
w.write_event(Event::Start(BytesStart::new("p:txBody"))) .expect("write"); let mut body_pr = BytesStart::new("a:bodyPr"); - body_pr.push_attribute(("wrap", "square")); + body_pr.push_attribute(("wrap", "none")); + body_pr.push_attribute(("lIns", "0")); + body_pr.push_attribute(("tIns", "0")); + body_pr.push_attribute(("rIns", "0")); + body_pr.push_attribute(("bIns", "0")); w.write_event(Event::Empty(body_pr)).expect("write"); - write_rich_paragraph(w, runs); + write_rich_paragraph(w, runs, &ParaProps::default()); w.write_event(Event::End(BytesEnd::new("p:txBody"))) .expect("write"); @@ -922,9 +1188,39 @@ fn write_plain_paragraph(w: &mut Writer>, text: &str) { .expect("write"); } -fn write_rich_paragraph(w: &mut Writer>, runs: &[Run]) { +fn write_rich_paragraph(w: &mut Writer>, runs: &[Run], props: &ParaProps) { + use crate::ir::ParagraphAlignment; w.write_event(Event::Start(BytesStart::new("a:p"))) .expect("write"); + let algn = props.alignment.as_ref().map(|a| match a { + ParagraphAlignment::Left => "l", + ParagraphAlignment::Center => "ctr", + ParagraphAlignment::Right => "r", + ParagraphAlignment::Justify => "just", + ParagraphAlignment::Distribute => "dist", + }); + let need_ppr = algn.is_some() || props.space_before_hundredths_pt.is_some(); + if need_ppr { + let mut p_pr = BytesStart::new("a:pPr"); + if let Some(v) = algn { + p_pr.push_attribute(("algn", v)); + } + if let Some(spc) = props.space_before_hundredths_pt { + // + w.write_event(Event::Start(p_pr)).expect("write pPr start"); + w.write_event(Event::Start(BytesStart::new("a:spcBef"))) + .expect("write spcBef"); + let mut spc_pts = BytesStart::new("a:spcPts"); + spc_pts.push_attribute(("val", spc.to_string().as_str())); + w.write_event(Event::Empty(spc_pts)).expect("write spcPts"); + w.write_event(Event::End(BytesEnd::new("a:spcBef"))) + .expect("write spcBef end"); + w.write_event(Event::End(BytesEnd::new("a:pPr"))) + .expect("write pPr end"); + } else { + 
w.write_event(Event::Empty(p_pr)).expect("write pPr"); + } + } for run in runs { write_dml_run(w, run); } diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 1003d1f..9db81a8 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -24,6 +24,8 @@ pub mod date; pub mod edit; /// XLSX-specific error type. pub mod error; +/// Number format rendering: apply Excel format strings to numeric values. +pub mod numfmt; /// Shared string table (SST) parsing and lookup. pub mod shared_strings; /// Spreadsheet styles: number formats, fonts, fills, borders, cell formats. @@ -69,6 +71,19 @@ pub struct XlsxDocument { pub styles: Option, /// DrawingML theme (lazily parsed; access via `ensure_theme()`). pub theme: Option, + /// Text content extracted from `xl/charts/chart*.xml` parts. Each entry + /// is the flattened text (titles, axis labels, series names, category + /// labels, values) of one chart in document order. We don't render + /// charts as graphics but keeping their text content lets it appear in + /// extracted text and downstream conversions. + pub chart_text: Vec, + /// Font programs found under `xl/fonts/`. Each entry is + /// `(font_name, ttf_or_otf_bytes)`. Mirrors `DocxDocument` and + /// `PptxDocument`. PDF→XLSX→PDF round-trips ship source fonts + /// here so the round-trip can re-register them with the PDF + /// renderer; without this hop XLSX-mediated round-trips lost + /// every typeface to the base 14 fallback. + pub embedded_fonts: Vec<(String, Vec)>, // Raw bytes for lazy parsing (None after parsing or if not present) styles_data: Option>, theme_data: Option>, @@ -170,6 +185,8 @@ impl XlsxDocument { name: String, data: Vec, rels: Relationships, + images: Vec, + text_shapes: Vec, } let mut bundles = Vec::with_capacity(workbook.sheets.len()); for sheet in &workbook.sheets { @@ -214,26 +231,86 @@ impl XlsxDocument { Err(_) => Relationships::empty(), }; + // Resolve the worksheet's DRAWING rel up-front (Phase 1 + // has access to &mut archive). 
Each entry decodes + // `` and `` anchors and the underlying + // media bytes so Phase 2's parallel parser doesn't need + // the archive. + let (images, text_shapes) = read_drawing_for_sheet(&mut archive, &sheet_path, &ws_rels); + bundles.push(SheetBundle { name: sheet.name.clone(), data: ws_data, rels: ws_rels, + images, + text_shapes, }); } // Phase 2: parse worksheets (parallel when feature enabled) let worksheets = crate::core::parallel::map_collect(bundles, |b| -> Result { - let ws = Worksheet::parse(&b.data, b.name, &b.rels)?; + let mut ws = Worksheet::parse(&b.data, b.name, &b.rels)?; + ws.images = b.images; + ws.text_shapes = b.text_shapes; Ok(ws) })?; - debug!("XlsxDocument: {} worksheets parsed", worksheets.len()); + // Scan for chart XML parts (xl/charts/chart*.xml) and extract their + // visible text — title, axis titles, series names, category labels, + // cached values. We don't render charts as graphics but their words + // belong in any text-based downstream conversion (markdown, search + // indexes, accessibility readers, our PDF text fallback). + let mut chart_text: Vec = Vec::new(); + let chart_names: Vec = (0..archive.len()) + .filter_map(|i| archive.by_index(i).ok().map(|f| f.name().to_string())) + .filter(|n| n.starts_with("xl/charts/chart") && n.ends_with(".xml")) + .collect(); + for name in chart_names { + if let Ok(data) = Self::read_xml_entry(&mut archive, &name) { + let text = extract_chart_text(&data); + if !text.is_empty() { + chart_text.push(text); + } + } + } + + // Scan `xl/fonts/` for embedded font programs. Mirrors the + // DOCX (`word/fonts/`) and PPTX (`ppt/fonts/`) readers. 
+ let mut embedded_fonts: Vec<(String, Vec)> = Vec::new(); + let font_names: Vec = (0..archive.len()) + .filter_map(|i| archive.by_index(i).ok().map(|f| f.name().to_string())) + .filter(|n| { + n.starts_with("xl/fonts/") + && (n.to_lowercase().ends_with(".ttf") || n.to_lowercase().ends_with(".otf")) + }) + .collect(); + for name in font_names { + if let Ok(data) = opc::read_zip_entry(&mut archive, &name) { + let basename = name.rsplit('/').next().unwrap_or("font"); + let face = crate::docx::strip_embedded_font_filename(basename); + let font_name = if face.is_empty() { + basename.to_string() + } else { + face + }; + embedded_fonts.push((font_name, data)); + } + } + + debug!( + "XlsxDocument: {} worksheets parsed, {} chart(s), {} embedded fonts", + worksheets.len(), + chart_text.len(), + embedded_fonts.len() + ); Ok(XlsxDocument { workbook, worksheets, shared_strings, styles, theme: None, + chart_text, + embedded_fonts, styles_data: None, theme_data, }) @@ -338,19 +415,222 @@ impl XlsxDocument { .collect(); let worksheets = worksheets?; - debug!("XlsxDocument: {} worksheets parsed", worksheets.len()); + // Mirror the zip-path embedded-fonts scan over the OPC part + // listing. Loading via OPC is the slow path used when a + // caller hands us a pre-built `OpcReader`, so duplicating + // the cheap scan keeps font fidelity working there too. 
+ let mut embedded_fonts: Vec<(String, Vec)> = Vec::new(); + for name in opc.part_names() { + let s = name.to_string(); + if !s.starts_with("/xl/fonts/") { + continue; + } + let lower = s.to_lowercase(); + if !(lower.ends_with(".ttf") || lower.ends_with(".otf")) { + continue; + } + if let Ok(data) = opc.read_part(&name) { + let basename = s.rsplit('/').next().unwrap_or("font"); + let face = crate::docx::strip_embedded_font_filename(basename); + let font_name = if face.is_empty() { + basename.to_string() + } else { + face + }; + embedded_fonts.push((font_name, data)); + } + } + + debug!( + "XlsxDocument: {} worksheets parsed (OPC path), {} embedded fonts", + worksheets.len(), + embedded_fonts.len() + ); Ok(XlsxDocument { workbook, worksheets, shared_strings, styles, theme: None, + // OPC path doesn't extract chart text yet; the zip path is the + // hot one used by Document::from_reader. Charts via OPC can be + // added if a use case appears. + chart_text: Vec::new(), + embedded_fonts, styles_data: None, theme_data, }) } } +/// Extract structured content from a chart XML stream (DrawingML chart +/// format) into a flat textual representation. +/// +/// Walks the chart's title (``), axis titles (`` / +/// `` / ``), and each series (``). For every +/// series we capture the name (``), category labels (``), +/// and cached numeric values (``). The output groups them into +/// readable lines that include the **structure** of the chart — series +/// names paired with their values per category — rather than the flat +/// soup of ``/`` text the previous implementation produced. +/// +/// Output shape: +/// ```text +/// Title: ... +/// Categories: A, B, C, ... +/// Series Budget: 1690, 2100, 1570, ... +/// Series Projected: 1310, 3480, 510, ... +/// ``` +/// +/// This still travels through `to_markdown` and `convert_xlsx_to_ir` as +/// plain text (not an actual table), but the structure is now meaningful +/// for both human readers and downstream NLP / search. 
+fn extract_chart_text(xml: &[u8]) -> String { + let mut reader = quick_xml::Reader::from_reader(xml); + reader.config_mut().trim_text(false); + let mut buf = Vec::new(); + + // Tag-context stack — push localname on Start, pop on End. + let mut stack: Vec> = Vec::new(); + // Most recently seen text inside a `` (rich-text run) — used to + // build the chart title and axis-title strings. + let mut current_title: String = String::new(); + let mut titles: Vec = Vec::new(); + // The chart-level title is the first `` we close that lives + // outside any `` / `` / ``. + // Per-series state. + let mut series: Vec = Vec::new(); + let mut cur_series: Option = None; + // Current `` text being accumulated. + let mut cur_v: String = String::new(); + // Categories from the current series (or the first series — they are + // typically shared across all series in the chart). + let mut shared_categories: Vec = Vec::new(); + let mut cur_cat_buf: Vec = Vec::new(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(quick_xml::events::Event::Start(e)) => { + let local = e.local_name().as_ref().to_vec(); + if local == b"ser" { + cur_series = Some(ChartSeries::default()); + cur_cat_buf.clear(); + } + stack.push(local); + }, + Ok(quick_xml::events::Event::End(e)) => { + let local = e.local_name().as_ref().to_vec(); + let _ = stack.pop(); + match local.as_slice() { + b"t" => { + // End of a rich-text run — accumulate into current_title + // if we're inside a chart-level or axis title. + }, + b"title" => { + if !current_title.trim().is_empty() { + titles.push(current_title.trim().to_string()); + } + current_title.clear(); + }, + b"v" => { + let val = cur_v.trim().to_string(); + cur_v.clear(); + if val.is_empty() { + continue; + } + if let Some(s) = cur_series.as_mut() { + // Decide whether this is series-name, category, + // or value based on the enclosing scope. 
+ let in_tx = stack.iter().any(|t| t.as_slice() == b"tx"); + let in_cat = stack.iter().any(|t| t.as_slice() == b"cat"); + let in_val = stack.iter().any(|t| t.as_slice() == b"val"); + if in_tx && s.name.is_empty() { + s.name = val; + } else if in_cat { + cur_cat_buf.push(val); + } else if in_val { + s.values.push(val); + } + } + }, + b"ser" => { + if let Some(mut s) = cur_series.take() { + // Fold the per-series categories into shared_categories + // (first series wins — they are typically identical). + if shared_categories.is_empty() && !cur_cat_buf.is_empty() { + shared_categories = std::mem::take(&mut cur_cat_buf); + } else { + cur_cat_buf.clear(); + } + if s.name.is_empty() { + s.name = format!("Series {}", series.len() + 1); + } + series.push(s); + } + }, + _ => {}, + } + }, + Ok(quick_xml::events::Event::Text(t)) => { + if let Ok(s) = t.unescape() { + let trimmed = s.trim(); + if trimmed.is_empty() { + continue; + } + let top = stack.last().map(|v| v.as_slice()); + match top { + Some(b"t") => { + // Rich-text run — append to current_title. + if !current_title.is_empty() { + current_title.push_str(""); + } + current_title.push_str(trimmed); + }, + Some(b"v") => { + cur_v.push_str(trimmed); + }, + _ => {}, + } + } + }, + Ok(quick_xml::events::Event::Eof) => break, + Err(_) => break, + _ => {}, + } + buf.clear(); + } + + // Emit a structured representation. Each line is independent — the + // markdown writer joins them with `\n`. 
+ let mut out = String::new(); + if !titles.is_empty() { + out.push_str(&format!("Title: {}", titles.join(" — "))); + } + if !shared_categories.is_empty() { + if !out.is_empty() { + out.push('\n'); + } + out.push_str(&format!("Categories: {}", shared_categories.join(", "))); + } + for s in &series { + if !out.is_empty() { + out.push('\n'); + } + if s.values.is_empty() { + out.push_str(&format!("Series: {}", s.name)); + } else { + out.push_str(&format!("{}: {}", s.name, s.values.join(", "))); + } + } + out +} + +#[derive(Default)] +struct ChartSeries { + name: String, + values: Vec, +} + /// Compute the .rels path for a worksheet ZIP entry. /// e.g. "xl/worksheets/sheet1.xml" → "xl/worksheets/_rels/sheet1.xml.rels" fn sheet_rels_path(sheet_path: &str) -> String { @@ -372,3 +652,415 @@ impl crate::core::OfficeDocument for XlsxDocument { self.to_markdown() } } + +/// Read the DRAWING-rel target for a worksheet, parse its `` +/// and `` anchors, and resolve each picture's underlying media +/// bytes. Returns `(pictures, text_shapes)`. Soft failures (no rel, +/// missing part, parse error) yield empty vectors — drawings are +/// best-effort extras and shouldn't fail worksheet loading. 
+fn read_drawing_for_sheet( + archive: &mut ZipArchive, + sheet_path: &str, + sheet_rels: &Relationships, +) -> ( + Vec, + Vec, +) { + let drawing_rel = match sheet_rels.first_by_type(rel_types::DRAWING) { + Some(r) => r, + None => return (Vec::new(), Vec::new()), + }; + + let drawing_path = resolve_relative_zip_path(sheet_path, &drawing_rel.target); + + let drawing_xml = match XlsxDocument::read_xml_entry(archive, &drawing_path) { + Ok(d) => d, + Err(_) => return (Vec::new(), Vec::new()), + }; + + let drawing_rels_path = sheet_rels_path(&drawing_path); + let drawing_rels = match XlsxDocument::read_xml_entry(archive, &drawing_rels_path) { + Ok(d) => Relationships::parse(&d).unwrap_or_else(|_| Relationships::empty()), + Err(_) => Relationships::empty(), + }; + + let parsed = match parse_drawing_anchors(&drawing_xml) { + Ok(a) => a, + Err(_) => return (Vec::new(), Vec::new()), + }; + + // Resolve picture anchors → bytes. + let mut pictures = Vec::with_capacity(parsed.pictures.len()); + for a in parsed.pictures { + let rel = match drawing_rels.get_by_id(&a.embed_rid) { + Some(r) => r, + None => continue, + }; + let media_path = resolve_relative_zip_path(&drawing_path, &rel.target); + let bytes = match opc::read_zip_entry(archive, &media_path) { + Ok(b) => b, + Err(_) => continue, + }; + let ext = std::path::Path::new(&rel.target) + .extension() + .and_then(|s| s.to_str()) + .map(|s| s.to_ascii_lowercase()) + .unwrap_or_else(|| guess_image_format_from_bytes(&bytes).to_string()); + + pictures.push(crate::xlsx::worksheet::WorksheetPicture { + data: bytes, + format: ext, + x_emu: a.x_emu, + y_emu: a.y_emu, + cx_emu: a.cx_emu, + cy_emu: a.cy_emu, + alt_text: a.alt_text, + }); + } + + let text_shapes = parsed + .text_shapes + .into_iter() + .map(|t| crate::xlsx::worksheet::WorksheetTextShape { + text: t.text, + font_name: t.font_name, + font_size_pt: t.font_size_pt, + bold: t.bold, + italic: t.italic, + color_hex: t.color_hex, + x_emu: t.x_emu, + y_emu: t.y_emu, + cx_emu: 
t.cx_emu, + cy_emu: t.cy_emu, + }) + .collect(); + + (pictures, text_shapes) +} + +/// Resolve a `..`-relative target inside an OPC package back to an +/// absolute ZIP-entry path. Mirrors `PartName::resolve_relative` but +/// operates on plain ZIP paths (the `from_zip` fast path doesn't use +/// `PartName`). +fn resolve_relative_zip_path(source: &str, target: &str) -> String { + if target.starts_with('/') { + return target.trim_start_matches('/').to_string(); + } + let base_dir = match source.rfind('/') { + Some(i) => &source[..i], + None => "", + }; + let mut parts: Vec<&str> = if base_dir.is_empty() { + Vec::new() + } else { + base_dir.split('/').collect() + }; + for seg in target.split('/') { + match seg { + "" | "." => {}, + ".." => { + parts.pop(); + }, + other => parts.push(other), + } + } + parts.join("/") +} + +#[derive(Debug)] +struct DrawingPictureAnchor { + embed_rid: String, + x_emu: i64, + y_emu: i64, + cx_emu: i64, + cy_emu: i64, + alt_text: Option, +} + +#[derive(Debug, Default)] +struct DrawingTextAnchor { + text: String, + font_name: Option, + font_size_pt: Option, + bold: bool, + italic: bool, + color_hex: Option, + x_emu: i64, + y_emu: i64, + cx_emu: i64, + cy_emu: i64, +} + +#[derive(Debug, Default)] +struct DrawingAnchors { + pictures: Vec, + text_shapes: Vec, +} + +/// Parse `xl/drawings/drawingN.xml` and return both `` and +/// `` anchors. Supports `` (direct EMU +/// pos+ext) and the cell-anchor variants — for cell anchors we +/// approximate the absolute origin from `` x/y when present. +/// `` shapes carry text inside `` runs. +fn parse_drawing_anchors(xml_data: &[u8]) -> crate::core::Result { + use quick_xml::events::Event; + + let mut reader = crate::core::xml::make_fast_reader(xml_data); + let mut out = DrawingAnchors::default(); + + // Per-anchor accumulator state. We don't pre-classify the anchor + // as picture-vs-text; we discover that mid-walk based on which + // child element appears (`pic` vs `sp`). 
+ enum AnchorKind { + Unknown, + Picture, + Text, + } + let mut in_anchor = false; + let mut kind = AnchorKind::Unknown; + let mut x_emu = 0i64; + let mut y_emu = 0i64; + let mut cx_emu = 0i64; + let mut cy_emu = 0i64; + let mut embed_rid: Option = None; + let mut alt_text: Option = None; + // Text-shape state. + let mut in_txbody = false; + let mut in_run = false; + let mut in_a_t = false; + let mut text_buf = String::new(); + let mut font_name: Option = None; + let mut font_size_pt: Option = None; + let mut bold = false; + let mut italic = false; + let mut color_hex: Option = None; + let mut in_solid_fill = false; + + loop { + let evt = reader.read_event()?; + match evt { + Event::Start(ref e) => { + let local = e.local_name().as_ref().to_vec(); + match local.as_slice() { + b"absoluteAnchor" | b"oneCellAnchor" | b"twoCellAnchor" => { + in_anchor = true; + kind = AnchorKind::Unknown; + x_emu = 0; + y_emu = 0; + cx_emu = 0; + cy_emu = 0; + embed_rid = None; + alt_text = None; + in_txbody = false; + in_run = false; + in_a_t = false; + text_buf.clear(); + font_name = None; + font_size_pt = None; + bold = false; + italic = false; + color_hex = None; + in_solid_fill = false; + }, + b"pic" if in_anchor => { + kind = AnchorKind::Picture; + }, + b"sp" if in_anchor => { + kind = AnchorKind::Text; + }, + b"txBody" if in_anchor => { + in_txbody = true; + }, + b"r" if in_txbody => { + in_run = true; + }, + b"t" if in_run => { + in_a_t = true; + }, + b"rPr" if in_run => { + for attr in e.attributes().with_checks(false) { + let attr = attr.map_err(crate::core::Error::from)?; + let key = attr.key.as_ref(); + let raw = attr.unescape_value().map_err(crate::core::Error::from)?; + match key { + b"sz" => { + // sz is in hundredths of a pt. 
+ if let Ok(n) = raw.parse::() { + font_size_pt = Some(n as f32 / 100.0); + } + }, + b"b" => bold = raw == "1" || raw == "true", + b"i" => italic = raw == "1" || raw == "true", + _ => {}, + } + } + }, + b"solidFill" if in_run => { + in_solid_fill = true; + }, + b"cNvPr" if in_anchor => { + if let Some(d) = crate::core::xml::optional_attr_str(e, b"descr")? { + alt_text = Some(d.into_owned()); + } + }, + _ => {}, + } + }, + Event::Empty(ref e) => { + if !in_anchor { + continue; + } + let local = e.local_name().as_ref().to_vec(); + match local.as_slice() { + b"pos" => { + if let Some(v) = crate::core::xml::optional_attr_str(e, b"x")? { + x_emu = v.parse().unwrap_or(0); + } + if let Some(v) = crate::core::xml::optional_attr_str(e, b"y")? { + y_emu = v.parse().unwrap_or(0); + } + }, + b"ext" => { + if let Some(v) = crate::core::xml::optional_attr_str(e, b"cx")? { + cx_emu = v.parse().unwrap_or(0); + } + if let Some(v) = crate::core::xml::optional_attr_str(e, b"cy")? { + cy_emu = v.parse().unwrap_or(0); + } + }, + b"off" if cx_emu == 0 && cy_emu == 0 => { + if let Some(v) = crate::core::xml::optional_attr_str(e, b"x")? { + x_emu = v.parse().unwrap_or(x_emu); + } + if let Some(v) = crate::core::xml::optional_attr_str(e, b"y")? { + y_emu = v.parse().unwrap_or(y_emu); + } + }, + b"blip" => { + for attr in e.attributes().with_checks(false) { + let attr = attr.map_err(crate::core::Error::from)?; + let key = attr.key.as_ref(); + if key == b"r:embed" || key.ends_with(b":embed") || key == b"embed" { + let raw = + attr.unescape_value().map_err(crate::core::Error::from)?; + embed_rid = Some(raw.into_owned()); + break; + } + } + }, + b"cNvPr" => { + if let Some(d) = crate::core::xml::optional_attr_str(e, b"descr")? { + alt_text = Some(d.into_owned()); + } + }, + b"latin" if in_run => { + if let Some(t) = crate::core::xml::optional_attr_str(e, b"typeface")? 
{ + font_name = Some(t.into_owned()); + } + }, + b"srgbClr" if in_solid_fill => { + if let Some(v) = crate::core::xml::optional_attr_str(e, b"val")? { + color_hex = Some(v.into_owned().to_uppercase()); + } + }, + b"rPr" if in_run => { + for attr in e.attributes().with_checks(false) { + let attr = attr.map_err(crate::core::Error::from)?; + let key = attr.key.as_ref(); + let raw = attr.unescape_value().map_err(crate::core::Error::from)?; + match key { + b"sz" => { + if let Ok(n) = raw.parse::() { + font_size_pt = Some(n as f32 / 100.0); + } + }, + b"b" => bold = raw == "1" || raw == "true", + b"i" => italic = raw == "1" || raw == "true", + _ => {}, + } + } + }, + _ => {}, + } + }, + Event::Text(ref e) if in_a_t => { + let s = e.unescape().map_err(crate::core::Error::from)?; + text_buf.push_str(&s); + }, + Event::End(ref e) => { + let local = e.local_name().as_ref().to_vec(); + match local.as_slice() { + b"t" => in_a_t = false, + b"r" => in_run = false, + b"txBody" => in_txbody = false, + b"solidFill" => in_solid_fill = false, + s if matches!(s, b"absoluteAnchor" | b"oneCellAnchor" | b"twoCellAnchor") + && in_anchor => + { + in_anchor = false; + match kind { + AnchorKind::Picture => { + if let Some(rid) = embed_rid.take() { + out.pictures.push(DrawingPictureAnchor { + embed_rid: rid, + x_emu, + y_emu, + cx_emu, + cy_emu, + alt_text: alt_text.take(), + }); + } + }, + AnchorKind::Text => { + if !text_buf.is_empty() { + out.text_shapes.push(DrawingTextAnchor { + text: std::mem::take(&mut text_buf), + font_name: font_name.take(), + font_size_pt: font_size_pt.take(), + bold, + italic, + color_hex: color_hex.take(), + x_emu, + y_emu, + cx_emu, + cy_emu, + }); + } + }, + AnchorKind::Unknown => {}, + } + kind = AnchorKind::Unknown; + }, + _ => {}, + } + }, + Event::Eof => break, + _ => {}, + } + } + + Ok(out) +} + +/// Best-effort image-format detection from raw bytes (used when the +/// drawing rel target lacks a recognisable extension). Mirrors the +/// PPTX helper. 
/// Sniff an image container format from its leading magic bytes.
///
/// Used when a drawing relationship target has no usable file extension.
/// Returns one of `"png"`, `"jpeg"`, `"gif"`, `"bmp"`, `"tiff"`, `"wmf"`,
/// `"emf"`; unrecognised or empty input falls back to `"png"`.
fn guess_image_format_from_bytes(bytes: &[u8]) -> &'static str {
    /// `(magic prefix, format)` pairs, checked in order.
    const SIGNATURES: &[(&[u8], &str)] = &[
        (&[0x89, b'P', b'N', b'G'], "png"),
        (&[0xFF, 0xD8, 0xFF], "jpeg"),
        (b"GIF87a", "gif"),
        (b"GIF89a", "gif"),
        (b"BM", "bmp"),
        (b"II*\0", "tiff"),
        (b"MM\0*", "tiff"),
        (&[0xD7, 0xCD, 0xC6, 0x9A], "wmf"),
        (&[0x01, 0x00, 0x00, 0x00], "emf"),
    ];

    SIGNATURES
        .iter()
        .find(|(magic, _)| bytes.starts_with(magic))
        .map_or("png", |&(_, format)| format)
}

// ── src/xlsx/numfmt.rs ─────────────────────────────────────────────────────
//
// Excel number format rendering. (This paragraph is the module-level `//!`
// doc when the module lives in its own file.)
//
// Applies a numeric format string (or built-in format ID) to an f64 value
// and returns the display string. Covers the cases that matter in practice:
// integers, fixed and optional decimals, thousands separators, scale-by-1000
// commas, percentages, currency, literal prefixes/suffixes and scientific
// notation. Colour and condition brackets are stripped.

/// Apply an Excel number format to a numeric value.
///
/// * `n` — the raw cell value.
/// * `fmt_id` — the cell's `numFmtId`. The built-in IDs handled here
///   (0–13, 37–44, 49) are rendered directly and `fmt_str` is ignored.
/// * `fmt_str` — the custom format code, used for every other ID.
///
/// Returns the display string. NaN and ±infinity render as an empty string.
/// A missing, blank, `General` or `@` format code renders as General.
pub fn apply_format(n: f64, fmt_id: u32, fmt_str: Option<&str>) -> String {
    if !n.is_finite() {
        return String::new();
    }
    if let Some(rendered) = format_builtin(n, fmt_id) {
        return rendered;
    }
    // Custom format string (IDs 164+).
    match fmt_str.map(str::trim) {
        Some(fmt) if !fmt.is_empty() && fmt != "General" && fmt != "@" => apply_custom(n, fmt),
        _ => format_general(n),
    }
}

/// Render one of the built-in format IDs (OOXML spec §18.8.30), or `None`
/// when `fmt_id` is not handled here.
fn format_builtin(n: f64, fmt_id: u32) -> Option<String> {
    Some(match fmt_id {
        0 | 49 => format_general(n),          // General / @
        1 => format_integer(n),               // 0
        2 => format_fixed(n, 2),              // 0.00
        3 => format_commas(n, 0),             // #,##0
        4 => format_commas(n, 2),             // #,##0.00
        5 | 6 => format_currency(n, "$", 0),  // $#,##0
        7 | 8 => format_currency(n, "$", 2),  // $#,##0.00
        9 => format_percent(n, 0),            // 0%
        10 => format_percent(n, 2),           // 0.00%
        11 => format_scientific(n),           // 0.00E+00
        12 | 13 => format_general(n),         // # ?/? and # ??/?? (fractions — approximated)
        37 | 38 => format_commas(n, 0),       // #,##0 accounting variants
        39 | 40 => format_commas(n, 2),       // #,##0.00 accounting variants
        41..=44 => format_commas(n, 2),       // _(* ...) accounting
        _ => return None,
    })
}

// ── Simple format primitives ───────────────────────────────────────────────

/// Format a number the way Excel's General format does for common values:
/// whole numbers below 1e15 in magnitude print as integers, everything else
/// uses Rust's shortest round-trip float representation.
pub fn format_general(n: f64) -> String {
    if n == n.trunc() && n.abs() < 1e15 {
        (n as i64).to_string()
    } else {
        n.to_string()
    }
}

/// Round to the nearest integer (halves away from zero).
fn format_integer(n: f64) -> String {
    (n.round() as i64).to_string()
}

/// Format with exactly `decimals` digits after the decimal point.
fn format_fixed(n: f64, decimals: u8) -> String {
    format!("{:.prec$}", n, prec = usize::from(decimals))
}

/// Format a number with thousands-separator commas and the given decimal places.
///
/// Negative values get a leading `-`; the fraction is zero-padded to
/// `decimals` digits.
pub fn format_commas(n: f64, decimals: u8) -> String {
    let sign = if n < 0.0 { "-" } else { "" };

    // Round to the required precision first so a carry (e.g. 0.999 → 1.00)
    // reaches the integer part before it is split off.
    let factor = 10f64.powi(i32::from(decimals));
    let rounded = (n.abs() * factor).round() / factor;
    let int_str = insert_commas(rounded.trunc() as u64);

    if decimals == 0 {
        format!("{sign}{int_str}")
    } else {
        let frac = (rounded.fract() * factor).round() as u64;
        format!("{sign}{int_str}.{frac:0>width$}", width = usize::from(decimals))
    }
}

/// Prefix a comma-grouped number with a currency symbol. The sign stays with
/// the digits, so negatives render as e.g. `$-99.50`.
fn format_currency(n: f64, symbol: &str, decimals: u8) -> String {
    format!("{}{}", symbol, format_commas(n, decimals))
}

/// Format a number as a percentage (multiplied by 100, with optional decimal places).
pub fn format_percent(n: f64, decimals: u8) -> String {
    let pct = n * 100.0;
    if decimals == 0 {
        format!("{}%", pct.round() as i64)
    } else {
        format!("{:.prec$}%", pct, prec = usize::from(decimals))
    }
}

/// Built-in format 11 (`0.00E+00`): two mantissa decimals, signed exponent
/// of at least two digits.
fn format_scientific(n: f64) -> String {
    format_exponent(n, 2)
}

/// Scientific notation in Excel's shape: `d.ddE+XX`.
///
/// Rust's `{:E}` prints a bare exponent (`1.23E4`, `1.20E-4`); Excel always
/// shows the exponent sign and pads the exponent to two digits.
fn format_exponent(n: f64, decimals: u8) -> String {
    let raw = format!("{:.prec$E}", n, prec = usize::from(decimals));
    let Some((mantissa, exponent)) = raw.split_once('E') else {
        return raw;
    };
    let (sign, digits) = match exponent.strip_prefix('-') {
        Some(rest) => ('-', rest),
        None => ('+', exponent),
    };
    format!("{mantissa}E{sign}{digits:0>2}")
}

/// Group the decimal digits of `n` in threes, separated by commas.
fn insert_commas(n: u64) -> String {
    let digits = n.to_string();
    let groups: Vec<&str> = digits
        .as_bytes()
        // Chunk from the right so the short group, if any, comes first.
        .rchunks(3)
        .rev()
        .map(|group| std::str::from_utf8(group).expect("decimal digits are ASCII"))
        .collect();
    groups.join(",")
}

/// Strip up to `optional` trailing zeros from a fixed-decimal rendering,
/// then a dangling decimal point.
///
/// Callers guarantee `rendered` has at least `optional` fraction digits, so
/// this never eats into the integer part. Excel itself would keep a bare
/// trailing `.` (e.g. `5.` for `0.##`); it is dropped here because the output
/// feeds text extraction.
fn trim_optional_decimals(mut rendered: String, optional: u8) -> String {
    for _ in 0..optional {
        if !rendered.ends_with('0') {
            break;
        }
        rendered.pop();
    }
    if rendered.ends_with('.') {
        rendered.pop();
    }
    rendered
}

// ── Custom format string interpreter ──────────────────────────────────────

/// Simplified interpreter for Excel custom format codes.
///
/// Only the first `;`-separated section is used, for every value. Within it:
///
/// * `0` / `#` after `.` — required / optional decimals (optional ones are
///   trimmed when zero);
/// * `,` between digit placeholders — thousands separators; `,` after the
///   last placeholder — divide by 1000 per comma;
/// * `%` — multiply by 100 and append `%`;
/// * `E+` / `E-` after a digit placeholder — scientific notation;
/// * `$`, `[$sym-locale]` and unquoted characters before the number — prefix;
/// * `"quoted"` and `\x` literals — prefix before the number, suffix after;
/// * `[Colour]` / condition brackets, `_x` padding and `*x` fill — ignored.
fn apply_custom(n: f64, fmt: &str) -> String {
    // Multi-section: take the first section (positive numbers).
    // Second section = negatives, third = zero, fourth = text.
    let section = fmt.split(';').next().unwrap_or(fmt);

    // ── Parse the section ────────────────────────────────────────────────
    let mut prefix = String::new(); // currency symbol / literals before the number
    let mut suffix = String::new(); // literals after the number
    let mut has_percent = false;
    let mut has_thousands_sep = false;
    // Commas seen since the last digit placeholder. Followed by another
    // placeholder they are separators; left over at the end they scale.
    let mut pending_commas = 0u32;
    let mut decimal_zeros = 0u8; // '0' chars after '.' (required digits)
    let mut decimal_hashes = 0u8; // '#' chars after '.' (optional digits)
    let mut has_scientific = false;
    let mut in_decimal = false;
    let mut in_num_part = false;

    let mut chars = section.chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            // Bracketed: colour like [Red] or locale/currency like [$€-407]
            '[' => {
                let inner: String = chars.by_ref().take_while(|&ch| ch != ']').collect();
                if let Some(rest) = inner.strip_prefix('$') {
                    // [$symbol-locale] — extract symbol
                    let sym: String = rest.chars().take_while(|&ch| ch != '-').collect();
                    if !sym.is_empty() {
                        prefix = sym;
                    }
                }
                // Colour directives ignored.
            },
            // Quoted literal text: prefix before the number, suffix after it.
            '"' => {
                let literal: String = chars.by_ref().take_while(|&ch| ch != '"').collect();
                if in_num_part {
                    suffix.push_str(&literal);
                } else {
                    prefix.push_str(&literal);
                }
            },
            // Escape: the next char is a literal, placed like quoted text.
            '\\' => {
                if let Some(literal) = chars.next() {
                    if in_num_part {
                        suffix.push(literal);
                    } else {
                        prefix.push(literal);
                    }
                }
            },
            // _X = pad with X (alignment), *X = repeat X (fill) — skip X
            '_' | '*' => {
                chars.next();
            },
            '%' => {
                has_percent = true;
                in_num_part = true;
            },
            '.' => {
                in_decimal = true;
                in_num_part = true;
            },
            '0' | '#' => {
                in_num_part = true;
                if pending_commas > 0 {
                    // The commas sat between digit placeholders.
                    has_thousands_sep = true;
                    pending_commas = 0;
                }
                if in_decimal {
                    let counter = if c == '0' { &mut decimal_zeros } else { &mut decimal_hashes };
                    *counter = counter.saturating_add(1);
                }
            },
            ',' => {
                if in_num_part {
                    pending_commas += 1;
                }
            },
            // Only `E+` / `E-` after a digit placeholder is an exponent; any
            // other E/e falls through to the literal handling below.
            'E' | 'e' if in_num_part && matches!(chars.peek(), Some('+' | '-')) => {
                has_scientific = true;
                chars.next(); // '+' or '-'
                while chars.peek().is_some_and(|d| d.is_ascii_digit()) {
                    chars.next();
                }
            },
            '$' => {
                prefix = "$".to_string();
                in_num_part = true;
            },
            // Other literal characters before the number part = currency prefix
            other if !in_num_part && !other.is_ascii_whitespace() => {
                prefix.push(other);
            },
            _ => {},
        }
    }

    // ── Format the value ─────────────────────────────────────────────────
    let percent_scaled = if has_percent { n * 100.0 } else { n };
    // Each comma left after the last placeholder divides by 1000.
    let scale = 1000f64.powi(i32::try_from(pending_commas).unwrap_or(i32::MAX));
    let value = percent_scaled / scale;

    let max_decimals = decimal_zeros.saturating_add(decimal_hashes);
    let optional = max_decimals - decimal_zeros;
    let trim = |rendered: String| {
        if optional > 0 {
            trim_optional_decimals(rendered, optional)
        } else {
            rendered
        }
    };

    let body = if has_scientific {
        format_exponent(value, decimal_zeros)
    } else if has_thousands_sep {
        trim(format_commas(value, max_decimals))
    } else if in_decimal && max_decimals > 0 {
        trim(format_fixed(value, max_decimals))
    } else if in_num_part {
        format_integer(value)
    } else {
        format_general(value)
    };

    let pct_suffix = if has_percent { "%" } else { "" };

    format!("{}{}{}{}", prefix, body, suffix, pct_suffix)
}

// ── Tests ──────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn builtin_general() {
        assert_eq!(apply_format(42.0, 0, None), "42");
        assert_eq!(apply_format(4.25, 0, None), "4.25");
    }

    #[test]
    fn builtin_integer() {
        assert_eq!(apply_format(42.7, 1, None), "43");
    }

    #[test]
    fn builtin_fixed_two() {
        assert_eq!(apply_format(4.25678, 2, None), "4.26");
    }

    #[test]
    fn builtin_commas_zero() {
        assert_eq!(apply_format(1234567.0, 3, None), "1,234,567");
    }

    #[test]
    fn builtin_commas_two() {
        assert_eq!(apply_format(1234567.891, 4, None), "1,234,567.89");
    }

    #[test]
    fn builtin_percent_zero() {
        assert_eq!(apply_format(0.75, 9, None), "75%");
    }

    #[test]
    fn builtin_percent_two() {
        assert_eq!(apply_format(0.1234, 10, None), "12.34%");
    }

    #[test]
    fn builtin_currency_usd() {
        assert_eq!(apply_format(1234.5, 7, None), "$1,234.50");
    }

    #[test]
    fn custom_thousands() {
        assert_eq!(apply_format(1234567.0, 164, Some("#,##0")), "1,234,567");
    }

    #[test]
    fn custom_thousands_two_decimals() {
        assert_eq!(apply_format(1234.5, 164, Some("#,##0.00")), "1,234.50");
    }

    #[test]
    fn custom_percent() {
        assert_eq!(apply_format(0.5, 164, Some("0%")), "50%");
    }

    #[test]
    fn custom_percent_decimals() {
        assert_eq!(apply_format(0.1256, 164, Some("0.00%")), "12.56%");
    }

    #[test]
    fn custom_euro() {
        let result = apply_format(1234.5, 164, Some("[$€-407]#,##0.00"));
        assert!(result.contains("€"), "expected euro symbol, got: {result}");
        assert!(result.contains("1,234.50"), "expected formatted number, got: {result}");
    }

    #[test]
    fn custom_dollar_prefix() {
        assert_eq!(apply_format(99.9, 164, Some("$#,##0.00")), "$99.90");
    }

    #[test]
    fn negative_commas() {
        assert_eq!(apply_format(-1234.5, 4, None), "-1,234.50");
    }

    #[test]
    fn zero_percent() {
        assert_eq!(apply_format(0.0, 9, None), "0%");
    }

    #[test]
    fn large_commas() {
        assert_eq!(apply_format(1_000_000_000.0, 3, None), "1,000,000,000");
    }

    // ── Edge cases ──────────────────────────────────────────────────────

    #[test]
    fn nan_returns_empty() {
        assert_eq!(apply_format(f64::NAN, 0, None), "");
    }

    #[test]
    fn infinity_returns_empty() {
        assert_eq!(apply_format(f64::INFINITY, 0, None), "");
        assert_eq!(apply_format(f64::NEG_INFINITY, 0, None), "");
    }

    #[test]
    fn zero_renders_uniformly() {
        assert_eq!(apply_format(0.0, 0, None), "0");
        assert_eq!(apply_format(0.0, 2, None), "0.00");
        assert_eq!(apply_format(0.0, 4, None), "0.00");
    }

    #[test]
    fn negative_percent() {
        assert_eq!(apply_format(-0.25, 9, None), "-25%");
        assert_eq!(apply_format(-0.1234, 10, None), "-12.34%");
    }

    #[test]
    fn negative_currency() {
        assert_eq!(apply_format(-99.5, 7, None), "$-99.50");
    }

    #[test]
    fn scientific_builtin() {
        // Format id 11 = 0.00E+00 → signed, two-digit exponent.
        assert_eq!(apply_format(12345.6789, 11, None), "1.23E+04");
        assert_eq!(apply_format(0.00012, 11, None), "1.20E-04");
    }

    #[test]
    fn accounting_alias() {
        // 37–40 map to comma formats matching #,##0 family.
        assert_eq!(apply_format(1234.0, 37, None), "1,234");
        assert_eq!(apply_format(1234.5, 39, None), "1,234.50");
    }

    #[test]
    fn accounting_paren_range() {
        // 41..=44 are accounting variants → commas with 2 decimals.
        for id in 41u32..=44 {
            assert_eq!(apply_format(1234.5, id, None), "1,234.50", "fmt id {id}");
        }
    }

    #[test]
    fn fraction_falls_back_to_general() {
        // Fraction formats (12,13) currently render as general.
        assert_eq!(apply_format(1.5, 12, None), "1.5");
        assert_eq!(apply_format(2.0, 13, None), "2");
    }

    #[test]
    fn custom_general_falls_through_to_default() {
        // "General" and "@" should fall back to General formatting.
        assert_eq!(apply_format(42.5, 164, Some("General")), "42.5");
        assert_eq!(apply_format(42.0, 164, Some("@")), "42");
    }

    #[test]
    fn custom_blank_falls_back_to_general() {
        assert_eq!(apply_format(4.25, 164, Some("")), "4.25");
        assert_eq!(apply_format(4.25, 164, Some(" ")), "4.25");
    }

    #[test]
    fn custom_multi_section_uses_first() {
        // Multi-section format: positives use first section only.
        assert_eq!(apply_format(1234.5, 164, Some("#,##0.00;-#,##0.00")), "1,234.50");
    }

    #[test]
    fn custom_with_quoted_literal_suffix() {
        assert_eq!(apply_format(42.0, 164, Some(r#"0" units""#)), "42 units");
    }

    #[test]
    fn custom_with_quoted_literal_prefix() {
        assert_eq!(apply_format(5.0, 164, Some(r#""USD "0.00"#)), "USD 5.00");
    }

    #[test]
    fn custom_escaped_literals_are_kept() {
        assert_eq!(apply_format(7.0, 164, Some(r"0\ \k\g")), "7 kg");
    }

    #[test]
    fn custom_optional_decimals() {
        assert_eq!(apply_format(1.25, 164, Some("0.##")), "1.25");
        assert_eq!(apply_format(3.0, 164, Some("0.##")), "3");
        assert_eq!(apply_format(1.5, 164, Some("0.0#")), "1.5");
        assert_eq!(apply_format(1234.5, 164, Some("#,##0.##")), "1,234.5");
    }

    #[test]
    fn custom_scale_by_thousand() {
        assert_eq!(apply_format(12_200_000.0, 164, Some("0.0,,")), "12.2");
        assert_eq!(apply_format(1_234_567.0, 164, Some("#,##0,")), "1,235");
    }

    #[test]
    fn custom_color_directive_is_stripped() {
        // [Red] is a color directive — should be ignored, not emitted.
        let result = apply_format(123.0, 164, Some("[Red]#,##0"));
        assert!(!result.contains("Red"));
        assert!(result.contains("123"));
    }

    #[test]
    fn format_general_keeps_integers_unsuffixed() {
        // Whole-number floats render without ".0".
        assert_eq!(format_general(42.0), "42");
        assert_eq!(format_general(-7.0), "-7");
        assert_eq!(format_general(0.0), "0");
    }

    #[test]
    fn format_general_keeps_decimal_for_fraction() {
        assert_eq!(format_general(4.25), "4.25");
        assert_eq!(format_general(-2.5), "-2.5");
    }

    #[test]
    fn format_commas_negative_with_decimals() {
        assert_eq!(format_commas(-1234.5, 2), "-1,234.50");
    }

    #[test]
    fn format_commas_zero() {
        assert_eq!(format_commas(0.0, 0), "0");
        assert_eq!(format_commas(0.0, 2), "0.00");
    }

    #[test]
    fn format_percent_negative() {
        assert_eq!(format_percent(-0.5, 0), "-50%");
    }

    #[test]
    fn format_percent_zero_decimals() {
        // 50% with 0 decimals.
        assert_eq!(format_percent(0.5, 0), "50%");
    }
}
#[derive(Debug, Clone)] pub struct Font { @@ -105,7 +98,7 @@ impl StyleSheet { pub fn parse(xml_data: &[u8]) -> crate::core::Result { let mut reader = xml::make_fast_reader(xml_data); - let mut number_formats = Vec::new(); + let mut number_formats = HashMap::new(); let mut fonts = Vec::new(); let mut fills = Vec::new(); let mut borders = Vec::new(); @@ -116,7 +109,7 @@ impl StyleSheet { match reader.read_event()? { Event::Start(ref e) => match e.local_name().as_ref() { b"numFmts" => { - number_formats = parse_num_fmts(&mut reader)?; + number_formats = parse_num_fmts_map(&mut reader)?; }, b"fonts" => { fonts = parse_fonts(&mut reader)?; @@ -154,10 +147,7 @@ impl StyleSheet { pub fn number_format_for(&self, style_index: u32) -> Option<&str> { let xf = self.cell_formats.get(style_index as usize)?; let fmt_id = xf.number_format_id; - self.number_formats - .iter() - .find(|nf| nf.id == fmt_id) - .map(|nf| nf.format_code.as_str()) + self.number_formats.get(&fmt_id).map(|s| s.as_str()) } /// Get the font for a cell format index. @@ -175,16 +165,18 @@ impl StyleSheet { } } -/// Parse `` — custom number formats. -fn parse_num_fmts(reader: &mut quick_xml::Reader<&[u8]>) -> crate::core::Result> { - let mut formats = Vec::new(); +/// Parse `` — custom number formats into a HashMap for O(1) lookup. +fn parse_num_fmts_map( + reader: &mut quick_xml::Reader<&[u8]>, +) -> crate::core::Result> { + let mut map = HashMap::new(); loop { match reader.read_event()? 
{ Event::Start(ref e) | Event::Empty(ref e) if e.local_name().as_ref() == b"numFmt" => { let id: u32 = xml::required_attr_str(e, b"numFmtId")?.parse()?; let format_code = xml::required_attr_str(e, b"formatCode")?.into_owned(); - formats.push(NumberFormat { id, format_code }); + map.insert(id, format_code); }, Event::End(ref e) if e.local_name().as_ref() == b"numFmts" => { break; @@ -194,7 +186,7 @@ fn parse_num_fmts(reader: &mut quick_xml::Reader<&[u8]>) -> crate::core::Result< } } - Ok(formats) + Ok(map) } /// Parse `` collection. @@ -538,8 +530,7 @@ mod tests { // Number formats assert_eq!(ss.number_formats.len(), 1); - assert_eq!(ss.number_formats[0].id, 164); - assert_eq!(ss.number_formats[0].format_code, "yyyy-mm-dd"); + assert_eq!(ss.number_formats.get(&164).map(|s| s.as_str()), Some("yyyy-mm-dd")); // Fonts assert_eq!(ss.fonts.len(), 2); @@ -561,10 +552,7 @@ mod tests { #[test] fn number_format_lookup() { let ss = StyleSheet { - number_formats: vec![NumberFormat { - id: 164, - format_code: "yyyy-mm-dd".to_string(), - }], + number_formats: [(164u32, "yyyy-mm-dd".to_string())].into_iter().collect(), fonts: vec![], fills: vec![], borders: vec![], @@ -598,7 +586,7 @@ mod tests { #[test] fn font_lookup() { let ss = StyleSheet { - number_formats: vec![], + number_formats: std::collections::HashMap::new(), fonts: vec![ Font { bold: false, diff --git a/src/xlsx/text.rs b/src/xlsx/text.rs index b88d19e..83df018 100644 --- a/src/xlsx/text.rs +++ b/src/xlsx/text.rs @@ -1,6 +1,7 @@ use super::XlsxDocument; use super::cell::{Cell, CellValue}; use super::date; +use super::numfmt; use super::worksheet::Row; impl XlsxDocument { @@ -71,6 +72,14 @@ impl XlsxDocument { } } } + // Charts: emit each chart's extracted text under a "## Chart N" heading + // so its words appear in markdown / search / PDF without needing a + // graphical chart renderer. 
+ for (i, text) in self.chart_text.iter().enumerate() { + if !text.trim().is_empty() { + parts.push(format!("## Chart {}\n\n{}", i + 1, text)); + } + } parts.join("\n\n") } @@ -86,6 +95,33 @@ impl XlsxDocument { return Some(String::new()); } + // If the sheet is effectively single-column with prose-length cells + // (notes, single-column reports), emit each cell as its own paragraph + // instead of wrapping every line in a 1-column GFM table. The table + // form looks awful when rendered (tall, narrow, hard to read) and + // round-trips badly through markdown→IR→office. + if col_count == 1 + && ws.rows.iter().any(|r| { + r.cells + .first() + .map(|c| self.format_cell_value(c).chars().count() > 20) + .unwrap_or(false) + }) + { + let mut out = String::new(); + out.push_str(&format!("## {}\n\n", ws.name)); + for row in &ws.rows { + if let Some(cell) = row.cells.first() { + let text = self.format_cell_value(cell); + if !text.trim().is_empty() { + out.push_str(text.trim()); + out.push_str("\n\n"); + } + } + } + return Some(out.trim_end().to_string()); + } + let mut lines = Vec::new(); // Sheet name as heading @@ -143,6 +179,18 @@ impl XlsxDocument { return; } } + if let Some(idx) = cell.style_index { + if let Some(styles) = self.styles.as_ref() { + if let Some(fmt_id) = styles.number_format_id_for(idx) { + if fmt_id != 0 { + let fmt_str = styles.number_format_for(idx); + let formatted = numfmt::apply_format(*n, fmt_id, fmt_str); + buf.push_str(&formatted); + return; + } + } + } + } write_number(*n, buf); }, CellValue::String(s) => buf.push_str(s), @@ -164,6 +212,79 @@ impl XlsxDocument { CellValue::Date(dt) => buf.push_str(&dt.to_iso_string()), } } + + /// Pre-compute the set of style indices that map to date formats. + /// Call once before iterating many cells; use with `write_cell_value_fast`. 
+ pub fn date_style_indices(&self) -> std::collections::HashSet { + let Some(styles) = self.styles.as_ref() else { + return Default::default(); + }; + (0..styles.cell_formats.len() as u32) + .filter(|&idx| { + let Some(fmt_id) = styles.number_format_id_for(idx) else { + return false; + }; + date::is_date_format_id(fmt_id) + || styles + .number_format_for(idx) + .is_some_and(date::is_date_format_string) + }) + .collect() + } + + /// Like `write_cell_value` but uses a pre-computed date style set instead + /// of calling `is_date_cell()` (which re-scans format strings) per cell. + pub fn write_cell_value_fast( + &self, + cell: &Cell, + buf: &mut String, + date_indices: &std::collections::HashSet, + ) { + match &cell.value { + CellValue::Empty => {}, + CellValue::Number(n) => { + let is_date = cell + .style_index + .is_some_and(|i| date_indices.contains(&i)); + if is_date { + if let Some(dt) = date::DateTimeValue::from_serial(*n, self.workbook.date1904) { + buf.push_str(&dt.to_iso_string()); + return; + } + } + // Apply number format (thousands, decimals, %, currency, etc.) + if let Some(idx) = cell.style_index { + if let Some(styles) = self.styles.as_ref() { + if let Some(fmt_id) = styles.number_format_id_for(idx) { + if fmt_id != 0 { + let fmt_str = styles.number_format_for(idx); + let formatted = numfmt::apply_format(*n, fmt_id, fmt_str); + buf.push_str(&formatted); + return; + } + } + } + } + write_number(*n, buf); + }, + CellValue::String(s) => buf.push_str(s), + CellValue::SharedString(idx) => { + let s = self.shared_strings.get(*idx).unwrap_or(""); + if s.len() <= 32_768 { + buf.push_str(s); + } else { + let mut end = 32_768; + while !s.is_char_boundary(end) && end > 0 { + end -= 1; + } + buf.push_str(&s[..end]); + } + }, + CellValue::Boolean(b) => buf.push_str(if *b { "TRUE" } else { "FALSE" }), + CellValue::Error(e) => buf.push_str(e), + CellValue::Date(dt) => buf.push_str(&dt.to_iso_string()), + } + } } /// Write a formatted number directly to a buffer. 
diff --git a/src/xlsx/worksheet.rs b/src/xlsx/worksheet.rs index 0af3200..ee53726 100644 --- a/src/xlsx/worksheet.rs +++ b/src/xlsx/worksheet.rs @@ -17,6 +17,98 @@ pub struct Worksheet { pub merged_cells: Vec, /// Hyperlinks defined on this sheet. pub hyperlinks: Vec, + /// Per-sheet page geometry parsed from `` + ``. + pub page_setup: Option, + /// Pictures anchored on this worksheet via `xl/drawings/drawingN.xml`. + /// Resolved at parse time: anchor + image bytes are materialised + /// into this `Vec` so consumers don't need to re-walk the OPC + /// reader. Empty when the worksheet has no drawing rel. + pub images: Vec, + /// Layout-preserving text shapes anchored on this worksheet via a + /// DrawingML drawing part. Each entry is one `` carrying a + /// single styled run — populated by the round-trip from + /// `to_xlsx_bytes_layout`. Empty when the worksheet has no + /// `` shapes (the common XLSX case). + pub text_shapes: Vec, +} + +/// A text shape anchored on a worksheet via a DrawingML drawing part. +/// Mirrors `xlsx::write::SheetTextShape`. +#[derive(Debug, Clone)] +pub struct WorksheetTextShape { + /// Text content of the shape. + pub text: String, + /// Font face name. + pub font_name: Option, + /// Font size in points (full-pt scale). + pub font_size_pt: Option, + /// Bold weight. + pub bold: bool, + /// Italic style. + pub italic: bool, + /// 6-char hex colour, when present. + pub color_hex: Option, + /// X anchor in EMU. + pub x_emu: i64, + /// Y anchor in EMU. + pub y_emu: i64, + /// Width in EMU. + pub cx_emu: i64, + /// Height in EMU. + pub cy_emu: i64, +} + +/// A picture anchored on a worksheet via a DrawingML drawing part. +/// +/// Coordinates are in EMU (914400 per inch) and absolute relative to +/// the sheet origin (top-left). When the source used a one-cell or +/// two-cell anchor we approximate the equivalent absolute origin by +/// summing the from-cell coordinates. 
The bytes are the raw image +/// part contents; `format` is the lowercase file extension. +#[derive(Debug, Clone)] +pub struct WorksheetPicture { + /// Image bytes. + pub data: Vec, + /// Lowercase file extension (`"png"`, `"jpeg"`, ...). + pub format: String, + /// X anchor in EMU. + pub x_emu: i64, + /// Y anchor in EMU. + pub y_emu: i64, + /// Rendered width in EMU. + pub cx_emu: i64, + /// Rendered height in EMU. + pub cy_emu: i64, + /// Optional `` accessibility text. + pub alt_text: Option, +} + +/// Per-sheet page geometry; every stored value (margins and dimensions) is in twips. +/// +/// Parsed from `<pageMargins>` (margins in inches per ECMA-376) and +/// `<pageSetup>` (size as `paperWidth`/`paperHeight` with a unit suffix +/// — `mm`, `cm`, `in` — or as a `paperSize` enum). Stored in twips for +/// IR parity (1 inch = 1440 twips, 1 mm = 1440/25.4 ≈ 56.6929 twips). +#[derive(Debug, Clone, Copy, PartialEq, Default)] +pub struct PageSetup { + /// Page width in twips. Zero if no page setup was seen. + pub width_twips: u32, + /// Page height in twips. + pub height_twips: u32, + /// Top margin in twips. + pub margin_top_twips: u32, + /// Bottom margin in twips. + pub margin_bottom_twips: u32, + /// Left margin in twips. + pub margin_left_twips: u32, + /// Right margin in twips. + pub margin_right_twips: u32, + /// Header distance from top edge in twips. + pub header_distance_twips: u32, + /// Footer distance from bottom edge in twips. + pub footer_distance_twips: u32, + /// Whether the page is in landscape orientation. + pub landscape: bool, } /// A row from ``. @@ -63,6 +155,12 @@ impl Worksheet { let mut rows = Vec::new(); let mut merged_cells = Vec::new(); let mut hyperlinks = Vec::new(); + // Page setup is collected lazily because `<pageMargins>` and + // `<pageSetup>` arrive as separate sibling elements and either may + // appear without the other. We materialize the IR value at the + // end iff at least one was seen. 
+ let mut margins_in: Option = None; + let mut page_setup_raw: Option = None; loop { match reader.read_event()? { @@ -86,6 +184,14 @@ impl Worksheet { } reader.read_to_end(e.to_end().name())?; }, + b"pageMargins" => { + margins_in = parse_page_margins(e)?; + reader.read_to_end(e.to_end().name())?; + }, + b"pageSetup" => { + page_setup_raw = parse_page_setup_attrs(e)?; + reader.read_to_end(e.to_end().name())?; + }, _ => {}, }, Event::Empty(ref e) => match e.local_name().as_ref() { @@ -102,6 +208,12 @@ impl Worksheet { hyperlinks.push(hl); } }, + b"pageMargins" => { + margins_in = parse_page_margins(e)?; + }, + b"pageSetup" => { + page_setup_raw = parse_page_setup_attrs(e)?; + }, _ => {}, }, Event::Eof => break, @@ -109,16 +221,175 @@ impl Worksheet { } } + let page_setup = build_page_setup(margins_in, page_setup_raw); + Ok(Worksheet { name, dimension, rows, merged_cells, hyperlinks, + page_setup, + images: Vec::new(), + text_shapes: Vec::new(), }) } } +/// Raw `` values in inches (per ECMA-376 §18.3.1.62). +#[derive(Debug, Clone, Copy)] +struct PageMarginsIn { + left: f64, + right: f64, + top: f64, + bottom: f64, + header: f64, + footer: f64, +} + +/// Raw `` shape — physical dimensions in twips plus orientation. +#[derive(Debug, Clone, Copy, Default)] +struct PageSetupRaw { + width_twips: u32, + height_twips: u32, + landscape: bool, +} + +fn parse_page_margins( + e: &quick_xml::events::BytesStart, +) -> crate::core::Result> { + let parse = |k: &[u8]| -> crate::core::Result> { + Ok(xml::optional_attr_str(e, k)? 
+ .and_then(|v| fast_float2::parse::(v.as_ref()).ok())) + }; + let left = parse(b"left")?; + let right = parse(b"right")?; + let top = parse(b"top")?; + let bottom = parse(b"bottom")?; + let header = parse(b"header")?; + let footer = parse(b"footer")?; + if left.is_none() && right.is_none() && top.is_none() && bottom.is_none() { + return Ok(None); + } + Ok(Some(PageMarginsIn { + left: left.unwrap_or(0.7), + right: right.unwrap_or(0.7), + top: top.unwrap_or(0.75), + bottom: bottom.unwrap_or(0.75), + header: header.unwrap_or(0.3), + footer: footer.unwrap_or(0.3), + })) +} + +/// Translate an inch / mm / cm dimension token (e.g. "210mm", "8.5in", +/// "21cm", or a bare "210" assumed mm) into twips. Returns `None` for +/// blanks, non-positive or non-finite ("nan" / "inf") values, and values that fail to parse. +fn dim_to_twips(s: &str) -> Option { + let s = s.trim(); + if s.is_empty() { + return None; + } + let (num_part, factor): (&str, f64) = if let Some(rest) = s.strip_suffix("mm") { + (rest, 1440.0 / 25.4) + } else if let Some(rest) = s.strip_suffix("cm") { + (rest, 1440.0 / 2.54) + } else if let Some(rest) = s.strip_suffix("in") { + (rest, 1440.0) + } else { + // Bare numeric — ECMA-376 says the default unit varies by locale; + // mm is the safest bet for arbitrary writers (and matches what we + // emit in `build_worksheet_xml`). + (s, 1440.0 / 25.4) + }; + let v: f64 = fast_float2::parse(num_part.trim()).ok()?; + if !v.is_finite() || v <= 0.0 { + return None; + } + Some((v * factor).round() as u32) +} + +/// Translate the OOXML `paperSize` enum into (width_twips, height_twips). +/// Covers the dimensions we're most likely to encounter in a PDF→XLSX +/// round-trip — Letter, Legal, A3, A4, A5, B4, B5, Executive, Tabloid. +/// Unknown values fall back to A4 portrait. 
+fn paper_size_enum_to_twips(id: u32) -> (u32, u32) { + match id { + 1 => (12240, 15840), // Letter 8.5 × 11" + 5 => (12240, 20160), // Legal 8.5 × 14" + 7 => (10440, 15120), // Executive 7.25 × 10.5" + 8 => (16838, 23811), // A3 297 × 420 mm + 9 => (11906, 16838), // A4 210 × 297 mm + 11 => (8391, 11906), // A5 148 × 210 mm + 12 => (14173, 20013), // B4 250 × 353 mm + 13 => (9978, 14173), // B5 176 × 250 mm + 3 => (15840, 24480), // Tabloid 11 × 17" + _ => (11906, 16838), // Default A4 + } +} + +fn parse_page_setup_attrs( + e: &quick_xml::events::BytesStart, +) -> crate::core::Result> { + let pw = xml::optional_attr_str(e, b"paperWidth")?.and_then(|v| dim_to_twips(v.as_ref())); + let ph = xml::optional_attr_str(e, b"paperHeight")?.and_then(|v| dim_to_twips(v.as_ref())); + let paper_size = xml::optional_attr_str(e, b"paperSize")? + .and_then(|v| atoi_simd::parse_pos::(v.as_bytes()).ok()); + let orientation = xml::optional_attr_str(e, b"orientation")?; + let landscape = matches!(orientation.as_deref(), Some("landscape")); + + let (width_twips, height_twips) = match (pw, ph) { + (Some(w), Some(h)) => (w, h), + _ => match paper_size { + Some(id) => paper_size_enum_to_twips(id), + None => return Ok(None), + }, + }; + + Ok(Some(PageSetupRaw { + width_twips, + height_twips, + landscape, + })) +} + +fn build_page_setup( + margins: Option, + raw: Option, +) -> Option { + if margins.is_none() && raw.is_none() { + return None; + } + let in_to_twips = |v: f64| (v * 1440.0).round().max(0.0) as u32; + let m = margins.unwrap_or(PageMarginsIn { + left: 0.7, + right: 0.7, + top: 0.75, + bottom: 0.75, + header: 0.3, + footer: 0.3, + }); + let r = raw.unwrap_or_default(); + let mut ps = PageSetup { + width_twips: r.width_twips, + height_twips: r.height_twips, + margin_top_twips: in_to_twips(m.top), + margin_bottom_twips: in_to_twips(m.bottom), + margin_left_twips: in_to_twips(m.left), + margin_right_twips: in_to_twips(m.right), + header_distance_twips: in_to_twips(m.header), +
footer_distance_twips: in_to_twips(m.footer), + landscape: r.landscape, + }; + // If we only saw (no ), leave dimensions + // unset so the caller can fall back to the IR default; otherwise + // downstream renderers would draw onto a 0×0 page. + if ps.width_twips == 0 || ps.height_twips == 0 { + ps.width_twips = 0; + ps.height_twips = 0; + } + Some(ps) +} + fn parse_hyperlink( e: &quick_xml::events::BytesStart, rels: &crate::core::relationships::Relationships, @@ -371,6 +642,41 @@ mod tests { assert!(matches!(cell.value, CellValue::Number(n) if n == 20.0)); } + #[test] + fn parse_worksheet_page_setup() { + let xml = br#" + + + + +"#; + let ws = Worksheet::parse(xml, "S".to_string(), &empty_rels()).unwrap(); + let ps = ws.page_setup.expect("page_setup parsed"); + // 215.9mm ≈ 8.5", 279.4mm ≈ 11", both in twips + assert!((ps.width_twips as i32 - 12240).abs() <= 1, "width {:?}", ps.width_twips); + assert!((ps.height_twips as i32 - 15840).abs() <= 1, "height {:?}", ps.height_twips); + // 0.5" margin = 720 twips + assert_eq!(ps.margin_top_twips, 720); + assert_eq!(ps.margin_left_twips, 720); + assert!(!ps.landscape); + } + + #[test] + fn parse_worksheet_page_setup_paper_enum() { + // paperSize=9 = A4 → 11906x16838 twips. + let xml = br#" + + + + +"#; + let ws = Worksheet::parse(xml, "S".to_string(), &empty_rels()).unwrap(); + let ps = ws.page_setup.expect("page_setup parsed"); + assert_eq!(ps.width_twips, 11906); + assert_eq!(ps.height_twips, 16838); + assert!(ps.landscape); + } + #[test] fn parse_worksheet_merged_cells() { let xml = br#" @@ -387,4 +693,163 @@ mod tests { let ws = Worksheet::parse(xml, "Sheet1".to_string(), &empty_rels()).unwrap(); assert_eq!(ws.merged_cells, vec!["A1:C1"]); } + + // ── dim_to_twips ───────────────────────────────────────────────────── + + #[test] + fn dim_to_twips_inches() { + // 1 inch = 1440 twips. 
+ assert_eq!(dim_to_twips("1in"), Some(1440)); + assert_eq!(dim_to_twips("8.5in"), Some(12240)); + } + + #[test] + fn dim_to_twips_millimeters() { + // 210mm = 11906 twips (A4 width); allow ±1 for rounding. + let twips = dim_to_twips("210mm").unwrap(); + assert!((twips as i32 - 11906).abs() <= 1, "got {twips}"); + } + + #[test] + fn dim_to_twips_centimeters() { + // 1cm = 1440/2.54 ≈ 567 twips. + let twips = dim_to_twips("1cm").unwrap(); + assert!((twips as i32 - 567).abs() <= 1, "got {twips}"); + } + + #[test] + fn dim_to_twips_bare_number_assumed_mm() { + // Bare numeric defaults to mm. + let a = dim_to_twips("210mm").unwrap(); + let b = dim_to_twips("210").unwrap(); + assert_eq!(a, b); + } + + #[test] + fn dim_to_twips_empty_and_zero() { + assert_eq!(dim_to_twips(""), None); + assert_eq!(dim_to_twips(" "), None); + // Zero / negative dimensions are nonsensical: rejected. + assert_eq!(dim_to_twips("0mm"), None); + assert_eq!(dim_to_twips("-5in"), None); + } + + #[test] + fn dim_to_twips_invalid_string() { + assert_eq!(dim_to_twips("garbage"), None); + assert_eq!(dim_to_twips("abcmm"), None); + } + + // ── paper_size_enum_to_twips ──────────────────────────────────────── + + #[test] + fn paper_size_letter() { + assert_eq!(paper_size_enum_to_twips(1), (12240, 15840)); + } + + #[test] + fn paper_size_legal() { + assert_eq!(paper_size_enum_to_twips(5), (12240, 20160)); + } + + #[test] + fn paper_size_a4() { + assert_eq!(paper_size_enum_to_twips(9), (11906, 16838)); + } + + #[test] + fn paper_size_unknown_falls_back_to_a4() { + assert_eq!(paper_size_enum_to_twips(9999), (11906, 16838)); + } + + // ── build_page_setup ──────────────────────────────────────────────── + + #[test] + fn build_page_setup_returns_none_when_both_missing() { + assert!(build_page_setup(None, None).is_none()); + } + + #[test] + fn build_page_setup_margins_only_zeroes_dimensions() { + // without → dimensions left at 0 so + // a downstream consumer falls back to its default page size. 
+ let margins = Some(PageMarginsIn { + left: 1.0, + right: 1.0, + top: 1.0, + bottom: 1.0, + header: 0.5, + footer: 0.5, + }); + let ps = build_page_setup(margins, None).unwrap(); + assert_eq!(ps.width_twips, 0); + assert_eq!(ps.height_twips, 0); + // 1 inch margins = 1440 twips. + assert_eq!(ps.margin_top_twips, 1440); + assert_eq!(ps.margin_left_twips, 1440); + assert_eq!(ps.header_distance_twips, 720); // 0.5 in + } + + #[test] + fn build_page_setup_dimensions_only_uses_default_margins() { + // alone uses ECMA-376 default 0.7/0.7/0.75/0.75 inch margins. + let raw = Some(PageSetupRaw { + width_twips: 12240, + height_twips: 15840, + landscape: false, + }); + let ps = build_page_setup(None, raw).unwrap(); + assert_eq!(ps.width_twips, 12240); + assert_eq!(ps.height_twips, 15840); + // 0.7in = 1008 twips. + assert_eq!(ps.margin_left_twips, 1008); + // 0.75in = 1080 twips. + assert_eq!(ps.margin_top_twips, 1080); + } + + #[test] + fn build_page_setup_combines_both() { + let margins = Some(PageMarginsIn { + left: 0.5, + right: 0.5, + top: 0.5, + bottom: 0.5, + header: 0.3, + footer: 0.3, + }); + let raw = Some(PageSetupRaw { + width_twips: 11906, + height_twips: 16838, + landscape: true, + }); + let ps = build_page_setup(margins, raw).unwrap(); + assert_eq!(ps.width_twips, 11906); + assert!(ps.landscape); + assert_eq!(ps.margin_left_twips, 720); // 0.5in + } + + #[test] + fn parse_worksheet_landscape_with_paper_enum() { + // Verifies that landscape attribute survives the parse_page_setup_attrs path. + let xml = br#" + + + +"#; + let ws = Worksheet::parse(xml, "S".to_string(), &empty_rels()).unwrap(); + let ps = ws.page_setup.expect("page_setup"); + assert_eq!(ps.width_twips, 12240); // Letter + assert!(ps.landscape); + } + + #[test] + fn parse_worksheet_default_when_no_setup() { + // No or → no page_setup at all. 
+ let xml = br#" + + +"#; + let ws = Worksheet::parse(xml, "S".to_string(), &empty_rels()).unwrap(); + assert!(ws.page_setup.is_none()); + } } diff --git a/src/xlsx/write.rs b/src/xlsx/write.rs index bf9da7d..7604cb9 100644 --- a/src/xlsx/write.rs +++ b/src/xlsx/write.rs @@ -232,6 +232,98 @@ pub enum CellData { /// Builder for creating XLSX files. pub struct XlsxWriter { sheets: Vec, + /// Embedded font programs to ship inside the package under `xl/fonts/`. + /// Same layout as DOCX `word/fonts/` and PPTX `ppt/fonts/`. Excel + /// itself doesn't honor these without `` / theme + /// plumbing, but the in-process reader scans the directory so + /// PDF↔XLSX round-trips can preserve typefaces. + embedded_fonts: Vec<(String, Vec)>, + /// Document metadata for `docProps/core.xml`. `None` means no + /// core-properties part is written. + metadata: Option, +} + +/// Per-worksheet page geometry. +/// +/// Maps roughly 1-to-1 onto OOXML's `` and `` — +/// margins are stored in inches per ECMA-376 (§18.3.1.62), the page size +/// is emitted as `paperWidth`/`paperHeight` in millimetres so arbitrary +/// PDF MediaBox dimensions round-trip without snapping to the nearest +/// `paperSize` enum. All inputs are twips for parity with the rest of +/// the IR (`width_twips`, `margin_top_twips`, …). +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct PageSetup { + /// Page width in twips (1/1440 inch). + pub width_twips: u32, + /// Page height in twips. + pub height_twips: u32, + /// Top margin in twips. + pub margin_top_twips: u32, + /// Bottom margin in twips. + pub margin_bottom_twips: u32, + /// Left margin in twips. + pub margin_left_twips: u32, + /// Right margin in twips. + pub margin_right_twips: u32, + /// Header distance from top edge in twips. + pub header_distance_twips: u32, + /// Footer distance from bottom edge in twips. + pub footer_distance_twips: u32, + /// Whether the page is in landscape orientation. 
+ pub landscape: bool, +} + +/// A picture anchored on a worksheet via a DrawingML drawing part. +/// +/// Anchor coordinates are in EMU and absolute relative to the sheet +/// origin (top-left). Round-trips render via `` in +/// `xl/drawings/drawingN.xml`. The writer emits the bytes verbatim; the +/// reader resolves them back through the worksheet → drawing → image +/// relationship chain. +#[derive(Debug, Clone)] +pub struct SheetImage { + /// Raw image bytes (PNG / JPEG / etc., as produced by the source). + pub data: Vec, + /// Lowercase file extension (`"png"`, `"jpeg"`, ...). + pub format: String, + /// X anchor in EMU, from sheet origin. + pub x_emu: i64, + /// Y anchor in EMU. + pub y_emu: i64, + /// Rendered width in EMU. + pub cx_emu: i64, + /// Rendered height in EMU. + pub cy_emu: i64, +} + +/// A text shape anchored on a worksheet via a DrawingML drawing part. +/// +/// Used by the layout-preserving PDF→XLSX path to emit each PDF text +/// span at its exact source EMU coordinates as an `` shape +/// inside `xl/drawings/drawingN.xml`. The shape carries a single run +/// with the span's text, font, size, weight, italic, and colour. +#[derive(Debug, Clone)] +pub struct SheetTextShape { + /// Text content of the shape (single run). + pub text: String, + /// Font face name (e.g. `"Times New Roman"`). + pub font_name: String, + /// Font size in points (full-pt scale, not half-pt). + pub font_size_pt: f32, + /// Bold weight. + pub bold: bool, + /// Italic style. + pub italic: bool, + /// Optional 6-char hex colour like `"FF0000"`. `None` ⇒ pure black. + pub color_hex: Option, + /// X anchor in EMU. + pub x_emu: i64, + /// Y anchor in EMU. + pub y_emu: i64, + /// Width in EMU. + pub cx_emu: i64, + /// Height in EMU. + pub cy_emu: i64, } /// Full internal sheet representation. @@ -242,6 +334,13 @@ struct SheetDataInner { pub cell_styles: HashMap<(usize, usize), CellStyle>, /// Merged cell regions: (row, col, row_span, col_span). 
pub merge_regions: Vec<(usize, usize, usize, usize)>, + /// Per-sheet page geometry (`<pageMargins>` + `<pageSetup>`). + pub page_setup: Option, + /// Pictures anchored on this sheet via a DrawingML drawing part. + pub images: Vec, + /// Text shapes anchored on this sheet via a DrawingML drawing part. + /// Used by the layout-preserving PDF→XLSX path. + pub text_shapes: Vec, } impl SheetDataInner { @@ -252,6 +351,9 @@ impl SheetDataInner { col_widths: HashMap::new(), cell_styles: HashMap::new(), merge_regions: Vec::new(), + page_setup: None, + images: Vec::new(), + text_shapes: Vec::new(), } } @@ -383,6 +485,73 @@ impl<'a> SheetData<'a> { self.0.merge_cells(row, col, row_span, col_span); self } + + /// Set per-sheet page geometry. Emits `<pageMargins>` and `<pageSetup>` + /// inside the worksheet XML so PDF→XLSX→PDF round-trips preserve the + /// source MediaBox and margins instead of snapping back to default + /// Letter-portrait. A sheet that never calls this emits neither element. + pub fn set_page_setup(&mut self, ps: PageSetup) -> &mut Self { + self.0.page_setup = Some(ps); + self + } + + /// Anchor a styled text run on this worksheet at absolute EMU + /// coordinates. Used by the PDF→XLSX layout-preserving path: each + /// PDF text span becomes one `` shape with a single run. + #[allow(clippy::too_many_arguments)] + pub fn add_text_shape( + &mut self, + text: impl Into, + font_name: impl Into, + font_size_pt: f32, + bold: bool, + italic: bool, + color_hex: Option, + x_emu: i64, + y_emu: i64, + cx_emu: i64, + cy_emu: i64, + ) -> &mut Self { + self.0.text_shapes.push(SheetTextShape { + text: text.into(), + font_name: font_name.into(), + font_size_pt, + bold, + italic, + color_hex, + x_emu, + y_emu, + cx_emu, + cy_emu, + }); + self + } + + /// Anchor a picture on this worksheet at absolute EMU coordinates. + /// + /// On write the writer materialises a `xl/drawings/drawingN.xml` + /// part for this sheet, registers an IMAGE relationship per + /// picture, and writes the bytes under `xl/media/image__.`. 
+ /// `format` is the lowercase file extension (`"png"`, `"jpeg"`, ...). + pub fn add_image( + &mut self, + data: Vec, + format: impl Into, + x_emu: i64, + y_emu: i64, + cx_emu: i64, + cy_emu: i64, + ) -> &mut Self { + self.0.images.push(SheetImage { + data, + format: format.into(), + x_emu, + y_emu, + cx_emu, + cy_emu, + }); + self + } } // --------------------------------------------------------------------------- @@ -398,7 +567,28 @@ impl Default for XlsxWriter { impl XlsxWriter { /// Create a new, empty XLSX writer. pub fn new() -> Self { - Self { sheets: Vec::new() } + Self { + sheets: Vec::new(), + embedded_fonts: Vec::new(), + metadata: None, + } + } + + /// Set document metadata (written to `docProps/core.xml`). + pub fn set_metadata(&mut self, meta: &crate::ir::Metadata) -> &mut Self { + self.metadata = Some(meta.clone()); + self + } + + /// Embed a font program (TrueType / OpenType bytes) under `xl/fonts/`. + /// `name` is used for the file name and as the human-readable font name. + /// Subsequent calls with the same name are deduplicated. + pub fn embed_font(&mut self, name: impl Into, data: Vec) -> &mut Self { + let name = name.into(); + if !self.embedded_fonts.iter().any(|(n, _)| n == &name) { + self.embedded_fonts.push((name, data)); + } + self } /// Add a worksheet and return a mutable handle to it. @@ -455,6 +645,13 @@ impl XlsxWriter { } } + /// Set per-sheet page geometry by sheet index. See `SheetData::set_page_setup`. + pub fn sheet_set_page_setup(&mut self, sheet: usize, ps: PageSetup) { + if let Some(s) = self.sheets.get_mut(sheet) { + s.page_setup = Some(ps); + } + } + /// Save the workbook to a file. pub fn save(&self, path: impl AsRef) -> Result<()> { let mut opc = OpcWriter::create(path)?; @@ -480,6 +677,17 @@ impl XlsxWriter { opc.add_package_rel(rel_types::OFFICE_DOCUMENT, "xl/workbook.xml"); + // Core properties (docProps/core.xml). Optional; written only + // when caller supplied metadata via `set_metadata`. 
Surfaces + // PDF /Title /Author etc. in Excel's "Properties" dialog after + // a PDF→XLSX→Excel round trip. + if let Some(ref meta) = self.metadata { + let core_part = PartName::new("/docProps/core.xml")?; + opc.add_package_rel(rel_types::CORE_PROPERTIES, "docProps/core.xml"); + let core_xml = crate::core::core_properties::generate_xml(meta); + opc.add_part(&core_part, crate::core::core_properties::CONTENT_TYPE, &core_xml)?; + } + let mut sheet_rids = Vec::with_capacity(self.sheets.len()); for (i, _) in self.sheets.iter().enumerate() { let target = format!("worksheets/sheet{}.xml", i + 1); @@ -497,7 +705,24 @@ impl XlsxWriter { for (i, sheet) in self.sheets.iter().enumerate() { let part_name_str = format!("/xl/worksheets/sheet{}.xml", i + 1); let part_name = PartName::new(&part_name_str)?; - let ws_xml = Self::build_worksheet_xml(sheet, &style_table)?; + + // Emit drawing + media parts up-front so we have the rId + // for the `` element inside the + // worksheet XML below. Sheets without pictures or text + // shapes get no drawing part at all. + let drawing_rid = if !sheet.images.is_empty() || !sheet.text_shapes.is_empty() { + Some(Self::write_drawing_for_sheet( + opc, + &part_name, + i + 1, + &sheet.images, + &sheet.text_shapes, + )?) + } else { + None + }; + + let ws_xml = Self::build_worksheet_xml(sheet, &style_table, drawing_rid.as_deref())?; opc.add_part(&part_name, CT_WORKSHEET, &ws_xml)?; } @@ -505,6 +730,13 @@ impl XlsxWriter { let styles_xml = style_table.build_styles_xml()?; opc.add_part(&styles_part, CT_STYLES, &styles_xml)?; + // Embed fonts under `xl/fonts/font__.ttf`. Same + // layout as DOCX/PPTX. Excel itself doesn't auto-discover the + // fonts without `` plumbing, but the in-process + // reader scans the directory so PDF↔XLSX round-trips can reuse + // the source typeface. 
+ crate::core::embedded_fonts::write_embedded_fonts(opc, "/xl/fonts/", &self.embedded_fonts)?; + Ok(()) } @@ -538,6 +770,7 @@ impl XlsxWriter { fn build_worksheet_xml( sheet: &SheetDataInner, style_table: &StyleTable, + drawing_rid: Option<&str>, ) -> crate::core::Result> { let mut w = Writer::new_with_indent(Vec::new(), b' ', 2); @@ -545,6 +778,11 @@ impl XlsxWriter { let mut root = BytesStart::new("worksheet"); root.push_attribute(("xmlns", NS_SML)); + // Worksheets that anchor drawings need the relationship + // namespace so the `` element below + // resolves. Declaring it unconditionally is harmless for + // plain-data sheets and keeps the writer code simple. + root.push_attribute(("xmlns:r", NS_REL)); w.write_event(Event::Start(root))?; // Column widths @@ -608,11 +846,135 @@ impl XlsxWriter { w.write_event(Event::End(BytesEnd::new("mergeCells")))?; } + // + . ECMA-376 §18.3.1.62 / §18.3.1.63 — + // pageMargins values are in inches (f64), pageSetup carries the + // physical paper dimensions and orientation. We emit `paperWidth` + // and `paperHeight` in mm so arbitrary PDF MediaBoxes round-trip + // verbatim instead of snapping to the closest `paperSize` enum + // (which only covers a fixed set of standard sizes — Letter, + // Legal, A4, A3, …). + if let Some(ps) = sheet.page_setup { + // twips → inches, twips → mm (1 inch = 1440 twips = 25.4 mm). 
+ let to_in = |t: u32| t as f64 / 1440.0; + let to_mm = |t: u32| t as f64 / 1440.0 * 25.4; + + let left = format!("{:.4}", to_in(ps.margin_left_twips)); + let right = format!("{:.4}", to_in(ps.margin_right_twips)); + let top = format!("{:.4}", to_in(ps.margin_top_twips)); + let bottom = format!("{:.4}", to_in(ps.margin_bottom_twips)); + let header = format!("{:.4}", to_in(ps.header_distance_twips)); + let footer = format!("{:.4}", to_in(ps.footer_distance_twips)); + + let mut pm = BytesStart::new("pageMargins"); + pm.push_attribute(("left", left.as_str())); + pm.push_attribute(("right", right.as_str())); + pm.push_attribute(("top", top.as_str())); + pm.push_attribute(("bottom", bottom.as_str())); + pm.push_attribute(("header", header.as_str())); + pm.push_attribute(("footer", footer.as_str())); + w.write_event(Event::Empty(pm))?; + + let pw_mm = format!("{:.2}mm", to_mm(ps.width_twips)); + let ph_mm = format!("{:.2}mm", to_mm(ps.height_twips)); + let orientation = if ps.landscape { + "landscape" + } else { + "portrait" + }; + let mut psu = BytesStart::new("pageSetup"); + psu.push_attribute(("paperWidth", pw_mm.as_str())); + psu.push_attribute(("paperHeight", ph_mm.as_str())); + psu.push_attribute(("orientation", orientation)); + w.write_event(Event::Empty(psu))?; + } + + // `` MUST appear after `` per the + // worksheet child-order schema (CT_Worksheet, ECMA-376 + // §18.3.1.99). Excel rejects the file with "We found a problem + // with some content" otherwise. + if let Some(rid) = drawing_rid { + let mut d = BytesStart::new("drawing"); + d.push_attribute(("r:id", rid)); + w.write_event(Event::Empty(d))?; + } + w.write_event(Event::End(BytesEnd::new("worksheet")))?; Ok(w.into_inner()) } + /// Materialise `xl/drawings/drawing.xml`, write each + /// picture's bytes under `xl/media/image__.`, + /// wire the worksheet→drawing and drawing→image relationships, and + /// register PNG/JPEG default content types. 
+ /// + /// Returns the relationship ID added to the worksheet — the caller + /// places it on the `` element inside the + /// worksheet XML. + fn write_drawing_for_sheet( + opc: &mut OpcWriter, + worksheet_part: &PartName, + sheet_n: usize, + images: &[SheetImage], + text_shapes: &[SheetTextShape], + ) -> Result { + let drawing_target = format!("../drawings/drawing{}.xml", sheet_n); + let drawing_rid = opc.add_part_rel(worksheet_part, rel_types::DRAWING, &drawing_target); + + let drawing_part_str = format!("/xl/drawings/drawing{}.xml", sheet_n); + let drawing_part = PartName::new(&drawing_part_str)?; + + // Add IMAGE rels off the drawing part. Targets are relative to + // the drawing part itself (`../media/imageX.ext`). Track the + // rIds so each `` in the drawing XML can reference + // them via ``. + let mut blip_rids: Vec = Vec::with_capacity(images.len()); + for (i, img) in images.iter().enumerate() { + let ext = if img.format.is_empty() { + "png" + } else { + img.format.as_str() + }; + let media_path_str = format!("/xl/media/image_{}_{}.{}", sheet_n, i + 1, ext); + let media_part = PartName::new(&media_path_str)?; + + // Default Content-Type by extension (Default Extension="png") + // satisfies SDK validators that flag overrides without a + // matching Default. Re-registering the same default is a + // no-op inside ContentTypesBuilder. + let mime = match ext { + "jpg" | "jpeg" => "image/jpeg", + "gif" => "image/gif", + "tiff" | "tif" => "image/tiff", + "bmp" => "image/bmp", + "emf" => "image/x-emf", + "wmf" => "image/x-wmf", + _ => "image/png", + }; + opc.register_default_content_type(ext, mime); + + // Write image bytes raw (no Content-Type override needed + // since we registered the Default above; passing the same + // mime to add_part is harmless). + opc.add_part(&media_part, mime, &img.data)?; + + // Drawing-relative target: `../media/image_..._N.ext`. 
+ let rel_target = format!("../media/image_{}_{}.{}", sheet_n, i + 1, ext); + let rid = opc.add_part_rel(&drawing_part, rel_types::IMAGE, &rel_target); + blip_rids.push(rid); + } + + // Now the drawing XML itself. One `` per + // picture and per text shape; anchor in EMU from the sheet + // origin, with the picture's `` + // referring back to the image rels we just added. + let drawing_xml = build_drawing_xml(images, &blip_rids, text_shapes)?; + const CT_DRAWING: &str = "application/vnd.openxmlformats-officedocument.drawing+xml"; + opc.add_part(&drawing_part, CT_DRAWING, &drawing_xml)?; + + Ok(drawing_rid) + } + fn write_cell( w: &mut Writer>, row: usize, @@ -683,6 +1045,238 @@ impl XlsxWriter { } } +/// Generate `xl/drawings/drawing.xml` for a sheet's pictures. +/// +/// Each picture becomes one `` containing an +/// `` with an `` referring back to the +/// IMAGE rel registered on the drawing part. EMU coordinates flow +/// through verbatim from the caller's `SheetImage`, which preserves +/// source-PDF anchor positions on a PDF→XLSX→PDF round-trip when the +/// upstream IR carries them. 
+fn build_drawing_xml( + images: &[SheetImage], + blip_rids: &[String], + text_shapes: &[SheetTextShape], +) -> crate::core::Result> { + const NS_XDR: &str = "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing"; + const NS_A: &str = "http://schemas.openxmlformats.org/drawingml/2006/main"; + + let mut w = Writer::new_with_indent(Vec::new(), b' ', 2); + w.write_event(Event::Decl(BytesDecl::new("1.0", Some("UTF-8"), Some("yes"))))?; + + let mut root = BytesStart::new("xdr:wsDr"); + root.push_attribute(("xmlns:xdr", NS_XDR)); + root.push_attribute(("xmlns:a", NS_A)); + root.push_attribute(("xmlns:r", NS_REL)); + w.write_event(Event::Start(root))?; + + for (i, img) in images.iter().enumerate() { + let rid = blip_rids.get(i).map(String::as_str).unwrap_or("rId1"); + + // + w.write_event(Event::Start(BytesStart::new("xdr:absoluteAnchor")))?; + + // + let pos_x = img.x_emu.to_string(); + let pos_y = img.y_emu.to_string(); + let mut pos = BytesStart::new("xdr:pos"); + pos.push_attribute(("x", pos_x.as_str())); + pos.push_attribute(("y", pos_y.as_str())); + w.write_event(Event::Empty(pos))?; + + // + let ext_cx = img.cx_emu.max(1).to_string(); + let ext_cy = img.cy_emu.max(1).to_string(); + let mut ext = BytesStart::new("xdr:ext"); + ext.push_attribute(("cx", ext_cx.as_str())); + ext.push_attribute(("cy", ext_cy.as_str())); + w.write_event(Event::Empty(ext))?; + + // + w.write_event(Event::Start(BytesStart::new("xdr:pic")))?; + + // + w.write_event(Event::Start(BytesStart::new("xdr:nvPicPr")))?; + let pic_id = (i + 1).to_string(); + let pic_name = format!("Picture {}", i + 1); + let mut cnv_pr = BytesStart::new("xdr:cNvPr"); + cnv_pr.push_attribute(("id", pic_id.as_str())); + cnv_pr.push_attribute(("name", pic_name.as_str())); + w.write_event(Event::Empty(cnv_pr))?; + w.write_event(Event::Empty(BytesStart::new("xdr:cNvPicPr")))?; + w.write_event(Event::End(BytesEnd::new("xdr:nvPicPr")))?; + + // + w.write_event(Event::Start(BytesStart::new("xdr:blipFill")))?; 
+ let mut blip = BytesStart::new("a:blip"); + blip.push_attribute(("r:embed", rid)); + w.write_event(Event::Empty(blip))?; + w.write_event(Event::Start(BytesStart::new("a:stretch")))?; + w.write_event(Event::Empty(BytesStart::new("a:fillRect")))?; + w.write_event(Event::End(BytesEnd::new("a:stretch")))?; + w.write_event(Event::End(BytesEnd::new("xdr:blipFill")))?; + + // + w.write_event(Event::Start(BytesStart::new("xdr:spPr")))?; + w.write_event(Event::Start(BytesStart::new("a:xfrm")))?; + let mut off = BytesStart::new("a:off"); + off.push_attribute(("x", pos_x.as_str())); + off.push_attribute(("y", pos_y.as_str())); + w.write_event(Event::Empty(off))?; + let mut ext2 = BytesStart::new("a:ext"); + ext2.push_attribute(("cx", ext_cx.as_str())); + ext2.push_attribute(("cy", ext_cy.as_str())); + w.write_event(Event::Empty(ext2))?; + w.write_event(Event::End(BytesEnd::new("a:xfrm")))?; + let mut prst = BytesStart::new("a:prstGeom"); + prst.push_attribute(("prst", "rect")); + w.write_event(Event::Start(prst))?; + w.write_event(Event::Empty(BytesStart::new("a:avLst")))?; + w.write_event(Event::End(BytesEnd::new("a:prstGeom")))?; + w.write_event(Event::End(BytesEnd::new("xdr:spPr")))?; + + w.write_event(Event::End(BytesEnd::new("xdr:pic")))?; + + // + w.write_event(Event::Empty(BytesStart::new("xdr:clientData")))?; + + w.write_event(Event::End(BytesEnd::new("xdr:absoluteAnchor")))?; + } + + // ── Text shapes (one `` per layout-mode PDF span) ─────────── + let pic_count = images.len(); + for (j, ts) in text_shapes.iter().enumerate() { + // Skip empty-text shapes — Excel rejects shape XML with + // an empty `` even though OOXML allows it. 
+ let trimmed = ts.text.trim_matches('\u{0000}'); + if trimmed.is_empty() { + continue; + } + + w.write_event(Event::Start(BytesStart::new("xdr:absoluteAnchor")))?; + + let pos_x = ts.x_emu.to_string(); + let pos_y = ts.y_emu.to_string(); + let mut pos = BytesStart::new("xdr:pos"); + pos.push_attribute(("x", pos_x.as_str())); + pos.push_attribute(("y", pos_y.as_str())); + w.write_event(Event::Empty(pos))?; + + let ext_cx = ts.cx_emu.max(1).to_string(); + let ext_cy = ts.cy_emu.max(1).to_string(); + let mut ext = BytesStart::new("xdr:ext"); + ext.push_attribute(("cx", ext_cx.as_str())); + ext.push_attribute(("cy", ext_cy.as_str())); + w.write_event(Event::Empty(ext))?; + + w.write_event(Event::Start(BytesStart::new("xdr:sp")))?; + + // + w.write_event(Event::Start(BytesStart::new("xdr:nvSpPr")))?; + let sp_id = (pic_count + j + 1).to_string(); + let sp_name = format!("TextShape {}", pic_count + j + 1); + let mut cnv_pr = BytesStart::new("xdr:cNvPr"); + cnv_pr.push_attribute(("id", sp_id.as_str())); + cnv_pr.push_attribute(("name", sp_name.as_str())); + w.write_event(Event::Empty(cnv_pr))?; + let mut cnv_sp_pr = BytesStart::new("xdr:cNvSpPr"); + cnv_sp_pr.push_attribute(("txBox", "1")); + w.write_event(Event::Empty(cnv_sp_pr))?; + w.write_event(Event::End(BytesEnd::new("xdr:nvSpPr")))?; + + // + w.write_event(Event::Start(BytesStart::new("xdr:spPr")))?; + w.write_event(Event::Start(BytesStart::new("a:xfrm")))?; + let mut off = BytesStart::new("a:off"); + off.push_attribute(("x", pos_x.as_str())); + off.push_attribute(("y", pos_y.as_str())); + w.write_event(Event::Empty(off))?; + let mut ext2 = BytesStart::new("a:ext"); + ext2.push_attribute(("cx", ext_cx.as_str())); + ext2.push_attribute(("cy", ext_cy.as_str())); + w.write_event(Event::Empty(ext2))?; + w.write_event(Event::End(BytesEnd::new("a:xfrm")))?; + let mut prst = BytesStart::new("a:prstGeom"); + prst.push_attribute(("prst", "rect")); + w.write_event(Event::Start(prst))?; + 
w.write_event(Event::Empty(BytesStart::new("a:avLst")))?; + w.write_event(Event::End(BytesEnd::new("a:prstGeom")))?; + // Transparent fill so the text shape doesn't paint a + // white rectangle over neighbouring content. + w.write_event(Event::Empty(BytesStart::new("a:noFill")))?; + w.write_event(Event::End(BytesEnd::new("xdr:spPr")))?; + + // — inline a single run with the span's run + // properties. PPTX/PRESENT and SpreadsheetML share the same + // DrawingML run model, so the structure mirrors PPTX text + // bodies elsewhere in this crate. + w.write_event(Event::Start(BytesStart::new("xdr:txBody")))?; + // so a single span doesn't wrap mid-line. + let mut body_pr = BytesStart::new("a:bodyPr"); + body_pr.push_attribute(("wrap", "none")); + body_pr.push_attribute(("rtlCol", "0")); + body_pr.push_attribute(("lIns", "0")); + body_pr.push_attribute(("tIns", "0")); + body_pr.push_attribute(("rIns", "0")); + body_pr.push_attribute(("bIns", "0")); + w.write_event(Event::Empty(body_pr))?; + w.write_event(Event::Empty(BytesStart::new("a:lstStyle")))?; + w.write_event(Event::Start(BytesStart::new("a:p")))?; + // + let mut p_pr = BytesStart::new("a:pPr"); + p_pr.push_attribute(("marL", "0")); + p_pr.push_attribute(("indent", "0")); + w.write_event(Event::Empty(p_pr))?; + // + w.write_event(Event::Start(BytesStart::new("a:r")))?; + // with optional and + let sz_hp = (ts.font_size_pt * 100.0).round() as i32; + let sz_str = sz_hp.to_string(); + let mut r_pr = BytesStart::new("a:rPr"); + r_pr.push_attribute(("lang", "en-US")); + r_pr.push_attribute(("sz", sz_str.as_str())); + if ts.bold { + r_pr.push_attribute(("b", "1")); + } + if ts.italic { + r_pr.push_attribute(("i", "1")); + } + let want_color_or_font = ts.color_hex.is_some() || !ts.font_name.is_empty(); + if want_color_or_font { + w.write_event(Event::Start(r_pr))?; + if let Some(ref hex) = ts.color_hex { + w.write_event(Event::Start(BytesStart::new("a:solidFill")))?; + let mut srgb = BytesStart::new("a:srgbClr"); + 
srgb.push_attribute(("val", hex.as_str())); + w.write_event(Event::Empty(srgb))?; + w.write_event(Event::End(BytesEnd::new("a:solidFill")))?; + } + if !ts.font_name.is_empty() { + let mut latin = BytesStart::new("a:latin"); + latin.push_attribute(("typeface", ts.font_name.as_str())); + w.write_event(Event::Empty(latin))?; + } + w.write_event(Event::End(BytesEnd::new("a:rPr")))?; + } else { + w.write_event(Event::Empty(r_pr))?; + } + // text + w.write_event(Event::Start(BytesStart::new("a:t")))?; + w.write_event(Event::Text(quick_xml::events::BytesText::new(trimmed)))?; + w.write_event(Event::End(BytesEnd::new("a:t")))?; + w.write_event(Event::End(BytesEnd::new("a:r")))?; + w.write_event(Event::End(BytesEnd::new("a:p")))?; + w.write_event(Event::End(BytesEnd::new("xdr:txBody")))?; + + w.write_event(Event::End(BytesEnd::new("xdr:sp")))?; + w.write_event(Event::Empty(BytesStart::new("xdr:clientData")))?; + w.write_event(Event::End(BytesEnd::new("xdr:absoluteAnchor")))?; + } + + w.write_event(Event::End(BytesEnd::new("xdr:wsDr")))?; + Ok(w.into_inner()) +} + // --------------------------------------------------------------------------- // StyleTable — collects unique CellStyle objects, assigns xfIds, builds // styles.xml dynamically. @@ -716,52 +1310,61 @@ struct XfKey { struct StyleTable { /// Map from (sheet_ptr, row, col) to xf index. cell_xf: HashMap<(*const SheetDataInner, usize, usize), u32>, + /// Ordered list of fonts for XML serialization. fonts: Vec, + /// Ordered list of fills for XML serialization. fills: Vec, num_fmts: Vec<(u32, String)>, // (numFmtId, formatCode) for custom formats + /// Ordered list of xf records for XML serialization. xfs: Vec, + // Lookup maps for O(1) deduplication during build. 
+ font_map: HashMap, + fill_map: HashMap, + xf_map: HashMap, } impl StyleTable { fn build(sheets: &[SheetDataInner]) -> Self { - let mut table = StyleTable { - cell_xf: HashMap::new(), - fonts: Vec::new(), - fills: Vec::new(), - num_fmts: Vec::new(), - xfs: Vec::new(), - }; - - // Built-in fill indices: 0=none, 1=gray125 (required by Excel) - // We pre-populate to match the required structure. - table.fills.push(FillKey(None)); // idx 0: none - table.fills.push(FillKey(None)); // idx 1: gray125 - - // Default font (idx 0) - table.fonts.push(FontKey { + let default_font = FontKey { bold: false, italic: false, underline: false, color: None, size_half_pt: None, name: None, - }); - - // Default xf (idx 0) — no style - table.xfs.push(XfKey { + }; + let default_xf = XfKey { font_idx: 0, fill_idx: 0, num_fmt_id: 0, h_align: None, wrap_text: false, - }); + }; + + let mut font_map = HashMap::new(); + font_map.insert(default_font.clone(), 0u32); + let mut fill_map: HashMap = HashMap::new(); + fill_map.insert(FillKey(None), 0u32); // idx 0 = none; idx 1 = gray125 (pre-populated below) + let mut xf_map = HashMap::new(); + xf_map.insert(default_xf.clone(), 0u32); + + let mut table = StyleTable { + cell_xf: HashMap::new(), + fonts: vec![default_font], + fills: vec![FillKey(None), FillKey(None)], // idx 0: none, idx 1: gray125 + num_fmts: Vec::new(), + xfs: vec![default_xf], + font_map, + fill_map, + xf_map, + }; let mut next_custom_fmt_id: u32 = 164; // custom numFmtIds start at 164 for sheet in sheets { let sheet_ptr = sheet as *const SheetDataInner; for ((row, col), style) in &sheet.cell_styles { - // Resolve font index + // Resolve font index — O(1) via HashMap. 
let font_key = FontKey { bold: style.bold, italic: style.italic, @@ -770,45 +1373,32 @@ impl StyleTable { size_half_pt: style.font_size_pt.map(|s| (s * 2.0).round() as u32), name: style.font_name.clone(), }; - let font_idx = if font_key - == (FontKey { - bold: false, - italic: false, - underline: false, - color: None, - size_half_pt: None, - name: None, - }) { - 0 + let font_idx = if let Some(&i) = table.font_map.get(&font_key) { + i } else { - match table.fonts.iter().position(|f| f == &font_key) { - Some(i) => i as u32, - None => { - table.fonts.push(font_key); - (table.fonts.len() - 1) as u32 - }, - } + let idx = table.fonts.len() as u32; + table.fonts.push(font_key.clone()); + table.font_map.insert(font_key, idx); + idx }; - // Resolve fill index + // Resolve fill index — O(1) via HashMap. let fill_key = FillKey(style.background_color.clone()); let fill_idx = if fill_key.0.is_none() { 0 + } else if let Some(&i) = table.fill_map.get(&fill_key) { + i } else { - match table.fills.iter().position(|f| f == &fill_key) { - Some(i) => i as u32, - None => { - table.fills.push(fill_key); - (table.fills.len() - 1) as u32 - }, - } + let idx = table.fills.len() as u32; + table.fills.push(fill_key.clone()); + table.fill_map.insert(fill_key, idx); + idx }; - // Resolve number format id + // Resolve number format id. let num_fmt_id = match style.number_format.builtin_id() { Some(id) => id, None => { - // Custom format — shouldn't happen with current enum let id = next_custom_fmt_id; next_custom_fmt_id += 1; table.num_fmts.push((id, "General".to_string())); @@ -833,12 +1423,14 @@ impl StyleTable { wrap_text: style.wrap_text, }; - let xf_idx = match table.xfs.iter().position(|x| x == &xf_key) { - Some(i) => i as u32, - None => { - table.xfs.push(xf_key); - (table.xfs.len() - 1) as u32 - }, + // Resolve xf index — O(1) via HashMap. 
+ let xf_idx = if let Some(&i) = table.xf_map.get(&xf_key) { + i + } else { + let idx = table.xfs.len() as u32; + table.xfs.push(xf_key.clone()); + table.xf_map.insert(xf_key, idx); + idx }; table.cell_xf.insert((sheet_ptr, *row, *col), xf_idx); @@ -1130,6 +1722,46 @@ mod tests { assert!(!buf.get_ref().is_empty()); } + #[test] + fn page_setup_round_trip() { + // Letter portrait, 0.5" margins. The on-wire format is mm in + // + inches in ; + // verify both elements appear and that the parser recovers + // values within rounding tolerance. + let mut wb = XlsxWriter::new(); + let mut sheet = wb.add_sheet("Geom"); + sheet.set_cell(0, 0, CellData::String("hi".into())); + sheet.set_page_setup(PageSetup { + width_twips: 12240, // 8.5" + height_twips: 15840, // 11" + margin_top_twips: 720, // 0.5" + margin_bottom_twips: 720, + margin_left_twips: 720, + margin_right_twips: 720, + header_distance_twips: 432, // 0.3" + footer_distance_twips: 432, + landscape: false, + }); + let mut buf = std::io::Cursor::new(Vec::new()); + wb.write_to(&mut buf).expect("write"); + + // Pull sheet1.xml out and check the attributes. 
+ buf.set_position(0); + let mut zip = zip::ZipArchive::new(buf).expect("zip"); + let mut xml = String::new(); + { + let mut entry = zip.by_name("xl/worksheets/sheet1.xml").expect("sheet"); + std::io::Read::read_to_string(&mut entry, &mut xml).expect("read"); + } + assert!(xml.contains(" p + .content .iter() - .any(|e| matches!(e, Element::Paragraph(p) if + .any(|c| matches!(c, InlineContent::Text(s) if s.text == "Welcome")), + Element::TextBox(tb) => tb.content.iter().any(|inner| { + matches!(inner, Element::Paragraph(p) if p.content.iter().any(|c| matches!(c, InlineContent::Text(s) if s.text == "Welcome")) - )) - ); + ) + }), + _ => false, + }); + assert!(has_welcome); } // =========================================================================== @@ -939,9 +947,18 @@ fn pptx_image_to_ir() { let doc = Document::from_reader(Cursor::new(data), DocumentFormat::Pptx).unwrap(); let ir = doc.to_ir(); - assert!(ir.sections[0].elements.iter().any( - |e| matches!(e, Element::Image(img) if img.alt_text.as_deref() == Some("A scenic view")) - )); + // PPTX picture shapes are wrapped in a positional `Element::TextBox` + // so the renderer knows where to draw the picture frame. 
+ let has_pic = ir.sections[0].elements.iter().any(|e| { + match e { + Element::Image(img) => img.alt_text.as_deref() == Some("A scenic view"), + Element::TextBox(tb) => tb.content.iter().any(|inner| { + matches!(inner, Element::Image(img) if img.alt_text.as_deref() == Some("A scenic view")) + }), + _ => false, + } + }); + assert!(has_pic); } // =========================================================================== diff --git a/tests/write_integration.rs b/tests/write_integration.rs index 41e0f27..92391fe 100644 --- a/tests/write_integration.rs +++ b/tests/write_integration.rs @@ -235,6 +235,7 @@ fn sample_ir(format: office_oxide::DocumentFormat) -> office_oxide::DocumentIR { Element::Heading(Heading { level: 1, content: vec![InlineContent::Text(TextSpan::plain("Main Heading"))], + ..Default::default() }), Element::Paragraph(Paragraph { content: vec![InlineContent::Text(TextSpan::plain("Body text here"))], diff --git a/wasm-pkg/package.json b/wasm-pkg/package.json index c482d01..e2c0b2d 100644 --- a/wasm-pkg/package.json +++ b/wasm-pkg/package.json @@ -1,6 +1,6 @@ { "name": "office-oxide-wasm", - "version": "0.1.1", + "version": "0.1.2", "description": "Fast Office document processing (DOCX/XLSX/PPTX/DOC/XLS/PPT) compiled to WebAssembly. Rust core, zero JS dependencies. Works in Node.js, bundlers, and browsers.", "license": "MIT OR Apache-2.0", "author": "Yury Fedoseev", From 9d4380cc9252ea2617e070401ca669e746502b80 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 15:37:56 +0000 Subject: [PATCH 02/18] chore(ci): bump actions/attest-sbom from 2.4.0 to 4.1.0 Bumps [actions/attest-sbom](https://github.com/actions/attest-sbom) from 2.4.0 to 4.1.0. 
- [Release notes](https://github.com/actions/attest-sbom/releases) - [Changelog](https://github.com/actions/attest-sbom/blob/main/RELEASE.md) - [Commits](https://github.com/actions/attest-sbom/compare/bd218ad0dbcb3e146bd073d1d9c6d78e08aa8a0b...c604332985a26aa8cf1bdc465b92731239ec6b9e) --- updated-dependencies: - dependency-name: actions/attest-sbom dependency-version: 4.1.0 dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f0fc82a..b607d54 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -748,7 +748,7 @@ jobs: GH_TOKEN: ${{ github.token }} - name: Attest SBOM - uses: actions/attest-sbom@bd218ad0dbcb3e146bd073d1d9c6d78e08aa8a0b # v2 + uses: actions/attest-sbom@c604332985a26aa8cf1bdc465b92731239ec6b9e # v4.1.0 with: subject-path: sbom.cdx.json sbom-path: sbom.cdx.json From be5d3a3a4c29526d89519b4e030d385e47eccf2a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 15:38:11 +0000 Subject: [PATCH 03/18] chore(ci): bump github/codeql-action from 3.35.2 to 4.35.3 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.35.2 to 4.35.3. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/ce64ddcb0d8d890d2df4a9d1c04ff297367dea2a...e46ed2cbd01164d986452f91f178727624ae40d7) --- updated-dependencies: - dependency-name: github/codeql-action dependency-version: 4.35.3 dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/scorecard.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 603bfaa..da29ee0 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -34,7 +34,7 @@ jobs: uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable - name: Initialize CodeQL - uses: github/codeql-action/init@ce64ddcb0d8d890d2df4a9d1c04ff297367dea2a # v3 + uses: github/codeql-action/init@e46ed2cbd01164d986452f91f178727624ae40d7 # v3 with: languages: ${{ matrix.language }} # Use default queries + security-extended suite @@ -44,6 +44,6 @@ jobs: run: cargo build --lib - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@ce64ddcb0d8d890d2df4a9d1c04ff297367dea2a # v3 + uses: github/codeql-action/analyze@e46ed2cbd01164d986452f91f178727624ae40d7 # v3 with: category: "/language:${{ matrix.language }}" diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index c5ee089..1f1cde4 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -42,6 +42,6 @@ jobs: retention-days: 5 - name: Upload Scorecard results to GitHub Security tab - uses: github/codeql-action/upload-sarif@ce64ddcb0d8d890d2df4a9d1c04ff297367dea2a # v3 + uses: github/codeql-action/upload-sarif@e46ed2cbd01164d986452f91f178727624ae40d7 # v3 with: sarif_file: results.sarif From 7ffcaccde048a97de48bf030823222b0b5a9ea87 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 15:38:25 +0000 Subject: [PATCH 04/18] chore(ci): bump actions/upload-artifact from 4.6.2 to 7.0.1 Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.6.2 to 7.0.1. 
- [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/ea165f8d65b6e75b540449e92b4886f43607fa02...043fb46d1a93c77aae656e7c1c64a875d1fc6a0a) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-version: 7.0.1 dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/ci.yml | 2 +- .github/workflows/python.yml | 8 ++++---- .github/workflows/release.yml | 10 +++++----- .github/workflows/scorecard.yml | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5cdfee2..04b316f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -217,7 +217,7 @@ jobs: # its original target/release/ path so binding test code works # unchanged. - name: Upload native lib artifact - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: native-lib-${{ matrix.os }} retention-days: 1 diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 9d64e08..69e6c41 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -146,7 +146,7 @@ jobs: manylinux: ${{ matrix.manylinux }} args: --release --features python --out dist - name: Upload wheels as artifacts - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-linux-${{ matrix.target }}-${{ matrix.manylinux }} path: dist/*.whl @@ -172,7 +172,7 @@ jobs: target: ${{ matrix.target }} args: --release --features python --out dist - name: Upload wheels as artifacts - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # 
v7.0.1 with: name: wheels-macos-${{ matrix.target }} path: dist/*.whl @@ -199,7 +199,7 @@ jobs: target: ${{ matrix.target }} args: --release --features python --out dist - name: Upload wheels as artifacts - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-windows-${{ matrix.target }} path: dist/*.whl @@ -217,7 +217,7 @@ jobs: command: sdist args: --out dist - name: Upload sdist as artifact - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: sdist path: dist/*.tar.gz diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b607d54..1c623c2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -162,7 +162,7 @@ jobs: echo "ARCHIVE=$ARCHIVE" >> $GITHUB_ENV - name: Upload artifact - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: ${{ matrix.artifact_name }} path: ${{ env.ARCHIVE }} @@ -232,7 +232,7 @@ jobs: cp target/${{ matrix.target }}/release/office_oxide.lib staging/lib/ 2>/dev/null || true cp -r include/office_oxide_c staging/include/ cd staging && 7z a "../${{ matrix.artifact_name }}.zip" . && cd .. 
- - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: ${{ matrix.artifact_name }} path: | @@ -326,7 +326,7 @@ jobs: printf '{"type": "module"}\n' > wasm-pkg/web/package.json - name: Upload WASM artifact - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wasm-package path: wasm-pkg/ @@ -380,7 +380,7 @@ jobs: run: maturin build --release --features python --target ${{ matrix.target }} --out dist - name: Upload wheels - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: ${{ matrix.artifact_name }} path: dist/*.whl @@ -431,7 +431,7 @@ jobs: done done ls -R js/prebuilds - - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: node-native-package path: js/ diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 1f1cde4..6672d10 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -35,7 +35,7 @@ jobs: publish_results: true - name: Upload Scorecard results as artifact - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: scorecard-results path: results.sarif From 68c48fb325f9fa5638fbdde4b5a02cda19b5f07c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 15:38:30 +0000 Subject: [PATCH 05/18] chore(ci): update dtolnay/rust-toolchain requirement to 29eef336d9b2848a0b548edc03f92a220660cdb8 Updates the requirements on 
[dtolnay/rust-toolchain](https://github.com/dtolnay/rust-toolchain) to permit the latest version. - [Release notes](https://github.com/dtolnay/rust-toolchain/releases) - [Commits](https://github.com/dtolnay/rust-toolchain/commits/29eef336d9b2848a0b548edc03f92a220660cdb8) --- updated-dependencies: - dependency-name: dtolnay/rust-toolchain dependency-version: 29eef336d9b2848a0b548edc03f92a220660cdb8 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 04b316f..dd1e9b5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -127,7 +127,7 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v4 - name: Install Rust - uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master + uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # master with: toolchain: ${{ matrix.rust }} @@ -626,7 +626,7 @@ jobs: run: | v=$(grep -E '^rust-version' Cargo.toml | head -1 | sed 's/.*"\(.*\)".*/\1/') echo "version=${v:-1.85}" >> "$GITHUB_OUTPUT" - - uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master + - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # master with: toolchain: ${{ steps.msrv.outputs.version }} - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 From 846f5f44c8da05a6d39f24fd60decba90a3305d6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 15:38:33 +0000 Subject: [PATCH 06/18] chore(ci): bump actions/github-script from 7.0.1 to 9.0.0 Bumps [actions/github-script](https://github.com/actions/github-script) from 7.0.1 to 9.0.0. 
- [Release notes](https://github.com/actions/github-script/releases) - [Commits](https://github.com/actions/github-script/compare/60a0d83039c74a4aee543508d2ffcb1c3799cdea...3a2844b7e9c422d3c10d287c895573f7108da1b3) --- updated-dependencies: - dependency-name: actions/github-script dependency-version: 9.0.0 dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/outdated.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/outdated.yml b/.github/workflows/outdated.yml index 487b650..d1d014a 100644 --- a/.github/workflows/outdated.yml +++ b/.github/workflows/outdated.yml @@ -43,7 +43,7 @@ jobs: - name: Open issue for outdated deps if: steps.outdated.outputs.has_outdated == 'true' - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7 + uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0 with: script: | const title = `chore: outdated dependencies (${new Date().toISOString().slice(0,7)})`; From c7219cfc19dfc4668e2ce128ba34bf85fa083395 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 13 May 2026 16:13:13 +0000 Subject: [PATCH 07/18] chore(deps): bump koffi from 2.16.1 to 2.16.2 in /js Bumps [koffi](https://github.com/Koromix/koffi) from 2.16.1 to 2.16.2. - [Commits](https://github.com/Koromix/koffi/commits) --- updated-dependencies: - dependency-name: koffi dependency-version: 2.16.2 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] --- js/package-lock.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/package-lock.json b/js/package-lock.json index 95e1dc8..0cb77b9 100644 --- a/js/package-lock.json +++ b/js/package-lock.json @@ -17,9 +17,9 @@ } }, "node_modules/koffi": { - "version": "2.16.1", - "resolved": "https://registry.npmjs.org/koffi/-/koffi-2.16.1.tgz", - "integrity": "sha512-0Ie6CfD026dNfWSosDw9dPxPzO9Rlyo0N8m5r05S8YjytIpuilzMFDMY4IDy/8xQsTwpuVinhncD+S8n3bcYZQ==", + "version": "2.16.2", + "resolved": "https://registry.npmjs.org/koffi/-/koffi-2.16.2.tgz", + "integrity": "sha512-owU0MRwv6xkrVqCd+33uw6BaYppkTRXbO/rVdJNI2dvZG0gzyRhYwW25eWtc5pauwK8TGh3AbkFONSezdykfSA==", "hasInstallScript": true, "license": "MIT", "funding": { From 591a88d3ab4b705111d9e93b84cddb6d5760884f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 13 May 2026 16:12:56 +0000 Subject: [PATCH 08/18] chore(deps): bump quick-xml from 0.37.5 to 0.40.0 Bumps [quick-xml](https://github.com/tafia/quick-xml) from 0.37.5 to 0.40.0. - [Release notes](https://github.com/tafia/quick-xml/releases) - [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md) - [Commits](https://github.com/tafia/quick-xml/compare/v0.37.5...v0.40.0) --- updated-dependencies: - dependency-name: quick-xml dependency-version: 0.40.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bc0b74b..ad893d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -409,9 +409,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.37.5" +version = "0.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +checksum = "0b7315c86b26aaef0321fba33c9dcc160da659c6a9d278f0f6a5656d6561c03b" dependencies = [ "memchr", "serde", diff --git a/Cargo.toml b/Cargo.toml index a6f0723..a2de463 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,7 +65,7 @@ crate-type = ["rlib", "cdylib", "staticlib"] [dependencies] # Core parsing -quick-xml = { version = "0.37", features = ["serialize"] } +quick-xml = { version = "0.40", features = ["serialize"] } zip = { version = "8.1", default-features = false, features = ["deflate"] } thiserror = "2" serde = { version = "1", features = ["derive"] } From e4144777c2677bea6e14ece70e650fbe7612d18f Mon Sep 17 00:00:00 2001 From: Yury Fedoseev Date: Wed, 13 May 2026 13:02:38 -0700 Subject: [PATCH 09/18] fix(deps): adapt to quick-xml 0.40 API changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrate the parsers to quick-xml 0.40 after the dependabot cherry-pick: - `BytesText::unescape()` was removed in 0.40. Replace 6 call sites with new `core::xml::unescape_text(BytesText) -> Result<String>` helper that does `decode()?` + `escape::unescape()?` in one call. - `Attribute::unescape_value()` is deprecated in 0.40 (replacement `normalized_value()` has different semantics — no entity unescaping). Wrap the 6 call sites through new `core::xml::unescape_attr_value` helper with `#[allow(deprecated)]` localised to one place so the call sites stay deprecation-free. 
Also apply `cargo fmt --all` (4 files: convert_docx, convert_xlsx, create, xlsx/text — pre-existing fmt drift surfaced by rebuild). Result: 0 warnings, cargo clippy --workspace --all-targets -- -D warnings clean, 535/535 tests pass. --- src/convert_docx.rs | 6 ++---- src/convert_xlsx.rs | 6 ++---- src/core/properties.rs | 4 ++-- src/core/xml.rs | 37 +++++++++++++++++++++++++++++++------ src/create.rs | 11 ++++------- src/pptx/slide.rs | 3 +-- src/xlsx/mod.rs | 12 +++++------- src/xlsx/text.rs | 4 +--- 8 files changed, 48 insertions(+), 35 deletions(-) diff --git a/src/convert_docx.rs b/src/convert_docx.rs index 6963279..33c5ffb 100644 --- a/src/convert_docx.rs +++ b/src/convert_docx.rs @@ -188,10 +188,8 @@ fn convert_block_elements( crate::ir::InlineContent::Text(s) if s.text.is_empty() ) }); - let has_bottom_border = p - .properties - .as_ref() - .is_some_and(|pp| pp.has_bottom_border); + let has_bottom_border = + p.properties.as_ref().is_some_and(|pp| pp.has_bottom_border); if is_empty_para && has_bottom_border { elements.push(Element::ThematicBreak); i += 1; diff --git a/src/convert_xlsx.rs b/src/convert_xlsx.rs index 5869963..f15239c 100644 --- a/src/convert_xlsx.rs +++ b/src/convert_xlsx.rs @@ -161,10 +161,8 @@ pub(crate) fn xlsx_to_ir(doc: &crate::xlsx::XlsxDocument) -> DocumentIR { // XLSX cell font size is in points (`` // where N is f32). IR uses half-points; same // half-pt convention as DOCX/PPTX read paths. 
- span.font_size_half_pt = Some( - crate::core::units::HalfPoint::from_points_rounded(size_pt) - .0, - ); + span.font_size_half_pt = + Some(crate::core::units::HalfPoint::from_points_rounded(size_pt).0); } if font.bold { span.bold = true; diff --git a/src/core/properties.rs b/src/core/properties.rs index dbca417..71a3aa3 100644 --- a/src/core/properties.rs +++ b/src/core/properties.rs @@ -87,7 +87,7 @@ impl CoreProperties { }; }, Event::Text(ref e) => { - let text = e.unescape()?.into_owned(); + let text = crate::core::xml::unescape_text(e)?; if text.is_empty() { continue; } @@ -234,7 +234,7 @@ impl AppProperties { current_tag = Some(String::from_utf8_lossy(local_bytes).into_owned()); }, Event::Text(ref e) => { - let text = e.unescape()?.into_owned(); + let text = crate::core::xml::unescape_text(e)?; if let Some(ref tag) = current_tag { match tag.as_str() { "Application" => props.application = Some(text), diff --git a/src/core/xml.rs b/src/core/xml.rs index c8e4499..302993a 100644 --- a/src/core/xml.rs +++ b/src/core/xml.rs @@ -155,12 +155,10 @@ pub fn optional_prefixed_attr_str<'a>( // Check prefixed: look for `:localname` at the end if let Some(pos) = key.iter().position(|&b| b == b':') { if &key[pos + 1..] == local_name { - let value = attr.unescape_value()?; - return Ok(Some(value)); + return Ok(Some(Cow::Owned(unescape_attr_value(&attr)?))); } } else if key == local_name { - let value = attr.unescape_value()?; - return Ok(Some(value)); + return Ok(Some(Cow::Owned(unescape_attr_value(&attr)?))); } } Ok(None) @@ -185,7 +183,7 @@ pub fn read_text_content(reader: &mut NsReader<&[u8]>) -> Result { loop { match reader.read_event()? 
{ Event::Text(e) => { - text.push_str(&e.unescape()?); + text.push_str(&unescape_text(&e)?); }, Event::CData(e) => { text.push_str(std::str::from_utf8(&e)?); @@ -238,6 +236,33 @@ pub fn make_reader(xml: &[u8]) -> NsReader<&[u8]> { // Fast Reader utilities (no namespace resolution — for hot-path parsing) // =========================================================================== +/// Decode and unescape a `BytesText` event into an owned string. +/// +/// quick-xml 0.40 removed `BytesText::unescape()` in favor of explicit +/// `decode()` followed by `escape::unescape()`. This helper preserves +/// the old single-call ergonomics so the parsers don't have to repeat +/// the two-step dance. `EncodingError` and `EscapeError` go through +/// `quick_xml::Error` to reach our `core::Error`. +pub fn unescape_text(e: &quick_xml::events::BytesText<'_>) -> Result<String> { + let decoded = e.decode().map_err(quick_xml::Error::from)?; + let unescaped = quick_xml::escape::unescape(&decoded).map_err(quick_xml::Error::from)?; + Ok(unescaped.into_owned()) +} + +/// Decode and unescape an `Attribute` value into an owned string. +/// +/// quick-xml 0.40 deprecated `Attribute::unescape_value()` in favor of +/// `normalized_value()`, but the suggested replacement doesn't unescape +/// XML entities (`&amp;`, `&lt;`, …) — only whitespace-normalizes. OOXML +/// attribute values frequently contain hyperlinks and other content that +/// need real entity unescaping, so we keep the old behaviour for now and +/// centralise the deprecation suppression in one place. +#[allow(deprecated)] +pub fn unescape_attr_value(attr: &quick_xml::events::attributes::Attribute<'_>) -> Result<String> { + let cow = attr.unescape_value()?; + Ok(cow.into_owned()) +} + /// Create a plain Reader (no namespace resolution) configured for OOXML parsing. /// Use this for format-specific hot paths (worksheets, slides, document body) /// where all elements are in a single known namespace. 
@@ -258,7 +283,7 @@ pub fn read_text_content_fast(reader: &mut quick_xml::Reader<&[u8]>) -> Result { - text.push_str(&e.unescape()?); + text.push_str(&unescape_text(&e)?); }, Event::CData(e) => { text.push_str(&String::from_utf8_lossy(&e)); diff --git a/src/create.rs b/src/create.rs index f8e56dc..62082b3 100644 --- a/src/create.rs +++ b/src/create.rs @@ -912,13 +912,10 @@ fn emit_pptx_slides_compacted( // mid-block when pdf_to_ir injects gap spacers. fn is_body_content(elem: &Element) -> bool { match elem { - Element::Paragraph(p) => { - - p.content.iter().any(|ic| match ic { - InlineContent::Text(s) => !s.text.is_empty(), - _ => false, - }) - }, + Element::Paragraph(p) => p.content.iter().any(|ic| match ic { + InlineContent::Text(s) => !s.text.is_empty(), + _ => false, + }), Element::List(_) | Element::CodeBlock(_) | Element::Table(_) => true, _ => false, } diff --git a/src/pptx/slide.rs b/src/pptx/slide.rs index 35ee540..679911b 100644 --- a/src/pptx/slide.rs +++ b/src/pptx/slide.rs @@ -335,8 +335,7 @@ fn read_blip_embed_attr(e: &quick_xml::events::BytesStart) -> CoreResult String { } }, Ok(quick_xml::events::Event::Text(t)) => { - if let Ok(s) = t.unescape() { + if let Ok(s) = crate::core::xml::unescape_text(&t) { let trimmed = s.trim(); if trimmed.is_empty() { continue; @@ -881,7 +881,7 @@ fn parse_drawing_anchors(xml_data: &[u8]) -> crate::core::Result for attr in e.attributes().with_checks(false) { let attr = attr.map_err(crate::core::Error::from)?; let key = attr.key.as_ref(); - let raw = attr.unescape_value().map_err(crate::core::Error::from)?; + let raw = crate::core::xml::unescape_attr_value(&attr)?; match key { b"sz" => { // sz is in hundredths of a pt. 
@@ -941,9 +941,7 @@ fn parse_drawing_anchors(xml_data: &[u8]) -> crate::core::Result let attr = attr.map_err(crate::core::Error::from)?; let key = attr.key.as_ref(); if key == b"r:embed" || key.ends_with(b":embed") || key == b"embed" { - let raw = - attr.unescape_value().map_err(crate::core::Error::from)?; - embed_rid = Some(raw.into_owned()); + embed_rid = Some(crate::core::xml::unescape_attr_value(&attr)?); break; } } @@ -967,7 +965,7 @@ fn parse_drawing_anchors(xml_data: &[u8]) -> crate::core::Result for attr in e.attributes().with_checks(false) { let attr = attr.map_err(crate::core::Error::from)?; let key = attr.key.as_ref(); - let raw = attr.unescape_value().map_err(crate::core::Error::from)?; + let raw = crate::core::xml::unescape_attr_value(&attr)?; match key { b"sz" => { if let Ok(n) = raw.parse::() { @@ -984,7 +982,7 @@ fn parse_drawing_anchors(xml_data: &[u8]) -> crate::core::Result } }, Event::Text(ref e) if in_a_t => { - let s = e.unescape().map_err(crate::core::Error::from)?; + let s = crate::core::xml::unescape_text(e)?; text_buf.push_str(&s); }, Event::End(ref e) => { diff --git a/src/xlsx/text.rs b/src/xlsx/text.rs index 83df018..d37a915 100644 --- a/src/xlsx/text.rs +++ b/src/xlsx/text.rs @@ -243,9 +243,7 @@ impl XlsxDocument { match &cell.value { CellValue::Empty => {}, CellValue::Number(n) => { - let is_date = cell - .style_index - .is_some_and(|i| date_indices.contains(&i)); + let is_date = cell.style_index.is_some_and(|i| date_indices.contains(&i)); if is_date { if let Some(dt) = date::DateTimeValue::from_serial(*n, self.workbook.date1904) { buf.push_str(&dt.to_iso_string()); From 745083805f8093ddce74927c1f15ca73af5dc1b8 Mon Sep 17 00:00:00 2001 From: Yury Fedoseev Date: Wed, 13 May 2026 20:11:49 -0700 Subject: [PATCH 10/18] docs(cli,mcp): add crate-level docs for binary crates `office_oxide_cli` and `office_oxide_mcp` had `mod commands;` / `mod protocol;` as their first statement, leaving the crate root undocumented. 
Add a short crate-level `//!` doc and `#![warn(missing_docs)]` so future items in either binary stay documented. Verified: `RUSTDOCFLAGS="-D missing_docs" cargo doc --workspace --no-deps --features parallel,mmap` now passes with zero errors. --- crates/office_oxide_cli/src/main.rs | 8 ++++++++ crates/office_oxide_mcp/src/main.rs | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/crates/office_oxide_cli/src/main.rs b/crates/office_oxide_cli/src/main.rs index 21b710b..f8f7903 100644 --- a/crates/office_oxide_cli/src/main.rs +++ b/crates/office_oxide_cli/src/main.rs @@ -1,3 +1,11 @@ +//! `office-oxide` — command-line front-end to the `office_oxide` library. +//! +//! Extracts text, converts to Markdown / HTML / IR, and inspects DOCX, +//! XLSX, PPTX, DOC, XLS, and PPT files. See `office-oxide --help` for +//! the full subcommand list. + +#![warn(missing_docs)] + mod commands; use clap::Parser; diff --git a/crates/office_oxide_mcp/src/main.rs b/crates/office_oxide_mcp/src/main.rs index f0368b7..ccf8897 100644 --- a/crates/office_oxide_mcp/src/main.rs +++ b/crates/office_oxide_mcp/src/main.rs @@ -1,3 +1,11 @@ +//! `office-oxide-mcp` — Model Context Protocol server for office_oxide. +//! +//! Speaks JSON-RPC 2.0 over stdin/stdout. Exposes two tools: +//! `extract` (text / markdown / html / ir from a DOCX/XLSX/PPTX/DOC/ +//! XLS/PPT file) and `info` (format detection + metadata). 
+ +#![warn(missing_docs)] + mod protocol; use std::io::{self, BufRead, Write}; From 1ba28e7269173f748b2d033f12d1e949cfa2defc Mon Sep 17 00:00:00 2001 From: Yury Fedoseev Date: Thu, 14 May 2026 17:34:23 -0700 Subject: [PATCH 11/18] docs(changelog): expand v0.1.2 entry for recent branch changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Records the run-colour propagation folded into the release commit (DOCX `<w:color>` and PPTX `<a:solidFill>` into `TextSpan.color`), the quick-xml 0.37 → 0.40 API migration with the new `core::xml::unescape_text` / `unescape_attr_value` helpers, and the crate-level `//!` docs + `missing_docs` lint added to `office_oxide_cli` and `office_oxide_mcp`. Release date bumped to 2026-05-14. --- CHANGELOG.md | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a98f0f2..0e0f143 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [0.1.2] - 2026-05-13 +## [0.1.2] - 2026-05-14 > Round-trip fidelity, IR layout features, embedded fonts, XLSX number formatting, and an O(1) style-lookup perf win. @@ -40,6 +40,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Per-section page sizes** preserved through `to_ir`; multi-section IR emits per-section ``. - **`` preserved** through to IR's `font_size_half_pt`. +- **Run colour** from `<w:color>` propagated into + `TextSpan.color` during `to_ir`, so PDF→DOCX→PDF round-trips keep + coloured text. 
Only the `ColorRef::Rgb` variant is plumbed today; + theme / system / `auto` colours still fall through to the renderer + default (proper resolution needs `theme.xml` threaded into the + convert path). 
- **Headers and footers** now included in `to_markdown` and `to_ir` (previously silently dropped). - **Embedded fonts** under `/word/fonts/` exposed on @@ -68,6 +74,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 section's `PageSetup`. - **Run font sizes preserved** via new `TextRun.font_size_hundredths_pt` (parsed from ``). +- **Run colour preserved** via new `TextRun.color_rgb: Option<[u8; 3]>` + parsed from `` + and propagated to `TextSpan.color` in IR. The parser tracks an + `in_solid_fill` flag so sibling effects (e.g. `` + for hyperlink colour) don't leak into the run's own fill; non-sRGB + fills (gradient, scheme colour) fall back to `None`. - **Paragraph alignment** parsed from `` (all five variants: `l` / `ctr` / `r` / `just` / `dist`) into `TextParagraph.alignment`. **Space-before** parsed from @@ -137,6 +149,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 `from_points_rounded`**: cross-format font-size invariants (DrawingML hundredths-of-a-point vs WML half-points). +### Dependencies + +- **`quick-xml` 0.37 → 0.40**: upstream removed `BytesText::unescape()` + and deprecated `Attribute::unescape_value()` (its replacement + `normalized_value()` has different semantics — no entity + unescaping). Migration added two helpers in `core::xml`: + `unescape_text(BytesText) -> Result` (used by 6 call sites) + and `unescape_attr_value` (used by 6 call sites, with + `#[allow(deprecated)]` localised to the helper so call sites stay + deprecation-free). 535 / 535 tests still pass; clippy clean. +- **`koffi` 2.16.1 → 2.16.2** in `js/` (patch bump). + +### Documentation + +- **CLI / MCP crate-level docs**: `office_oxide_cli` and + `office_oxide_mcp` previously opened with `mod commands;` / + `mod protocol;` and had no crate-level rustdoc. Added a short + `//!` block plus `#![warn(missing_docs)]` so future items in + either binary stay documented. 
+ `RUSTDOCFLAGS="-D missing_docs" cargo doc --workspace --no-deps + --features parallel,mmap` now passes with zero errors. + ### Tests - **+98 unit tests** across the modules touched in this release: From c3179005a8b50b9cada5ca759a6a4a56598eea47 Mon Sep 17 00:00:00 2001 From: Yury Fedoseev Date: Thu, 14 May 2026 17:51:19 -0700 Subject: [PATCH 12/18] fix(pptx): add missing color_rgb field to test TextRun constructors The shape::TextRun struct gained a color_rgb field in this branch but ten in-test constructors in src/pptx/text.rs still listed the previous field set, breaking cargo clippy/test workspace-wide. --- src/pptx/text.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/pptx/text.rs b/src/pptx/text.rs index 33a451c..cbdddd6 100644 --- a/src/pptx/text.rs +++ b/src/pptx/text.rs @@ -458,6 +458,7 @@ mod tests { strikethrough: false, hyperlink: None, font_size_hundredths_pt: None, + color_rgb: None, })], }], }), @@ -488,6 +489,7 @@ mod tests { strikethrough: false, hyperlink: None, font_size_hundredths_pt: None, + color_rgb: None, })], }], }), @@ -595,6 +597,7 @@ mod tests { strikethrough: false, hyperlink: None, font_size_hundredths_pt: None, + color_rgb: None, }), TextContent::Run(TextRun { text: " and ".to_string(), @@ -603,6 +606,7 @@ mod tests { strikethrough: false, hyperlink: None, font_size_hundredths_pt: None, + color_rgb: None, }), TextContent::Run(TextRun { text: "italic".to_string(), @@ -611,6 +615,7 @@ mod tests { strikethrough: false, hyperlink: None, font_size_hundredths_pt: None, + color_rgb: None, }), ], }], @@ -668,6 +673,7 @@ mod tests { strikethrough: false, hyperlink: None, font_size_hundredths_pt: None, + color_rgb: None, })], }], }), @@ -689,6 +695,7 @@ mod tests { strikethrough: false, hyperlink: None, font_size_hundredths_pt: None, + color_rgb: None, })], }], }), @@ -714,6 +721,7 @@ mod tests { strikethrough: false, hyperlink: None, font_size_hundredths_pt: None, + color_rgb: None, })], }], }), @@ -735,6 +743,7 @@ mod 
tests { strikethrough: false, hyperlink: None, font_size_hundredths_pt: None, + color_rgb: None, })], }], }), @@ -789,6 +798,7 @@ mod tests { tooltip: None, }), font_size_hundredths_pt: None, + color_rgb: None, })], }], }), From 59f54e37d498db5cc95fbe4348fb26019c566ef5 Mon Sep 17 00:00:00 2001 From: Yury Fedoseev Date: Thu, 14 May 2026 17:55:03 -0700 Subject: [PATCH 13/18] fix(review): address Copilot review comments on PR #38 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - xlsx/worksheet.rs: correct A3 paper-size twips (16838×23811 vs the off-by-2 16840×23820); drop the no-op block that "zeroed" already-zero dimensions in build_page_setup. - docx/text.rs: remove the dead pre-split loop that built a string it never appended anywhere; split_headers_footers does the actual emission below. - xlsx/mod.rs: drop the `push_str("")` no-op in extract_chart_text — adjacent rich-text runs concatenate directly (the surrounding XML preserves any intended whitespace as ``). - convert_xlsx.rs: when a worksheet had `` but no ``, fall back to PageSetup::default() geometry instead of dropping the parsed margins on the floor. --- src/convert_xlsx.rs | 29 ++++++++++++++++++----------- src/docx/text.rs | 30 ------------------------------ src/xlsx/mod.rs | 4 ---- src/xlsx/worksheet.rs | 11 ++--------- 4 files changed, 20 insertions(+), 54 deletions(-) diff --git a/src/convert_xlsx.rs b/src/convert_xlsx.rs index f15239c..86296b6 100644 --- a/src/convert_xlsx.rs +++ b/src/convert_xlsx.rs @@ -220,16 +220,23 @@ pub(crate) fn xlsx_to_ir(doc: &crate::xlsx::XlsxDocument) -> DocumentIR { // had no — Excel's default 0.7"/0.75" is wider than // we want for a tight PDF round-trip and would shrink the usable // text area. - let page_setup = ws.page_setup.and_then(|wsp| { - // A worksheet that only had (no dimensions) is - // treated as "no geometry" so the renderer keeps its - // OfficeConfig default page size. 
- if wsp.width_twips == 0 || wsp.height_twips == 0 { - return None; - } - Some(PageSetup { - width_twips: wsp.width_twips, - height_twips: wsp.height_twips, + let page_setup = ws.page_setup.map(|wsp| { + // When was present but was not, + // wsp's width/height come through as 0. Fall back to the + // IR PageSetup default geometry rather than dropping the + // parsed margins on the floor. + let default = PageSetup::default(); + PageSetup { + width_twips: if wsp.width_twips == 0 { + default.width_twips + } else { + wsp.width_twips + }, + height_twips: if wsp.height_twips == 0 { + default.height_twips + } else { + wsp.height_twips + }, margin_top_twips: wsp.margin_top_twips, margin_bottom_twips: wsp.margin_bottom_twips, margin_left_twips: wsp.margin_left_twips, @@ -237,7 +244,7 @@ pub(crate) fn xlsx_to_ir(doc: &crate::xlsx::XlsxDocument) -> DocumentIR { header_distance_twips: wsp.header_distance_twips, footer_distance_twips: wsp.footer_distance_twips, landscape: wsp.landscape, - }) + } }); // Each XLSX worksheet renders to its own PDF page sequence, so diff --git a/src/docx/text.rs b/src/docx/text.rs index 3a5bc13..4028594 100644 --- a/src/docx/text.rs +++ b/src/docx/text.rs @@ -36,34 +36,6 @@ impl DocxDocument { numbering: self.numbering.as_ref(), }; - // Headers (deduped on text content — headers may be repeated for - // first-page / even / default variants but the text is usually the - // same; we only want one copy in flat markdown). - let mut seen: std::collections::HashSet = std::collections::HashSet::new(); - for hf in &self.headers_footers { - if !matches!( - hf.hf_type, - super::HeaderFooterType::Default - | super::HeaderFooterType::First - | super::HeaderFooterType::Even - ) { - continue; - } - let mut buf = String::new(); - markdown_blocks(&hf.content, &ctx, &mut buf, 0); - let trimmed = buf.trim(); - // Skip empty headers/footers and duplicates. 
- if trimmed.is_empty() || !seen.insert(trimmed.to_string()) { - continue; - } - // We don't currently know which side (header vs footer) this - // came from at this layer — `HeaderFooter` carries only the - // type modifier (default/first/even). The body sits between - // the headers and footers we emit, so we put all headers - // before and all footers after the body. - } - - // Decide header/footer split using each section's references. let (header_texts, footer_texts) = split_headers_footers(self, &ctx); for h in &header_texts { out.push_str(h); @@ -80,11 +52,9 @@ impl DocxDocument { out.push('\n'); } - // Trim trailing newlines while out.ends_with('\n') { out.pop(); } - let _ = seen; // silence out } } diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 40667fd..4455697 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -580,10 +580,6 @@ fn extract_chart_text(xml: &[u8]) -> String { let top = stack.last().map(|v| v.as_slice()); match top { Some(b"t") => { - // Rich-text run — append to current_title. 
- if !current_title.is_empty() { - current_title.push_str(""); - } current_title.push_str(trimmed); }, Some(b"v") => { diff --git a/src/xlsx/worksheet.rs b/src/xlsx/worksheet.rs index ee53726..b51671a 100644 --- a/src/xlsx/worksheet.rs +++ b/src/xlsx/worksheet.rs @@ -317,7 +317,7 @@ fn paper_size_enum_to_twips(id: u32) -> (u32, u32) { 1 => (12240, 15840), // Letter 8.5 × 11" 5 => (12240, 20160), // Legal 8.5 × 14" 7 => (10440, 15120), // Executive 7.25 × 10.5" - 8 => (16840, 23820), // A3 297 × 420 mm + 8 => (16838, 23811), // A3 297 × 420 mm 9 => (11906, 16838), // A4 210 × 297 mm 11 => (8392, 11906), // A5 148 × 210 mm 12 => (14171, 20012), // B4 250 × 353 mm @@ -369,7 +369,7 @@ fn build_page_setup( footer: 0.3, }); let r = raw.unwrap_or_default(); - let mut ps = PageSetup { + let ps = PageSetup { width_twips: r.width_twips, height_twips: r.height_twips, margin_top_twips: in_to_twips(m.top), @@ -380,13 +380,6 @@ fn build_page_setup( footer_distance_twips: in_to_twips(m.footer), landscape: r.landscape, }; - // If we only saw (no ), leave dimensions - // unset so the caller can fall back to the IR default; otherwise - // downstream renderers would draw onto a 0×0 page. - if ps.width_twips == 0 || ps.height_twips == 0 { - ps.width_twips = 0; - ps.height_twips = 0; - } Some(ps) } From dfd3a3408221d3bbe9aa226ad5586dee9c97897b Mon Sep 17 00:00:00 2001 From: Yury Fedoseev Date: Thu, 14 May 2026 18:05:14 -0700 Subject: [PATCH 14/18] fix(review): more Copilot follow-ups + coverage tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review fixes: - xlsx/numfmt: rewrite format_commas to avoid the rounded.fract() float round-trip (could off-by-one near .999…), fall back to the bare Rust formatter when the value overflows u64, and surface NaN/Infinity as visible labels instead of empty strings so anomalous cells aren't mistaken for empty data. 
- xlsx/numfmt: format_currency now puts the minus sign in front of the symbol ("-$99.50" not "$-99.50"). Test updated to match. - xlsx/worksheet: extract the ECMA-376 default margins into a single PageMarginsIn::DEFAULTS constant and reuse it from parse_page_margins and build_page_setup so future tweaks stay in lockstep. - convert_pptx: use plain `h / 5` for hundredths-of-pt → twips. div_ceil was inflating every non-multiple-of-5 by an extra twip. Coverage: - New unit tests for src/xls/images.rs (was 0%) covering BLIP type detection, UID/header sizing, signature validation, format mapping, and end-to-end record extraction with a synthetic PNG payload. - New unit tests for src/xlsx/mod.rs (was ~16%) covering sheet-rels path derivation, relative ZIP path resolution (absolute, .. and ./ segments), image-format byte sniffing, extract_chart_text on a minimal title plus a categories/series example, and the drawing anchor parser on picture/text/empty inputs. --- src/convert_pptx.rs | 8 ++- src/xls/images.rs | 93 +++++++++++++++++++++++++ src/xlsx/mod.rs | 155 ++++++++++++++++++++++++++++++++++++++++++ src/xlsx/numfmt.rs | 58 +++++++++++----- src/xlsx/worksheet.rs | 36 ++++++---- 5 files changed, 316 insertions(+), 34 deletions(-) diff --git a/src/convert_pptx.rs b/src/convert_pptx.rs index 96ec625..6bbbfda 100644 --- a/src/convert_pptx.rs +++ b/src/convert_pptx.rs @@ -290,9 +290,11 @@ fn convert_text_body(body: &crate::pptx::TextBody, elements: &mut Vec) let content = convert_text_paragraph_inline(para); // Honour space_before from PPTX so spacer paragraphs // emitted by pdf_to_ir round-trip with their full vertical - // gap. Convert hundredths-of-pt → twips: hundredths * 0.2 - // (1pt = 20 twips, so pt*100 → twips = (pt*100)/5). - let space_before_twips = para.space_before_hundredths_pt.map(|h| h.div_ceil(5)); + // gap. Convert hundredths-of-pt → twips: 1pt = 20 twips, + // so pt*100 → twips = (pt*100)/5. 
Plain division keeps the + // round-trip exact for values that are multiples of 5; + // div_ceil would inflate every non-multiple by 1 twip. + let space_before_twips = para.space_before_hundredths_pt.map(|h| h / 5); // Empty paragraphs serve as vertical spacers — keep them // in the IR even when content is empty so the renderer // can advance the cursor by the requested amount. diff --git a/src/xls/images.rs b/src/xls/images.rs index f9f75f6..ba94751 100644 --- a/src/xls/images.rs +++ b/src/xls/images.rs @@ -93,3 +93,96 @@ fn to_format(rt: u16) -> ImageFormat { other => ImageFormat::Unknown(other), } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn blip_type_recognition() { + assert!(is_blip_type(0xF01D)); + assert!(is_blip_type(0xF01E)); + assert!(is_blip_type(0xF02A)); + assert!(!is_blip_type(0xF000)); + assert!(!is_blip_type(0xF020)); + } + + #[test] + fn uid_size_secondary_uid() { + // Bit 0 of inst signals a secondary UID — adds 16 bytes. + assert_eq!(uid_size(0xF01D, 0b00), 17); + assert_eq!(uid_size(0xF01D, 0b01), 33); + // 0xF01A..=0xF01C use the metafile-style header layout (base 16). + assert_eq!(uid_size(0xF01A, 0b00), 16); + assert_eq!(uid_size(0xF01A, 0b01), 32); + } + + #[test] + fn metafile_header_only_for_metafile_types() { + assert_eq!(metafile_header_size(0xF01A), 34); + assert_eq!(metafile_header_size(0xF01B), 34); + assert_eq!(metafile_header_size(0xF01C), 34); + assert_eq!(metafile_header_size(0xF01D), 0); + assert_eq!(metafile_header_size(0xF01E), 0); + } + + #[test] + fn signature_validation() { + // JPEG starts with FFD8. + assert!(has_valid_signature(0xF01D, &[0xFF, 0xD8, 0x00])); + assert!(!has_valid_signature(0xF01D, &[0x00, 0x00])); + // PNG starts with 89 50 4E 47. + assert!(has_valid_signature(0xF01E, b"\x89PNG\r\n")); + assert!(!has_valid_signature(0xF01E, b"WRONG")); + // EMF: 01 00 00 00 prefix. 
+ assert!(has_valid_signature(0xF01A, &[0x01, 0x00, 0x00, 0x00, 0xAA])); + assert!(!has_valid_signature(0xF01A, &[0x00, 0x00, 0x00, 0x00])); + // Empty payload always invalid. + assert!(!has_valid_signature(0xF01D, &[])); + } + + #[test] + fn to_format_mapping() { + assert!(matches!(to_format(0xF01A), ImageFormat::Emf)); + assert!(matches!(to_format(0xF01B), ImageFormat::Wmf)); + assert!(matches!(to_format(0xF01C), ImageFormat::Pict)); + assert!(matches!(to_format(0xF01D), ImageFormat::Jpeg)); + assert!(matches!(to_format(0xF02A), ImageFormat::Jpeg)); + assert!(matches!(to_format(0xF01E), ImageFormat::Png)); + assert!(matches!(to_format(0xF01F), ImageFormat::Dib)); + assert!(matches!(to_format(0xF029), ImageFormat::Tiff)); + assert!(matches!(to_format(0xABCD), ImageFormat::Unknown(0xABCD))); + } + + #[test] + fn extract_images_skips_non_blip_bytes() { + // Random non-BLIP bytes produce no images and never crash. + let data = vec![0u8; 64]; + assert!(extract_images(&data).is_empty()); + } + + #[test] + fn extract_images_finds_embedded_png() { + // Synthesize a record header followed by a PNG signature so the + // scanner descends into a valid BLIP payload. 
+ let rec_type: u16 = 0xF01E; // PNG + let inst: u16 = 0; // no secondary UID + let ver_inst: u16 = inst << 4; + let uid = 17usize; // base for non-metafile + let png_body = b"\x89PNG\r\n\x1a\nIHDRfakebody"; + let payload_len = uid + png_body.len(); + + let mut data = Vec::new(); + data.extend_from_slice(&ver_inst.to_le_bytes()); + data.extend_from_slice(&rec_type.to_le_bytes()); + data.extend_from_slice(&(payload_len as u32).to_le_bytes()); + data.extend_from_slice(&[0u8; 17]); // skipped UID bytes + data.extend_from_slice(png_body); + + let images = extract_images(&data); + assert_eq!(images.len(), 1); + assert!(matches!(images[0].format, ImageFormat::Png)); + assert_eq!(images[0].data, png_body); + assert_eq!(images[0].index, 0); + } +} diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 4455697..8c2b79f 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -1058,3 +1058,158 @@ fn guess_image_format_from_bytes(bytes: &[u8]) -> &'static str { "png" } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn sheet_rels_path_top_level() { + assert_eq!(sheet_rels_path("xl/worksheets/sheet1.xml"), "xl/worksheets/_rels/sheet1.xml.rels"); + assert_eq!(sheet_rels_path("sheet1.xml"), "_rels/sheet1.xml.rels"); + } + + #[test] + fn resolve_relative_zip_path_absolute() { + assert_eq!(resolve_relative_zip_path("xl/worksheets/sheet1.xml", "/xl/media/img1.png"), "xl/media/img1.png"); + } + + #[test] + fn resolve_relative_zip_path_dotdot() { + assert_eq!( + resolve_relative_zip_path("xl/worksheets/sheet1.xml", "../drawings/drawing1.xml"), + "xl/drawings/drawing1.xml" + ); + } + + #[test] + fn resolve_relative_zip_path_dot_segment() { + assert_eq!( + resolve_relative_zip_path("xl/worksheets/sheet1.xml", "./local.xml"), + "xl/worksheets/local.xml" + ); + } + + #[test] + fn resolve_relative_zip_path_source_at_root() { + assert_eq!(resolve_relative_zip_path("file.xml", "sub/x.xml"), "sub/x.xml"); + } + + #[test] + fn guess_image_format_signatures() { + 
assert_eq!(guess_image_format_from_bytes(&[0x89, b'P', b'N', b'G', 13, 10, 26, 10]), "png"); + assert_eq!(guess_image_format_from_bytes(&[0xFF, 0xD8, 0xFF, 0xE0]), "jpeg"); + assert_eq!(guess_image_format_from_bytes(b"GIF89a..."), "gif"); + assert_eq!(guess_image_format_from_bytes(b"GIF87a..."), "gif"); + assert_eq!(guess_image_format_from_bytes(b"BM\0\0\0"), "bmp"); + assert_eq!(guess_image_format_from_bytes(b"II*\0\x08\0"), "tiff"); + assert_eq!(guess_image_format_from_bytes(b"MM\0*\0\x08"), "tiff"); + assert_eq!(guess_image_format_from_bytes(&[0xD7, 0xCD, 0xC6, 0x9A]), "wmf"); + assert_eq!(guess_image_format_from_bytes(&[0x01, 0x00, 0x00, 0x00, 0x58]), "emf"); + // Fall back to png for unknown payloads. + assert_eq!(guess_image_format_from_bytes(&[0, 0, 0]), "png"); + } + + #[test] + fn extract_chart_text_minimal_title() { + let xml = br#" + + + + + + Quarterly Sales + + + + +"#; + let out = extract_chart_text(xml); + assert!(out.contains("Title: Quarterly Sales"), "got: {out}"); + } + + #[test] + fn extract_chart_text_series_and_categories() { + let xml = br#" + + + + + Sheet1!$B$1Budget + + Q1 + Q2 + + + 1000 + 2000 + + + + +"#; + let out = extract_chart_text(xml); + assert!(out.contains("Categories: Q1, Q2"), "got: {out}"); + assert!(out.contains("Budget: 1000, 2000"), "got: {out}"); + } + + #[test] + fn parse_drawing_anchors_picture_one_cell() { + let xml = br#" + + + 0914400 + 0457200 + + + + + + + + + + +"#; + let parsed = parse_drawing_anchors(xml).expect("parse ok"); + assert_eq!(parsed.pictures.len(), 1); + assert_eq!(parsed.pictures[0].embed_rid, "rId4"); + assert_eq!(parsed.pictures[0].cx_emu, 2_000_000); + assert_eq!(parsed.pictures[0].cy_emu, 1_500_000); + assert_eq!(parsed.pictures[0].alt_text.as_deref(), Some("my-alt")); + } + + #[test] + fn parse_drawing_anchors_text_shape() { + let xml = br#" + + + + + + + Hello shape + + + +"#; + let parsed = parse_drawing_anchors(xml).expect("parse ok"); + assert_eq!(parsed.text_shapes.len(), 1); + 
assert_eq!(parsed.text_shapes[0].text, "Hello shape"); + assert_eq!(parsed.text_shapes[0].cx_emu, 3_000_000); + } + + #[test] + fn parse_drawing_anchors_empty_doc_is_ok() { + let xml = br#" +"#; + let parsed = parse_drawing_anchors(xml).expect("parse ok"); + assert!(parsed.pictures.is_empty()); + assert!(parsed.text_shapes.is_empty()); + } +} diff --git a/src/xlsx/numfmt.rs b/src/xlsx/numfmt.rs index c945861..6979381 100644 --- a/src/xlsx/numfmt.rs +++ b/src/xlsx/numfmt.rs @@ -7,8 +7,11 @@ /// Apply an Excel number format to a numeric value. pub fn apply_format(n: f64, fmt_id: u32, fmt_str: Option<&str>) -> String { - if n.is_nan() || n.is_infinite() { - return String::new(); + if n.is_nan() { + return "NaN".to_string(); + } + if n.is_infinite() { + return if n < 0.0 { "-Infinity".to_string() } else { "Infinity".to_string() }; } // Built-in format IDs per OOXML spec §18.8.30. @@ -67,26 +70,44 @@ fn format_fixed(n: f64, decimals: u8) -> String { pub fn format_commas(n: f64, decimals: u8) -> String { let negative = n < 0.0; let abs = n.abs(); + let sign = if negative { "-" } else { "" }; - // Round to the required number of decimal places first. let factor = 10f64.powi(decimals as i32); - let rounded = (abs * factor).round() / factor; + let scaled = (abs * factor).round(); - let int_part = rounded.trunc() as u64; - let int_str = insert_commas(int_part); + // Fall back to the locale-free Rust formatter for magnitudes that + // overflow u64 — better to lose the thousands separators than to + // emit a silently-wrapped integer. 
+ if !scaled.is_finite() || scaled >= u64::MAX as f64 { + return format!("{}{:.prec$}", sign, abs, prec = decimals as usize); + } - let sign = if negative { "-" } else { "" }; + let scaled_int = scaled as u64; if decimals == 0 { - format!("{}{}", sign, int_str) + format!("{}{}", sign, insert_commas(scaled_int)) } else { - let frac = ((rounded.fract()) * factor).round() as u64; - format!("{}{}.{:0>width$}", sign, int_str, frac, width = decimals as usize) + let divisor = factor as u64; + let int_part = scaled_int / divisor; + let frac = scaled_int % divisor; + format!( + "{}{}.{:0>width$}", + sign, + insert_commas(int_part), + frac, + width = decimals as usize + ) } } fn format_currency(n: f64, symbol: &str, decimals: u8) -> String { - format!("{}{}", symbol, format_commas(n, decimals)) + // Put any minus sign before the currency symbol so callers see + // "-$99.50" rather than "$-99.50". + if n < 0.0 { + format!("-{}{}", symbol, format_commas(n.abs(), decimals)) + } else { + format!("{}{}", symbol, format_commas(n, decimals)) + } } /// Format a number as a percentage (multiplied by 100, with optional decimal places). @@ -349,14 +370,17 @@ mod tests { // ── Edge cases ────────────────────────────────────────────────────── #[test] - fn nan_returns_empty() { - assert_eq!(apply_format(f64::NAN, 0, None), ""); + fn nan_renders_as_label() { + // Returning the literal "NaN" rather than an empty string keeps + // anomalous cells visible in extracted text so they're not + // mistaken for empty data. 
+ assert_eq!(apply_format(f64::NAN, 0, None), "NaN"); } #[test] - fn infinity_returns_empty() { - assert_eq!(apply_format(f64::INFINITY, 0, None), ""); - assert_eq!(apply_format(f64::NEG_INFINITY, 0, None), ""); + fn infinity_renders_as_label() { + assert_eq!(apply_format(f64::INFINITY, 0, None), "Infinity"); + assert_eq!(apply_format(f64::NEG_INFINITY, 0, None), "-Infinity"); } #[test] @@ -374,7 +398,7 @@ mod tests { #[test] fn negative_currency() { - assert_eq!(apply_format(-99.5, 7, None), "$-99.50"); + assert_eq!(apply_format(-99.5, 7, None), "-$99.50"); } #[test] diff --git a/src/xlsx/worksheet.rs b/src/xlsx/worksheet.rs index b51671a..2d81935 100644 --- a/src/xlsx/worksheet.rs +++ b/src/xlsx/worksheet.rs @@ -247,6 +247,20 @@ struct PageMarginsIn { footer: f64, } +impl PageMarginsIn { + /// ECMA-376 default margins (inches). Single source of truth used by + /// both `parse_page_margins` (when an attribute is absent) and + /// `build_page_setup` (when no `` element was present). + const DEFAULTS: PageMarginsIn = PageMarginsIn { + left: 0.7, + right: 0.7, + top: 0.75, + bottom: 0.75, + header: 0.3, + footer: 0.3, + }; +} + /// Raw `` shape — physical dimensions in twips plus orientation. 
#[derive(Debug, Clone, Copy, Default)] struct PageSetupRaw { @@ -271,13 +285,14 @@ fn parse_page_margins( if left.is_none() && right.is_none() && top.is_none() && bottom.is_none() { return Ok(None); } + let d = PageMarginsIn::DEFAULTS; Ok(Some(PageMarginsIn { - left: left.unwrap_or(0.7), - right: right.unwrap_or(0.7), - top: top.unwrap_or(0.75), - bottom: bottom.unwrap_or(0.75), - header: header.unwrap_or(0.3), - footer: footer.unwrap_or(0.3), + left: left.unwrap_or(d.left), + right: right.unwrap_or(d.right), + top: top.unwrap_or(d.top), + bottom: bottom.unwrap_or(d.bottom), + header: header.unwrap_or(d.header), + footer: footer.unwrap_or(d.footer), })) } @@ -360,14 +375,7 @@ fn build_page_setup( return None; } let in_to_twips = |v: f64| (v * 1440.0).round().max(0.0) as u32; - let m = margins.unwrap_or(PageMarginsIn { - left: 0.7, - right: 0.7, - top: 0.75, - bottom: 0.75, - header: 0.3, - footer: 0.3, - }); + let m = margins.unwrap_or(PageMarginsIn::DEFAULTS); let r = raw.unwrap_or_default(); let ps = PageSetup { width_twips: r.width_twips, From 5745467f0781f033f6d228ca294b3c965a3c2bd8 Mon Sep 17 00:00:00 2001 From: Yury Fedoseev Date: Thu, 14 May 2026 18:10:27 -0700 Subject: [PATCH 15/18] fix(review): rustfmt + more Copilot follow-ups - Apply rustfmt across recent edits (the v0.1.2 PR's Lint and Format Check job was failing because my recent commits hand-wrote a few lines that exceeded rustfmt's max_width). - xlsx/numfmt: only treat 'E'/'e' as a scientific-notation marker when followed by '+' or '-'. A bare 'E' in a custom format like "000E" was previously consuming the next character unconditionally, which could swallow a literal or a digit it should have kept. - xlsx/mod: in parse_drawing_anchors, restrict the `` fallback to the outermost anchor scope (AnchorKind::Unknown). Otherwise the `` inside a shape's `` would overwrite x/y coords parsed earlier from `` in an absoluteAnchor. 
--- src/xlsx/mod.rs | 19 ++++++++++++++++--- src/xlsx/numfmt.rs | 25 +++++++++++++++++++------ 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 8c2b79f..f646bb4 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -924,7 +924,14 @@ fn parse_drawing_anchors(xml_data: &[u8]) -> crate::core::Result cy_emu = v.parse().unwrap_or(0); } }, - b"off" if cx_emu == 0 && cy_emu == 0 => { + b"off" if cx_emu == 0 && cy_emu == 0 && matches!(kind, AnchorKind::Unknown) => { + // Honour `` only at the outermost anchor level, + // before we've descended into `` or + // ``. Otherwise the `` inside a + // shape's `` (which expresses a transform + // local to the shape, not the anchor origin) would + // overwrite the absolute coordinates parsed from + // ``. if let Some(v) = crate::core::xml::optional_attr_str(e, b"x")? { x_emu = v.parse().unwrap_or(x_emu); } @@ -1065,13 +1072,19 @@ mod tests { #[test] fn sheet_rels_path_top_level() { - assert_eq!(sheet_rels_path("xl/worksheets/sheet1.xml"), "xl/worksheets/_rels/sheet1.xml.rels"); + assert_eq!( + sheet_rels_path("xl/worksheets/sheet1.xml"), + "xl/worksheets/_rels/sheet1.xml.rels" + ); assert_eq!(sheet_rels_path("sheet1.xml"), "_rels/sheet1.xml.rels"); } #[test] fn resolve_relative_zip_path_absolute() { - assert_eq!(resolve_relative_zip_path("xl/worksheets/sheet1.xml", "/xl/media/img1.png"), "xl/media/img1.png"); + assert_eq!( + resolve_relative_zip_path("xl/worksheets/sheet1.xml", "/xl/media/img1.png"), + "xl/media/img1.png" + ); } #[test] diff --git a/src/xlsx/numfmt.rs b/src/xlsx/numfmt.rs index 6979381..3235f85 100644 --- a/src/xlsx/numfmt.rs +++ b/src/xlsx/numfmt.rs @@ -11,7 +11,11 @@ pub fn apply_format(n: f64, fmt_id: u32, fmt_str: Option<&str>) -> String { return "NaN".to_string(); } if n.is_infinite() { - return if n < 0.0 { "-Infinity".to_string() } else { "Infinity".to_string() }; + return if n < 0.0 { + "-Infinity".to_string() + } else { + "Infinity".to_string() + }; } // 
Built-in format IDs per OOXML spec §18.8.30. @@ -232,11 +236,20 @@ fn apply_custom(n: f64, fmt: &str) -> String { } }, 'E' | 'e' => { - has_scientific = true; - // Skip the +/- and exponent digits - chars.next(); // '+' or '-' - while chars.peek().is_some_and(|c| c.is_ascii_digit()) { - chars.next(); + // Only treat this as scientific notation when followed by + // `+` or `-` (per ECMA-376 §18.8.31). Bare `E` is just + // a literal in formats like "000E" and must not consume + // the next character. + if matches!(chars.peek(), Some('+') | Some('-')) { + has_scientific = true; + chars.next(); // consume the sign + while chars.peek().is_some_and(|c| c.is_ascii_digit()) { + chars.next(); + } + } else if !in_num_part { + currency_prefix.push(c); + } else { + suffix.push(c); } }, '$' => { From 83a33ac30e8b18f4c616a38ad703c31b5f4652ce Mon Sep 17 00:00:00 2001 From: Yury Fedoseev Date: Thu, 14 May 2026 18:12:44 -0700 Subject: [PATCH 16/18] docs(review): document embed_font/font_table style limitations + debug logs - pptx::write::embed_font: spell out that deduplication is by name only, not by bytes; document the workaround of using distinct family names for multiple faces. - docx::write::generate_font_table_xml: note that every entry is emitted as regardless of the underlying style; document the recommended workaround (separate family names per face). - xlsx::read_drawing_for_sheet: emit a `debug!` line when a drawing part fails to read or parse, instead of silently swallowing the error. Lets operators trace cases where worksheet drawings vanish. --- src/docx/write.rs | 7 +++++++ src/pptx/write.rs | 8 ++++++-- src/xlsx/mod.rs | 13 +++++++++++-- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/docx/write.rs b/src/docx/write.rs index c0ea70c..7d36c6a 100644 --- a/src/docx/write.rs +++ b/src/docx/write.rs @@ -3101,6 +3101,13 @@ fn generate_core_props_xml(props: &CoreProps) -> Vec { /// program when there's a match. 
Without it, Word silently /// substitutes Calibri / Cambria for everything regardless of how /// many TTFs we ship under `/word/fonts/`. +/// +/// **Known limitation**: each entry is emitted as `` +/// regardless of whether the underlying program is a regular, bold, +/// italic, or bold-italic face — we don't introspect the font binary +/// to detect the style. If a caller wants Word to pick up a bold-only +/// face, they should embed it under a distinct family name (e.g. +/// `Calibri-Bold`) and reference that name from runs explicitly. fn generate_font_table_xml(entries: &[(String, String)]) -> Vec { let mut w = Writer::new_with_indent(Vec::new(), b' ', 2); w.write_event(Event::Decl(BytesDecl::new("1.0", Some("UTF-8"), Some("yes")))) diff --git a/src/pptx/write.rs b/src/pptx/write.rs index b4729f8..8ec5a05 100644 --- a/src/pptx/write.rs +++ b/src/pptx/write.rs @@ -357,8 +357,12 @@ impl PptxWriter { } /// Embed a font program (TrueType / OpenType bytes) under `ppt/fonts/`. - /// `name` is used for the file name and the human-readable font name. - /// Subsequent calls with the same name are deduplicated. + /// + /// `name` is used for both the on-disk file name and the human-readable + /// font name in the presentation's font table. Deduplication is by + /// `name` only — supplying different bytes for an already-registered + /// name is a no-op. Pass distinct names (e.g. `Calibri-Bold` vs + /// `Calibri`) when you need to ship multiple faces of the same family. 
pub fn embed_font(&mut self, name: impl Into, data: Vec) -> &mut Self { let name = name.into(); if !self.embedded_fonts.iter().any(|(n, _)| n == &name) { diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index f646bb4..7c6121a 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -671,7 +671,10 @@ fn read_drawing_for_sheet( let drawing_xml = match XlsxDocument::read_xml_entry(archive, &drawing_path) { Ok(d) => d, - Err(_) => return (Vec::new(), Vec::new()), + Err(e) => { + debug!("XlsxDocument: drawing part {} unreadable ({}); skipping", drawing_path, e); + return (Vec::new(), Vec::new()); + }, }; let drawing_rels_path = sheet_rels_path(&drawing_path); @@ -682,7 +685,13 @@ fn read_drawing_for_sheet( let parsed = match parse_drawing_anchors(&drawing_xml) { Ok(a) => a, - Err(_) => return (Vec::new(), Vec::new()), + Err(e) => { + debug!( + "XlsxDocument: drawing {} failed to parse ({}); dropping anchors", + drawing_path, e + ); + return (Vec::new(), Vec::new()); + }, }; // Resolve picture anchors → bytes. From c42825bf033a01fb09fbef2e22cb1928487ccd7a Mon Sep 17 00:00:00 2001 From: Yury Fedoseev Date: Thu, 14 May 2026 18:17:07 -0700 Subject: [PATCH 17/18] fix(docx): track header-vs-footer role at parse time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, `split_headers_footers` derived role by comparing each entry's index against the cumulative count of all sections' `header_refs`. That assumed `headers_footers` was laid out as "all headers first, then all footers" — but the parser actually interleaves them per section (header_refs of section 0, then footer_refs of section 0, then headers of section 1, etc.). In multi-section documents the cumulative-count split silently misclassified entries into the wrong column. Record the role explicitly on each parsed `HeaderFooter` and let the markdown renderer read it directly. 
Walking header_refs and footer_refs in two separate loops at parse time keeps the role authoritative, even when individual refs fail to resolve and don't contribute an entry to `headers_footers`. Closes a Copilot review comment on PR #38. --- src/docx/headers.rs | 5 +++++ src/docx/mod.rs | 40 ++++++++++++++++++++++++++-------------- src/docx/text.rs | 18 +++++++----------- 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/src/docx/headers.rs b/src/docx/headers.rs index a7e9c7e..1e3a770 100644 --- a/src/docx/headers.rs +++ b/src/docx/headers.rs @@ -83,4 +83,9 @@ pub struct HeaderFooter { pub hf_type: HeaderFooterType, /// Block content within the header or footer. pub content: Vec, + /// `true` if this came from a ``, `false` if + /// from a ``. Lets downstream consumers (e.g. + /// the markdown renderer) sort entries into headers vs footers + /// without trying to back-derive from cumulative ref counts. + pub is_header: bool, } diff --git a/src/docx/mod.rs b/src/docx/mod.rs index 02d2e60..0a41711 100644 --- a/src/docx/mod.rs +++ b/src/docx/mod.rs @@ -175,24 +175,36 @@ impl DocxDocument { let doc_data = opc.read_part(&main_part)?; let (body, sections) = parse_document(&doc_data, &doc_rels)?; - // Parse headers and footers + // Parse headers and footers. Walk header refs and footer refs + // separately so each parsed `HeaderFooter` can record its own + // role; without that distinction, downstream consumers had to + // back-derive headers-vs-footers from cumulative ref counts, + // which silently misclassifies entries in multi-section docs. 
let mut headers_footers = Vec::new(); - for section in &sections { - for hf_ref in section.header_refs.iter().chain(section.footer_refs.iter()) { - if let Some(rel) = doc_rels.get_by_id(&hf_ref.relationship_id) { - if rel.target_mode == TargetMode::Internal { - let part_name = main_part.resolve_relative(&rel.target)?; - if opc.has_part(&part_name) { - let data = opc.read_part(&part_name)?; - let content = parse_body_elements(&data)?; - headers_footers.push(HeaderFooter { - hf_type: hf_ref.hf_type, - content, - }); - } + let mut parse_hf = |hf_ref: &HeaderFooterRef, is_header: bool| -> CoreResult<()> { + if let Some(rel) = doc_rels.get_by_id(&hf_ref.relationship_id) { + if rel.target_mode == TargetMode::Internal { + let part_name = main_part.resolve_relative(&rel.target)?; + if opc.has_part(&part_name) { + let data = opc.read_part(&part_name)?; + let content = parse_body_elements(&data)?; + headers_footers.push(HeaderFooter { + hf_type: hf_ref.hf_type, + content, + is_header, + }); } } } + Ok(()) + }; + for section in &sections { + for hf_ref in &section.header_refs { + parse_hf(hf_ref, true)?; + } + for hf_ref in &section.footer_refs { + parse_hf(hf_ref, false)?; + } } // Scan `word/fonts/` for embedded font programs. Files there are diff --git a/src/docx/text.rs b/src/docx/text.rs index 4028594..6f766b0 100644 --- a/src/docx/text.rs +++ b/src/docx/text.rs @@ -59,29 +59,25 @@ impl DocxDocument { } } -/// Split parsed `HeaderFooter` entries into headers vs footers using the -/// section reference lists. Returns (headers, footers) as deduplicated -/// markdown-string vectors. We don't currently retain the relationship -/// IDs that map a section ref to a specific parsed `HeaderFooter`, so we -/// approximate: header_refs.len() entries from the front go to headers, -/// the rest go to footers. 
Correct for the common case (single section -/// with one of each); on multi-variant documents some misclassification -/// is possible but text is still preserved (just maybe in the wrong slot). +/// Split parsed `HeaderFooter` entries into headers vs footers and +/// return them as deduplicated markdown-string vectors. Role is read +/// directly from each entry's `is_header` field (set at parse time), +/// so this is correct regardless of how many sections the document +/// has or how the entries are interleaved. fn split_headers_footers(doc: &DocxDocument, ctx: &MarkdownCtx) -> (Vec<String>, Vec<String>) { let mut headers: Vec<String> = Vec::new(); let mut footers: Vec<String> = Vec::new(); let mut header_seen: std::collections::HashSet<String> = std::collections::HashSet::new(); let mut footer_seen: std::collections::HashSet<String> = std::collections::HashSet::new(); - let n_header_refs: usize = doc.sections.iter().map(|s| s.header_refs.len()).sum(); - for (idx, hf) in doc.headers_footers.iter().enumerate() { + for hf in &doc.headers_footers { let mut buf = String::new(); markdown_blocks(&hf.content, ctx, &mut buf, 0); let t = buf.trim().to_string(); if t.is_empty() { continue; } - if idx < n_header_refs { + if hf.is_header { if header_seen.insert(t.clone()) { headers.push(t); } From ac38c7ee68a42077bf10b85748dd2ea1fcd63009 Mon Sep 17 00:00:00 2001 From: Yury Fedoseev Date: Thu, 14 May 2026 18:32:30 -0700 Subject: [PATCH 18/18] test: add IR round-trip coverage for previously untested Element variants Coverage was sitting at 73.2% line on this branch, below the 75% floor enforced by the Code Coverage CI job. The PR introduced a lot of new write/conversion code (chart text, embedded fonts, multi-section sectPr, drawing anchors, page setup) and the existing integration tests only exercised the common element variants. 
Adds round-trip tests through `create_from_ir_to_writer` for: - ThematicBreak (verifies w:pBdr emission in document.xml) - PageBreak + ColumnBreak (verifies w:br w:type="page"/"column") - Footnote + Endnote (verifies the footnotes.xml/endnotes.xml parts) - TextBox (verifies floating content lands in document.xml) - Numbered List with start_number - Multi-section document with Continuous / NextPage / OddPage breaks - Crate-level Document::from_reader + plain_text + to_markdown + to_ir convenience path Local line coverage rises from 73.21% to 76.43%, clearing the 75% threshold. --- tests/write_integration.rs | 368 +++++++++++++++++++++++++++++++++++++ 1 file changed, 368 insertions(+) diff --git a/tests/write_integration.rs b/tests/write_integration.rs index 92391fe..4f83e6c 100644 --- a/tests/write_integration.rs +++ b/tests/write_integration.rs @@ -1169,3 +1169,371 @@ fn ir_table_caption_round_trip() { "expected caption text in document.xml" ); } + +// --------------------------------------------------------------------------- +// Tests for Element variants that were previously uncovered: ThematicBreak, +// PageBreak, ColumnBreak, TextBox, Footnote, Endnote. Each test goes +// through the full IR → DOCX → re-parse round-trip so the create.rs +// dispatch arms, the corresponding `DocxWriter` methods, and the +// downstream re-parser all get exercised. 
+// --------------------------------------------------------------------------- + +#[test] +fn ir_thematic_break_emits_bordered_paragraph() { + use office_oxide::ir::*; + + let ir = DocumentIR { + metadata: Metadata { + format: office_oxide::DocumentFormat::Docx, + ..Default::default() + }, + sections: vec![Section { + elements: vec![ + Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(TextSpan::plain("Before"))], + ..Default::default() + }), + Element::ThematicBreak, + Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(TextSpan::plain("After"))], + ..Default::default() + }), + ], + ..Default::default() + }], + }; + + let mut buf = Cursor::new(Vec::new()); + office_oxide::create::create_from_ir_to_writer( + &ir, + office_oxide::DocumentFormat::Docx, + &mut buf, + ) + .unwrap(); + buf.set_position(0); + + let zip_bytes = buf.into_inner(); + let mut zip = zip::ZipArchive::new(Cursor::new(zip_bytes)).unwrap(); + let mut doc_xml = String::new(); + { + use std::io::Read; + zip.by_name("word/document.xml") + .unwrap() + .read_to_string(&mut doc_xml) + .unwrap(); + } + // Thematic break = empty paragraph with a bottom border. The raw + // XML should include a w:pBdr/w:bottom element somewhere between + // "Before" and "After". 
+ assert!( + doc_xml.contains("w:pBdr"), + "expected pBdr (paragraph border) for thematic break" + ); + assert!(doc_xml.contains("Before") && doc_xml.contains("After")); +} + +#[test] +fn ir_page_and_column_breaks_round_trip() { + use office_oxide::ir::*; + + let ir = DocumentIR { + metadata: Metadata { + format: office_oxide::DocumentFormat::Docx, + ..Default::default() + }, + sections: vec![Section { + elements: vec![ + Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(TextSpan::plain("Page 1"))], + ..Default::default() + }), + Element::PageBreak, + Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(TextSpan::plain("Page 2 col 1"))], + ..Default::default() + }), + Element::ColumnBreak, + Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(TextSpan::plain("Page 2 col 2"))], + ..Default::default() + }), + ], + ..Default::default() + }], + }; + + let mut buf = Cursor::new(Vec::new()); + office_oxide::create::create_from_ir_to_writer( + &ir, + office_oxide::DocumentFormat::Docx, + &mut buf, + ) + .unwrap(); + buf.set_position(0); + + let zip_bytes = buf.into_inner(); + let mut zip = zip::ZipArchive::new(Cursor::new(zip_bytes)).unwrap(); + let mut doc_xml = String::new(); + { + use std::io::Read; + zip.by_name("word/document.xml") + .unwrap() + .read_to_string(&mut doc_xml) + .unwrap(); + } + // Page break = ; column break = . 
+ assert!(doc_xml.contains("w:type=\"page\""), "expected page break w:br: {doc_xml:.500}",); + assert!(doc_xml.contains("w:type=\"column\""), "expected column break w:br",); +} + +#[test] +fn ir_footnote_endnote_round_trip() { + use office_oxide::ir::*; + + let footnote_content = vec![Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(TextSpan::plain("This is a footnote."))], + ..Default::default() + })]; + let endnote_content = vec![Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(TextSpan::plain("This is an endnote."))], + ..Default::default() + })]; + + let ir = DocumentIR { + metadata: Metadata { + format: office_oxide::DocumentFormat::Docx, + ..Default::default() + }, + sections: vec![Section { + elements: vec![ + Element::Paragraph(Paragraph { + content: vec![ + InlineContent::Text(TextSpan::plain("Main body")), + InlineContent::FootnoteRef(FootnoteRef { + note_id: 1, + marker: None, + }), + InlineContent::EndnoteRef(FootnoteRef { + note_id: 2, + marker: None, + }), + ], + ..Default::default() + }), + Element::Footnote(Note { + id: 1, + content: footnote_content, + marker: None, + }), + Element::Endnote(Note { + id: 2, + content: endnote_content, + marker: None, + }), + ], + ..Default::default() + }], + }; + + let mut buf = Cursor::new(Vec::new()); + office_oxide::create::create_from_ir_to_writer( + &ir, + office_oxide::DocumentFormat::Docx, + &mut buf, + ) + .unwrap(); + buf.set_position(0); + + // The package should now contain a footnotes part and an endnotes part. 
+ let zip_bytes = buf.into_inner(); + let zip = zip::ZipArchive::new(Cursor::new(zip_bytes)).unwrap(); + let names: Vec = zip.file_names().map(String::from).collect(); + assert!( + names.iter().any(|n| n == "word/footnotes.xml"), + "expected word/footnotes.xml in: {names:?}" + ); + assert!( + names.iter().any(|n| n == "word/endnotes.xml"), + "expected word/endnotes.xml in: {names:?}" + ); +} + +#[test] +fn ir_text_box_round_trip() { + use office_oxide::ir::*; + + let ir = DocumentIR { + metadata: Metadata { + format: office_oxide::DocumentFormat::Docx, + ..Default::default() + }, + sections: vec![Section { + elements: vec![Element::TextBox(TextBox { + content: vec![Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(TextSpan::plain("Floating callout"))], + ..Default::default() + })], + ..Default::default() + })], + ..Default::default() + }], + }; + + let mut buf = Cursor::new(Vec::new()); + office_oxide::create::create_from_ir_to_writer( + &ir, + office_oxide::DocumentFormat::Docx, + &mut buf, + ) + .unwrap(); + buf.set_position(0); + + let zip_bytes = buf.into_inner(); + let mut zip = zip::ZipArchive::new(Cursor::new(zip_bytes)).unwrap(); + let mut doc_xml = String::new(); + { + use std::io::Read; + zip.by_name("word/document.xml") + .unwrap() + .read_to_string(&mut doc_xml) + .unwrap(); + } + // Text-box content lives inside a w:txbxContent element rather + // than as a top-level paragraph, so look for the raw text in + // the XML. Round-trip plain_text extraction of floating shapes + // is intentionally not exposed today. 
+ assert!( + doc_xml.contains("Floating callout"), + "expected text-box content in document.xml" + ); +} + +#[test] +fn ir_numbered_list_round_trip() { + use office_oxide::ir::*; + + let item = |text: &str| ListItem { + content: vec![Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(TextSpan::plain(text))], + ..Default::default() + })], + ..Default::default() + }; + let ir = DocumentIR { + metadata: Metadata { + format: office_oxide::DocumentFormat::Docx, + ..Default::default() + }, + sections: vec![Section { + elements: vec![Element::List(List { + ordered: true, + start_number: Some(5), + items: vec![item("Five"), item("Six"), item("Seven")], + ..Default::default() + })], + ..Default::default() + }], + }; + + let mut buf = Cursor::new(Vec::new()); + office_oxide::create::create_from_ir_to_writer( + &ir, + office_oxide::DocumentFormat::Docx, + &mut buf, + ) + .unwrap(); + buf.set_position(0); + + let doc = office_oxide::docx::DocxDocument::from_reader(buf).unwrap(); + let md = doc.to_markdown(); + assert!(md.contains("Five") && md.contains("Six") && md.contains("Seven"), "md: {md}"); +} + +#[test] +fn ir_multi_section_round_trip() { + use office_oxide::ir::*; + + let make_section = |label: &str, break_type: SectionBreakType| Section { + elements: vec![Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(TextSpan::plain(label))], + ..Default::default() + })], + break_type, + ..Default::default() + }; + + let ir = DocumentIR { + metadata: Metadata { + format: office_oxide::DocumentFormat::Docx, + ..Default::default() + }, + sections: vec![ + make_section("Section A", SectionBreakType::Continuous), + make_section("Section B", SectionBreakType::NextPage), + make_section("Section C", SectionBreakType::OddPage), + ], + }; + + let mut buf = Cursor::new(Vec::new()); + office_oxide::create::create_from_ir_to_writer( + &ir, + office_oxide::DocumentFormat::Docx, + &mut buf, + ) + .unwrap(); + buf.set_position(0); + + let doc = 
office_oxide::docx::DocxDocument::from_reader(buf).unwrap(); + let text = doc.plain_text(); + assert!( + text.contains("Section A") && text.contains("Section B") && text.contains("Section C"), + "text: {text}" + ); +} + +#[test] +fn convenience_functions_round_trip() { + use office_oxide::ir::*; + + let ir = DocumentIR { + metadata: Metadata { + format: office_oxide::DocumentFormat::Docx, + ..Default::default() + }, + sections: vec![Section { + elements: vec![ + Element::Heading(Heading { + level: 1, + content: vec![InlineContent::Text(TextSpan::plain("Title"))], + ..Default::default() + }), + Element::Paragraph(Paragraph { + content: vec![InlineContent::Text(TextSpan::plain("Hello"))], + ..Default::default() + }), + ], + ..Default::default() + }], + }; + + let mut buf = Cursor::new(Vec::new()); + office_oxide::create::create_from_ir_to_writer( + &ir, + office_oxide::DocumentFormat::Docx, + &mut buf, + ) + .unwrap(); + buf.set_position(0); + + // Exercise the crate-level extract_text / to_markdown / Document::open paths. + let bytes = buf.into_inner(); + let doc = office_oxide::Document::from_reader( + Cursor::new(bytes.clone()), + office_oxide::DocumentFormat::Docx, + ) + .unwrap(); + assert!(doc.plain_text().contains("Hello")); + assert!(doc.to_markdown().contains("Hello")); + let ir2 = doc.to_ir(); + assert!(!ir2.sections.is_empty()); +}