From 6b18677d806b557e5253370a647b2248b0bdc136 Mon Sep 17 00:00:00 2001
From: Yury Fedoseev <yfedoseev@gmail.com>
Date: Wed, 13 May 2026 18:36:43 -0700
Subject: [PATCH 01/18] =?UTF-8?q?release:=20v0.1.2=20=E2=80=94=20round-tri?=
 =?UTF-8?q?p=20fidelity,=20IR=20layout,=20perf,=20embedded=20fonts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Major work for the v0.1.2 release branch. Consolidates 32 commits
of round-trip fidelity fixes, IR enrichment, performance, and
test coverage into a single release commit.

# Performance
- xlsx: O(1) cell-style lookups via HashMap (replaces linear Vec scan
  in the hot per-cell formatting path)

# Round-trip fidelity (PDF → office → PDF)
- DOCX/PPTX/XLSX: preserve images, fonts, columns end-to-end
- Alignment, spacing, footers, rules survive both directions
- PPTX ThematicBreak encoded as a 30-char U+2500 marker run that
  downstream PDF renderers detect and re-emit as a real horizontal rule
- DOCX <w:pBdr><w:bottom/> on an empty paragraph recovers as
  Element::ThematicBreak in IR

# DOCX
- Parse <w:framePr> into IR FramePosition (layout-preserving paths
  like pdf_oxide's to_docx_bytes_layout)
- Heading carries frame_position
- Parse floating <wp:anchor> drawings and <wps:wsp> vector shapes
  (line/rect with stroke/fill RGB)
- Preserve per-section page sizes; emit per-section <w:sectPr>
  on multi-section IR
- Preserve <w:sz> through to IR's font_size_half_pt
- Include header/footer text in to_markdown and to_ir
- Embedded fonts under /word/fonts/ are parsed and exposed on
  DocxDocument.embedded_fonts; strip_embedded_font_filename
  recovers the original face name from font_<n>_<face>.<ext>
  (fixes greedy alphabetic-trim regression)
- parse_drawing decomposed into focused recursive helpers
- Plumb paragraph alignment + inline image collection

# PPTX
- Real Title+Body slide layout instead of blank
- Paginate slides (~250 cap) + synthesize Slide N heading on to_ir
- Wrap shapes in positioned TextBox + parse slide background
- Don't wrap zero-size shape positions in TextBox
- Propagate slide size to per-section page_setup
- Preserve run font sizes (sz attribute → font_size_hundredths_pt)
- Parse paragraph algn + spcBef → IR alignment + space_before
- Picture shapes carry embed_rid + bytes + format resolved via
  pre-built media map
- PPTX font embedding under /ppt/fonts/
- Structured chart text extraction (parses <c:chart> nodes into
  per-chart text blocks rendered as ## Chart N in markdown)

# XLSX
- Per-worksheet page_setup round-trip via <pageMargins>/<pageSetup>
  with inch/mm/cm/paperSize parsing
- Preserve font sizes through IR; emit prose XLSX as paragraphs
  when a 1-column sheet has long-text cells
- Unique worksheet names in ir_to_xlsx
- New numfmt module: built-in IDs 0-44 (general, fixed, commas,
  percent, currency, scientific, accounting) + custom format
  strings (multi-section, [Red] color directive, currency prefix,
  quoted literal suffix, scale-by-thousand)
- Worksheet drawings: WorksheetPicture + WorksheetTextShape parsed
  from xl/drawings/, anchor coords in EMU
- Embedded fonts under /xl/fonts/

# IR enrichment
- New types: Shape, ShapeGeom, FramePosition; new ParagraphAlignment
  variant (Distribute)
- block_default centralisation (ThematicBreak → "---" / "<hr />",
  PageBreak/Shape invisible in flow, TextBox recursively renders
  children)
- New helpers: first_inline_font_size_pt, inline_to_element_block,
  build_nested_list (flat / 2-level / 3-level nesting)
- Heading carries frame_position + alignment
- Section.background_rgb propagated from PPTX slide background

# Writers
- DOCX: wire fontTable, heading styles, embed fonts, core props,
  dedup runs
- PPTX: cap slides at ~250 (PowerPoint hard limit), autoFit, set_title_aligned
- XLSX: split long paragraphs across cells; unique sheet names

# Refactors
- core: unified font embedding helper + cross-format font-size
  invariant (HalfPoint::from_word_sz / from_drawingml_sz)
- ir: consolidate inline_to_element_block / build_nested_list /
  first_inline_font_size_pt (used by all 3 IR converters)
- ir_render: extract block_default to centralise no-flow defaults
  (compiler-enforced exhaustiveness on new Element variants)

# Tests
- 98 new unit tests across the touched modules (core,
  xlsx/numfmt, xlsx/worksheet, docx/formatting, docx/mod,
  pptx/slide, ir, ir_render). All in-module #[cfg(test)] blocks;
  no new integration files.
- Final state: 535/535 tests pass on default, --features parallel,
  --features mmap, --features parallel,mmap

# Cleanup
- cargo fmt clean
- cargo clippy --workspace --all-targets -- -D warnings clean
- 0 build warnings
- maturin build (python feature) and wasm-pack build (wasm
  feature) both produce working packages; a Python smoke test
  verifies that Document / EditableDocument / XlsxWriter /
  PptxWriter / create_from_markdown are all functional
---
 CHANGELOG.md                          | 155 +++++
 Cargo.lock                            |   6 +-
 Cargo.toml                            |   2 +-
 bench_rust/Cargo.toml                 |   2 +-
 crates/office_oxide_cli/Cargo.toml    |   2 +-
 crates/office_oxide_mcp/Cargo.toml    |   2 +-
 csharp/OfficeOxide/OfficeOxide.csproj |   2 +-
 go/cmd/install/main.go                |   2 +-
 js/package-lock.json                  |   4 +-
 js/package.json                       |   2 +-
 pyproject.toml                        |   2 +-
 src/convert_doc.rs                    |   1 +
 src/convert_docx.rs                   | 494 +++++++++++++---
 src/convert_ppt.rs                    |   1 +
 src/convert_pptx.rs                   | 221 ++++---
 src/convert_xlsx.rs                   | 333 ++++++++++-
 src/core/core_properties.rs           | 162 ++++++
 src/core/embedded_fonts.rs            | 119 ++++
 src/core/mod.rs                       |   5 +
 src/core/opc.rs                       |  14 +
 src/core/relationships.rs             |  10 +
 src/core/units.rs                     |  75 +++
 src/create.rs                         | 790 +++++++++++++++++++++++---
 src/docx/document.rs                  |  11 +
 src/docx/formatting.rs                | 193 +++++++
 src/docx/image.rs                     |  71 ++-
 src/docx/mod.rs                       | 757 ++++++++++++++++++++++--
 src/docx/text.rs                      |  83 +++
 src/docx/write.rs                     | 284 ++++++++-
 src/ir.rs                             | 363 ++++++++++++
 src/ir_from_markdown.rs               |   1 +
 src/ir_render.rs                      | 247 ++++++--
 src/pptx/mod.rs                       |  97 +++-
 src/pptx/shape.rs                     |  25 +-
 src/pptx/slide.rs                     | 648 ++++++++++++++++++++-
 src/pptx/text.rs                      |  36 ++
 src/pptx/write.rs                     | 320 ++++++++++-
 src/xlsx/mod.rs                       | 698 ++++++++++++++++++++++-
 src/xlsx/numfmt.rs                    | 478 ++++++++++++++++
 src/xlsx/styles.rs                    |  46 +-
 src/xlsx/text.rs                      | 121 ++++
 src/xlsx/worksheet.rs                 | 465 +++++++++++++++
 src/xlsx/write.rs                     | 744 ++++++++++++++++++++++--
 tests/office_integration.rs           |  37 +-
 tests/write_integration.rs            |   1 +
 wasm-pkg/package.json                 |   2 +-
 46 files changed, 7601 insertions(+), 533 deletions(-)
 create mode 100644 src/core/core_properties.rs
 create mode 100644 src/core/embedded_fonts.rs
 create mode 100644 src/xlsx/numfmt.rs
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 45b7aee..a98f0f2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,161 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.2] - 2026-05-13
+
+> Round-trip fidelity, IR layout features, embedded fonts, XLSX number formatting, and an O(1) style-lookup perf win.
+
+### Performance
+
+- **XLSX styles**: cell-format lookups now use a `HashMap`, replacing
+  the linear `Vec` scan in `format_cell_value` / `is_date_cell`.
+  Per-cell formatting becomes O(1); large styled workbooks parse
+  noticeably faster with no API change.
+
+### Round-trip fidelity (PDF → office → PDF)
+
+- **Alignment, spacing, footers, and horizontal rules** preserved end-to-end
+  through both `to_docx` and `to_pptx` writers.
+- **Images, fonts, and column layouts** preserved across DOCX, PPTX, and
+  XLSX. Source-PDF font programs that previously registered as empty
+  subsets now embed correctly.
+- **`Element::ThematicBreak`** encoded in PPTX as a centered 30-char run
+  of `U+2500 BOX DRAWINGS LIGHT HORIZONTAL`. Downstream PDF renderers
+  detect the all-U+2500 content and re-emit a real horizontal rule.
+- **DOCX horizontal rules** recovered from the conventional encoding
+  (empty paragraph + `<w:pBdr><w:bottom/>`) back into `Element::ThematicBreak`.
+
+### DOCX
+
+- **`<w:framePr>` parsed into IR** as `FramePosition` (twips, page-anchored)
+  on both `Paragraph` and `Heading`. Used by layout-preserving paths
+  (e.g. pdf_oxide's `to_docx_bytes_layout`).
+- **Floating drawings and vector shapes**: `<wp:anchor>` images plus
+  `<wps:wsp>` preset shapes (line, rect) with stroke/fill RGB and
+  stroke width round-trip through `DrawingInfo`.
+- **Per-section page sizes** preserved through `to_ir`; multi-section IR
+  emits per-section `<w:sectPr>`.
+- **`<w:sz>` preserved** through to IR's `font_size_half_pt`.
+- **Headers and footers** now included in `to_markdown` and `to_ir`
+  (previously silently dropped).
+- **Embedded fonts** under `/word/fonts/` exposed on
+  `DocxDocument.embedded_fonts`. `strip_embedded_font_filename` recovers
+  the original face name from `font_<n>_<face>.<ext>` (fixes greedy
+  alphabetic-trim regression where `TeXGyreTermesX-` was returned
+  instead of `TeXGyreTermesX-Regular`).
+- **`parse_drawing` decomposed** into focused recursive helpers
+  (`parse_inline_or_anchor_body`, `parse_anchor_position`,
+  `parse_shape_properties`, etc.) for readability.
+- **Run-level `<w:rFonts w:ascii>` plumbed into `TextSpan.font_name`**;
+  `<w:cols>` propagated to `Section.columns`.
+
+### PPTX
+
+- **Pagination**: each slide forces a `SectionBreakType::NextPage` so two
+  slides never share a rendered page.
+- **Real Title+Body slide layout** emitted by the writer instead of a blank
+  layout, so PowerPoint shows placeholder hints in edit mode.
+- **Slide background**: `<p:cSld><p:bg><p:bgPr><a:solidFill><a:srgbClr>`
+  parsed into `Slide.background_rgb` and propagated to `Section.background_rgb`.
+- **Positioned text boxes**: shapes with explicit `<a:xfrm>` coordinates
+  wrap their content in `Element::TextBox` so downstream renderers can
+  place them at absolute EMU coordinates. Zero-size shapes skip the wrapper.
+- **Slide size → page setup**: `<p:sldSz cx=… cy=…>` propagated to each
+  section's `PageSetup`.
+- **Run font sizes preserved** via new `TextRun.font_size_hundredths_pt`
+  (parsed from `<a:rPr sz="…"/>`).
+- **Paragraph alignment** parsed from `<a:pPr algn="…"/>` (all five
+  variants: `l` / `ctr` / `r` / `just` / `dist`) into
+  `TextParagraph.alignment`. **Space-before** parsed from
+  `<a:spcBef><a:spcPts val=…/>`.
+- **Title alignment propagation**: `find_title` returns text + first
+  paragraph's alignment, seeding both `Section.title` and the synthesised
+  level-2 Heading's alignment.
+- **Picture shapes** now carry `embed_rid`, `data`, and `format`
+  (resolved via a pre-built media map at parse time, so the parallel
+  slide parser doesn't need the OPC reader).
+- **Font embedding** under `/ppt/fonts/`.
+- **Structured chart text extraction**: `<c:chart>` parts parsed into
+  per-chart text blocks rendered as `## Chart N` in markdown / search /
+  PDF without needing a graphical chart renderer.
+- **Compaction**: consecutive H1/H2 cover-page headings fold into one
+  slide instead of fragmenting; long XLSX paragraphs split across cells
+  to respect ~32k char-per-cell limits.
+- **Slide cap**: writer caps at ~250 slides (PowerPoint's hard limit).
+
+### XLSX
+
+- **Per-worksheet `page_setup`** round-trips via `<pageMargins>` (inches)
+  and `<pageSetup>` (paperWidth/paperHeight with mm/cm/in suffix or
+  `paperSize` enum 1–13). New `Worksheet.page_setup`.
+- **`numfmt` module** (`crate::xlsx::numfmt`): built-in IDs 0–44 (general,
+  fixed, commas, percent, currency, scientific, accounting) and a
+  simplified custom format-string parser (multi-section, `[Red]` color
+  directives stripped, currency prefix from `[$€-407]`, quoted literal
+  suffix, percent and thousands separators). Applied to numeric cells
+  during `format_cell_value` and `write_cell_value_fast`.
+- **Font sizes** preserved through IR; long-text single-column sheets
+  emit as paragraphs instead of a tall 1-column GFM table.
+- **Unique worksheet names** in `ir_to_xlsx` (duplicates suffixed with
+  `_2`, `_3`, …).
+- **Drawings**: `xl/drawings/drawingN.xml` parsed into
+  `Worksheet.images` (`WorksheetPicture` with EMU coords + bytes) and
+  `Worksheet.text_shapes` (`WorksheetTextShape` for layout-mode text
+  boxes from `to_xlsx_bytes_layout`).
+- **Embedded fonts** under `/xl/fonts/`.
+
+### IR enrichment
+
+- **New types**: `Shape` (vector shape anchored at absolute EMU coords),
+  `ShapeGeom` (`Line`, `Rect`), `FramePosition` (twip-anchored frame).
+- **`Heading`** gains `frame_position` + `alignment`.
+- **`Section`** gains `background_rgb`.
+- **`ParagraphAlignment`** gains the `Distribute` variant.
+- **`Element::Shape(Shape)`** variant for vector shapes.
+- **New helpers**: `first_inline_font_size_pt`, `inline_to_element_block`,
+  `build_nested_list` (flat / 2-level / 3-level recursion).
+- **Centralized defaults** in `ir_render::block_default`: ThematicBreak
+  renders as `"---"` / `<hr />`; PageBreak / ColumnBreak / Shape are
+  invisible in flow; TextBox / Footnote / Endnote recursively render
+  children. Adding a new `Element` variant forces a compile error
+  in `block_default::default_plain` instead of silent fallthrough.
+
+### Core
+
+- **`crate::core::core_properties`**: shared `docProps/core.xml` generator
+  used by all three writers. Emits `dc:title`, `dc:creator`, `dc:subject`,
+  `dc:description`, `cp:keywords`, `dcterms:created`, `dcterms:modified`
+  from the IR's `Metadata`. Empty fields are omitted entirely.
+- **`crate::core::embedded_fonts`**: unified font-embedding helper
+  (`write_embedded_fonts`, `sanitize_font_filename`). All three formats
+  share the layout `<prefix>font_<n>_<safe_name>.ttf`.
+- **`HalfPoint::from_word_sz` / `from_drawingml_sz` / `to_drawingml_sz` /
+  `from_points_rounded`**: cross-format font-size invariants
+  (DrawingML hundredths-of-a-point vs WML half-points).
+
+### Tests
+
+- **+98 unit tests** across the modules touched in this release:
+  `core::embedded_fonts`, `core::core_properties`, `core::units`,
+  `xlsx::numfmt`, `xlsx::worksheet`, `docx::formatting`, `docx::mod`,
+  `pptx::slide`, `ir`, `ir_render`.
+- **535 / 535 tests pass** across default, `--features parallel`,
+  `--features mmap`, and `--features parallel,mmap` builds.
+- `cargo fmt` clean. `cargo clippy --workspace --all-targets -- -D warnings`
+  clean.
+
+### Bindings
+
+- **Python wheel** (maturin, PyO3 0.28) builds cleanly and exposes
+  `Document`, `EditableDocument`, `XlsxWriter`, `PptxWriter`,
+  `OfficeOxideError`, `create_from_markdown`, `extract_text`,
+  `to_markdown`, `to_html`, `version`.
+- **WASM** package (`wasm-pack build --target web` / `nodejs` / `bundler`)
+  builds cleanly with `--features wasm`.
+- **C#** package bumped to 0.1.2 (csproj only — no API changes).
+
+[0.1.2]: https://github.com/yfedoseev/office_oxide/compare/v0.1.1...v0.1.2
+
 ## [0.1.1] - 2026-04-30
 
 > Richer IR type system, DOCX writer output, improved PPTX/XLSX IR renderers, and writer APIs in all language bindings
diff --git a/Cargo.lock b/Cargo.lock
index a965793..bc0b74b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -286,7 +286,7 @@ dependencies = [
 
 [[package]]
 name = "office_oxide"
-version = "0.1.1"
+version = "0.1.2"
 dependencies = [
  "atoi_simd",
  "encoding_rs",
@@ -307,7 +307,7 @@ dependencies = [
 
 [[package]]
 name = "office_oxide_cli"
-version = "0.1.1"
+version = "0.1.2"
 dependencies = [
  "clap",
  "office_oxide",
@@ -316,7 +316,7 @@ dependencies = [
 
 [[package]]
 name = "office_oxide_mcp"
-version = "0.1.1"
+version = "0.1.2"
 dependencies = [
  "office_oxide",
  "serde_json",
diff --git a/Cargo.toml b/Cargo.toml
index 0f79594..a6f0723 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,7 +20,7 @@ match_like_matches_macro = "allow"
 manual_find = "allow"
 
 [workspace.package]
-version = "0.1.1"
+version = "0.1.2"
 edition = "2024"
 license = "MIT OR Apache-2.0"
 repository = "https://github.com/yfedoseev/office_oxide"
diff --git a/bench_rust/Cargo.toml b/bench_rust/Cargo.toml
index 9d5cc8c..6cbedb5 100644
--- a/bench_rust/Cargo.toml
+++ b/bench_rust/Cargo.toml
@@ -5,7 +5,7 @@
 
 [package]
 name = "bench_rust"
-version = "0.1.1"
+version = "0.1.2"
 edition = "2021"
 
 [dependencies]
diff --git a/crates/office_oxide_cli/Cargo.toml b/crates/office_oxide_cli/Cargo.toml
index a2be3e8..d62f4a9 100644
--- a/crates/office_oxide_cli/Cargo.toml
+++ b/crates/office_oxide_cli/Cargo.toml
@@ -21,7 +21,7 @@ name = "office-oxide"
 path = "src/main.rs"
 
 [dependencies]
-office_oxide = { version = "0.1.1", path = "../.." }
+office_oxide = { version = "0.1.2", path = "../.." }
 clap = { version = "4", features = ["derive"] }
 serde_json = "1"
 
diff --git a/crates/office_oxide_mcp/Cargo.toml b/crates/office_oxide_mcp/Cargo.toml
index 8254454..1334937 100644
--- a/crates/office_oxide_mcp/Cargo.toml
+++ b/crates/office_oxide_mcp/Cargo.toml
@@ -21,7 +21,7 @@ name = "office-oxide-mcp"
 path = "src/main.rs"
 
 [dependencies]
-office_oxide = { version = "0.1.1", path = "../.." }
+office_oxide = { version = "0.1.2", path = "../.." }
 serde_json = "1"
 
 [package.metadata.binstall]
diff --git a/csharp/OfficeOxide/OfficeOxide.csproj b/csharp/OfficeOxide/OfficeOxide.csproj
index 302f567..21060fa 100644
--- a/csharp/OfficeOxide/OfficeOxide.csproj
+++ b/csharp/OfficeOxide/OfficeOxide.csproj
@@ -12,7 +12,7 @@
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
 
     <PackageId>OfficeOxide</PackageId>
-    <Version>0.1.1</Version>
+    <Version>0.1.2</Version>
     <Title>OfficeOxide</Title>
     <Authors>Yury Fedoseev</Authors>
     <Product>office_oxide</Product>
diff --git a/go/cmd/install/main.go b/go/cmd/install/main.go
index 4b790c0..7565f2e 100644
--- a/go/cmd/install/main.go
+++ b/go/cmd/install/main.go
@@ -32,7 +32,7 @@ import (
 )
 
 // Bumped in lockstep with the Rust crate.
-const defaultVersion = "0.1.1"
+const defaultVersion = "0.1.2"
 
 const releaseBase = "https://github.com/yfedoseev/office_oxide/releases/download"
 
diff --git a/js/package-lock.json b/js/package-lock.json
index 236fd78..95e1dc8 100644
--- a/js/package-lock.json
+++ b/js/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "office-oxide",
-  "version": "0.1.1",
+  "version": "0.1.2",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "office-oxide",
-      "version": "0.1.1",
+      "version": "0.1.2",
       "hasInstallScript": true,
       "license": "MIT OR Apache-2.0",
       "dependencies": {
diff --git a/js/package.json b/js/package.json
index e4d6389..e849d8b 100644
--- a/js/package.json
+++ b/js/package.json
@@ -1,6 +1,6 @@
 {
   "name": "office-oxide",
-  "version": "0.1.1",
+  "version": "0.1.2",
   "description": "Fast Office document processing (DOCX/XLSX/PPTX/DOC/XLS/PPT) for Node.js — native bindings backed by the Rust office_oxide library.",
   "license": "MIT OR Apache-2.0",
   "author": "Yury Fedoseev",
diff --git a/pyproject.toml b/pyproject.toml
index 80d9b37..17d1dd6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"
 
 [project]
 name = "office-oxide"
-version = "0.1.1"
+version = "0.1.2"
 description = "The fastest Office document processing library for Python — DOCX, XLSX, PPTX, DOC, XLS, PPT"
 requires-python = ">=3.8"
 license = { text = "MIT OR Apache-2.0" }
diff --git a/src/convert_doc.rs b/src/convert_doc.rs
index dc72ce4..1661f4a 100644
--- a/src/convert_doc.rs
+++ b/src/convert_doc.rs
@@ -30,6 +30,7 @@ pub(crate) fn doc_to_ir(doc: &crate::doc::DocDocument) -> DocumentIR {
                     bold: true,
                     ..TextSpan::plain(trimmed)
                 })],
+                ..Default::default()
             }));
         } else {
             elements.push(Element::Paragraph(Paragraph {
diff --git a/src/convert_docx.rs b/src/convert_docx.rs
index aee035a..6963279 100644
--- a/src/convert_docx.rs
+++ b/src/convert_docx.rs
@@ -2,38 +2,156 @@ use crate::format::DocumentFormat;
 use crate::ir::*;
 
 pub(crate) fn docx_to_ir(doc: &crate::docx::DocxDocument) -> DocumentIR {
-    let mut elements = Vec::new();
-    convert_block_elements(&doc.body.elements, &mut elements, doc);
-
-    // Extract title from first heading
-    let title = elements.iter().find_map(|e| {
-        if let Element::Heading(h) = e {
-            Some(
-                h.content
-                    .iter()
-                    .filter_map(|c| match c {
-                        InlineContent::Text(span) => Some(span.text.as_str()),
-                        _ => None,
-                    })
-                    .collect::<String>(),
-            )
+    // Build per-section block-element windows from `body.section_breaks`.
+    // Each break index is the exclusive end of one section. Trailing
+    // elements after the last break go into a final section described
+    // by the body-level `<w:sectPr>`.
+    let breaks = &doc.body.section_breaks;
+    let total = doc.body.elements.len();
+
+    let mut windows: Vec<(usize, usize)> = Vec::new();
+    let mut prev = 0;
+    for &b in breaks {
+        let end = b.min(total);
+        if end > prev {
+            windows.push((prev, end));
+        }
+        prev = end;
+    }
+    if prev < total || windows.is_empty() {
+        windows.push((prev, total));
+    }
+
+    // Bring page-level headers and footers into the IR. Without this any
+    // downstream renderer (PDF, search, plain-text) loses non-body content
+    // like "My header" / "My footer" / page numbers / running titles. The
+    // split between header and footer uses the section ref counts (same
+    // approach as `to_markdown`).
+    let n_header_refs: usize = doc.sections.iter().map(|s| s.header_refs.len()).sum();
+    let mut header_blocks: Vec<Element> = Vec::new();
+    let mut footer_blocks: Vec<Element> = Vec::new();
+    for (idx, hf) in doc.headers_footers.iter().enumerate() {
+        let mut tmp: Vec<Element> = Vec::new();
+        convert_block_elements(&hf.content, &mut tmp, doc);
+        if tmp
+            .iter()
+            .all(|e| matches!(e, Element::Paragraph(p) if p.content.is_empty()))
+        {
+            continue;
+        }
+        if idx < n_header_refs {
+            header_blocks.extend(tmp);
         } else {
-            None
+            footer_blocks.extend(tmp);
         }
-    });
+    }
+    let header = if header_blocks.is_empty() {
+        None
+    } else {
+        Some(HeaderFooter {
+            content: header_blocks,
+        })
+    };
+    let footer = if footer_blocks.is_empty() {
+        None
+    } else {
+        Some(HeaderFooter {
+            content: footer_blocks,
+        })
+    };
+
+    let mut ir_sections: Vec<Section> = Vec::with_capacity(windows.len());
+    let mut doc_title: Option<String> = None;
+
+    for (idx, (start, end)) in windows.iter().copied().enumerate() {
+        let mut elements = Vec::new();
+        convert_block_elements(&doc.body.elements[start..end], &mut elements, doc);
+
+        let title = elements.iter().find_map(|e| {
+            if let Element::Heading(h) = e {
+                Some(
+                    h.content
+                        .iter()
+                        .filter_map(|c| match c {
+                            InlineContent::Text(span) => Some(span.text.as_str()),
+                            _ => None,
+                        })
+                        .collect::<String>(),
+                )
+            } else {
+                None
+            }
+        });
+        if doc_title.is_none() {
+            doc_title = title.clone();
+        }
+
+        let page_setup = doc.sections.get(idx).map(section_props_to_page_setup);
+        // Propagate the multi-column layout out of the source DOCX so
+        // the IR carries `Section.columns` for the renderer. Without
+        // this, a PDF→DOCX→PDF round-trip of a 2-column source paper
+        // (arxiv preprints etc.) collapsed back to a single column on
+        // read because the column count was dropped at this hop.
+        let columns = doc
+            .sections
+            .get(idx)
+            .and_then(|sp| sp.columns)
+            .filter(|n| *n >= 2)
+            .map(|n| ColumnLayout {
+                count: n,
+                ..Default::default()
+            });
+
+        let break_type = if idx == 0 {
+            SectionBreakType::Continuous
+        } else {
+            SectionBreakType::NextPage
+        };
+
+        ir_sections.push(Section {
+            title,
+            elements,
+            page_setup,
+            break_type,
+            columns,
+            header: header.clone(),
+            footer: footer.clone(),
+            ..Default::default()
+        });
+    }
 
     DocumentIR {
         metadata: Metadata {
             format: DocumentFormat::Docx,
-            title: title.clone(),
+            title: doc_title,
             ..Default::default()
         },
-        sections: vec![Section {
-            title,
-            elements,
-            ..Default::default()
-        }],
+        sections: ir_sections,
+    }
+}
+
+fn section_props_to_page_setup(sp: &crate::docx::SectionProperties) -> PageSetup {
+    let mut ps = PageSetup::default();
+    if let Some(size) = &sp.page_size {
+        ps.width_twips = size.width.0.max(0) as u32;
+        ps.height_twips = size.height.0.max(0) as u32;
+        if let Some(crate::docx::PageOrientation::Landscape) = size.orient {
+            ps.landscape = true;
+        }
+    }
+    if let Some(m) = &sp.margins {
+        ps.margin_top_twips = m.top.0.max(0) as u32;
+        ps.margin_bottom_twips = m.bottom.0.max(0) as u32;
+        ps.margin_left_twips = m.left.0.max(0) as u32;
+        ps.margin_right_twips = m.right.0.max(0) as u32;
+        if let Some(h) = m.header {
+            ps.header_distance_twips = h.0.max(0) as u32;
+        }
+        if let Some(f) = m.footer {
+            ps.footer_distance_twips = f.0.max(0) as u32;
+        }
     }
+    ps
 }
 
 fn convert_block_elements(
@@ -58,15 +176,39 @@ fn convert_block_elements(
 
                 // Check for heading
                 let heading_level = resolve_heading_level(p, doc);
+                let alignment = paragraph_alignment(p);
+
+                // Detect "horizontal rule" encoding: empty paragraph
+                // with a single bottom border. pdf_to_ir round-trips
+                // ThematicBreak through DOCX as exactly this shape;
+                // recover it here so the renderer draws a rule.
+                let inline = convert_paragraph_inline(p, doc);
+                let is_empty_para = inline.iter().all(|ic| {
+                    matches!(ic,
+                        crate::ir::InlineContent::Text(s) if s.text.is_empty()
+                    )
+                });
+                let has_bottom_border = p
+                    .properties
+                    .as_ref()
+                    .is_some_and(|pp| pp.has_bottom_border);
+                if is_empty_para && has_bottom_border {
+                    elements.push(Element::ThematicBreak);
+                    i += 1;
+                    continue;
+                }
 
                 if let Some(level) = heading_level {
                     elements.push(Element::Heading(Heading {
                         level: (level + 1).min(6),
                         content: convert_paragraph_inline(p, doc),
+                        frame_position: paragraph_frame_position(p),
+                        alignment,
                     }));
                 } else {
                     // Check for page break in runs
                     let (before_break, has_break) = split_at_page_break(p, doc);
+                    let frame_pos = paragraph_frame_position(p);
                     if !before_break.is_empty() || !has_break {
                         elements.push(Element::Paragraph(Paragraph {
                             content: if before_break.is_empty() && !has_break {
@@ -74,6 +216,8 @@ fn convert_block_elements(
                             } else {
                                 before_break
                             },
+                            frame_position: frame_pos,
+                            alignment,
                             ..Default::default()
                         }));
                     }
@@ -81,6 +225,19 @@ fn convert_block_elements(
                         elements.push(Element::ThematicBreak);
                     }
                 }
+                // Promote any floating drawings (anchored images, vector
+                // shapes) embedded in this paragraph to paragraph-sibling
+                // IR elements so the positional renderer can lay them out
+                // alongside the text frame.
+                collect_paragraph_floats(p, doc, elements);
+                // Promote inline drawings (`<wp:inline>` wrapper) to
+                // paragraph-sibling Image elements as well. Without this
+                // every embedded raster image (e.g. logos, figures, the
+                // CFR federal seal) lost its bytes on the way through
+                // the IR — the inline-content model has no Image
+                // variant, so hoisting to a sibling Element is the
+                // only way to carry the bitmap forward.
+                collect_paragraph_inline_images(p, doc, elements);
                 i += 1;
             },
             crate::docx::BlockElement::Table(t) => {
@@ -91,6 +248,191 @@ fn convert_block_elements(
     }
 }
 
+// Doc notes for two helpers defined further down. They are plain comments
+// so rustdoc does not merge them into the docs of the function below.
+//
+// `paragraph_frame_position`: pull `<w:framePr>` data out of a paragraph's
+// properties into the IR position type. Returns `None` if the paragraph
+// isn't absolutely positioned (the common case).
+//
+// `collect_paragraph_floats`: walk a paragraph's runs and emit one IR
+// `Element` for every floating (anchored) drawing — both raster pictures
+// and vector `<wps:wsp>` shapes. Inline drawings are skipped there.
+// Promoting floats to paragraph siblings keeps the positional renderer
+// simple: it can iterate a flat element list and place each one at its
+// absolute coordinates.
+
+/// Walk a paragraph's runs (including runs inside hyperlinks) and push one
+/// `Element::Image` onto `out` for every inline drawing (`<wp:inline>`).
+/// Counterpart to `collect_paragraph_floats`, which handles anchored
+/// drawings. The IR's `InlineContent` enum has no Image variant, so inline
+/// drawings are hoisted to paragraph-sibling `Element::Image` nodes emitted
+/// right after the surrounding text paragraph. Drawings whose relationship
+/// id is empty or absent from `doc.images` are skipped.
+fn collect_paragraph_inline_images(
+    p: &crate::docx::Paragraph,
+    doc: &crate::docx::DocxDocument,
+    out: &mut Vec<Element>,
+) {
+    for pc in &p.content {
+        let runs: &[crate::docx::Run] = match pc {
+            crate::docx::ParagraphContent::Run(r) => std::slice::from_ref(r),
+            crate::docx::ParagraphContent::Hyperlink(hl) => &hl.runs,
+        };
+        for rc in runs.iter().flat_map(|run| &run.content) {
+            let crate::docx::RunContent::Drawing(d) = rc else {
+                continue;
+            };
+            // Anchored drawings are handled by `collect_paragraph_floats`;
+            // empty relationship ids are skipped before the image lookup.
+            if !d.inline || d.relationship_id.is_empty() {
+                continue;
+            }
+            let Some((data, ext)) = doc.images.get(&d.relationship_id).cloned() else {
+                continue;
+            };
+            let format = ext.as_deref().and_then(|e| match e.to_ascii_lowercase().as_str() {
+                "png" => Some(ImageFormat::Png),
+                "jpg" | "jpeg" => Some(ImageFormat::Jpeg),
+                "gif" => Some(ImageFormat::Gif),
+                _ => None,
+            });
+            out.push(Element::Image(Image {
+                alt_text: d.description.clone(),
+                data: Some(data),
+                format,
+                display_width_emu: Some(d.width.0.max(0) as u64),
+                display_height_emu: Some(d.height.0.max(0) as u64),
+                positioning: ImagePositioning::Inline,
+                ..Default::default()
+            }));
+        }
+    }
+}
+
+/// Push one IR element onto `out` for every floating (non-inline) drawing
+/// found in the paragraph's runs, including runs nested in hyperlinks.
+/// Each drawing is converted by `drawing_to_float_element`; drawings it
+/// returns `None` for are skipped. Inline drawings are ignored here.
+fn collect_paragraph_floats(
+    p: &crate::docx::Paragraph,
+    doc: &crate::docx::DocxDocument,
+    out: &mut Vec<Element>,
+) {
+    for item in &p.content {
+        let runs: &[crate::docx::Run] = match item {
+            crate::docx::ParagraphContent::Run(single) => std::slice::from_ref(single),
+            crate::docx::ParagraphContent::Hyperlink(link) => &link.runs,
+        };
+        let floats = runs
+            .iter()
+            .flat_map(|run| &run.content)
+            .filter_map(|rc| match rc {
+                crate::docx::RunContent::Drawing(d) if !d.inline => Some(d),
+                _ => None,
+            });
+        out.extend(floats.filter_map(|d| drawing_to_float_element(d, doc)));
+    }
+}
+
+/// Convert an anchored drawing into a positioned IR `Shape` or floating
+/// `Image`; `None` when it has no anchor position or no resolvable image.
+fn drawing_to_float_element(
+    d: &crate::docx::DrawingInfo,
+    doc: &crate::docx::DocxDocument,
+) -> Option<Element> {
+    use crate::docx::{AnchorFrame, ShapeKind};
+    let pos = d.anchor_position?;
+    let to_ir_anchor = |f: AnchorFrame| match f {
+        AnchorFrame::Page => FloatAnchor::Page,
+        AnchorFrame::Margin => FloatAnchor::Margin,
+        AnchorFrame::Column => FloatAnchor::Column,
+        AnchorFrame::Paragraph => FloatAnchor::Paragraph,
+        AnchorFrame::Line | AnchorFrame::Character => FloatAnchor::Page,
+    };
+    let h_anchor = to_ir_anchor(pos.h_relative_from);
+    let v_anchor = to_ir_anchor(pos.v_relative_from);
+    let width_emu = d.width.0.max(0) as u64;
+    let height_emu = d.height.0.max(0) as u64;
+
+    // Vector shapes win: a `<wps:wsp>` with `prstGeom` carries no `<a:blip>` (empty rId).
+    if let Some(shape) = &d.shape {
+        return Some(Element::Shape(Shape {
+            kind: match shape.kind {
+                ShapeKind::Line => ShapeGeom::Line,
+                ShapeKind::Rect => ShapeGeom::Rect,
+            },
+            x_emu: pos.x_emu,
+            y_emu: pos.y_emu,
+            width_emu,
+            height_emu,
+            h_anchor,
+            v_anchor,
+            stroke_rgb: shape.stroke_rgb.map(|(r, g, b)| [r, g, b]),
+            fill_rgb: shape.fill_rgb.map(|(r, g, b)| [r, g, b]),
+            stroke_w_emu: shape.stroke_w_emu,
+        }));
+    }
+
+    if d.relationship_id.is_empty() {
+        return None;
+    }
+    let (data, ext) = doc.images.get(&d.relationship_id).cloned()?;
+    let format = ext.as_deref().and_then(|e| match e.to_ascii_lowercase().as_str() {
+        "png" => Some(ImageFormat::Png),
+        "jpg" | "jpeg" => Some(ImageFormat::Jpeg),
+        "gif" => Some(ImageFormat::Gif),
+        _ => None,
+    });
+    Some(Element::Image(Image {
+        alt_text: d.description.clone(),
+        data: Some(data),
+        format,
+        display_width_emu: Some(width_emu),
+        display_height_emu: Some(height_emu),
+        positioning: ImagePositioning::Floating(FloatingImage {
+            x_emu: pos.x_emu,
+            y_emu: pos.y_emu,
+            width_emu,
+            height_emu,
+            h_anchor,
+            v_anchor,
+            text_wrap: TextWrap::default(),
+            allow_overlap: true,
+        }),
+        ..Default::default()
+    }))
+}
+
+/// Translate a paragraph's `<w:jc>` justification into the IR's
+/// `ParagraphAlignment`. Only `Left` (and a missing `<w:jc>`) collapses to
+/// `None`, so the renderer uses default left-alignment without an explicit
+/// override; `Both` maps to `Justify` and `Distribute` to `Distribute`.
+fn paragraph_alignment(p: &crate::docx::Paragraph) -> Option<ParagraphAlignment> {
+    let jc = p
+        .properties
+        .as_ref()
+        .and_then(|pp| pp.justification.as_ref())?;
+    match jc {
+        crate::docx::Justification::Center => Some(ParagraphAlignment::Center),
+        crate::docx::Justification::Right => Some(ParagraphAlignment::Right),
+        crate::docx::Justification::Both => Some(ParagraphAlignment::Justify),
+        crate::docx::Justification::Distribute => Some(ParagraphAlignment::Distribute),
+        crate::docx::Justification::Left => None,
+    }
+}
+
+/// Map a paragraph's `<w:framePr>` data to an IR `FramePosition`; `None` when it has none.
+fn paragraph_frame_position(p: &crate::docx::Paragraph) -> Option<FramePosition> {
+    let frame = p.properties.as_ref()?.frame_position.as_ref()?;
+    Some(FramePosition {
+        x_twips: frame.x_twips,
+        y_twips: frame.y_twips,
+        width_twips: frame.width_twips,
+        height_twips: frame.height_twips,
+    })
+}
+
 fn resolve_heading_level(
     p: &crate::docx::Paragraph,
     doc: &crate::docx::DocxDocument,
@@ -150,6 +492,35 @@ fn convert_run(
         .as_ref()
         .and_then(|rp| rp.strike.or(rp.dstrike))
         .unwrap_or(false);
+    // `<w:sz w:val="N"/>` is already in half-points; IR uses the
+    // same encoding. See `crate::core::units::HalfPoint::from_word_sz`
+    // for the cross-format invariant (also: PPTX hundredths-pt,
+    // XLSX points-as-f32 must convert here).
+    let font_size_half_pt = run.properties.as_ref().and_then(|rp| {
+        rp.font_size
+            .map(|hp| crate::core::units::HalfPoint::from_word_sz(hp.0).0)
+    });
+    // `<w:rFonts w:ascii="...">` carries the run's face name. Without
+    // forwarding it onto `TextSpan.font_name`, the IR→PDF renderer
+    // falls back to the page builder's default font (Helvetica) and
+    // every PDF→DOCX→PDF round-trip loses every typeface — even when
+    // the DOCX writer correctly embedded the source-PDF font program
+    // under `word/fonts/`.
+    let font_name = run.properties.as_ref().and_then(|rp| rp.font_name.clone());
+    // Propagate `<w:color w:val="RRGGBB"/>` so PDF→DOCX→PDF round-trips
+    // preserve coloured text (red "0" in `pdfs_pdfium/text_color.pdf`
+    // and the like). Theme / system / auto colours fall through to
+    // the renderer default for now — resolving them properly needs the
+    // document's `theme.xml`, which the current convert path doesn't
+    // thread in.
+    let text_color = run
+        .properties
+        .as_ref()
+        .and_then(|rp| rp.color.as_ref())
+        .and_then(|c| match c {
+            crate::core::theme::ColorRef::Rgb(rgb) => Some(rgb.0),
+            _ => None,
+        });
 
     for rc in &run.content {
         match rc {
@@ -160,6 +531,9 @@ fn convert_run(
                     italic,
                     strikethrough: strike,
                     hyperlink: hyperlink_url.map(|s| s.to_string()),
+                    font_size_half_pt,
+                    font_name: font_name.clone(),
+                    color: text_color,
                     ..Default::default()
                 }));
             },
@@ -175,10 +549,16 @@ fn convert_run(
                 content.push(InlineContent::Text(TextSpan::plain("\t")));
             },
             crate::docx::RunContent::Drawing(drawing) => {
-                // Emit as a separate image element — but we're in inline context,
-                // so we just note the alt text inline
-                if drawing.description.is_some() {
-                    content.push(InlineContent::Text(TextSpan::plain("")));
+                // Inline drawings handled at the paragraph level via
+                // `collect_paragraph_inline_images`. The inline-content
+                // model has no Image variant; hoisting here would
+                // require splitting paragraphs around each drawing,
+                // which loses spans. Just record alt text so the
+                // run's surrounding text doesn't lose semantic continuity.
+                if let Some(alt) = drawing.description.clone() {
+                    if !alt.is_empty() {
+                        content.push(InlineContent::Text(TextSpan::plain(alt)));
+                    }
                 }
             },
         }
@@ -263,65 +643,7 @@ fn convert_list_group(
     }
 
     // Build nested list structure from flat (ilvl, content) pairs
-    Element::List(build_nested_list(is_ordered, &items, 0))
-}
-
-fn inline_to_element(content: Vec<InlineContent>) -> Vec<Element> {
-    if content.is_empty() {
-        Vec::new()
-    } else {
-        vec![Element::Paragraph(Paragraph {
-            content,
-            ..Default::default()
-        })]
-    }
-}
-
-fn build_nested_list(ordered: bool, items: &[(u8, Vec<InlineContent>)], base_level: u8) -> List {
-    let mut list_items = Vec::new();
-    let mut idx = 0;
-
-    while idx < items.len() {
-        let (ilvl, content) = &items[idx];
-        if *ilvl == base_level {
-            // Collect any nested items immediately following at deeper levels
-            let mut nested = None;
-            let nested_start = idx + 1;
-            let mut nested_end = nested_start;
-            while nested_end < items.len() && items[nested_end].0 > base_level {
-                nested_end += 1;
-            }
-            if nested_end > nested_start {
-                nested = Some(build_nested_list(
-                    ordered,
-                    &items[nested_start..nested_end],
-                    base_level + 1,
-                ));
-            }
-            list_items.push(ListItem {
-                content: inline_to_element(content.clone()),
-                nested,
-            });
-            idx = if nested_end > nested_start {
-                nested_end
-            } else {
-                idx + 1
-            };
-        } else {
-            // Item at unexpected level — just add it flat
-            list_items.push(ListItem {
-                content: inline_to_element(content.clone()),
-                nested: None,
-            });
-            idx += 1;
-        }
-    }
-
-    List {
-        ordered,
-        items: list_items,
-        ..Default::default()
-    }
+    Element::List(crate::ir::build_nested_list(is_ordered, &items, 0))
 }
 
 // ---------------------------------------------------------------------------
diff --git a/src/convert_ppt.rs b/src/convert_ppt.rs
index 77d432d..83fec2d 100644
--- a/src/convert_ppt.rs
+++ b/src/convert_ppt.rs
@@ -26,6 +26,7 @@ pub(crate) fn ppt_to_ir(doc: &crate::ppt::PptDocument) -> DocumentIR {
                             bold: true,
                             ..TextSpan::plain(text)
                         })],
+                        ..Default::default()
                     }));
                 },
                 TextType::Body | TextType::HalfBody | TextType::QuarterBody => {
diff --git a/src/convert_pptx.rs b/src/convert_pptx.rs
index 937408e..96ec625 100644
--- a/src/convert_pptx.rs
+++ b/src/convert_pptx.rs
@@ -2,12 +2,42 @@ use crate::format::DocumentFormat;
 use crate::ir::*;
 
 pub(crate) fn pptx_to_ir(doc: &crate::pptx::PptxDocument) -> DocumentIR {
+    // Slide size sits at presentation level — every slide in the
+    // deck shares it. EMU → twips is /635 (914400 EMU per inch,
+    // 1440 twips per inch → 914400/1440 = 635).
+    let page_setup = doc.presentation.slide_size.as_ref().map(|sz| PageSetup {
+        width_twips: (sz.cx.max(0) / 635) as u32,
+        height_twips: (sz.cy.max(0) / 635) as u32,
+        landscape: sz.cx > sz.cy,
+        ..Default::default()
+    });
+
     let mut sections = Vec::new();
 
-    for slide in &doc.slides {
-        let title = find_title_text(&slide.shapes);
+    for slide in doc.slides.iter() {
+        let title_with_algn = find_title(&slide.shapes);
+        let title = title_with_algn.as_ref().map(|(t, _)| t.clone());
+        let title_alignment = title_with_algn.as_ref().and_then(|(_, a)| a.clone());
         let mut elements = Vec::new();
 
+        // Lead each slide with the title placeholder text as a
+        // heading so it has visible demarcation in the rendered
+        // PDF/HTML output. When the slide has no title we used to
+        // synthesise "Slide N" — that was useful for markdown anchors
+        // but pure visual noise in paginated output, where every
+        // slide already starts on its own page via the NextPage break.
+        // Worse, the synthesised heading rendered as 20 pt bold and
+        // contributed ~50 pt of fixed vertical overhead per section,
+        // which inflated PDF→PPTX→PDF round-trip page counts.
+        if let Some(ref t) = title {
+            elements.push(Element::Heading(Heading {
+                level: 2,
+                content: vec![InlineContent::Text(TextSpan::plain(t.clone()))],
+                alignment: title_alignment.clone(),
+                ..Default::default()
+            }));
+        }
+
         // Sort shapes spatially
         let mut shape_entries: Vec<(Option<&crate::pptx::ShapePosition>, &crate::pptx::Shape)> =
             Vec::new();
@@ -18,6 +48,11 @@ pub(crate) fn pptx_to_ir(doc: &crate::pptx::PptxDocument) -> DocumentIR {
             convert_shape(shape, &mut elements);
         }
 
+        // Propagate slide background colour to the section so the
+        // PDF renderer can paint a full-slide rectangle before laying
+        // down shapes.
+        let background_rgb = slide.background_rgb;
+
         // Add notes as paragraphs at end
         if let Some(ref notes) = slide.notes {
             if !notes.is_empty() {
@@ -28,9 +63,21 @@ pub(crate) fn pptx_to_ir(doc: &crate::pptx::PptxDocument) -> DocumentIR {
             }
         }
 
+        // Each PPTX slide is its own page when rendered to PDF or
+        // any paginated format. Default `Continuous` would let two
+        // slides share a page, which is wrong for slide content.
+        let break_type = if sections.is_empty() {
+            SectionBreakType::Continuous
+        } else {
+            SectionBreakType::NextPage
+        };
+
         sections.push(Section {
             title: title.clone(),
             elements,
+            break_type,
+            page_setup: page_setup.clone(),
+            background_rgb,
             ..Default::default()
         });
     }
@@ -86,7 +133,10 @@ fn is_title_placeholder(ph_type: Option<&str>) -> bool {
     matches!(ph_type, Some("title" | "ctrTitle"))
 }
 
-fn find_title_text(shapes: &[crate::pptx::Shape]) -> Option<String> {
+/// Locate the title placeholder and return its text together with the
+/// alignment of the first paragraph. Used by `pptx_to_ir` to seed both
+/// `Section.title` and the synthesised level-2 Heading's alignment.
+fn find_title(shapes: &[crate::pptx::Shape]) -> Option<(String, Option<ParagraphAlignment>)> {
     for shape in shapes {
         match shape {
             crate::pptx::Shape::AutoShape(auto)
@@ -98,13 +148,14 @@ fn find_title_text(shapes: &[crate::pptx::Shape]) -> Option<String> {
                 if let Some(ref tb) = auto.text_body {
                     let text = plain_text_from_body(tb);
                     if !text.is_empty() {
-                        return Some(text);
+                        let algn = tb.paragraphs.first().and_then(|p| p.alignment.clone());
+                        return Some((text, algn));
                     }
                 }
             },
             crate::pptx::Shape::Group(grp) => {
-                if let Some(title) = find_title_text(&grp.children) {
-                    return Some(title);
+                if let Some(t) = find_title(&grp.children) {
+                    return Some(t);
                 }
             },
             _ => {},
@@ -142,14 +193,36 @@ fn convert_shape(shape: &crate::pptx::Shape, elements: &mut Vec<Element>) {
             }
 
             if let Some(ref tb) = auto.text_body {
-                convert_text_body(tb, elements);
+                let mut inner = Vec::new();
+                convert_text_body(tb, &mut inner);
+                if inner.is_empty() {
+                    return;
+                }
+                push_positional_textbox(elements, inner, auto.position.as_ref());
             }
         },
         crate::pptx::Shape::Picture(pic) => {
-            elements.push(Element::Image(Image {
+            // Carry the resolved media bytes through so the PDF renderer
+            // (`render_pptx_textbox_content`) can paint the actual
+            // picture at its shape rectangle. `embed_rid` is preserved
+            // as alt-text fallback only when the relationship couldn't
+            // be resolved — we still want a placeholder element so the
+            // shape's position survives in plain-text / markdown output.
+            let format = pic.format.as_deref().and_then(image_format_from_ext);
+            let (display_w, display_h) = pic
+                .position
+                .as_ref()
+                .map(|p| (Some(p.cx.max(0) as u64), Some(p.cy.max(0) as u64)))
+                .unwrap_or((None, None));
+            let img_el = Element::Image(Image {
                 alt_text: pic.alt_text.clone(),
+                data: pic.data.clone(),
+                format,
+                display_width_emu: display_w,
+                display_height_emu: display_h,
                 ..Default::default()
-            }));
+            });
+            push_positional_textbox(elements, vec![img_el], pic.position.as_ref());
         },
         crate::pptx::Shape::Group(grp) => {
             for child in &grp.children {
@@ -158,13 +231,49 @@ fn convert_shape(shape: &crate::pptx::Shape, elements: &mut Vec<Element>) {
         },
         crate::pptx::Shape::GraphicFrame(gf) => {
             if let crate::pptx::GraphicContent::Table(ref tbl) = gf.content {
-                elements.push(convert_pptx_table(tbl));
+                let table_el = convert_pptx_table(tbl);
+                push_positional_textbox(elements, vec![table_el], gf.position.as_ref());
             }
         },
         crate::pptx::Shape::Connector(_) => {},
     }
 }
 
+/// Wrap a shape's converted IR content in a `TextBox` carrying its
+/// absolute `(x, y, cx, cy)` EMU rectangle. The PPTX renderer uses
+/// these coordinates to paint each shape at its source position
+/// instead of flowing them as a single long page.
+///
+/// When `position` is `None` or has a non-positive width or height,
+/// the inner content is appended to `elements` unwrapped, as flow
+/// elements, so plain-text / markdown rendering still sees it.
+fn push_positional_textbox(
+    elements: &mut Vec<Element>,
+    content: Vec<Element>,
+    position: Option<&crate::pptx::ShapePosition>,
+) {
+    // Wrap in `Element::TextBox` only when the source shape carried a
+    // *real* `<a:xfrm>`. Placeholders that inherit geometry from the
+    // slide layout parse as `ShapePosition { x: 0, y: 0, cx: 0, cy: 0 }`
+    // — wrapping those in TextBox tells the renderer "place this 0×0
+    // rectangle at (0, 0)" which collapses every paragraph onto the
+    // top-left corner. Any rectangle with `cx <= 0` or `cy <= 0` is
+    // treated as "no position" so the content flows normally instead.
+    let real_position = position.filter(|p| p.cx > 0 && p.cy > 0);
+    if let Some(pos) = real_position {
+        elements.push(Element::TextBox(TextBox {
+            content,
+            x_emu: Some(pos.x),
+            y_emu: Some(pos.y),
+            width_emu: Some(pos.cx.max(0) as u64),
+            height_emu: Some(pos.cy.max(0) as u64),
+            ..Default::default()
+        }));
+    } else {
+        elements.extend(content);
+    }
+}
+
 fn convert_text_body(body: &crate::pptx::TextBody, elements: &mut Vec<Element>) {
     // Check if any paragraph has level > 0 — treat as list
     let has_levels = body.paragraphs.iter().any(|p| p.level > 0);
@@ -175,13 +284,23 @@ fn convert_text_body(body: &crate::pptx::TextBody, elements: &mut Vec<Element>)
         for para in &body.paragraphs {
             items.push((para.level as u8, convert_text_paragraph_inline(para)));
         }
-        elements.push(Element::List(build_nested_list(false, &items, 0)));
+        elements.push(Element::List(crate::ir::build_nested_list(false, &items, 0)));
     } else {
         for para in &body.paragraphs {
             let content = convert_text_paragraph_inline(para);
-            if !content.is_empty() {
+            // Honour space_before from PPTX so spacer paragraphs
+            // emitted by pdf_to_ir round-trip with their full vertical
+            // gap. Convert hundredths-of-pt → twips: hundredths * 0.2
+            // (1pt = 20 twips, so pt*100 → twips = (pt*100)/5).
+            let space_before_twips = para.space_before_hundredths_pt.map(|h| h.div_ceil(5));
+            // Empty paragraphs serve as vertical spacers — keep them
+            // in the IR even when content is empty so the renderer
+            // can advance the cursor by the requested amount.
+            if !content.is_empty() || space_before_twips.is_some() {
                 elements.push(Element::Paragraph(Paragraph {
                     content,
+                    alignment: para.alignment.clone(),
+                    space_before_twips,
                     ..Default::default()
                 }));
             }
@@ -199,12 +318,19 @@ fn convert_text_paragraph_inline(para: &crate::pptx::TextParagraph) -> Vec<Inlin
                         crate::pptx::HyperlinkTarget::External(url) => Some(url.clone()),
                         crate::pptx::HyperlinkTarget::Internal(_) => None,
                     });
+                    let font_size_half_pt = run.font_size_hundredths_pt.map(|hp| {
+                        crate::core::units::HalfPoint::from_drawingml_sz(hp)
+                            .0
+                            .max(1)
+                    });
                     content.push(InlineContent::Text(TextSpan {
                         text: run.text.clone(),
                         bold: run.bold.unwrap_or(false),
                         italic: run.italic.unwrap_or(false),
                         strikethrough: run.strikethrough,
                         hyperlink,
+                        font_size_half_pt,
+                        color: run.color_rgb,
                         ..Default::default()
                     }));
                 }
@@ -222,59 +348,6 @@ fn convert_text_paragraph_inline(para: &crate::pptx::TextParagraph) -> Vec<Inlin
     content
 }
 
-fn inline_to_element(content: Vec<InlineContent>) -> Vec<Element> {
-    if content.is_empty() {
-        Vec::new()
-    } else {
-        vec![Element::Paragraph(Paragraph {
-            content,
-            ..Default::default()
-        })]
-    }
-}
-
-fn build_nested_list(ordered: bool, items: &[(u8, Vec<InlineContent>)], base_level: u8) -> List {
-    let mut list_items = Vec::new();
-    let mut idx = 0;
-
-    while idx < items.len() {
-        let (level, content) = &items[idx];
-        if *level <= base_level {
-            let nested_start = idx + 1;
-            let mut nested_end = nested_start;
-            while nested_end < items.len() && items[nested_end].0 > base_level {
-                nested_end += 1;
-            }
-            let nested = if nested_end > nested_start {
-                Some(build_nested_list(ordered, &items[nested_start..nested_end], base_level + 1))
-            } else {
-                None
-            };
-            list_items.push(ListItem {
-                content: inline_to_element(content.clone()),
-                nested,
-            });
-            idx = if nested_end > nested_start {
-                nested_end
-            } else {
-                idx + 1
-            };
-        } else {
-            list_items.push(ListItem {
-                content: inline_to_element(content.clone()),
-                nested: None,
-            });
-            idx += 1;
-        }
-    }
-
-    List {
-        ordered,
-        items: list_items,
-        ..Default::default()
-    }
-}
-
 fn convert_pptx_table(table: &crate::pptx::Table) -> Element {
     let mut ir_rows = Vec::new();
 
@@ -294,6 +367,7 @@ fn convert_pptx_table(table: &crate::pptx::Table) -> Element {
                     if !content.is_empty() {
                         cell_elements.push(Element::Paragraph(Paragraph {
                             content,
+                            alignment: para.alignment.clone(),
                             ..Default::default()
                         }));
                     }
@@ -320,3 +394,20 @@ fn convert_pptx_table(table: &crate::pptx::Table) -> Element {
         ..Default::default()
     })
 }
+
+/// Map a file extension (`"png"`, `"JPEG"`, `"emf"`, …), matched
+/// case-insensitively, to the matching `ImageFormat` variant. Used by
+/// `convert_shape` for a parsed PPTX `<p:pic>` whose media part the
+/// reader resolved into bytes + extension. Unknown extensions give `None`.
+fn image_format_from_ext(ext: &str) -> Option<ImageFormat> {
+    match ext.to_ascii_lowercase().as_str() {
+        "png" => Some(ImageFormat::Png),
+        "jpg" | "jpeg" => Some(ImageFormat::Jpeg),
+        "gif" => Some(ImageFormat::Gif),
+        "tif" | "tiff" => Some(ImageFormat::Tiff),
+        "bmp" => Some(ImageFormat::Bmp),
+        "emf" => Some(ImageFormat::Emf),
+        "wmf" => Some(ImageFormat::Wmf),
+        _ => None,
+    }
+}
diff --git a/src/convert_xlsx.rs b/src/convert_xlsx.rs
index 3c95d9e..5869963 100644
--- a/src/convert_xlsx.rs
+++ b/src/convert_xlsx.rs
@@ -1,50 +1,296 @@
 use crate::format::DocumentFormat;
 use crate::ir::*;
 
+/// Parse a 6-digit hex colour (`"FFA500"`, leading `#` allowed) into `[r, g, b]`, else `None`.
+fn parse_hex_rgb(s: &str) -> Option<[u8; 3]> {
+    let s = s.trim_start_matches('#');
+    if s.len() != 6 || !s.bytes().all(|b| b.is_ascii_hexdigit()) {
+        return None; // also rejects non-ASCII (would panic when sliced) and `+`-signed pairs
+    }
+    let r = u8::from_str_radix(&s[0..2], 16).ok()?;
+    let g = u8::from_str_radix(&s[2..4], 16).ok()?;
+    let b = u8::from_str_radix(&s[4..6], 16).ok()?;
+    Some([r, g, b])
+}
+
 pub(crate) fn xlsx_to_ir(doc: &crate::xlsx::XlsxDocument) -> DocumentIR {
-    let mut sections = Vec::new();
+    // Pre-compute date style indices once — avoids re-scanning format strings per cell.
+    let date_indices = doc.date_style_indices();
 
-    for ws in &doc.worksheets {
-        let mut rows = Vec::new();
+    // Single String buffer reused across all cells — clear() keeps the heap
+    // allocation; std::mem::take() moves it into TextSpan for non-empty cells.
+    let mut buf = String::new();
 
-        for (row_idx, row) in ws.rows.iter().enumerate() {
-            let mut cells = Vec::new();
+    let mut sections = Vec::new();
+
+    for (ws_idx, ws) in doc.worksheets.iter().enumerate() {
+        // First pass: format every row into a Vec of (text, style_index)
+        // pairs, one pair per cell, each carrying that cell's own style
+        // index. Prose mode later reads the first non-empty cell's index
+        // because that's what carries the font info we want to recover.
+        let mut parsed_rows: Vec<Vec<(String, Option<u32>)>> = Vec::with_capacity(ws.rows.len());
+        for row in &ws.rows {
+            let mut cells: Vec<(String, Option<u32>)> = Vec::with_capacity(row.cells.len());
             for cell in &row.cells {
-                let text = doc.format_cell_value(cell);
-                cells.push(TableCell {
-                    content: vec![Element::Paragraph(Paragraph {
-                        content: if text.is_empty() {
-                            Vec::new()
-                        } else {
-                            vec![InlineContent::Text(TextSpan::plain(text))]
-                        },
-                        ..Default::default()
-                    })],
-                    col_span: 1,
-                    row_span: 1,
-                    ..Default::default()
-                });
+                buf.clear();
+                doc.write_cell_value_fast(cell, &mut buf, &date_indices);
+                let text = if buf.is_empty() {
+                    String::new()
+                } else {
+                    std::mem::take(&mut buf)
+                };
+                cells.push((text, cell.style_index));
             }
+            // Drop trailing empty cells.
+            while cells.last().is_some_and(|(t, _)| t.is_empty()) {
+                cells.pop();
+            }
+            parsed_rows.push(cells);
+        }
 
-            rows.push(TableRow {
-                cells,
-                is_header: row_idx == 0,
+        // Decide row layout: a worksheet whose rows mostly have at most one
+        // non-empty cell is "document style" — flowing text laid out one
+        // paragraph per row. Render those rows as Paragraphs (not as a
+        // 1-column Table) so the downstream PDF renderer flows them like
+        // body text and honours per-paragraph font sizes.
+        //
+        // We choose Paragraph mode when ≥80 % of non-empty rows have ≤1
+        // non-empty cell. That's permissive enough to handle real
+        // worksheets that mostly hold prose but still emit a Table when a
+        // genuine grid is present.
+        let mut prose_score = 0usize;
+        let mut nonempty_rows = 0usize;
+        for cells in &parsed_rows {
+            let nc = cells.iter().filter(|(t, _)| !t.is_empty()).count();
+            if nc == 0 {
+                continue;
+            }
+            nonempty_rows += 1;
+            if nc <= 1 {
+                prose_score += 1;
+            }
+        }
+        let prose_mode = nonempty_rows >= 3 && prose_score * 100 >= nonempty_rows * 80;
+
+        // Materialise any pictures or text shapes anchored on the
+        // worksheet as positional IR elements so they survive the
+        // round-trip back to PDF. Pictures wrap an `Element::Image`
+        // in an `Element::TextBox`; text shapes wrap a styled
+        // paragraph the same way. The flow renderer then paints
+        // both at their absolute EMU rectangle (see
+        // `render_text_box`).
+        let mut image_elements: Vec<Element> =
+            Vec::with_capacity(ws.images.len() + ws.text_shapes.len());
+        for ts in &ws.text_shapes {
+            let mut span = TextSpan::plain(ts.text.clone());
+            if let Some(sz) = ts.font_size_pt {
+                span.font_size_half_pt =
+                    Some(crate::core::units::HalfPoint::from_points_rounded(sz as f64).0);
+            }
+            if ts.bold {
+                span.bold = true;
+            }
+            if ts.italic {
+                span.italic = true;
+            }
+            if let Some(ref hex) = ts.color_hex {
+                if let Some(rgb) = parse_hex_rgb(hex) {
+                    span.color = Some(rgb);
+                }
+            }
+            if let Some(ref f) = ts.font_name {
+                span.font_name = Some(f.clone());
+            }
+            let para = Element::Paragraph(Paragraph {
+                content: vec![InlineContent::Text(span)],
                 ..Default::default()
             });
+            image_elements.push(Element::TextBox(TextBox {
+                content: vec![para],
+                x_emu: Some(ts.x_emu),
+                y_emu: Some(ts.y_emu),
+                width_emu: Some(ts.cx_emu.max(0) as u64),
+                height_emu: Some(ts.cy_emu.max(0) as u64),
+                ..Default::default()
+            }));
         }
+        for pic in &ws.images {
+            let format = image_format_from_ext(&pic.format);
+            let img = Image {
+                alt_text: pic.alt_text.clone(),
+                data: Some(pic.data.clone()),
+                format,
+                display_width_emu: Some(pic.cx_emu.max(0) as u64),
+                display_height_emu: Some(pic.cy_emu.max(0) as u64),
+                ..Default::default()
+            };
+            // Wrap in TextBox so downstream renderers can paint at the
+            // exact (x_emu, y_emu) anchor instead of inline-after-text.
+            // When the source drawing was a cell-anchor and we
+            // couldn't resolve EMU coords (cx == 0), drop the wrap so
+            // the image flows inline at the section start.
+            if pic.cx_emu > 0 && pic.cy_emu > 0 {
+                image_elements.push(Element::TextBox(TextBox {
+                    content: vec![Element::Image(img)],
+                    x_emu: Some(pic.x_emu),
+                    y_emu: Some(pic.y_emu),
+                    width_emu: Some(pic.cx_emu.max(0) as u64),
+                    height_emu: Some(pic.cy_emu.max(0) as u64),
+                    ..Default::default()
+                }));
+            } else {
+                image_elements.push(Element::Image(img));
+            }
+        }
+
+        let elements = if prose_mode {
+            // Each row → one Paragraph. Pull font size from cell style if the
+            // worksheet's stylesheet is loaded. Skip empty rows entirely
+            // (they were just visual separators).
+            let mut out: Vec<Element> = Vec::new();
+            for cells in &parsed_rows {
+                // Find the first non-empty cell.
+                let Some((text, style_idx)) = cells.iter().find(|(t, _)| !t.is_empty()).cloned()
+                else {
+                    continue;
+                };
+                let mut span = TextSpan::plain(text);
+                if let Some(idx) = style_idx {
+                    if let Some(font) = font_for(doc, idx) {
+                        if let Some(size_pt) = font.size {
+                            // XLSX cell font size is in points (`<font><sz val="N"/>`
+                            // where N is f32). IR uses half-points; same
+                            // half-pt convention as DOCX/PPTX read paths.
+                            span.font_size_half_pt = Some(
+                                crate::core::units::HalfPoint::from_points_rounded(size_pt)
+                                    .0,
+                            );
+                        }
+                        if font.bold {
+                            span.bold = true;
+                        }
+                        if font.italic {
+                            span.italic = true;
+                        }
+                    }
+                }
+                out.push(Element::Paragraph(Paragraph {
+                    content: vec![InlineContent::Text(span)],
+                    ..Default::default()
+                }));
+            }
+            out
+        } else {
+            // Genuine grid → emit a Table.
+            let mut rows: Vec<TableRow> = Vec::with_capacity(parsed_rows.len());
+            for (row_idx, cells) in parsed_rows.iter().enumerate() {
+                let mut tcells: Vec<TableCell> = Vec::with_capacity(cells.len());
+                for (text, _) in cells {
+                    let content = if text.is_empty() {
+                        Vec::new()
+                    } else {
+                        vec![InlineContent::Text(TextSpan::plain(text.clone()))]
+                    };
+                    tcells.push(TableCell {
+                        content: vec![Element::Paragraph(Paragraph {
+                            content,
+                            ..Default::default()
+                        })],
+                        col_span: 1,
+                        row_span: 1,
+                        ..Default::default()
+                    });
+                }
+                rows.push(TableRow {
+                    cells: tcells,
+                    is_header: row_idx == 0,
+                    ..Default::default()
+                });
+            }
+            if rows.is_empty() {
+                Vec::new()
+            } else {
+                vec![Element::Table(Table {
+                    rows,
+                    ..Default::default()
+                })]
+            }
+        };
 
-        let elements = if rows.is_empty() {
-            Vec::new()
+        // Per-sheet page geometry (parsed from <pageMargins>/<pageSetup>).
+        // Default the margins back to 0.5"/0.5" (720 twips) when the source
+        // had no <pageMargins> — Excel's default 0.7"/0.75" is wider than
+        // we want for a tight PDF round-trip and would shrink the usable
+        // text area.
+        let page_setup = ws.page_setup.and_then(|wsp| {
+            // A worksheet that only had <pageMargins> (no dimensions) is
+            // treated as "no geometry" so the renderer keeps its
+            // OfficeConfig default page size.
+            if wsp.width_twips == 0 || wsp.height_twips == 0 {
+                return None;
+            }
+            Some(PageSetup {
+                width_twips: wsp.width_twips,
+                height_twips: wsp.height_twips,
+                margin_top_twips: wsp.margin_top_twips,
+                margin_bottom_twips: wsp.margin_bottom_twips,
+                margin_left_twips: wsp.margin_left_twips,
+                margin_right_twips: wsp.margin_right_twips,
+                header_distance_twips: wsp.header_distance_twips,
+                footer_distance_twips: wsp.footer_distance_twips,
+                landscape: wsp.landscape,
+            })
+        });
+
+        // Each XLSX worksheet renders to its own PDF page sequence, so
+        // mark every section after the first as a hard page break (same
+        // pattern as PPTX in convert_pptx.rs). Without this the second
+        // worksheet's content flows into the first sheet's last page.
+        let break_type = if ws_idx == 0 {
+            SectionBreakType::Continuous
         } else {
-            vec![Element::Table(Table {
-                rows,
-                ..Default::default()
-            })]
+            SectionBreakType::NextPage
         };
 
+        // Stitch worksheet pictures in front of cell-derived content
+        // so they paint underneath the text (positional TextBoxes are
+        // absolute regardless of order, but inline images render
+        // first). Empty `image_elements` means no drawing on this sheet.
+        let mut combined: Vec<Element> = image_elements;
+        combined.extend(elements);
+
         sections.push(Section {
             title: Some(ws.name.clone()),
-            elements,
+            elements: combined,
+            page_setup,
+            break_type,
+            ..Default::default()
+        });
+    }
+
+    // Append a section for chart content. We don't render charts as graphics;
+    // capturing their text (titles, axis labels, series names, cached values)
+    // ensures that all human-meaningful words in the workbook appear in the
+    // IR and downstream conversions, even when the chart itself isn't drawn.
+    if !doc.chart_text.is_empty() {
+        let mut chart_elements: Vec<Element> = Vec::new();
+        for (i, text) in doc.chart_text.iter().enumerate() {
+            chart_elements.push(Element::Heading(Heading {
+                level: 3,
+                content: vec![InlineContent::Text(TextSpan::plain(format!(
+                    "Chart {}",
+                    i + 1
+                )))],
+                ..Default::default()
+            }));
+            chart_elements.push(Element::Paragraph(Paragraph {
+                content: vec![InlineContent::Text(TextSpan::plain(text.clone()))],
+                ..Default::default()
+            }));
+        }
+        sections.push(Section {
+            title: Some("Charts".to_string()),
+            elements: chart_elements,
             ..Default::default()
         });
     }
@@ -60,3 +306,32 @@ pub(crate) fn xlsx_to_ir(doc: &crate::xlsx::XlsxDocument) -> DocumentIR {
         sections,
     }
 }
+
+/// Look up a cell's font through the workbook's stylesheet (if loaded).
+/// Returns `None` when `doc.styles` is `None` or when the stylesheet's
+/// own `font_for` yields nothing for `style_index`; callers then skip
+/// font recovery rather than mutate the document during a `&self` traversal.
+fn font_for(
+    doc: &crate::xlsx::XlsxDocument,
+    style_index: u32,
+) -> Option<&crate::xlsx::styles::Font> {
+    doc.styles.as_ref()?.font_for(style_index) // `?` short-circuits when no stylesheet is loaded
+}
+
+/// Map a lowercase file extension (`"png"`, `"jpeg"`, ...) to the
+/// matching `ImageFormat` variant. Mirrors the PPTX helper. Returns
+/// `None` for unrecognised (or non-lowercase) extensions; the match
+/// is exact, so the caller ends up with an image that has no format
+/// (NOTE(review): renderers presumably sniff the magic header — confirm).
+fn image_format_from_ext(ext: &str) -> Option<ImageFormat> {
+    match ext {
+        "png" => Some(ImageFormat::Png),
+        "jpg" | "jpeg" => Some(ImageFormat::Jpeg),
+        "gif" => Some(ImageFormat::Gif),
+        "tif" | "tiff" => Some(ImageFormat::Tiff),
+        "bmp" => Some(ImageFormat::Bmp),
+        "emf" => Some(ImageFormat::Emf),
+        "wmf" => Some(ImageFormat::Wmf),
+        _ => None, // anything else, including upper-case spellings
+    }
+}
diff --git a/src/core/core_properties.rs b/src/core/core_properties.rs
new file mode 100644
index 0000000..4580c88
--- /dev/null
+++ b/src/core/core_properties.rs
@@ -0,0 +1,162 @@
+//! Shared `docProps/core.xml` generator used by DOCX, PPTX, and XLSX
+//! writers. Emits the OOXML core-properties payload from the IR's
+//! `Metadata` so document title / author / subject / created /
+//! modified surface in Word, PowerPoint, and Excel "Properties"
+//! dialogs.
+
+use crate::ir::Metadata;
+use quick_xml::Writer;
+use quick_xml::events::{BytesDecl, BytesEnd, BytesStart, BytesText, Event};
+
+/// MIME content type for `docProps/core.xml`.
+pub const CONTENT_TYPE: &str = "application/vnd.openxmlformats-package.core-properties+xml";
+
+/// Generate the XML payload for `docProps/core.xml` as bytes, with
+/// two-space indentation. `None` fields, empty strings, and an empty
+/// keyword list are omitted entirely (no `<dc:title></dc:title>`).
+pub fn generate_xml(meta: &Metadata) -> Vec<u8> {
+    let mut w = Writer::new_with_indent(Vec::new(), b' ', 2); // in-memory sink
+    w.write_event(Event::Decl(BytesDecl::new("1.0", Some("UTF-8"), Some("yes"))))
+        .expect("decl"); // NOTE(review): sink is a Vec<u8>, so a write error is presumably impossible
+
+    let mut root = BytesStart::new("cp:coreProperties");
+    root.push_attribute((
+        "xmlns:cp",
+        "http://schemas.openxmlformats.org/package/2006/metadata/core-properties",
+    ));
+    root.push_attribute(("xmlns:dc", "http://purl.org/dc/elements/1.1/"));
+    root.push_attribute(("xmlns:dcterms", "http://purl.org/dc/terms/"));
+    root.push_attribute(("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"));
+    w.write_event(Event::Start(root)).expect("root");
+
+    write_text(&mut w, "dc:title", meta.title.as_deref());
+    write_text(&mut w, "dc:subject", meta.subject.as_deref());
+    write_text(&mut w, "dc:creator", meta.author.as_deref()); // IR `author` is emitted as dc:creator
+    write_text(&mut w, "dc:description", meta.description.as_deref());
+    if !meta.keywords.is_empty() { // no keywords: no <cp:keywords> element at all
+        write_text(&mut w, "cp:keywords", Some(meta.keywords.join(", ").as_str()));
+    }
+    write_dcterms(&mut w, "dcterms:created", meta.created.as_deref());
+    write_dcterms(&mut w, "dcterms:modified", meta.modified.as_deref());
+
+    w.write_event(Event::End(BytesEnd::new("cp:coreProperties")))
+        .expect("close");
+    w.into_inner() // hand back the accumulated bytes
+}
+
+fn write_text(w: &mut Writer<Vec<u8>>, tag: &str, value: Option<&str>) {
+    if let Some(v) = value { // `None` emits nothing
+        if v.is_empty() {
+            return; // empty string is treated like `None`: no element at all
+        }
+        w.write_event(Event::Start(BytesStart::new(tag.to_string())))
+            .expect("open");
+        w.write_event(Event::Text(BytesText::new(v))).expect("text"); // element body is `v`
+        w.write_event(Event::End(BytesEnd::new(tag.to_string())))
+            .expect("close");
+    }
+}
+
+fn write_dcterms(w: &mut Writer<Vec<u8>>, tag: &str, value: Option<&str>) {
+    if let Some(v) = value { // `None` emits nothing
+        if v.is_empty() {
+            return; // empty timestamp: omit the element
+        }
+        let mut elem = BytesStart::new(tag.to_string());
+        elem.push_attribute(("xsi:type", "dcterms:W3CDTF")); // type marker goes on the opening tag
+        w.write_event(Event::Start(elem)).expect("open");
+        w.write_event(Event::Text(BytesText::new(v))).expect("text"); // no date-format validation here
+        w.write_event(Event::End(BytesEnd::new(tag.to_string())))
+            .expect("close");
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::DocumentFormat;
+
+    fn meta_string(meta: &Metadata) -> String {
+        String::from_utf8(generate_xml(meta)).unwrap() // test helper: payload as text for `contains` checks
+    }
+
+    #[test]
+    fn empty_metadata_emits_only_root() {
+        let meta = Metadata {
+            format: DocumentFormat::Docx,
+            ..Default::default()
+        };
+        let xml = meta_string(&meta);
+        assert!(xml.contains("<cp:coreProperties"), "xml: {xml}");
+        assert!(!xml.contains("<dc:title"), "xml: {xml}");
+        assert!(!xml.contains("<dc:creator"), "xml: {xml}");
+        assert!(!xml.contains("<dcterms:created"), "xml: {xml}");
+    }
+
+    #[test]
+    fn title_and_author_are_emitted() {
+        let meta = Metadata {
+            format: DocumentFormat::Docx,
+            title: Some("Hello".into()),
+            author: Some("Yury".into()),
+            ..Default::default()
+        };
+        let xml = meta_string(&meta);
+        assert!(xml.contains("<dc:title>Hello</dc:title>"), "xml: {xml}");
+        assert!(xml.contains("<dc:creator>Yury</dc:creator>"), "xml: {xml}"); // author surfaces as dc:creator
+    }
+
+    #[test]
+    fn empty_string_field_is_omitted() {
+        let meta = Metadata {
+            format: DocumentFormat::Docx,
+            title: Some(String::new()),
+            author: Some("Someone".into()),
+            ..Default::default()
+        };
+        let xml = meta_string(&meta);
+        // `Some("")` is dropped just like `None`; the non-empty author is kept.
+        assert!(!xml.contains("<dc:title"), "xml: {xml}");
+        assert!(xml.contains("<dc:creator>Someone</dc:creator>"), "xml: {xml}");
+    }
+
+    #[test]
+    fn dcterms_carry_w3cdtf_type_attribute() {
+        let meta = Metadata {
+            format: DocumentFormat::Docx,
+            created: Some("2026-05-13T10:00:00Z".into()),
+            modified: Some("2026-05-13T11:00:00Z".into()),
+            ..Default::default()
+        };
+        let xml = meta_string(&meta);
+        assert!(xml.contains("xsi:type=\"dcterms:W3CDTF\""), "xml: {xml}");
+        assert!(xml.contains("2026-05-13T10:00:00Z"), "xml: {xml}");
+        assert!(xml.contains("2026-05-13T11:00:00Z"), "xml: {xml}");
+    }
+
+    #[test]
+    fn keywords_joined_with_comma() {
+        let meta = Metadata {
+            format: DocumentFormat::Docx,
+            keywords: vec!["rust".into(), "office".into(), "oxide".into()],
+            ..Default::default()
+        };
+        let xml = meta_string(&meta);
+        assert!(xml.contains("<cp:keywords>rust, office, oxide</cp:keywords>"), "xml: {xml}"); // separator is ", "
+    }
+
+    #[test]
+    fn no_keywords_omits_element() {
+        let meta = Metadata {
+            format: DocumentFormat::Docx,
+            ..Default::default()
+        };
+        let xml = meta_string(&meta);
+        assert!(!xml.contains("<cp:keywords"), "xml: {xml}");
+    }
+
+    #[test]
+    fn content_type_is_core_properties() {
+        assert!(CONTENT_TYPE.ends_with("core-properties+xml"));
+    }
+}
diff --git a/src/core/embedded_fonts.rs b/src/core/embedded_fonts.rs
new file mode 100644
index 0000000..23ac670
--- /dev/null
+++ b/src/core/embedded_fonts.rs
@@ -0,0 +1,119 @@
+//! Embedded font helpers shared by DOCX, PPTX, and XLSX writers.
+//!
+//! All three Office formats can carry TrueType / OpenType font programs
+//! inside their package so a downstream consumer can render with the
+//! original typeface even when the system doesn't have it installed.
+//! The OPC layout is identical across formats:
+//!
+//! - DOCX:  `/word/fonts/font_<n>_<safe_name>.ttf`
+//! - PPTX:  `/ppt/fonts/font_<n>_<safe_name>.ttf`
+//! - XLSX:  `/xl/fonts/font_<n>_<safe_name>.ttf`
+//!
+//! Other apps (Word, PowerPoint, Excel) require additional manifest
+//! plumbing (`<w:embeddedFontLst>`, `<p:embeddedFontLst>`, etc.) to
+//! actually pick up the embed; until that lands the in-process reader
+//! is the only consumer. It scans the `*/fonts/` directory directly,
+//! which is why the layout is uniform across formats.
+//!
+//! `sanitize_font_filename` strips characters that aren't legal in OPC
+//! part names so font names can be embedded into the path safely.
+
+use super::Result;
+use super::opc::{OpcWriter, PartName};
+use std::io::{Seek, Write};
+
+/// Generic content type for embedded font payloads. The package
+/// remains valid OPC even though Word/PowerPoint/Excel won't
+/// auto-discover the font without the per-format manifest entries.
+const FONT_CONTENT_TYPE: &str = "application/x-font-ttf";
+
+/// Make a font name safe to embed in an OPC part name
+/// (`/word/fonts/font_<n>_<safe_name>.ttf`). ASCII alphanumerics, `-`
+/// and `_` are kept; every other char is replaced by a single `_`
+/// (per char, not per byte), and only the first 40 chars are returned.
+pub fn sanitize_font_filename(name: &str) -> String {
+    name.chars()
+        .map(|c| {
+            if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
+                c
+            } else {
+                '_' // covers `/`, `.`, spaces, and all non-ASCII
+            }
+        })
+        .take(40) // clamp after mapping; an empty name stays empty
+        .collect()
+}
+
+/// Write the supplied font programs into the OPC package under the
+/// given path prefix (e.g. `/word/fonts/`, `/ppt/fonts/`, or
+/// `/xl/fonts/`). Each `(name, bytes)` entry becomes
+/// `<prefix>font_<n>_<safe_name>.ttf` with `n` starting at 1.
+///
+/// `prefix` must start and end with `/` (asserted in debug builds only).
+pub fn write_embedded_fonts<W: Write + Seek>(
+    opc: &mut OpcWriter<W>,
+    prefix: &str,
+    fonts: &[(String, Vec<u8>)],
+) -> Result<()> {
+    debug_assert!(prefix.starts_with('/') && prefix.ends_with('/'));
+    if !fonts.is_empty() {
+        // Register `ttf` once as a Default content-type entry. The
+        // per-part Overrides we emit alongside still take precedence
+        // at lookup; the Default just keeps OOXML SDK validators
+        // happy ("missing Default for extension ttf").
+        opc.register_default_content_type("ttf", FONT_CONTENT_TYPE);
+    }
+    for (idx, (name, data)) in fonts.iter().enumerate() {
+        let n = idx + 1; // part numbering is 1-based
+        let safe_name = sanitize_font_filename(name);
+        let target = format!("{prefix}font_{n}_{safe_name}.ttf"); // always `.ttf`, whatever the font bytes are
+        let part = PartName::new(&target)?; // a rejected part name aborts the whole write
+        opc.add_part(&part, FONT_CONTENT_TYPE, data)?; // as does a failed part write
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn sanitize_keeps_alphanumeric() {
+        assert_eq!(sanitize_font_filename("Calibri"), "Calibri");
+        assert_eq!(sanitize_font_filename("Arial123"), "Arial123");
+    }
+
+    #[test]
+    fn sanitize_keeps_dash_and_underscore() {
+        assert_eq!(sanitize_font_filename("Times-Roman"), "Times-Roman");
+        assert_eq!(sanitize_font_filename("TeXGyreTermesX-Regular"), "TeXGyreTermesX-Regular");
+        assert_eq!(sanitize_font_filename("my_font"), "my_font");
+    }
+
+    #[test]
+    fn sanitize_replaces_path_unsafe_chars() {
+        assert_eq!(sanitize_font_filename("Arial/Bold"), "Arial_Bold");
+        assert_eq!(sanitize_font_filename("a*b?c"), "a_b_c");
+        assert_eq!(sanitize_font_filename("Noto Sans"), "Noto_Sans");
+        assert_eq!(sanitize_font_filename("a.b"), "a_b"); // dots go too, so a name cannot add its own extension
+    }
+
+    #[test]
+    fn sanitize_replaces_non_ascii() {
+        // A non-ASCII letter such as 'é' becomes exactly one '_'.
+        assert_eq!(sanitize_font_filename("Café"), "Caf_");
+    }
+
+    #[test]
+    fn sanitize_clamps_to_40_chars() {
+        let long = "A".repeat(100);
+        let s = sanitize_font_filename(&long);
+        assert_eq!(s.len(), 40); // output is ASCII-only, so byte length equals char count
+        assert!(s.chars().all(|c| c == 'A'));
+    }
+
+    #[test]
+    fn sanitize_empty_input() {
+        assert_eq!(sanitize_font_filename(""), "");
+    }
+}
diff --git a/src/core/mod.rs b/src/core/mod.rs
index aa40ec9..14bd6ed 100644
--- a/src/core/mod.rs
+++ b/src/core/mod.rs
@@ -8,8 +8,13 @@
 
 /// `[Content_Types].xml` parsing and writing.
 pub mod content_types;
+/// Shared `docProps/core.xml` generator used by DOCX, PPTX, XLSX writers.
+pub mod core_properties;
 /// In-place editing of OPC packages (preserves unchanged parts).
 pub mod editable;
+/// Helpers for embedding TrueType / OpenType font programs in DOCX,
+/// PPTX, and XLSX packages.
+pub mod embedded_fonts;
 /// Core error type and `Result` alias used throughout OOXML parsing.
 pub mod error;
 /// OPC (Open Packaging Conventions) reader and writer for ZIP-based packages.
diff --git a/src/core/opc.rs b/src/core/opc.rs
index 307e9d3..16dc027 100644
--- a/src/core/opc.rs
+++ b/src/core/opc.rs
@@ -452,6 +452,20 @@ impl<W: Write + Seek> OpcWriter<W> {
         })
     }
 
+    /// Register a default `[Content_Types].xml` entry for a file
+    /// extension. Use this for parts whose content type is uniform
+    /// across the package (e.g. `ttf` for all embedded fonts, `png`
+    /// for all raster images). Default + Override is legal OOXML;
+    /// Override takes precedence at lookup time, so passing the same
+    /// content type to both is redundant but safe.
+    ///
+    /// Validators (Office Open XML SDK) flag packages that ship many
+    /// per-file overrides without a matching Default — emit defaults
+    /// for known-uniform extensions to satisfy them.
+    pub fn register_default_content_type(&mut self, extension: &str, content_type: &str) {
+        self.content_types.add_default(extension, content_type); // plain forward; nothing is validated here
+    }
+
     /// Add a part to the package.
     pub fn add_part(&mut self, name: &PartName, content_type: &str, data: &[u8]) -> Result<()> {
         // Register content type override
diff --git a/src/core/relationships.rs b/src/core/relationships.rs
index 77b9887..09dbaf0 100644
--- a/src/core/relationships.rs
+++ b/src/core/relationships.rs
@@ -33,9 +33,19 @@ pub mod rel_types {
     /// Relationship type for the font table.
     pub const FONT_TABLE: &str =
         "http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable";
+    /// Relationship type for an individual embedded font program (the
+    /// `<w:embedRegular r:id="…"/>` reference from `fontTable.xml` to a
+    /// `.ttf` part under `word/fonts/`).
+    pub const FONT: &str =
+        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/font";
     /// Relationship type for embedded images.
     pub const IMAGE: &str =
         "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
+    /// Relationship type for a SpreadsheetML / DrawingML drawing part
+    /// (`xl/drawings/drawingN.xml`). Worksheet-to-drawing rel; the
+    /// drawing itself owns IMAGE rels keyed by `<a:blip r:embed=...>`.
+    pub const DRAWING: &str =
+        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/drawing";
     /// Relationship type for hyperlinks.
     pub const HYPERLINK: &str =
         "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink";
diff --git a/src/core/units.rs b/src/core/units.rs
index 456de8e..0fc2502 100644
--- a/src/core/units.rs
+++ b/src/core/units.rs
@@ -100,6 +100,30 @@ impl HalfPoint {
     pub fn from_points(pt: f64) -> Self {
         Self((pt * 2.0) as u32)
     }
+
+    /// Convert points to half-points, rounding to nearest (exact ties round away from zero).
+    pub fn from_points_rounded(pt: f64) -> Self {
+        Self((pt * 2.0).round() as u32)
+    }
+
+    /// Construct from WordProcessingML's `<w:sz w:val="N"/>`. The
+    /// attribute value is already in half-points, so this simply wraps
+    /// the given `u32` unchanged.
+    pub fn from_word_sz(half_pt: u32) -> Self {
+        Self(half_pt)
+    }
+
+    /// Construct from DrawingML's `<a:rPr sz="N"/>`. The attribute
+    /// value is in *hundredths of a point* (sz=1200 → 12 pt). Half-pt
+    /// = hundredths / 50, truncated by integer division (sz=125 → 2).
+    pub fn from_drawingml_sz(hundredths_pt: u32) -> Self {
+        Self(hundredths_pt / 50)
+    }
+
+    /// Reverse of [`Self::from_drawingml_sz`]: half-points × 50, saturating at `u32::MAX`.
+    pub fn to_drawingml_sz(self) -> u32 {
+        self.0.saturating_mul(50) // plain `* 50` panics in debug / wraps in release on overflow
+    }
 }
 
 /// Percentage * 1000 (e.g., 50% = 50_000, 100% = 100_000). ST_Percentage in OOXML.
@@ -184,6 +208,57 @@ mod tests {
         assert_eq!(from.0, 20);
     }
 
+    #[test]
+    fn half_point_from_points_rounded() {
+        // 10.1pt → 20.2 half-pts → rounds to 20.
+        assert_eq!(HalfPoint::from_points_rounded(10.1).0, 20);
+        // 10.3pt → 20.6 half-pts → rounds to 21.
+        assert_eq!(HalfPoint::from_points_rounded(10.3).0, 21);
+        // Ties round away from zero (`f64::round`), not banker's rules; check whole-pt.
+        assert_eq!(HalfPoint::from_points_rounded(12.0).0, 24);
+        // Compare with truncating from_points: 10.49 → trunc 20, round 21.
+        assert_eq!(HalfPoint::from_points(10.49).0, 20);
+        assert_eq!(HalfPoint::from_points_rounded(10.49).0, 21);
+    }
+
+    #[test]
+    fn half_point_from_word_sz() {
+        // <w:sz w:val="24"/> means 24 half-points → 12pt.
+        let sz = HalfPoint::from_word_sz(24);
+        assert_eq!(sz.0, 24);
+        assert!((sz.to_points() - 12.0).abs() < f64::EPSILON);
+    }
+
+    #[test]
+    fn half_point_from_drawingml_sz() {
+        // <a:rPr sz="1200"/> means 1200 hundredths of a point → 12pt → 24 half-pts.
+        let sz = HalfPoint::from_drawingml_sz(1200);
+        assert_eq!(sz.0, 24);
+        assert!((sz.to_points() - 12.0).abs() < f64::EPSILON);
+        // 1800 hundredths → 18pt → 36 half-pts.
+        assert_eq!(HalfPoint::from_drawingml_sz(1800).0, 36);
+        // Below half-point granularity (sz=125 → 2.5 half-pt) truncates.
+        assert_eq!(HalfPoint::from_drawingml_sz(125).0, 2);
+    }
+
+    #[test]
+    fn half_point_to_drawingml_sz() {
+        // 24 half-pts (=12pt) → 1200 hundredths.
+        assert_eq!(HalfPoint(24).to_drawingml_sz(), 1200);
+        assert_eq!(HalfPoint(36).to_drawingml_sz(), 1800);
+    }
+
+    #[test]
+    fn half_point_drawingml_round_trip() {
+        for hundredths in [100u32, 600, 1100, 1200, 1800, 2400, 3600] {
+            let hp = HalfPoint::from_drawingml_sz(hundredths);
+            // Round-trip is lossless when hundredths is divisible by 50.
+            if hundredths % 50 == 0 {
+                assert_eq!(hp.to_drawingml_sz(), hundredths, "input {hundredths}");
+            }
+        }
+    }
+
     #[test]
     fn percentage_conversions() {
         let fifty = Percentage1000(50_000);
diff --git a/src/create.rs b/src/create.rs
index faaa6a3..f8e56dc 100644
--- a/src/create.rs
+++ b/src/create.rs
@@ -91,7 +91,9 @@ pub fn create_from_ir_to_writer<W: Write + Seek>(
 // DOCX conversion
 // ---------------------------------------------------------------------------
 
-fn ir_to_docx(ir: &DocumentIR) -> crate::docx::write::DocxWriter {
+/// Build a `DocxWriter` from `DocumentIR`, exposed so callers can embed
+/// extra parts (fonts, custom metadata) before serialization.
+pub fn ir_to_docx(ir: &DocumentIR) -> crate::docx::write::DocxWriter {
     use crate::docx::write::{DocxWriter, IrParaProps, Run};
 
     let mut writer = DocxWriter::new();
@@ -179,6 +181,7 @@ fn add_element_to_docx(writer: &mut crate::docx::write::DocxWriter, elem: &Eleme
             let runs: Vec<Run> = ir_inline_to_runs(&h.content);
             let props = IrParaProps {
                 style: Some(format!("Heading{level}")),
+                alignment: h.alignment.clone(),
                 ..Default::default()
             };
             writer.add_ir_paragraph(&runs, Some(props));
@@ -218,8 +221,28 @@ fn add_element_to_docx(writer: &mut crate::docx::write::DocxWriter, elem: &Eleme
             writer.add_ir_image(img);
         },
         Element::ThematicBreak => {
-            // Emitted as a blank paragraph (no visual rule; full border support is a future enhancement).
-            let props = IrParaProps::default();
+            // Emit as a blank paragraph with a single bottom border —
+            // the conventional DOCX representation of a horizontal
+            // rule. Word displays this as a thin black line under
+            // the paragraph; on PDF→DOCX→IR re-parse the renderer
+            // detects "empty paragraph with bottom-border-only" and
+            // draws a horizontal rule.
+            let border = crate::ir::ParagraphBorder {
+                top: None,
+                left: None,
+                right: None,
+                between: None,
+                bottom: Some(crate::ir::BorderLine {
+                    style: crate::ir::BorderStyle::Single,
+                    color: Some([0, 0, 0]),
+                    size: Some(6),
+                    space: Some(1),
+                }),
+            };
+            let props = IrParaProps {
+                border: Some(border),
+                ..Default::default()
+            };
             writer.add_ir_paragraph(&[], Some(props));
         },
         Element::PageBreak => {
@@ -240,6 +263,11 @@ fn add_element_to_docx(writer: &mut crate::docx::write::DocxWriter, elem: &Eleme
         Element::CodeBlock(cb) => {
             writer.add_code_block(&cb.content);
         },
+        Element::Shape(_) => {
+            // Vector shapes are written directly by the layout-preserving
+            // DOCX writer (`pdf_oxide::converters::docx_layout`), not via
+            // the markdown→IR→DOCX pipeline.
+        },
     }
 }
 
@@ -284,22 +312,160 @@ fn ir_inline_to_runs(content: &[InlineContent]) -> Vec<crate::docx::write::Run>
             },
         }
     }
-    runs
+    coalesce_runs(runs)
+}
+
+/// Merge adjacent text runs that share identical run properties so the
+/// emitted DOCX has one `<w:r>` per styling region instead of one per
+/// PDF span. PDF text extraction returns ~1 span per word; without
+/// this pass the document.xml balloons (~5× over the merged form),
+/// search/replace breaks across word boundaries, and screen readers
+/// stutter.
+///
+/// Footnote/endnote reference runs and runs whose text is exactly
+/// `"\n"` are never merged (they carry semantic markers that must
+/// stay in their own `<w:r>` for Word to recognise them).
+fn coalesce_runs(runs: Vec<crate::docx::write::Run>) -> Vec<crate::docx::write::Run> {
+    use crate::docx::write::Run;
+    let mut out: Vec<Run> = Vec::with_capacity(runs.len());
+    for r in runs {
+        let mergeable = r.footnote_ref.is_none() && r.endnote_ref.is_none() && r.text != "\n";
+        if mergeable {
+            if let Some(last) = out.last_mut() {
+                if last.footnote_ref.is_none()
+                    && last.endnote_ref.is_none()
+                    && last.text != "\n"
+                    && run_props_equal(last, &r)
+                {
+                    last.text.push_str(&r.text);
+                    continue;
+                }
+            }
+        }
+        out.push(r);
+    }
+    out
+}
+
+/// Compare two runs' style properties with `==`, field by field;
+/// `text`, `footnote_ref` and `endnote_ref` are not compared.
+fn run_props_equal(a: &crate::docx::write::Run, b: &crate::docx::write::Run) -> bool {
+    a.bold == b.bold
+        && a.italic == b.italic
+        && a.underline == b.underline
+        && a.underline_style == b.underline_style
+        && a.strikethrough == b.strikethrough
+        && a.color == b.color
+        && a.color_rgb == b.color_rgb
+        && a.font_size_pt == b.font_size_pt
+        && a.font_size_half_pt == b.font_size_half_pt
+        && a.font_name == b.font_name
+        && a.highlight == b.highlight
+        && a.vertical_align == b.vertical_align
+        && a.all_caps == b.all_caps
+        && a.small_caps == b.small_caps
+        && a.char_spacing_half_pt == b.char_spacing_half_pt
 }
 
 // ---------------------------------------------------------------------------
 // XLSX conversion
 // ---------------------------------------------------------------------------
 
-fn ir_to_xlsx(ir: &DocumentIR) -> crate::xlsx::write::XlsxWriter {
-    use crate::xlsx::write::CellData;
+/// Sanitise a worksheet name and ensure it doesn't clash with names
+/// already in `used`. Excel limits names to 31 chars and forbids
+/// `:\/?*[]` (each replaced with `_`) plus the reserved name "History".
+/// An empty, reserved, or already-taken candidate falls back to
+/// `Sheet{idx}`, bumping the number until it is unused. NOTE(review):
+/// the `used` check is case-sensitive; Excel's is not — confirm.
+fn unique_sheet_name(raw: &str, idx: usize, used: &std::collections::HashSet<String>) -> String {
+    fn sanitise(s: &str) -> String {
+        let mut out = String::with_capacity(s.len().min(31));
+        for ch in s.chars() {
+            if matches!(ch, ':' | '\\' | '/' | '?' | '*' | '[' | ']') {
+                out.push('_');
+            } else {
+                out.push(ch);
+            }
+            if out.chars().count() >= 31 {
+                break;
+            }
+        }
+        out.trim().to_string()
+    }
+    let candidate = sanitise(raw);
+    if !candidate.is_empty()
+        && !candidate.eq_ignore_ascii_case("history")
+        && !used.contains(&candidate)
+    {
+        return candidate;
+    }
+    // Fall back to indexed name.
+    let mut fallback = format!("Sheet{idx}");
+    let mut bump = idx;
+    while used.contains(&fallback) {
+        bump += 1;
+        fallback = format!("Sheet{bump}");
+    }
+    fallback
+}
+
+/// Build an `XlsxWriter` from `DocumentIR`. Public so callers can embed
+/// extra parts (fonts, custom metadata) before serialization. Mirrors
+/// `ir_to_docx` and `ir_to_pptx`.
+pub fn ir_to_xlsx(ir: &DocumentIR) -> crate::xlsx::write::XlsxWriter {
+    use crate::xlsx::write::{CellData, CellStyle};
 
     let mut writer = crate::xlsx::write::XlsxWriter::new();
+    writer.set_metadata(&ir.metadata);
+
+    // Sheet names must be unique within a workbook (ECMA-376) and Excel
+    // additionally rejects names > 31 chars, names containing `:\/?*[]`,
+    // and the literal "History". We sanitise + de-duplicate by falling
+    // back to `Sheet<N>` (N starting at the 1-based section index) when
+    // a title would clash (or when there's no usable title at all).
+    let mut used_names: std::collections::HashSet<String> = std::collections::HashSet::new();
+    for (idx, section) in ir.sections.iter().enumerate() {
+        // Prefer the section's title; failing that, use the first
+        // heading inside the section so each tab gets a meaningful
+        // label (e.g. "1 Introduction", "Abstract") instead of the
+        // anonymous "Sheet1..N".
+        let raw_owned = section
+            .title
+            .clone()
+            .or_else(|| first_heading_text(&section.elements))
+            .unwrap_or_default();
+        let raw = raw_owned.as_str();
+        let name = unique_sheet_name(raw, idx + 1, &used_names);
+        used_names.insert(name.clone());
+        let mut sheet = writer.add_sheet(&name);
+
+        // Propagate per-section page geometry so a PDF→XLSX→PDF round
+        // trip preserves the source MediaBox. Without this each
+        // worksheet falls back to default Letter portrait and a long
+        // PDF (134 / 660 pages) flows onto far fewer pages because the
+        // renderer uses a different page size on read-back.
+        if let Some(ps) = section.page_setup.as_ref() {
+            sheet.set_page_setup(crate::xlsx::write::PageSetup {
+                width_twips: ps.width_twips,
+                height_twips: ps.height_twips,
+                margin_top_twips: ps.margin_top_twips,
+                margin_bottom_twips: ps.margin_bottom_twips,
+                margin_left_twips: ps.margin_left_twips,
+                margin_right_twips: ps.margin_right_twips,
+                header_distance_twips: ps.header_distance_twips,
+                footer_distance_twips: ps.footer_distance_twips,
+                landscape: ps.landscape,
+            });
+        }
 
-    for section in &ir.sections {
-        let name = section.title.as_deref().unwrap_or("Sheet");
-        let mut sheet = writer.add_sheet(name);
         let mut row_cursor = 0usize;
+        // Body paragraphs that aren't part of a table get split across
+        // multiple rows when long, so a page-of-prose stays readable
+        // instead of piling 1500 chars into a single clipped cell. We
+        // also widen column A so the resulting rows have somewhere to
+        // breathe. Short paragraphs (≤ 80 chars) and headings stay in
+        // a single cell to preserve their visual identity.
+        let mut body_paragraphs_seen = false;
 
         for elem in &section.elements {
             match elem {
@@ -335,31 +501,220 @@ fn ir_to_xlsx(ir: &DocumentIR) -> crate::xlsx::write::XlsxWriter {
                 Element::Paragraph(p) => {
                     let text = inline_to_text(&p.content);
                     if !text.is_empty() {
-                        sheet.set_cell(row_cursor, 0, CellData::String(text));
-                        row_cursor += 1;
+                        body_paragraphs_seen = true;
+                        // Persist the IR paragraph's font size onto the cell.
+                        // This is what allows a PDF→IR→XLSX→IR→PDF round-trip
+                        // to recover the original 9–10 pt body size instead of
+                        // falling back to the 12 pt default and inflating the
+                        // page count.
+                        let mut style = CellStyle::new();
+                        if let Some(size_pt) = crate::ir::first_inline_font_size_pt(&p.content) {
+                            style = style.font_size(size_pt);
+                        }
+                        if let Some(name) = first_inline_font_name(&p.content) {
+                            style = style.font_name(name);
+                        }
+                        for line in split_paragraph_for_xlsx(&text) {
+                            sheet.set_cell_styled(
+                                row_cursor,
+                                0,
+                                CellData::String(line),
+                                style.clone(),
+                            );
+                            row_cursor += 1;
+                        }
+                    }
+                },
+                Element::Image(img) => {
+                    // Anchor any image carried by the IR onto this
+                    // worksheet. EMU coordinates default to (0, 0) when
+                    // the IR didn't carry per-image positioning — the
+                    // round-trip still recovers the bytes, just stacked
+                    // at the sheet origin. When position-aware writers
+                    // wrap images in TextBox the outer branch below
+                    // unwraps the EMU coords.
+                    if let (Some(data), Some(fmt)) = (&img.data, &img.format) {
+                        let cx = img.display_width_emu.unwrap_or(3_000_000) as i64;
+                        let cy = img.display_height_emu.unwrap_or(2_000_000) as i64;
+                        sheet.add_image(data.clone(), fmt.extension(), 0, 0, cx, cy);
+                    }
+                },
+                Element::TextBox(tb) => {
+                    // Positional wrapper: when the IR places an image
+                    // inside a TextBox (PDF→IR can carry shape coords
+                    // that way), forward the inner image bytes with the
+                    // TextBox's anchor.
+                    let x = tb.x_emu.unwrap_or(0);
+                    let y = tb.y_emu.unwrap_or(0);
+                    let cx = tb.width_emu.unwrap_or(0) as i64;
+                    let cy = tb.height_emu.unwrap_or(0) as i64;
+                    for inner in &tb.content {
+                        if let Element::Image(img) = inner {
+                            if let (Some(data), Some(fmt)) = (&img.data, &img.format) {
+                                let icx = if cx > 0 {
+                                    cx
+                                } else {
+                                    img.display_width_emu.unwrap_or(3_000_000) as i64
+                                };
+                                let icy = if cy > 0 {
+                                    cy
+                                } else {
+                                    img.display_height_emu.unwrap_or(2_000_000) as i64
+                                };
+                                sheet.add_image(data.clone(), fmt.extension(), x, y, icx, icy);
+                            }
+                        }
                     }
                 },
                 Element::Heading(h) => {
                     let text = inline_to_text(&h.content);
                     if !text.is_empty() {
-                        sheet.set_cell(row_cursor, 0, CellData::String(text));
+                        let data = CellData::String(text);
+                        let mut style = CellStyle::new().bold();
+                        if let Some(size_pt) = crate::ir::first_inline_font_size_pt(&h.content) {
+                            style = style.font_size(size_pt);
+                        }
+                        if let Some(name) = first_inline_font_name(&h.content) {
+                            style = style.font_name(name);
+                        }
+                        sheet.set_cell_styled(row_cursor, 0, data, style);
                         row_cursor += 1;
                     }
                 },
                 _ => {},
             }
         }
+
+        // If we emitted any body paragraphs (rather than just tables)
+        // widen column A so multi-line prose has somewhere to render.
+        // Tables manage their own per-column widths above so we leave
+        // those alone.
+        if body_paragraphs_seen {
+            sheet.set_column_width(0, 80.0);
+        }
     }
 
     writer
 }
 
+/// Split a long paragraph for XLSX rendering, preferring the first
+/// sentence boundary at or after 120 chars. Short paragraphs (≤ 80
+/// chars) pass through as a single chunk to keep their compact look.
+///
+/// Operates on `char_indices` throughout so the byte indices we slice
+/// at are always valid UTF-8 boundaries — paragraphs from PDFs often
+/// contain multi-byte glyphs (mathematical italic, accented Latin,
+/// CJK) and naive byte arithmetic blows up on them.
+fn split_paragraph_for_xlsx(text: &str) -> Vec<String> {
+    const SHORT_THRESHOLD: usize = 80;
+    const TARGET_LINE_LEN: usize = 120;
+    const SCAN_BACK_CHARS: usize = 60;
+
+    if text.chars().count() <= SHORT_THRESHOLD {
+        return vec![text.to_string()];
+    }
+
+    // Pre-compute char positions so all slicing happens on boundaries.
+    let chars: Vec<(usize, char)> = text.char_indices().collect();
+    let total_chars = chars.len();
+    let total_bytes = text.len();
+
+    let mut chunks: Vec<String> = Vec::new();
+    let mut char_start: usize = 0; // index into `chars`
+
+    while char_start < total_chars {
+        let remaining_chars = total_chars - char_start;
+        if remaining_chars <= TARGET_LINE_LEN {
+            let head_byte = chars[char_start].0;
+            let tail = text[head_byte..].trim();
+            if !tail.is_empty() {
+                chunks.push(tail.to_string());
+            }
+            break;
+        }
+
+        // The "minimum break point" is char_start + TARGET_LINE_LEN.
+        let min_break_char = char_start + TARGET_LINE_LEN;
+        let scan_back_char = min_break_char
+            .saturating_sub(SCAN_BACK_CHARS)
+            .max(char_start);
+
+        // Find a sentence boundary: a `.` followed by ` ` followed by
+        // an uppercase ASCII letter. Prefer breaks at or after the
+        // target, then fall back to one slightly before.
+        let mut break_char: Option<usize> = None;
+
+        // Pass 1: at-or-after the cap.
+        for i in min_break_char..total_chars.saturating_sub(2) {
+            if chars[i].1 == '.' && chars[i + 1].1 == ' ' && chars[i + 2].1.is_ascii_uppercase() {
+                break_char = Some(i + 2); // start of the next sentence
+                break;
+            }
+        }
+
+        // Pass 2: before the cap, within scan_back window.
+        if break_char.is_none() {
+            for i in scan_back_char..min_break_char.saturating_sub(2).max(scan_back_char) {
+                if i + 2 >= total_chars {
+                    break;
+                }
+                if chars[i].1 == '.' && chars[i + 1].1 == ' ' && chars[i + 2].1.is_ascii_uppercase()
+                {
+                    break_char = Some(i + 2);
+                }
+            }
+        }
+
+        // Pass 3: next whitespace at-or-after the cap.
+        if break_char.is_none() {
+            for i in min_break_char..total_chars {
+                if chars[i].1 == ' ' {
+                    break_char = Some(i + 1);
+                    break;
+                }
+            }
+        }
+
+        let next_char = break_char.unwrap_or(total_chars);
+        let head_byte = chars[char_start].0;
+        let tail_byte = if next_char >= total_chars {
+            total_bytes
+        } else {
+            chars[next_char].0
+        };
+        let head = text[head_byte..tail_byte].trim();
+        if !head.is_empty() {
+            chunks.push(head.to_string());
+        }
+
+        // Advance past any leading whitespace on the tail (we already
+        // trimmed `head`, but `next_char` may sit right at the space).
+        let mut cs = next_char;
+        while cs < total_chars && chars[cs].1 == ' ' {
+            cs += 1;
+        }
+        if cs <= char_start {
+            // Defensive: ensure forward progress.
+            cs = char_start + 1;
+        }
+        char_start = cs;
+    }
+
+    if chunks.is_empty() {
+        chunks.push(text.to_string());
+    }
+    chunks
+}
+
 // ---------------------------------------------------------------------------
 // PPTX conversion
 // ---------------------------------------------------------------------------
 
-fn ir_to_pptx(ir: &DocumentIR) -> crate::pptx::write::PptxWriter {
+/// Build a `PptxWriter` from `DocumentIR`. Public so callers can embed
+/// extra parts (fonts, custom metadata) before serialization.
+pub fn ir_to_pptx(ir: &DocumentIR) -> crate::pptx::write::PptxWriter {
     let mut writer = crate::pptx::write::PptxWriter::new();
+    writer.set_metadata(&ir.metadata);
 
     if let Some(ps) = ir.sections.iter().find_map(|s| s.page_setup.as_ref()) {
         let cx = ps.width_twips as u64 * 914_400 / 1440;
@@ -367,85 +722,327 @@ fn ir_to_pptx(ir: &DocumentIR) -> crate::pptx::write::PptxWriter {
         writer.set_presentation_size(cx, cy);
     }
 
-    for section in &ir.sections {
-        let slide = writer.add_slide();
+    // PowerPoint shows a "found a problem with content. Do you want to
+    // repair?" dialog and renders Slide Sorter very slowly when a deck
+    // exceeds ~250 slides. For large PDFs (e.g. a 660-page CFR) the
+    // historical 1-section-per-slide mapping produces decks that hit
+    // both issues. When the IR has more sections than the threshold
+    // we collapse consecutive sections into heading-bounded chunks of
+    // at most ~12 paragraphs each and cap the total slide count.
+    const MAX_SLIDES: usize = 250;
+    const MAX_PARAGRAPHS_PER_SLIDE: usize = 12;
 
-        if let Some(ref title) = section.title {
-            if !title.is_empty() {
-                slide.set_title(title);
-            }
+    if ir.sections.len() <= MAX_SLIDES {
+        for section in &ir.sections {
+            emit_pptx_slide_from_section(&mut writer, section);
         }
+    } else {
+        emit_pptx_slides_compacted(&mut writer, ir, MAX_SLIDES, MAX_PARAGRAPHS_PER_SLIDE);
+    }
 
-        for elem in &section.elements {
-            match elem {
-                Element::Heading(h) => {
-                    if slide.title.is_none() {
-                        slide.set_title(&inline_to_text(&h.content));
-                    } else {
-                        let runs = inline_to_pptx_runs(&h.content);
-                        if !runs.is_empty() {
-                            slide.add_rich_text(&runs);
-                        }
-                    }
-                },
-                Element::Paragraph(p) => {
-                    let runs = inline_to_pptx_runs(&p.content);
-                    if !runs.is_empty() {
-                        slide.add_rich_text(&runs);
-                    }
-                },
-                Element::List(l) => {
-                    let items: Vec<String> = l
-                        .items
+    writer
+}
+
+/// One IR section → one slide. Used for "small" decks where 1:1 paging
+/// is still viable.
+fn emit_pptx_slide_from_section(writer: &mut crate::pptx::write::PptxWriter, section: &Section) {
+    let slide = writer.add_slide();
+
+    if let Some(ref title) = section.title {
+        if !title.is_empty() {
+            slide.set_title(title);
+        }
+    }
+
+    for elem in &section.elements {
+        emit_pptx_element(slide, elem);
+    }
+}
+
+/// Marker text used to encode `Element::ThematicBreak` through PPTX
+/// round-trip. The PPTX paragraph format has no `<a:pPr>` border the
+/// way DOCX `<w:pBdr>` does; emitting a thin connector shape would
+/// position the rule absolutely on the slide (wrong for flow
+/// content). Instead we emit a centered paragraph of U+2500 (BOX
+/// DRAWINGS LIGHT HORIZONTAL) characters; the pdf_oxide renderer
+/// detects this exact pattern and re-emits a real
+/// `page.horizontal_rule()`. Plain enough that any other consumer
+/// (PowerPoint itself, a markdown export, a screen reader) sees a
+/// visible horizontal-rule glyph string and treats it as a
+/// separator.
+pub(crate) const PPTX_THEMATIC_BREAK_MARKER: &str = "\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}\u{2500}";
+
+fn emit_pptx_element(slide: &mut crate::pptx::write::SlideData, elem: &Element) {
+    match elem {
+        Element::ThematicBreak => {
+            // Encode via the marker text + center alignment. The
+            // pdf_oxide renderer recognises the all-U+2500 content
+            // and draws a real `page.horizontal_rule()` instead of
+            // rendering the box-drawing glyphs.
+            let runs = vec![crate::pptx::write::Run::new(PPTX_THEMATIC_BREAK_MARKER)];
+            slide.add_rich_text_aligned(&runs, Some(ParagraphAlignment::Center));
+        },
+        Element::Heading(h) => {
+            if slide.title.is_none() {
+                slide.set_title_aligned(&inline_to_text(&h.content), h.alignment.clone());
+            } else {
+                let runs = inline_to_pptx_runs(&h.content);
+                if !runs.is_empty() {
+                    slide.add_rich_text_aligned(&runs, h.alignment.clone());
+                }
+            }
+        },
+        Element::Paragraph(p) => {
+            let runs = inline_to_pptx_runs(&p.content);
+            // Always emit, including for runs.is_empty() — empty
+            // spacer paragraphs (used by pdf_to_ir to preserve large
+            // vertical gaps on source cover pages) need to round-trip
+            // through PPTX as empty <a:p> elements so the rendered
+            // PPTX→IR→PDF cycle reproduces the source's vertical
+            // rhythm. `space_before_twips` from IR (twips) is
+            // converted to PPTX `<a:spcPts>` hundredths-of-pt:
+            // 1 twip = 1/1440 in = 1/20 pt → twips * 5 = pt*100.
+            let space_before_hundredths_pt = p.space_before_twips.map(|t| t * 5);
+            let props = crate::pptx::write::ParaProps {
+                alignment: p.alignment.clone(),
+                space_before_hundredths_pt,
+            };
+            slide.add_rich_text_with_props(&runs, props);
+        },
+        Element::List(l) => {
+            let items: Vec<String> = l
+                .items
+                .iter()
+                .map(|i| {
+                    i.content
                         .iter()
-                        .map(|i| {
-                            i.content
-                                .iter()
-                                .map(|e| match e {
-                                    Element::Paragraph(p) => inline_to_text(&p.content),
-                                    _ => String::new(),
-                                })
-                                .collect::<Vec<_>>()
-                                .join(" ")
+                        .map(|e| match e {
+                            Element::Paragraph(p) => inline_to_text(&p.content),
+                            _ => String::new(),
                         })
-                        .collect();
-                    let item_refs: Vec<&str> = items.iter().map(|s| s.as_str()).collect();
-                    slide.add_bullet_list(&item_refs);
-                },
-                Element::Table(t) => {
-                    let text = t
-                        .rows
+                        .collect::<Vec<_>>()
+                        .join(" ")
+                })
+                .collect();
+            let item_refs: Vec<&str> = items.iter().map(|s| s.as_str()).collect();
+            slide.add_bullet_list(&item_refs);
+        },
+        Element::Table(t) => {
+            let text = t
+                .rows
+                .iter()
+                .map(|row| {
+                    row.cells
                         .iter()
-                        .map(|row| {
-                            row.cells
-                                .iter()
-                                .map(cell_text)
-                                .collect::<Vec<_>>()
-                                .join("\t")
-                        })
+                        .map(cell_text)
                         .collect::<Vec<_>>()
-                        .join("\n");
-                    if !text.is_empty() {
-                        slide.add_text(&text);
+                        .join("\t")
+                })
+                .collect::<Vec<_>>()
+                .join("\n");
+            if !text.is_empty() {
+                slide.add_text(&text);
+            }
+        },
+        Element::Image(img) => {
+            if let (Some(data), Some(fmt)) = (&img.data, &img.format) {
+                let cx = img.display_width_emu.unwrap_or(3_000_000);
+                let cy = img.display_height_emu.unwrap_or(2_000_000);
+                slide.add_image(data.clone(), fmt.clone(), 0, 0, cx, cy);
+            }
+        },
+        Element::CodeBlock(cb) => {
+            let run = crate::pptx::write::Run::new(&cb.content).font("Courier New");
+            slide.add_rich_text(&[run]);
+        },
+        _ => {},
+    }
+}
+
+/// Heading-aware compaction for large IR section lists.
+///
+/// Strategy:
+/// 1. Build a flat list of `(title, elements)` "groups" where every
+///    H1/H2 boundary starts a new group and the section's own
+///    elements between headings are concatenated.
+/// 2. Each group becomes one or more slides, splitting at paragraph
+///    boundaries when the body exceeds `max_paragraphs_per_slide`.
+/// 3. After collecting candidate slides, if we still exceed
+///    `max_slides`, fold trailing slides into the previous one until
+///    the cap is met (preserves earlier headings/structure).
+fn emit_pptx_slides_compacted(
+    writer: &mut crate::pptx::write::PptxWriter,
+    ir: &DocumentIR,
+    max_slides: usize,
+    max_paragraphs_per_slide: usize,
+) {
+    // Step 1: build heading-bounded groups. Each group's title is
+    // (text, optional alignment); the alignment flows through to
+    // `slide.set_title_aligned` in step 4 so source-PDF cover-page
+    // headings keep their original alignment (typically Center).
+    type TitleWithAlgn = (String, Option<ParagraphAlignment>);
+    let mut groups: Vec<(Option<TitleWithAlgn>, Vec<Element>)> = Vec::new();
+    let mut current_title: Option<TitleWithAlgn> = None;
+    let mut current_elems: Vec<Element> = Vec::new();
+    // Tracks whether the current group has accumulated any genuine
+    // body content (non-heading element). When false, an incoming
+    // H1/H2 is folded into the current slide as a subtitle instead of
+    // starting a new one. This prevents cover pages — where each
+    // title-block line is promoted to a heading by `pdf_to_ir` — from
+    // exploding into one title-only slide per line.
+    let mut current_has_body = false;
+
+    let flush = |groups: &mut Vec<(Option<TitleWithAlgn>, Vec<Element>)>,
+                 title: &mut Option<TitleWithAlgn>,
+                 elems: &mut Vec<Element>| {
+        if !elems.is_empty() || title.is_some() {
+            groups.push((title.take(), std::mem::take(elems)));
+        }
+    };
+
+    // Whether an element constitutes "body content" for compaction
+    // purposes. Cover pages typically begin with a logo or seal Image
+    // and a list of centered headings; flipping `current_has_body` on
+    // the leading Image causes the first heading to fall into the
+    // "real new section" branch and strand the image as a title-less
+    // slide. Only text-bearing elements should anchor a slide as
+    // having body content. Empty paragraphs used as vertical spacers
+    // (no runs, no border) are skipped — they're layout glue, not
+    // content; counting them as body causes cover pages to split
+    // mid-block when pdf_to_ir injects gap spacers.
+    fn is_body_content(elem: &Element) -> bool {
+        match elem {
+            // A paragraph anchors a slide only when one of its text
+            // runs is non-empty; run-less spacers do not count.
+            Element::Paragraph(para) => para
+                .content
+                .iter()
+                .any(|inline| matches!(inline, InlineContent::Text(run) if !run.text.is_empty())),
+            // Lists, code blocks and tables always count as body.
+            Element::List(_) | Element::CodeBlock(_) | Element::Table(_) => true,
+            _ => false,
+        }
+    }
+
+    for section in &ir.sections {
+        for elem in &section.elements {
+            if let Element::Heading(h) = elem {
+                if h.level <= 2 {
+                    let text = inline_to_text(&h.content);
+                    let trimmed = text.trim();
+                    if trimmed.is_empty() {
+                        continue;
                     }
-                },
-                Element::Image(img) => {
-                    if let (Some(data), Some(fmt)) = (&img.data, &img.format) {
-                        let cx = img.display_width_emu.unwrap_or(3_000_000);
-                        let cy = img.display_height_emu.unwrap_or(2_000_000);
-                        slide.add_image(data.clone(), fmt.clone(), 0, 0, cx, cy);
+
+                    if !current_has_body {
+                        // Cover-page fold: keep all consecutive
+                        // headings on the same slide. First heading
+                        // owns the slide title; subsequent headings
+                        // become bold paragraphs so they stay visible.
+                        if current_title.is_none() {
+                            current_title = Some((trimmed.to_string(), h.alignment.clone()));
+                        } else {
+                            let mut span = TextSpan::plain(trimmed.to_string());
+                            span.bold = true;
+                            current_elems.push(Element::Paragraph(Paragraph {
+                                content: vec![InlineContent::Text(span)],
+                                alignment: h.alignment.clone(),
+                                ..Default::default()
+                            }));
+                        }
+                        continue;
                     }
-                },
-                Element::CodeBlock(cb) => {
-                    let run = crate::pptx::write::Run::new(&cb.content).font("Courier New");
-                    slide.add_rich_text(&[run]);
-                },
-                _ => {},
+
+                    // Real new section: flush and open a new group.
+                    flush(&mut groups, &mut current_title, &mut current_elems);
+                    current_has_body = false;
+                    current_title = Some((trimmed.to_string(), h.alignment.clone()));
+                    continue;
+                }
+            }
+            current_elems.push(elem.clone());
+            if is_body_content(elem) {
+                current_has_body = true;
             }
         }
     }
+    flush(&mut groups, &mut current_title, &mut current_elems);
 
-    writer
+    // NOTE(review): with zero or one group, rebuild a single untitled
+    // group from the raw element stream. Without any H1/H2 this equals
+    // what the loop above already produced; with exactly one titled
+    // group it drops the slide title and cover-fold paragraphs and
+    // re-emits the original Heading elements — confirm that is intended.
+    if groups.len() <= 1 {
+        let mut all_elems: Vec<Element> = Vec::new();
+        for section in &ir.sections {
+            for elem in &section.elements {
+                all_elems.push(elem.clone());
+            }
+        }
+        groups = vec![(None, all_elems)];
+    }
+
+    // Step 2: split each group into slide-sized chunks.
+    struct PendingSlide {
+        title: Option<(String, Option<ParagraphAlignment>)>,
+        elements: Vec<Element>,
+    }
+    let mut pending: Vec<PendingSlide> = Vec::new();
+
+    for (title, elems) in groups {
+        let mut chunk: Vec<Element> = Vec::new();
+        let mut paragraph_count = 0usize;
+        let mut first_chunk = true;
+        for elem in elems {
+            let is_paragraph_like =
+                matches!(elem, Element::Paragraph(_) | Element::List(_) | Element::CodeBlock(_));
+            if is_paragraph_like && paragraph_count >= max_paragraphs_per_slide {
+                pending.push(PendingSlide {
+                    title: if first_chunk { title.clone() } else { None },
+                    elements: std::mem::take(&mut chunk),
+                });
+                paragraph_count = 0;
+                first_chunk = false;
+            }
+            if is_paragraph_like {
+                paragraph_count += 1;
+            }
+            chunk.push(elem);
+        }
+        if !chunk.is_empty() || (first_chunk && title.is_some()) {
+            pending.push(PendingSlide {
+                title: if first_chunk { title.clone() } else { None },
+                elements: chunk,
+            });
+        }
+    }
+
+    // Step 3: enforce the slide cap by folding every slide past the
+    // cap into the last kept one. We always keep at least one slide.
+    let keep = max_slides.max(1);
+    if pending.len() > keep {
+        // Detach the overflow once and append in document order (the
+        // pairwise pop-and-extend re-moved the growing tail each step).
+        // As before, titles of folded slides are not carried over.
+        let overflow = pending.split_off(keep);
+        if let Some(last) = pending.last_mut() {
+            last.elements.extend(overflow.into_iter().flat_map(|ps| ps.elements));
+        }
+    }
+
+    // Step 4: emit slides.
+    for ps in pending {
+        let slide = writer.add_slide();
+        if let Some((t, algn)) = ps.title.as_ref() {
+            if !t.is_empty() {
+                slide.set_title_aligned(t, algn.clone());
+            }
+        }
+        for elem in &ps.elements {
+            emit_pptx_element(slide, elem);
+        }
+    }
 }
 
 // ---------------------------------------------------------------------------
@@ -490,6 +1087,41 @@ fn text_to_cell_data(text: &str) -> crate::xlsx::write::CellData {
     }
 }
 
+/// Return the trimmed plain text of the first `Element::Heading` in
+/// `elements` whose text is non-blank, or `None` when there is no such
+/// heading. `ir_to_xlsx` uses this to derive a meaningful worksheet
+/// tab label when the section itself doesn't carry a title — typical
+/// for a PDF→IR conversion where heading detection happens at the
+/// element level, not the section level. Headings that are empty or
+/// whitespace-only are skipped rather than ending the search.
+fn first_heading_text(elements: &[Element]) -> Option<String> {
+    elements.iter().find_map(|element| match element {
+        Element::Heading(heading) => {
+            let text = inline_to_text(&heading.content);
+            let trimmed = text.trim();
+            (!trimmed.is_empty()).then(|| trimmed.to_string())
+        },
+        _ => None,
+    })
+}
+
+/// First explicit, non-empty font name found on a text span of
+/// `content`, scanning in order; `None` when no span names a font.
+/// Used by the XLSX path so cell styles carry the source font
+/// instead of always falling back to the writer's "Calibri"
+/// default. Non-text inline items are ignored. Mirrors the
+/// `first_inline_font_size_pt` helper.
+fn first_inline_font_name(content: &[InlineContent]) -> Option<String> {
+    content.iter().find_map(|inline| match inline {
+        InlineContent::Text(span) => span
+            .font_name
+            .as_ref()
+            .filter(|name| !name.is_empty())
+            .cloned(),
+        _ => None,
+    })
+}
+
 fn xlsx_cell_style(is_header: bool, bg: Option<[u8; 3]>) -> Option<crate::xlsx::write::CellStyle> {
     use crate::xlsx::write::CellStyle;
     if is_header {
diff --git a/src/docx/document.rs b/src/docx/document.rs
index d572228..968165b 100644
--- a/src/docx/document.rs
+++ b/src/docx/document.rs
@@ -6,9 +6,20 @@ use super::table::Table;
 pub struct Body {
     /// Ordered list of block elements (paragraphs and tables).
     pub elements: Vec<BlockElement>,
+    /// Exclusive end indices into `elements`, one per mid-document
+    /// `<w:sectPr>`: `section_breaks[i]` is one past the paragraph that
+    /// ends section `i`, i.e. section `i` is `[prev_break, section_breaks[i])`
+    /// with `prev_break` = previous entry (0 for the first). The final
+    /// section runs from the last break to `elements.len()` and uses the
+    /// document-level `sectPr`. Empty for documents with only one section.
+    pub section_breaks: Vec<usize>,
 }
 
 /// A block-level element in the document body (or in a table cell).
+// `Paragraph` is ~320 bytes larger than `Table`. Boxing would force
+// a heap allocation on the hot parse path for every paragraph; we
+// accept the stack size in exchange for keeping parsing alloc-free.
+#[allow(clippy::large_enum_variant)]
 #[derive(Debug, Clone)]
 pub enum BlockElement {
     /// A paragraph (`w:p`).
diff --git a/src/docx/formatting.rs b/src/docx/formatting.rs
index ebd4e69..c332641 100644
--- a/src/docx/formatting.rs
+++ b/src/docx/formatting.rs
@@ -45,6 +45,35 @@ pub struct ParagraphProperties {
     pub outline_level: Option<u8>,
     /// Paragraph-mark run properties (`w:rPr` inside `w:pPr`).
     pub run_properties: Option<RunProperties>,
+    /// Frame position from `<w:framePr>`. When present this paragraph is
+    /// absolutely positioned on the page (used by layout-preserving
+    /// PDF-derived DOCX, e.g. pdf_oxide's `to_docx_bytes_layout`).
+    pub frame_position: Option<FrameProps>,
+    /// Section properties from `<w:sectPr>` inside this paragraph's `<w:pPr>`.
+    /// When present this paragraph terminates a section — the properties
+    /// describe the section that ends here.
+    pub section_properties: Option<super::SectionProperties>,
+    /// True when the paragraph has a `<w:pBdr><w:bottom .../></w:pBdr>`.
+    /// Used to recover horizontal rules: pdf_to_ir emits
+    /// `Element::ThematicBreak` which round-trips through DOCX as an
+    /// empty paragraph with a single bottom border. Without
+    /// preserving this flag the rule would be silently dropped on
+    /// re-parse and turned into a plain empty paragraph.
+    #[allow(dead_code)]
+    pub has_bottom_border: bool,
+}
+
+/// `<w:framePr>` geometry in twips; only x/y/w/h are parsed, not `w:hAnchor`/`w:vAnchor`.
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
+pub struct FrameProps {
+    /// X offset in twips (`w:x`); page top-left origin is assumed, not verified.
+    pub x_twips: i32,
+    /// Y offset in twips (`w:y`); page top-left origin is assumed, not verified.
+    pub y_twips: i32,
+    /// Frame width in twips (`w:w`).
+    pub width_twips: i32,
+    /// Frame height in twips (`w:h`).
+    pub height_twips: i32,
 }
 
 /// Underline style.
@@ -583,6 +612,44 @@ pub(crate) fn parse_paragraph_properties_fast(
                     b"rPr" => {
                         props.run_properties = Some(parse_run_properties_fast(reader)?);
                     },
+                    b"framePr" => {
+                        props.frame_position = parse_frame_pr(e);
+                        xml::skip_element_fast(reader)?;
+                    },
+                    b"sectPr" => {
+                        props.section_properties =
+                            Some(super::parse_section_properties(reader, e)?);
+                    },
+                    b"pBdr" => {
+                        // Scan for <w:bottom .../> inside pBdr to detect the
+                        // conventional DOCX <hr/> encoding (empty paragraph +
+                        // bottom border). Only presence is recorded, not
+                        // styling. Each event is inspected exactly once:
+                        // reading ahead after <w:bottom/> could swallow the
+                        // closing </w:pBdr> and overrun into later content.
+                        let mut depth = 1i32;
+                        loop {
+                            match reader.read_event()? {
+                                Event::Empty(ref ee) if ee.local_name().as_ref() == b"bottom" => {
+                                    props.has_bottom_border = true;
+                                },
+                                Event::Start(ref ee) => {
+                                    if ee.local_name().as_ref() == b"bottom" {
+                                        props.has_bottom_border = true;
+                                    }
+                                    depth += 1;
+                                },
+                                Event::End(_) => {
+                                    depth -= 1;
+                                    if depth <= 0 {
+                                        break;
+                                    }
+                                },
+                                Event::Eof => break,
+                                _ => {},
+                            }
+                        }
+                    },
                     _ => {
                         xml::skip_element_fast(reader)?;
                     },
@@ -607,6 +674,9 @@ pub(crate) fn parse_paragraph_properties_fast(
                     b"spacing" => {
                         props.spacing = Some(parse_spacing(e)?);
                     },
+                    b"framePr" => {
+                        props.frame_position = parse_frame_pr(e);
+                    },
                     b"outlineLvl" => {
                         if let Ok(Some(val)) = xml::optional_attr_str(e, b"w:val") {
                             if let Ok(lvl) = val.parse::<u8>() {
@@ -774,6 +844,32 @@ pub(crate) fn parse_indent(e: &BytesStart) -> crate::core::Result<ParagraphInden
     Ok(indent)
 }
 
+/// Build a `FrameProps` from the `w:x`, `w:y`, `w:w` and `w:h`
+/// attributes of a `<w:framePr>` start tag. Returns `None` unless all
+/// four are present and parse as `i32` — e.g. when only `wrap`/`anchor`
+/// modifiers are set without explicit position/size, which we can't
+/// reproduce as positional.
+fn parse_frame_pr(e: &BytesStart) -> Option<FrameProps> {
+    // Read one attribute as a signed twip count; a missing attribute,
+    // an attribute-read error, or a non-integer value all yield `None`.
+    let twips = |attr: &[u8]| -> Option<i32> {
+        let raw = xml::optional_attr_str(e, attr).ok()??;
+        raw.parse::<i32>().ok()
+    };
+    // Every one of the four must be usable: `?` bails out with `None`
+    // as soon as one is not.
+    let x_twips = twips(b"w:x")?;
+    let y_twips = twips(b"w:y")?;
+    let width_twips = twips(b"w:w")?;
+    let height_twips = twips(b"w:h")?;
+    Some(FrameProps {
+        x_twips,
+        y_twips,
+        width_twips,
+        height_twips,
+    })
+}
+
 fn parse_spacing(e: &BytesStart) -> crate::core::Result<ParagraphSpacing> {
     let mut spacing = ParagraphSpacing::default();
     if let Some(val) = xml::optional_attr_str(e, b"w:before")? {
@@ -933,4 +1029,101 @@ mod tests {
             }
         }
     }
+
+    // Advance a fast reader past the opening <w:pPr> wrapper so the
+    // caller can drive parse_paragraph_properties_fast directly.
+    fn open_ppr_fast(xml: &[u8]) -> quick_xml::Reader<&[u8]> {
+        let mut reader = xml::make_fast_reader(xml);
+        loop {
+            match reader.read_event().unwrap() {
+                Event::Start(ref e) if e.local_name().as_ref() == b"pPr" => return reader,
+                Event::Eof => panic!("no <w:pPr> in test xml"),
+                _ => {},
+            }
+        }
+    }
+
+    // ── framePr ─────────────────────────────────────────────────────────
+
+    #[test]
+    fn parse_frame_pr_empty_element() {
+        let xml =
+            br#"<w:pPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+          <w:framePr w:x="720" w:y="1080" w:w="3000" w:h="500"/>
+        </w:pPr>"#;
+        let mut reader = open_ppr_fast(xml);
+        let pp = parse_paragraph_properties_fast(&mut reader).unwrap();
+        let fp = pp.frame_position.expect("framePr parsed");
+        assert_eq!(fp.x_twips, 720);
+        assert_eq!(fp.y_twips, 1080);
+        assert_eq!(fp.width_twips, 3000);
+        assert_eq!(fp.height_twips, 500);
+    }
+
+    #[test]
+    fn parse_frame_pr_missing_attrs_returns_none() {
+        // Missing w:h → frame_position must be None.
+        let xml =
+            br#"<w:pPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+          <w:framePr w:x="100" w:y="200" w:w="300"/>
+        </w:pPr>"#;
+        let mut reader = open_ppr_fast(xml);
+        let pp = parse_paragraph_properties_fast(&mut reader).unwrap();
+        assert!(pp.frame_position.is_none());
+    }
+
+    #[test]
+    fn parse_frame_pr_inside_start_form() {
+        // Start/End form (rather than Empty) — should still parse.
+        let xml =
+            br#"<w:pPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+          <w:framePr w:x="10" w:y="20" w:w="30" w:h="40"></w:framePr>
+        </w:pPr>"#;
+        let mut reader = open_ppr_fast(xml);
+        let pp = parse_paragraph_properties_fast(&mut reader).unwrap();
+        let fp = pp.frame_position.expect("framePr parsed");
+        assert_eq!(fp.x_twips, 10);
+        assert_eq!(fp.width_twips, 30);
+    }
+
+    // ── pBdr / has_bottom_border ────────────────────────────────────────
+
+    #[test]
+    fn parse_p_bdr_with_bottom() {
+        let xml =
+            br#"<w:pPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+          <w:pBdr>
+            <w:bottom w:val="single" w:sz="6" w:space="1" w:color="auto"/>
+          </w:pBdr>
+        </w:pPr>"#;
+        let mut reader = open_ppr_fast(xml);
+        let pp = parse_paragraph_properties_fast(&mut reader).unwrap();
+        assert!(pp.has_bottom_border);
+    }
+
+    #[test]
+    fn parse_p_bdr_without_bottom() {
+        let xml =
+            br#"<w:pPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+          <w:pBdr>
+            <w:top w:val="single" w:sz="6"/>
+            <w:left w:val="single" w:sz="6"/>
+          </w:pBdr>
+        </w:pPr>"#;
+        let mut reader = open_ppr_fast(xml);
+        let pp = parse_paragraph_properties_fast(&mut reader).unwrap();
+        assert!(!pp.has_bottom_border);
+    }
+
+    #[test]
+    fn paragraph_properties_default_has_no_frame_or_border() {
+        let xml =
+            br#"<w:pPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+          <w:pStyle w:val="Normal"/>
+        </w:pPr>"#;
+        let mut reader = open_ppr_fast(xml);
+        let pp = parse_paragraph_properties_fast(&mut reader).unwrap();
+        assert!(pp.frame_position.is_none());
+        assert!(!pp.has_bottom_border);
+    }
 }
diff --git a/src/docx/image.rs b/src/docx/image.rs
index bef7ad1..bb8aa25 100644
--- a/src/docx/image.rs
+++ b/src/docx/image.rs
@@ -1,16 +1,81 @@
 use crate::core::units::Emu;
 
 /// Information about a drawing/image reference within a run.
+///
+/// Carries enough data for both bitmap pictures (`<a:blip r:embed=…/>`)
+/// and DrawingML preset shapes (`<wps:wsp>` with `<a:prstGeom>`). Only
+/// one of `relationship_id` or `shape` is populated for any given
+/// drawing — the consumer (`convert_docx`) uses whichever is set to
+/// decide what kind of IR `Element` to emit.
 #[derive(Debug, Clone)]
 pub struct DrawingInfo {
-    /// Relationship ID pointing to the image part.
+    /// Relationship ID pointing to the image part. Empty when the
+    /// drawing is a vector shape rather than a raster picture.
     pub relationship_id: String,
     /// Alt-text description from `wp:docPr/@descr`.
     pub description: Option<String>,
-    /// Image width in EMUs.
+    /// Image / shape width in EMUs.
     pub width: Emu,
-    /// Image height in EMUs.
+    /// Image / shape height in EMUs.
     pub height: Emu,
     /// `true` = inline, `false` = anchor (floating).
     pub inline: bool,
+    /// Floating-anchor position (only set when `inline == false`).
+    pub anchor_position: Option<AnchorPosition>,
+    /// Vector shape data when the drawing is a `<wps:wsp>` rather
+    /// than an embedded picture.
+    pub shape: Option<ShapeInfo>,
+}
+
+/// Absolute coordinates extracted from a `<wp:anchor>` wrapper.
+#[derive(Debug, Clone, Copy, Default)]
+pub struct AnchorPosition {
+    /// Horizontal offset in EMUs.
+    pub x_emu: i64,
+    /// Vertical offset in EMUs.
+    pub y_emu: i64,
+    /// What the horizontal offset is anchored to (page / margin / column).
+    pub h_relative_from: AnchorFrame,
+    /// What the vertical offset is anchored to (page / margin / paragraph).
+    pub v_relative_from: AnchorFrame,
+}
+
+/// Reference frame for a floating-object anchor.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum AnchorFrame {
+    /// Position relative to the page.
+    #[default]
+    Page,
+    /// Position relative to the page margin.
+    Margin,
+    /// Position relative to the column.
+    Column,
+    /// Position relative to the paragraph (for vertical anchor).
+    Paragraph,
+    /// Position relative to the page line (for vertical anchor).
+    Line,
+    /// Position relative to the character (for horizontal anchor).
+    Character,
+}
+
+/// Vector-shape data parsed from `<wps:wsp>`.
+#[derive(Debug, Clone)]
+pub struct ShapeInfo {
+    /// Geometry preset from `<a:prstGeom prst="…">`.
+    pub kind: ShapeKind,
+    /// Stroke colour (`<a:ln><a:solidFill><a:srgbClr val="…"/>`).
+    pub stroke_rgb: Option<(u8, u8, u8)>,
+    /// Fill colour (`<wps:spPr><a:solidFill><a:srgbClr val="…"/>`).
+    pub fill_rgb: Option<(u8, u8, u8)>,
+    /// Stroke width in EMUs (`<a:ln w="…">`).
+    pub stroke_w_emu: Option<i64>,
+}
+
+/// Subset of DrawingML preset shape kinds we currently round-trip.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ShapeKind {
+    /// Straight line (`prst="line"`).
+    Line,
+    /// Rectangle (`prst="rect"`).
+    Rect,
 }
diff --git a/src/docx/mod.rs b/src/docx/mod.rs
index 1f698a8..02d2e60 100644
--- a/src/docx/mod.rs
+++ b/src/docx/mod.rs
@@ -48,9 +48,11 @@ pub use formatting::{
     Justification, ParagraphIndent, ParagraphProperties, ParagraphSpacing, RunProperties,
     UnderlineType, VerticalAlign,
 };
-pub use headers::{HeaderFooter, HeaderFooterType, PageMargins, PageSize, SectionProperties};
+pub use headers::{
+    HeaderFooter, HeaderFooterType, PageMargins, PageOrientation, PageSize, SectionProperties,
+};
 pub use hyperlink::{Hyperlink, HyperlinkTarget};
-pub use image::DrawingInfo;
+pub use image::{AnchorFrame, AnchorPosition, DrawingInfo, ShapeInfo, ShapeKind};
 pub use numbering::{NumberFormat, NumberingDefinitions};
 pub use paragraph::{BreakType, Paragraph, ParagraphContent, Run, RunContent};
 pub use styles::{Style, StyleSheet, StyleType};
@@ -69,7 +71,7 @@ use crate::core::units::Emu;
 use crate::core::xml;
 
 use self::formatting::{parse_paragraph_properties_fast, parse_run_properties_fast};
-use self::headers::{HeaderFooterRef, PageOrientation};
+use self::headers::HeaderFooterRef;
 use self::table::{
     MergeType, Shading, TableCellProperties, TableRowProperties, TableWidth, TableWidthType,
 };
@@ -102,6 +104,17 @@ pub struct DocxDocument {
     pub sections: Vec<SectionProperties>,
     /// Parsed headers and footers.
     pub headers_footers: Vec<HeaderFooter>,
+    /// Font programs found under `word/fonts/`. Each entry is
+    /// `(font_name, ttf_or_otf_bytes)`. PDF→DOCX→PDF round-trips use these
+    /// to preserve typeface fidelity (e.g. CJK / math fonts beyond
+    /// pdf_oxide's bundled DejaVu fallback).
+    pub embedded_fonts: Vec<(String, Vec<u8>)>,
+    /// Image parts referenced from the main document, keyed by the
+    /// relationship id used in `<a:blip r:embed="rIdN"/>`. Lets the
+    /// IR converter populate `Image::data` so downstream renderers
+    /// (the positional PDF reader, plain-text export with alt-text,
+    /// etc.) can place actual bitmap content.
+    pub images: std::collections::HashMap<String, (Vec<u8>, Option<String>)>,
 }
 
 impl DocxDocument {
@@ -182,10 +195,75 @@ impl DocxDocument {
             }
         }
 
+        // Scan `word/fonts/` for embedded font programs. Files there are
+        // typically `font_<n>_<name>.ttf` (written by our own `DocxWriter`)
+        // but the loop accepts any `.ttf`/`.otf` for forward-compat.
+        let mut embedded_fonts: Vec<(String, Vec<u8>)> = Vec::new();
+        for name in opc.part_names() {
+            let s = name.to_string();
+            if !s.starts_with("/word/fonts/") {
+                continue;
+            }
+            let lower = s.to_lowercase();
+            if !(lower.ends_with(".ttf") || lower.ends_with(".otf")) {
+                continue;
+            }
+            if let Ok(data) = opc.read_part(&name) {
+                // Extract a usable face name from the OPC part. Writers
+                // ship fonts as `font_<n>_<face_name>.<ext>` (the
+                // `embedded_fonts` writer convention used by all three
+                // PDF→office paths) — strip the leading `font_<n>_`
+                // prefix and the trailing `.ttf`/`.otf` so the
+                // registered name matches what the IR carries on each
+                // run's `font_name` (e.g. `TeXGyreTermesX-Regular`).
+                // Falls back to the basename for files that don't
+                // follow the convention.
+                let basename = s.rsplit('/').next().unwrap_or("font");
+                let face = strip_embedded_font_filename(basename);
+                let font_name = if face.is_empty() {
+                    basename.to_string()
+                } else {
+                    face
+                };
+                embedded_fonts.push((font_name, data));
+            }
+        }
+
+        // Pull image parts referenced by the main document
+        // relationships. We capture the raw bytes plus the lower-cased
+        // file extension so downstream code can decide on the format
+        // without re-sniffing magic bytes.
+        let mut images: std::collections::HashMap<String, (Vec<u8>, Option<String>)> =
+            std::collections::HashMap::new();
+        for rel in doc_rels.get_by_type(rel_types::IMAGE) {
+            if rel.target_mode != TargetMode::Internal {
+                continue;
+            }
+            let part_name = match main_part.resolve_relative(&rel.target) {
+                Ok(p) => p,
+                Err(_) => continue,
+            };
+            if !opc.has_part(&part_name) {
+                continue;
+            }
+            let data = match opc.read_part(&part_name) {
+                Ok(d) => d,
+                Err(_) => continue,
+            };
+            let ext = part_name
+                .as_str()
+                .rsplit('.')
+                .next()
+                .map(|s| s.to_lowercase());
+            images.insert(rel.id.clone(), (data, ext));
+        }
+
         debug!(
-            "DocxDocument: {} block elements, {} sections",
+            "DocxDocument: {} block elements, {} sections, {} embedded fonts, {} images",
             body.elements.len(),
-            sections.len()
+            sections.len(),
+            embedded_fonts.len(),
+            images.len()
         );
         Ok(DocxDocument {
             body,
@@ -194,6 +272,8 @@ impl DocxDocument {
             theme,
             sections,
             headers_footers,
+            embedded_fonts,
+            images,
         })
     }
 }
@@ -259,8 +339,33 @@ fn parse_document(
     // Resolve hyperlink targets using relationships
     resolve_hyperlinks(&mut elements, rels);
 
-    let body = Body { elements };
-    Ok((body, sections))
+    // Detect mid-document section breaks: paragraphs whose <w:pPr>
+    // carries a <w:sectPr>. Each such paragraph terminates a section,
+    // and its sectPr describes the section that ends there. Trailing
+    // elements after the last break belong to a final section
+    // described by the body-level sectPr (already in `sections`).
+    let mut section_breaks: Vec<usize> = Vec::new();
+    let mut break_sections: Vec<SectionProperties> = Vec::new();
+    for (idx, el) in elements.iter().enumerate() {
+        if let BlockElement::Paragraph(p) = el {
+            if let Some(props) = &p.properties {
+                if let Some(sp) = &props.section_properties {
+                    section_breaks.push(idx + 1);
+                    break_sections.push(sp.clone());
+                }
+            }
+        }
+    }
+    // Stitch break-derived section_properties in front of the
+    // body-level final sectPr so the section list is in document order.
+    let mut all_sections = break_sections;
+    all_sections.extend(sections);
+
+    let body = Body {
+        elements,
+        section_breaks,
+    };
+    Ok((body, all_sections))
 }
 
 /// Walk the element tree and resolve hyperlink rIds to actual URLs.
@@ -443,53 +548,239 @@ fn parse_hyperlink(
 // Drawing / image parsing
 // ---------------------------------------------------------------------------
 
+/// Parse a `<w:drawing>` element. The opening tag has already been
+/// consumed by the caller, so we drive forward until the matching
+/// `</w:drawing>` End event.
+///
+/// A drawing wraps either `<wp:inline>` or `<wp:anchor>` (anchor =
+/// floating). Everything we care about lives inside that single
+/// wrapper, so we delegate to `parse_inline_or_anchor_body` and treat
+/// any other top-level event as ignorable filler.
 fn parse_drawing(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult<Option<DrawingInfo>> {
-    let mut inline = true;
+    let mut info: Option<DrawingInfo> = None;
+
+    loop {
+        match reader.read_event()? {
+            Event::Start(ref e) => match e.local_name().as_ref() {
+                b"inline" => {
+                    info = parse_inline_or_anchor_body(reader, /*inline=*/ true, b"inline")?;
+                },
+                b"anchor" => {
+                    info = parse_inline_or_anchor_body(reader, /*inline=*/ false, b"anchor")?;
+                },
+                _ => {
+                    xml::skip_element_fast(reader)?;
+                },
+            },
+            Event::End(ref e) if e.local_name().as_ref() == b"drawing" => break,
+            Event::Eof => break,
+            _ => {},
+        }
+    }
+
+    Ok(info)
+}
+
+/// Parse the body of `<wp:inline>` or `<wp:anchor>` until the matching
+/// closing tag (`end_local`). Collects extent, docPr, position, and the
+/// graphic payload (image or shape) into a `DrawingInfo`.
+fn parse_inline_or_anchor_body(
+    reader: &mut quick_xml::Reader<&[u8]>,
+    inline: bool,
+    end_local: &[u8],
+) -> CoreResult<Option<DrawingInfo>> {
+    use crate::docx::image::{AnchorFrame, AnchorPosition};
+
     let mut width = Emu(0);
     let mut height = Emu(0);
     let mut description: Option<String> = None;
     let mut relationship_id: Option<String> = None;
-    let mut depth = 1u32;
+    let mut shape: Option<crate::docx::image::ShapeInfo> = None;
+
+    let mut anchor_x: Option<i64> = None;
+    let mut anchor_y: Option<i64> = None;
+    let mut h_frame = AnchorFrame::default();
+    let mut v_frame = AnchorFrame::default();
+
+    loop {
+        match reader.read_event()? {
+            Event::Start(ref e) => match e.local_name().as_ref() {
+                b"extent" => {
+                    parse_extent_attrs(e, &mut width, &mut height);
+                    xml::skip_element_fast(reader)?;
+                },
+                b"docPr" => {
+                    if let Some(desc) = xml::optional_attr_str(e, b"descr")? {
+                        description = Some(desc.into_owned());
+                    }
+                    xml::skip_element_fast(reader)?;
+                },
+                b"positionH" => {
+                    if let Some(rf) = xml::optional_attr_str(e, b"relativeFrom")? {
+                        h_frame = parse_anchor_frame(&rf);
+                    }
+                    anchor_x = parse_position_offset(reader, b"positionH")?;
+                },
+                b"positionV" => {
+                    if let Some(rf) = xml::optional_attr_str(e, b"relativeFrom")? {
+                        v_frame = parse_anchor_frame(&rf);
+                    }
+                    anchor_y = parse_position_offset(reader, b"positionV")?;
+                },
+                b"graphic" => {
+                    let g = parse_graphic(reader)?;
+                    if let Some(rid) = g.relationship_id {
+                        relationship_id = Some(rid);
+                    }
+                    if let Some(s) = g.shape {
+                        shape = Some(s);
+                    }
+                },
+                _ => {
+                    xml::skip_element_fast(reader)?;
+                },
+            },
+            Event::Empty(ref e) => match e.local_name().as_ref() {
+                b"extent" => parse_extent_attrs(e, &mut width, &mut height),
+                b"docPr" => {
+                    if let Some(desc) = xml::optional_attr_str(e, b"descr")? {
+                        description = Some(desc.into_owned());
+                    }
+                },
+                _ => {},
+            },
+            Event::End(ref e) if e.local_name().as_ref() == end_local => break,
+            Event::Eof => break,
+            _ => {},
+        }
+    }
+
+    let anchor_position = if !inline && (anchor_x.is_some() || anchor_y.is_some()) {
+        Some(AnchorPosition {
+            x_emu: anchor_x.unwrap_or(0),
+            y_emu: anchor_y.unwrap_or(0),
+            h_relative_from: h_frame,
+            v_relative_from: v_frame,
+        })
+    } else {
+        None
+    };
+
+    if relationship_id.is_some() || shape.is_some() {
+        Ok(Some(DrawingInfo {
+            relationship_id: relationship_id.unwrap_or_default(),
+            description,
+            width,
+            height,
+            inline,
+            anchor_position,
+            shape,
+        }))
+    } else {
+        Ok(None)
+    }
+}
+
+/// Parse the inside of `<wp:positionH>` or `<wp:positionV>` looking for
+/// the nested `<wp:posOffset>` text value. Reads through the matching
+/// closing tag (`end_local`).
+fn parse_position_offset(
+    reader: &mut quick_xml::Reader<&[u8]>,
+    end_local: &[u8],
+) -> CoreResult<Option<i64>> {
+    let mut offset: Option<i64> = None;
+
+    loop {
+        match reader.read_event()? {
+            Event::Start(ref e) if e.local_name().as_ref() == b"posOffset" => {
+                let text = xml::read_text_content_fast(reader)?;
+                if let Ok(v) = text.trim().parse::<i64>() {
+                    offset = Some(v);
+                }
+            },
+            Event::Start(_) => {
+                xml::skip_element_fast(reader)?;
+            },
+            Event::End(ref e) if e.local_name().as_ref() == end_local => break,
+            Event::Eof => break,
+            _ => {},
+        }
+    }
+
+    Ok(offset)
+}
+
+/// Result of parsing an `<a:graphic>` element: at most one of an
+/// embedded picture (`relationship_id`) or a vector shape (`shape`).
+struct GraphicPayload {
+    relationship_id: Option<String>,
+    shape: Option<crate::docx::image::ShapeInfo>,
+}
+
+/// Parse `<a:graphic>` and any contained `<pic:pic>` (image) or
+/// `<wps:wsp>` (vector shape). Reads through `</a:graphic>`.
+fn parse_graphic(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult<GraphicPayload> {
+    let mut relationship_id: Option<String> = None;
+    let mut shape: Option<crate::docx::image::ShapeInfo> = None;
+
+    loop {
+        match reader.read_event()? {
+            Event::Start(ref e) => match e.local_name().as_ref() {
+                b"pic" => {
+                    if let Some(rid) = parse_pic(reader)? {
+                        relationship_id = Some(rid);
+                    }
+                },
+                b"wsp" => {
+                    if let Some(s) = parse_wsp(reader)? {
+                        shape = Some(s);
+                    }
+                },
+                // <a:graphicData> is just a wrapper; descend into it.
+                b"graphicData" => continue,
+                _ => {
+                    xml::skip_element_fast(reader)?;
+                },
+            },
+            Event::End(ref e) if e.local_name().as_ref() == b"graphic" => break,
+            Event::Eof => break,
+            _ => {},
+        }
+    }
+
+    Ok(GraphicPayload {
+        relationship_id,
+        shape,
+    })
+}
+
+/// Parse `<pic:pic>` looking for the embedded `<a:blip r:embed="…"/>`.
+/// Reads through `</pic:pic>`. The blip lives inside `<pic:blipFill>`,
+/// so we descend through whatever wrappers we encounter rather than
+/// skipping siblings.
+fn parse_pic(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult<Option<String>> {
+    let mut rid: Option<String> = None;
+    // Track depth relative to <pic:pic>: we entered after its Start was
+    // consumed by the caller, so we are at depth 1. Exit when we close
+    // back out.
+    let mut depth: u32 = 1;
 
     loop {
         match reader.read_event()? {
             Event::Start(ref e) => {
-                depth += 1;
-                let local = e.local_name();
-                let local_bytes = local.as_ref();
-                match local_bytes {
-                    b"inline" => inline = true,
-                    b"anchor" => inline = false,
-                    b"extent" => parse_extent_attrs(e, &mut width, &mut height),
-                    b"docPr" => {
-                        if let Ok(Some(desc)) = xml::optional_attr_str(e, b"descr") {
-                            description = Some(desc.into_owned());
-                        }
-                    },
-                    b"blip" => {
-                        if let Ok(Some(embed)) = xml::optional_attr_str(e, b"r:embed") {
-                            relationship_id = Some(embed.into_owned());
-                        }
-                    },
-                    _ => {},
+                if e.local_name().as_ref() == b"blip" {
+                    if let Some(embed) = xml::optional_attr_str(e, b"r:embed")? {
+                        rid = Some(embed.into_owned());
+                    }
+                    // Skip over blip's own children (e.g. <a:extLst>).
+                    xml::skip_element_fast(reader)?;
+                } else {
+                    depth += 1;
                 }
             },
-            Event::Empty(ref e) => {
-                let local = e.local_name();
-                let local_bytes = local.as_ref();
-                match local_bytes {
-                    b"extent" => parse_extent_attrs(e, &mut width, &mut height),
-                    b"docPr" => {
-                        if let Ok(Some(desc)) = xml::optional_attr_str(e, b"descr") {
-                            description = Some(desc.into_owned());
-                        }
-                    },
-                    b"blip" => {
-                        if let Ok(Some(embed)) = xml::optional_attr_str(e, b"r:embed") {
-                            relationship_id = Some(embed.into_owned());
-                        }
-                    },
-                    _ => {},
+            Event::Empty(ref e) if e.local_name().as_ref() == b"blip" => {
+                if let Some(embed) = xml::optional_attr_str(e, b"r:embed")? {
+                    rid = Some(embed.into_owned());
                 }
             },
             Event::End(_) => {
@@ -503,17 +794,209 @@ fn parse_drawing(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult<Option<Dra
         }
     }
 
-    if let Some(rid) = relationship_id {
-        Ok(Some(DrawingInfo {
-            relationship_id: rid,
-            description,
-            width,
-            height,
-            inline,
-        }))
-    } else {
-        Ok(None)
+    Ok(rid)
+}
+
+/// Parse `<wps:wsp>` (a DrawingML vector shape). Reads through
+/// `</wps:wsp>` and returns the assembled `ShapeInfo`, or `None` if no
+/// `<a:prstGeom>` with a supported preset (line/rect) was seen.
+fn parse_wsp(
+    reader: &mut quick_xml::Reader<&[u8]>,
+) -> CoreResult<Option<crate::docx::image::ShapeInfo>> {
+    use crate::docx::image::{ShapeInfo, ShapeKind};
+
+    let mut kind: Option<ShapeKind> = None;
+    let mut stroke_rgb: Option<(u8, u8, u8)> = None;
+    let mut fill_rgb: Option<(u8, u8, u8)> = None;
+    let mut stroke_w_emu: Option<i64> = None;
+
+    loop {
+        match reader.read_event()? {
+            Event::Start(ref e) => match e.local_name().as_ref() {
+                b"spPr" => {
+                    parse_sp_pr(
+                        reader,
+                        &mut kind,
+                        &mut stroke_rgb,
+                        &mut fill_rgb,
+                        &mut stroke_w_emu,
+                    )?;
+                },
+                _ => {
+                    xml::skip_element_fast(reader)?;
+                },
+            },
+            Event::End(ref e) if e.local_name().as_ref() == b"wsp" => break,
+            Event::Eof => break,
+            _ => {},
+        }
+    }
+
+    Ok(kind.map(|k| ShapeInfo {
+        kind: k,
+        stroke_rgb,
+        fill_rgb,
+        stroke_w_emu,
+    }))
+}
+
+/// Parse `<wps:spPr>`: contains the geometry preset, an optional fill,
+/// and an optional `<a:ln>` (line/stroke) sub-element. Reads through
+/// `</wps:spPr>`.
+fn parse_sp_pr(
+    reader: &mut quick_xml::Reader<&[u8]>,
+    kind: &mut Option<crate::docx::image::ShapeKind>,
+    stroke_rgb: &mut Option<(u8, u8, u8)>,
+    fill_rgb: &mut Option<(u8, u8, u8)>,
+    stroke_w_emu: &mut Option<i64>,
+) -> CoreResult<()> {
+    use crate::docx::image::ShapeKind;
+
+    loop {
+        match reader.read_event()? {
+            Event::Start(ref e) => match e.local_name().as_ref() {
+                b"prstGeom" => {
+                    if let Some(prst) = xml::optional_attr_str(e, b"prst")? {
+                        *kind = match prst.as_ref() {
+                            "line" | "straightConnector1" => Some(ShapeKind::Line),
+                            "rect" => Some(ShapeKind::Rect),
+                            _ => *kind,
+                        };
+                    }
+                    xml::skip_element_fast(reader)?;
+                },
+                b"ln" => {
+                    if let Some(w) = xml::optional_attr_str(e, b"w")? {
+                        *stroke_w_emu = w.parse().ok();
+                    }
+                    *stroke_rgb = parse_line_color(reader)?.or(*stroke_rgb);
+                },
+                b"solidFill" => {
+                    *fill_rgb = parse_solid_fill_color(reader)?.or(*fill_rgb);
+                },
+                _ => {
+                    xml::skip_element_fast(reader)?;
+                },
+            },
+            Event::Empty(ref e) => match e.local_name().as_ref() {
+                b"prstGeom" => {
+                    if let Some(prst) = xml::optional_attr_str(e, b"prst")? {
+                        *kind = match prst.as_ref() {
+                            "line" | "straightConnector1" => Some(ShapeKind::Line),
+                            "rect" => Some(ShapeKind::Rect),
+                            _ => *kind,
+                        };
+                    }
+                },
+                b"ln" => {
+                    if let Some(w) = xml::optional_attr_str(e, b"w")? {
+                        *stroke_w_emu = w.parse().ok();
+                    }
+                },
+                _ => {},
+            },
+            Event::End(ref e) if e.local_name().as_ref() == b"spPr" => break,
+            Event::Eof => break,
+            _ => {},
+        }
+    }
+
+    Ok(())
+}
+
+/// Parse `<a:ln>` looking for an inner `<a:solidFill><a:srgbClr/>`.
+/// Reads through `</a:ln>`.
+fn parse_line_color(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult<Option<(u8, u8, u8)>> {
+    let mut rgb: Option<(u8, u8, u8)> = None;
+
+    loop {
+        match reader.read_event()? {
+            Event::Start(ref e) => match e.local_name().as_ref() {
+                b"solidFill" => {
+                    if let Some(c) = parse_solid_fill_color(reader)? {
+                        rgb = Some(c);
+                    }
+                },
+                _ => {
+                    xml::skip_element_fast(reader)?;
+                },
+            },
+            Event::End(ref e) if e.local_name().as_ref() == b"ln" => break,
+            Event::Eof => break,
+            _ => {},
+        }
+    }
+
+    Ok(rgb)
+}
+
+/// Parse `<a:solidFill>` looking for an inner `<a:srgbClr val="…"/>`.
+/// Reads through `</a:solidFill>`.
+fn parse_solid_fill_color(
+    reader: &mut quick_xml::Reader<&[u8]>,
+) -> CoreResult<Option<(u8, u8, u8)>> {
+    let mut rgb: Option<(u8, u8, u8)> = None;
+
+    loop {
+        match reader.read_event()? {
+            Event::Start(ref e) => {
+                if e.local_name().as_ref() == b"srgbClr" {
+                    if let Some(val) = xml::optional_attr_str(e, b"val")? {
+                        if let Some(parsed) = parse_hex_rgb(&val) {
+                            rgb = Some(parsed);
+                        }
+                    }
+                }
+                xml::skip_element_fast(reader)?;
+            },
+            Event::Empty(ref e) if e.local_name().as_ref() == b"srgbClr" => {
+                if let Some(val) = xml::optional_attr_str(e, b"val")? {
+                    if let Some(parsed) = parse_hex_rgb(&val) {
+                        rgb = Some(parsed);
+                    }
+                }
+            },
+            Event::End(ref e) if e.local_name().as_ref() == b"solidFill" => break,
+            Event::Eof => break,
+            _ => {},
+        }
+    }
+
+    Ok(rgb)
+}
+
+fn parse_anchor_frame(s: &str) -> crate::docx::image::AnchorFrame {
+    use crate::docx::image::AnchorFrame;
+    match s {
+        "page" => AnchorFrame::Page,
+        "margin" | "leftMargin" | "rightMargin" | "topMargin" | "bottomMargin" | "insideMargin"
+        | "outsideMargin" => AnchorFrame::Margin,
+        "column" => AnchorFrame::Column,
+        "paragraph" => AnchorFrame::Paragraph,
+        "line" => AnchorFrame::Line,
+        "character" => AnchorFrame::Character,
+        _ => AnchorFrame::Page,
+    }
+}
+
+fn parse_hex_rgb(s: &str) -> Option<(u8, u8, u8)> {
+    let bytes = s.trim().as_bytes();
+    if bytes.len() != 6 {
+        return None;
     }
+    fn hex_pair(a: u8, b: u8) -> Option<u8> {
+        let h = |c: u8| match c {
+            b'0'..=b'9' => Some(c - b'0'),
+            b'a'..=b'f' => Some(10 + c - b'a'),
+            b'A'..=b'F' => Some(10 + c - b'A'),
+            _ => None,
+        };
+        Some((h(a)? << 4) | h(b)?)
+    }
+    let r = hex_pair(bytes[0], bytes[1])?;
+    let g = hex_pair(bytes[2], bytes[3])?;
+    let b = hex_pair(bytes[4], bytes[5])?;
+    Some((r, g, b))
 }
 
 fn parse_extent_attrs(e: &quick_xml::events::BytesStart, width: &mut Emu, height: &mut Emu) {
@@ -822,7 +1305,7 @@ fn parse_table_width(e: &quick_xml::events::BytesStart) -> CoreResult<Option<Tab
 // Section properties parsing
 // ---------------------------------------------------------------------------
 
-fn parse_section_properties(
+pub(crate) fn parse_section_properties(
     reader: &mut quick_xml::Reader<&[u8]>,
     _start: &quick_xml::events::BytesStart,
 ) -> CoreResult<SectionProperties> {
@@ -917,6 +1400,36 @@ fn parse_section_properties(
     Ok(props)
 }
 
+/// Recover the original face name from an embedded-font filename
+/// produced by `core::embedded_fonts::write_embedded_fonts`. The
+/// writer ships fonts as `font_<n>_<face>.<ext>` where `<face>` is
+/// the original face name (with `/`, `?`, `*` etc. sanitized to `_`
+/// — but NOT alphabetic characters, which earlier callers' naive
+/// `trim_end_matches(alphabetic)` was greedily eating).
+///
+/// Examples:
+///   `font_4_TeXGyreTermesX-Regular.ttf` → `TeXGyreTermesX-Regular`
+///   `font_1_NewTXBMI.ttf`               → `NewTXBMI`
+///   `font_5_.ttf`                       → `` (caller falls back to basename)
+pub(crate) fn strip_embedded_font_filename(basename: &str) -> String {
+    // Drop extension.
+    let stem = match basename.rfind('.') {
+        Some(i) => &basename[..i],
+        None => basename,
+    };
+    // Strip the `font_<digits>_` prefix when present.
+    if let Some(rest) = stem.strip_prefix("font_") {
+        if let Some(under_idx) = rest.find('_') {
+            // Everything before the underscore must be digits;
+            // otherwise treat the whole stem as the face name.
+            if rest[..under_idx].chars().all(|c| c.is_ascii_digit()) {
+                return rest[under_idx + 1..].to_string();
+            }
+        }
+    }
+    stem.to_string()
+}
+
 fn parse_hf_type(e: &quick_xml::events::BytesStart) -> CoreResult<HeaderFooterType> {
     Ok(match xml::optional_attr_str(e, b"w:type")? {
         Some(ref val) => match val.as_ref() {
@@ -1171,6 +1684,86 @@ mod tests {
         assert!(md.contains("| Cell1 | Cell2 |"));
     }
 
+    #[test]
+    fn parse_drawing_anchor_position() {
+        let xml =
+            br#"<w:drawing xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+                xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
+                xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+                xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
+            <wp:anchor>
+                <wp:positionH relativeFrom="page"><wp:posOffset>914400</wp:posOffset></wp:positionH>
+                <wp:positionV relativeFrom="page"><wp:posOffset>457200</wp:posOffset></wp:positionV>
+                <wp:extent cx="2000000" cy="1500000"/>
+                <a:graphic><a:graphicData uri="">
+                    <pic:pic xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
+                        <pic:blipFill><a:blip r:embed="rId7"/></pic:blipFill>
+                    </pic:pic>
+                </a:graphicData></a:graphic>
+            </wp:anchor>
+        </w:drawing>"#;
+        let mut reader = make_content_reader(xml);
+        // Advance past the outer <w:drawing> Start so parse_drawing
+        // sees the inner contents (it expects the caller to have
+        // already consumed the opening tag).
+        loop {
+            match reader.read_event().unwrap() {
+                quick_xml::events::Event::Start(ref e) if e.local_name().as_ref() == b"drawing" => {
+                    break;
+                },
+                quick_xml::events::Event::Eof => panic!("no drawing"),
+                _ => {},
+            }
+        }
+        let info = parse_drawing(&mut reader).unwrap().expect("drawing");
+        assert!(!info.inline);
+        let pos = info.anchor_position.expect("anchor position");
+        assert_eq!(pos.x_emu, 914400);
+        assert_eq!(pos.y_emu, 457200);
+        assert_eq!(pos.h_relative_from, crate::docx::AnchorFrame::Page);
+        assert_eq!(info.relationship_id, "rId7");
+    }
+
+    #[test]
+    fn parse_drawing_wsp_line_shape() {
+        let xml =
+            br#"<w:drawing xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+                xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
+                xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+                xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">
+            <wp:anchor>
+                <wp:positionH relativeFrom="page"><wp:posOffset>100000</wp:posOffset></wp:positionH>
+                <wp:positionV relativeFrom="page"><wp:posOffset>200000</wp:posOffset></wp:positionV>
+                <wp:extent cx="500000" cy="0"/>
+                <a:graphic><a:graphicData>
+                    <wps:wsp>
+                        <wps:spPr>
+                            <a:prstGeom prst="line"/>
+                            <a:ln w="9525">
+                                <a:solidFill><a:srgbClr val="FF0000"/></a:solidFill>
+                            </a:ln>
+                        </wps:spPr>
+                    </wps:wsp>
+                </a:graphicData></a:graphic>
+            </wp:anchor>
+        </w:drawing>"#;
+        let mut reader = make_content_reader(xml);
+        loop {
+            match reader.read_event().unwrap() {
+                quick_xml::events::Event::Start(ref e) if e.local_name().as_ref() == b"drawing" => {
+                    break;
+                },
+                quick_xml::events::Event::Eof => panic!("no drawing"),
+                _ => {},
+            }
+        }
+        let info = parse_drawing(&mut reader).unwrap().expect("drawing");
+        let shape = info.shape.expect("shape");
+        assert_eq!(shape.kind, crate::docx::ShapeKind::Line);
+        assert_eq!(shape.stroke_rgb, Some((0xFF, 0x00, 0x00)));
+        assert_eq!(shape.stroke_w_emu, Some(9525));
+    }
+
     #[test]
     fn section_properties() {
         let xml = br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
@@ -1193,4 +1786,60 @@ mod tests {
         let margins = sect.margins.as_ref().unwrap();
         assert_eq!(margins.left.0, 1800);
     }
+
+    // ── strip_embedded_font_filename ────────────────────────────────────
+
+    #[test]
+    fn strip_embedded_font_writer_convention() {
+        // Writer convention: font_<n>_<face>.<ext>
+        assert_eq!(
+            strip_embedded_font_filename("font_4_TeXGyreTermesX-Regular.ttf"),
+            "TeXGyreTermesX-Regular"
+        );
+        assert_eq!(strip_embedded_font_filename("font_1_NewTXBMI.ttf"), "NewTXBMI");
+        assert_eq!(strip_embedded_font_filename("font_12_DejaVuSans.otf"), "DejaVuSans");
+    }
+
+    #[test]
+    fn strip_embedded_font_no_prefix_keeps_stem() {
+        // No `font_<n>_` prefix → return the stem unchanged.
+        assert_eq!(strip_embedded_font_filename("Arial.ttf"), "Arial");
+        assert_eq!(strip_embedded_font_filename("MyFont.otf"), "MyFont");
+    }
+
+    #[test]
+    fn strip_embedded_font_no_extension() {
+        // No extension → use the whole input.
+        assert_eq!(strip_embedded_font_filename("font_1_Calibri"), "Calibri");
+        assert_eq!(strip_embedded_font_filename("Calibri"), "Calibri");
+    }
+
+    #[test]
+    fn strip_embedded_font_non_digit_prefix_keeps_stem() {
+        // `font_xxx_<face>` where xxx isn't digits → don't strip.
+        assert_eq!(strip_embedded_font_filename("font_abc_Foo.ttf"), "font_abc_Foo");
+    }
+
+    #[test]
+    fn strip_embedded_font_alphabetic_face_preserved() {
+        // Regression: greedy trim_end_matches(alphabetic) used to eat
+        // the face name. Verify a face with trailing alphabetic chars
+        // survives intact.
+        assert_eq!(
+            strip_embedded_font_filename("font_4_TeXGyreTermesX-Bold.ttf"),
+            "TeXGyreTermesX-Bold"
+        );
+    }
+
+    #[test]
+    fn strip_embedded_font_empty() {
+        assert_eq!(strip_embedded_font_filename(""), "");
+    }
+
+    #[test]
+    fn strip_embedded_font_no_face_after_prefix() {
+        // `font_<n>_` with nothing after the underscore → empty face.
+        // Caller of this helper falls back to the full basename.
+        assert_eq!(strip_embedded_font_filename("font_5_.ttf"), "");
+    }
 }
diff --git a/src/docx/text.rs b/src/docx/text.rs
index 1e2c1bb..3a5bc13 100644
--- a/src/docx/text.rs
+++ b/src/docx/text.rs
@@ -24,21 +24,76 @@ impl DocxDocument {
     }
 
     /// Convert the document to Markdown.
+    ///
+    /// Includes headers and footers around the body so a downstream
+    /// renderer (PDF, HTML, search index) sees the full visible content
+    /// of every page. Without this, simple-but-meaningful artefacts like
+    /// `My header` / `My footer` are silently dropped.
     pub fn to_markdown(&self) -> String {
         let mut out = String::new();
         let ctx = MarkdownCtx {
             styles: self.styles.as_ref(),
             numbering: self.numbering.as_ref(),
         };
+
+        // Decide header/footer split using each section's references.
+        let (header_texts, footer_texts) = split_headers_footers(self, &ctx);
+        for h in &header_texts {
+            out.push_str(h);
+            out.push_str("\n\n");
+        }
+
         markdown_blocks(&self.body.elements, &ctx, &mut out, 0);
+
+        for f in &footer_texts {
+            if !out.ends_with("\n\n") {
+                out.push_str("\n\n");
+            }
+            out.push_str(f);
+            out.push('\n');
+        }
+
         // Trim trailing newlines
         while out.ends_with('\n') {
             out.pop();
         }
         out
     }
 }
 
+/// Split parsed `HeaderFooter` entries into headers vs footers using the
+/// section reference lists. Returns (headers, footers) as deduplicated
+/// markdown-string vectors. We don't currently retain the relationship
+/// IDs that map a section ref to a specific parsed `HeaderFooter`, so we
+/// approximate: header_refs.len() entries from the front go to headers,
+/// the rest go to footers. Correct for the common case (single section
+/// with one of each); on multi-variant documents some misclassification
+/// is possible but text is still preserved (just maybe in the wrong slot).
+fn split_headers_footers(doc: &DocxDocument, ctx: &MarkdownCtx) -> (Vec<String>, Vec<String>) {
+    let mut headers: Vec<String> = Vec::new();
+    let mut footers: Vec<String> = Vec::new();
+    let mut header_seen: std::collections::HashSet<String> = std::collections::HashSet::new();
+    let mut footer_seen: std::collections::HashSet<String> = std::collections::HashSet::new();
+
+    let n_header_refs: usize = doc.sections.iter().map(|s| s.header_refs.len()).sum();
+    for (idx, hf) in doc.headers_footers.iter().enumerate() {
+        let mut buf = String::new();
+        markdown_blocks(&hf.content, ctx, &mut buf, 0);
+        let t = buf.trim().to_string();
+        if t.is_empty() {
+            continue;
+        }
+        if idx < n_header_refs {
+            if header_seen.insert(t.clone()) {
+                headers.push(t);
+            }
+        } else if footer_seen.insert(t.clone()) {
+            footers.push(t);
+        }
+    }
+    (headers, footers)
+}
+
 fn plain_text_blocks(elements: &[BlockElement], out: &mut String) {
     for elem in elements {
         match elem {
diff --git a/src/docx/write.rs b/src/docx/write.rs
index 914e9e2..c0ea70c 100644
--- a/src/docx/write.rs
+++ b/src/docx/write.rs
@@ -47,6 +47,8 @@ use super::Result;
 const CT_DOCUMENT: &str =
     "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml";
 const CT_STYLES: &str = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
+const CT_FONT_TABLE: &str =
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml";
 const CT_NUMBERING: &str =
     "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml";
 const CT_HEADER: &str = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
@@ -443,6 +445,11 @@ pub struct DocxWriter {
     endnotes: Vec<DocxNote>,
     core_props: Option<CoreProps>,
     next_num_id: u32,
+    /// Embedded font programs to ship inside the package under `word/fonts/`.
+    /// Each entry is `(font_name, ttf_or_otf_bytes)`. The reader recognizes
+    /// these and re-uses them to render any downstream conversion (notably
+    /// PDF) so a PDF→DOCX→PDF round-trip preserves typeface fidelity.
+    embedded_fonts: Vec<(String, Vec<u8>)>,
 }
 
 impl DocxWriter {
@@ -456,9 +463,21 @@ impl DocxWriter {
             endnotes: Vec::new(),
             core_props: None,
             next_num_id: 3,
+            embedded_fonts: Vec::new(),
         }
     }
 
+    /// Register a font program (TrueType / OpenType bytes) to ship under
+    /// `word/fonts/`; `name` is used for the file name and as the font name.
+    /// The first registration of a name wins: a repeated name is ignored.
+    pub fn embed_font(&mut self, name: impl Into<String>, data: Vec<u8>) -> &mut Self {
+        let name = name.into();
+        if self.embedded_fonts.iter().all(|(existing, _)| *existing != name) {
+            self.embedded_fonts.push((name, data));
+        }
+        self
+    }
+
     /// Add a plain paragraph with the given text.
     pub fn add_paragraph(&mut self, text: &str) -> &mut Self {
         self.elements
@@ -795,6 +814,42 @@ impl DocxWriter {
             });
         }
 
+        // --- Embed fonts ---
+        // Four pieces have to land together so Word/LibreOffice
+        // actually pick up the font programs:
+        //
+        //   1. The TTF/OTF parts under `/word/fonts/font_<n>_<safe>.ttf`.
+        //   2. `/word/fontTable.xml` listing each font name with an
+        //      `<w:embedRegular r:id="rIdN"/>` reference.
+        //   3. `/word/_rels/fontTable.xml.rels` mapping each rId from
+        //      step 2 to the matching font part.
+        //   4. A relationship in `word/_rels/document.xml.rels` of type
+        //      `…/fontTable` so Word knows where to find fontTable.xml.
+        //
+        // Without all four, the in-process reader still finds the TTFs
+        // by directory scan, but Word silently substitutes Calibri.
+        if !self.embedded_fonts.is_empty() {
+            let font_table_part = PartName::new("/word/fontTable.xml")?;
+            opc.add_part_rel(&doc_part, rel_types::FONT_TABLE, "fontTable.xml");
+
+            // Each font part + the fontTable→font rel.
+            let mut font_entries: Vec<(String, String)> =
+                Vec::with_capacity(self.embedded_fonts.len());
+            for (idx, (name, data)) in self.embedded_fonts.iter().enumerate() {
+                let n = idx + 1;
+                let safe = crate::core::embedded_fonts::sanitize_font_filename(name);
+                let target_rel = format!("fonts/font_{n}_{safe}.ttf");
+                let target_abs = format!("/word/fonts/font_{n}_{safe}.ttf");
+                let part = PartName::new(&target_abs)?;
+                opc.add_part(&part, "application/x-font-ttf", data)?;
+                let rid = opc.add_part_rel(&font_table_part, rel_types::FONT, &target_rel);
+                font_entries.push((name.clone(), rid));
+            }
+
+            let xml = generate_font_table_xml(&font_entries);
+            opc.add_part(&font_table_part, CT_FONT_TABLE, &xml)?;
+        }
+
         // --- Register headers/footers ---
         let mut hf_rids: Vec<(HfType, String)> = Vec::new();
         for (i, hf) in self.headers_footers.iter().enumerate() {
@@ -961,8 +1016,36 @@ impl DocxWriter {
         w.write_event(Event::Start(BytesStart::new("w:body")))
             .expect("write body start");
 
+        // Multi-section DOCX: each non-final `<w:sectPr>` lives inside the
+        // `<w:pPr>` of a paragraph that terminates that section. Only the
+        // final sectPr sits at body level. The previous implementation
+        // dropped every non-final SectPr on the floor, so a multi-section
+        // IR (e.g. one section per source PDF page from `pdf_to_ir`)
+        // collapsed into a single section on the read side and lost all
+        // per-page geometry.
+        //
+        // Find the last `DocxElement::SectPr` index — that's the final
+        // section, written at body level. Every earlier SectPr is emitted
+        // as a synthetic empty paragraph carrying just `<w:pPr><w:sectPr>…</w:sectPr></w:pPr>`,
+        // which `parse_paragraph_properties_fast` recognises and pushes
+        // into `body.section_breaks`. `docx_to_ir` then walks
+        // `section_breaks` to slice elements into per-section windows.
+        let last_sectpr_idx: Option<usize> = self
+            .elements
+            .iter()
+            .rposition(|e| matches!(e, DocxElement::SectPr(_)));
+
         let mut image_counter = 0u32;
-        for element in &self.elements {
+        for (idx, element) in self.elements.iter().enumerate() {
+            if let DocxElement::SectPr(sp) = element {
+                if Some(idx) == last_sectpr_idx {
+                    // Final section is rendered as the body-level sectPr
+                    // below (uses the `sect_pr` info already gathered).
+                    continue;
+                }
+                write_inline_section_break_paragraph(&mut w, sp);
+                continue;
+            }
             write_docx_element(&mut w, element, image_rids, &mut image_counter);
         }
 
@@ -1215,6 +1298,11 @@ fn convert_ir_element_to_docx_elements(elem: &crate::ir::Element, out: &mut Vec<
         },
         E::Footnote(_) | E::Endnote(_) => {},
         E::CodeBlock(cb) => out.push(DocxElement::CodeBlock(cb.content.clone())),
+        E::Shape(_) => {
+            // Vector shapes are emitted by the layout-preserving DOCX
+            // writer in pdf_oxide directly; the markdown-driven IR
+            // writer doesn't have anywhere to put them yet.
+        },
     }
 }
 
@@ -2624,6 +2712,114 @@ fn write_floating_image_run(
         .expect("write p end");
 }
 
+/// Emit a synthetic empty paragraph whose `<w:pPr>` carries an inline
+/// `<w:sectPr>`. Used for non-final section boundaries — the paragraph
+/// is what marks the section break for the reader, and its `<w:sectPr>`
+/// describes the section ending at this point. Header/footer and
+/// footnote references are not emitted on inline sectPr (they're
+/// document-wide and live on the body-level final sectPr only).
+fn write_inline_section_break_paragraph(w: &mut Writer<Vec<u8>>, sp: &DocxSectPr) {
+    w.write_event(Event::Start(BytesStart::new("w:p")))
+        .expect("write inline-section p start");
+    w.write_event(Event::Start(BytesStart::new("w:pPr")))
+        .expect("write inline-section pPr start");
+    write_section_pr_body(w, sp.page_setup.as_ref(), sp.columns.as_ref(), &sp.break_type);
+    w.write_event(Event::End(BytesEnd::new("w:pPr")))
+        .expect("write inline-section pPr end");
+    w.write_event(Event::End(BytesEnd::new("w:p")))
+        .expect("write inline-section p end");
+}
+
+/// Shared `<w:sectPr>` writer — emits the opening tag, the break type,
+/// optional page size/margins and column layout, then the closing tag.
+/// Callers must NOT write the surrounding `<w:sectPr>` tags themselves.
+fn write_section_pr_body(
+    w: &mut Writer<Vec<u8>>,
+    page_setup: Option<&PageSetup>,
+    columns: Option<&ColumnLayout>,
+    break_type: &SectionBreakType,
+) {
+    w.write_event(Event::Start(BytesStart::new("w:sectPr")))
+        .expect("write sectPr start");
+
+    match break_type {
+        SectionBreakType::Continuous => {
+            // Always written explicitly: OOXML reads a missing `<w:type>`
+            // as nextPage, so a continuous break has to be spelled out.
+            let mut t = BytesStart::new("w:type");
+            t.push_attribute(("w:val", "continuous"));
+            w.write_event(Event::Empty(t)).expect("write sect type");
+        },
+        SectionBreakType::NextPage => {
+            let mut t = BytesStart::new("w:type");
+            t.push_attribute(("w:val", "nextPage"));
+            w.write_event(Event::Empty(t)).expect("write sect type");
+        },
+        SectionBreakType::EvenPage => {
+            let mut t = BytesStart::new("w:type");
+            t.push_attribute(("w:val", "evenPage"));
+            w.write_event(Event::Empty(t)).expect("write sect type");
+        },
+        SectionBreakType::OddPage => {
+            let mut t = BytesStart::new("w:type");
+            t.push_attribute(("w:val", "oddPage"));
+            w.write_event(Event::Empty(t)).expect("write sect type");
+        },
+    }
+
+    if let Some(ps) = page_setup {
+        let mut pg_sz = BytesStart::new("w:pgSz");
+        pg_sz.push_attribute(("w:w", ps.width_twips.to_string().as_str()));
+        pg_sz.push_attribute(("w:h", ps.height_twips.to_string().as_str()));
+        if ps.landscape {
+            pg_sz.push_attribute(("w:orient", "landscape"));
+        }
+        w.write_event(Event::Empty(pg_sz)).expect("write pgSz");
+
+        let mut pg_mar = BytesStart::new("w:pgMar");
+        pg_mar.push_attribute(("w:top", ps.margin_top_twips.to_string().as_str()));
+        pg_mar.push_attribute(("w:bottom", ps.margin_bottom_twips.to_string().as_str()));
+        pg_mar.push_attribute(("w:left", ps.margin_left_twips.to_string().as_str()));
+        pg_mar.push_attribute(("w:right", ps.margin_right_twips.to_string().as_str()));
+        pg_mar.push_attribute(("w:header", ps.header_distance_twips.to_string().as_str()));
+        pg_mar.push_attribute(("w:footer", ps.footer_distance_twips.to_string().as_str()));
+        w.write_event(Event::Empty(pg_mar)).expect("write pgMar");
+    }
+
+    if let Some(cols) = columns {
+        if cols.column_widths_twips.is_empty() {
+            let mut c = BytesStart::new("w:cols");
+            c.push_attribute(("w:num", cols.count.to_string().as_str()));
+            if let Some(sp) = cols.space_twips {
+                c.push_attribute(("w:space", sp.to_string().as_str()));
+            }
+            if cols.separator {
+                c.push_attribute(("w:sep", "1"));
+            }
+            w.write_event(Event::Empty(c)).expect("write cols");
+        } else {
+            let mut c = BytesStart::new("w:cols");
+            c.push_attribute(("w:num", cols.count.to_string().as_str()));
+            if cols.separator {
+                c.push_attribute(("w:sep", "1"));
+            }
+            w.write_event(Event::Start(c)).expect("write cols start");
+            let default_space = cols.space_twips.unwrap_or(720);
+            for &cw in &cols.column_widths_twips {
+                let mut col = BytesStart::new("w:col");
+                col.push_attribute(("w:w", cw.to_string().as_str()));
+                col.push_attribute(("w:space", default_space.to_string().as_str()));
+                w.write_event(Event::Empty(col)).expect("write col");
+            }
+            w.write_event(Event::End(BytesEnd::new("w:cols")))
+                .expect("write cols end");
+        }
+    }
+
+    w.write_event(Event::End(BytesEnd::new("w:sectPr")))
+        .expect("write sectPr end");
+}
+
 fn write_body_sect_pr(w: &mut Writer<Vec<u8>>, sp: &SectPrInfo) {
     w.write_event(Event::Start(BytesStart::new("w:sectPr")))
         .expect("write sectPr start");
@@ -2895,6 +3091,46 @@ fn generate_core_props_xml(props: &CoreProps) -> Vec<u8> {
     w.into_inner()
 }
 
+// ---------------------------------------------------------------------------
+// fontTable.xml generator
+// ---------------------------------------------------------------------------
+
+/// Render `word/fontTable.xml`: one `<w:font w:name="…">` per entry,
+/// each holding an `<w:embedRegular r:id="…"/>` that points at the
+/// embedded font part. `entries` pairs a font name with the
+/// relationship id of its part. Per the writer's notes, Word matches
+/// `<w:rFonts w:ascii="…"/>` names against this table and otherwise
+/// substitutes Calibri / Cambria despite the TTFs under `/word/fonts/`.
+fn generate_font_table_xml(entries: &[(String, String)]) -> Vec<u8> {
+    let mut xml = Writer::new_with_indent(Vec::new(), b' ', 2);
+    xml.write_event(Event::Decl(BytesDecl::new("1.0", Some("UTF-8"), Some("yes"))))
+        .expect("decl");
+
+    let mut root = BytesStart::new("w:fonts");
+    root.push_attribute(("xmlns:w", crate::core::xml::ns::WML_STR));
+    root.push_attribute(("xmlns:r", crate::core::xml::ns::R_STR));
+    xml.write_event(Event::Start(root)).expect("fonts start");
+
+    for (face, rel_id) in entries {
+        // <w:embedRegular r:id="rIdN"/> — Word treats this as the regular-weight
+        // glyph source for the named font face. Built first, written second.
+        let mut embed_ref = BytesStart::new("w:embedRegular");
+        embed_ref.push_attribute(("r:id", rel_id.as_str()));
+
+        let mut font_open = BytesStart::new("w:font");
+        font_open.push_attribute(("w:name", face.as_str()));
+        xml.write_event(Event::Start(font_open)).expect("font start");
+        xml.write_event(Event::Empty(embed_ref)).expect("embedRegular");
+
+        xml.write_event(Event::End(BytesEnd::new("w:font")))
+            .expect("font end");
+    }
+
+    xml.write_event(Event::End(BytesEnd::new("w:fonts")))
+        .expect("fonts end");
+    xml.into_inner()
+}
+
 // ---------------------------------------------------------------------------
 // Styles and numbering generators
 // ---------------------------------------------------------------------------
@@ -2949,14 +3185,60 @@ fn write_paragraph_style(
     w.write_event(Event::Empty(name_elem))
         .expect("write style name");
 
+    // basedOn Normal so heading styles inherit body defaults.
+    if outline_level.is_some() {
+        let mut based = BytesStart::new("w:basedOn");
+        based.push_attribute(("w:val", "Normal"));
+        w.write_event(Event::Empty(based)).expect("write basedOn");
+    }
+
     if let Some(level) = outline_level {
         w.write_event(Event::Start(BytesStart::new("w:pPr")))
             .expect("write pPr start");
+        // Spacing before/after for visual breathing room around the heading.
+        let mut sp = BytesStart::new("w:spacing");
+        sp.push_attribute(("w:before", "240")); // 12 pt
+        sp.push_attribute(("w:after", "120")); //  6 pt
+        w.write_event(Event::Empty(sp)).expect("write spacing");
         let mut lvl = BytesStart::new("w:outlineLvl");
         lvl.push_attribute(("w:val", level.to_string().as_str()));
         w.write_event(Event::Empty(lvl)).expect("write outlineLvl");
         w.write_event(Event::End(BytesEnd::new("w:pPr")))
             .expect("write pPr end");
+
+        // Run properties — size & bold per Word's default heading scale.
+        // Without this, every <w:pStyle val="HeadingN"/> in the body
+        // renders as plain Normal — the headings disappear visually.
+        let (sz_half_pt, bold, italic, color) = match level {
+            0 => (56, true, false, "2F5496"), // Heading 1: 28 pt
+            1 => (44, true, false, "2F5496"), // Heading 2: 22 pt
+            2 => (32, true, false, "1F3864"), // Heading 3: 16 pt
+            3 => (28, true, true, "2F5496"),  // Heading 4: 14 pt italic
+            4 => (24, true, false, "2F5496"), // Heading 5: 12 pt
+            _ => (22, true, true, "1F3864"),  // Heading 6: 11 pt italic
+        };
+        w.write_event(Event::Start(BytesStart::new("w:rPr")))
+            .expect("write rPr start");
+        if bold {
+            w.write_event(Event::Empty(BytesStart::new("w:b")))
+                .expect("write b");
+        }
+        if italic {
+            w.write_event(Event::Empty(BytesStart::new("w:i")))
+                .expect("write i");
+        }
+        let mut col = BytesStart::new("w:color");
+        col.push_attribute(("w:val", color));
+        w.write_event(Event::Empty(col)).expect("write color");
+        let sz_str = sz_half_pt.to_string();
+        let mut sz = BytesStart::new("w:sz");
+        sz.push_attribute(("w:val", sz_str.as_str()));
+        w.write_event(Event::Empty(sz)).expect("write sz");
+        let mut sz_cs = BytesStart::new("w:szCs");
+        sz_cs.push_attribute(("w:val", sz_str.as_str()));
+        w.write_event(Event::Empty(sz_cs)).expect("write szCs");
+        w.write_event(Event::End(BytesEnd::new("w:rPr")))
+            .expect("write rPr end");
     }
 
     w.write_event(Event::End(BytesEnd::new("w:style")))
diff --git a/src/ir.rs b/src/ir.rs
index 093615b..1ebe1e9 100644
--- a/src/ir.rs
+++ b/src/ir.rs
@@ -550,6 +550,12 @@ pub struct Section {
     pub even_page_header: Option<HeaderFooter>,
     /// Footer used on even-numbered pages of this section.
     pub even_page_footer: Option<HeaderFooter>,
+    /// Solid background colour for this section (RGB).
+    /// PPTX: parsed from `<p:cSld><p:bg><p:bgPr><a:solidFill>` on the slide.
+    /// Image / gradient backgrounds are intentionally skipped — only the
+    /// solid case round-trips through this minimal field.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub background_rgb: Option<[u8; 3]>,
 }
 
 /// A block-level content element.
@@ -581,15 +587,77 @@ pub enum Element {
     Endnote(Note),
     /// A preformatted code block.
     CodeBlock(CodeBlock),
+    /// A vector shape (line / rectangle) anchored on the page. Used by
+    /// the layout-preserving DOCX path to round-trip rules and dividers.
+    Shape(Shape),
+}
+
+/// A vector shape placed by EMU offsets from its `h_anchor` / `v_anchor` frames.
+#[allow(dead_code)]
+#[derive(Debug, Clone, Default, PartialEq, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub struct Shape {
+    /// Geometry kind (line or rectangle).
+    pub kind: ShapeGeom,
+    /// X offset from the horizontal anchor, in EMUs (may be negative).
+    pub x_emu: i64,
+    /// Y offset from the vertical anchor, in EMUs (may be negative).
+    pub y_emu: i64,
+    /// Width in EMUs.
+    pub width_emu: u64,
+    /// Height in EMUs.
+    pub height_emu: u64,
+    /// Horizontal anchor reference frame; falls back to its default when absent.
+    #[serde(default)]
+    pub h_anchor: FloatAnchor,
+    /// Vertical anchor reference frame; falls back to its default when absent.
+    #[serde(default)]
+    pub v_anchor: FloatAnchor,
+    /// Stroke colour as `[r, g, b]` (0..255 each); omitted from output when `None`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub stroke_rgb: Option<[u8; 3]>,
+    /// Fill colour as `[r, g, b]` (0..255 each); omitted from output when `None`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub fill_rgb: Option<[u8; 3]>,
+    /// Stroke width in EMUs; omitted from output when `None`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub stroke_w_emu: Option<i64>,
+}
+
+/// Vector-shape geometry kinds we currently round-trip (serialized in snake_case).
+#[allow(dead_code)]
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ShapeGeom {
+    /// Straight line from `(x, y)` to `(x + width, y + height)`; the `Default` kind.
+    #[default]
+    Line,
+    /// Axis-aligned rectangle.
+    Rect,
+}
 
 /// A heading element with a nesting level.
 #[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
 pub struct Heading {
     /// Heading level 1–6 (1 = largest).
+    #[serde(default = "default_heading_level")]
     pub level: u8,
     /// Inline content of the heading.
+    #[serde(default)]
     pub content: Vec<InlineContent>,
+    /// Absolute frame position for layout-preserving DOCX
+    /// (mirrors `Paragraph::frame_position`).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub frame_position: Option<FramePosition>,
+    /// Horizontal alignment (mirrors `Paragraph::alignment`). PDF
+    /// title pages often centre their headings; without this the
+    /// round-trip flattens them to left-aligned.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub alignment: Option<ParagraphAlignment>,
+}
+
+fn default_heading_level() -> u8 {
+    1 // serde fallback when `level` is absent (note: derived `Heading::default()` gives 0)
+}
 
 /// A paragraph of inline content.
@@ -626,6 +694,27 @@ pub struct Paragraph {
     pub page_break_before: bool,
     /// Outline level (0 = body text, 1–9 = heading levels).
     pub outline_level: Option<u8>,
+    /// Absolute frame position (from `<w:framePr>`). Present when the
+    /// DOCX uses page-anchored frames for layout-preserving content
+    /// (see pdf_oxide's `to_docx_bytes_layout`). Twips relative to the
+    /// page origin (top-left).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub frame_position: Option<FramePosition>,
+}
+
+/// Absolute frame position for a paragraph anchored to the page.
+/// Mirrors the subset of the OOXML `<w:framePr>` attributes needed to
+/// reproduce visual layout in downstream renderers. All values are twips.
+#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
+pub struct FramePosition {
+    /// X position in twips, anchored to the page origin (top-left).
+    pub x_twips: i32,
+    /// Y position in twips, anchored to the page origin (top-left).
+    pub y_twips: i32,
+    /// Frame width in twips.
+    pub width_twips: i32,
+    /// Frame height in twips.
+    pub height_twips: i32,
+}
 
 /// Inline content within a paragraph or heading.
@@ -643,6 +732,26 @@ pub enum InlineContent {
     EndnoteRef(FootnoteRef),
 }
 
+/// Return the paragraph-level font size, in points, implied by a run
+/// of inline content.
+///
+/// The result is the *first* `font_size_half_pt` declared on a
+/// `Text` span, halved to convert half-points to points (e.g. 18
+/// half-pt → 9 pt). Spans without a size and non-text inline items
+/// are skipped; `None` means no text span declares a size.
+///
+/// Renderers and writers call this when they need one size for a
+/// whole paragraph. Later spans never override the first declared
+/// size, so mixed-size paragraphs (drop-caps, math marks mid-line)
+/// lose their variation — that's the deliberate trade-off of taking
+/// the first span as representative of the body text.
+pub fn first_inline_font_size_pt(content: &[InlineContent]) -> Option<f32> {
+    content.iter().find_map(|item| match item {
+        InlineContent::Text(span) => span.font_size_half_pt.map(|half_pt| half_pt as f32 / 2.0),
+        _ => None,
+    })
+}
+
 /// A styled run of text.
 #[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
 pub struct TextSpan {
@@ -789,6 +898,69 @@ pub struct ListItem {
     pub nested: Option<List>,
 }
 
+/// Turn one list item's inline run into its block-level content:
+/// a single default `Paragraph` holding `content`, or no elements at
+/// all when `content` is empty. List builders use this to fill each
+/// item's `Vec<Element>` content slot.
+pub fn inline_to_element_block(content: Vec<InlineContent>) -> Vec<Element> {
+    if content.is_empty() {
+        return Vec::new();
+    }
+    let paragraph = Paragraph {
+        content,
+        ..Default::default()
+    };
+    vec![Element::Paragraph(paragraph)]
+}
+
+/// Build a nested `List` from a flat `(level, inline)` sequence.
+///
+/// Every item claims the contiguous run of strictly deeper items that
+/// follows it as its `nested` sub-list (built recursively). Levels are
+/// 0-indexed, and an item shallower than `base_level` is treated as if
+/// it sat at `base_level`. An item *deeper* than `base_level` (a skipped
+/// level, or a list that opens indented) stays at the current depth and
+/// keeps its same-level siblings beside it rather than dropping them.
+///
+/// Used by both `convert_docx` and `convert_pptx` to translate flat
+/// `<w:numPr w:ilvl=…>` / `<a:p lvl=…>` paragraph streams into the
+/// IR's tree-shaped `List`.
+pub fn build_nested_list(
+    ordered: bool,
+    items: &[(u8, Vec<InlineContent>)],
+    base_level: u8,
+) -> List {
+    let mut list_items = Vec::new();
+    let mut idx = 0;
+
+    while idx < items.len() {
+        let (level, content) = &items[idx];
+        // Effective depth of this item: never shallower than this list.
+        let depth = (*level).max(base_level);
+        let nested_start = idx + 1;
+        // Children are the contiguous run of strictly deeper items.
+        let nested_len = items[nested_start..]
+            .iter()
+            .take_while(|(lvl, _)| *lvl > depth)
+            .count();
+        let nested_end = nested_start + nested_len;
+        let nested = (nested_len > 0).then(|| {
+            build_nested_list(ordered, &items[nested_start..nested_end], depth.saturating_add(1))
+        });
+        list_items.push(ListItem {
+            content: inline_to_element_block(content.clone()),
+            nested,
+        });
+        idx = nested_end;
+    }
+
+    List {
+        ordered,
+        items: list_items,
+        ..Default::default()
+    }
+}
+
 /// An embedded image reference.
 #[derive(Debug, Clone, PartialEq, Default, serde::Serialize, serde::Deserialize)]
 pub struct Image {
@@ -813,3 +985,194 @@ pub struct Image {
     #[serde(default)]
     pub positioning: ImagePositioning,
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // ── first_inline_font_size_pt ────────────────────────────────────
+
+    #[test]
+    fn first_font_size_returns_half_pt_as_pt() {
+        let content = vec![InlineContent::Text(TextSpan {
+            text: "hi".into(),
+            font_size_half_pt: Some(24), // 12pt
+            ..Default::default()
+        })];
+        assert_eq!(first_inline_font_size_pt(&content), Some(12.0));
+    }
+
+    #[test]
+    fn first_font_size_picks_first_declared() {
+        // Second span's size is ignored — the first declared one wins.
+        let content = vec![
+            InlineContent::Text(TextSpan {
+                text: "a".into(),
+                font_size_half_pt: Some(20), // 10pt
+                ..Default::default()
+            }),
+            InlineContent::Text(TextSpan {
+                text: "b".into(),
+                font_size_half_pt: Some(48), // 24pt — ignored
+                ..Default::default()
+            }),
+        ];
+        assert_eq!(first_inline_font_size_pt(&content), Some(10.0));
+    }
+
+    #[test]
+    fn first_font_size_skips_unsized_runs() {
+        // First run has no size; second does → returns the second's size.
+        let content = vec![
+            InlineContent::Text(TextSpan {
+                text: "a".into(),
+                ..Default::default()
+            }),
+            InlineContent::Text(TextSpan {
+                text: "b".into(),
+                font_size_half_pt: Some(16), // 8pt
+                ..Default::default()
+            }),
+        ];
+        assert_eq!(first_inline_font_size_pt(&content), Some(8.0));
+    }
+
+    #[test]
+    fn first_font_size_empty_returns_none() {
+        assert_eq!(first_inline_font_size_pt(&[]), None);
+    }
+
+    #[test]
+    fn first_font_size_all_unsized_returns_none() {
+        let content = vec![
+            InlineContent::Text(TextSpan::plain("a")),
+            InlineContent::Text(TextSpan::plain("b")),
+        ];
+        assert_eq!(first_inline_font_size_pt(&content), None);
+    }
+
+    // ── inline_to_element_block ──────────────────────────────────────
+
+    #[test]
+    fn inline_to_element_block_empty_returns_empty() {
+        let result = inline_to_element_block(vec![]);
+        assert!(result.is_empty());
+    }
+
+    #[test]
+    fn inline_to_element_block_wraps_in_paragraph() {
+        let inline = vec![InlineContent::Text(TextSpan::plain("hello"))];
+        let result = inline_to_element_block(inline);
+        assert_eq!(result.len(), 1);
+        match &result[0] {
+            Element::Paragraph(p) => {
+                assert_eq!(p.content.len(), 1);
+                assert!(matches!(
+                    &p.content[0],
+                    InlineContent::Text(s) if s.text == "hello"
+                ));
+            },
+            _ => panic!("expected Paragraph"),
+        }
+    }
+
+    // ── build_nested_list ────────────────────────────────────────────
+
+    fn item(level: u8, text: &str) -> (u8, Vec<InlineContent>) { // flat fixture entry
+        (level, vec![InlineContent::Text(TextSpan::plain(text))])
+    }
+
+    fn list_item_text(item: &ListItem) -> String { // concatenated paragraph text
+        let mut out = String::new();
+        for el in &item.content {
+            if let Element::Paragraph(p) = el {
+                for c in &p.content {
+                    if let InlineContent::Text(s) = c {
+                        out.push_str(&s.text);
+                    }
+                }
+            }
+        }
+        out
+    }
+
+    #[test]
+    fn build_nested_list_flat() {
+        let items = vec![item(0, "A"), item(0, "B"), item(0, "C")];
+        let list = build_nested_list(false, &items, 0);
+        assert!(!list.ordered);
+        assert_eq!(list.items.len(), 3);
+        assert!(list.items.iter().all(|li| li.nested.is_none()));
+        assert_eq!(list_item_text(&list.items[1]), "B");
+    }
+
+    #[test]
+    fn build_nested_list_two_levels() {
+        // Top:   A
+        //   sub: A.1, A.2
+        // Top:   B
+        let items = vec![item(0, "A"), item(1, "A.1"), item(1, "A.2"), item(0, "B")];
+        let list = build_nested_list(true, &items, 0);
+        assert!(list.ordered);
+        assert_eq!(list.items.len(), 2);
+        let nested = list.items[0].nested.as_ref().expect("A has nested");
+        assert_eq!(nested.items.len(), 2);
+        assert_eq!(list_item_text(&nested.items[0]), "A.1");
+        assert_eq!(list_item_text(&nested.items[1]), "A.2");
+        // B has no nested children.
+        assert!(list.items[1].nested.is_none());
+    }
+
+    #[test]
+    fn build_nested_list_three_levels() {
+        let items = vec![item(0, "A"), item(1, "A.1"), item(2, "A.1.x"), item(0, "B")];
+        let list = build_nested_list(false, &items, 0);
+        let l1 = list.items[0].nested.as_ref().unwrap();
+        assert_eq!(l1.items.len(), 1);
+        let l2 = l1.items[0].nested.as_ref().unwrap();
+        assert_eq!(l2.items.len(), 1);
+        assert_eq!(list_item_text(&l2.items[0]), "A.1.x");
+    }
+
+    #[test]
+    fn build_nested_list_empty() {
+        let list = build_nested_list(false, &[], 0);
+        assert!(list.items.is_empty());
+    }
+
+    // ── TextSpan::plain ──────────────────────────────────────────────
+
+    #[test]
+    fn text_span_plain_has_default_styling() {
+        let s = TextSpan::plain("hi");
+        assert_eq!(s.text, "hi");
+        assert!(!s.bold);
+        assert!(!s.italic);
+        assert!(s.font_size_half_pt.is_none());
+        assert!(s.hyperlink.is_none());
+    }
+
+    // ── Shape defaults / FramePosition serde round-trip ──────────────
+
+    #[test]
+    fn shape_default_is_line_at_origin() {
+        let s = Shape::default();
+        assert!(matches!(s.kind, ShapeGeom::Line));
+        assert_eq!(s.x_emu, 0);
+        assert_eq!(s.width_emu, 0);
+        assert!(s.stroke_rgb.is_none());
+    }
+
+    #[test]
+    fn frame_position_round_trips_via_serde() {
+        let fp = FramePosition {
+            x_twips: 720,
+            y_twips: 1080,
+            width_twips: 5000,
+            height_twips: 400,
+        };
+        let json = serde_json::to_string(&fp).unwrap();
+        let back: FramePosition = serde_json::from_str(&json).unwrap();
+        assert_eq!(fp, back);
+    }
+}
diff --git a/src/ir_from_markdown.rs b/src/ir_from_markdown.rs
index 21e4884..dc0887d 100644
--- a/src/ir_from_markdown.rs
+++ b/src/ir_from_markdown.rs
@@ -123,6 +123,7 @@ impl<'a> MarkdownParser<'a> {
                     current.elements.push(Element::Heading(Heading {
                         level,
                         content: parse_inline(&text),
+                        ..Default::default()
                     }));
                 }
                 continue;
diff --git a/src/ir_render.rs b/src/ir_render.rs
index 74f8df8..3b9c8f8 100644
--- a/src/ir_render.rs
+++ b/src/ir_render.rs
@@ -1,5 +1,125 @@
 use crate::ir::*;
 
+mod block_default {
+    //! Shared fallback rendering for [`Element`] variants that don't
+    //! carry a meaningful inline / paragraph / heading shape.
+    //!
+    //! Every `default_*` function matches **exhaustively** on `Element`,
+    //! so adding a variant fails to compile here until someone decides
+    //! whether it is invisible in flow output or needs dedicated
+    //! handling in a renderer. The renderers in the parent module keep
+    //! arms only for variants whose output differs from these
+    //! defaults and forward everything else to the matching function
+    //! here via `other => default_X(other)`.
+    use super::*;
+
+    /// Signature shared by the parent module's per-element renderers
+    /// (`render_element_plain` / `_markdown` / `_html`).
+    type Renderer = fn(&Element) -> String;
+
+    /// Render each child with `render` and join the results with
+    /// `separator`. Used for the container variants (`TextBox`,
+    /// `Footnote`, `Endnote`).
+    fn join_children<'a>(
+        children: impl Iterator<Item = &'a Element>,
+        render: Renderer,
+        separator: &str,
+    ) -> String {
+        children.map(render).collect::<Vec<_>>().join(separator)
+    }
+
+    /// Plain-text default. `ThematicBreak` → `"---"` (matches markdown);
+    /// container elements render their children recursively, separated
+    /// by a blank line; every other variant → `""`.
+    pub fn default_plain(element: &Element) -> String {
+        let render: Renderer = super::render_element_plain;
+        match element {
+            Element::ThematicBreak => String::from("---"),
+            Element::TextBox(tb) => join_children(tb.content.iter(), render, "\n\n"),
+            Element::Footnote(note) | Element::Endnote(note) => {
+                join_children(note.content.iter(), render, "\n\n")
+            },
+            // Invisible in flow: shapes are positioned rather than flowed,
+            // page / column breaks have no plain-text counterpart, and
+            // this fallback emits no text for an image.
+            Element::PageBreak | Element::ColumnBreak | Element::Shape(_) | Element::Image(_) => {
+                String::new()
+            },
+            // These variants have rich flow output and are handled by
+            // `render_element_plain` before it delegates here. Reaching
+            // this arm means a renderer forgot a real arm; an empty
+            // string is returned instead of panicking so the document
+            // still renders. They are spelled out (no `_`) so that the
+            // compiler flags newly added variants.
+            Element::Heading(_)
+            | Element::Paragraph(_)
+            | Element::Table(_)
+            | Element::List(_)
+            | Element::CodeBlock(_) => String::new(),
+        }
+    }
+
+    /// Markdown default. Same as plain except that children go through
+    /// the markdown renderer and an image becomes `![alt]()` (with an
+    /// empty alt when none is set).
+    pub fn default_markdown(element: &Element) -> String {
+        let render: Renderer = super::render_element_markdown;
+        match element {
+            Element::ThematicBreak => String::from("---"),
+            Element::TextBox(tb) => join_children(tb.content.iter(), render, "\n\n"),
+            Element::Footnote(note) | Element::Endnote(note) => {
+                join_children(note.content.iter(), render, "\n\n")
+            },
+            // No markdown counterpart for breaks or positioned shapes.
+            Element::PageBreak | Element::ColumnBreak | Element::Shape(_) => String::new(),
+            Element::Image(img) => match img.alt_text.as_deref() {
+                Some(alt) => format!("![{alt}]()"),
+                None => String::from("![]()"),
+            },
+            // Rendered by `render_element_markdown` itself; listed only
+            // to keep the match exhaustive.
+            Element::Heading(_)
+            | Element::Paragraph(_)
+            | Element::Table(_)
+            | Element::List(_)
+            | Element::CodeBlock(_) => String::new(),
+        }
+    }
+
+    /// HTML default. `ThematicBreak` → `<hr />`; an image renders as
+    /// `<img alt="…" />` with the alt text HTML-escaped; containers
+    /// join their rendered children with a single newline; every other
+    /// variant → `""`.
+    pub fn default_html(element: &Element) -> String {
+        let render: Renderer = super::render_element_html;
+        match element {
+            Element::ThematicBreak => String::from("<hr />"),
+            Element::TextBox(tb) => join_children(tb.content.iter(), render, "\n"),
+            Element::Footnote(note) | Element::Endnote(note) => {
+                join_children(note.content.iter(), render, "\n")
+            },
+            // No HTML counterpart for breaks or positioned shapes.
+            Element::PageBreak | Element::ColumnBreak | Element::Shape(_) => String::new(),
+            Element::Image(img) => {
+                // A missing alt text becomes an empty `alt` attribute.
+                let alt = img
+                    .alt_text
+                    .as_deref()
+                    .map(super::escape_html)
+                    .unwrap_or_default();
+                format!("<img alt=\"{alt}\" />")
+            },
+            // Rendered by `render_element_html` itself; listed only to
+            // keep the match exhaustive.
+            Element::Heading(_)
+            | Element::Paragraph(_)
+            | Element::Table(_)
+            | Element::List(_)
+            | Element::CodeBlock(_) => String::new(),
+        }
+    }
+}
+
 impl DocumentIR {
     /// Render the IR as plain text.
     pub fn plain_text(&self) -> String {
@@ -65,29 +185,15 @@ fn render_element_plain(element: &Element) -> String {
         Element::Paragraph(p) => render_inline_plain(&p.content),
         Element::Table(t) => render_table_plain(t),
         Element::List(l) => render_list_plain(l, 0),
-        Element::Image(img) => {
-            if let Some(ref alt) = img.alt_text {
-                format!("[{alt}]")
-            } else {
-                String::new()
-            }
+        Element::Image(img) => match &img.alt_text {
+            Some(alt) => format!("[{alt}]"),
+            None => String::new(),
         },
-        Element::ThematicBreak => "---".to_string(),
-        Element::TextBox(tb) => tb
-            .content
-            .iter()
-            .map(render_element_plain)
-            .collect::<Vec<_>>()
-            .join("\n\n"),
-        Element::PageBreak => String::new(),
-        Element::ColumnBreak => String::new(),
-        Element::Footnote(n) | Element::Endnote(n) => n
-            .content
-            .iter()
-            .map(render_element_plain)
-            .collect::<Vec<_>>()
-            .join("\n\n"),
         Element::CodeBlock(cb) => cb.content.clone(),
+        // Invisible-in-flow / container variants delegated to the
+        // shared default. Adding a new `Element` variant forces a
+        // compile error in `block_default::default_plain`, not here.
+        other => block_default::default_plain(other),
     }
 }
 
@@ -170,29 +276,13 @@ fn render_element_markdown(element: &Element) -> String {
         Element::Paragraph(p) => render_inline_markdown(&p.content),
         Element::Table(t) => render_table_markdown(t),
         Element::List(l) => render_list_markdown(l, 0),
-        Element::Image(img) => {
-            let alt = img.alt_text.as_deref().unwrap_or("");
-            format!("![{alt}]()")
-        },
-        Element::ThematicBreak => "---".to_string(),
-        Element::TextBox(tb) => tb
-            .content
-            .iter()
-            .map(render_element_markdown)
-            .collect::<Vec<_>>()
-            .join("\n\n"),
-        Element::PageBreak => String::new(),
-        Element::ColumnBreak => String::new(),
-        Element::Footnote(n) | Element::Endnote(n) => n
-            .content
-            .iter()
-            .map(render_element_markdown)
-            .collect::<Vec<_>>()
-            .join("\n\n"),
         Element::CodeBlock(cb) => {
             let lang = cb.language.as_deref().unwrap_or("");
             format!("```{lang}\n{}\n```", cb.content)
         },
+        // Invisible-in-flow / container / image variants delegated
+        // to the shared default — see `block_default::default_markdown`.
+        other => block_default::default_markdown(other),
     }
 }
 
@@ -358,29 +448,13 @@ fn render_element_html(element: &Element) -> String {
         },
         Element::Table(t) => render_table_html(t),
         Element::List(l) => render_list_html(l),
-        Element::Image(img) => {
-            let alt = img.alt_text.as_deref().map(escape_html).unwrap_or_default();
-            format!("<img alt=\"{alt}\" />")
-        },
-        Element::ThematicBreak => "<hr />".to_string(),
-        Element::TextBox(tb) => tb
-            .content
-            .iter()
-            .map(render_element_html)
-            .collect::<Vec<_>>()
-            .join("\n"),
-        Element::PageBreak => String::new(),
-        Element::ColumnBreak => String::new(),
-        Element::Footnote(n) | Element::Endnote(n) => n
-            .content
-            .iter()
-            .map(render_element_html)
-            .collect::<Vec<_>>()
-            .join("\n"),
         Element::CodeBlock(cb) => {
             let escaped = escape_html(&cb.content);
             format!("<pre><code>{escaped}</code></pre>")
         },
+        // Invisible-in-flow / container / image variants delegated
+        // to the shared default — see `block_default::default_html`.
+        other => block_default::default_html(other),
     }
 }
 
@@ -500,6 +574,7 @@ mod tests {
         let ir = simple_ir(vec![Element::Heading(Heading {
             level: 2,
             content: vec![span("Title")],
+            ..Default::default()
         })]);
         assert_eq!(ir.to_markdown(), "## Title");
     }
@@ -694,4 +769,58 @@ mod tests {
         assert!(html.contains("<li><p>First</p></li>"));
         assert!(html.contains("<li><p>Second</p></li>"));
     }
+
+    // ── Defaults centralized in `block_default` ──────────────────────
+
+    #[test]
+    fn thematic_break_renders_as_hr_in_plain() {
+        let rendered = simple_ir(vec![Element::ThematicBreak]).plain_text();
+        assert_eq!(rendered, "---");
+    }
+
+    #[test]
+    fn thematic_break_renders_in_markdown() {
+        let markdown = simple_ir(vec![Element::ThematicBreak]).to_markdown();
+        assert!(markdown.contains("---"));
+    }
+
+    #[test]
+    fn page_break_invisible_in_plain() {
+        // A page break has no plain-text counterpart; the paragraphs on
+        // either side of it must still show up in the output.
+        let elements = vec![para("before"), Element::PageBreak, para("after")];
+        let plain = simple_ir(elements).plain_text();
+        for expected in ["before", "after"] {
+            assert!(plain.contains(expected));
+        }
+    }
+
+    #[test]
+    fn shape_invisible_in_plain() {
+        // A default shape sits between two paragraphs; both paragraphs
+        // must still be present in the plain-text output.
+        let shape = Element::Shape(Shape::default());
+        let elements = vec![para("before"), shape, para("after")];
+        let plain = simple_ir(elements).plain_text();
+        for expected in ["before", "after"] {
+            assert!(plain.contains(expected));
+        }
+    }
+
+    #[test]
+    fn text_box_recursively_renders_children() {
+        let text_box = TextBox {
+            content: vec![para("inside")],
+            ..Default::default()
+        };
+        let plain = simple_ir(vec![Element::TextBox(text_box)]).plain_text();
+        assert!(plain.contains("inside"), "plain: {plain}");
+    }
+
+    #[test]
+    fn html_thematic_break() {
+        // Only the tag prefix is checked, so `<hr>` and `<hr />` both pass.
+        let html = simple_ir(vec![Element::ThematicBreak]).to_html();
+        assert!(html.contains("<hr"), "html: {html}");
+    }
 }
diff --git a/src/pptx/mod.rs b/src/pptx/mod.rs
index 1583dac..6807ca0 100644
--- a/src/pptx/mod.rs
+++ b/src/pptx/mod.rs
@@ -56,6 +56,10 @@ pub struct PptxDocument {
     pub slides: Vec<Slide>,
     /// Theme data (colors, fonts), if present.
     pub theme: Option<Theme>,
+    /// Font programs found under `ppt/fonts/`. Each entry is
+    /// `(font_name, ttf_or_otf_bytes)`. PDF→PPTX→PDF round-trips use
+    /// these to preserve the source typeface (mirrors the DOCX side).
+    pub embedded_fonts: Vec<(String, Vec<u8>)>,
 }
 
 impl PptxDocument {
@@ -101,6 +105,10 @@ impl PptxDocument {
             slide_data: Vec<u8>,
             slide_rels: Relationships,
             notes_data: Option<Vec<u8>>,
+            /// rId → (raw bytes, format-extension lowercase like "png" / "jpeg").
+            /// Pre-resolved here in Phase 1 so the parallel slide parser
+            /// (Phase 2) doesn't need access to the OPC reader.
+            media: std::collections::HashMap<String, (Vec<u8>, String)>,
         }
         let mut bundles = Vec::with_capacity(presentation.slides.len());
         for (slide_idx, slide_id) in presentation.slides.iter().enumerate() {
@@ -136,28 +144,87 @@ impl PptxDocument {
                     None
                 };
 
+            // Pre-load all IMAGE-relationship parts the slide references.
+            // PPTX picture frames carry `<a:blip r:embed="rIdN"/>`; the
+            // relationship resolves to a part like `/ppt/media/image3.png`.
+            // Parsing happens in parallel below and can't use the OPC
+            // reader, so we materialise the bytes here keyed by rId.
+            let mut media = std::collections::HashMap::new();
+            for rel in slide_rels.all() {
+                if rel.rel_type != rel_types::IMAGE {
+                    continue;
+                }
+                let target = match part_name.resolve_relative(&rel.target) {
+                    Ok(t) => t,
+                    Err(_) => continue,
+                };
+                if !opc.has_part(&target) {
+                    continue;
+                }
+                let bytes = match opc.read_part(&target) {
+                    Ok(b) => b,
+                    Err(_) => continue,
+                };
+                let ext = std::path::Path::new(&rel.target)
+                    .extension()
+                    .and_then(|s| s.to_str())
+                    .map(|s| s.to_ascii_lowercase())
+                    .unwrap_or_else(|| guess_format_from_bytes(&bytes).to_string());
+                media.insert(rel.id.clone(), (bytes, ext));
+            }
+
             bundles.push(SlideBundle {
                 slide_data,
                 slide_rels,
                 notes_data,
+                media,
             });
         }
 
         // Phase 2: parse slides (parallel when feature enabled)
         let slides = crate::core::parallel::map_collect(bundles, |b| -> Result<Slide> {
             let name = xml_csl_name(&b.slide_data);
-            let mut parsed = Slide::parse(&b.slide_data, name, &b.slide_rels)?;
+            let mut parsed = Slide::parse(&b.slide_data, name, &b.slide_rels, &b.media)?;
             if let Some(notes_data) = &b.notes_data {
                 parsed.notes = extract_notes_text(notes_data);
             }
             Ok(parsed)
         })?;
 
-        debug!("PptxDocument: {} slides parsed", slides.len());
+        // Scan `ppt/fonts/` for embedded font programs. Mirrors the DOCX
+        // reader (`word/fonts/`).
+        let mut embedded_fonts: Vec<(String, Vec<u8>)> = Vec::new();
+        for name in opc.part_names() {
+            let s = name.to_string();
+            if !s.starts_with("/ppt/fonts/") {
+                continue;
+            }
+            let lower = s.to_lowercase();
+            if !(lower.ends_with(".ttf") || lower.ends_with(".otf")) {
+                continue;
+            }
+            if let Ok(data) = opc.read_part(&name) {
+                let basename = s.rsplit('/').next().unwrap_or("font");
+                let face = crate::docx::strip_embedded_font_filename(basename);
+                let font_name = if face.is_empty() {
+                    basename.to_string()
+                } else {
+                    face
+                };
+                embedded_fonts.push((font_name, data));
+            }
+        }
+
+        debug!(
+            "PptxDocument: {} slides parsed, {} embedded fonts",
+            slides.len(),
+            embedded_fonts.len()
+        );
         Ok(PptxDocument {
             presentation,
             slides,
             theme,
+            embedded_fonts,
         })
     }
 }
@@ -192,6 +259,32 @@ fn extract_notes_text(xml_data: &[u8]) -> Option<String> {
     slide::extract_notes_text(xml_data)
 }
 
+/// Best-effort image-format detection from the raw bytes.
+///
+/// Used as a fallback when the relationship target has no recognisable
+/// extension. The leading bytes are compared against a table of known
+/// magic numbers; the first match wins and anything unrecognised is
+/// reported as `"png"`. Returns a lowercase extension string suitable for
+/// round-tripping back into `office_oxide::ir::ImageFormat::extension()`.
+fn guess_format_from_bytes(bytes: &[u8]) -> &'static str {
+    // (magic prefix, extension) pairs, probed top to bottom.
+    const SIGNATURES: [(&[u8], &str); 9] = [
+        (b"\x89PNG", "png"),
+        (b"\xFF\xD8\xFF", "jpeg"),
+        (b"GIF87a", "gif"),
+        (b"GIF89a", "gif"),
+        (b"BM", "bmp"),
+        (b"\xD7\xCD\xC6\x9A", "wmf"),
+        (b"\x01\x00\x00\x00", "emf"),
+        (b"II*\0", "tiff"),
+        (b"MM\0*", "tiff"),
+    ];
+    SIGNATURES
+        .iter()
+        .find(|(magic, _)| bytes.starts_with(magic))
+        .map_or("png", |&(_, ext)| ext)
+}
+
 impl crate::core::OfficeDocument for PptxDocument {
     fn plain_text(&self) -> String {
         self.plain_text()
diff --git a/src/pptx/shape.rs b/src/pptx/shape.rs
index b7ac63b..21c94fe 100644
--- a/src/pptx/shape.rs
+++ b/src/pptx/shape.rs
@@ -41,6 +41,14 @@ pub struct PictureShape {
     pub alt_text: Option<String>,
     /// Bounding box position and size in EMU.
     pub position: Option<ShapePosition>,
+    /// Relationship ID (`r:embed`) of the underlying media part, if any.
+    pub embed_rid: Option<String>,
+    /// Raw image bytes resolved via `embed_rid`, if the slide carried a
+    /// resolvable IMAGE relationship at parse time.
+    pub data: Option<Vec<u8>>,
+    /// Image format inferred from the relationship target extension or
+    /// byte signature (e.g. `"png"`, `"jpeg"`, `"gif"`, `"emf"`).
+    pub format: Option<String>,
 }
 
 /// A group of child shapes (`<p:grpSp>`).
@@ -123,10 +131,16 @@ pub struct TextBody {
 }
 
 /// A single paragraph within a text body (`<a:p>`).
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
 pub struct TextParagraph {
     /// Outline level (0 = top level).
     pub level: u32,
+    /// Paragraph alignment from `<a:pPr algn="…"/>`. None when the
+    /// attribute is absent (renderer-default left alignment).
+    pub alignment: Option<crate::ir::ParagraphAlignment>,
+    /// Space before the paragraph, in 100ths of a point — read from
+    /// `<a:pPr><a:spcBef><a:spcPts val="…"/></a:spcBef></a:pPr>`.
+    pub space_before_hundredths_pt: Option<u32>,
     /// Inline content items in this paragraph.
     pub content: Vec<TextContent>,
 }
@@ -143,7 +157,7 @@ pub enum TextContent {
 }
 
 /// A text run with optional character formatting (`<a:r>`).
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Default)]
 pub struct TextRun {
     /// The text content of this run.
     pub text: String,
@@ -155,6 +169,13 @@ pub struct TextRun {
     pub strikethrough: bool,
     /// Hyperlink attached to this run, if any.
     pub hyperlink: Option<HyperlinkInfo>,
+    /// Font size in hundredths of a point (`<a:rPr sz="1800"/>` → `Some(1800)` = 18 pt).
+    /// `None` when the run inherits its size from the placeholder/master.
+    pub font_size_hundredths_pt: Option<u32>,
+    /// Explicit run colour from `<a:rPr><a:solidFill><a:srgbClr val="…"/></a:solidFill></a:rPr>`.
+    /// `None` when the run inherits its colour from the placeholder /
+    /// theme, or when the fill is non-sRGB (gradient, scheme colour).
+    pub color_rgb: Option<[u8; 3]>,
 }
 
 /// An auto-updated field inside a paragraph (`<a:fld>`).
diff --git a/src/pptx/slide.rs b/src/pptx/slide.rs
index e72b8ba..35ee540 100644
--- a/src/pptx/slide.rs
+++ b/src/pptx/slide.rs
@@ -11,8 +11,21 @@ use super::shape::{
 
 type CoreResult<T> = crate::core::Result<T>;
 
-/// Parsed run properties: (bold, italic, strikethrough, hyperlink).
-type RunProps = (Option<bool>, Option<bool>, bool, Option<HyperlinkInfo>);
+/// Parsed run properties: (bold, italic, strikethrough, hyperlink,
+/// font_size_hundredths_pt, rgb). NOTE(review): the last slot is presumably
+/// the explicit run colour that ends up in `TextRun::color_rgb` — confirm.
+///
+/// PPTX `<a:rPr sz="..."/>` carries font size in hundredths of a point
+/// (e.g. `sz="1800"` = 18 pt); without it PDF→PPTX→PDF round-trips fell
+/// back to the writer's 12 pt default (8-page A4 sources grew to ~30 pages).
+type RunProps = (
+    Option<bool>,
+    Option<bool>,
+    bool,
+    Option<HyperlinkInfo>,
+    Option<u32>,
+    Option<[u8; 3]>,
+);
 
 /// A parsed PPTX slide.
 #[derive(Debug, Clone)]
@@ -23,6 +36,11 @@ pub struct Slide {
     pub shapes: Vec<Shape>,
     /// Speaker notes text, if a notes slide is present.
     pub notes: Option<String>,
+    /// Solid background colour (RGB) extracted from the slide's
+    /// `<p:cSld><p:bg><p:bgPr><a:solidFill>` element. Only the solid
+    /// case is parsed; gradient / image / theme-reference fills are
+    /// dropped silently and surface as `None`.
+    pub background_rgb: Option<[u8; 3]>,
 }
 
 /// Create a fast reader that does NOT trim text content.
@@ -35,14 +53,23 @@ fn make_content_reader(xml_data: &[u8]) -> quick_xml::Reader<&[u8]> {
 
 impl Slide {
     /// Parse a slide from its XML data.
-    pub(crate) fn parse(xml_data: &[u8], name: String, rels: &Relationships) -> CoreResult<Self> {
+    pub(crate) fn parse(
+        xml_data: &[u8],
+        name: String,
+        rels: &Relationships,
+        media: &std::collections::HashMap<String, (Vec<u8>, String)>,
+    ) -> CoreResult<Self> {
         let mut reader = make_content_reader(xml_data);
         let mut shapes = Vec::new();
+        let mut background_rgb = None;
 
         loop {
             match reader.read_event()? {
+                Event::Start(ref e) if e.local_name().as_ref() == b"bg" => {
+                    background_rgb = parse_slide_bg(&mut reader)?;
+                },
                 Event::Start(ref e) if e.local_name().as_ref() == b"spTree" => {
-                    shapes = parse_shape_tree(&mut reader, rels)?;
+                    shapes = parse_shape_tree(&mut reader, rels, media)?;
                 },
                 Event::Eof => break,
                 _ => {},
@@ -53,10 +80,75 @@ impl Slide {
             name,
             shapes,
             notes: None,
+            background_rgb,
         })
     }
 }
 
+/// Parse `<p:bg>` looking for a single solid-fill colour.
+///
+/// Returns `Some([r, g, b])` for an `<a:solidFill>` holding an
+/// `<a:srgbClr val="RRGGBB">`, whether that tag is self-closing or wraps
+/// colour transforms such as `<a:alpha>`. Gradient, blip / image and
+/// scheme / theme (`<p:bgRef>`) backgrounds yield `None`.
+fn parse_slide_bg(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult<Option<[u8; 3]>> {
+    let mut rgb = None;
+    let mut depth = 1u32;
+    let mut in_solid_fill = false;
+    loop {
+        let event = reader.read_event()?;
+        match &event {
+            Event::Start(e) | Event::Empty(e) => {
+                let opens_scope = matches!(&event, Event::Start(_));
+                match e.local_name().as_ref() {
+                    b"solidFill" if opens_scope => in_solid_fill = true,
+                    b"srgbClr" if in_solid_fill => {
+                        if let Some(val) = xml::optional_attr_str(e, b"val")? {
+                            rgb = parse_hex_rgb(val.as_ref());
+                        }
+                    },
+                    _ => {},
+                }
+                depth += u32::from(opens_scope); // only a Start tag nests deeper
+            },
+            Event::End(e) => {
+                if e.local_name().as_ref() == b"solidFill" {
+                    in_solid_fill = false;
+                }
+                depth -= 1;
+                if depth == 0 {
+                    break;
+                }
+            },
+            Event::Eof => break,
+            _ => {},
+        }
+    }
+    Ok(rgb)
+}
+
+/// Parse a 6-character hex colour (e.g. `"0E273B"`) into `[r, g, b]`.
+///
+/// Both upper- and lower-case hex digits are accepted. Returns `None`
+/// when the input is not exactly six bytes long or contains a byte
+/// that is not an ASCII hex digit.
+fn parse_hex_rgb(s: &str) -> Option<[u8; 3]> {
+    let raw = s.as_bytes();
+    if raw.len() != 6 {
+        return None;
+    }
+    let mut rgb = [0u8; 3];
+    // Each channel is one pair of hex digits: high nibble, then low.
+    for (channel, pair) in rgb.iter_mut().zip(raw.chunks_exact(2)) {
+        // `to_digit(16)` rejects everything outside 0-9 / a-f / A-F.
+        let hi = char::from(pair[0]).to_digit(16)?;
+        let lo = char::from(pair[1]).to_digit(16)?;
+        // Two nibbles never exceed 0xFF, so the narrowing is lossless.
+        *channel = (hi * 16 + lo) as u8;
+    }
+    Some(rgb)
+}
+
 // ---------------------------------------------------------------------------
 // Shape tree parsing
 // ---------------------------------------------------------------------------
@@ -64,6 +156,7 @@ impl Slide {
 fn parse_shape_tree(
     reader: &mut quick_xml::Reader<&[u8]>,
     rels: &Relationships,
+    media: &std::collections::HashMap<String, (Vec<u8>, String)>,
 ) -> CoreResult<Vec<Shape>> {
     let mut shapes = Vec::new();
 
@@ -71,8 +164,8 @@ fn parse_shape_tree(
         match reader.read_event()? {
             Event::Start(ref e) => match e.local_name().as_ref() {
                 b"sp" => shapes.push(parse_auto_shape(reader, rels)?),
-                b"pic" => shapes.push(parse_picture(reader)?),
-                b"grpSp" => shapes.push(parse_group_shape(reader, rels)?),
+                b"pic" => shapes.push(parse_picture(reader, media)?),
+                b"grpSp" => shapes.push(parse_group_shape(reader, rels, media)?),
                 b"graphicFrame" => shapes.push(parse_graphic_frame(reader, rels)?),
                 b"cxnSp" => shapes.push(parse_connector(reader)?),
                 _ => {
@@ -147,11 +240,15 @@ fn parse_auto_shape(
 // PictureShape (p:pic)
 // ---------------------------------------------------------------------------
 
-fn parse_picture(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult<Shape> {
+fn parse_picture(
+    reader: &mut quick_xml::Reader<&[u8]>,
+    media: &std::collections::HashMap<String, (Vec<u8>, String)>,
+) -> CoreResult<Shape> {
     let mut id = 0u32;
     let mut name = String::new();
     let mut alt_text = None;
     let mut position = None;
+    let mut embed_rid: Option<String> = None;
 
     loop {
         match reader.read_event()? {
@@ -163,7 +260,7 @@ fn parse_picture(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult<Shape> {
                     alt_text = props.2;
                 },
                 b"blipFill" => {
-                    xml::skip_element_fast(reader)?;
+                    embed_rid = parse_blip_fill_embed(reader)?;
                 },
                 b"spPr" => {
                     position = parse_shape_properties(reader)?;
@@ -180,14 +277,71 @@ fn parse_picture(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult<Shape> {
         }
     }
 
+    let (data, format) = match embed_rid.as_deref().and_then(|rid| media.get(rid)) {
+        Some((bytes, ext)) => (Some(bytes.clone()), Some(ext.clone())),
+        None => (None, None),
+    };
+
     Ok(Shape::Picture(PictureShape {
         id,
         name,
         alt_text,
         position,
+        embed_rid,
+        data,
+        format,
     }))
 }
 
+/// Walk `<p:blipFill>…</p:blipFill>` and return the `r:embed` value of
+/// the first `<a:blip>` that carries one, if any. Everything else in
+/// the element (stretch, crop, tile) is consumed and ignored — only the
+/// embed rId is needed to resolve the underlying media part.
+fn parse_blip_fill_embed(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult<Option<String>> {
+    let mut embed: Option<String> = None;
+    let mut depth: u32 = 1;
+    loop {
+        let event = reader.read_event()?;
+        match &event {
+            Event::Start(tag) | Event::Empty(tag) => {
+                // `<a:blip>` may be self-closing or have children.
+                if embed.is_none() && tag.local_name().as_ref() == b"blip" {
+                    embed = read_blip_embed_attr(tag)?;
+                }
+                // Only a Start tag has a matching End to wait for.
+                if matches!(&event, Event::Start(_)) {
+                    depth += 1;
+                }
+            },
+            Event::End(_) => {
+                depth -= 1;
+                if depth == 0 {
+                    break;
+                }
+            },
+            Event::Eof => break,
+            _ => {},
+        }
+    }
+    Ok(embed)
+}
+
+fn read_blip_embed_attr(e: &quick_xml::events::BytesStart) -> CoreResult<Option<String>> {
+    // Return the unescaped value of the first `embed` attribute on the
+    // tag — normally `r:embed="rIdN"` — or `None` when there is none.
+    for attr in e.attributes().with_checks(false) {
+        let attr = attr.map_err(crate::core::Error::from)?;
+        let key = attr.key.as_ref();
+        // Match on the local name so any namespace prefix is accepted.
+        if key != b"embed" && !key.ends_with(b":embed") {
+            continue;
+        }
+        let value = attr.unescape_value().map_err(crate::core::Error::from)?;
+        return Ok(Some(value.into_owned()));
+    }
+    Ok(None)
+}
+
 // ---------------------------------------------------------------------------
 // GroupShape (p:grpSp)
 // ---------------------------------------------------------------------------
@@ -195,6 +349,7 @@ fn parse_picture(reader: &mut quick_xml::Reader<&[u8]>) -> CoreResult<Shape> {
 fn parse_group_shape(
     reader: &mut quick_xml::Reader<&[u8]>,
     rels: &Relationships,
+    media: &std::collections::HashMap<String, (Vec<u8>, String)>,
 ) -> CoreResult<Shape> {
     let mut id = 0u32;
     let mut name = String::new();
@@ -213,8 +368,8 @@ fn parse_group_shape(
                     position = parse_grp_shape_properties(reader)?;
                 },
                 b"sp" => children.push(parse_auto_shape(reader, rels)?),
-                b"pic" => children.push(parse_picture(reader)?),
-                b"grpSp" => children.push(parse_group_shape(reader, rels)?),
+                b"pic" => children.push(parse_picture(reader, media)?),
+                b"grpSp" => children.push(parse_group_shape(reader, rels, media)?),
                 b"graphicFrame" => children.push(parse_graphic_frame(reader, rels)?),
                 b"cxnSp" => children.push(parse_connector(reader)?),
                 _ => {
@@ -686,9 +841,23 @@ fn parse_text_paragraph(
     reader: &mut quick_xml::Reader<&[u8]>,
     rels: &Relationships,
 ) -> CoreResult<TextParagraph> {
+    use crate::ir::ParagraphAlignment;
     let mut level = 0u32;
+    let mut alignment: Option<ParagraphAlignment> = None;
+    let mut space_before_hundredths_pt: Option<u32> = None;
     let mut content = Vec::new();
 
+    let parse_algn = |e: &quick_xml::events::BytesStart| -> CoreResult<Option<ParagraphAlignment>> {
+        Ok(xml::optional_attr_str(e, b"algn")?.and_then(|v| match v.as_ref() {
+            "l" => Some(ParagraphAlignment::Left),
+            "ctr" => Some(ParagraphAlignment::Center),
+            "r" => Some(ParagraphAlignment::Right),
+            "just" | "justLow" => Some(ParagraphAlignment::Justify),
+            "dist" | "thaiDist" => Some(ParagraphAlignment::Distribute),
+            _ => None,
+        }))
+    };
+
     loop {
         match reader.read_event()? {
             Event::Start(ref e) => match e.local_name().as_ref() {
@@ -696,7 +865,41 @@ fn parse_text_paragraph(
                     level = xml::optional_attr_str(e, b"lvl")?
                         .and_then(|v| v.parse().ok())
                         .unwrap_or(0);
-                    xml::skip_element_fast(reader)?;
+                    alignment = parse_algn(e)?;
+                    // <a:pPr> with body — scan for <a:spcBef><a:spcPts/>
+                    let depth_start = 1i32;
+                    let mut depth = depth_start;
+                    let mut in_spc_bef = false;
+                    loop {
+                        match reader.read_event()? {
+                            Event::Start(ref ee) => {
+                                depth += 1;
+                                if ee.local_name().as_ref() == b"spcBef" {
+                                    in_spc_bef = true;
+                                }
+                            },
+                            Event::Empty(ref ee) => {
+                                if in_spc_bef && ee.local_name().as_ref() == b"spcPts" {
+                                    if let Some(v) = xml::optional_attr_str(ee, b"val")? {
+                                        if let Ok(n) = v.parse::<u32>() {
+                                            space_before_hundredths_pt = Some(n);
+                                        }
+                                    }
+                                }
+                            },
+                            Event::End(ref ee) => {
+                                depth -= 1;
+                                if ee.local_name().as_ref() == b"spcBef" {
+                                    in_spc_bef = false;
+                                }
+                                if depth <= 0 && ee.local_name().as_ref() == b"pPr" {
+                                    break;
+                                }
+                            },
+                            Event::Eof => break,
+                            _ => {},
+                        }
+                    }
                 },
                 b"r" => {
                     content.push(TextContent::Run(parse_text_run(reader, rels)?));
@@ -717,6 +920,7 @@ fn parse_text_paragraph(
                     level = xml::optional_attr_str(e, b"lvl")?
                         .and_then(|v| v.parse().ok())
                         .unwrap_or(0);
+                    alignment = parse_algn(e)?;
                 },
                 b"br" => {
                     content.push(TextContent::LineBreak);
@@ -731,7 +935,12 @@ fn parse_text_paragraph(
         }
     }
 
-    Ok(TextParagraph { level, content })
+    Ok(TextParagraph {
+        level,
+        alignment,
+        space_before_hundredths_pt,
+        content,
+    })
 }
 
 /// Parse `<a:r>` text run.
@@ -744,6 +953,8 @@ fn parse_text_run(
     let mut italic = None;
     let mut strikethrough = false;
     let mut hyperlink = None;
+    let mut font_size_hundredths_pt = None;
+    let mut color_rgb: Option<[u8; 3]> = None;
 
     loop {
         match reader.read_event()? {
@@ -754,6 +965,8 @@ fn parse_text_run(
                     italic = props.1;
                     strikethrough = props.2;
                     hyperlink = props.3;
+                    font_size_hundredths_pt = props.4;
+                    color_rgb = props.5;
                 },
                 b"t" => {
                     text = xml::read_text_content_fast(reader)?;
@@ -768,6 +981,8 @@ fn parse_text_run(
                 italic = props.1;
                 strikethrough = props.2;
                 hyperlink = props.3;
+                font_size_hundredths_pt = props.4;
+                color_rgb = props.5;
             },
             Event::End(ref e) if e.local_name().as_ref() == b"r" => {
                 break;
@@ -783,6 +998,8 @@ fn parse_text_run(
         italic,
         strikethrough,
         hyperlink,
+        font_size_hundredths_pt,
+        color_rgb,
     })
 }
 
@@ -796,27 +1013,52 @@ fn parse_run_properties(
     let italic = parse_bool_attr(start, b"i")?;
     let strike = xml::optional_attr_str(start, b"strike")?;
     let strikethrough = strike.as_deref().is_some_and(|v| v != "noStrike");
+    let font_size_hundredths_pt = parse_u32_attr(start, b"sz")?;
     let mut hyperlink = None;
+    let mut color_rgb: Option<[u8; 3]> = None;
+    // Track whether we are inside `<a:solidFill>` so we only pick up
+    // the inner `<a:srgbClr>` (the fill colour proper) and not
+    // unrelated `<a:srgbClr>` elements that may appear in sibling
+    // elements (e.g. `<a:highlight><a:srgbClr/>` for highlight colour).
+    let mut in_solid_fill = false;
 
     loop {
         match reader.read_event()? {
-            Event::Start(ref e) | Event::Empty(ref e)
-                if e.local_name().as_ref() == b"hlinkClick" =>
-            {
-                hyperlink = parse_hlink_click(e, rels)?;
+            Event::Start(ref e) => {
+                if e.local_name().as_ref() == b"solidFill" {
+                    in_solid_fill = true;
+                } else if e.local_name().as_ref() == b"hlinkClick" {
+                    hyperlink = parse_hlink_click(e, rels)?;
+                }
             },
-            Event::End(ref e) if e.local_name().as_ref() == b"rPr" => {
-                break;
+            Event::Empty(ref e) => {
+                if e.local_name().as_ref() == b"hlinkClick" {
+                    hyperlink = parse_hlink_click(e, rels)?;
+                } else if in_solid_fill
+                    && e.local_name().as_ref() == b"srgbClr"
+                    && color_rgb.is_none()
+                {
+                    color_rgb = parse_srgb_clr(e);
+                }
+            },
+            Event::End(ref e) => {
+                if e.local_name().as_ref() == b"solidFill" {
+                    in_solid_fill = false;
+                } else if e.local_name().as_ref() == b"rPr" {
+                    break;
+                }
             },
             Event::Eof => break,
             _ => {},
         }
     }
 
-    Ok((bold, italic, strikethrough, hyperlink))
+    Ok((bold, italic, strikethrough, hyperlink, font_size_hundredths_pt, color_rgb))
 }
 
-/// Parse run properties from an `<a:rPr/>` Empty element.
+/// Parse run properties from an `<a:rPr/>` Empty element. Empty
+/// elements cannot carry a `<a:solidFill>` child so `color_rgb`
+/// is always `None` on this path.
 fn parse_run_properties_empty(
     e: &quick_xml::events::BytesStart,
     _rels: &Relationships,
@@ -825,7 +1067,29 @@ fn parse_run_properties_empty(
     let italic = parse_bool_attr(e, b"i")?;
     let strike = xml::optional_attr_str(e, b"strike")?;
     let strikethrough = strike.as_deref().is_some_and(|v| v != "noStrike");
-    Ok((bold, italic, strikethrough, None))
+    let font_size_hundredths_pt = parse_u32_attr(e, b"sz")?;
+    Ok((bold, italic, strikethrough, None, font_size_hundredths_pt, None))
+}
+
+/// Decode `val="RRGGBB"` from `<a:srgbClr/>` into `[r, g, b]`. Returns `None` when
+/// the attribute is absent, unreadable, or not exactly six ASCII hex digits.
+fn parse_srgb_clr(e: &quick_xml::events::BytesStart) -> Option<[u8; 3]> {
+    let val = xml::optional_attr_str(e, b"val").ok().flatten()?;
+    let s = val.as_ref();
+    // ASCII guard: keeps the byte slices below on char boundaries and rejects `+` signs.
+    if s.len() != 6 || !s.bytes().all(|c| c.is_ascii_hexdigit()) {
+        return None;
+    }
+    let r = u8::from_str_radix(&s[0..2], 16).ok()?;
+    let g = u8::from_str_radix(&s[2..4], 16).ok()?;
+    let b = u8::from_str_radix(&s[4..6], 16).ok()?;
+    Some([r, g, b])
+}
+
+/// Read attribute `key` from `e` as a `u32`; absent or unparseable values yield `None`.
+fn parse_u32_attr(e: &quick_xml::events::BytesStart, key: &[u8]) -> CoreResult<Option<u32>> {
+    let raw = xml::optional_attr_str(e, key)?;
+    Ok(raw.and_then(|text| text.parse::<u32>().ok()))
+}
 
 /// Parse a DrawingML boolean attribute: `b="1"` → Some(true), `b="0"` → Some(false), absent → None.
@@ -994,7 +1258,8 @@ pub(crate) fn extract_notes_text(xml_data: &[u8]) -> Option<String> {
     loop {
         match reader.read_event() {
             Ok(Event::Start(ref e)) if e.local_name().as_ref() == b"spTree" => {
-                shapes = parse_shape_tree(&mut reader, &rels).ok()?;
+                shapes =
+                    parse_shape_tree(&mut reader, &rels, &std::collections::HashMap::new()).ok()?;
             },
             Ok(Event::Eof) => break,
             Err(_) => break,
@@ -1093,7 +1358,9 @@ mod tests {
         );
 
         let rels = Relationships::empty();
-        let slide = Slide::parse(&xml, "Slide1".to_string(), &rels).unwrap();
+        let slide =
+            Slide::parse(&xml, "Slide1".to_string(), &rels, &std::collections::HashMap::new())
+                .unwrap();
 
         assert_eq!(slide.shapes.len(), 1);
         if let Shape::AutoShape(ref auto) = slide.shapes[0] {
@@ -1151,7 +1418,8 @@ mod tests {
         );
 
         let rels = Relationships::empty();
-        let slide = Slide::parse(&xml, String::new(), &rels).unwrap();
+        let slide =
+            Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap();
 
         assert_eq!(slide.shapes.len(), 1);
         if let Shape::Group(ref grp) = slide.shapes[0] {
@@ -1225,7 +1493,8 @@ mod tests {
         );
 
         let rels = Relationships::empty();
-        let slide = Slide::parse(&xml, String::new(), &rels).unwrap();
+        let slide =
+            Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap();
 
         assert_eq!(slide.shapes.len(), 1);
         if let Shape::GraphicFrame(ref gf) = slide.shapes[0] {
@@ -1266,7 +1535,8 @@ mod tests {
         );
 
         let rels = Relationships::empty();
-        let slide = Slide::parse(&xml, String::new(), &rels).unwrap();
+        let slide =
+            Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap();
 
         assert_eq!(slide.shapes.len(), 1);
         if let Shape::Picture(ref pic) = slide.shapes[0] {
@@ -1300,7 +1570,8 @@ mod tests {
         );
 
         let rels = Relationships::empty();
-        let slide = Slide::parse(&xml, String::new(), &rels).unwrap();
+        let slide =
+            Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap();
 
         assert_eq!(slide.shapes.len(), 1);
         if let Shape::Connector(ref cxn) = slide.shapes[0] {
@@ -1336,7 +1607,8 @@ mod tests {
         );
 
         let rels = Relationships::empty();
-        let slide = Slide::parse(&xml, String::new(), &rels).unwrap();
+        let slide =
+            Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap();
 
         if let Shape::AutoShape(ref auto) = slide.shapes[0] {
             let tb = auto.text_body.as_ref().unwrap();
@@ -1372,7 +1644,8 @@ mod tests {
         );
 
         let rels = Relationships::empty();
-        let slide = Slide::parse(&xml, String::new(), &rels).unwrap();
+        let slide =
+            Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap();
 
         if let Shape::AutoShape(ref auto) = slide.shapes[0] {
             let tb = auto.text_body.as_ref().unwrap();
@@ -1427,4 +1700,321 @@ mod tests {
         let text = extract_notes_text(xml).unwrap();
         assert_eq!(text, "Speaker notes here\nSecond line");
     }
+
+    // ── New: blip rId extraction, font size, alignment, space_before, bg ─
+
+    #[test]
+    fn run_carries_font_size_from_sz_attr() {
+        // `sz="1800"` (18 pt) must land on the run as 1800 hundredths of a point;
+        // an unexpected shape or content kind fails the test instead of passing.
+        let xml = make_slide_xml(
+            r#"<p:sp>
+  <p:nvSpPr><p:cNvPr id="7" name="T"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
+  <p:spPr/>
+  <p:txBody>
+    <a:bodyPr/>
+    <a:p>
+      <a:r>
+        <a:rPr sz="1800"/>
+        <a:t>sized</a:t>
+      </a:r>
+    </a:p>
+  </p:txBody>
+</p:sp>"#,
+        );
+
+        let rels = Relationships::empty();
+        let slide =
+            Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap();
+        let tb = match slide.shapes[0] {
+            Shape::AutoShape(ref a) => a.text_body.as_ref().unwrap(),
+            _ => panic!("expected auto shape"),
+        };
+        match tb.paragraphs[0].content[0] {
+            TextContent::Run(ref r) => assert_eq!(r.font_size_hundredths_pt, Some(1800)),
+            _ => panic!("expected run"),
+        }
+    }
+
+    #[test]
+    fn run_font_size_absent_when_sz_missing() {
+        let xml = make_slide_xml(
+            r#"<p:sp>
+  <p:nvSpPr><p:cNvPr id="8" name="T"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
+  <p:spPr/>
+  <p:txBody>
+    <a:bodyPr/>
+    <a:p>
+      <a:r><a:t>unsized</a:t></a:r>
+    </a:p>
+  </p:txBody>
+</p:sp>"#,
+        );
+
+        let rels = Relationships::empty();
+        let slide =
+            Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap();
+        // Panic on an unexpected shape/content kind so the check cannot pass vacuously.
+        let first = match slide.shapes[0] {
+            Shape::AutoShape(ref a) => &a.text_body.as_ref().unwrap().paragraphs[0].content[0],
+            _ => panic!("expected auto shape"),
+        };
+        assert!(matches!(first, TextContent::Run(r) if r.font_size_hundredths_pt.is_none()));
+    }
+
+    #[test]
+    fn paragraph_alignment_parsed_from_algn_attr() {
+        use crate::ir::ParagraphAlignment;
+        let xml = make_slide_xml(
+            r#"<p:sp>
+  <p:nvSpPr><p:cNvPr id="9" name="T"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
+  <p:spPr/>
+  <p:txBody>
+    <a:bodyPr/>
+    <a:p>
+      <a:pPr algn="ctr"/>
+      <a:r><a:t>centered</a:t></a:r>
+    </a:p>
+  </p:txBody>
+</p:sp>"#,
+        );
+        let rels = Relationships::empty();
+        let slide =
+            Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap();
+        let para = match slide.shapes[0] {
+            Shape::AutoShape(ref a) => &a.text_body.as_ref().unwrap().paragraphs[0],
+            _ => panic!("expected auto shape"),
+        };
+        assert_eq!(para.alignment, Some(ParagraphAlignment::Center));
+    }
+
+    #[test]
+    fn paragraph_alignment_all_variants() {
+        use crate::ir::ParagraphAlignment;
+        // Also covers the `justLow` / `thaiDist` aliases of Justify / Distribute.
+        let cases = [
+            ("l", ParagraphAlignment::Left),
+            ("ctr", ParagraphAlignment::Center),
+            ("r", ParagraphAlignment::Right),
+            ("just", ParagraphAlignment::Justify),
+            ("justLow", ParagraphAlignment::Justify),
+            ("dist", ParagraphAlignment::Distribute),
+            ("thaiDist", ParagraphAlignment::Distribute),
+        ];
+        let rels = Relationships::empty();
+        let media = std::collections::HashMap::new();
+        for (algn, expected) in cases {
+            let xml = make_slide_xml(&format!(
+                r#"<p:sp>
+  <p:nvSpPr><p:cNvPr id="9" name="T"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
+  <p:spPr/>
+  <p:txBody>
+    <a:bodyPr/>
+    <a:p>
+      <a:pPr algn="{algn}"/>
+      <a:r><a:t>x</a:t></a:r>
+    </a:p>
+  </p:txBody>
+</p:sp>"#
+            ));
+            let slide = Slide::parse(&xml, String::new(), &rels, &media).unwrap();
+            let para = match slide.shapes[0] {
+                Shape::AutoShape(ref a) => &a.text_body.as_ref().unwrap().paragraphs[0],
+                _ => panic!("expected auto shape for algn={algn}"),
+            };
+            assert_eq!(para.alignment, Some(expected), "algn={algn}");
+        }
+    }
+
+    #[test]
+    fn paragraph_space_before_parsed_from_spc_bef() {
+        let xml = make_slide_xml(
+            r#"<p:sp>
+  <p:nvSpPr><p:cNvPr id="11" name="T"/><p:cNvSpPr/><p:nvPr/></p:nvSpPr>
+  <p:spPr/>
+  <p:txBody>
+    <a:bodyPr/>
+    <a:p>
+      <a:pPr>
+        <a:spcBef><a:spcPts val="1200"/></a:spcBef>
+      </a:pPr>
+      <a:r><a:t>spaced</a:t></a:r>
+    </a:p>
+  </p:txBody>
+</p:sp>"#,
+        );
+        let rels = Relationships::empty();
+        let slide =
+            Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap();
+        let para = match slide.shapes[0] {
+            Shape::AutoShape(ref a) => &a.text_body.as_ref().unwrap().paragraphs[0],
+            _ => panic!("expected auto shape"),
+        };
+        assert_eq!(para.space_before_hundredths_pt, Some(1200));
+    }
+
+    #[test]
+    fn picture_embed_resolves_via_media_map() {
+        // The media map is keyed by the same rId the slide xml embeds, so
+        // parse_picture can resolve the embed → bytes.
+        let xml = make_slide_xml(
+            r#"<p:pic>
+  <p:nvPicPr>
+    <p:cNvPr id="33" name="Photo"/>
+    <p:cNvPicPr/>
+    <p:nvPr/>
+  </p:nvPicPr>
+  <p:blipFill>
+    <a:blip r:embed="rId7"/>
+  </p:blipFill>
+  <p:spPr>
+    <a:xfrm><a:off x="0" y="0"/><a:ext cx="100" cy="100"/></a:xfrm>
+  </p:spPr>
+</p:pic>"#,
+        );
+
+        const BYTES: [u8; 4] = [0xDE, 0xAD, 0xBE, 0xEF];
+        let mut media = std::collections::HashMap::new();
+        media.insert(String::from("rId7"), (BYTES.to_vec(), String::from("png")));
+        let slide = Slide::parse(&xml, String::new(), &Relationships::empty(), &media).unwrap();
+        if let Shape::Picture(ref pic) = slide.shapes[0] {
+            assert_eq!(pic.embed_rid.as_deref(), Some("rId7"));
+            assert_eq!(pic.data.as_deref(), Some(&BYTES[..]));
+            assert_eq!(pic.format.as_deref(), Some("png"));
+        } else {
+            panic!("expected picture");
+        }
+    }
+
+    #[test]
+    fn picture_embed_without_media_still_carries_rid() {
+        // Empty media map: rId is captured but data/format are None.
+        let xml = make_slide_xml(
+            r#"<p:pic>
+  <p:nvPicPr>
+    <p:cNvPr id="34" name="Photo"/>
+    <p:cNvPicPr/>
+    <p:nvPr/>
+  </p:nvPicPr>
+  <p:blipFill><a:blip r:embed="rId9"/></p:blipFill>
+  <p:spPr>
+    <a:xfrm><a:off x="0" y="0"/><a:ext cx="10" cy="10"/></a:xfrm>
+  </p:spPr>
+</p:pic>"#,
+        );
+
+        let rels = Relationships::empty();
+        let slide =
+            Slide::parse(&xml, String::new(), &rels, &std::collections::HashMap::new()).unwrap();
+        // A non-picture shape must fail the test rather than skip the
+        // assertions below.
+        if let Shape::Picture(ref pic) = slide.shapes[0] {
+            assert_eq!(pic.embed_rid.as_deref(), Some("rId9"));
+            assert!(pic.data.is_none());
+            assert!(pic.format.is_none());
+        } else {
+            panic!("expected picture");
+        }
+    }
+
+    #[test]
+    fn slide_background_solid_rgb() {
+        // A solid `<a:srgbClr val="FF8800"/>` fill under `<p:bg><p:bgPr>` must
+        // be reported as the slide's background colour.
+        let xml = br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+       xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main"
+       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
+  <p:cSld>
+    <p:bg>
+      <p:bgPr>
+        <a:solidFill><a:srgbClr val="FF8800"/></a:solidFill>
+      </p:bgPr>
+    </p:bg>
+    <p:spTree>
+      <p:nvGrpSpPr><p:cNvPr id="1" name=""/><p:cNvGrpSpPr/><p:nvPr/></p:nvGrpSpPr>
+      <p:grpSpPr/>
+    </p:spTree>
+  </p:cSld>
+</p:sld>"#;
+
+        let rels = Relationships::empty();
+        let no_media = std::collections::HashMap::new();
+        let slide = Slide::parse(xml, String::new(), &rels, &no_media).unwrap();
+
+        let orange: [u8; 3] = [0xFF, 0x88, 0x00];
+        assert_eq!(slide.background_rgb, Some(orange));
+    }
+
+    #[test]
+    fn slide_no_background_returns_none() {
+        // `make_slide_xml("")` — presumably a slide with no shapes and no
+        // `<p:bg>` element; the parsed background must then be absent.
+        let rels = Relationships::empty();
+        let no_media = std::collections::HashMap::new();
+
+        let xml = make_slide_xml("");
+        let slide = Slide::parse(&xml, String::new(), &rels, &no_media).unwrap();
+
+        assert_eq!(slide.background_rgb, None);
+    }
+
+    #[test]
+    fn parse_hex_rgb_valid() {
+        assert_eq!(parse_hex_rgb("FF8800"), Some([0xFF, 0x88, 0x00])); // distinct channels
+        assert_eq!(parse_hex_rgb("000000"), Some([0, 0, 0])); // all-zero
+        assert_eq!(parse_hex_rgb("ffffff"), Some([0xFF, 0xFF, 0xFF])); // lower-case digits
+    }
+
+    #[test]
+    fn parse_hex_rgb_invalid() {
+        assert_eq!(parse_hex_rgb("FF88"), None); // too short
+        assert_eq!(parse_hex_rgb("ZZZZZZ"), None); // not hex
+        assert_eq!(parse_hex_rgb(""), None); // empty input
+    }
+
+    // ── read_blip_embed_attr ────────────────────────────────────────────
+
+    fn first_start_elem(xml: &[u8]) -> quick_xml::events::BytesStart<'static> {
+        let mut reader = xml::make_fast_reader(xml);
+        loop {
+            match reader.read_event().unwrap() {
+                Event::Start(e) | Event::Empty(e) => return e.into_owned(), // first element
+                Event::Eof => panic!("no start"), // input held no element at all
+                _ => {}, // skip everything before the first element
+            }
+        }
+    }
+
+    #[test]
+    fn blip_embed_attr_with_r_prefix() {
+        let e = first_start_elem(
+            br#"<a:blip xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+                       xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
+                       r:embed="rId5"/>"#,
+        );
+        let rid = read_blip_embed_attr(&e).unwrap();
+        assert_eq!(rid.as_deref(), Some("rId5")); // conventional `r:` prefix
+    }
+
+    #[test]
+    fn blip_embed_attr_arbitrary_prefix() {
+        // Some writers use an unrelated prefix bound to the rels namespace.
+        let e = first_start_elem(
+            br#"<a:blip xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+                       xmlns:foo="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
+                       foo:embed="rId99"/>"#,
+        );
+        let rid = read_blip_embed_attr(&e).unwrap();
+        assert_eq!(rid.as_deref(), Some("rId99")); // found despite the `foo:` prefix
+    }
+
+    #[test]
+    fn blip_embed_attr_absent() {
+        let e = first_start_elem(
+            br#"<a:blip xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"/>"#,
+        );
+        let rid = read_blip_embed_attr(&e).unwrap();
+        assert!(rid.is_none()); // no embed attribute: Ok(None), not an error
+    }
 }
diff --git a/src/pptx/text.rs b/src/pptx/text.rs
index 8338ecd..33a451c 100644
--- a/src/pptx/text.rs
+++ b/src/pptx/text.rs
@@ -431,6 +431,7 @@ mod tests {
             },
             slides,
             theme: None,
+            embedded_fonts: Vec::new(),
         }
     }
 
@@ -448,12 +449,15 @@ mod tests {
             text_body: Some(TextBody {
                 paragraphs: vec![TextParagraph {
                     level: 0,
+                    alignment: None,
+                    space_before_hundredths_pt: None,
                     content: vec![TextContent::Run(TextRun {
                         text: text.to_string(),
                         bold: None,
                         italic: None,
                         strikethrough: false,
                         hyperlink: None,
+                        font_size_hundredths_pt: None,
                     })],
                 }],
             }),
@@ -475,12 +479,15 @@ mod tests {
             text_body: Some(TextBody {
                 paragraphs: vec![TextParagraph {
                     level: 0,
+                    alignment: None,
+                    space_before_hundredths_pt: None,
                     content: vec![TextContent::Run(TextRun {
                         text: text.to_string(),
                         bold: None,
                         italic: None,
                         strikethrough: false,
                         hyperlink: None,
+                        font_size_hundredths_pt: None,
                     })],
                 }],
             }),
@@ -501,6 +508,7 @@ mod tests {
                 text_shape("Middle", "middle text", 50, 2500),
             ],
             notes: None,
+            background_rgb: None,
         }]);
 
         let text = doc.slide_plain_text(0).unwrap();
@@ -513,6 +521,7 @@ mod tests {
             name: String::new(),
             shapes: vec![text_shape("Text", "Hello", 0, 0)],
             notes: Some("Speaker notes".to_string()),
+            background_rgb: None,
         }]);
 
         let text = doc.slide_plain_text(0).unwrap();
@@ -526,11 +535,13 @@ mod tests {
                 name: String::new(),
                 shapes: vec![text_shape("A", "Slide one", 0, 0)],
                 notes: None,
+                background_rgb: None,
             },
             Slide {
                 name: String::new(),
                 shapes: vec![text_shape("B", "Slide two", 0, 0)],
                 notes: None,
+                background_rgb: None,
             },
         ]);
 
@@ -547,6 +558,7 @@ mod tests {
                 text_shape("Body", "Body text", 0, 2000),
             ],
             notes: None,
+            background_rgb: None,
         }]);
 
         let md = doc.slide_to_markdown(0).unwrap();
@@ -573,6 +585,8 @@ mod tests {
                 text_body: Some(TextBody {
                     paragraphs: vec![TextParagraph {
                         level: 0,
+                        alignment: None,
+                        space_before_hundredths_pt: None,
                         content: vec![
                             TextContent::Run(TextRun {
                                 text: "bold".to_string(),
@@ -580,6 +594,7 @@ mod tests {
                                 italic: None,
                                 strikethrough: false,
                                 hyperlink: None,
+                                font_size_hundredths_pt: None,
                             }),
                             TextContent::Run(TextRun {
                                 text: " and ".to_string(),
@@ -587,6 +602,7 @@ mod tests {
                                 italic: None,
                                 strikethrough: false,
                                 hyperlink: None,
+                                font_size_hundredths_pt: None,
                             }),
                             TextContent::Run(TextRun {
                                 text: "italic".to_string(),
@@ -594,6 +610,7 @@ mod tests {
                                 italic: Some(true),
                                 strikethrough: false,
                                 hyperlink: None,
+                                font_size_hundredths_pt: None,
                             }),
                         ],
                     }],
@@ -601,6 +618,7 @@ mod tests {
                 placeholder: None,
             })],
             notes: None,
+            background_rgb: None,
         }]);
 
         let md = doc.slide_to_markdown(0).unwrap();
@@ -613,6 +631,7 @@ mod tests {
             name: String::new(),
             shapes: vec![text_shape("Text", "Content", 0, 0)],
             notes: Some("Note line 1\nNote line 2".to_string()),
+            background_rgb: None,
         }]);
 
         let md = doc.slide_to_markdown(0).unwrap();
@@ -640,12 +659,15 @@ mod tests {
                                     text_body: Some(TextBody {
                                         paragraphs: vec![TextParagraph {
                                             level: 0,
+                                            alignment: None,
+                                            space_before_hundredths_pt: None,
                                             content: vec![TextContent::Run(TextRun {
                                                 text: "H1".to_string(),
                                                 bold: None,
                                                 italic: None,
                                                 strikethrough: false,
                                                 hyperlink: None,
+                                                font_size_hundredths_pt: None,
                                             })],
                                         }],
                                     }),
@@ -658,12 +680,15 @@ mod tests {
                                     text_body: Some(TextBody {
                                         paragraphs: vec![TextParagraph {
                                             level: 0,
+                                            alignment: None,
+                                            space_before_hundredths_pt: None,
                                             content: vec![TextContent::Run(TextRun {
                                                 text: "H2".to_string(),
                                                 bold: None,
                                                 italic: None,
                                                 strikethrough: false,
                                                 hyperlink: None,
+                                                font_size_hundredths_pt: None,
                                             })],
                                         }],
                                     }),
@@ -680,12 +705,15 @@ mod tests {
                                     text_body: Some(TextBody {
                                         paragraphs: vec![TextParagraph {
                                             level: 0,
+                                            alignment: None,
+                                            space_before_hundredths_pt: None,
                                             content: vec![TextContent::Run(TextRun {
                                                 text: "A".to_string(),
                                                 bold: None,
                                                 italic: None,
                                                 strikethrough: false,
                                                 hyperlink: None,
+                                                font_size_hundredths_pt: None,
                                             })],
                                         }],
                                     }),
@@ -698,12 +726,15 @@ mod tests {
                                     text_body: Some(TextBody {
                                         paragraphs: vec![TextParagraph {
                                             level: 0,
+                                            alignment: None,
+                                            space_before_hundredths_pt: None,
                                             content: vec![TextContent::Run(TextRun {
                                                 text: "B".to_string(),
                                                 bold: None,
                                                 italic: None,
                                                 strikethrough: false,
                                                 hyperlink: None,
+                                                font_size_hundredths_pt: None,
                                             })],
                                         }],
                                     }),
@@ -718,6 +749,7 @@ mod tests {
                 }),
             })],
             notes: None,
+            background_rgb: None,
         }]);
 
         let md = doc.slide_to_markdown(0).unwrap();
@@ -743,6 +775,8 @@ mod tests {
                 text_body: Some(TextBody {
                     paragraphs: vec![TextParagraph {
                         level: 0,
+                        alignment: None,
+                        space_before_hundredths_pt: None,
                         content: vec![TextContent::Run(TextRun {
                             text: "Click here".to_string(),
                             bold: None,
@@ -754,12 +788,14 @@ mod tests {
                                 ),
                                 tooltip: None,
                             }),
+                            font_size_hundredths_pt: None,
                         })],
                     }],
                 }),
                 placeholder: None,
             })],
             notes: None,
+            background_rgb: None,
         }]);
 
         let md = doc.slide_to_markdown(0).unwrap();
diff --git a/src/pptx/write.rs b/src/pptx/write.rs
index e2d20e0..b4729f8 100644
--- a/src/pptx/write.rs
+++ b/src/pptx/write.rs
@@ -158,10 +158,23 @@ impl From<String> for Run {
 // Internal body content model
 // ---------------------------------------------------------------------------
 
+/// Paragraph-level properties carried through a `BodyItem::RichText`.
+/// Present so the writer can emit `<a:pPr>` (its `algn` attribute and
+/// `<a:spcBef>` child), which have no home on the per-run `<a:rPr>`.
+#[derive(Debug, Clone, Default)]
+pub struct ParaProps {
+    /// Paragraph alignment, written as the `algn` attribute of `<a:pPr>`.
+    /// `None` omits the attribute entirely.
+    pub alignment: Option<crate::ir::ParagraphAlignment>,
+    /// Space before the paragraph in hundredths of a point (1250 = 12.5pt).
+    /// When set, written as `<a:spcBef><a:spcPts val="…"/></a:spcBef>`.
+    pub space_before_hundredths_pt: Option<u32>,
+}
+
 #[derive(Debug, Clone)]
 enum BodyItem {
     Text(String),
-    RichText(Vec<Run>),
+    RichText(Vec<Run>, ParaProps),
     BulletList(Vec<String>),
     /// Free-floating text box: (runs, x_emu, y_emu, cx_emu, cy_emu)
     TextBox(Vec<Run>, i64, i64, i64, i64),
@@ -178,6 +191,10 @@ enum BodyItem {
 pub struct SlideData {
     /// The slide title (if set).
     pub title: Option<String>,
+    /// Optional explicit alignment for the title placeholder. None
+    /// leaves alignment to the slide layout default (typically
+    /// centered for title placeholders).
+    pub title_alignment: Option<crate::ir::ParagraphAlignment>,
     body_items: Vec<BodyItem>,
 }
 
@@ -185,6 +202,7 @@ impl SlideData {
     fn new() -> Self {
         Self {
             title: None,
+            title_alignment: None,
             body_items: Vec::new(),
         }
     }
@@ -195,6 +213,18 @@ impl SlideData {
         self
     }
 
+    /// Set the slide title and its alignment, overwriting any previously
+    /// set title and alignment; passing `None` clears a stored alignment.
+    pub fn set_title_aligned(
+        &mut self,
+        title: &str,
+        alignment: Option<crate::ir::ParagraphAlignment>,
+    ) -> &mut Self {
+        self.title = Some(title.to_string());
+        self.title_alignment = alignment;
+        self
+    }
+
     /// Add a plain text paragraph to the body area.
     pub fn add_text(&mut self, text: &str) -> &mut Self {
         self.body_items.push(BodyItem::Text(text.to_string()));
@@ -203,7 +233,32 @@ impl SlideData {
 
     /// Add a paragraph of styled [`Run`]s to the body area.
     pub fn add_rich_text(&mut self, runs: &[Run]) -> &mut Self {
-        self.body_items.push(BodyItem::RichText(runs.to_vec()));
+        self.body_items
+            .push(BodyItem::RichText(runs.to_vec(), ParaProps::default()));
+        self
+    }
+
+    /// Add a paragraph of styled [`Run`]s with an explicit alignment and no space-before.
+    pub fn add_rich_text_aligned(
+        &mut self,
+        runs: &[Run],
+        alignment: Option<crate::ir::ParagraphAlignment>,
+    ) -> &mut Self {
+        self.body_items.push(BodyItem::RichText(
+            runs.to_vec(),
+            ParaProps {
+                alignment,
+                ..Default::default()
+            },
+        ));
+        self
+    }
+
+    /// Add a paragraph of styled [`Run`]s (copied from `runs`) with full
+    /// paragraph properties (alignment, space-before) taken from `props`.
+    pub fn add_rich_text_with_props(&mut self, runs: &[Run], props: ParaProps) -> &mut Self {
+        self.body_items
+            .push(BodyItem::RichText(runs.to_vec(), props));
         self
     }
 
@@ -273,6 +328,14 @@ pub struct PptxWriter {
     cx: u64,
     /// Presentation height in EMU (default: 6 858 000 — standard 16:9).
     cy: u64,
+    /// Embedded font programs to ship inside the package under `ppt/fonts/`.
+    /// Mirrors `DocxWriter::embed_font` semantics: each `(name, bytes)` pair
+    /// becomes one font part, used by PDF↔PPTX round-trips to preserve the
+    /// source typeface.
+    embedded_fonts: Vec<(String, Vec<u8>)>,
+    /// Document metadata for `docProps/core.xml`. `None` means no
+    /// core-properties part is written.
+    metadata: Option<crate::ir::Metadata>,
 }
 
 impl PptxWriter {
@@ -282,7 +345,26 @@ impl PptxWriter {
             slides: Vec::new(),
             cx: 12_192_000,
             cy: 6_858_000,
+            embedded_fonts: Vec::new(),
+            metadata: None,
+        }
+    }
+
+    /// Store a copy of `meta` for `docProps/core.xml`, replacing any earlier metadata.
+    pub fn set_metadata(&mut self, meta: &crate::ir::Metadata) -> &mut Self {
+        self.metadata = Some(meta.clone());
+        self
+    }
+
+    /// Embed a font program (TrueType / OpenType bytes) under `ppt/fonts/`.
+    /// `name` is used for the file name and the human-readable font name.
+    /// The first call for a given `name` wins; later calls drop their `data`.
+    pub fn embed_font(&mut self, name: impl Into<String>, data: Vec<u8>) -> &mut Self {
+        let name = name.into();
+        if !self.embedded_fonts.iter().any(|(n, _)| n == &name) {
+            self.embedded_fonts.push((name, data));
         }
+        self
     }
 
     /// Override the presentation canvas size (in EMU).
@@ -358,6 +440,17 @@ impl PptxWriter {
         opc.add_package_rel(rel_types::OFFICE_DOCUMENT, "ppt/presentation.xml");
         opc.add_part_rel(&pres_part, rel_types::SLIDE_MASTER, "slideMasters/slideMaster1.xml");
 
+        // Core properties (docProps/core.xml). Written only when the
+        // caller supplied metadata so files generated through the
+        // existing `add_slide` API stay byte-identical when no
+        // metadata was set.
+        if let Some(ref meta) = self.metadata {
+            let core_part = PartName::new("/docProps/core.xml")?;
+            opc.add_package_rel(rel_types::CORE_PROPERTIES, "docProps/core.xml");
+            let core_xml = crate::core::core_properties::generate_xml(meta);
+            opc.add_part(&core_part, crate::core::core_properties::CONTENT_TYPE, &core_xml)?;
+        }
+
         let mut slide_parts = Vec::with_capacity(self.slides.len());
         for i in 0..self.slides.len() {
             let idx = i + 1;
@@ -411,6 +504,17 @@ impl PptxWriter {
             opc.add_part(slide_part, CT_SLIDE, &slide_xml)?;
         }
 
+        // Embed fonts under `ppt/fonts/font_<n>_<safe_name>.ttf`. Mirrors
+        // the DOCX `word/fonts/` layout. Other PowerPoint software may not
+        // honor this without the full presentation-relationship machinery
+        // for `<p:embeddedFontLst>`, but the in-process reader scans the
+        // directory directly so PDF↔PPTX round-trips preserve fonts.
+        crate::core::embedded_fonts::write_embedded_fonts(
+            &mut opc,
+            "/ppt/fonts/",
+            &self.embedded_fonts,
+        )?;
+
         opc.finish()?;
         Ok(())
     }
@@ -562,6 +666,21 @@ fn generate_presentation_xml(slide_count: usize, cx: u64, cy: u64) -> Vec<u8> {
     sld_sz.push_attribute(("cy", cy.to_string().as_str()));
     w.write_event(Event::Empty(sld_sz)).expect("write");
 
+    // notesSz: PowerPoint expects this even when there are no notes
+    // pages. We simply reuse the slide dimensions for it.
+    let mut notes_sz = BytesStart::new("p:notesSz");
+    notes_sz.push_attribute(("cx", cx.to_string().as_str()));
+    notes_sz.push_attribute(("cy", cy.to_string().as_str()));
+    w.write_event(Event::Empty(notes_sz)).expect("write");
+
+    // defaultTextStyle: empty list of paragraph-level defaults is
+    // legal and silences PowerPoint's "Reset Layout" command failure
+    // when the user opens the deck.
+    w.write_event(Event::Start(BytesStart::new("p:defaultTextStyle")))
+        .expect("write");
+    w.write_event(Event::End(BytesEnd::new("p:defaultTextStyle")))
+        .expect("write");
+
     w.write_event(Event::End(BytesEnd::new("p:presentation")))
         .expect("write");
     w.into_inner()
@@ -610,8 +729,14 @@ fn generate_slide_layout_xml() -> Vec<u8> {
     let mut w = Writer::new(Vec::new());
     write_decl(&mut w);
 
+    // Type "obj" = "Title and Content" — PowerPoint's standard
+    // layout. Slides referencing this layout get a sized title
+    // placeholder at the top and a body placeholder filling the
+    // rest. Was `type="blank"` with empty spTree; that left
+    // PowerPoint guessing at placeholder geometry.
     let mut root = pml_root("p:sldLayout");
-    root.push_attribute(("type", "blank"));
+    root.push_attribute(("type", "obj"));
+    root.push_attribute(("preserve", "1"));
     w.write_event(Event::Start(root)).expect("write");
     w.write_event(Event::Start(BytesStart::new("p:cSld")))
         .expect("write");
@@ -619,6 +744,30 @@ fn generate_slide_layout_xml() -> Vec<u8> {
         .expect("write");
     write_nv_grp_sp_pr(&mut w);
     write_empty(&mut w, "p:grpSpPr");
+
+    // Title placeholder — top of slide, ~10 % top inset, full width minus margin.
+    write_layout_placeholder(
+        &mut w,
+        2,
+        "Title 1",
+        "title",
+        None,
+        // Geometry in EMU. Standard 16:9 @ 12 192 000 × 6 858 000:
+        // place title at (914 400, 685 800) ≈ 1 in × 0.75 in,
+        // size 10 363 200 × 1 143 000 ≈ 11.3 in × 1.25 in.
+        Some((914_400, 685_800, 10_363_200, 1_143_000)),
+    );
+
+    // Body placeholder — fills the area below the title.
+    write_layout_placeholder(
+        &mut w,
+        3,
+        "Body 2",
+        "body",
+        Some(1),
+        Some((914_400, 1_905_000, 10_363_200, 4_343_400)),
+    );
+
     w.write_event(Event::End(BytesEnd::new("p:spTree")))
         .expect("write");
     w.write_event(Event::End(BytesEnd::new("p:cSld")))
@@ -628,6 +777,93 @@ fn generate_slide_layout_xml() -> Vec<u8> {
     w.into_inner()
 }
 
+/// Emit one placeholder `<p:sp>` inside the slide layout. `id` and `name`
+/// go on `<p:cNvPr>`, `ph_type` and the optional `ph_idx` on `<p:ph>`, and
+/// `geometry_emu`, when given, becomes the shape's `<a:xfrm>` rectangle.
+/// Slides whose placeholders match this `type`/`idx` are meant to inherit
+/// that geometry rather than fall back to PowerPoint's default positioning.
+fn write_layout_placeholder(
+    w: &mut Writer<Vec<u8>>,
+    id: u32,
+    name: &str,
+    ph_type: &str,
+    ph_idx: Option<u32>,
+    geometry_emu: Option<(i64, i64, i64, i64)>, // (x, y, cx, cy)
+) {
+    let id_str = id.to_string();
+    w.write_event(Event::Start(BytesStart::new("p:sp")))
+        .expect("sp start");
+
+    w.write_event(Event::Start(BytesStart::new("p:nvSpPr")))
+        .expect("nvSpPr start");
+    let mut cnv_pr = BytesStart::new("p:cNvPr");
+    cnv_pr.push_attribute(("id", id_str.as_str()));
+    cnv_pr.push_attribute(("name", name));
+    w.write_event(Event::Empty(cnv_pr)).expect("cNvPr");
+    w.write_event(Event::Start(BytesStart::new("p:cNvSpPr")))
+        .expect("cNvSpPr start");
+    let mut locks = BytesStart::new("a:spLocks");
+    locks.push_attribute(("noGrp", "1"));
+    w.write_event(Event::Empty(locks)).expect("spLocks");
+    w.write_event(Event::End(BytesEnd::new("p:cNvSpPr")))
+        .expect("cNvSpPr end");
+    w.write_event(Event::Start(BytesStart::new("p:nvPr")))
+        .expect("nvPr start");
+    let mut ph = BytesStart::new("p:ph");
+    ph.push_attribute(("type", ph_type));
+    let idx_buf;
+    if let Some(idx) = ph_idx {
+        idx_buf = idx.to_string();
+        ph.push_attribute(("idx", idx_buf.as_str()));
+    }
+    w.write_event(Event::Empty(ph)).expect("ph");
+    w.write_event(Event::End(BytesEnd::new("p:nvPr")))
+        .expect("nvPr end");
+    w.write_event(Event::End(BytesEnd::new("p:nvSpPr")))
+        .expect("nvSpPr end");
+
+    // spPr: an <a:xfrm> with offset + extent when geometry is given, else empty.
+    if let Some((x, y, cx, cy)) = geometry_emu {
+        w.write_event(Event::Start(BytesStart::new("p:spPr")))
+            .expect("spPr start");
+        w.write_event(Event::Start(BytesStart::new("a:xfrm")))
+            .expect("xfrm start");
+        let mut off = BytesStart::new("a:off");
+        let xs = x.to_string();
+        let ys = y.to_string();
+        off.push_attribute(("x", xs.as_str()));
+        off.push_attribute(("y", ys.as_str()));
+        w.write_event(Event::Empty(off)).expect("off");
+        let mut ext = BytesStart::new("a:ext");
+        let cxs = cx.to_string();
+        let cys = cy.to_string();
+        ext.push_attribute(("cx", cxs.as_str()));
+        ext.push_attribute(("cy", cys.as_str()));
+        w.write_event(Event::Empty(ext)).expect("ext");
+        w.write_event(Event::End(BytesEnd::new("a:xfrm")))
+            .expect("xfrm end");
+        w.write_event(Event::End(BytesEnd::new("p:spPr")))
+            .expect("spPr end");
+    } else {
+        write_empty(w, "p:spPr");
+    }
+
+    // Empty txBody (a single empty <a:p>) — slides supply their own text.
+    w.write_event(Event::Start(BytesStart::new("p:txBody")))
+        .expect("txBody start");
+    write_empty(w, "a:bodyPr");
+    write_empty(w, "a:lstStyle");
+    w.write_event(Event::Start(BytesStart::new("a:p")))
+        .expect("a:p start");
+    w.write_event(Event::End(BytesEnd::new("a:p")))
+        .expect("a:p end");
+    w.write_event(Event::End(BytesEnd::new("p:txBody")))
+        .expect("txBody end");
+
+    w.write_event(Event::End(BytesEnd::new("p:sp")))
+        .expect("sp end");
+}
+
 // ---------------------------------------------------------------------------
 // slides/slideN.xml
 // ---------------------------------------------------------------------------
@@ -649,7 +885,7 @@ fn generate_slide_xml(slide: &SlideData, img_rids: &[(String, i64, i64, u64, u64
     let mut next_id: u32 = 2;
 
     if let Some(ref title) = slide.title {
-        write_title_shape(&mut w, next_id, title);
+        write_title_shape(&mut w, next_id, title, slide.title_alignment.as_ref());
         next_id += 1;
     }
 
@@ -687,7 +923,12 @@ fn generate_slide_xml(slide: &SlideData, img_rids: &[(String, i64, i64, u64, u64
     w.into_inner()
 }
 
-fn write_title_shape(w: &mut Writer<Vec<u8>>, id: u32, title: &str) {
+fn write_title_shape(
+    w: &mut Writer<Vec<u8>>,
+    id: u32,
+    title: &str,
+    alignment: Option<&crate::ir::ParagraphAlignment>,
+) {
     let id_str = id.to_string();
     w.write_event(Event::Start(BytesStart::new("p:sp")))
         .expect("write");
@@ -720,7 +961,16 @@ fn write_title_shape(w: &mut Writer<Vec<u8>>, id: u32, title: &str) {
     w.write_event(Event::Start(BytesStart::new("p:txBody")))
         .expect("write");
     write_empty(w, "a:bodyPr");
-    write_plain_paragraph(w, title);
+    if let Some(a) = alignment {
+        let runs = vec![Run::new(title)];
+        let props = ParaProps {
+            alignment: Some(a.clone()),
+            ..Default::default()
+        };
+        write_rich_paragraph(w, &runs, &props);
+    } else {
+        write_plain_paragraph(w, title);
+    }
     w.write_event(Event::End(BytesEnd::new("p:txBody")))
         .expect("write");
 
@@ -761,12 +1011,20 @@ fn write_body_shape(w: &mut Writer<Vec<u8>>, id: u32, items: &[&BodyItem]) {
 
     w.write_event(Event::Start(BytesStart::new("p:txBody")))
         .expect("write");
-    write_empty(w, "a:bodyPr");
+    // <a:bodyPr><a:normAutofit/></a:bodyPr>: tell PowerPoint to
+    // shrink-to-fit the body text. Without this, dense PDF pages
+    // imported as slides overflow the placeholder and content
+    // renders off-slide.
+    w.write_event(Event::Start(BytesStart::new("a:bodyPr")))
+        .expect("write bodyPr start");
+    write_empty(w, "a:normAutofit");
+    w.write_event(Event::End(BytesEnd::new("a:bodyPr")))
+        .expect("write bodyPr end");
 
     for item in items {
         match item {
             BodyItem::Text(text) => write_plain_paragraph(w, text),
-            BodyItem::RichText(runs) => write_rich_paragraph(w, runs),
+            BodyItem::RichText(runs, props) => write_rich_paragraph(w, runs, props),
             BodyItem::BulletList(bullets) => {
                 for bullet in bullets {
                     write_bullet_paragraph(w, bullet);
@@ -838,13 +1096,21 @@ fn write_text_box_shape(
     w.write_event(Event::End(BytesEnd::new("p:spPr")))
         .expect("write");
 
-    // txBody
+    // txBody — `wrap="none"` plus explicit zero insets so callers
+    // sizing the shape rectangle to the exact text bbox (e.g. the
+    // PDF→PPTX layout path) get the text rendered without
+    // PowerPoint's default ~0.1" left/right padding silently eating
+    // shape width and forcing visible glyph re-wrapping.
     w.write_event(Event::Start(BytesStart::new("p:txBody")))
         .expect("write");
     let mut body_pr = BytesStart::new("a:bodyPr");
-    body_pr.push_attribute(("wrap", "square"));
+    body_pr.push_attribute(("wrap", "none"));
+    body_pr.push_attribute(("lIns", "0"));
+    body_pr.push_attribute(("tIns", "0"));
+    body_pr.push_attribute(("rIns", "0"));
+    body_pr.push_attribute(("bIns", "0"));
     w.write_event(Event::Empty(body_pr)).expect("write");
-    write_rich_paragraph(w, runs);
+    write_rich_paragraph(w, runs, &ParaProps::default());
     w.write_event(Event::End(BytesEnd::new("p:txBody")))
         .expect("write");
 
@@ -922,9 +1188,39 @@ fn write_plain_paragraph(w: &mut Writer<Vec<u8>>, text: &str) {
         .expect("write");
 }
 
-fn write_rich_paragraph(w: &mut Writer<Vec<u8>>, runs: &[Run]) {
+fn write_rich_paragraph(w: &mut Writer<Vec<u8>>, runs: &[Run], props: &ParaProps) {
+    use crate::ir::ParagraphAlignment;
     w.write_event(Event::Start(BytesStart::new("a:p")))
         .expect("write");
+    let algn = props.alignment.as_ref().map(|a| match a {
+        ParagraphAlignment::Left => "l",
+        ParagraphAlignment::Center => "ctr",
+        ParagraphAlignment::Right => "r",
+        ParagraphAlignment::Justify => "just",
+        ParagraphAlignment::Distribute => "dist",
+    });
+    let need_ppr = algn.is_some() || props.space_before_hundredths_pt.is_some();
+    if need_ppr {
+        let mut p_pr = BytesStart::new("a:pPr");
+        if let Some(v) = algn {
+            p_pr.push_attribute(("algn", v));
+        }
+        if let Some(spc) = props.space_before_hundredths_pt {
+            // <a:pPr ...><a:spcBef><a:spcPts val="N"/></a:spcBef></a:pPr>
+            w.write_event(Event::Start(p_pr)).expect("write pPr start");
+            w.write_event(Event::Start(BytesStart::new("a:spcBef")))
+                .expect("write spcBef");
+            let mut spc_pts = BytesStart::new("a:spcPts");
+            spc_pts.push_attribute(("val", spc.to_string().as_str()));
+            w.write_event(Event::Empty(spc_pts)).expect("write spcPts");
+            w.write_event(Event::End(BytesEnd::new("a:spcBef")))
+                .expect("write spcBef end");
+            w.write_event(Event::End(BytesEnd::new("a:pPr")))
+                .expect("write pPr end");
+        } else {
+            w.write_event(Event::Empty(p_pr)).expect("write pPr");
+        }
+    }
     for run in runs {
         write_dml_run(w, run);
     }
diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs
index 1003d1f..9db81a8 100644
--- a/src/xlsx/mod.rs
+++ b/src/xlsx/mod.rs
@@ -24,6 +24,8 @@ pub mod date;
 pub mod edit;
 /// XLSX-specific error type.
 pub mod error;
+/// Number format rendering: apply Excel format strings to numeric values.
+pub mod numfmt;
 /// Shared string table (SST) parsing and lookup.
 pub mod shared_strings;
 /// Spreadsheet styles: number formats, fonts, fills, borders, cell formats.
@@ -69,6 +71,19 @@ pub struct XlsxDocument {
     pub styles: Option<StyleSheet>,
     /// DrawingML theme (lazily parsed; access via `ensure_theme()`).
     pub theme: Option<Theme>,
+    /// Text content extracted from `xl/charts/chart*.xml` parts. Each entry
+    /// is the flattened text (titles, axis labels, series names, category
+    /// labels, values) of one chart in document order. We don't render
+    /// charts as graphics but keeping their text content lets it appear in
+    /// extracted text and downstream conversions.
+    pub chart_text: Vec<String>,
+    /// Font programs found under `xl/fonts/`. Each entry is
+    /// `(font_name, ttf_or_otf_bytes)`. Mirrors `DocxDocument` and
+    /// `PptxDocument`. PDF→XLSX→PDF round-trips ship source fonts
+    /// here so the round-trip can re-register them with the PDF
+    /// renderer; without this hop XLSX-mediated round-trips lost
+    /// every typeface to the base 14 fallback.
+    pub embedded_fonts: Vec<(String, Vec<u8>)>,
     // Raw bytes for lazy parsing (None after parsing or if not present)
     styles_data: Option<Vec<u8>>,
     theme_data: Option<Vec<u8>>,
@@ -170,6 +185,8 @@ impl XlsxDocument {
             name: String,
             data: Vec<u8>,
             rels: Relationships,
+            images: Vec<crate::xlsx::worksheet::WorksheetPicture>,
+            text_shapes: Vec<crate::xlsx::worksheet::WorksheetTextShape>,
         }
         let mut bundles = Vec::with_capacity(workbook.sheets.len());
         for sheet in &workbook.sheets {
@@ -214,26 +231,86 @@ impl XlsxDocument {
                 Err(_) => Relationships::empty(),
             };
 
+            // Resolve the worksheet's DRAWING rel up-front (Phase 1
+            // has access to &mut archive). Each entry decodes
+            // `<xdr:pic>` and `<xdr:sp>` anchors and the underlying
+            // media bytes so Phase 2's parallel parser doesn't need
+            // the archive.
+            let (images, text_shapes) = read_drawing_for_sheet(&mut archive, &sheet_path, &ws_rels);
+
             bundles.push(SheetBundle {
                 name: sheet.name.clone(),
                 data: ws_data,
                 rels: ws_rels,
+                images,
+                text_shapes,
             });
         }
 
         // Phase 2: parse worksheets (parallel when feature enabled)
         let worksheets = crate::core::parallel::map_collect(bundles, |b| -> Result<Worksheet> {
-            let ws = Worksheet::parse(&b.data, b.name, &b.rels)?;
+            let mut ws = Worksheet::parse(&b.data, b.name, &b.rels)?;
+            ws.images = b.images;
+            ws.text_shapes = b.text_shapes;
             Ok(ws)
         })?;
 
-        debug!("XlsxDocument: {} worksheets parsed", worksheets.len());
+        // Scan for chart XML parts (xl/charts/chart*.xml) and extract their
+        // visible text — title, axis titles, series names, category labels,
+        // cached values. We don't render charts as graphics but their words
+        // belong in any text-based downstream conversion (markdown, search
+        // indexes, accessibility readers, our PDF text fallback).
+        let mut chart_text: Vec<String> = Vec::new();
+        let chart_names: Vec<String> = (0..archive.len())
+            .filter_map(|i| archive.by_index(i).ok().map(|f| f.name().to_string()))
+            .filter(|n| n.starts_with("xl/charts/chart") && n.ends_with(".xml"))
+            .collect();
+        for name in chart_names {
+            if let Ok(data) = Self::read_xml_entry(&mut archive, &name) {
+                let text = extract_chart_text(&data);
+                if !text.is_empty() {
+                    chart_text.push(text);
+                }
+            }
+        }
+
+        // Scan `xl/fonts/` for embedded font programs. Mirrors the
+        // DOCX (`word/fonts/`) and PPTX (`ppt/fonts/`) readers.
+        let mut embedded_fonts: Vec<(String, Vec<u8>)> = Vec::new();
+        let font_names: Vec<String> = (0..archive.len())
+            .filter_map(|i| archive.by_index(i).ok().map(|f| f.name().to_string()))
+            .filter(|n| {
+                n.starts_with("xl/fonts/")
+                    && (n.to_lowercase().ends_with(".ttf") || n.to_lowercase().ends_with(".otf"))
+            })
+            .collect();
+        for name in font_names {
+            if let Ok(data) = opc::read_zip_entry(&mut archive, &name) {
+                let basename = name.rsplit('/').next().unwrap_or("font");
+                let face = crate::docx::strip_embedded_font_filename(basename);
+                let font_name = if face.is_empty() {
+                    basename.to_string()
+                } else {
+                    face
+                };
+                embedded_fonts.push((font_name, data));
+            }
+        }
+
+        debug!(
+            "XlsxDocument: {} worksheets parsed, {} chart(s), {} embedded fonts",
+            worksheets.len(),
+            chart_text.len(),
+            embedded_fonts.len()
+        );
         Ok(XlsxDocument {
             workbook,
             worksheets,
             shared_strings,
             styles,
             theme: None,
+            chart_text,
+            embedded_fonts,
             styles_data: None,
             theme_data,
         })
@@ -338,19 +415,222 @@ impl XlsxDocument {
             .collect();
         let worksheets = worksheets?;
 
-        debug!("XlsxDocument: {} worksheets parsed", worksheets.len());
+        // Mirror the zip-path embedded-fonts scan over the OPC part
+        // listing. Loading via OPC is the slow path used when a
+        // caller hands us a pre-built `OpcReader`, so duplicating
+        // the cheap scan keeps font fidelity working there too.
+        let mut embedded_fonts: Vec<(String, Vec<u8>)> = Vec::new();
+        for name in opc.part_names() {
+            let s = name.to_string();
+            if !s.starts_with("/xl/fonts/") {
+                continue;
+            }
+            let lower = s.to_lowercase();
+            if !(lower.ends_with(".ttf") || lower.ends_with(".otf")) {
+                continue;
+            }
+            if let Ok(data) = opc.read_part(&name) {
+                let basename = s.rsplit('/').next().unwrap_or("font");
+                let face = crate::docx::strip_embedded_font_filename(basename);
+                let font_name = if face.is_empty() {
+                    basename.to_string()
+                } else {
+                    face
+                };
+                embedded_fonts.push((font_name, data));
+            }
+        }
+
+        debug!(
+            "XlsxDocument: {} worksheets parsed (OPC path), {} embedded fonts",
+            worksheets.len(),
+            embedded_fonts.len()
+        );
         Ok(XlsxDocument {
             workbook,
             worksheets,
             shared_strings,
             styles,
             theme: None,
+            // OPC path doesn't extract chart text yet; the zip path is the
+            // hot one used by Document::from_reader. Charts via OPC can be
+            // added if a use case appears.
+            chart_text: Vec::new(),
+            embedded_fonts,
             styles_data: None,
             theme_data,
         })
     }
 }
 
+/// Extract structured content from a chart XML stream (DrawingML chart
+/// format) into a flat textual representation.
+///
+/// Collects the text of every `<c:title>` (chart title and axis titles,
+/// in document order) and each series (`<c:ser>`). For every series we
+/// capture the name (`<c:tx>`), category labels (`<c:cat>`), and cached
+/// numeric values (`<c:val>`). Only the first series' categories are
+/// kept, since series in one chart typically share them. Series without
+/// a name are labelled `Series N` (1-based). Text runs inside a title
+/// are trimmed and joined with single spaces.
+///
+/// Output shape (lines that would be empty are omitted):
+/// ```text
+/// Title: Chart title — Axis title
+/// Categories: A, B, C, ...
+/// Budget: 1690, 2100, 1570, ...
+/// Series: NameOfSeriesWithoutValues
+/// ```
+///
+/// This still travels through `to_markdown` and `convert_xlsx_to_ir` as
+/// plain text (not an actual table), but the structure is now meaningful
+/// for both human readers and downstream NLP / search.
+fn extract_chart_text(xml: &[u8]) -> String {
+    let mut reader = quick_xml::Reader::from_reader(xml);
+    reader.config_mut().trim_text(false);
+    let mut buf = Vec::new();
+
+    // Tag-context stack — push localname on Start, pop on End.
+    let mut stack: Vec<Vec<u8>> = Vec::new();
+    // Text of the `<c:title>` currently being read, built from its
+    // rich-text `<a:t>` runs and flushed into `titles` on `</c:title>`.
+    let mut current_title: String = String::new();
+    let mut titles: Vec<String> = Vec::new();
+    // Every non-empty title (chart-level and axis) is kept in document
+    // order; they are joined with " — " on the `Title:` output line.
+    // Per-series state.
+    let mut series: Vec<ChartSeries> = Vec::new();
+    let mut cur_series: Option<ChartSeries> = None;
+    // Current `<c:v>` text being accumulated.
+    let mut cur_v: String = String::new();
+    // Categories from the current series (or the first series — they are
+    // typically shared across all series in the chart).
+    let mut shared_categories: Vec<String> = Vec::new();
+    let mut cur_cat_buf: Vec<String> = Vec::new();
+
+    loop {
+        // Cleared at the top so the `continue` paths below cannot skip it.
+        buf.clear();
+        match reader.read_event_into(&mut buf) {
+            Ok(quick_xml::events::Event::Start(e)) => {
+                let local = e.local_name().as_ref().to_vec();
+                if local == b"ser" {
+                    cur_series = Some(ChartSeries::default());
+                    cur_cat_buf.clear();
+                }
+                stack.push(local);
+            },
+            Ok(quick_xml::events::Event::End(e)) => {
+                let local = e.local_name().as_ref().to_vec();
+                let _ = stack.pop();
+                match local.as_slice() {
+                    b"title" => {
+                        if !current_title.trim().is_empty() {
+                            titles.push(current_title.trim().to_string());
+                        }
+                        current_title.clear();
+                    },
+                    b"v" => {
+                        let val = cur_v.trim().to_string();
+                        cur_v.clear();
+                        if val.is_empty() {
+                            continue;
+                        }
+                        if let Some(s) = cur_series.as_mut() {
+                            // Decide whether this <c:v> is series-name, category,
+                            // or value based on the enclosing scope.
+                            let in_tx = stack.iter().any(|t| t.as_slice() == b"tx");
+                            let in_cat = stack.iter().any(|t| t.as_slice() == b"cat");
+                            let in_val = stack.iter().any(|t| t.as_slice() == b"val");
+                            if in_tx && s.name.is_empty() {
+                                s.name = val;
+                            } else if in_cat {
+                                cur_cat_buf.push(val);
+                            } else if in_val {
+                                s.values.push(val);
+                            }
+                        }
+                    },
+                    b"ser" => {
+                        if let Some(mut s) = cur_series.take() {
+                            // Fold the per-series categories into shared_categories
+                            // (first series wins — they are typically identical).
+                            if shared_categories.is_empty() && !cur_cat_buf.is_empty() {
+                                shared_categories = std::mem::take(&mut cur_cat_buf);
+                            } else {
+                                cur_cat_buf.clear();
+                            }
+                            if s.name.is_empty() {
+                                s.name = format!("Series {}", series.len() + 1);
+                            }
+                            series.push(s);
+                        }
+                    },
+                    _ => {},
+                }
+            },
+            Ok(quick_xml::events::Event::Text(t)) => {
+                if let Ok(s) = t.unescape() {
+                    let trimmed = s.trim();
+                    if trimmed.is_empty() {
+                        continue;
+                    }
+                    let top = stack.last().map(|v| v.as_slice());
+                    match top {
+                        Some(b"t") if stack.iter().any(|t| t.as_slice() == b"title") => {
+                            // Title run — runs are trimmed, so separate them with a space.
+                            if !current_title.is_empty() {
+                                current_title.push(' ');
+                            }
+                            current_title.push_str(trimmed);
+                        },
+                        Some(b"v") => {
+                            cur_v.push_str(trimmed);
+                        },
+                        _ => {},
+                    }
+                }
+            },
+            Ok(quick_xml::events::Event::Eof) => break,
+            Err(_) => break,
+            _ => {},
+        }
+    }
+
+    // Emit a structured representation. Each line is independent — the
+    // markdown writer joins them with `\n`.
+    let mut out = String::new();
+    if !titles.is_empty() {
+        out.push_str(&format!("Title: {}", titles.join(" — ")));
+    }
+    if !shared_categories.is_empty() {
+        if !out.is_empty() {
+            out.push('\n');
+        }
+        out.push_str(&format!("Categories: {}", shared_categories.join(", ")));
+    }
+    for s in &series {
+        if !out.is_empty() {
+            out.push('\n');
+        }
+        if s.values.is_empty() {
+            out.push_str(&format!("Series: {}", s.name));
+        } else {
+            out.push_str(&format!("{}: {}", s.name, s.values.join(", ")));
+        }
+    }
+    out
+}
+
+/// Name and cached values of one `<c:ser>`, collected by `extract_chart_text`.
+#[derive(Default)]
+struct ChartSeries {
+    /// Series name from `<c:tx>`, or the `Series N` fallback.
+    name: String,
+    /// Cached `<c:val>` entries, kept as their original strings.
+    values: Vec<String>,
+}
+
 /// Compute the .rels path for a worksheet ZIP entry.
 /// e.g. "xl/worksheets/sheet1.xml" → "xl/worksheets/_rels/sheet1.xml.rels"
 fn sheet_rels_path(sheet_path: &str) -> String {
@@ -372,3 +652,415 @@ impl crate::core::OfficeDocument for XlsxDocument {
         self.to_markdown()
     }
 }
+
+/// Read the part behind a worksheet's first DRAWING relationship, parse its
+/// `<xdr:pic>` and `<xdr:sp>` anchors, and load each picture's media bytes.
+/// Returns `(pictures, text_shapes)`. Soft failures (no rel, missing part,
+/// parse error) yield empty vectors — drawings are best-effort extras and
+/// shouldn't fail worksheet loading.
+fn read_drawing_for_sheet<R: Read + Seek>(
+    archive: &mut ZipArchive<R>,
+    sheet_path: &str,
+    sheet_rels: &Relationships,
+) -> (
+    Vec<crate::xlsx::worksheet::WorksheetPicture>,
+    Vec<crate::xlsx::worksheet::WorksheetTextShape>,
+) {
+    let drawing_rel = match sheet_rels.first_by_type(rel_types::DRAWING) {
+        Some(r) => r,
+        None => return (Vec::new(), Vec::new()),
+    };
+
+    let drawing_path = resolve_relative_zip_path(sheet_path, &drawing_rel.target);
+
+    let drawing_xml = match XlsxDocument::read_xml_entry(archive, &drawing_path) {
+        Ok(d) => d,
+        Err(_) => return (Vec::new(), Vec::new()),
+    };
+    // A missing or unparsable drawing .rels counts as empty: pictures are then all skipped.
+    let drawing_rels_path = sheet_rels_path(&drawing_path);
+    let drawing_rels = match XlsxDocument::read_xml_entry(archive, &drawing_rels_path) {
+        Ok(d) => Relationships::parse(&d).unwrap_or_else(|_| Relationships::empty()),
+        Err(_) => Relationships::empty(),
+    };
+
+    let parsed = match parse_drawing_anchors(&drawing_xml) {
+        Ok(a) => a,
+        Err(_) => return (Vec::new(), Vec::new()),
+    };
+
+    // Resolve picture anchors → bytes; an unknown rId or unreadable media skips that picture.
+    let mut pictures = Vec::with_capacity(parsed.pictures.len());
+    for a in parsed.pictures {
+        let rel = match drawing_rels.get_by_id(&a.embed_rid) {
+            Some(r) => r,
+            None => continue,
+        };
+        let media_path = resolve_relative_zip_path(&drawing_path, &rel.target);
+        let bytes = match opc::read_zip_entry(archive, &media_path) {
+            Ok(b) => b,
+            Err(_) => continue,
+        };
+        let ext = std::path::Path::new(&rel.target)
+            .extension()
+            .and_then(|s| s.to_str())
+            .map(|s| s.to_ascii_lowercase())
+            .unwrap_or_else(|| guess_image_format_from_bytes(&bytes).to_string());
+
+        pictures.push(crate::xlsx::worksheet::WorksheetPicture {
+            data: bytes,
+            format: ext,
+            x_emu: a.x_emu,
+            y_emu: a.y_emu,
+            cx_emu: a.cx_emu,
+            cy_emu: a.cy_emu,
+            alt_text: a.alt_text,
+        });
+    }
+
+    let text_shapes = parsed
+        .text_shapes
+        .into_iter()
+        .map(|t| crate::xlsx::worksheet::WorksheetTextShape {
+            text: t.text,
+            font_name: t.font_name,
+            font_size_pt: t.font_size_pt,
+            bold: t.bold,
+            italic: t.italic,
+            color_hex: t.color_hex,
+            x_emu: t.x_emu,
+            y_emu: t.y_emu,
+            cx_emu: t.cx_emu,
+            cy_emu: t.cy_emu,
+        })
+        .collect();
+
+    (pictures, text_shapes)
+}
+
+/// Turn a relationship `target` into the ZIP-entry path it designates,
+/// interpreting it relative to the directory of the `source` part. Plain-path
+/// counterpart of `PartName::resolve_relative`, for the `from_zip` fast path
+/// that never builds a `PartName`.
+///
+/// * A target beginning with `/` is package-absolute: its leading slashes are
+///   stripped and the remainder is returned untouched.
+/// * Otherwise empty and `.` segments are ignored and each `..` removes the
+///   most recent segment (a no-op once nothing is left).
+///
+/// e.g. `("xl/worksheets/sheet1.xml", "../drawings/drawing1.xml")` →
+/// `"xl/drawings/drawing1.xml"`.
+fn resolve_relative_zip_path(source: &str, target: &str) -> String {
+    if target.starts_with('/') {
+        return target.trim_start_matches('/').to_string();
+    }
+    // Seed the stack with the directory holding `source` (empty for a root-level part).
+    let dir = source.rsplit_once('/').map_or("", |(parent, _file)| parent);
+    let mut stack: Vec<&str> = dir.split('/').filter(|_| !dir.is_empty()).collect();
+    for piece in target.split('/') {
+        if piece == ".." {
+            stack.pop();
+        } else if !piece.is_empty() && piece != "." {
+            stack.push(piece);
+        }
+    }
+    stack.join("/")
+}
+
+#[derive(Debug)]
+struct DrawingPictureAnchor {
+    embed_rid: String,        // relationship id from the `blip` element's embed attribute
+    x_emu: i64,               // x from `pos` (or a guarded `off`); 0 when neither was read
+    y_emu: i64,               // y from `pos` (or a guarded `off`); 0 when neither was read
+    cx_emu: i64,              // cx of the last `ext` that carried one; 0 if none
+    cy_emu: i64,              // cy of the last `ext` that carried one; 0 if none
+    alt_text: Option<String>, // `descr` attribute of a `cNvPr` element, when present
+}
+
+#[derive(Debug, Default)]
+struct DrawingTextAnchor {
+    text: String,              // text of every `t` inside the shape's runs, concatenated as-is
+    font_name: Option<String>, // `typeface` of the last `latin` element seen in a run
+    font_size_pt: Option<f32>, // run-property `sz` divided by 100
+    bold: bool,                // run-property `b` attribute equal to "1" or "true"
+    italic: bool,              // run-property `i` attribute equal to "1" or "true"
+    color_hex: Option<String>, // upper-cased `val` of an `srgbClr` inside a run's `solidFill`
+    x_emu: i64,                // x from `pos` (or a guarded `off`); 0 when neither was read
+    y_emu: i64,                // y from `pos` (or a guarded `off`); 0 when neither was read
+    cx_emu: i64,               // cx of the last `ext` that carried one; 0 if none
+    cy_emu: i64,               // cy of the last `ext` that carried one; 0 if none
+}
+
+#[derive(Debug, Default)]
+struct DrawingAnchors {
+    pictures: Vec<DrawingPictureAnchor>, // `pic` anchors that carried an embed id, in file order
+    text_shapes: Vec<DrawingTextAnchor>, // `sp` anchors that produced non-empty text, in file order
+}
+
+/// Parse `xl/drawings/drawingN.xml` and return both `<xdr:pic>` and
+/// `<xdr:sp>` anchors. Position comes from `<pos>` (x/y) or from an `<off>`
+/// seen before any `<ext>` reported a non-zero size; size comes from `<ext>`
+/// cx/cy (a later one overrides). `<xdr:from>` cell coordinates are NOT read.
+/// `<xdr:sp>` shapes carry text inside `<xdr:txBody>` runs.
+fn parse_drawing_anchors(xml_data: &[u8]) -> crate::core::Result<DrawingAnchors> {
+    use quick_xml::events::Event;
+
+    let mut reader = crate::core::xml::make_fast_reader(xml_data);
+    let mut out = DrawingAnchors::default();
+
+    // Per-anchor accumulator state, reset at every anchor start tag. The
+    // anchor is not pre-classified as picture-vs-text; that is discovered
+    // mid-walk from whichever child element appears (`pic` vs `sp`).
+    enum AnchorKind {
+        Unknown,
+        Picture,
+        Text,
+    }
+    let mut in_anchor = false;
+    let mut kind = AnchorKind::Unknown;
+    let mut x_emu = 0i64;
+    let mut y_emu = 0i64;
+    let mut cx_emu = 0i64;
+    let mut cy_emu = 0i64;
+    let mut embed_rid: Option<String> = None;
+    let mut alt_text: Option<String> = None;
+    // Text-shape state (run text is concatenated; the last run property seen wins).
+    let mut in_txbody = false;
+    let mut in_run = false;
+    let mut in_a_t = false;
+    let mut text_buf = String::new();
+    let mut font_name: Option<String> = None;
+    let mut font_size_pt: Option<f32> = None;
+    let mut bold = false;
+    let mut italic = false;
+    let mut color_hex: Option<String> = None;
+    let mut in_solid_fill = false;
+
+    loop {
+        let evt = reader.read_event()?;
+        match evt {
+            Event::Start(ref e) => {
+                let local = e.local_name().as_ref().to_vec();
+                match local.as_slice() {
+                    b"absoluteAnchor" | b"oneCellAnchor" | b"twoCellAnchor" => {
+                        in_anchor = true;
+                        kind = AnchorKind::Unknown;
+                        x_emu = 0;
+                        y_emu = 0;
+                        cx_emu = 0;
+                        cy_emu = 0;
+                        embed_rid = None;
+                        alt_text = None;
+                        in_txbody = false;
+                        in_run = false;
+                        in_a_t = false;
+                        text_buf.clear();
+                        font_name = None;
+                        font_size_pt = None;
+                        bold = false;
+                        italic = false;
+                        color_hex = None;
+                        in_solid_fill = false;
+                    },
+                    b"pic" if in_anchor => {
+                        kind = AnchorKind::Picture;
+                    },
+                    b"sp" if in_anchor => {
+                        kind = AnchorKind::Text;
+                    },
+                    b"txBody" if in_anchor => {
+                        in_txbody = true;
+                    },
+                    b"r" if in_txbody => {
+                        in_run = true;
+                    },
+                    b"t" if in_run => {
+                        in_a_t = true;
+                    },
+                    b"rPr" if in_run => {
+                        for attr in e.attributes().with_checks(false) {
+                            let attr = attr.map_err(crate::core::Error::from)?;
+                            let key = attr.key.as_ref();
+                            let raw = attr.unescape_value().map_err(crate::core::Error::from)?;
+                            match key {
+                                b"sz" => {
+                                    // `sz` is in hundredths of a point.
+                                    if let Ok(n) = raw.parse::<i32>() {
+                                        font_size_pt = Some(n as f32 / 100.0);
+                                    }
+                                },
+                                b"b" => bold = raw == "1" || raw == "true",
+                                b"i" => italic = raw == "1" || raw == "true",
+                                _ => {},
+                            }
+                        }
+                    },
+                    b"solidFill" if in_run => {
+                        in_solid_fill = true;
+                    },
+                    b"cNvPr" if in_anchor => {
+                        if let Some(d) = crate::core::xml::optional_attr_str(e, b"descr")? {
+                            alt_text = Some(d.into_owned());
+                        }
+                    },
+                    _ => {},
+                }
+            },
+            Event::Empty(ref e) => {
+                if !in_anchor {
+                    continue;
+                }
+                let local = e.local_name().as_ref().to_vec();
+                match local.as_slice() {
+                    b"pos" => {
+                        if let Some(v) = crate::core::xml::optional_attr_str(e, b"x")? {
+                            x_emu = v.parse().unwrap_or(0);
+                        }
+                        if let Some(v) = crate::core::xml::optional_attr_str(e, b"y")? {
+                            y_emu = v.parse().unwrap_or(0);
+                        }
+                    },
+                    b"ext" => {
+                        if let Some(v) = crate::core::xml::optional_attr_str(e, b"cx")? {
+                            cx_emu = v.parse().unwrap_or(0);
+                        }
+                        if let Some(v) = crate::core::xml::optional_attr_str(e, b"cy")? {
+                            cy_emu = v.parse().unwrap_or(0);
+                        }
+                    },
+                    b"off" if cx_emu == 0 && cy_emu == 0 => { // ignored once an `ext` set a size
+                        if let Some(v) = crate::core::xml::optional_attr_str(e, b"x")? {
+                            x_emu = v.parse().unwrap_or(x_emu);
+                        }
+                        if let Some(v) = crate::core::xml::optional_attr_str(e, b"y")? {
+                            y_emu = v.parse().unwrap_or(y_emu);
+                        }
+                    },
+                    b"blip" => {
+                        for attr in e.attributes().with_checks(false) {
+                            let attr = attr.map_err(crate::core::Error::from)?;
+                            let key = attr.key.as_ref();
+                            if key == b"r:embed" || key.ends_with(b":embed") || key == b"embed" {
+                                let raw =
+                                    attr.unescape_value().map_err(crate::core::Error::from)?;
+                                embed_rid = Some(raw.into_owned());
+                                break;
+                            }
+                        }
+                    },
+                    b"cNvPr" => {
+                        if let Some(d) = crate::core::xml::optional_attr_str(e, b"descr")? {
+                            alt_text = Some(d.into_owned());
+                        }
+                    },
+                    b"latin" if in_run => {
+                        if let Some(t) = crate::core::xml::optional_attr_str(e, b"typeface")? {
+                            font_name = Some(t.into_owned());
+                        }
+                    },
+                    b"srgbClr" if in_solid_fill => {
+                        if let Some(v) = crate::core::xml::optional_attr_str(e, b"val")? {
+                            color_hex = Some(v.into_owned().to_uppercase());
+                        }
+                    },
+                    b"rPr" if in_run => {
+                        for attr in e.attributes().with_checks(false) {
+                            let attr = attr.map_err(crate::core::Error::from)?;
+                            let key = attr.key.as_ref();
+                            let raw = attr.unescape_value().map_err(crate::core::Error::from)?;
+                            match key {
+                                b"sz" => {
+                                    if let Ok(n) = raw.parse::<i32>() {
+                                        font_size_pt = Some(n as f32 / 100.0);
+                                    }
+                                },
+                                b"b" => bold = raw == "1" || raw == "true",
+                                b"i" => italic = raw == "1" || raw == "true",
+                                _ => {},
+                            }
+                        }
+                    },
+                    _ => {},
+                }
+            },
+            Event::Text(ref e) if in_a_t => {
+                let s = e.unescape().map_err(crate::core::Error::from)?;
+                text_buf.push_str(&s);
+            },
+            Event::End(ref e) => {
+                let local = e.local_name().as_ref().to_vec();
+                match local.as_slice() {
+                    b"t" => in_a_t = false,
+                    b"r" => in_run = false,
+                    b"txBody" => in_txbody = false,
+                    b"solidFill" => in_solid_fill = false,
+                    s if matches!(s, b"absoluteAnchor" | b"oneCellAnchor" | b"twoCellAnchor")
+                        && in_anchor =>
+                    {
+                        in_anchor = false;
+                        match kind {
+                            AnchorKind::Picture => {
+                                if let Some(rid) = embed_rid.take() {
+                                    out.pictures.push(DrawingPictureAnchor {
+                                        embed_rid: rid,
+                                        x_emu,
+                                        y_emu,
+                                        cx_emu,
+                                        cy_emu,
+                                        alt_text: alt_text.take(),
+                                    });
+                                }
+                            },
+                            AnchorKind::Text => {
+                                if !text_buf.is_empty() {
+                                    out.text_shapes.push(DrawingTextAnchor {
+                                        text: std::mem::take(&mut text_buf),
+                                        font_name: font_name.take(),
+                                        font_size_pt: font_size_pt.take(),
+                                        bold,
+                                        italic,
+                                        color_hex: color_hex.take(),
+                                        x_emu,
+                                        y_emu,
+                                        cx_emu,
+                                        cy_emu,
+                                    });
+                                }
+                            },
+                            AnchorKind::Unknown => {},
+                        }
+                        kind = AnchorKind::Unknown;
+                    },
+                    _ => {},
+                }
+            },
+            Event::Eof => break,
+            _ => {},
+        }
+    }
+
+    Ok(out)
+}
+
+/// Best-effort image-format sniffing from leading magic bytes (used when the
+/// drawing rel target has no file extension). Mirrors the PPTX helper;
+/// anything unrecognised is reported as `"png"`.
+fn guess_image_format_from_bytes(bytes: &[u8]) -> &'static str {
+    // Checked in order; the first signature that prefixes `bytes` wins.
+    const MAGIC: &[(&[u8], &str)] = &[
+        (&[0x89, b'P', b'N', b'G'], "png"),
+        (&[0xFF, 0xD8, 0xFF], "jpeg"),
+        (b"GIF87a", "gif"),
+        (b"GIF89a", "gif"),
+        (b"BM", "bmp"),
+        (b"II*\0", "tiff"),
+        (b"MM\0*", "tiff"),
+        (&[0xD7, 0xCD, 0xC6, 0x9A], "wmf"),
+        (&[0x01, 0x00, 0x00, 0x00], "emf"),
+    ];
+    MAGIC
+        .iter()
+        .find(|(sig, _)| bytes.starts_with(sig))
+        // Unrecognised data falls back to PNG.
+        .map_or("png", |&(_, name)| name)
+}
diff --git a/src/xlsx/numfmt.rs b/src/xlsx/numfmt.rs
new file mode 100644
index 0000000..c945861
--- /dev/null
+++ b/src/xlsx/numfmt.rs
@@ -0,0 +1,478 @@
+//! Excel number format rendering.
+//!
+//! Applies a numeric format string (or built-in format ID) to an f64 value
+//! and returns the display string. Covers the cases that matter in practice:
+//! integers, fixed decimals, thousands separators, percentages, currency,
+//! and scientific notation. Complex conditions/colors are stripped gracefully.
+
+/// Render `n` the way a cell carrying number format `fmt_id` displays it.
+///
+/// The built-in ids matched below are rendered directly and `fmt_str` is not
+/// consulted for them. Any other id sends `fmt_str` through the custom-format
+/// interpreter, unless it is absent, blank, `General` or `@`, in which case
+/// General formatting applies.
+///
+/// NaN and ±infinity render as an empty string.
+pub fn apply_format(n: f64, fmt_id: u32, fmt_str: Option<&str>) -> String {
+    if !n.is_finite() {
+        return String::new();
+    }
+
+    // Built-in format IDs per OOXML spec §18.8.30.
+    match fmt_id {
+        0 | 49 => format_general(n),         // General / @
+        1 => format_integer(n),              // 0
+        2 => format_fixed(n, 2),             // 0.00
+        3 => format_commas(n, 0),            // #,##0
+        4 => format_commas(n, 2),            // #,##0.00
+        5 | 6 => format_currency(n, "$", 0), // $#,##0
+        7 | 8 => format_currency(n, "$", 2), // $#,##0.00
+        9 => format_percent(n, 0),           // 0%
+        10 => format_percent(n, 2),          // 0.00%
+        11 => format_scientific(n),          // 0.00E+00
+        12 | 13 => format_general(n),        // # ?/? and # ??/?? (fractions — approx)
+        37 | 38 => format_commas(n, 0),      // #,##0 accounting variants
+        39..=44 => format_commas(n, 2),      // #,##0.00 and _(* ...) accounting variants
+        // Every other id: use the custom format code when it says something.
+        _ => match fmt_str.map(str::trim) {
+            Some(code) if !code.is_empty() && code != "General" && code != "@" => {
+                apply_custom(n, code)
+            },
+            _ => format_general(n),
+        },
+    }
+}
+
+// ── Simple format primitives ───────────────────────────────────────────────
+
+/// Format a number using Excel's General format: whole values below 1e15 in
+/// magnitude print as integers with no fractional part, everything else uses
+/// Rust's default `Display` output for `f64`.
+pub fn format_general(n: f64) -> String {
+    if n == n.trunc() && n.abs() < 1e15 {
+        (n as i64).to_string()
+    } else {
+        n.to_string()
+    }
+}
+
+fn format_integer(n: f64) -> String {
+    (n.round() as i64).to_string() // `f64::round` sends halves away from zero
+}
+
+fn format_fixed(n: f64, decimals: u8) -> String {
+    format!("{:.*}", usize::from(decimals), n) // precision is passed as the `.*` argument
+}
+
+/// Render `n` with a `,` between every three integer digits and exactly
+/// `decimals` fractional digits (no `.` at all when `decimals` is 0).
+///
+/// The magnitude is rounded half away from zero at the requested precision,
+/// and every input below zero keeps its leading `-`, even if it rounds to 0.
+/// The integer part is converted through `u64`.
+pub fn format_commas(n: f64, decimals: u8) -> String {
+    let scale = 10f64.powi(decimals as i32);
+    // Round the magnitude up front so a carry (9.999 → 10.00) reaches the integer digits.
+    let magnitude = (n.abs() * scale).round() / scale;
+    let whole = insert_commas(magnitude.trunc() as u64);
+    let sign = if n < 0.0 { "-" } else { "" };
+
+    match decimals {
+        0 => format!("{}{}", sign, whole),
+        places => {
+            let frac = (magnitude.fract() * scale).round() as u64;
+            format!("{}{}.{:0>width$}", sign, whole, frac, width = places as usize)
+        },
+    }
+}
+
+fn format_currency(n: f64, symbol: &str, decimals: u8) -> String {
+    [symbol, format_commas(n, decimals).as_str()].concat() // sign follows the symbol: `$-99.50`
+}
+
+/// Format `n` as a percentage: the value is scaled by 100 and suffixed with `%`.
+pub fn format_percent(n: f64, decimals: u8) -> String {
+    let scaled = n * 100.0;
+    match decimals {
+        // Whole percents go through `round` (halves away from zero), not float formatting.
+        0 => format!("{}%", scaled.round() as i64),
+        places => format!("{:.prec$}%", scaled, prec = places as usize),
+    }
+}
+
+fn format_scientific(n: f64) -> String {
+    let raw = format!("{:.2E}", n); // Rust yields `1.23E4`; Excel's `0.00E+00` shows `1.23E+04`.
+    let Some((mantissa, exp)) = raw.split_once('E') else { return raw };
+    format!("{mantissa}E{:+03}", exp.parse::<i32>().unwrap_or(0)) // sign + two-digit pad
+}
+
+fn insert_commas(n: u64) -> String {
+    let digits = n.to_string();
+    let total = digits.len();
+    let mut grouped = String::with_capacity(total + total / 3);
+    // Emit a comma before each digit whose distance to the end is a multiple of three.
+    for (idx, digit) in digits.chars().enumerate() {
+        if idx != 0 && (total - idx).is_multiple_of(3) {
+            grouped.push(',');
+        }
+        grouped.push(digit);
+    }
+    grouped
+}
+
+// ── Custom format string interpreter ──────────────────────────────────────
+
+/// Simplified interpreter for Excel format codes. Handles the common cases:
+/// thousands separators, decimal places, percentages, currency symbols,
+/// and scientific notation. Colour/locale brackets, escapes and padding are dropped.
+fn apply_custom(n: f64, fmt: &str) -> String {
+    // Multi-section codes: only the first section is applied, whatever the sign of `n`
+    // (Excel's order is positive; negative; zero; text).
+    let section = fmt.split(';').next().unwrap_or(fmt);
+
+    // ── Parse the section ────────────────────────────────────────────────
+    let mut currency_prefix = String::new();
+    let mut suffix = String::new(); // quoted literal text; always emitted after the number
+    let mut has_percent = false;
+    let mut has_comma_in_num = false;
+    let mut decimal_zeros = 0u8; // '0' chars after '.'
+    let mut _decimal_hashes = 0u8; // '#' chars after '.' (counted, not used yet)
+    let mut has_scientific = false;
+    let mut in_decimal = false;
+    let mut in_num_part = false;
+
+    let mut chars = section.chars().peekable();
+    while let Some(c) = chars.next() {
+        match c {
+            // Bracketed: colour like [Red] or locale/currency like [$€-407]
+            '[' => {
+                let mut inner = String::new();
+                for ch in chars.by_ref() {
+                    if ch == ']' {
+                        break;
+                    }
+                    inner.push(ch);
+                }
+                if let Some(rest) = inner.strip_prefix('$') {
+                    // [$symbol-locale] — extract symbol
+                    let sym: String = rest.chars().take_while(|&ch| ch != '-').collect();
+                    if !sym.is_empty() {
+                        currency_prefix = sym;
+                    }
+                }
+                // Anything else in brackets (colours, conditions) is ignored.
+            },
+            // Quoted literal text — collected into `suffix` wherever it appears
+            '"' => {
+                for ch in chars.by_ref() {
+                    if ch == '"' {
+                        break;
+                    }
+                    suffix.push(ch);
+                }
+            },
+            // Escape: the escaped literal char is dropped
+            '\\' => {
+                chars.next();
+            },
+            // _X = pad with X (alignment) — skip X
+            '_' => {
+                chars.next();
+            },
+            // *X = repeat X (fill) — skip X
+            '*' => {
+                chars.next();
+            },
+
+            '%' => {
+                has_percent = true;
+                in_num_part = true;
+            },
+            '.' => {
+                in_decimal = true;
+                in_num_part = true;
+            },
+            '0' => {
+                in_num_part = true;
+                if in_decimal {
+                    decimal_zeros += 1;
+                }
+            },
+            '#' => {
+                in_num_part = true;
+                if in_decimal {
+                    _decimal_hashes += 1;
+                }
+            },
+            ',' => {
+                // Any comma after the number part has started counts as a thousands
+                // separator; trailing scale-by-1000 commas are not distinguished.
+                if in_num_part {
+                    has_comma_in_num = true;
+                }
+            },
+            'E' | 'e' => {
+                has_scientific = true;
+                // Skip the +/- and exponent digits
+                chars.next(); // '+' or '-'
+                while chars.peek().is_some_and(|c| c.is_ascii_digit()) {
+                    chars.next();
+                }
+            },
+            '$' => {
+                currency_prefix = "$".to_string();
+                in_num_part = true;
+            },
+            // Other non-whitespace chars before the number part are appended to the prefix
+            c if !in_num_part && !c.is_ascii_whitespace() => {
+                currency_prefix.push(c);
+            },
+            _ => {},
+        }
+    }
+
+    let decimals = decimal_zeros; // treat '0' decimals as the required precision
+
+    // ── Format the value ─────────────────────────────────────────────────
+    let value = if has_percent { n * 100.0 } else { n };
+
+    let body = if has_scientific {
+        format_scientific(value)
+    } else if has_comma_in_num {
+        format_commas(value, decimals)
+    } else if in_decimal && decimals > 0 {
+        format_fixed(value, decimals)
+    } else if in_num_part {
+        format_integer(value)
+    } else {
+        format_general(value)
+    };
+
+    let pct_suffix = if has_percent { "%" } else { "" };
+
+    format!("{}{}{}{}", currency_prefix, body, suffix, pct_suffix)
+}
+
+// ── Tests ──────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn builtin_general() {
+        assert_eq!(apply_format(42.0, 0, None), "42");
+        assert_eq!(apply_format(4.25, 0, None), "4.25");
+    }
+
+    #[test]
+    fn builtin_integer() {
+        assert_eq!(apply_format(42.7, 1, None), "43");
+    }
+
+    #[test]
+    fn builtin_fixed_two() {
+        assert_eq!(apply_format(4.25678, 2, None), "4.26");
+    }
+
+    #[test]
+    fn builtin_commas_zero() {
+        assert_eq!(apply_format(1234567.0, 3, None), "1,234,567");
+    }
+
+    #[test]
+    fn builtin_commas_two() {
+        assert_eq!(apply_format(1234567.891, 4, None), "1,234,567.89");
+    }
+
+    #[test]
+    fn builtin_percent_zero() {
+        assert_eq!(apply_format(0.75, 9, None), "75%");
+    }
+
+    #[test]
+    fn builtin_percent_two() {
+        assert_eq!(apply_format(0.1234, 10, None), "12.34%");
+    }
+
+    #[test]
+    fn builtin_currency_usd() {
+        assert_eq!(apply_format(1234.5, 7, None), "$1,234.50");
+    }
+
+    #[test]
+    fn custom_thousands() {
+        assert_eq!(apply_format(1234567.0, 164, Some("#,##0")), "1,234,567");
+    }
+
+    #[test]
+    fn custom_thousands_two_decimals() {
+        assert_eq!(apply_format(1234.5, 164, Some("#,##0.00")), "1,234.50");
+    }
+
+    #[test]
+    fn custom_percent() {
+        assert_eq!(apply_format(0.5, 164, Some("0%")), "50%");
+    }
+
+    #[test]
+    fn custom_percent_decimals() {
+        assert_eq!(apply_format(0.1256, 164, Some("0.00%")), "12.56%");
+    }
+
+    #[test]
+    fn custom_euro() {
+        let result = apply_format(1234.5, 164, Some("[$€-407]#,##0.00"));
+        assert!(result.contains("€"), "expected euro symbol, got: {result}");
+        assert!(result.contains("1,234.50"), "expected formatted number, got: {result}");
+    }
+
+    #[test]
+    fn custom_dollar_prefix() {
+        assert_eq!(apply_format(99.9, 164, Some("$#,##0.00")), "$99.90");
+    }
+
+    #[test]
+    fn negative_commas() {
+        assert_eq!(apply_format(-1234.5, 4, None), "-1,234.50");
+    }
+
+    #[test]
+    fn zero_percent() {
+        assert_eq!(apply_format(0.0, 9, None), "0%");
+    }
+
+    #[test]
+    fn large_commas() {
+        assert_eq!(apply_format(1_000_000_000.0, 3, None), "1,000,000,000");
+    }
+
+    // ── Edge cases ──────────────────────────────────────────────────────
+
+    #[test]
+    fn nan_returns_empty() {
+        assert_eq!(apply_format(f64::NAN, 0, None), "");
+    }
+
+    #[test]
+    fn infinity_returns_empty() {
+        assert_eq!(apply_format(f64::INFINITY, 0, None), "");
+        assert_eq!(apply_format(f64::NEG_INFINITY, 0, None), "");
+    }
+
+    #[test]
+    fn zero_renders_uniformly() {
+        assert_eq!(apply_format(0.0, 0, None), "0");
+        assert_eq!(apply_format(0.0, 2, None), "0.00");
+        assert_eq!(apply_format(0.0, 4, None), "0.00");
+    }
+
+    #[test]
+    fn negative_percent() {
+        assert_eq!(apply_format(-0.25, 9, None), "-25%");
+        assert_eq!(apply_format(-0.1234, 10, None), "-12.34%");
+    }
+
+    #[test]
+    fn negative_currency() {
+        assert_eq!(apply_format(-99.5, 7, None), "$-99.50");
+    }
+
+    #[test]
+    fn scientific_builtin() {
+        // Format id 11 = 0.00E+00 → uses Rust's "{:.2E}" wrapper.
+        let s = apply_format(12345.6789, 11, None);
+        assert!(s.contains('E'), "scientific got: {s}");
+    }
+
+    #[test]
+    fn accounting_alias() {
+        // 37–40 map to comma formats matching #,##0 family.
+        assert_eq!(apply_format(1234.0, 37, None), "1,234");
+        assert_eq!(apply_format(1234.5, 39, None), "1,234.50");
+    }
+
+    #[test]
+    fn accounting_paren_range() {
+        // 41..=44 are accounting variants → commas with 2 decimals.
+        for id in 41u32..=44 {
+            assert_eq!(apply_format(1234.5, id, None), "1,234.50", "fmt id {id}");
+        }
+    }
+
+    #[test]
+    fn fraction_falls_back_to_general() {
+        // Fraction formats (12,13) currently render as general.
+        assert_eq!(apply_format(1.5, 12, None), "1.5");
+        assert_eq!(apply_format(2.0, 13, None), "2");
+    }
+
+    #[test]
+    fn custom_general_falls_through_to_default() {
+        // "General" and "@" should fall back to General formatting.
+        assert_eq!(apply_format(42.5, 164, Some("General")), "42.5");
+        assert_eq!(apply_format(42.0, 164, Some("@")), "42");
+    }
+
+    #[test]
+    fn custom_blank_falls_back_to_general() {
+        assert_eq!(apply_format(4.25, 164, Some("")), "4.25");
+        assert_eq!(apply_format(4.25, 164, Some("   ")), "4.25");
+    }
+
+    #[test]
+    fn custom_multi_section_uses_first() {
+        // Multi-section format: positives use first section only.
+        assert_eq!(apply_format(1234.5, 164, Some("#,##0.00;-#,##0.00")), "1,234.50");
+    }
+
+    #[test]
+    fn custom_with_quoted_literal_suffix() {
+        let result = apply_format(42.0, 164, Some(r#"0" units""#));
+        assert!(result.contains("42"), "got: {result}");
+        assert!(result.contains("units"), "got: {result}");
+    }
+
+    #[test]
+    fn custom_color_directive_is_stripped() {
+        // [Red] is a color directive — should be ignored, not emitted.
+        let result = apply_format(123.0, 164, Some("[Red]#,##0"));
+        assert!(!result.contains("Red"));
+        assert!(result.contains("123"));
+    }
+
+    #[test]
+    fn format_general_keeps_integers_unsuffixed() {
+        // Whole-number floats render without ".0".
+        assert_eq!(format_general(42.0), "42");
+        assert_eq!(format_general(-7.0), "-7");
+        assert_eq!(format_general(0.0), "0");
+    }
+
+    #[test]
+    fn format_general_keeps_decimal_for_fraction() {
+        assert_eq!(format_general(4.25), "4.25");
+        assert_eq!(format_general(-2.5), "-2.5");
+    }
+
+    #[test]
+    fn format_commas_negative_with_decimals() {
+        assert_eq!(format_commas(-1234.5, 2), "-1,234.50");
+    }
+
+    #[test]
+    fn format_commas_zero() {
+        assert_eq!(format_commas(0.0, 0), "0");
+        assert_eq!(format_commas(0.0, 2), "0.00");
+    }
+
+    #[test]
+    fn format_percent_negative() {
+        assert_eq!(format_percent(-0.5, 0), "-50%");
+    }
+
+    #[test]
+    fn format_percent_zero_decimals() {
+        // 50% with 0 decimals.
+        assert_eq!(format_percent(0.5, 0), "50%");
+    }
+}
diff --git a/src/xlsx/styles.rs b/src/xlsx/styles.rs
index 7b9f2a5..505776a 100644
--- a/src/xlsx/styles.rs
+++ b/src/xlsx/styles.rs
@@ -1,3 +1,5 @@
+use std::collections::HashMap;
+
 use quick_xml::events::Event;
 
 use crate::core::theme::ColorRef;
@@ -8,8 +10,8 @@ use super::shared_strings::parse_color_ref;
 /// Parsed stylesheet from `xl/styles.xml`.
 #[derive(Debug, Clone)]
 pub struct StyleSheet {
-    /// Custom number formats (IDs ≥ 164).
-    pub number_formats: Vec<NumberFormat>,
+    /// Custom number formats: numFmtId → formatCode string (O(1) lookup).
+    pub number_formats: HashMap<u32, String>,
     /// Font definitions.
     pub fonts: Vec<Font>,
     /// Fill definitions.
@@ -22,15 +24,6 @@ pub struct StyleSheet {
     pub cell_style_formats: Vec<CellFormat>,
 }
 
-/// A custom number format (ID >= 164).
-#[derive(Debug, Clone)]
-pub struct NumberFormat {
-    /// Format ID (used by `CellFormat.number_format_id`).
-    pub id: u32,
-    /// Excel format code string (e.g., `"#,##0.00"`).
-    pub format_code: String,
-}
-
 /// A font definition.
 #[derive(Debug, Clone)]
 pub struct Font {
@@ -105,7 +98,7 @@ impl StyleSheet {
     pub fn parse(xml_data: &[u8]) -> crate::core::Result<Self> {
         let mut reader = xml::make_fast_reader(xml_data);
 
-        let mut number_formats = Vec::new();
+        let mut number_formats = HashMap::new();
         let mut fonts = Vec::new();
         let mut fills = Vec::new();
         let mut borders = Vec::new();
@@ -116,7 +109,7 @@ impl StyleSheet {
             match reader.read_event()? {
                 Event::Start(ref e) => match e.local_name().as_ref() {
                     b"numFmts" => {
-                        number_formats = parse_num_fmts(&mut reader)?;
+                        number_formats = parse_num_fmts_map(&mut reader)?;
                     },
                     b"fonts" => {
                         fonts = parse_fonts(&mut reader)?;
@@ -154,10 +147,7 @@ impl StyleSheet {
     pub fn number_format_for(&self, style_index: u32) -> Option<&str> {
         let xf = self.cell_formats.get(style_index as usize)?;
         let fmt_id = xf.number_format_id;
-        self.number_formats
-            .iter()
-            .find(|nf| nf.id == fmt_id)
-            .map(|nf| nf.format_code.as_str())
+        self.number_formats.get(&fmt_id).map(|s| s.as_str())
     }
 
     /// Get the font for a cell format index.
@@ -175,16 +165,18 @@ impl StyleSheet {
     }
 }
 
-/// Parse `<numFmts>` — custom number formats.
-fn parse_num_fmts(reader: &mut quick_xml::Reader<&[u8]>) -> crate::core::Result<Vec<NumberFormat>> {
-    let mut formats = Vec::new();
+/// Parse `<numFmts>` — custom number formats into a HashMap for O(1) lookup.
+fn parse_num_fmts_map(
+    reader: &mut quick_xml::Reader<&[u8]>,
+) -> crate::core::Result<HashMap<u32, String>> {
+    let mut map = HashMap::new();
 
     loop {
         match reader.read_event()? {
             Event::Start(ref e) | Event::Empty(ref e) if e.local_name().as_ref() == b"numFmt" => {
                 let id: u32 = xml::required_attr_str(e, b"numFmtId")?.parse()?;
                 let format_code = xml::required_attr_str(e, b"formatCode")?.into_owned();
-                formats.push(NumberFormat { id, format_code });
+                map.insert(id, format_code);
             },
             Event::End(ref e) if e.local_name().as_ref() == b"numFmts" => {
                 break;
@@ -194,7 +186,7 @@ fn parse_num_fmts(reader: &mut quick_xml::Reader<&[u8]>) -> crate::core::Result<
         }
     }
 
-    Ok(formats)
+    Ok(map)
 }
 
 /// Parse `<fonts>` collection.
@@ -538,8 +530,7 @@ mod tests {
 
         // Number formats
         assert_eq!(ss.number_formats.len(), 1);
-        assert_eq!(ss.number_formats[0].id, 164);
-        assert_eq!(ss.number_formats[0].format_code, "yyyy-mm-dd");
+        assert_eq!(ss.number_formats.get(&164).map(|s| s.as_str()), Some("yyyy-mm-dd"));
 
         // Fonts
         assert_eq!(ss.fonts.len(), 2);
@@ -561,10 +552,7 @@ mod tests {
     #[test]
     fn number_format_lookup() {
         let ss = StyleSheet {
-            number_formats: vec![NumberFormat {
-                id: 164,
-                format_code: "yyyy-mm-dd".to_string(),
-            }],
+            number_formats: [(164u32, "yyyy-mm-dd".to_string())].into_iter().collect(),
             fonts: vec![],
             fills: vec![],
             borders: vec![],
@@ -598,7 +586,7 @@ mod tests {
     #[test]
     fn font_lookup() {
         let ss = StyleSheet {
-            number_formats: vec![],
+            number_formats: std::collections::HashMap::new(),
             fonts: vec![
                 Font {
                     bold: false,
diff --git a/src/xlsx/text.rs b/src/xlsx/text.rs
index b88d19e..83df018 100644
--- a/src/xlsx/text.rs
+++ b/src/xlsx/text.rs
@@ -1,6 +1,7 @@
 use super::XlsxDocument;
 use super::cell::{Cell, CellValue};
 use super::date;
+use super::numfmt;
 use super::worksheet::Row;
 
 impl XlsxDocument {
@@ -71,6 +72,14 @@ impl XlsxDocument {
                 }
             }
         }
+        // Charts: emit each chart's extracted text under a "## Chart N" heading
+        // so its words appear in markdown / search / PDF without needing a
+        // graphical chart renderer.
+        for (i, text) in self.chart_text.iter().enumerate() {
+            if !text.trim().is_empty() {
+                parts.push(format!("## Chart {}\n\n{}", i + 1, text));
+            }
+        }
         parts.join("\n\n")
     }
 
@@ -86,6 +95,33 @@ impl XlsxDocument {
             return Some(String::new());
         }
 
+        // If the sheet is effectively single-column with prose-length cells
+        // (notes, single-column reports), emit each cell as its own paragraph
+        // instead of wrapping every line in a 1-column GFM table. The table
+        // form looks awful when rendered (tall, narrow, hard to read) and
+        // round-trips badly through markdown→IR→office.
+        if col_count == 1
+            && ws.rows.iter().any(|r| {
+                r.cells
+                    .first()
+                    .map(|c| self.format_cell_value(c).chars().count() > 20)
+                    .unwrap_or(false)
+            })
+        {
+            let mut out = String::new();
+            out.push_str(&format!("## {}\n\n", ws.name));
+            for row in &ws.rows {
+                if let Some(cell) = row.cells.first() {
+                    let text = self.format_cell_value(cell);
+                    if !text.trim().is_empty() {
+                        out.push_str(text.trim());
+                        out.push_str("\n\n");
+                    }
+                }
+            }
+            return Some(out.trim_end().to_string());
+        }
+
         let mut lines = Vec::new();
 
         // Sheet name as heading
@@ -143,6 +179,18 @@ impl XlsxDocument {
                         return;
                     }
                 }
+                if let Some(idx) = cell.style_index {
+                    if let Some(styles) = self.styles.as_ref() {
+                        if let Some(fmt_id) = styles.number_format_id_for(idx) {
+                            if fmt_id != 0 {
+                                let fmt_str = styles.number_format_for(idx);
+                                let formatted = numfmt::apply_format(*n, fmt_id, fmt_str);
+                                buf.push_str(&formatted);
+                                return;
+                            }
+                        }
+                    }
+                }
                 write_number(*n, buf);
             },
             CellValue::String(s) => buf.push_str(s),
@@ -164,6 +212,79 @@ impl XlsxDocument {
             CellValue::Date(dt) => buf.push_str(&dt.to_iso_string()),
         }
     }
+
+    /// Pre-compute the set of style indices that map to date formats.
+    /// Call once before iterating many cells; use with `write_cell_value_fast`.
+    pub fn date_style_indices(&self) -> std::collections::HashSet<u32> {
+        let Some(styles) = self.styles.as_ref() else {
+            return Default::default();
+        };
+        (0..styles.cell_formats.len() as u32)
+            .filter(|&idx| {
+                let Some(fmt_id) = styles.number_format_id_for(idx) else {
+                    return false;
+                };
+                date::is_date_format_id(fmt_id)
+                    || styles
+                        .number_format_for(idx)
+                        .is_some_and(date::is_date_format_string)
+            })
+            .collect()
+    }
+
+    /// Like `write_cell_value` but uses a pre-computed date style set instead
+    /// of calling `is_date_cell()` (which re-scans format strings) per cell.
+    pub fn write_cell_value_fast(
+        &self,
+        cell: &Cell,
+        buf: &mut String,
+        date_indices: &std::collections::HashSet<u32>,
+    ) {
+        match &cell.value {
+            CellValue::Empty => {},
+            CellValue::Number(n) => {
+                let is_date = cell
+                    .style_index
+                    .is_some_and(|i| date_indices.contains(&i));
+                if is_date {
+                    if let Some(dt) = date::DateTimeValue::from_serial(*n, self.workbook.date1904) {
+                        buf.push_str(&dt.to_iso_string());
+                        return;
+                    }
+                }
+                // Apply number format (thousands, decimals, %, currency, etc.)
+                if let Some(idx) = cell.style_index {
+                    if let Some(styles) = self.styles.as_ref() {
+                        if let Some(fmt_id) = styles.number_format_id_for(idx) {
+                            if fmt_id != 0 {
+                                let fmt_str = styles.number_format_for(idx);
+                                let formatted = numfmt::apply_format(*n, fmt_id, fmt_str);
+                                buf.push_str(&formatted);
+                                return;
+                            }
+                        }
+                    }
+                }
+                write_number(*n, buf);
+            },
+            CellValue::String(s) => buf.push_str(s),
+            CellValue::SharedString(idx) => {
+                let s = self.shared_strings.get(*idx).unwrap_or("");
+                if s.len() <= 32_768 {
+                    buf.push_str(s);
+                } else {
+                    let mut end = 32_768;
+                    while !s.is_char_boundary(end) && end > 0 {
+                        end -= 1;
+                    }
+                    buf.push_str(&s[..end]);
+                }
+            },
+            CellValue::Boolean(b) => buf.push_str(if *b { "TRUE" } else { "FALSE" }),
+            CellValue::Error(e) => buf.push_str(e),
+            CellValue::Date(dt) => buf.push_str(&dt.to_iso_string()),
+        }
+    }
 }
 
 /// Write a formatted number directly to a buffer.
diff --git a/src/xlsx/worksheet.rs b/src/xlsx/worksheet.rs
index 0af3200..ee53726 100644
--- a/src/xlsx/worksheet.rs
+++ b/src/xlsx/worksheet.rs
@@ -17,6 +17,98 @@ pub struct Worksheet {
     pub merged_cells: Vec<String>,
     /// Hyperlinks defined on this sheet.
     pub hyperlinks: Vec<HyperlinkInfo>,
+    /// Per-sheet page geometry parsed from `<pageMargins>` + `<pageSetup>`.
+    pub page_setup: Option<PageSetup>,
+    /// Pictures anchored on this worksheet via `xl/drawings/drawingN.xml`.
+    /// Resolved at parse time: anchor + image bytes are materialised
+    /// into this `Vec` so consumers don't need to re-walk the OPC
+    /// reader. Empty when the worksheet has no drawing rel.
+    pub images: Vec<WorksheetPicture>,
+    /// Layout-preserving text shapes anchored on this worksheet via a
+    /// DrawingML drawing part. Each entry is one `<xdr:sp>` carrying a
+    /// single styled run — populated by the round-trip from
+    /// `to_xlsx_bytes_layout`. Empty when the worksheet has no
+    /// `<xdr:sp>` shapes (the common XLSX case).
+    pub text_shapes: Vec<WorksheetTextShape>,
+}
+
+/// A text shape anchored on a worksheet via a DrawingML drawing part.
+/// Mirrors `xlsx::write::SheetTextShape`.
+#[derive(Debug, Clone)]
+pub struct WorksheetTextShape {
+    /// Text content of the shape.
+    pub text: String,
+    /// Font face name.
+    pub font_name: Option<String>,
+    /// Font size in points (full-pt scale).
+    pub font_size_pt: Option<f32>,
+    /// Bold weight.
+    pub bold: bool,
+    /// Italic style.
+    pub italic: bool,
+    /// 6-char hex colour, when present.
+    pub color_hex: Option<String>,
+    /// X anchor in EMU.
+    pub x_emu: i64,
+    /// Y anchor in EMU.
+    pub y_emu: i64,
+    /// Width in EMU.
+    pub cx_emu: i64,
+    /// Height in EMU.
+    pub cy_emu: i64,
+}
+
+/// A picture anchored on a worksheet via a DrawingML drawing part.
+///
+/// Coordinates are in EMU (914400 per inch) and absolute relative to
+/// the sheet origin (top-left). When the source used a one-cell or
+/// two-cell anchor we approximate the equivalent absolute origin by
+/// summing the from-cell coordinates. The bytes are the raw image
+/// part contents; `format` is the lowercase file extension.
+#[derive(Debug, Clone)]
+pub struct WorksheetPicture {
+    /// Image bytes.
+    pub data: Vec<u8>,
+    /// Lowercase file extension (`"png"`, `"jpeg"`, ...).
+    pub format: String,
+    /// X anchor in EMU.
+    pub x_emu: i64,
+    /// Y anchor in EMU.
+    pub y_emu: i64,
+    /// Rendered width in EMU.
+    pub cx_emu: i64,
+    /// Rendered height in EMU.
+    pub cy_emu: i64,
+    /// Optional `<xdr:cNvPr descr=…>` accessibility text.
+    pub alt_text: Option<String>,
+}
+
+/// Per-sheet page geometry; every length field is stored in twips.
+///
+/// Parsed from `<pageMargins>` (whose attributes are inches per ECMA-376)
+/// and `<pageSetup>` (size as `paperWidth`/`paperHeight` with a unit suffix
+/// — `mm`, `cm`, `in` — or as a `paperSize` enum), then converted to twips
+/// for IR parity (1 inch = 1440 twips, 1 mm = 1440/25.4 ≈ 56.6929 twips).
+#[derive(Debug, Clone, Copy, PartialEq, Default)]
+pub struct PageSetup {
+    /// Page width in twips. Zero if no usable page size was seen.
+    pub width_twips: u32,
+    /// Page height in twips. Zero if no usable page size was seen.
+    pub height_twips: u32,
+    /// Top margin in twips.
+    pub margin_top_twips: u32,
+    /// Bottom margin in twips.
+    pub margin_bottom_twips: u32,
+    /// Left margin in twips.
+    pub margin_left_twips: u32,
+    /// Right margin in twips.
+    pub margin_right_twips: u32,
+    /// Header distance from top edge in twips.
+    pub header_distance_twips: u32,
+    /// Footer distance from bottom edge in twips.
+    pub footer_distance_twips: u32,
+    /// Whether the page is in landscape orientation.
+    pub landscape: bool,
 }
 
 /// A row from `<sheetData>`.
@@ -63,6 +155,12 @@ impl Worksheet {
         let mut rows = Vec::new();
         let mut merged_cells = Vec::new();
         let mut hyperlinks = Vec::new();
+        // Page setup is collected lazily because <pageMargins> and
+        // <pageSetup> arrive as separate sibling elements and either may
+        // appear without the other. We materialize the IR value at the
+        // end iff at least one was seen.
+        let mut margins_in: Option<PageMarginsIn> = None;
+        let mut page_setup_raw: Option<PageSetupRaw> = None;
 
         loop {
             match reader.read_event()? {
@@ -86,6 +184,14 @@ impl Worksheet {
                         }
                         reader.read_to_end(e.to_end().name())?;
                     },
+                    b"pageMargins" => {
+                        margins_in = parse_page_margins(e)?;
+                        reader.read_to_end(e.to_end().name())?;
+                    },
+                    b"pageSetup" => {
+                        page_setup_raw = parse_page_setup_attrs(e)?;
+                        reader.read_to_end(e.to_end().name())?;
+                    },
                     _ => {},
                 },
                 Event::Empty(ref e) => match e.local_name().as_ref() {
@@ -102,6 +208,12 @@ impl Worksheet {
                             hyperlinks.push(hl);
                         }
                     },
+                    b"pageMargins" => {
+                        margins_in = parse_page_margins(e)?;
+                    },
+                    b"pageSetup" => {
+                        page_setup_raw = parse_page_setup_attrs(e)?;
+                    },
                     _ => {},
                 },
                 Event::Eof => break,
@@ -109,16 +221,175 @@ impl Worksheet {
             }
         }
 
+        let page_setup = build_page_setup(margins_in, page_setup_raw);
+
         Ok(Worksheet {
             name,
             dimension,
             rows,
             merged_cells,
             hyperlinks,
+            page_setup,
+            images: Vec::new(),
+            text_shapes: Vec::new(),
         })
     }
 }
 
+/// Raw `<pageMargins>` values in inches (per ECMA-376 §18.3.1.62).
+#[derive(Debug, Clone, Copy)]
+struct PageMarginsIn {
+    left: f64,
+    right: f64,
+    top: f64,
+    bottom: f64,
+    header: f64,
+    footer: f64,
+}
+
+/// Raw `<pageSetup>` shape — physical dimensions in twips plus orientation.
+#[derive(Debug, Clone, Copy, Default)]
+struct PageSetupRaw {
+    width_twips: u32,
+    height_twips: u32,
+    landscape: bool,
+}
+
+fn parse_page_margins(
+    e: &quick_xml::events::BytesStart,
+) -> crate::core::Result<Option<PageMarginsIn>> {
+    let parse = |k: &[u8]| -> crate::core::Result<Option<f64>> {
+        Ok(xml::optional_attr_str(e, k)?
+            .and_then(|v| fast_float2::parse::<f64, _>(v.as_ref()).ok()))
+    };
+    let left = parse(b"left")?;
+    let right = parse(b"right")?;
+    let top = parse(b"top")?;
+    let bottom = parse(b"bottom")?;
+    let header = parse(b"header")?;
+    let footer = parse(b"footer")?;
+    if left.is_none() && right.is_none() && top.is_none() && bottom.is_none() {
+        return Ok(None);
+    }
+    Ok(Some(PageMarginsIn {
+        left: left.unwrap_or(0.7),
+        right: right.unwrap_or(0.7),
+        top: top.unwrap_or(0.75),
+        bottom: bottom.unwrap_or(0.75),
+        header: header.unwrap_or(0.3),
+        footer: footer.unwrap_or(0.3),
+    }))
+}
+
+/// Translate an inch / mm / cm dimension token ("210mm", "8.5in", "21cm",
+/// or a bare "210" assumed mm) into twips.  Returns `None` for blanks,
+/// unparseable input, and non-positive or non-finite ("nan"/"inf") values.
+fn dim_to_twips(s: &str) -> Option<u32> {
+    let s = s.trim();
+    if s.is_empty() {
+        return None;
+    }
+    let (num_part, factor): (&str, f64) = if let Some(rest) = s.strip_suffix("mm") {
+        (rest, 1440.0 / 25.4)
+    } else if let Some(rest) = s.strip_suffix("cm") {
+        (rest, 1440.0 / 2.54)
+    } else if let Some(rest) = s.strip_suffix("in") {
+        (rest, 1440.0)
+    } else {
+        // Bare numeric — ECMA-376 says the default unit varies by locale;
+        // mm is the safest bet for arbitrary writers (and matches what we
+        // emit in `build_worksheet_xml`).
+        (s, 1440.0 / 25.4)
+    };
+    let v: f64 = fast_float2::parse(num_part.trim()).ok()?;
+    if !v.is_finite() || v <= 0.0 {
+        return None;
+    }
+    Some((v * factor).round() as u32)
+}
+
+/// Translate the OOXML `paperSize` enum into (width_twips, height_twips).
+/// Covers Letter, Tabloid, Legal, Executive, A3, A4, A5, B4 and B5; inch
+/// sizes are exact, metric sizes are `round(mm × 1440 / 25.4)` so they agree
+/// with `dim_to_twips`.  Unknown values fall back to A4 portrait.
+fn paper_size_enum_to_twips(id: u32) -> (u32, u32) {
+    match id {
+        1 => (12240, 15840),  // Letter 8.5 × 11"
+        3 => (15840, 24480),  // Tabloid 11 × 17"
+        5 => (12240, 20160),  // Legal 8.5 × 14"
+        7 => (10440, 15120),  // Executive 7.25 × 10.5"
+        8 => (16838, 23811),  // A3 297 × 420 mm
+        9 => (11906, 16838),  // A4 210 × 297 mm
+        11 => (8391, 11906),  // A5 148 × 210 mm
+        12 => (14173, 20013), // B4 250 × 353 mm
+        13 => (9978, 14173),  // B5 176 × 250 mm
+        _ => (11906, 16838),  // Default A4
+    }
+}
+
+fn parse_page_setup_attrs(
+    e: &quick_xml::events::BytesStart,
+) -> crate::core::Result<Option<PageSetupRaw>> {
+    let pw = xml::optional_attr_str(e, b"paperWidth")?.and_then(|v| dim_to_twips(v.as_ref()));
+    let ph = xml::optional_attr_str(e, b"paperHeight")?.and_then(|v| dim_to_twips(v.as_ref()));
+    let paper_size = xml::optional_attr_str(e, b"paperSize")?
+        .and_then(|v| atoi_simd::parse_pos::<u32, false>(v.as_bytes()).ok());
+    let orientation = xml::optional_attr_str(e, b"orientation")?;
+    let landscape = matches!(orientation.as_deref(), Some("landscape"));
+
+    let (width_twips, height_twips) = match (pw, ph) {
+        (Some(w), Some(h)) => (w, h),
+        _ => match paper_size {
+            Some(id) => paper_size_enum_to_twips(id),
+            None => return Ok(None),
+        },
+    };
+
+    Ok(Some(PageSetupRaw {
+        width_twips,
+        height_twips,
+        landscape,
+    }))
+}
+
+/// Combine parsed `<pageMargins>` / `<pageSetup>` data; `None` if neither was seen.
+fn build_page_setup(
+    margins: Option<PageMarginsIn>,
+    raw: Option<PageSetupRaw>,
+) -> Option<PageSetup> {
+    let (inches, size) = match (margins, raw) {
+        (None, None) => return None,
+        (found_margins, found_size) => (
+            found_margins.unwrap_or(PageMarginsIn {
+                left: 0.7,
+                right: 0.7,
+                top: 0.75,
+                bottom: 0.75,
+                header: 0.3,
+                footer: 0.3,
+            }),
+            found_size.unwrap_or_default(),
+        ),
+    };
+    let twips = |v: f64| (v * 1440.0).round().max(0.0) as u32;
+    // Either side zero means no usable size: report 0×0 so the caller uses its default.
+    let (width_twips, height_twips) = match (size.width_twips, size.height_twips) {
+        (0, _) | (_, 0) => (0, 0),
+        dims => dims,
+    };
+    Some(PageSetup {
+        width_twips,
+        height_twips,
+        margin_top_twips: twips(inches.top),
+        margin_bottom_twips: twips(inches.bottom),
+        margin_left_twips: twips(inches.left),
+        margin_right_twips: twips(inches.right),
+        header_distance_twips: twips(inches.header),
+        footer_distance_twips: twips(inches.footer),
+        landscape: size.landscape,
+    })
+}
+
 fn parse_hyperlink(
     e: &quick_xml::events::BytesStart,
     rels: &crate::core::relationships::Relationships,
@@ -371,6 +642,41 @@ mod tests {
         assert!(matches!(cell.value, CellValue::Number(n) if n == 20.0));
     }
 
+    #[test]
+    fn parse_worksheet_page_setup() {
+        let xml = br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
+  <sheetData/>
+  <pageMargins left="0.5" right="0.5" top="0.5" bottom="0.5" header="0.3" footer="0.3"/>
+  <pageSetup paperWidth="215.90mm" paperHeight="279.40mm" orientation="portrait"/>
+</worksheet>"#;
+        let ws = Worksheet::parse(xml, "S".to_string(), &empty_rels()).unwrap();
+        let ps = ws.page_setup.expect("page_setup parsed");
+        // 215.9mm ≈ 8.5", 279.4mm ≈ 11", both in twips
+        assert!((ps.width_twips as i32 - 12240).abs() <= 1, "width {:?}", ps.width_twips);
+        assert!((ps.height_twips as i32 - 15840).abs() <= 1, "height {:?}", ps.height_twips);
+        // 0.5" margin = 720 twips
+        assert_eq!(ps.margin_top_twips, 720);
+        assert_eq!(ps.margin_left_twips, 720);
+        assert!(!ps.landscape);
+    }
+
+    #[test]
+    fn parse_worksheet_page_setup_paper_enum() {
+        // paperSize=9 = A4 → 11906x16838 twips.
+        let xml = br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
+  <sheetData/>
+  <pageMargins left="0.7" right="0.7" top="0.75" bottom="0.75" header="0.3" footer="0.3"/>
+  <pageSetup paperSize="9" orientation="landscape"/>
+</worksheet>"#;
+        let ws = Worksheet::parse(xml, "S".to_string(), &empty_rels()).unwrap();
+        let ps = ws.page_setup.expect("page_setup parsed");
+        assert_eq!(ps.width_twips, 11906);
+        assert_eq!(ps.height_twips, 16838);
+        assert!(ps.landscape);
+    }
+
     #[test]
     fn parse_worksheet_merged_cells() {
         let xml = br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
@@ -387,4 +693,163 @@ mod tests {
         let ws = Worksheet::parse(xml, "Sheet1".to_string(), &empty_rels()).unwrap();
         assert_eq!(ws.merged_cells, vec!["A1:C1"]);
     }
+
+    // ── dim_to_twips ─────────────────────────────────────────────────────
+
+    #[test]
+    fn dim_to_twips_inches() {
+        // 1 inch = 1440 twips.
+        assert_eq!(dim_to_twips("1in"), Some(1440));
+        assert_eq!(dim_to_twips("8.5in"), Some(12240));
+    }
+
+    #[test]
+    fn dim_to_twips_millimeters() {
+        // 210mm = 11906 twips (A4 width); allow ±1 for rounding.
+        let twips = dim_to_twips("210mm").unwrap();
+        assert!((twips as i32 - 11906).abs() <= 1, "got {twips}");
+    }
+
+    #[test]
+    fn dim_to_twips_centimeters() {
+        // 1cm = 1440/2.54 ≈ 567 twips.
+        let twips = dim_to_twips("1cm").unwrap();
+        assert!((twips as i32 - 567).abs() <= 1, "got {twips}");
+    }
+
+    #[test]
+    fn dim_to_twips_bare_number_assumed_mm() {
+        // Bare numeric defaults to mm.
+        let a = dim_to_twips("210mm").unwrap();
+        let b = dim_to_twips("210").unwrap();
+        assert_eq!(a, b);
+    }
+
+    #[test]
+    fn dim_to_twips_empty_and_zero() {
+        assert_eq!(dim_to_twips(""), None);
+        assert_eq!(dim_to_twips("   "), None);
+        // Zero / negative dimensions are nonsensical: rejected.
+        assert_eq!(dim_to_twips("0mm"), None);
+        assert_eq!(dim_to_twips("-5in"), None);
+    }
+
+    #[test]
+    fn dim_to_twips_invalid_string() {
+        assert_eq!(dim_to_twips("garbage"), None);
+        assert_eq!(dim_to_twips("abcmm"), None);
+    }
+
+    // ── paper_size_enum_to_twips ────────────────────────────────────────
+
+    #[test]
+    fn paper_size_letter() {
+        assert_eq!(paper_size_enum_to_twips(1), (12240, 15840));
+    }
+
+    #[test]
+    fn paper_size_legal() {
+        assert_eq!(paper_size_enum_to_twips(5), (12240, 20160));
+    }
+
+    #[test]
+    fn paper_size_a4() {
+        assert_eq!(paper_size_enum_to_twips(9), (11906, 16838));
+    }
+
+    #[test]
+    fn paper_size_unknown_falls_back_to_a4() {
+        assert_eq!(paper_size_enum_to_twips(9999), (11906, 16838));
+    }
+
+    // ── build_page_setup ────────────────────────────────────────────────
+
+    #[test]
+    fn build_page_setup_returns_none_when_both_missing() {
+        assert!(build_page_setup(None, None).is_none());
+    }
+
+    #[test]
+    fn build_page_setup_margins_only_zeroes_dimensions() {
+        // <pageMargins> without <pageSetup> → dimensions left at 0 so
+        // a downstream consumer falls back to its default page size.
+        let margins = Some(PageMarginsIn {
+            left: 1.0,
+            right: 1.0,
+            top: 1.0,
+            bottom: 1.0,
+            header: 0.5,
+            footer: 0.5,
+        });
+        let ps = build_page_setup(margins, None).unwrap();
+        assert_eq!(ps.width_twips, 0);
+        assert_eq!(ps.height_twips, 0);
+        // 1 inch margins = 1440 twips.
+        assert_eq!(ps.margin_top_twips, 1440);
+        assert_eq!(ps.margin_left_twips, 1440);
+        assert_eq!(ps.header_distance_twips, 720); // 0.5 in
+    }
+
+    #[test]
+    fn build_page_setup_dimensions_only_uses_default_margins() {
+        // <pageSetup> alone uses ECMA-376 default 0.7/0.7/0.75/0.75 inch margins.
+        let raw = Some(PageSetupRaw {
+            width_twips: 12240,
+            height_twips: 15840,
+            landscape: false,
+        });
+        let ps = build_page_setup(None, raw).unwrap();
+        assert_eq!(ps.width_twips, 12240);
+        assert_eq!(ps.height_twips, 15840);
+        // 0.7in = 1008 twips.
+        assert_eq!(ps.margin_left_twips, 1008);
+        // 0.75in = 1080 twips.
+        assert_eq!(ps.margin_top_twips, 1080);
+    }
+
+    #[test]
+    fn build_page_setup_combines_both() {
+        let margins = Some(PageMarginsIn {
+            left: 0.5,
+            right: 0.5,
+            top: 0.5,
+            bottom: 0.5,
+            header: 0.3,
+            footer: 0.3,
+        });
+        let raw = Some(PageSetupRaw {
+            width_twips: 11906,
+            height_twips: 16838,
+            landscape: true,
+        });
+        let ps = build_page_setup(margins, raw).unwrap();
+        assert_eq!(ps.width_twips, 11906);
+        assert!(ps.landscape);
+        assert_eq!(ps.margin_left_twips, 720); // 0.5in
+    }
+
+    #[test]
+    fn parse_worksheet_landscape_with_paper_enum() {
+        // Verifies that landscape attribute survives the parse_page_setup_attrs path.
+        let xml = br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
+  <sheetData/>
+  <pageSetup paperSize="1" orientation="landscape"/>
+</worksheet>"#;
+        let ws = Worksheet::parse(xml, "S".to_string(), &empty_rels()).unwrap();
+        let ps = ws.page_setup.expect("page_setup");
+        assert_eq!(ps.width_twips, 12240); // Letter
+        assert!(ps.landscape);
+    }
+
+    #[test]
+    fn parse_worksheet_default_when_no_setup() {
+        // No <pageMargins> or <pageSetup> → no page_setup at all.
+        let xml = br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
+  <sheetData/>
+</worksheet>"#;
+        let ws = Worksheet::parse(xml, "S".to_string(), &empty_rels()).unwrap();
+        assert!(ws.page_setup.is_none());
+    }
 }
diff --git a/src/xlsx/write.rs b/src/xlsx/write.rs
index bf9da7d..7604cb9 100644
--- a/src/xlsx/write.rs
+++ b/src/xlsx/write.rs
@@ -232,6 +232,98 @@ pub enum CellData {
 /// Builder for creating XLSX files.
 pub struct XlsxWriter {
     sheets: Vec<SheetDataInner>,
+    /// Embedded font programs as `(name, bytes)` pairs, shipped inside the
+    /// package under `xl/fonts/` (same layout as DOCX `word/fonts/` and
+    /// PPTX `ppt/fonts/`). Excel itself doesn't honor these without
+    /// `<workbookView>` / theme plumbing, but the in-process reader scans
+    /// the directory so PDF↔XLSX round-trips can preserve typefaces.
+    embedded_fonts: Vec<(String, Vec<u8>)>,
+    /// Document metadata for `docProps/core.xml`, set via `set_metadata`.
+    /// `None` means no core-properties part is written.
+    metadata: Option<crate::ir::Metadata>,
+}
+
+/// Per-worksheet page geometry.
+///
+/// Maps roughly 1-to-1 onto OOXML's `<pageMargins>` and `<pageSetup>`.
+/// Every field here is in twips, for parity with the rest of the IR
+/// (`width_twips`, `margin_top_twips`, …); the writer converts on output:
+/// margins to inches per ECMA-376 (§18.3.1.62), and the page size to
+/// `paperWidth`/`paperHeight` in millimetres so arbitrary PDF MediaBox
+/// dimensions round-trip without snapping to the nearest `paperSize` enum.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct PageSetup {
+    /// Page width in twips (1/1440 inch).
+    pub width_twips: u32,
+    /// Page height in twips (1/1440 inch).
+    pub height_twips: u32,
+    /// Top margin in twips.
+    pub margin_top_twips: u32,
+    /// Bottom margin in twips.
+    pub margin_bottom_twips: u32,
+    /// Left margin in twips.
+    pub margin_left_twips: u32,
+    /// Right margin in twips.
+    pub margin_right_twips: u32,
+    /// Header distance from top edge in twips.
+    pub header_distance_twips: u32,
+    /// Footer distance from bottom edge in twips.
+    pub footer_distance_twips: u32,
+    /// Landscape flag; written as `orientation="landscape"` or `"portrait"`.
+    pub landscape: bool,
+}
+
+/// A picture anchored on a worksheet via a DrawingML drawing part.
+///
+/// Anchor coordinates are absolute EMU offsets from the sheet origin
+/// (top-left). Each picture is written as one `<xdr:absoluteAnchor>` in
+/// `xl/drawings/drawingN.xml`. The writer emits the bytes verbatim; the
+/// reader resolves them back through the worksheet → drawing → image
+/// relationship chain.
+#[derive(Debug, Clone)]
+pub struct SheetImage {
+    /// Raw image bytes (PNG / JPEG / etc., as produced by the source).
+    pub data: Vec<u8>,
+    /// Lowercase file extension (`"png"`, `"jpeg"`, ...); empty means `"png"`.
+    pub format: String,
+    /// X anchor in EMU, from sheet origin.
+    pub x_emu: i64,
+    /// Y anchor in EMU, from sheet origin.
+    pub y_emu: i64,
+    /// Rendered width in EMU (written as at least 1).
+    pub cx_emu: i64,
+    /// Rendered height in EMU (written as at least 1).
+    pub cy_emu: i64,
+}
+
+/// A text shape anchored on a worksheet via a DrawingML drawing part.
+///
+/// Used by the layout-preserving PDF→XLSX path to emit each PDF text
+/// span at its exact source EMU coordinates as an `<xdr:sp>` shape
+/// inside `xl/drawings/drawingN.xml`. The shape carries a single run
+/// with the span's text, font, size, weight, italic, and colour.
+#[derive(Debug, Clone)]
+pub struct SheetTextShape {
+    /// Text content of the shape (single run); empty text writes no shape.
+    pub text: String,
+    /// Font face name (e.g. `"Times New Roman"`); empty omits `<a:latin>`.
+    pub font_name: String,
+    /// Font size in points (full-pt scale, not half-pt).
+    pub font_size_pt: f32,
+    /// Bold weight.
+    pub bold: bool,
+    /// Italic style.
+    pub italic: bool,
+    /// Optional 6-char hex colour like `"FF0000"`. `None` ⇒ no `<a:solidFill>`.
+    pub color_hex: Option<String>,
+    /// X anchor in EMU, from sheet origin.
+    pub x_emu: i64,
+    /// Y anchor in EMU, from sheet origin.
+    pub y_emu: i64,
+    /// Width in EMU (written as at least 1).
+    pub cx_emu: i64,
+    /// Height in EMU (written as at least 1).
+    pub cy_emu: i64,
 }
 
 /// Full internal sheet representation.
@@ -242,6 +334,13 @@ struct SheetDataInner {
     pub cell_styles: HashMap<(usize, usize), CellStyle>,
     /// Merged cell regions: (row, col, row_span, col_span).
     pub merge_regions: Vec<(usize, usize, usize, usize)>,
+    /// Per-sheet page geometry (`<pageMargins>` + `<pageSetup>`).
+    pub page_setup: Option<PageSetup>,
+    /// Pictures anchored on this sheet via a DrawingML drawing part.
+    pub images: Vec<SheetImage>,
+    /// Text shapes anchored on this sheet via a DrawingML drawing part.
+    /// Used by the layout-preserving PDF→XLSX path.
+    pub text_shapes: Vec<SheetTextShape>,
 }
 
 impl SheetDataInner {
@@ -252,6 +351,9 @@ impl SheetDataInner {
             col_widths: HashMap::new(),
             cell_styles: HashMap::new(),
             merge_regions: Vec::new(),
+            page_setup: None,
+            images: Vec::new(),
+            text_shapes: Vec::new(),
         }
     }
 
@@ -383,6 +485,73 @@ impl<'a> SheetData<'a> {
         self.0.merge_cells(row, col, row_span, col_span);
         self
     }
+
+    /// Set per-sheet page geometry. Emits `<pageMargins>` and `<pageSetup>`
+    /// inside the worksheet XML so PDF→XLSX→PDF round-trips preserve the
+    /// source MediaBox and margins instead of snapping back to default
+    /// Letter-portrait. Sheets that never call this omit both elements.
+    pub fn set_page_setup(&mut self, ps: PageSetup) -> &mut Self {
+        self.0.page_setup = Some(ps);
+        self
+    }
+
+    /// Place a styled, single-run text shape on this worksheet at absolute
+    /// EMU coordinates (one `<xdr:sp>` per PDF span in the layout path).
+    #[allow(clippy::too_many_arguments)]
+    pub fn add_text_shape(
+        &mut self,
+        text: impl Into<String>,
+        font_name: impl Into<String>,
+        font_size_pt: f32,
+        bold: bool,
+        italic: bool,
+        color_hex: Option<String>,
+        x_emu: i64,
+        y_emu: i64,
+        cx_emu: i64,
+        cy_emu: i64,
+    ) -> &mut Self {
+        let shape = SheetTextShape {
+            text: text.into(),
+            font_name: font_name.into(),
+            font_size_pt,
+            bold,
+            italic,
+            color_hex,
+            x_emu,
+            y_emu,
+            cx_emu,
+            cy_emu,
+        };
+        self.0.text_shapes.push(shape);
+        self
+    }
+
+    /// Anchor a picture on this worksheet at absolute EMU coordinates.
+    ///
+    /// At write time each sheet with pictures gets a `xl/drawings/drawingN.xml`
+    /// part, one IMAGE relationship per picture, and the bytes stored under
+    /// `xl/media/image_<sheet>_<n>.<ext>`; `format` is that lowercase extension.
+    pub fn add_image(
+        &mut self,
+        data: Vec<u8>,
+        format: impl Into<String>,
+        x_emu: i64,
+        y_emu: i64,
+        cx_emu: i64,
+        cy_emu: i64,
+    ) -> &mut Self {
+        let picture = SheetImage {
+            data,
+            format: format.into(),
+            x_emu,
+            y_emu,
+            cx_emu,
+            cy_emu,
+        };
+        self.0.images.push(picture);
+        self
+    }
 }
 
 // ---------------------------------------------------------------------------
@@ -398,7 +567,28 @@ impl Default for XlsxWriter {
 impl XlsxWriter {
     /// Create a new, empty XLSX writer.
     pub fn new() -> Self {
-        Self { sheets: Vec::new() }
+        XlsxWriter {
+            metadata: None,
+            embedded_fonts: Vec::new(),
+            sheets: Vec::new(),
+        }
+    }
+
+    /// Attach document metadata; a copy is kept and later written to `docProps/core.xml`.
+    pub fn set_metadata(&mut self, meta: &crate::ir::Metadata) -> &mut Self {
+        self.metadata = Some(meta).cloned();
+        self
+    }
+
+    /// Embed a font program (TrueType / OpenType bytes) under `xl/fonts/`.
+    /// `name` feeds the file name and serves as the human-readable font name.
+    /// A name that was already embedded is ignored — the first copy wins.
+    pub fn embed_font(&mut self, name: impl Into<String>, data: Vec<u8>) -> &mut Self {
+        let name = name.into();
+        if self.embedded_fonts.iter().all(|(known, _)| *known != name) {
+            self.embedded_fonts.push((name, data));
+        }
+        self
     }
 
     /// Add a worksheet and return a mutable handle to it.
@@ -455,6 +645,13 @@ impl XlsxWriter {
         }
     }
 
+    /// Set page geometry by sheet index (no-op if out of range). See `SheetData::set_page_setup`.
+    pub fn sheet_set_page_setup(&mut self, sheet: usize, ps: PageSetup) {
+        if sheet < self.sheets.len() {
+            self.sheets[sheet].page_setup = Some(ps);
+        }
+    }
+
     /// Save the workbook to a file.
     pub fn save(&self, path: impl AsRef<Path>) -> Result<()> {
         let mut opc = OpcWriter::create(path)?;
@@ -480,6 +677,17 @@ impl XlsxWriter {
 
         opc.add_package_rel(rel_types::OFFICE_DOCUMENT, "xl/workbook.xml");
 
+        // Core properties (docProps/core.xml). Optional; written only
+        // when caller supplied metadata via `set_metadata`. Surfaces
+        // PDF /Title /Author etc. in Excel's "Properties" dialog after
+        // a PDF→XLSX→Excel round trip.
+        if let Some(ref meta) = self.metadata {
+            let core_part = PartName::new("/docProps/core.xml")?;
+            opc.add_package_rel(rel_types::CORE_PROPERTIES, "docProps/core.xml");
+            let core_xml = crate::core::core_properties::generate_xml(meta);
+            opc.add_part(&core_part, crate::core::core_properties::CONTENT_TYPE, &core_xml)?;
+        }
+
         let mut sheet_rids = Vec::with_capacity(self.sheets.len());
         for (i, _) in self.sheets.iter().enumerate() {
             let target = format!("worksheets/sheet{}.xml", i + 1);
@@ -497,7 +705,24 @@ impl XlsxWriter {
         for (i, sheet) in self.sheets.iter().enumerate() {
             let part_name_str = format!("/xl/worksheets/sheet{}.xml", i + 1);
             let part_name = PartName::new(&part_name_str)?;
-            let ws_xml = Self::build_worksheet_xml(sheet, &style_table)?;
+
+            // Emit drawing + media parts up-front so we have the rId
+            // for the `<drawing r:id="…"/>` element inside the
+            // worksheet XML below. Sheets without pictures or text
+            // shapes get no drawing part at all.
+            let drawing_rid = if !sheet.images.is_empty() || !sheet.text_shapes.is_empty() {
+                Some(Self::write_drawing_for_sheet(
+                    opc,
+                    &part_name,
+                    i + 1,
+                    &sheet.images,
+                    &sheet.text_shapes,
+                )?)
+            } else {
+                None
+            };
+
+            let ws_xml = Self::build_worksheet_xml(sheet, &style_table, drawing_rid.as_deref())?;
             opc.add_part(&part_name, CT_WORKSHEET, &ws_xml)?;
         }
 
@@ -505,6 +730,13 @@ impl XlsxWriter {
         let styles_xml = style_table.build_styles_xml()?;
         opc.add_part(&styles_part, CT_STYLES, &styles_xml)?;
 
+        // Embed fonts under `xl/fonts/font_<n>_<safe_name>.ttf`. Same
+        // layout as DOCX/PPTX. Excel itself doesn't auto-discover the
+        // fonts without `<workbookView>` plumbing, but the in-process
+        // reader scans the directory so PDF↔XLSX round-trips can reuse
+        // the source typeface.
+        crate::core::embedded_fonts::write_embedded_fonts(opc, "/xl/fonts/", &self.embedded_fonts)?;
+
         Ok(())
     }
 
@@ -538,6 +770,7 @@ impl XlsxWriter {
     fn build_worksheet_xml(
         sheet: &SheetDataInner,
         style_table: &StyleTable,
+        drawing_rid: Option<&str>,
     ) -> crate::core::Result<Vec<u8>> {
         let mut w = Writer::new_with_indent(Vec::new(), b' ', 2);
 
@@ -545,6 +778,11 @@ impl XlsxWriter {
 
         let mut root = BytesStart::new("worksheet");
         root.push_attribute(("xmlns", NS_SML));
+        // Worksheets that anchor drawings need the relationship
+        // namespace so the `<drawing r:id="…"/>` element below
+        // resolves. Declaring it unconditionally is harmless for
+        // plain-data sheets and keeps the writer code simple.
+        root.push_attribute(("xmlns:r", NS_REL));
         w.write_event(Event::Start(root))?;
 
         // Column widths
@@ -608,11 +846,135 @@ impl XlsxWriter {
             w.write_event(Event::End(BytesEnd::new("mergeCells")))?;
         }
 
+        // <pageMargins> + <pageSetup>. ECMA-376 §18.3.1.62 / §18.3.1.63 —
+        // pageMargins values are in inches (f64), pageSetup carries the
+        // physical paper dimensions and orientation. We emit `paperWidth`
+        // and `paperHeight` in mm so arbitrary PDF MediaBoxes round-trip
+        // verbatim instead of snapping to the closest `paperSize` enum
+        // (which only covers a fixed set of standard sizes — Letter,
+        // Legal, A4, A3, …).
+        if let Some(ps) = sheet.page_setup {
+            // twips → inches, twips → mm (1 inch = 1440 twips = 25.4 mm).
+            let to_in = |t: u32| t as f64 / 1440.0;
+            let to_mm = |t: u32| t as f64 / 1440.0 * 25.4;
+
+            let left = format!("{:.4}", to_in(ps.margin_left_twips));
+            let right = format!("{:.4}", to_in(ps.margin_right_twips));
+            let top = format!("{:.4}", to_in(ps.margin_top_twips));
+            let bottom = format!("{:.4}", to_in(ps.margin_bottom_twips));
+            let header = format!("{:.4}", to_in(ps.header_distance_twips));
+            let footer = format!("{:.4}", to_in(ps.footer_distance_twips));
+
+            let mut pm = BytesStart::new("pageMargins");
+            pm.push_attribute(("left", left.as_str()));
+            pm.push_attribute(("right", right.as_str()));
+            pm.push_attribute(("top", top.as_str()));
+            pm.push_attribute(("bottom", bottom.as_str()));
+            pm.push_attribute(("header", header.as_str()));
+            pm.push_attribute(("footer", footer.as_str()));
+            w.write_event(Event::Empty(pm))?;
+
+            let pw_mm = format!("{:.2}mm", to_mm(ps.width_twips));
+            let ph_mm = format!("{:.2}mm", to_mm(ps.height_twips));
+            let orientation = if ps.landscape {
+                "landscape"
+            } else {
+                "portrait"
+            };
+            let mut psu = BytesStart::new("pageSetup");
+            psu.push_attribute(("paperWidth", pw_mm.as_str()));
+            psu.push_attribute(("paperHeight", ph_mm.as_str()));
+            psu.push_attribute(("orientation", orientation));
+            w.write_event(Event::Empty(psu))?;
+        }
+
+        // `<drawing>` MUST appear after `<pageSetup>` per the
+        // worksheet child-order schema (CT_Worksheet, ECMA-376
+        // §18.3.1.99). Excel rejects the file with "We found a problem
+        // with some content" otherwise.
+        if let Some(rid) = drawing_rid {
+            let mut d = BytesStart::new("drawing");
+            d.push_attribute(("r:id", rid));
+            w.write_event(Event::Empty(d))?;
+        }
+
         w.write_event(Event::End(BytesEnd::new("worksheet")))?;
 
         Ok(w.into_inner())
     }
 
+    /// Materialise `xl/drawings/drawing<sheet_n>.xml`, write each
+    /// picture's bytes under `xl/media/image_<sheet_n>_<pic_n>.<ext>`,
+    /// wire the worksheet→drawing and drawing→image relationships, and
+    /// register a default content type for every image extension used.
+    ///
+    /// Returns the relationship ID added to the worksheet — the caller
+    /// places it on the `<drawing r:id="…"/>` element inside the
+    /// worksheet XML.
+    fn write_drawing_for_sheet<W: Write + Seek>(
+        opc: &mut OpcWriter<W>,
+        worksheet_part: &PartName,
+        sheet_n: usize,
+        images: &[SheetImage],
+        text_shapes: &[SheetTextShape],
+    ) -> Result<String> {
+        let drawing_target = format!("../drawings/drawing{}.xml", sheet_n);
+        let drawing_rid = opc.add_part_rel(worksheet_part, rel_types::DRAWING, &drawing_target);
+
+        let drawing_part_str = format!("/xl/drawings/drawing{}.xml", sheet_n);
+        let drawing_part = PartName::new(&drawing_part_str)?;
+
+        // Add IMAGE rels off the drawing part. Targets are relative to
+        // the drawing part itself (`../media/imageX.ext`). Track the
+        // rIds so each `<xdr:pic>` in the drawing XML can reference
+        // them via `<a:blip r:embed="rIdN"/>`.
+        let mut blip_rids: Vec<String> = Vec::with_capacity(images.len());
+        for (i, img) in images.iter().enumerate() {
+            // Normalise the caller's extension: `"PNG"` and `".png"` both
+            // mean `"png"`. Left as-is they would miss the MIME table below
+            // and produce a mislabelled or malformed part name.
+            let normalised = img.format.trim_start_matches('.').to_ascii_lowercase();
+            let ext = if normalised.is_empty() { "png" } else { normalised.as_str() };
+            let file_name = format!("image_{}_{}.{}", sheet_n, i + 1, ext);
+            let media_part = PartName::new(&format!("/xl/media/{}", file_name))?;
+
+            // Default Content-Type by extension (Default Extension="png")
+            // satisfies SDK validators that flag overrides without a
+            // matching Default. Re-registering the same default is a
+            // no-op inside ContentTypesBuilder.
+            let mime = match ext {
+                "jpg" | "jpeg" => "image/jpeg",
+                "gif" => "image/gif",
+                "tiff" | "tif" => "image/tiff",
+                "bmp" => "image/bmp",
+                "emf" => "image/x-emf",
+                "wmf" => "image/x-wmf",
+                _ => "image/png",
+            };
+            opc.register_default_content_type(ext, mime);
+
+            // Write image bytes raw (no Content-Type override needed
+            // since we registered the Default above; passing the same
+            // mime to add_part is harmless).
+            opc.add_part(&media_part, mime, &img.data)?;
+
+            // Drawing-relative target: same file name under `../media/`.
+            let rel_target = format!("../media/{}", file_name);
+            let rid = opc.add_part_rel(&drawing_part, rel_types::IMAGE, &rel_target);
+            blip_rids.push(rid);
+        }
+
+        // Now the drawing XML itself. One `<xdr:absoluteAnchor>` per
+        // picture and per text shape; anchor in EMU from the sheet
+        // origin, with the picture's `<a:blip r:embed="rIdN"/>`
+        // referring back to the image rels we just added.
+        let drawing_xml = build_drawing_xml(images, &blip_rids, text_shapes)?;
+        const CT_DRAWING: &str = "application/vnd.openxmlformats-officedocument.drawing+xml";
+        opc.add_part(&drawing_part, CT_DRAWING, &drawing_xml)?;
+
+        Ok(drawing_rid)
+    }
+
     fn write_cell(
         w: &mut Writer<Vec<u8>>,
         row: usize,
@@ -683,6 +1045,238 @@ impl XlsxWriter {
     }
 }
 
+/// Generate `xl/drawings/drawing<n>.xml` for a sheet's pictures and text shapes.
+///
+/// Each picture becomes one `<xdr:absoluteAnchor>` containing an
+/// `<xdr:pic>` whose `<a:blip r:embed="…"/>` points at the matching entry
+/// of `blip_rids` (the IMAGE rels registered on the drawing part). Each
+/// non-empty text shape becomes an `<xdr:absoluteAnchor>` with an
+/// `<xdr:sp>` text box holding a single run. EMU coordinates flow through
+/// verbatim, except that extents are clamped to at least 1.
+fn build_drawing_xml(
+    images: &[SheetImage],
+    blip_rids: &[String],
+    text_shapes: &[SheetTextShape],
+) -> crate::core::Result<Vec<u8>> {
+    const NS_XDR: &str = "http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing";
+    const NS_A: &str = "http://schemas.openxmlformats.org/drawingml/2006/main";
+
+    let mut w = Writer::new_with_indent(Vec::new(), b' ', 2);
+    w.write_event(Event::Decl(BytesDecl::new("1.0", Some("UTF-8"), Some("yes"))))?;
+
+    let mut root = BytesStart::new("xdr:wsDr");
+    root.push_attribute(("xmlns:xdr", NS_XDR));
+    root.push_attribute(("xmlns:a", NS_A));
+    root.push_attribute(("xmlns:r", NS_REL));
+    w.write_event(Event::Start(root))?;
+
+    for (i, img) in images.iter().enumerate() {
+        let rid = blip_rids.get(i).map(String::as_str).unwrap_or("rId1");
+
+        // <xdr:absoluteAnchor>
+        w.write_event(Event::Start(BytesStart::new("xdr:absoluteAnchor")))?;
+
+        // <xdr:pos x=".." y=".."/>
+        let pos_x = img.x_emu.to_string();
+        let pos_y = img.y_emu.to_string();
+        let mut pos = BytesStart::new("xdr:pos");
+        pos.push_attribute(("x", pos_x.as_str()));
+        pos.push_attribute(("y", pos_y.as_str()));
+        w.write_event(Event::Empty(pos))?;
+
+        // <xdr:ext cx=".." cy=".."/>
+        let ext_cx = img.cx_emu.max(1).to_string();
+        let ext_cy = img.cy_emu.max(1).to_string();
+        let mut ext = BytesStart::new("xdr:ext");
+        ext.push_attribute(("cx", ext_cx.as_str()));
+        ext.push_attribute(("cy", ext_cy.as_str()));
+        w.write_event(Event::Empty(ext))?;
+
+        // <xdr:pic>
+        w.write_event(Event::Start(BytesStart::new("xdr:pic")))?;
+
+        // <xdr:nvPicPr>
+        w.write_event(Event::Start(BytesStart::new("xdr:nvPicPr")))?;
+        let pic_id = (i + 1).to_string();
+        let pic_name = format!("Picture {}", i + 1);
+        let mut cnv_pr = BytesStart::new("xdr:cNvPr");
+        cnv_pr.push_attribute(("id", pic_id.as_str()));
+        cnv_pr.push_attribute(("name", pic_name.as_str()));
+        w.write_event(Event::Empty(cnv_pr))?;
+        w.write_event(Event::Empty(BytesStart::new("xdr:cNvPicPr")))?;
+        w.write_event(Event::End(BytesEnd::new("xdr:nvPicPr")))?;
+
+        // <xdr:blipFill>
+        w.write_event(Event::Start(BytesStart::new("xdr:blipFill")))?;
+        let mut blip = BytesStart::new("a:blip");
+        blip.push_attribute(("r:embed", rid));
+        w.write_event(Event::Empty(blip))?;
+        w.write_event(Event::Start(BytesStart::new("a:stretch")))?;
+        w.write_event(Event::Empty(BytesStart::new("a:fillRect")))?;
+        w.write_event(Event::End(BytesEnd::new("a:stretch")))?;
+        w.write_event(Event::End(BytesEnd::new("xdr:blipFill")))?;
+
+        // <xdr:spPr>
+        w.write_event(Event::Start(BytesStart::new("xdr:spPr")))?;
+        w.write_event(Event::Start(BytesStart::new("a:xfrm")))?;
+        let mut off = BytesStart::new("a:off");
+        off.push_attribute(("x", pos_x.as_str()));
+        off.push_attribute(("y", pos_y.as_str()));
+        w.write_event(Event::Empty(off))?;
+        let mut ext2 = BytesStart::new("a:ext");
+        ext2.push_attribute(("cx", ext_cx.as_str()));
+        ext2.push_attribute(("cy", ext_cy.as_str()));
+        w.write_event(Event::Empty(ext2))?;
+        w.write_event(Event::End(BytesEnd::new("a:xfrm")))?;
+        let mut prst = BytesStart::new("a:prstGeom");
+        prst.push_attribute(("prst", "rect"));
+        w.write_event(Event::Start(prst))?;
+        w.write_event(Event::Empty(BytesStart::new("a:avLst")))?;
+        w.write_event(Event::End(BytesEnd::new("a:prstGeom")))?;
+        w.write_event(Event::End(BytesEnd::new("xdr:spPr")))?;
+
+        w.write_event(Event::End(BytesEnd::new("xdr:pic")))?;
+
+        // <xdr:clientData/>
+        w.write_event(Event::Empty(BytesStart::new("xdr:clientData")))?;
+
+        w.write_event(Event::End(BytesEnd::new("xdr:absoluteAnchor")))?;
+    }
+
+    // ── Text shapes (one `<xdr:sp>` per layout-mode PDF span) ───────────
+    let pic_count = images.len();
+    for (j, ts) in text_shapes.iter().enumerate() {
+        // Drop C0 controls other than tab/LF/CR (XML 1.0 cannot carry them),
+        // then skip shapes left empty — Excel rejects an empty `<a:t/>`.
+        let is_xml_safe = |c: &char| *c >= ' ' || matches!(*c, '\t' | '\n' | '\r');
+        let text: String = ts.text.chars().filter(is_xml_safe).collect();
+        if text.is_empty() {
+            continue;
+        }
+
+        w.write_event(Event::Start(BytesStart::new("xdr:absoluteAnchor")))?;
+
+        let pos_x = ts.x_emu.to_string();
+        let pos_y = ts.y_emu.to_string();
+        let mut pos = BytesStart::new("xdr:pos");
+        pos.push_attribute(("x", pos_x.as_str()));
+        pos.push_attribute(("y", pos_y.as_str()));
+        w.write_event(Event::Empty(pos))?;
+
+        let ext_cx = ts.cx_emu.max(1).to_string();
+        let ext_cy = ts.cy_emu.max(1).to_string();
+        let mut ext = BytesStart::new("xdr:ext");
+        ext.push_attribute(("cx", ext_cx.as_str()));
+        ext.push_attribute(("cy", ext_cy.as_str()));
+        w.write_event(Event::Empty(ext))?;
+
+        w.write_event(Event::Start(BytesStart::new("xdr:sp")))?;
+
+        // <xdr:nvSpPr>
+        w.write_event(Event::Start(BytesStart::new("xdr:nvSpPr")))?;
+        let sp_id = (pic_count + j + 1).to_string();
+        let sp_name = format!("TextShape {}", pic_count + j + 1);
+        let mut cnv_pr = BytesStart::new("xdr:cNvPr");
+        cnv_pr.push_attribute(("id", sp_id.as_str()));
+        cnv_pr.push_attribute(("name", sp_name.as_str()));
+        w.write_event(Event::Empty(cnv_pr))?;
+        let mut cnv_sp_pr = BytesStart::new("xdr:cNvSpPr");
+        cnv_sp_pr.push_attribute(("txBox", "1"));
+        w.write_event(Event::Empty(cnv_sp_pr))?;
+        w.write_event(Event::End(BytesEnd::new("xdr:nvSpPr")))?;
+
+        // <xdr:spPr>
+        w.write_event(Event::Start(BytesStart::new("xdr:spPr")))?;
+        w.write_event(Event::Start(BytesStart::new("a:xfrm")))?;
+        let mut off = BytesStart::new("a:off");
+        off.push_attribute(("x", pos_x.as_str()));
+        off.push_attribute(("y", pos_y.as_str()));
+        w.write_event(Event::Empty(off))?;
+        let mut ext2 = BytesStart::new("a:ext");
+        ext2.push_attribute(("cx", ext_cx.as_str()));
+        ext2.push_attribute(("cy", ext_cy.as_str()));
+        w.write_event(Event::Empty(ext2))?;
+        w.write_event(Event::End(BytesEnd::new("a:xfrm")))?;
+        let mut prst = BytesStart::new("a:prstGeom");
+        prst.push_attribute(("prst", "rect"));
+        w.write_event(Event::Start(prst))?;
+        w.write_event(Event::Empty(BytesStart::new("a:avLst")))?;
+        w.write_event(Event::End(BytesEnd::new("a:prstGeom")))?;
+        // Transparent fill so the text shape doesn't paint a
+        // white rectangle over neighbouring content.
+        w.write_event(Event::Empty(BytesStart::new("a:noFill")))?;
+        w.write_event(Event::End(BytesEnd::new("xdr:spPr")))?;
+
+        // <xdr:txBody> — a single run carrying the span's run properties.
+        // PresentationML and SpreadsheetML drawings share the DrawingML run
+        // model, so this mirrors the PPTX text bodies elsewhere in the crate.
+        w.write_event(Event::Start(BytesStart::new("xdr:txBody")))?;
+        // <a:bodyPr wrap="none"> so a single span doesn't wrap mid-line.
+        let mut body_pr = BytesStart::new("a:bodyPr");
+        body_pr.push_attribute(("wrap", "none"));
+        body_pr.push_attribute(("rtlCol", "0"));
+        body_pr.push_attribute(("lIns", "0"));
+        body_pr.push_attribute(("tIns", "0"));
+        body_pr.push_attribute(("rIns", "0"));
+        body_pr.push_attribute(("bIns", "0"));
+        w.write_event(Event::Empty(body_pr))?;
+        w.write_event(Event::Empty(BytesStart::new("a:lstStyle")))?;
+        w.write_event(Event::Start(BytesStart::new("a:p")))?;
+        // <a:pPr marL="0" indent="0"/>
+        let mut p_pr = BytesStart::new("a:pPr");
+        p_pr.push_attribute(("marL", "0"));
+        p_pr.push_attribute(("indent", "0"));
+        w.write_event(Event::Empty(p_pr))?;
+        // <a:r>
+        w.write_event(Event::Start(BytesStart::new("a:r")))?;
+        // <a:rPr>: `sz` is hundredths of a point; optional <a:solidFill> / <a:latin> children.
+        let sz_centi_pt = (ts.font_size_pt * 100.0).round() as i32;
+        let sz_str = sz_centi_pt.to_string();
+        let mut r_pr = BytesStart::new("a:rPr");
+        r_pr.push_attribute(("lang", "en-US"));
+        r_pr.push_attribute(("sz", sz_str.as_str()));
+        if ts.bold {
+            r_pr.push_attribute(("b", "1"));
+        }
+        if ts.italic {
+            r_pr.push_attribute(("i", "1"));
+        }
+        let want_color_or_font = ts.color_hex.is_some() || !ts.font_name.is_empty();
+        if want_color_or_font {
+            w.write_event(Event::Start(r_pr))?;
+            if let Some(ref hex) = ts.color_hex {
+                w.write_event(Event::Start(BytesStart::new("a:solidFill")))?;
+                let mut srgb = BytesStart::new("a:srgbClr");
+                srgb.push_attribute(("val", hex.as_str()));
+                w.write_event(Event::Empty(srgb))?;
+                w.write_event(Event::End(BytesEnd::new("a:solidFill")))?;
+            }
+            if !ts.font_name.is_empty() {
+                let mut latin = BytesStart::new("a:latin");
+                latin.push_attribute(("typeface", ts.font_name.as_str()));
+                w.write_event(Event::Empty(latin))?;
+            }
+            w.write_event(Event::End(BytesEnd::new("a:rPr")))?;
+        } else {
+            w.write_event(Event::Empty(r_pr))?;
+        }
+        // <a:t>text</a:t>
+        w.write_event(Event::Start(BytesStart::new("a:t")))?;
+        w.write_event(Event::Text(quick_xml::events::BytesText::new(&text)))?;
+        w.write_event(Event::End(BytesEnd::new("a:t")))?;
+        w.write_event(Event::End(BytesEnd::new("a:r")))?;
+        w.write_event(Event::End(BytesEnd::new("a:p")))?;
+        w.write_event(Event::End(BytesEnd::new("xdr:txBody")))?;
+
+        w.write_event(Event::End(BytesEnd::new("xdr:sp")))?;
+        w.write_event(Event::Empty(BytesStart::new("xdr:clientData")))?;
+        w.write_event(Event::End(BytesEnd::new("xdr:absoluteAnchor")))?;
+    }
+
+    w.write_event(Event::End(BytesEnd::new("xdr:wsDr")))?;
+    Ok(w.into_inner())
+}
+
 // ---------------------------------------------------------------------------
 // StyleTable — collects unique CellStyle objects, assigns xfIds, builds
 // styles.xml dynamically.
@@ -716,52 +1310,61 @@ struct XfKey {
 struct StyleTable {
     /// Map from (sheet_ptr, row, col) to xf index.
     cell_xf: HashMap<(*const SheetDataInner, usize, usize), u32>,
+    /// Fonts in index order, as serialized to XML; index 0 is the default font.
     fonts: Vec<FontKey>,
+    /// Fills in index order, as serialized to XML; indices 0 and 1 are reserved.
     fills: Vec<FillKey>,
     num_fmts: Vec<(u32, String)>, // (numFmtId, formatCode) for custom formats
+    /// xf records in index order, as serialized to XML; index 0 is the default.
     xfs: Vec<XfKey>,
+    // Key → index lookup maps for O(1) deduplication during build.
     font_map: HashMap<FontKey, u32>,
     fill_map: HashMap<FillKey, u32>,
     xf_map: HashMap<XfKey, u32>,
 }
 
 impl StyleTable {
     fn build(sheets: &[SheetDataInner]) -> Self {
-        let mut table = StyleTable {
-            cell_xf: HashMap::new(),
-            fonts: Vec::new(),
-            fills: Vec::new(),
-            num_fmts: Vec::new(),
-            xfs: Vec::new(),
-        };
-
-        // Built-in fill indices: 0=none, 1=gray125 (required by Excel)
-        // We pre-populate to match the required structure.
-        table.fills.push(FillKey(None)); // idx 0: none
-        table.fills.push(FillKey(None)); // idx 1: gray125
-
-        // Default font (idx 0)
-        table.fonts.push(FontKey {
+        let default_font = FontKey {
             bold: false,
             italic: false,
             underline: false,
             color: None,
             size_half_pt: None,
             name: None,
-        });
-
-        // Default xf (idx 0) — no style
-        table.xfs.push(XfKey {
+        };
+        let default_xf = XfKey {
             font_idx: 0,
             fill_idx: 0,
             num_fmt_id: 0,
             h_align: None,
             wrap_text: false,
-        });
+        };
+
+        let mut font_map = HashMap::new();
+        font_map.insert(default_font.clone(), 0u32);
+        let mut fill_map: HashMap<FillKey, u32> = HashMap::new();
+        fill_map.insert(FillKey(None), 0u32); // idx 0 = none; idx 1 = gray125 (pre-populated below)
+        let mut xf_map = HashMap::new();
+        xf_map.insert(default_xf.clone(), 0u32);
+
+        let mut table = StyleTable {
+            cell_xf: HashMap::new(),
+            fonts: vec![default_font],
+            fills: vec![FillKey(None), FillKey(None)], // idx 0: none, idx 1: gray125
+            num_fmts: Vec::new(),
+            xfs: vec![default_xf],
+            font_map,
+            fill_map,
+            xf_map,
+        };
 
         let mut next_custom_fmt_id: u32 = 164; // custom numFmtIds start at 164
 
         for sheet in sheets {
             let sheet_ptr = sheet as *const SheetDataInner;
             for ((row, col), style) in &sheet.cell_styles {
-                // Resolve font index
+                // Resolve font index — O(1) via HashMap.
                 let font_key = FontKey {
                     bold: style.bold,
                     italic: style.italic,
@@ -770,45 +1373,32 @@ impl StyleTable {
                     size_half_pt: style.font_size_pt.map(|s| (s * 2.0).round() as u32),
                     name: style.font_name.clone(),
                 };
-                let font_idx = if font_key
-                    == (FontKey {
-                        bold: false,
-                        italic: false,
-                        underline: false,
-                        color: None,
-                        size_half_pt: None,
-                        name: None,
-                    }) {
-                    0
+                let font_idx = if let Some(&i) = table.font_map.get(&font_key) {
+                    i
                 } else {
-                    match table.fonts.iter().position(|f| f == &font_key) {
-                        Some(i) => i as u32,
-                        None => {
-                            table.fonts.push(font_key);
-                            (table.fonts.len() - 1) as u32
-                        },
-                    }
+                    let idx = table.fonts.len() as u32;
+                    table.fonts.push(font_key.clone());
+                    table.font_map.insert(font_key, idx);
+                    idx
                 };
 
-                // Resolve fill index
+                // Resolve fill index — O(1) via HashMap.
                 let fill_key = FillKey(style.background_color.clone());
                 let fill_idx = if fill_key.0.is_none() {
                     0
+                } else if let Some(&i) = table.fill_map.get(&fill_key) {
+                    i
                 } else {
-                    match table.fills.iter().position(|f| f == &fill_key) {
-                        Some(i) => i as u32,
-                        None => {
-                            table.fills.push(fill_key);
-                            (table.fills.len() - 1) as u32
-                        },
-                    }
+                    let idx = table.fills.len() as u32;
+                    table.fills.push(fill_key.clone());
+                    table.fill_map.insert(fill_key, idx);
+                    idx
                 };
 
-                // Resolve number format id
+                // Resolve number format id.
                 let num_fmt_id = match style.number_format.builtin_id() {
                     Some(id) => id,
                     None => {
-                        // Custom format — shouldn't happen with current enum
                         let id = next_custom_fmt_id;
                         next_custom_fmt_id += 1;
                         table.num_fmts.push((id, "General".to_string()));
@@ -833,12 +1423,14 @@ impl StyleTable {
                     wrap_text: style.wrap_text,
                 };
 
-                let xf_idx = match table.xfs.iter().position(|x| x == &xf_key) {
-                    Some(i) => i as u32,
-                    None => {
-                        table.xfs.push(xf_key);
-                        (table.xfs.len() - 1) as u32
-                    },
+                // Resolve xf index — O(1) via HashMap.
+                let xf_idx = if let Some(&i) = table.xf_map.get(&xf_key) {
+                    i
+                } else {
+                    let idx = table.xfs.len() as u32;
+                    table.xfs.push(xf_key.clone());
+                    table.xf_map.insert(xf_key, idx);
+                    idx
                 };
 
                 table.cell_xf.insert((sheet_ptr, *row, *col), xf_idx);
@@ -1130,6 +1722,46 @@ mod tests {
         assert!(!buf.get_ref().is_empty());
     }
 
+    #[test]
+    fn page_setup_round_trip() {
+        // Letter portrait, 0.5" margins. The on-wire format is mm in
+        // <pageSetup paperWidth/paperHeight> + inches in <pageMargins>;
+        // verify both elements are emitted with the expected attribute
+        // values (this test checks the written XML; it does not re-parse).
+        let mut wb = XlsxWriter::new();
+        let mut sheet = wb.add_sheet("Geom");
+        sheet.set_cell(0, 0, CellData::String("hi".into()));
+        sheet.set_page_setup(PageSetup {
+            width_twips: 12240,    // 8.5"
+            height_twips: 15840,   // 11"
+            margin_top_twips: 720, // 0.5"
+            margin_bottom_twips: 720,
+            margin_left_twips: 720,
+            margin_right_twips: 720,
+            header_distance_twips: 432, // 0.3"
+            footer_distance_twips: 432,
+            landscape: false,
+        });
+        let mut buf = std::io::Cursor::new(Vec::new());
+        wb.write_to(&mut buf).expect("write");
+
+        // Pull sheet1.xml out and check the attributes.
+        buf.set_position(0);
+        let mut zip = zip::ZipArchive::new(buf).expect("zip");
+        let mut xml = String::new();
+        {
+            let mut entry = zip.by_name("xl/worksheets/sheet1.xml").expect("sheet");
+            std::io::Read::read_to_string(&mut entry, &mut xml).expect("read");
+        }
+        assert!(xml.contains("<pageMargins"), "missing pageMargins: {xml}");
+        assert!(xml.contains("<pageSetup"), "missing pageSetup: {xml}");
+        assert!(xml.contains(r#"orientation="portrait""#));
+        // 8.5" = 215.90mm, 11" = 279.40mm
+        assert!(xml.contains(r#"paperWidth="215.90mm""#), "width attr: {xml}");
+        assert!(xml.contains(r#"paperHeight="279.40mm""#), "height attr: {xml}");
+        assert!(xml.contains(r#"left="0.5000""#));
+    }
+
     #[test]
     fn merge_cells_xml() {
         let mut wb = XlsxWriter::new();
diff --git a/tests/office_integration.rs b/tests/office_integration.rs
index 2903197..cc9ee13 100644
--- a/tests/office_integration.rs
+++ b/tests/office_integration.rs
@@ -534,15 +534,23 @@ fn pptx_to_ir_slides_as_sections() {
     assert_eq!(ir.sections.len(), 1);
     assert_eq!(ir.sections[0].title.as_deref(), Some("Intro"));
 
-    // Body content should be a paragraph (title is used as section title, not element)
-    assert!(
-        ir.sections[0]
-            .elements
+    // Body content should be a paragraph (title is used as section title, not element).
+    // PPTX shapes with positions wrap their content in `Element::TextBox` so the
+    // renderer can place them at absolute coordinates — look inside the
+    // wrapper as well as at the top level.
+    let has_welcome = ir.sections[0].elements.iter().any(|e| match e {
+        Element::Paragraph(p) => p
+            .content
             .iter()
-            .any(|e| matches!(e, Element::Paragraph(p) if
+            .any(|c| matches!(c, InlineContent::Text(s) if s.text == "Welcome")),
+        Element::TextBox(tb) => tb.content.iter().any(|inner| {
+            matches!(inner, Element::Paragraph(p) if
                 p.content.iter().any(|c| matches!(c, InlineContent::Text(s) if s.text == "Welcome"))
-            ))
-    );
+            )
+        }),
+        _ => false,
+    });
+    assert!(has_welcome);
 }
 
 // ===========================================================================
@@ -939,9 +947,18 @@ fn pptx_image_to_ir() {
     let doc = Document::from_reader(Cursor::new(data), DocumentFormat::Pptx).unwrap();
     let ir = doc.to_ir();
 
-    assert!(ir.sections[0].elements.iter().any(
-        |e| matches!(e, Element::Image(img) if img.alt_text.as_deref() == Some("A scenic view"))
-    ));
+    // PPTX picture shapes are wrapped in a positional `Element::TextBox`
+    // so the renderer knows where to draw the picture frame.
+    let has_pic = ir.sections[0].elements.iter().any(|e| {
+        match e {
+        Element::Image(img) => img.alt_text.as_deref() == Some("A scenic view"),
+        Element::TextBox(tb) => tb.content.iter().any(|inner| {
+            matches!(inner, Element::Image(img) if img.alt_text.as_deref() == Some("A scenic view"))
+        }),
+        _ => false,
+    }
+    });
+    assert!(has_pic);
 }
 
 // ===========================================================================
diff --git a/tests/write_integration.rs b/tests/write_integration.rs
index 41e0f27..92391fe 100644
--- a/tests/write_integration.rs
+++ b/tests/write_integration.rs
@@ -235,6 +235,7 @@ fn sample_ir(format: office_oxide::DocumentFormat) -> office_oxide::DocumentIR {
                 Element::Heading(Heading {
                     level: 1,
                     content: vec![InlineContent::Text(TextSpan::plain("Main Heading"))],
+                    ..Default::default()
                 }),
                 Element::Paragraph(Paragraph {
                     content: vec![InlineContent::Text(TextSpan::plain("Body text here"))],
diff --git a/wasm-pkg/package.json b/wasm-pkg/package.json
index c482d01..e2c0b2d 100644
--- a/wasm-pkg/package.json
+++ b/wasm-pkg/package.json
@@ -1,6 +1,6 @@
 {
   "name": "office-oxide-wasm",
-  "version": "0.1.1",
+  "version": "0.1.2",
   "description": "Fast Office document processing (DOCX/XLSX/PPTX/DOC/XLS/PPT) compiled to WebAssembly. Rust core, zero JS dependencies. Works in Node.js, bundlers, and browsers.",
   "license": "MIT OR Apache-2.0",
   "author": "Yury Fedoseev",

From 9d4380cc9252ea2617e070401ca669e746502b80 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 6 May 2026 15:37:56 +0000
Subject: [PATCH 02/18] chore(ci): bump actions/attest-sbom from 2.4.0 to 4.1.0

Bumps [actions/attest-sbom](https://github.com/actions/attest-sbom) from 2.4.0 to 4.1.0.
- [Release notes](https://github.com/actions/attest-sbom/releases)
- [Changelog](https://github.com/actions/attest-sbom/blob/main/RELEASE.md)
- [Commits](https://github.com/actions/attest-sbom/compare/bd218ad0dbcb3e146bd073d1d9c6d78e08aa8a0b...c604332985a26aa8cf1bdc465b92731239ec6b9e)

---
updated-dependencies:
- dependency-name: actions/attest-sbom
  dependency-version: 4.1.0
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/release.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index f0fc82a..b607d54 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -748,7 +748,7 @@ jobs:
           GH_TOKEN: ${{ github.token }}
 
       - name: Attest SBOM
-        uses: actions/attest-sbom@bd218ad0dbcb3e146bd073d1d9c6d78e08aa8a0b # v2
+        uses: actions/attest-sbom@c604332985a26aa8cf1bdc465b92731239ec6b9e # v4.1.0
         with:
           subject-path: sbom.cdx.json
           sbom-path: sbom.cdx.json

From be5d3a3a4c29526d89519b4e030d385e47eccf2a Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 6 May 2026 15:38:11 +0000
Subject: [PATCH 03/18] chore(ci): bump github/codeql-action from 3.35.2 to
 4.35.3

Bumps [github/codeql-action](https://github.com/github/codeql-action) from 3.35.2 to 4.35.3.
- [Release notes](https://github.com/github/codeql-action/releases)
- [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/github/codeql-action/compare/ce64ddcb0d8d890d2df4a9d1c04ff297367dea2a...e46ed2cbd01164d986452f91f178727624ae40d7)

---
updated-dependencies:
- dependency-name: github/codeql-action
  dependency-version: 4.35.3
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/codeql.yml    | 4 ++--
 .github/workflows/scorecard.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 603bfaa..da29ee0 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -34,7 +34,7 @@ jobs:
         uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable
 
       - name: Initialize CodeQL
-        uses: github/codeql-action/init@ce64ddcb0d8d890d2df4a9d1c04ff297367dea2a # v3
+        uses: github/codeql-action/init@e46ed2cbd01164d986452f91f178727624ae40d7 # v4.35.3
         with:
           languages: ${{ matrix.language }}
           # Use default queries + security-extended suite
@@ -44,6 +44,6 @@ jobs:
         run: cargo build --lib
 
       - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@ce64ddcb0d8d890d2df4a9d1c04ff297367dea2a # v3
+        uses: github/codeql-action/analyze@e46ed2cbd01164d986452f91f178727624ae40d7 # v4.35.3
         with:
           category: "/language:${{ matrix.language }}"
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
index c5ee089..1f1cde4 100644
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -42,6 +42,6 @@ jobs:
           retention-days: 5
 
       - name: Upload Scorecard results to GitHub Security tab
-        uses: github/codeql-action/upload-sarif@ce64ddcb0d8d890d2df4a9d1c04ff297367dea2a # v3
+        uses: github/codeql-action/upload-sarif@e46ed2cbd01164d986452f91f178727624ae40d7 # v4.35.3
         with:
           sarif_file: results.sarif

From 7ffcaccde048a97de48bf030823222b0b5a9ea87 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 6 May 2026 15:38:25 +0000
Subject: [PATCH 04/18] chore(ci): bump actions/upload-artifact from 4.6.2 to
 7.0.1

Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 4.6.2 to 7.0.1.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](https://github.com/actions/upload-artifact/compare/ea165f8d65b6e75b540449e92b4886f43607fa02...043fb46d1a93c77aae656e7c1c64a875d1fc6a0a)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-version: 7.0.1
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/ci.yml        |  2 +-
 .github/workflows/python.yml    |  8 ++++----
 .github/workflows/release.yml   | 10 +++++-----
 .github/workflows/scorecard.yml |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5cdfee2..04b316f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -217,7 +217,7 @@ jobs:
       # its original target/release/ path so binding test code works
       # unchanged.
       - name: Upload native lib artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: native-lib-${{ matrix.os }}
           retention-days: 1
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 9d64e08..69e6c41 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -146,7 +146,7 @@ jobs:
           manylinux: ${{ matrix.manylinux }}
           args: --release --features python --out dist
       - name: Upload wheels as artifacts
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: wheels-linux-${{ matrix.target }}-${{ matrix.manylinux }}
           path: dist/*.whl
@@ -172,7 +172,7 @@ jobs:
           target: ${{ matrix.target }}
           args: --release --features python --out dist
       - name: Upload wheels as artifacts
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: wheels-macos-${{ matrix.target }}
           path: dist/*.whl
@@ -199,7 +199,7 @@ jobs:
           target: ${{ matrix.target }}
           args: --release --features python --out dist
       - name: Upload wheels as artifacts
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: wheels-windows-${{ matrix.target }}
           path: dist/*.whl
@@ -217,7 +217,7 @@ jobs:
           command: sdist
           args: --out dist
       - name: Upload sdist as artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: sdist
           path: dist/*.tar.gz
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index b607d54..1c623c2 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -162,7 +162,7 @@ jobs:
           echo "ARCHIVE=$ARCHIVE" >> $GITHUB_ENV
 
       - name: Upload artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: ${{ matrix.artifact_name }}
           path: ${{ env.ARCHIVE }}
@@ -232,7 +232,7 @@ jobs:
           cp target/${{ matrix.target }}/release/office_oxide.lib staging/lib/ 2>/dev/null || true
           cp -r include/office_oxide_c staging/include/
           cd staging && 7z a "../${{ matrix.artifact_name }}.zip" . && cd ..
-      - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: ${{ matrix.artifact_name }}
           path: |
@@ -326,7 +326,7 @@ jobs:
           printf '{"type": "module"}\n'   > wasm-pkg/web/package.json
 
       - name: Upload WASM artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: wasm-package
           path: wasm-pkg/
@@ -380,7 +380,7 @@ jobs:
         run: maturin build --release --features python --target ${{ matrix.target }} --out dist
 
       - name: Upload wheels
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: ${{ matrix.artifact_name }}
           path: dist/*.whl
@@ -431,7 +431,7 @@ jobs:
             done
           done
           ls -R js/prebuilds
-      - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+      - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: node-native-package
           path: js/
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
index 1f1cde4..6672d10 100644
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -35,7 +35,7 @@ jobs:
           publish_results: true
 
       - name: Upload Scorecard results as artifact
-        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: scorecard-results
           path: results.sarif

From 68c48fb325f9fa5638fbdde4b5a02cda19b5f07c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 6 May 2026 15:38:30 +0000
Subject: [PATCH 05/18] chore(ci): update dtolnay/rust-toolchain requirement to
 29eef336d9b2848a0b548edc03f92a220660cdb8

Updates the requirements on [dtolnay/rust-toolchain](https://github.com/dtolnay/rust-toolchain) to permit the latest version.
- [Release notes](https://github.com/dtolnay/rust-toolchain/releases)
- [Commits](https://github.com/dtolnay/rust-toolchain/commits/29eef336d9b2848a0b548edc03f92a220660cdb8)

---
updated-dependencies:
- dependency-name: dtolnay/rust-toolchain
  dependency-version: 29eef336d9b2848a0b548edc03f92a220660cdb8
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 04b316f..dd1e9b5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -127,7 +127,7 @@ jobs:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v4
 
       - name: Install Rust
-        uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
+        uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # master
         with:
           toolchain: ${{ matrix.rust }}
 
@@ -626,7 +626,7 @@ jobs:
         run: |
           v=$(grep -E '^rust-version' Cargo.toml | head -1 | sed 's/.*"\(.*\)".*/\1/')
           echo "version=${v:-1.85}" >> "$GITHUB_OUTPUT"
-      - uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 # master
+      - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # master
         with:
           toolchain: ${{ steps.msrv.outputs.version }}
       - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2

From 846f5f44c8da05a6d39f24fd60decba90a3305d6 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 6 May 2026 15:38:33 +0000
Subject: [PATCH 06/18] chore(ci): bump actions/github-script from 7.0.1 to
 9.0.0

Bumps [actions/github-script](https://github.com/actions/github-script) from 7.0.1 to 9.0.0.
- [Release notes](https://github.com/actions/github-script/releases)
- [Commits](https://github.com/actions/github-script/compare/60a0d83039c74a4aee543508d2ffcb1c3799cdea...3a2844b7e9c422d3c10d287c895573f7108da1b3)

---
updated-dependencies:
- dependency-name: actions/github-script
  dependency-version: 9.0.0
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/outdated.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/outdated.yml b/.github/workflows/outdated.yml
index 487b650..d1d014a 100644
--- a/.github/workflows/outdated.yml
+++ b/.github/workflows/outdated.yml
@@ -43,7 +43,7 @@ jobs:
 
       - name: Open issue for outdated deps
         if: steps.outdated.outputs.has_outdated == 'true'
-        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7
+        uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
         with:
           script: |
             const title = `chore: outdated dependencies (${new Date().toISOString().slice(0,7)})`;

From c7219cfc19dfc4668e2ce128ba34bf85fa083395 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 13 May 2026 16:13:13 +0000
Subject: [PATCH 07/18] chore(deps): bump koffi from 2.16.1 to 2.16.2 in /js

Bumps [koffi](https://github.com/Koromix/koffi) from 2.16.1 to 2.16.2.
- [Commits](https://github.com/Koromix/koffi/commits)

---
updated-dependencies:
- dependency-name: koffi
  dependency-version: 2.16.2
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 js/package-lock.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/js/package-lock.json b/js/package-lock.json
index 95e1dc8..0cb77b9 100644
--- a/js/package-lock.json
+++ b/js/package-lock.json
@@ -17,9 +17,9 @@
       }
     },
     "node_modules/koffi": {
-      "version": "2.16.1",
-      "resolved": "https://registry.npmjs.org/koffi/-/koffi-2.16.1.tgz",
-      "integrity": "sha512-0Ie6CfD026dNfWSosDw9dPxPzO9Rlyo0N8m5r05S8YjytIpuilzMFDMY4IDy/8xQsTwpuVinhncD+S8n3bcYZQ==",
+      "version": "2.16.2",
+      "resolved": "https://registry.npmjs.org/koffi/-/koffi-2.16.2.tgz",
+      "integrity": "sha512-owU0MRwv6xkrVqCd+33uw6BaYppkTRXbO/rVdJNI2dvZG0gzyRhYwW25eWtc5pauwK8TGh3AbkFONSezdykfSA==",
       "hasInstallScript": true,
       "license": "MIT",
       "funding": {

From 591a88d3ab4b705111d9e93b84cddb6d5760884f Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 13 May 2026 16:12:56 +0000
Subject: [PATCH 08/18] chore(deps): bump quick-xml from 0.37.5 to 0.40.0

Bumps [quick-xml](https://github.com/tafia/quick-xml) from 0.37.5 to 0.40.0.
- [Release notes](https://github.com/tafia/quick-xml/releases)
- [Changelog](https://github.com/tafia/quick-xml/blob/master/Changelog.md)
- [Commits](https://github.com/tafia/quick-xml/compare/v0.37.5...v0.40.0)

---
updated-dependencies:
- dependency-name: quick-xml
  dependency-version: 0.40.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 Cargo.lock | 4 ++--
 Cargo.toml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index bc0b74b..ad893d7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -409,9 +409,9 @@ dependencies = [
 
 [[package]]
 name = "quick-xml"
-version = "0.37.5"
+version = "0.40.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb"
+checksum = "0b7315c86b26aaef0321fba33c9dcc160da659c6a9d278f0f6a5656d6561c03b"
 dependencies = [
  "memchr",
  "serde",
diff --git a/Cargo.toml b/Cargo.toml
index a6f0723..a2de463 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -65,7 +65,7 @@ crate-type = ["rlib", "cdylib", "staticlib"]
 
 [dependencies]
 # Core parsing
-quick-xml = { version = "0.37", features = ["serialize"] }
+quick-xml = { version = "0.40", features = ["serialize"] }
 zip = { version = "8.1", default-features = false, features = ["deflate"] }
 thiserror = "2"
 serde = { version = "1", features = ["derive"] }

From e4144777c2677bea6e14ece70e650fbe7612d18f Mon Sep 17 00:00:00 2001
From: Yury Fedoseev <yfedoseev@gmail.com>
Date: Wed, 13 May 2026 13:02:38 -0700
Subject: [PATCH 09/18] fix(deps): adapt to quick-xml 0.40 API changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Migrate the parsers to quick-xml 0.40 after the dependabot cherry-pick:

- `BytesText::unescape()` was removed in 0.40. Replace 6 call sites
  with a new `core::xml::unescape_text(&BytesText) -> Result<String>`
  helper that does `decode()?` + `escape::unescape()?` in one call.
- `Attribute::unescape_value()` is deprecated in 0.40 (replacement
  `normalized_value()` has different semantics — no entity unescaping).
  Route the 6 call sites through a new `core::xml::unescape_attr_value`
  helper, with `#[allow(deprecated)]` localised to one place so the
  call sites stay deprecation-free.

Also apply `cargo fmt --all` (4 files: convert_docx, convert_xlsx,
create, xlsx/text — pre-existing fmt drift surfaced by rebuild).

Result: 0 warnings; `cargo clippy --workspace --all-targets -- -D warnings`
is clean; 535/535 tests pass.
---
 src/convert_docx.rs    |  6 ++----
 src/convert_xlsx.rs    |  6 ++----
 src/core/properties.rs |  4 ++--
 src/core/xml.rs        | 37 +++++++++++++++++++++++++++++++------
 src/create.rs          | 11 ++++-------
 src/pptx/slide.rs      |  3 +--
 src/xlsx/mod.rs        | 12 +++++-------
 src/xlsx/text.rs       |  4 +---
 8 files changed, 48 insertions(+), 35 deletions(-)

diff --git a/src/convert_docx.rs b/src/convert_docx.rs
index 6963279..33c5ffb 100644
--- a/src/convert_docx.rs
+++ b/src/convert_docx.rs
@@ -188,10 +188,8 @@ fn convert_block_elements(
                         crate::ir::InlineContent::Text(s) if s.text.is_empty()
                     )
                 });
-                let has_bottom_border = p
-                    .properties
-                    .as_ref()
-                    .is_some_and(|pp| pp.has_bottom_border);
+                let has_bottom_border =
+                    p.properties.as_ref().is_some_and(|pp| pp.has_bottom_border);
                 if is_empty_para && has_bottom_border {
                     elements.push(Element::ThematicBreak);
                     i += 1;
diff --git a/src/convert_xlsx.rs b/src/convert_xlsx.rs
index 5869963..f15239c 100644
--- a/src/convert_xlsx.rs
+++ b/src/convert_xlsx.rs
@@ -161,10 +161,8 @@ pub(crate) fn xlsx_to_ir(doc: &crate::xlsx::XlsxDocument) -> DocumentIR {
                             // XLSX cell font size is in points (`<font><sz val="N"/>`
                             // where N is f32). IR uses half-points; same
                             // half-pt convention as DOCX/PPTX read paths.
-                            span.font_size_half_pt = Some(
-                                crate::core::units::HalfPoint::from_points_rounded(size_pt)
-                                    .0,
-                            );
+                            span.font_size_half_pt =
+                                Some(crate::core::units::HalfPoint::from_points_rounded(size_pt).0);
                         }
                         if font.bold {
                             span.bold = true;
diff --git a/src/core/properties.rs b/src/core/properties.rs
index dbca417..71a3aa3 100644
--- a/src/core/properties.rs
+++ b/src/core/properties.rs
@@ -87,7 +87,7 @@ impl CoreProperties {
                     };
                 },
                 Event::Text(ref e) => {
-                    let text = e.unescape()?.into_owned();
+                    let text = crate::core::xml::unescape_text(e)?;
                     if text.is_empty() {
                         continue;
                     }
@@ -234,7 +234,7 @@ impl AppProperties {
                     current_tag = Some(String::from_utf8_lossy(local_bytes).into_owned());
                 },
                 Event::Text(ref e) => {
-                    let text = e.unescape()?.into_owned();
+                    let text = crate::core::xml::unescape_text(e)?;
                     if let Some(ref tag) = current_tag {
                         match tag.as_str() {
                             "Application" => props.application = Some(text),
diff --git a/src/core/xml.rs b/src/core/xml.rs
index c8e4499..302993a 100644
--- a/src/core/xml.rs
+++ b/src/core/xml.rs
@@ -155,12 +155,10 @@ pub fn optional_prefixed_attr_str<'a>(
         // Check prefixed: look for `:localname` at the end
         if let Some(pos) = key.iter().position(|&b| b == b':') {
             if &key[pos + 1..] == local_name {
-                let value = attr.unescape_value()?;
-                return Ok(Some(value));
+                return Ok(Some(Cow::Owned(unescape_attr_value(&attr)?)));
             }
         } else if key == local_name {
-            let value = attr.unescape_value()?;
-            return Ok(Some(value));
+            return Ok(Some(Cow::Owned(unescape_attr_value(&attr)?)));
         }
     }
     Ok(None)
@@ -185,7 +183,7 @@ pub fn read_text_content(reader: &mut NsReader<&[u8]>) -> Result<String> {
     loop {
         match reader.read_event()? {
             Event::Text(e) => {
-                text.push_str(&e.unescape()?);
+                text.push_str(&unescape_text(&e)?);
             },
             Event::CData(e) => {
                 text.push_str(std::str::from_utf8(&e)?);
@@ -238,6 +236,33 @@ pub fn make_reader(xml: &[u8]) -> NsReader<&[u8]> {
 // Fast Reader utilities (no namespace resolution — for hot-path parsing)
 // ===========================================================================
 
+/// Decode and unescape a `BytesText` event into an owned string.
+///
+/// quick-xml 0.40 removed `BytesText::unescape()` in favor of explicit
+/// `decode()` followed by `escape::unescape()`. This helper preserves
+/// the old single-call ergonomics so the parsers don't have to repeat
+/// the two-step dance. `EncodingError` and `EscapeError` go through
+/// `quick_xml::Error` to reach our `core::Error`.
+pub fn unescape_text(e: &quick_xml::events::BytesText<'_>) -> Result<String> {
+    let decoded = e.decode().map_err(quick_xml::Error::from)?;
+    let unescaped = quick_xml::escape::unescape(&decoded).map_err(quick_xml::Error::from)?;
+    Ok(unescaped.into_owned())
+}
+
+/// Decode and unescape an `Attribute` value into an owned string.
+///
+/// quick-xml 0.40 deprecated `Attribute::unescape_value()` in favor of
+/// `normalized_value()`, but the suggested replacement doesn't unescape
+/// XML entities (`&amp;`, `&lt;`, …) — only whitespace-normalizes. OOXML
+/// attribute values frequently contain hyperlinks and other content that
+/// need real entity unescaping, so we keep the old behaviour for now and
+/// centralise the deprecation suppression in one place.
+#[allow(deprecated)]
+pub fn unescape_attr_value(attr: &quick_xml::events::attributes::Attribute<'_>) -> Result<String> {
+    let cow = attr.unescape_value()?;
+    Ok(cow.into_owned())
+}
+
 /// Create a plain Reader (no namespace resolution) configured for OOXML parsing.
 /// Use this for format-specific hot paths (worksheets, slides, document body)
 /// where all elements are in a single known namespace.
@@ -258,7 +283,7 @@ pub fn read_text_content_fast(reader: &mut quick_xml::Reader<&[u8]>) -> Result<S
     loop {
         match reader.read_event()? {
             Event::Text(e) => {
-                text.push_str(&e.unescape()?);
+                text.push_str(&unescape_text(&e)?);
             },
             Event::CData(e) => {
                 text.push_str(&String::from_utf8_lossy(&e));
diff --git a/src/create.rs b/src/create.rs
index f8e56dc..62082b3 100644
--- a/src/create.rs
+++ b/src/create.rs
@@ -912,13 +912,10 @@ fn emit_pptx_slides_compacted(
     // mid-block when pdf_to_ir injects gap spacers.
     fn is_body_content(elem: &Element) -> bool {
         match elem {
-            Element::Paragraph(p) => {
-                
-                p.content.iter().any(|ic| match ic {
-                    InlineContent::Text(s) => !s.text.is_empty(),
-                    _ => false,
-                })
-            },
+            Element::Paragraph(p) => p.content.iter().any(|ic| match ic {
+                InlineContent::Text(s) => !s.text.is_empty(),
+                _ => false,
+            }),
             Element::List(_) | Element::CodeBlock(_) | Element::Table(_) => true,
             _ => false,
         }
diff --git a/src/pptx/slide.rs b/src/pptx/slide.rs
index 35ee540..679911b 100644
--- a/src/pptx/slide.rs
+++ b/src/pptx/slide.rs
@@ -335,8 +335,7 @@ fn read_blip_embed_attr(e: &quick_xml::events::BytesStart) -> CoreResult<Option<
         let key = attr.key.as_ref();
         let is_embed = key == b"r:embed" || key.ends_with(b":embed") || key == b"embed";
         if is_embed {
-            let raw = attr.unescape_value().map_err(crate::core::Error::from)?;
-            return Ok(Some(raw.into_owned()));
+            return Ok(Some(crate::core::xml::unescape_attr_value(&attr)?));
         }
     }
     Ok(None)
diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs
index 9db81a8..40667fd 100644
--- a/src/xlsx/mod.rs
+++ b/src/xlsx/mod.rs
@@ -572,7 +572,7 @@ fn extract_chart_text(xml: &[u8]) -> String {
                 }
             },
             Ok(quick_xml::events::Event::Text(t)) => {
-                if let Ok(s) = t.unescape() {
+                if let Ok(s) = crate::core::xml::unescape_text(&t) {
                     let trimmed = s.trim();
                     if trimmed.is_empty() {
                         continue;
@@ -881,7 +881,7 @@ fn parse_drawing_anchors(xml_data: &[u8]) -> crate::core::Result<DrawingAnchors>
                         for attr in e.attributes().with_checks(false) {
                             let attr = attr.map_err(crate::core::Error::from)?;
                             let key = attr.key.as_ref();
-                            let raw = attr.unescape_value().map_err(crate::core::Error::from)?;
+                            let raw = crate::core::xml::unescape_attr_value(&attr)?;
                             match key {
                                 b"sz" => {
                                     // sz is in hundredths of a pt.
@@ -941,9 +941,7 @@ fn parse_drawing_anchors(xml_data: &[u8]) -> crate::core::Result<DrawingAnchors>
                             let attr = attr.map_err(crate::core::Error::from)?;
                             let key = attr.key.as_ref();
                             if key == b"r:embed" || key.ends_with(b":embed") || key == b"embed" {
-                                let raw =
-                                    attr.unescape_value().map_err(crate::core::Error::from)?;
-                                embed_rid = Some(raw.into_owned());
+                                embed_rid = Some(crate::core::xml::unescape_attr_value(&attr)?);
                                 break;
                             }
                         }
@@ -967,7 +965,7 @@ fn parse_drawing_anchors(xml_data: &[u8]) -> crate::core::Result<DrawingAnchors>
                         for attr in e.attributes().with_checks(false) {
                             let attr = attr.map_err(crate::core::Error::from)?;
                             let key = attr.key.as_ref();
-                            let raw = attr.unescape_value().map_err(crate::core::Error::from)?;
+                            let raw = crate::core::xml::unescape_attr_value(&attr)?;
                             match key {
                                 b"sz" => {
                                     if let Ok(n) = raw.parse::<i32>() {
@@ -984,7 +982,7 @@ fn parse_drawing_anchors(xml_data: &[u8]) -> crate::core::Result<DrawingAnchors>
                 }
             },
             Event::Text(ref e) if in_a_t => {
-                let s = e.unescape().map_err(crate::core::Error::from)?;
+                let s = crate::core::xml::unescape_text(e)?;
                 text_buf.push_str(&s);
             },
             Event::End(ref e) => {
diff --git a/src/xlsx/text.rs b/src/xlsx/text.rs
index 83df018..d37a915 100644
--- a/src/xlsx/text.rs
+++ b/src/xlsx/text.rs
@@ -243,9 +243,7 @@ impl XlsxDocument {
         match &cell.value {
             CellValue::Empty => {},
             CellValue::Number(n) => {
-                let is_date = cell
-                    .style_index
-                    .is_some_and(|i| date_indices.contains(&i));
+                let is_date = cell.style_index.is_some_and(|i| date_indices.contains(&i));
                 if is_date {
                     if let Some(dt) = date::DateTimeValue::from_serial(*n, self.workbook.date1904) {
                         buf.push_str(&dt.to_iso_string());

From 745083805f8093ddce74927c1f15ca73af5dc1b8 Mon Sep 17 00:00:00 2001
From: Yury Fedoseev <yfedoseev@gmail.com>
Date: Wed, 13 May 2026 20:11:49 -0700
Subject: [PATCH 10/18] docs(cli,mcp): add crate-level docs for binary crates

`office_oxide_cli` and `office_oxide_mcp` had `mod commands;` / `mod
protocol;` as their first statement, leaving the crate root undocumented.
Add a short crate-level `//!` doc and `#![warn(missing_docs)]` so future
items in either binary stay documented.

Verified: `RUSTDOCFLAGS="-D missing_docs" cargo doc --workspace
--no-deps --features parallel,mmap` now passes with zero errors.
---
 crates/office_oxide_cli/src/main.rs | 8 ++++++++
 crates/office_oxide_mcp/src/main.rs | 8 ++++++++
 2 files changed, 16 insertions(+)

diff --git a/crates/office_oxide_cli/src/main.rs b/crates/office_oxide_cli/src/main.rs
index 21b710b..f8f7903 100644
--- a/crates/office_oxide_cli/src/main.rs
+++ b/crates/office_oxide_cli/src/main.rs
@@ -1,3 +1,11 @@
+//! `office-oxide` — command-line front-end to the `office_oxide` library.
+//!
+//! Extracts text, converts to Markdown / HTML / IR, and inspects DOCX,
+//! XLSX, PPTX, DOC, XLS, and PPT files. See `office-oxide --help` for
+//! the full subcommand list.
+
+#![warn(missing_docs)]
+
 mod commands;
 
 use clap::Parser;
diff --git a/crates/office_oxide_mcp/src/main.rs b/crates/office_oxide_mcp/src/main.rs
index f0368b7..ccf8897 100644
--- a/crates/office_oxide_mcp/src/main.rs
+++ b/crates/office_oxide_mcp/src/main.rs
@@ -1,3 +1,11 @@
+//! `office-oxide-mcp` — Model Context Protocol server for office_oxide.
+//!
+//! Speaks JSON-RPC 2.0 over stdin/stdout. Exposes two tools:
+//! `extract` (text / markdown / html / ir from a DOCX/XLSX/PPTX/DOC/
+//! XLS/PPT file) and `info` (format detection + metadata).
+
+#![warn(missing_docs)]
+
 mod protocol;
 
 use std::io::{self, BufRead, Write};

From 1ba28e7269173f748b2d033f12d1e949cfa2defc Mon Sep 17 00:00:00 2001
From: Yury Fedoseev <yfedoseev@gmail.com>
Date: Thu, 14 May 2026 17:34:23 -0700
Subject: [PATCH 11/18] docs(changelog): expand v0.1.2 entry for recent branch
 changes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Records the run-colour propagation folded into the release commit
(DOCX `<w:rPr><w:color/>` and PPTX `<a:solidFill><a:srgbClr/>` into
`TextSpan.color`), the quick-xml 0.37 → 0.40 API migration with the
new `core::xml::unescape_text` / `unescape_attr_value` helpers, and
the crate-level `//!` docs + `missing_docs` lint added to
`office_oxide_cli` and `office_oxide_mcp`. Release date bumped to
2026-05-14.
---
 CHANGELOG.md | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a98f0f2..0e0f143 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,7 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [0.1.2] - 2026-05-13
+## [0.1.2] - 2026-05-14
 
 > Round-trip fidelity, IR layout features, embedded fonts, XLSX number formatting, and an O(1) style-lookup perf win.
 
@@ -40,6 +40,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - **Per-section page sizes** preserved through `to_ir`; multi-section IR
   emits per-section `<w:sectPr>`.
 - **`<w:sz>` preserved** through to IR's `font_size_half_pt`.
+- **Run colour** from `<w:rPr><w:color w:val="RRGGBB"/>` propagated into
+  `TextSpan.color` during `to_ir`, so PDF→DOCX→PDF round-trips keep
+  coloured text. Only the `ColorRef::Rgb` variant is plumbed today;
+  theme / system / `auto` colours still fall through to the renderer
+  default (proper resolution needs `theme.xml` threaded into the
+  convert path).
 - **Headers and footers** now included in `to_markdown` and `to_ir`
   (previously silently dropped).
 - **Embedded fonts** under `/word/fonts/` exposed on
@@ -68,6 +74,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   section's `PageSetup`.
 - **Run font sizes preserved** via new `TextRun.font_size_hundredths_pt`
   (parsed from `<a:rPr sz="…"/>`).
+- **Run colour preserved** via new `TextRun.color_rgb: Option<[u8; 3]>`
+  parsed from `<a:rPr><a:solidFill><a:srgbClr val="RRGGBB"/></a:solidFill>`
+  and propagated to `TextSpan.color` in IR. The parser tracks an
+  `in_solid_fill` flag so sibling effects (e.g. `<a:hl><a:srgbClr/>`
+  for hyperlink colour) don't leak into the run's own fill; non-sRGB
+  fills (gradient, scheme colour) fall back to `None`.
 - **Paragraph alignment** parsed from `<a:pPr algn="…"/>` (all five
   variants: `l` / `ctr` / `r` / `just` / `dist`) into
   `TextParagraph.alignment`. **Space-before** parsed from
@@ -137,6 +149,28 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   `from_points_rounded`**: cross-format font-size invariants
   (DrawingML hundredths-of-a-point vs WML half-points).
 
+### Dependencies
+
+- **`quick-xml` 0.37 → 0.40**: upstream removed `BytesText::unescape()`
+  and deprecated `Attribute::unescape_value()` (its replacement
+  `normalized_value()` has different semantics — no entity
+  unescaping). Migration added two helpers in `core::xml`:
+  `unescape_text(&BytesText) -> Result<String>` (used by 6 call sites)
+  and `unescape_attr_value` (used by 6 call sites, with
+  `#[allow(deprecated)]` localised to the helper so call sites stay
+  deprecation-free). 535 / 535 tests still pass; clippy clean.
+- **`koffi` 2.16.1 → 2.16.2** in `js/` (patch bump).
+
+### Documentation
+
+- **CLI / MCP crate-level docs**: `office_oxide_cli` and
+  `office_oxide_mcp` previously opened with `mod commands;` /
+  `mod protocol;` and had no crate-level rustdoc. Added a short
+  `//!` block plus `#![warn(missing_docs)]` so future items in
+  either binary stay documented.
+  `RUSTDOCFLAGS="-D missing_docs" cargo doc --workspace --no-deps
+  --features parallel,mmap` now passes with zero errors.
+
 ### Tests
 
 - **+98 unit tests** across the modules touched in this release:

From c3179005a8b50b9cada5ca759a6a4a56598eea47 Mon Sep 17 00:00:00 2001
From: Yury Fedoseev <yfedoseev@gmail.com>
Date: Thu, 14 May 2026 17:51:19 -0700
Subject: [PATCH 12/18] fix(pptx): add missing color_rgb field to test TextRun
 constructors

The shape::TextRun struct gained a color_rgb field in this branch but
ten in-test constructors in src/pptx/text.rs still listed the previous
field set, breaking cargo clippy/test workspace-wide.
---
 src/pptx/text.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/pptx/text.rs b/src/pptx/text.rs
index 33a451c..cbdddd6 100644
--- a/src/pptx/text.rs
+++ b/src/pptx/text.rs
@@ -458,6 +458,7 @@ mod tests {
                         strikethrough: false,
                         hyperlink: None,
                         font_size_hundredths_pt: None,
+                        color_rgb: None,
                     })],
                 }],
             }),
@@ -488,6 +489,7 @@ mod tests {
                         strikethrough: false,
                         hyperlink: None,
                         font_size_hundredths_pt: None,
+                        color_rgb: None,
                     })],
                 }],
             }),
@@ -595,6 +597,7 @@ mod tests {
                                 strikethrough: false,
                                 hyperlink: None,
                                 font_size_hundredths_pt: None,
+                                color_rgb: None,
                             }),
                             TextContent::Run(TextRun {
                                 text: " and ".to_string(),
@@ -603,6 +606,7 @@ mod tests {
                                 strikethrough: false,
                                 hyperlink: None,
                                 font_size_hundredths_pt: None,
+                                color_rgb: None,
                             }),
                             TextContent::Run(TextRun {
                                 text: "italic".to_string(),
@@ -611,6 +615,7 @@ mod tests {
                                 strikethrough: false,
                                 hyperlink: None,
                                 font_size_hundredths_pt: None,
+                                color_rgb: None,
                             }),
                         ],
                     }],
@@ -668,6 +673,7 @@ mod tests {
                                                 strikethrough: false,
                                                 hyperlink: None,
                                                 font_size_hundredths_pt: None,
+                                                color_rgb: None,
                                             })],
                                         }],
                                     }),
@@ -689,6 +695,7 @@ mod tests {
                                                 strikethrough: false,
                                                 hyperlink: None,
                                                 font_size_hundredths_pt: None,
+                                                color_rgb: None,
                                             })],
                                         }],
                                     }),
@@ -714,6 +721,7 @@ mod tests {
                                                 strikethrough: false,
                                                 hyperlink: None,
                                                 font_size_hundredths_pt: None,
+                                                color_rgb: None,
                                             })],
                                         }],
                                     }),
@@ -735,6 +743,7 @@ mod tests {
                                                 strikethrough: false,
                                                 hyperlink: None,
                                                 font_size_hundredths_pt: None,
+                                                color_rgb: None,
                                             })],
                                         }],
                                     }),
@@ -789,6 +798,7 @@ mod tests {
                                 tooltip: None,
                             }),
                             font_size_hundredths_pt: None,
+                            color_rgb: None,
                         })],
                     }],
                 }),

From 59f54e37d498db5cc95fbe4348fb26019c566ef5 Mon Sep 17 00:00:00 2001
From: Yury Fedoseev <yfedoseev@gmail.com>
Date: Thu, 14 May 2026 17:55:03 -0700
Subject: [PATCH 13/18] fix(review): address Copilot review comments on PR #38
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- xlsx/worksheet.rs: correct A3 paper-size twips to 16838×23811
  (297 × 420 mm; previously the slightly-off 16840×23820); drop the
  no-op block that "zeroed" already-zero dimensions in build_page_setup.
- docx/text.rs: remove the dead pre-split loop that built a string
  it never appended anywhere; split_headers_footers does the
  actual emission below.
- xlsx/mod.rs: drop the `push_str("")` no-op in extract_chart_text
  — adjacent rich-text runs concatenate directly (the surrounding
  XML preserves any intended whitespace as `<a:t xml:space="preserve">`).
- convert_xlsx.rs: when a worksheet had `<pageMargins>` but no
  `<pageSetup>`, fall back to PageSetup::default() geometry instead
  of dropping the parsed margins on the floor.
---
 src/convert_xlsx.rs   | 29 ++++++++++++++++++-----------
 src/docx/text.rs      | 30 ------------------------------
 src/xlsx/mod.rs       |  4 ----
 src/xlsx/worksheet.rs | 11 ++---------
 4 files changed, 20 insertions(+), 54 deletions(-)

diff --git a/src/convert_xlsx.rs b/src/convert_xlsx.rs
index f15239c..86296b6 100644
--- a/src/convert_xlsx.rs
+++ b/src/convert_xlsx.rs
@@ -220,16 +220,23 @@ pub(crate) fn xlsx_to_ir(doc: &crate::xlsx::XlsxDocument) -> DocumentIR {
         // had no <pageMargins> — Excel's default 0.7"/0.75" is wider than
         // we want for a tight PDF round-trip and would shrink the usable
         // text area.
-        let page_setup = ws.page_setup.and_then(|wsp| {
-            // A worksheet that only had <pageMargins> (no dimensions) is
-            // treated as "no geometry" so the renderer keeps its
-            // OfficeConfig default page size.
-            if wsp.width_twips == 0 || wsp.height_twips == 0 {
-                return None;
-            }
-            Some(PageSetup {
-                width_twips: wsp.width_twips,
-                height_twips: wsp.height_twips,
+        let page_setup = ws.page_setup.map(|wsp| {
+            // When <pageMargins> was present but <pageSetup> was not,
+            // wsp's width/height come through as 0. Fall back to the
+            // IR PageSetup default geometry rather than dropping the
+            // parsed margins on the floor.
+            let default = PageSetup::default();
+            PageSetup {
+                width_twips: if wsp.width_twips == 0 {
+                    default.width_twips
+                } else {
+                    wsp.width_twips
+                },
+                height_twips: if wsp.height_twips == 0 {
+                    default.height_twips
+                } else {
+                    wsp.height_twips
+                },
                 margin_top_twips: wsp.margin_top_twips,
                 margin_bottom_twips: wsp.margin_bottom_twips,
                 margin_left_twips: wsp.margin_left_twips,
@@ -237,7 +244,7 @@ pub(crate) fn xlsx_to_ir(doc: &crate::xlsx::XlsxDocument) -> DocumentIR {
                 header_distance_twips: wsp.header_distance_twips,
                 footer_distance_twips: wsp.footer_distance_twips,
                 landscape: wsp.landscape,
-            })
+            }
         });
 
         // Each XLSX worksheet renders to its own PDF page sequence, so
diff --git a/src/docx/text.rs b/src/docx/text.rs
index 3a5bc13..4028594 100644
--- a/src/docx/text.rs
+++ b/src/docx/text.rs
@@ -36,34 +36,6 @@ impl DocxDocument {
             numbering: self.numbering.as_ref(),
         };
 
-        // Headers (deduped on text content — headers may be repeated for
-        // first-page / even / default variants but the text is usually the
-        // same; we only want one copy in flat markdown).
-        let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
-        for hf in &self.headers_footers {
-            if !matches!(
-                hf.hf_type,
-                super::HeaderFooterType::Default
-                    | super::HeaderFooterType::First
-                    | super::HeaderFooterType::Even
-            ) {
-                continue;
-            }
-            let mut buf = String::new();
-            markdown_blocks(&hf.content, &ctx, &mut buf, 0);
-            let trimmed = buf.trim();
-            // Skip empty headers/footers and duplicates.
-            if trimmed.is_empty() || !seen.insert(trimmed.to_string()) {
-                continue;
-            }
-            // We don't currently know which side (header vs footer) this
-            // came from at this layer — `HeaderFooter` carries only the
-            // type modifier (default/first/even). The body sits between
-            // the headers and footers we emit, so we put all headers
-            // before and all footers after the body.
-        }
-
-        // Decide header/footer split using each section's references.
         let (header_texts, footer_texts) = split_headers_footers(self, &ctx);
         for h in &header_texts {
             out.push_str(h);
@@ -80,11 +52,9 @@ impl DocxDocument {
             out.push('\n');
         }
 
-        // Trim trailing newlines
         while out.ends_with('\n') {
             out.pop();
         }
-        let _ = seen; // silence
         out
     }
 }
diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs
index 40667fd..4455697 100644
--- a/src/xlsx/mod.rs
+++ b/src/xlsx/mod.rs
@@ -580,10 +580,6 @@ fn extract_chart_text(xml: &[u8]) -> String {
                     let top = stack.last().map(|v| v.as_slice());
                     match top {
                         Some(b"t") => {
-                            // Rich-text run — append to current_title.
-                            if !current_title.is_empty() {
-                                current_title.push_str("");
-                            }
                             current_title.push_str(trimmed);
                         },
                         Some(b"v") => {
diff --git a/src/xlsx/worksheet.rs b/src/xlsx/worksheet.rs
index ee53726..b51671a 100644
--- a/src/xlsx/worksheet.rs
+++ b/src/xlsx/worksheet.rs
@@ -317,7 +317,7 @@ fn paper_size_enum_to_twips(id: u32) -> (u32, u32) {
         1 => (12240, 15840),  // Letter 8.5 × 11"
         5 => (12240, 20160),  // Legal 8.5 × 14"
         7 => (10440, 15120),  // Executive 7.25 × 10.5"
-        8 => (16840, 23820),  // A3 297 × 420 mm
+        8 => (16838, 23811),  // A3 297 × 420 mm
         9 => (11906, 16838),  // A4 210 × 297 mm
         11 => (8392, 11906),  // A5 148 × 210 mm
         12 => (14171, 20012), // B4 250 × 353 mm
@@ -369,7 +369,7 @@ fn build_page_setup(
         footer: 0.3,
     });
     let r = raw.unwrap_or_default();
-    let mut ps = PageSetup {
+    let ps = PageSetup {
         width_twips: r.width_twips,
         height_twips: r.height_twips,
         margin_top_twips: in_to_twips(m.top),
@@ -380,13 +380,6 @@ fn build_page_setup(
         footer_distance_twips: in_to_twips(m.footer),
         landscape: r.landscape,
     };
-    // If we only saw <pageMargins> (no <pageSetup>), leave dimensions
-    // unset so the caller can fall back to the IR default; otherwise
-    // downstream renderers would draw onto a 0×0 page.
-    if ps.width_twips == 0 || ps.height_twips == 0 {
-        ps.width_twips = 0;
-        ps.height_twips = 0;
-    }
     Some(ps)
 }
 

From dfd3a3408221d3bbe9aa226ad5586dee9c97897b Mon Sep 17 00:00:00 2001
From: Yury Fedoseev <yfedoseev@gmail.com>
Date: Thu, 14 May 2026 18:05:14 -0700
Subject: [PATCH 14/18] fix(review): more Copilot follow-ups + coverage tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Review fixes:
- xlsx/numfmt: rewrite format_commas to avoid the rounded.fract()
  float round-trip (which could be off by one near .999…), fall back to the
  bare Rust formatter when the value overflows u64, and surface
  NaN/Infinity as visible labels instead of empty strings so anomalous
  cells aren't mistaken for empty data.
- xlsx/numfmt: format_currency now puts the minus sign in front of the
  symbol ("-$99.50" not "$-99.50"). Test updated to match.
- xlsx/worksheet: extract the ECMA-376 default margins into a single
  PageMarginsIn::DEFAULTS constant and reuse it from parse_page_margins
  and build_page_setup so future tweaks stay in lockstep.
- convert_pptx: use plain `h / 5` for hundredths-of-pt → twips. div_ceil
  was inflating every non-multiple-of-5 by an extra twip.

Coverage:
- New unit tests for src/xls/images.rs (was 0%) covering BLIP type
  detection, UID/header sizing, signature validation, format mapping,
  and end-to-end record extraction with a synthetic PNG payload.
- New unit tests for src/xlsx/mod.rs (was ~16%) covering sheet-rels
  path derivation, relative ZIP path resolution (absolute, .. and ./
  segments), image-format byte sniffing, extract_chart_text on a
  minimal title plus a categories/series example, and the drawing
  anchor parser on picture/text/empty inputs.
---
 src/convert_pptx.rs   |   8 ++-
 src/xls/images.rs     |  93 +++++++++++++++++++++++++
 src/xlsx/mod.rs       | 155 ++++++++++++++++++++++++++++++++++++++++++
 src/xlsx/numfmt.rs    |  58 +++++++++++-----
 src/xlsx/worksheet.rs |  36 ++++++----
 5 files changed, 316 insertions(+), 34 deletions(-)

diff --git a/src/convert_pptx.rs b/src/convert_pptx.rs
index 96ec625..6bbbfda 100644
--- a/src/convert_pptx.rs
+++ b/src/convert_pptx.rs
@@ -290,9 +290,11 @@ fn convert_text_body(body: &crate::pptx::TextBody, elements: &mut Vec<Element>)
             let content = convert_text_paragraph_inline(para);
             // Honour space_before from PPTX so spacer paragraphs
             // emitted by pdf_to_ir round-trip with their full vertical
-            // gap. Convert hundredths-of-pt → twips: hundredths * 0.2
-            // (1pt = 20 twips, so pt*100 → twips = (pt*100)/5).
-            let space_before_twips = para.space_before_hundredths_pt.map(|h| h.div_ceil(5));
+            // gap. Convert hundredths-of-pt → twips: 1pt = 20 twips,
+            // so pt*100 → twips = (pt*100)/5. Plain division keeps the
+            // round-trip exact for values that are multiples of 5;
+            // div_ceil would inflate every non-multiple by 1 twip.
+            let space_before_twips = para.space_before_hundredths_pt.map(|h| h / 5);
             // Empty paragraphs serve as vertical spacers — keep them
             // in the IR even when content is empty so the renderer
             // can advance the cursor by the requested amount.
diff --git a/src/xls/images.rs b/src/xls/images.rs
index f9f75f6..ba94751 100644
--- a/src/xls/images.rs
+++ b/src/xls/images.rs
@@ -93,3 +93,96 @@ fn to_format(rt: u16) -> ImageFormat {
         other => ImageFormat::Unknown(other),
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn blip_type_recognition() {
+        assert!(is_blip_type(0xF01D));
+        assert!(is_blip_type(0xF01E));
+        assert!(is_blip_type(0xF02A));
+        assert!(!is_blip_type(0xF000));
+        assert!(!is_blip_type(0xF020));
+    }
+
+    #[test]
+    fn uid_size_secondary_uid() {
+        // Bit 0 of inst signals a secondary UID — adds 16 bytes.
+        assert_eq!(uid_size(0xF01D, 0b00), 17);
+        assert_eq!(uid_size(0xF01D, 0b01), 33);
+        // 0xF01A..=0xF01C use the metafile-style header layout (base 16).
+        assert_eq!(uid_size(0xF01A, 0b00), 16);
+        assert_eq!(uid_size(0xF01A, 0b01), 32);
+    }
+
+    #[test]
+    fn metafile_header_only_for_metafile_types() {
+        assert_eq!(metafile_header_size(0xF01A), 34);
+        assert_eq!(metafile_header_size(0xF01B), 34);
+        assert_eq!(metafile_header_size(0xF01C), 34);
+        assert_eq!(metafile_header_size(0xF01D), 0);
+        assert_eq!(metafile_header_size(0xF01E), 0);
+    }
+
+    #[test]
+    fn signature_validation() {
+        // JPEG starts with FFD8.
+        assert!(has_valid_signature(0xF01D, &[0xFF, 0xD8, 0x00]));
+        assert!(!has_valid_signature(0xF01D, &[0x00, 0x00]));
+        // PNG starts with 89 50 4E 47.
+        assert!(has_valid_signature(0xF01E, b"\x89PNG\r\n"));
+        assert!(!has_valid_signature(0xF01E, b"WRONG"));
+        // EMF: 01 00 00 00 prefix.
+        assert!(has_valid_signature(0xF01A, &[0x01, 0x00, 0x00, 0x00, 0xAA]));
+        assert!(!has_valid_signature(0xF01A, &[0x00, 0x00, 0x00, 0x00]));
+        // Empty payload always invalid.
+        assert!(!has_valid_signature(0xF01D, &[]));
+    }
+
+    #[test]
+    fn to_format_mapping() {
+        assert!(matches!(to_format(0xF01A), ImageFormat::Emf));
+        assert!(matches!(to_format(0xF01B), ImageFormat::Wmf));
+        assert!(matches!(to_format(0xF01C), ImageFormat::Pict));
+        assert!(matches!(to_format(0xF01D), ImageFormat::Jpeg));
+        assert!(matches!(to_format(0xF02A), ImageFormat::Jpeg));
+        assert!(matches!(to_format(0xF01E), ImageFormat::Png));
+        assert!(matches!(to_format(0xF01F), ImageFormat::Dib));
+        assert!(matches!(to_format(0xF029), ImageFormat::Tiff));
+        assert!(matches!(to_format(0xABCD), ImageFormat::Unknown(0xABCD)));
+    }
+
+    #[test]
+    fn extract_images_skips_non_blip_bytes() {
+        // Random non-BLIP bytes produce no images and never crash.
+        let data = vec![0u8; 64];
+        assert!(extract_images(&data).is_empty());
+    }
+
+    #[test]
+    fn extract_images_finds_embedded_png() {
+        // Synthesize a record header followed by a PNG signature so the
+        // scanner descends into a valid BLIP payload.
+        let rec_type: u16 = 0xF01E; // PNG
+        let inst: u16 = 0; // no secondary UID
+        let ver_inst: u16 = inst << 4;
+        let uid = 17usize; // base for non-metafile
+        let png_body = b"\x89PNG\r\n\x1a\nIHDRfakebody";
+        let payload_len = uid + png_body.len();
+
+        let mut data = Vec::new();
+        data.extend_from_slice(&ver_inst.to_le_bytes());
+        data.extend_from_slice(&rec_type.to_le_bytes());
+        data.extend_from_slice(&(payload_len as u32).to_le_bytes());
+        data.extend_from_slice(&[0u8; 17]); // skipped UID bytes
+        data.extend_from_slice(png_body);
+
+        let images = extract_images(&data);
+        assert_eq!(images.len(), 1);
+        assert!(matches!(images[0].format, ImageFormat::Png));
+        assert_eq!(images[0].data, png_body);
+        assert_eq!(images[0].index, 0);
+    }
+}
diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs
index 4455697..8c2b79f 100644
--- a/src/xlsx/mod.rs
+++ b/src/xlsx/mod.rs
@@ -1058,3 +1058,158 @@ fn guess_image_format_from_bytes(bytes: &[u8]) -> &'static str {
         "png"
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn sheet_rels_path_top_level() {
+        assert_eq!(sheet_rels_path("xl/worksheets/sheet1.xml"), "xl/worksheets/_rels/sheet1.xml.rels");
+        assert_eq!(sheet_rels_path("sheet1.xml"), "_rels/sheet1.xml.rels");
+    }
+
+    #[test]
+    fn resolve_relative_zip_path_absolute() {
+        assert_eq!(resolve_relative_zip_path("xl/worksheets/sheet1.xml", "/xl/media/img1.png"), "xl/media/img1.png");
+    }
+
+    #[test]
+    fn resolve_relative_zip_path_dotdot() {
+        assert_eq!(
+            resolve_relative_zip_path("xl/worksheets/sheet1.xml", "../drawings/drawing1.xml"),
+            "xl/drawings/drawing1.xml"
+        );
+    }
+
+    #[test]
+    fn resolve_relative_zip_path_dot_segment() {
+        assert_eq!(
+            resolve_relative_zip_path("xl/worksheets/sheet1.xml", "./local.xml"),
+            "xl/worksheets/local.xml"
+        );
+    }
+
+    #[test]
+    fn resolve_relative_zip_path_source_at_root() {
+        assert_eq!(resolve_relative_zip_path("file.xml", "sub/x.xml"), "sub/x.xml");
+    }
+
+    #[test]
+    fn guess_image_format_signatures() {
+        assert_eq!(guess_image_format_from_bytes(&[0x89, b'P', b'N', b'G', 13, 10, 26, 10]), "png");
+        assert_eq!(guess_image_format_from_bytes(&[0xFF, 0xD8, 0xFF, 0xE0]), "jpeg");
+        assert_eq!(guess_image_format_from_bytes(b"GIF89a..."), "gif");
+        assert_eq!(guess_image_format_from_bytes(b"GIF87a..."), "gif");
+        assert_eq!(guess_image_format_from_bytes(b"BM\0\0\0"), "bmp");
+        assert_eq!(guess_image_format_from_bytes(b"II*\0\x08\0"), "tiff");
+        assert_eq!(guess_image_format_from_bytes(b"MM\0*\0\x08"), "tiff");
+        assert_eq!(guess_image_format_from_bytes(&[0xD7, 0xCD, 0xC6, 0x9A]), "wmf");
+        assert_eq!(guess_image_format_from_bytes(&[0x01, 0x00, 0x00, 0x00, 0x58]), "emf");
+        // Fall back to png for unknown payloads.
+        assert_eq!(guess_image_format_from_bytes(&[0, 0, 0]), "png");
+    }
+
+    #[test]
+    fn extract_chart_text_minimal_title() {
+        let xml = br#"<?xml version="1.0"?>
+<c:chartSpace xmlns:c="http://schemas.openxmlformats.org/drawingml/2006/chart"
+              xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
+  <c:chart>
+    <c:title>
+      <c:tx>
+        <c:rich>
+          <a:p><a:r><a:t>Quarterly Sales</a:t></a:r></a:p>
+        </c:rich>
+      </c:tx>
+    </c:title>
+  </c:chart>
+</c:chartSpace>"#;
+        let out = extract_chart_text(xml);
+        assert!(out.contains("Title: Quarterly Sales"), "got: {out}");
+    }
+
+    #[test]
+    fn extract_chart_text_series_and_categories() {
+        let xml = br#"<?xml version="1.0"?>
+<c:chartSpace xmlns:c="http://schemas.openxmlformats.org/drawingml/2006/chart"
+              xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
+  <c:chart><c:plotArea>
+    <c:barChart>
+      <c:ser>
+        <c:tx><c:strRef><c:f>Sheet1!$B$1</c:f><c:strCache><c:pt><c:v>Budget</c:v></c:pt></c:strCache></c:strRef></c:tx>
+        <c:cat><c:strRef><c:strCache>
+          <c:pt><c:v>Q1</c:v></c:pt>
+          <c:pt><c:v>Q2</c:v></c:pt>
+        </c:strCache></c:strRef></c:cat>
+        <c:val><c:numRef><c:numCache>
+          <c:pt><c:v>1000</c:v></c:pt>
+          <c:pt><c:v>2000</c:v></c:pt>
+        </c:numCache></c:numRef></c:val>
+      </c:ser>
+    </c:barChart>
+  </c:plotArea></c:chart>
+</c:chartSpace>"#;
+        let out = extract_chart_text(xml);
+        assert!(out.contains("Categories: Q1, Q2"), "got: {out}");
+        assert!(out.contains("Budget: 1000, 2000"), "got: {out}");
+    }
+
+    #[test]
+    fn parse_drawing_anchors_picture_one_cell() {
+        let xml = br#"<?xml version="1.0"?>
+<xdr:wsDr xmlns:xdr="http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing"
+          xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
+          xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
+  <xdr:oneCellAnchor>
+    <xdr:from><xdr:col>0</xdr:col><xdr:colOff>914400</xdr:colOff>
+              <xdr:row>0</xdr:row><xdr:rowOff>457200</xdr:rowOff></xdr:from>
+    <xdr:ext cx="2000000" cy="1500000"/>
+    <xdr:pic>
+      <xdr:nvPicPr>
+        <xdr:cNvPr id="2" name="Image1" descr="my-alt"/>
+      </xdr:nvPicPr>
+      <xdr:blipFill>
+        <a:blip r:embed="rId4"/>
+      </xdr:blipFill>
+    </xdr:pic>
+  </xdr:oneCellAnchor>
+</xdr:wsDr>"#;
+        let parsed = parse_drawing_anchors(xml).expect("parse ok");
+        assert_eq!(parsed.pictures.len(), 1);
+        assert_eq!(parsed.pictures[0].embed_rid, "rId4");
+        assert_eq!(parsed.pictures[0].cx_emu, 2_000_000);
+        assert_eq!(parsed.pictures[0].cy_emu, 1_500_000);
+        assert_eq!(parsed.pictures[0].alt_text.as_deref(), Some("my-alt"));
+    }
+
+    #[test]
+    fn parse_drawing_anchors_text_shape() {
+        let xml = br#"<?xml version="1.0"?>
+<xdr:wsDr xmlns:xdr="http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing"
+          xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
+  <xdr:absoluteAnchor>
+    <xdr:pos x="100000" y="200000"/>
+    <xdr:ext cx="3000000" cy="500000"/>
+    <xdr:sp>
+      <xdr:txBody>
+        <a:p><a:r><a:t>Hello shape</a:t></a:r></a:p>
+      </xdr:txBody>
+    </xdr:sp>
+  </xdr:absoluteAnchor>
+</xdr:wsDr>"#;
+        let parsed = parse_drawing_anchors(xml).expect("parse ok");
+        assert_eq!(parsed.text_shapes.len(), 1);
+        assert_eq!(parsed.text_shapes[0].text, "Hello shape");
+        assert_eq!(parsed.text_shapes[0].cx_emu, 3_000_000);
+    }
+
+    #[test]
+    fn parse_drawing_anchors_empty_doc_is_ok() {
+        let xml = br#"<?xml version="1.0"?>
+<xdr:wsDr xmlns:xdr="http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing"/>"#;
+        let parsed = parse_drawing_anchors(xml).expect("parse ok");
+        assert!(parsed.pictures.is_empty());
+        assert!(parsed.text_shapes.is_empty());
+    }
+}
diff --git a/src/xlsx/numfmt.rs b/src/xlsx/numfmt.rs
index c945861..6979381 100644
--- a/src/xlsx/numfmt.rs
+++ b/src/xlsx/numfmt.rs
@@ -7,8 +7,11 @@
 
 /// Apply an Excel number format to a numeric value.
 pub fn apply_format(n: f64, fmt_id: u32, fmt_str: Option<&str>) -> String {
-    if n.is_nan() || n.is_infinite() {
-        return String::new();
+    if n.is_nan() {
+        return "NaN".to_string();
+    }
+    if n.is_infinite() {
+        return if n < 0.0 { "-Infinity".to_string() } else { "Infinity".to_string() };
     }
 
     // Built-in format IDs per OOXML spec §18.8.30.
@@ -67,26 +70,44 @@ fn format_fixed(n: f64, decimals: u8) -> String {
 pub fn format_commas(n: f64, decimals: u8) -> String {
     let negative = n < 0.0;
     let abs = n.abs();
+    let sign = if negative { "-" } else { "" };
 
-    // Round to the required number of decimal places first.
     let factor = 10f64.powi(decimals as i32);
-    let rounded = (abs * factor).round() / factor;
+    let scaled = (abs * factor).round();
 
-    let int_part = rounded.trunc() as u64;
-    let int_str = insert_commas(int_part);
+    // Fall back to the locale-free Rust formatter for magnitudes that
+    // overflow u64 — better to lose the thousands separators than to
+    // emit a silently-saturated integer (`as u64` clamps to u64::MAX).
+    if !scaled.is_finite() || scaled >= u64::MAX as f64 {
+        return format!("{}{:.prec$}", sign, abs, prec = decimals as usize);
+    }
 
-    let sign = if negative { "-" } else { "" };
+    let scaled_int = scaled as u64;
 
     if decimals == 0 {
-        format!("{}{}", sign, int_str)
+        format!("{}{}", sign, insert_commas(scaled_int))
     } else {
-        let frac = ((rounded.fract()) * factor).round() as u64;
-        format!("{}{}.{:0>width$}", sign, int_str, frac, width = decimals as usize)
+        let divisor = factor as u64;
+        let int_part = scaled_int / divisor;
+        let frac = scaled_int % divisor;
+        format!(
+            "{}{}.{:0>width$}",
+            sign,
+            insert_commas(int_part),
+            frac,
+            width = decimals as usize
+        )
     }
 }
 
 fn format_currency(n: f64, symbol: &str, decimals: u8) -> String {
-    format!("{}{}", symbol, format_commas(n, decimals))
+    // Put any minus sign before the currency symbol so callers see
+    // "-$99.50" rather than "$-99.50".
+    if n < 0.0 {
+        format!("-{}{}", symbol, format_commas(n.abs(), decimals))
+    } else {
+        format!("{}{}", symbol, format_commas(n, decimals))
+    }
 }
 
 /// Format a number as a percentage (multiplied by 100, with optional decimal places).
@@ -349,14 +370,17 @@ mod tests {
     // ── Edge cases ──────────────────────────────────────────────────────
 
     #[test]
-    fn nan_returns_empty() {
-        assert_eq!(apply_format(f64::NAN, 0, None), "");
+    fn nan_renders_as_label() {
+        // Returning the literal "NaN" rather than an empty string keeps
+        // anomalous cells visible in extracted text so they're not
+        // mistaken for empty data.
+        assert_eq!(apply_format(f64::NAN, 0, None), "NaN");
     }
 
     #[test]
-    fn infinity_returns_empty() {
-        assert_eq!(apply_format(f64::INFINITY, 0, None), "");
-        assert_eq!(apply_format(f64::NEG_INFINITY, 0, None), "");
+    fn infinity_renders_as_label() {
+        assert_eq!(apply_format(f64::INFINITY, 0, None), "Infinity");
+        assert_eq!(apply_format(f64::NEG_INFINITY, 0, None), "-Infinity");
     }
 
     #[test]
@@ -374,7 +398,7 @@ mod tests {
 
     #[test]
     fn negative_currency() {
-        assert_eq!(apply_format(-99.5, 7, None), "$-99.50");
+        assert_eq!(apply_format(-99.5, 7, None), "-$99.50");
     }
 
     #[test]
diff --git a/src/xlsx/worksheet.rs b/src/xlsx/worksheet.rs
index b51671a..2d81935 100644
--- a/src/xlsx/worksheet.rs
+++ b/src/xlsx/worksheet.rs
@@ -247,6 +247,20 @@ struct PageMarginsIn {
     footer: f64,
 }
 
+impl PageMarginsIn {
+    /// Excel's default "Normal" margins (inches). Single source of truth used by
+    /// both `parse_page_margins` (when an attribute is absent) and
+    /// `build_page_setup` (when no `<pageMargins>` element was present).
+    const DEFAULTS: PageMarginsIn = PageMarginsIn {
+        left: 0.7,
+        right: 0.7,
+        top: 0.75,
+        bottom: 0.75,
+        header: 0.3,
+        footer: 0.3,
+    };
+}
+
 /// Raw `<pageSetup>` shape — physical dimensions in twips plus orientation.
 #[derive(Debug, Clone, Copy, Default)]
 struct PageSetupRaw {
@@ -271,13 +285,14 @@ fn parse_page_margins(
     if left.is_none() && right.is_none() && top.is_none() && bottom.is_none() {
         return Ok(None);
     }
+    let d = PageMarginsIn::DEFAULTS;
     Ok(Some(PageMarginsIn {
-        left: left.unwrap_or(0.7),
-        right: right.unwrap_or(0.7),
-        top: top.unwrap_or(0.75),
-        bottom: bottom.unwrap_or(0.75),
-        header: header.unwrap_or(0.3),
-        footer: footer.unwrap_or(0.3),
+        left: left.unwrap_or(d.left),
+        right: right.unwrap_or(d.right),
+        top: top.unwrap_or(d.top),
+        bottom: bottom.unwrap_or(d.bottom),
+        header: header.unwrap_or(d.header),
+        footer: footer.unwrap_or(d.footer),
     }))
 }
 
@@ -360,14 +375,7 @@ fn build_page_setup(
         return None;
     }
     let in_to_twips = |v: f64| (v * 1440.0).round().max(0.0) as u32;
-    let m = margins.unwrap_or(PageMarginsIn {
-        left: 0.7,
-        right: 0.7,
-        top: 0.75,
-        bottom: 0.75,
-        header: 0.3,
-        footer: 0.3,
-    });
+    let m = margins.unwrap_or(PageMarginsIn::DEFAULTS);
     let r = raw.unwrap_or_default();
     let ps = PageSetup {
         width_twips: r.width_twips,

From 5745467f0781f033f6d228ca294b3c965a3c2bd8 Mon Sep 17 00:00:00 2001
From: Yury Fedoseev <yfedoseev@gmail.com>
Date: Thu, 14 May 2026 18:10:27 -0700
Subject: [PATCH 15/18] fix(review): rustfmt + more Copilot follow-ups

- Apply rustfmt across recent edits (the v0.1.2 PR's Lint and Format
  Check job was failing because my recent commits hand-wrote a few
  lines that exceeded rustfmt's max_width).
- xlsx/numfmt: only treat 'E'/'e' as a scientific-notation marker when
  followed by '+' or '-'. A bare 'E' in a custom format like "000E"
  was previously consuming the next character unconditionally, which
  could swallow a literal or a digit it should have kept.
- xlsx/mod: in parse_drawing_anchors, restrict the `<off>` fallback to
  the outermost anchor scope (AnchorKind::Unknown). Otherwise the
  `<a:off>` inside a shape's `<a:xfrm>` would overwrite x/y coords
  parsed earlier from `<xdr:pos>` in an absoluteAnchor.
---
 src/xlsx/mod.rs    | 19 ++++++++++++++++---
 src/xlsx/numfmt.rs | 25 +++++++++++++++++++------
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs
index 8c2b79f..f646bb4 100644
--- a/src/xlsx/mod.rs
+++ b/src/xlsx/mod.rs
@@ -924,7 +924,14 @@ fn parse_drawing_anchors(xml_data: &[u8]) -> crate::core::Result<DrawingAnchors>
                             cy_emu = v.parse().unwrap_or(0);
                         }
                     },
-                    b"off" if cx_emu == 0 && cy_emu == 0 => {
+                    b"off" if cx_emu == 0 && cy_emu == 0 && matches!(kind, AnchorKind::Unknown) => {
+                        // Honour `<off>` only at the outermost anchor level,
+                        // before we've descended into `<xdr:pic>` or
+                        // `<xdr:sp>`. Otherwise the `<a:off>` inside a
+                        // shape's `<a:xfrm>` (which expresses a transform
+                        // local to the shape, not the anchor origin) would
+                        // overwrite the absolute coordinates parsed from
+                        // `<xdr:pos>`.
                         if let Some(v) = crate::core::xml::optional_attr_str(e, b"x")? {
                             x_emu = v.parse().unwrap_or(x_emu);
                         }
@@ -1065,13 +1072,19 @@ mod tests {
 
     #[test]
     fn sheet_rels_path_top_level() {
-        assert_eq!(sheet_rels_path("xl/worksheets/sheet1.xml"), "xl/worksheets/_rels/sheet1.xml.rels");
+        assert_eq!(
+            sheet_rels_path("xl/worksheets/sheet1.xml"),
+            "xl/worksheets/_rels/sheet1.xml.rels"
+        );
         assert_eq!(sheet_rels_path("sheet1.xml"), "_rels/sheet1.xml.rels");
     }
 
     #[test]
     fn resolve_relative_zip_path_absolute() {
-        assert_eq!(resolve_relative_zip_path("xl/worksheets/sheet1.xml", "/xl/media/img1.png"), "xl/media/img1.png");
+        assert_eq!(
+            resolve_relative_zip_path("xl/worksheets/sheet1.xml", "/xl/media/img1.png"),
+            "xl/media/img1.png"
+        );
     }
 
     #[test]
diff --git a/src/xlsx/numfmt.rs b/src/xlsx/numfmt.rs
index 6979381..3235f85 100644
--- a/src/xlsx/numfmt.rs
+++ b/src/xlsx/numfmt.rs
@@ -11,7 +11,11 @@ pub fn apply_format(n: f64, fmt_id: u32, fmt_str: Option<&str>) -> String {
         return "NaN".to_string();
     }
     if n.is_infinite() {
-        return if n < 0.0 { "-Infinity".to_string() } else { "Infinity".to_string() };
+        return if n < 0.0 {
+            "-Infinity".to_string()
+        } else {
+            "Infinity".to_string()
+        };
     }
 
     // Built-in format IDs per OOXML spec §18.8.30.
@@ -232,11 +236,20 @@ fn apply_custom(n: f64, fmt: &str) -> String {
                 }
             },
             'E' | 'e' => {
-                has_scientific = true;
-                // Skip the +/- and exponent digits
-                chars.next(); // '+' or '-'
-                while chars.peek().is_some_and(|c| c.is_ascii_digit()) {
-                    chars.next();
+                // Only treat this as scientific notation when followed by
+                // `+` or `-` (per ECMA-376 §18.8.30). Bare `E` is just
+                // a literal in formats like "000E" and must not consume
+                // the next character.
+                if matches!(chars.peek(), Some('+') | Some('-')) {
+                    has_scientific = true;
+                    chars.next(); // consume the sign
+                    while chars.peek().is_some_and(|c| c.is_ascii_digit()) {
+                        chars.next();
+                    }
+                } else if !in_num_part {
+                    currency_prefix.push(c);
+                } else {
+                    suffix.push(c);
                 }
             },
             '$' => {

From 83a33ac30e8b18f4c616a38ad703c31b5f4652ce Mon Sep 17 00:00:00 2001
From: Yury Fedoseev <yfedoseev@gmail.com>
Date: Thu, 14 May 2026 18:12:44 -0700
Subject: [PATCH 16/18] docs(review): document embed_font/font_table style
 limitations + debug logs

- pptx::write::embed_font: spell out that deduplication is by name
  only, not by bytes; document the workaround of using distinct
  family names for multiple faces.
- docx::write::generate_font_table_xml: note that every entry is
  emitted as <w:embedRegular> regardless of the underlying style;
  document the recommended workaround (separate family names per
  face).
- xlsx::read_drawing_for_sheet: emit a `debug!` line when a drawing
  part fails to read or parse, instead of silently swallowing the
  error. Lets operators trace cases where worksheet drawings vanish.
---
 src/docx/write.rs |  7 +++++++
 src/pptx/write.rs |  8 ++++++--
 src/xlsx/mod.rs   | 13 +++++++++++--
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/docx/write.rs b/src/docx/write.rs
index c0ea70c..7d36c6a 100644
--- a/src/docx/write.rs
+++ b/src/docx/write.rs
@@ -3101,6 +3101,13 @@ fn generate_core_props_xml(props: &CoreProps) -> Vec<u8> {
 /// program when there's a match. Without it, Word silently
 /// substitutes Calibri / Cambria for everything regardless of how
 /// many TTFs we ship under `/word/fonts/`.
+///
+/// **Known limitation**: each entry is emitted as `<w:embedRegular>`
+/// regardless of whether the underlying program is a regular, bold,
+/// italic, or bold-italic face — we don't introspect the font binary
+/// to detect the style. If a caller wants Word to pick up a bold-only
+/// face, they should embed it under a distinct family name (e.g.
+/// `Calibri-Bold`) and reference that name from runs explicitly.
 fn generate_font_table_xml(entries: &[(String, String)]) -> Vec<u8> {
     let mut w = Writer::new_with_indent(Vec::new(), b' ', 2);
     w.write_event(Event::Decl(BytesDecl::new("1.0", Some("UTF-8"), Some("yes"))))
diff --git a/src/pptx/write.rs b/src/pptx/write.rs
index b4729f8..8ec5a05 100644
--- a/src/pptx/write.rs
+++ b/src/pptx/write.rs
@@ -357,8 +357,12 @@ impl PptxWriter {
     }
 
     /// Embed a font program (TrueType / OpenType bytes) under `ppt/fonts/`.
-    /// `name` is used for the file name and the human-readable font name.
-    /// Subsequent calls with the same name are deduplicated.
+    ///
+    /// `name` is used for both the on-disk file name and the human-readable
+    /// font name in the presentation's font table. Deduplication is by
+    /// `name` only — supplying different bytes for an already-registered
+    /// name is a no-op. Pass distinct names (e.g. `Calibri-Bold` vs
+    /// `Calibri`) when you need to ship multiple faces of the same family.
     pub fn embed_font(&mut self, name: impl Into<String>, data: Vec<u8>) -> &mut Self {
         let name = name.into();
         if !self.embedded_fonts.iter().any(|(n, _)| n == &name) {
diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs
index f646bb4..7c6121a 100644
--- a/src/xlsx/mod.rs
+++ b/src/xlsx/mod.rs
@@ -671,7 +671,10 @@ fn read_drawing_for_sheet<R: Read + Seek>(
 
     let drawing_xml = match XlsxDocument::read_xml_entry(archive, &drawing_path) {
         Ok(d) => d,
-        Err(_) => return (Vec::new(), Vec::new()),
+        Err(e) => {
+            debug!("XlsxDocument: drawing part {} unreadable ({}); skipping", drawing_path, e);
+            return (Vec::new(), Vec::new());
+        },
     };
 
     let drawing_rels_path = sheet_rels_path(&drawing_path);
@@ -682,7 +685,13 @@ fn read_drawing_for_sheet<R: Read + Seek>(
 
     let parsed = match parse_drawing_anchors(&drawing_xml) {
         Ok(a) => a,
-        Err(_) => return (Vec::new(), Vec::new()),
+        Err(e) => {
+            debug!(
+                "XlsxDocument: drawing {} failed to parse ({}); dropping anchors",
+                drawing_path, e
+            );
+            return (Vec::new(), Vec::new());
+        },
     };
 
     // Resolve picture anchors → bytes.

From c42825bf033a01fb09fbef2e22cb1928487ccd7a Mon Sep 17 00:00:00 2001
From: Yury Fedoseev <yfedoseev@gmail.com>
Date: Thu, 14 May 2026 18:17:07 -0700
Subject: [PATCH 17/18] fix(docx): track header-vs-footer role at parse time
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, `split_headers_footers` derived role by comparing each
entry's index against the cumulative count of all sections'
`header_refs`. That assumed `headers_footers` was laid out as
"all headers first, then all footers" — but the parser actually
interleaves them per section (header_refs of section 0, then
footer_refs of section 0, then headers of section 1, etc.). In
multi-section documents the cumulative-count split silently
misclassified entries, filing headers as footers and vice versa.

Record the role explicitly on each parsed `HeaderFooter` and let
the markdown renderer read it directly. Walking header_refs and
footer_refs in two separate loops at parse time keeps the role
authoritative, even when individual refs fail to resolve and don't
contribute an entry to `headers_footers`.

Closes a Copilot review comment on PR #38.
---
 src/docx/headers.rs |  5 +++++
 src/docx/mod.rs     | 40 ++++++++++++++++++++++++++--------------
 src/docx/text.rs    | 18 +++++++-----------
 3 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/src/docx/headers.rs b/src/docx/headers.rs
index a7e9c7e..1e3a770 100644
--- a/src/docx/headers.rs
+++ b/src/docx/headers.rs
@@ -83,4 +83,9 @@ pub struct HeaderFooter {
     pub hf_type: HeaderFooterType,
     /// Block content within the header or footer.
     pub content: Vec<BlockElement>,
+    /// `true` if this came from a `<w:headerReference>`, `false` if
+    /// from a `<w:footerReference>`. Lets downstream consumers (e.g.
+    /// the markdown renderer) sort entries into headers vs footers
+    /// without trying to back-derive from cumulative ref counts.
+    pub is_header: bool,
 }
diff --git a/src/docx/mod.rs b/src/docx/mod.rs
index 02d2e60..0a41711 100644
--- a/src/docx/mod.rs
+++ b/src/docx/mod.rs
@@ -175,24 +175,36 @@ impl DocxDocument {
         let doc_data = opc.read_part(&main_part)?;
         let (body, sections) = parse_document(&doc_data, &doc_rels)?;
 
-        // Parse headers and footers
+        // Parse headers and footers. Walk header refs and footer refs
+        // separately so each parsed `HeaderFooter` can record its own
+        // role; without that distinction, downstream consumers had to
+        // back-derive headers-vs-footers from cumulative ref counts,
+        // which silently misclassifies entries in multi-section docs.
         let mut headers_footers = Vec::new();
-        for section in &sections {
-            for hf_ref in section.header_refs.iter().chain(section.footer_refs.iter()) {
-                if let Some(rel) = doc_rels.get_by_id(&hf_ref.relationship_id) {
-                    if rel.target_mode == TargetMode::Internal {
-                        let part_name = main_part.resolve_relative(&rel.target)?;
-                        if opc.has_part(&part_name) {
-                            let data = opc.read_part(&part_name)?;
-                            let content = parse_body_elements(&data)?;
-                            headers_footers.push(HeaderFooter {
-                                hf_type: hf_ref.hf_type,
-                                content,
-                            });
-                        }
+        let mut parse_hf = |hf_ref: &HeaderFooterRef, is_header: bool| -> CoreResult<()> {
+            if let Some(rel) = doc_rels.get_by_id(&hf_ref.relationship_id) {
+                if rel.target_mode == TargetMode::Internal {
+                    let part_name = main_part.resolve_relative(&rel.target)?;
+                    if opc.has_part(&part_name) {
+                        let data = opc.read_part(&part_name)?;
+                        let content = parse_body_elements(&data)?;
+                        headers_footers.push(HeaderFooter {
+                            hf_type: hf_ref.hf_type,
+                            content,
+                            is_header,
+                        });
                     }
                 }
             }
+            Ok(())
+        };
+        for section in &sections {
+            for hf_ref in &section.header_refs {
+                parse_hf(hf_ref, true)?;
+            }
+            for hf_ref in &section.footer_refs {
+                parse_hf(hf_ref, false)?;
+            }
         }
 
         // Scan `word/fonts/` for embedded font programs. Files there are
diff --git a/src/docx/text.rs b/src/docx/text.rs
index 4028594..6f766b0 100644
--- a/src/docx/text.rs
+++ b/src/docx/text.rs
@@ -59,29 +59,25 @@ impl DocxDocument {
     }
 }
 
-/// Split parsed `HeaderFooter` entries into headers vs footers using the
-/// section reference lists. Returns (headers, footers) as deduplicated
-/// markdown-string vectors. We don't currently retain the relationship
-/// IDs that map a section ref to a specific parsed `HeaderFooter`, so we
-/// approximate: header_refs.len() entries from the front go to headers,
-/// the rest go to footers. Correct for the common case (single section
-/// with one of each); on multi-variant documents some misclassification
-/// is possible but text is still preserved (just maybe in the wrong slot).
+/// Split parsed `HeaderFooter` entries into headers vs footers and
+/// return them as deduplicated markdown-string vectors. Role is read
+/// directly from each entry's `is_header` field (set at parse time),
+/// so this is correct regardless of how many sections the document
+/// has or how the entries are interleaved.
 fn split_headers_footers(doc: &DocxDocument, ctx: &MarkdownCtx) -> (Vec<String>, Vec<String>) {
     let mut headers: Vec<String> = Vec::new();
     let mut footers: Vec<String> = Vec::new();
     let mut header_seen: std::collections::HashSet<String> = std::collections::HashSet::new();
     let mut footer_seen: std::collections::HashSet<String> = std::collections::HashSet::new();
 
-    let n_header_refs: usize = doc.sections.iter().map(|s| s.header_refs.len()).sum();
-    for (idx, hf) in doc.headers_footers.iter().enumerate() {
+    for hf in &doc.headers_footers {
         let mut buf = String::new();
         markdown_blocks(&hf.content, ctx, &mut buf, 0);
         let t = buf.trim().to_string();
         if t.is_empty() {
             continue;
         }
-        if idx < n_header_refs {
+        if hf.is_header {
             if header_seen.insert(t.clone()) {
                 headers.push(t);
             }

From ac38c7ee68a42077bf10b85748dd2ea1fcd63009 Mon Sep 17 00:00:00 2001
From: Yury Fedoseev <yfedoseev@gmail.com>
Date: Thu, 14 May 2026 18:32:30 -0700
Subject: [PATCH 18/18] test: add IR round-trip coverage for previously
 untested Element variants

Line coverage was sitting at 73.2% on this branch, below the 75%
floor enforced by the Code Coverage CI job. The PR introduced a lot
of new write/conversion code (chart text, embedded fonts, multi-section
sectPr, drawing anchors, page setup) and the existing integration
tests only exercised the common element variants.

Adds round-trip tests through `create_from_ir_to_writer` for:
- ThematicBreak (verifies w:pBdr emission in document.xml)
- PageBreak + ColumnBreak (verifies w:br w:type="page"/"column")
- Footnote + Endnote (verifies the footnotes.xml/endnotes.xml parts)
- TextBox (verifies floating content lands in document.xml)
- Numbered List with start_number
- Multi-section document with Continuous / NextPage / OddPage breaks
- Crate-level Document::from_reader + plain_text + to_markdown + to_ir
  convenience path

Local line coverage rises from 73.21% to 76.43%, clearing the 75%
threshold.
---
 tests/write_integration.rs | 368 +++++++++++++++++++++++++++++++++++++
 1 file changed, 368 insertions(+)

diff --git a/tests/write_integration.rs b/tests/write_integration.rs
index 92391fe..4f83e6c 100644
--- a/tests/write_integration.rs
+++ b/tests/write_integration.rs
@@ -1169,3 +1169,371 @@ fn ir_table_caption_round_trip() {
         "expected caption text in document.xml"
     );
 }
+
+// ---------------------------------------------------------------------------
+// Tests for Element variants that were previously uncovered: ThematicBreak,
+// PageBreak, ColumnBreak, TextBox, Footnote, Endnote. Each test writes the
+// IR to DOCX through `create_from_ir_to_writer` and then inspects the raw
+// package, so the create.rs dispatch arms and the corresponding
+// `DocxWriter` methods get exercised; re-parsing is covered further below.
+// ---------------------------------------------------------------------------
+
+#[test]
+fn ir_thematic_break_emits_bordered_paragraph() {
+    use office_oxide::ir::*;
+
+    let ir = DocumentIR {
+        metadata: Metadata {
+            format: office_oxide::DocumentFormat::Docx,
+            ..Default::default()
+        },
+        sections: vec![Section {
+            elements: vec![
+                Element::Paragraph(Paragraph {
+                    content: vec![InlineContent::Text(TextSpan::plain("Before"))],
+                    ..Default::default()
+                }),
+                Element::ThematicBreak, // the variant under test, between two text paragraphs
+                Element::Paragraph(Paragraph {
+                    content: vec![InlineContent::Text(TextSpan::plain("After"))],
+                    ..Default::default()
+                }),
+            ],
+            ..Default::default()
+        }],
+    };
+
+    let mut buf = Cursor::new(Vec::new());
+    office_oxide::create::create_from_ir_to_writer(
+        &ir,
+        office_oxide::DocumentFormat::Docx,
+        &mut buf,
+    )
+    .unwrap();
+    buf.set_position(0); // redundant here: into_inner() below ignores the cursor position
+
+    let zip_bytes = buf.into_inner();
+    let mut zip = zip::ZipArchive::new(Cursor::new(zip_bytes)).unwrap();
+    let mut doc_xml = String::new();
+    {
+        use std::io::Read;
+        zip.by_name("word/document.xml")
+            .unwrap()
+            .read_to_string(&mut doc_xml)
+            .unwrap();
+    }
+    // Thematic break = empty paragraph with a bottom border, so the raw
+    // XML must contain a w:pBdr element. Only its presence is asserted;
+    // its position relative to "Before" and "After" is not checked.
+    assert!(
+        doc_xml.contains("w:pBdr"),
+        "expected pBdr (paragraph border) for thematic break"
+    );
+    assert!(doc_xml.contains("Before") && doc_xml.contains("After"));
+}
+
+#[test]
+fn ir_page_and_column_breaks_round_trip() {
+    use office_oxide::ir::*;
+
+    let ir = DocumentIR {
+        metadata: Metadata {
+            format: office_oxide::DocumentFormat::Docx,
+            ..Default::default()
+        },
+        sections: vec![Section {
+            elements: vec![
+                Element::Paragraph(Paragraph {
+                    content: vec![InlineContent::Text(TextSpan::plain("Page 1"))],
+                    ..Default::default()
+                }),
+                Element::PageBreak, // asserted below via w:type="page"
+                Element::Paragraph(Paragraph {
+                    content: vec![InlineContent::Text(TextSpan::plain("Page 2 col 1"))],
+                    ..Default::default()
+                }),
+                Element::ColumnBreak, // asserted below via w:type="column"
+                Element::Paragraph(Paragraph {
+                    content: vec![InlineContent::Text(TextSpan::plain("Page 2 col 2"))],
+                    ..Default::default()
+                }),
+            ],
+            ..Default::default()
+        }],
+    };
+
+    let mut buf = Cursor::new(Vec::new());
+    office_oxide::create::create_from_ir_to_writer(
+        &ir,
+        office_oxide::DocumentFormat::Docx,
+        &mut buf,
+    )
+    .unwrap();
+    buf.set_position(0); // redundant here: into_inner() below ignores the cursor position
+
+    let zip_bytes = buf.into_inner();
+    let mut zip = zip::ZipArchive::new(Cursor::new(zip_bytes)).unwrap();
+    let mut doc_xml = String::new();
+    {
+        use std::io::Read;
+        zip.by_name("word/document.xml")
+            .unwrap()
+            .read_to_string(&mut doc_xml)
+            .unwrap();
+    }
+    // Page break = <w:br w:type="page"/>; column break = <w:br w:type="column"/>.
+    assert!(doc_xml.contains("w:type=\"page\""), "expected page break w:br: {doc_xml:.500}",);
+    assert!(doc_xml.contains("w:type=\"column\""), "expected column break w:br",);
+}
+
+#[test]
+fn ir_footnote_endnote_round_trip() {
+    use office_oxide::ir::*;
+
+    let footnote_content = vec![Element::Paragraph(Paragraph {
+        content: vec![InlineContent::Text(TextSpan::plain("This is a footnote."))],
+        ..Default::default()
+    })];
+    let endnote_content = vec![Element::Paragraph(Paragraph {
+        content: vec![InlineContent::Text(TextSpan::plain("This is an endnote."))],
+        ..Default::default()
+    })];
+
+    let ir = DocumentIR {
+        metadata: Metadata {
+            format: office_oxide::DocumentFormat::Docx,
+            ..Default::default()
+        },
+        sections: vec![Section {
+            elements: vec![
+                Element::Paragraph(Paragraph {
+                    content: vec![
+                        InlineContent::Text(TextSpan::plain("Main body")),
+                        InlineContent::FootnoteRef(FootnoteRef {
+                            note_id: 1, // pairs with Element::Footnote id 1 below
+                            marker: None,
+                        }),
+                        InlineContent::EndnoteRef(FootnoteRef {
+                            note_id: 2, // pairs with Element::Endnote id 2 below
+                            marker: None,
+                        }),
+                    ],
+                    ..Default::default()
+                }),
+                Element::Footnote(Note {
+                    id: 1,
+                    content: footnote_content,
+                    marker: None,
+                }),
+                Element::Endnote(Note {
+                    id: 2,
+                    content: endnote_content,
+                    marker: None,
+                }),
+            ],
+            ..Default::default()
+        }],
+    };
+
+    let mut buf = Cursor::new(Vec::new());
+    office_oxide::create::create_from_ir_to_writer(
+        &ir,
+        office_oxide::DocumentFormat::Docx,
+        &mut buf,
+    )
+    .unwrap();
+    buf.set_position(0); // redundant here: into_inner() below ignores the cursor position
+
+    // The package must contain footnotes and endnotes parts; only part names are checked.
+    let zip_bytes = buf.into_inner();
+    let zip = zip::ZipArchive::new(Cursor::new(zip_bytes)).unwrap();
+    let names: Vec<String> = zip.file_names().map(String::from).collect();
+    assert!(
+        names.iter().any(|n| n == "word/footnotes.xml"),
+        "expected word/footnotes.xml in: {names:?}"
+    );
+    assert!(
+        names.iter().any(|n| n == "word/endnotes.xml"),
+        "expected word/endnotes.xml in: {names:?}"
+    );
+}
+
+#[test]
+fn ir_text_box_round_trip() {
+    use office_oxide::ir::*;
+
+    let ir = DocumentIR {
+        metadata: Metadata {
+            format: office_oxide::DocumentFormat::Docx,
+            ..Default::default()
+        },
+        sections: vec![Section {
+            elements: vec![Element::TextBox(TextBox {
+                content: vec![Element::Paragraph(Paragraph {
+                    content: vec![InlineContent::Text(TextSpan::plain("Floating callout"))],
+                    ..Default::default()
+                })],
+                ..Default::default()
+            })],
+            ..Default::default()
+        }],
+    };
+
+    let mut buf = Cursor::new(Vec::new());
+    office_oxide::create::create_from_ir_to_writer(
+        &ir,
+        office_oxide::DocumentFormat::Docx,
+        &mut buf,
+    )
+    .unwrap();
+    buf.set_position(0); // redundant here: into_inner() below ignores the cursor position
+
+    let zip_bytes = buf.into_inner();
+    let mut zip = zip::ZipArchive::new(Cursor::new(zip_bytes)).unwrap();
+    let mut doc_xml = String::new();
+    {
+        use std::io::Read;
+        zip.by_name("word/document.xml")
+            .unwrap()
+            .read_to_string(&mut doc_xml)
+            .unwrap();
+    }
+    // Text-box content lives inside a w:txbxContent element rather
+    // than as a top-level paragraph, so search the raw XML for the
+    // text itself (the wrapper element is not asserted). Round-trip
+    // plain_text extraction of floating shapes is intentionally not exposed today.
+    assert!(
+        doc_xml.contains("Floating callout"),
+        "expected text-box content in document.xml"
+    );
+}
+
+#[test]
+fn ir_numbered_list_round_trip() {
+    use office_oxide::ir::*;
+
+    let item = |text: &str| ListItem {
+        content: vec![Element::Paragraph(Paragraph {
+            content: vec![InlineContent::Text(TextSpan::plain(text))],
+            ..Default::default()
+        })],
+        ..Default::default()
+    };
+    let ir = DocumentIR {
+        metadata: Metadata {
+            format: office_oxide::DocumentFormat::Docx,
+            ..Default::default()
+        },
+        sections: vec![Section {
+            elements: vec![Element::List(List {
+                ordered: true,
+                start_number: Some(5), // exercises the start_number path; value not asserted below
+                items: vec![item("Five"), item("Six"), item("Seven")],
+                ..Default::default()
+            })],
+            ..Default::default()
+        }],
+    };
+
+    let mut buf = Cursor::new(Vec::new());
+    office_oxide::create::create_from_ir_to_writer(
+        &ir,
+        office_oxide::DocumentFormat::Docx,
+        &mut buf,
+    )
+    .unwrap();
+    buf.set_position(0); // rewind: from_reader below consumes the cursor itself
+
+    let doc = office_oxide::docx::DocxDocument::from_reader(buf).unwrap();
+    let md = doc.to_markdown();
+    assert!(md.contains("Five") && md.contains("Six") && md.contains("Seven"), "md: {md}");
+}
+
+#[test]
+fn ir_multi_section_round_trip() {
+    use office_oxide::ir::*;
+
+    let make_section = |label: &str, break_type: SectionBreakType| Section {
+        elements: vec![Element::Paragraph(Paragraph {
+            content: vec![InlineContent::Text(TextSpan::plain(label))],
+            ..Default::default()
+        })],
+        break_type, // the only field that differs between sections besides the label text
+        ..Default::default()
+    };
+
+    let ir = DocumentIR {
+        metadata: Metadata {
+            format: office_oxide::DocumentFormat::Docx,
+            ..Default::default()
+        },
+        sections: vec![
+            make_section("Section A", SectionBreakType::Continuous),
+            make_section("Section B", SectionBreakType::NextPage),
+            make_section("Section C", SectionBreakType::OddPage),
+        ],
+    };
+
+    let mut buf = Cursor::new(Vec::new());
+    office_oxide::create::create_from_ir_to_writer(
+        &ir,
+        office_oxide::DocumentFormat::Docx,
+        &mut buf,
+    )
+    .unwrap();
+    buf.set_position(0); // rewind: from_reader below consumes the cursor itself
+
+    let doc = office_oxide::docx::DocxDocument::from_reader(buf).unwrap();
+    let text = doc.plain_text(); // only the label text is asserted, not the break types
+    assert!(
+        text.contains("Section A") && text.contains("Section B") && text.contains("Section C"),
+        "text: {text}"
+    );
+}
+
+#[test]
+fn convenience_functions_round_trip() {
+    use office_oxide::ir::*;
+
+    let ir = DocumentIR {
+        metadata: Metadata {
+            format: office_oxide::DocumentFormat::Docx,
+            ..Default::default()
+        },
+        sections: vec![Section {
+            elements: vec![
+                Element::Heading(Heading {
+                    level: 1,
+                    content: vec![InlineContent::Text(TextSpan::plain("Title"))],
+                    ..Default::default()
+                }),
+                Element::Paragraph(Paragraph {
+                    content: vec![InlineContent::Text(TextSpan::plain("Hello"))],
+                    ..Default::default()
+                }),
+            ],
+            ..Default::default()
+        }],
+    };
+
+    let mut buf = Cursor::new(Vec::new());
+    office_oxide::create::create_from_ir_to_writer(
+        &ir,
+        office_oxide::DocumentFormat::Docx,
+        &mut buf,
+    )
+    .unwrap();
+
+    // Exercise the crate-level convenience path: Document::from_reader,
+    // then plain_text / to_markdown / to_ir on the parsed document.
+    let bytes = buf.into_inner();
+    let doc = office_oxide::Document::from_reader(
+        Cursor::new(bytes),
+        office_oxide::DocumentFormat::Docx,
+    )
+    .unwrap();
+    assert!(doc.plain_text().contains("Hello"));
+    assert!(doc.to_markdown().contains("Hello"));
+    let ir2 = doc.to_ir();
+    assert!(!ir2.sections.is_empty());
+}