jgm · aantich · Nov 9, 2025 · Nov 9, 2025 · Nov 9, 2025
diff --git a/MANUAL.txt b/MANUAL.txt
@@ -282,6 +282,7 @@ header when requesting a document from a URL:
     - `twiki` ([TWiki markup])
     - `typst` ([typst])
     - `vimwiki` ([Vimwiki])
+    - `xlsx` ([Excel spreadsheet][XLSX])
     - `xml` (XML version of native AST)
     - the path of a custom Lua reader, see [Custom readers and writers] below
     :::
@@ -518,6 +519,7 @@ header when requesting a document from a URL:
 [DokuWiki markup]: https://www.dokuwiki.org/dokuwiki
 [ZimWiki markup]: https://zim-wiki.org/manual/Help/Wiki_Syntax.html
 [XWiki markup]: https://www.xwiki.org/xwiki/bin/view/Documentation/UserGuide/Features/XWikiSyntax/
+[XLSX]: https://en.wikipedia.org/wiki/Microsoft_Excel#File_formats
 [Vimdoc]: https://vimhelp.org/helphelp.txt.html#help-writing
 [TWiki markup]: https://twiki.org/cgi-bin/view/TWiki/TextFormattingRules
 [TikiWiki markup]: https://doc.tiki.org/Wiki-Syntax-Text#The_Markup_Language_Wiki-Syntax

diff --git a/pandoc.cabal b/pandoc.cabal
@@ -438,6 +438,8 @@ extra-source-files:
                  test/odt/odt/*.odt
                  test/odt/markdown/*.md
                  test/odt/native/*.native
+                 test/xlsx-reader/*.xlsx
+                 test/xlsx-reader/*.native
                  test/pod-reader.pod
                  test/vimdoc/*.markdown
                  test/vimdoc/*.vimdoc
@@ -610,6 +612,7 @@ library
                    Text.Pandoc.Readers.TikiWiki,
                    Text.Pandoc.Readers.Txt2Tags,
                    Text.Pandoc.Readers.Docx,
+                   Text.Pandoc.Readers.Xlsx,
                    Text.Pandoc.Readers.ODT,
                    Text.Pandoc.Readers.EPUB,
                    Text.Pandoc.Readers.Muse,
@@ -718,6 +721,10 @@ library
                    Text.Pandoc.Readers.Docx.Util,
                    Text.Pandoc.Readers.Docx.Symbols,
                    Text.Pandoc.Readers.Docx.Fields,
+                   Text.Pandoc.Readers.OOXML.Shared,
+                   Text.Pandoc.Readers.Xlsx.Parse,
+                   Text.Pandoc.Readers.Xlsx.Cells,
+                   Text.Pandoc.Readers.Xlsx.Sheets,
                    Text.Pandoc.Readers.HTML.Parsing,
                    Text.Pandoc.Readers.HTML.Table,
                    Text.Pandoc.Readers.HTML.TagCategories,
@@ -854,6 +861,7 @@ test-suite test-pandoc
                   Tests.Readers.RST
                   Tests.Readers.RTF
                   Tests.Readers.Docx
+                  Tests.Readers.Xlsx
                   Tests.Readers.ODT
                   Tests.Readers.Txt2Tags
                   Tests.Readers.EPUB

diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
@@ -26,6 +26,7 @@ module Text.Pandoc.Readers
     Reader (..)
   , readers
   , readDocx
+  , readXlsx
   , readODT
   , readMarkdown
   , readCommonMark
@@ -87,6 +88,7 @@ import Text.Pandoc.Readers.Markdown
 import Text.Pandoc.Readers.Creole
 import Text.Pandoc.Readers.DocBook
 import Text.Pandoc.Readers.Docx
+import Text.Pandoc.Readers.Xlsx
 import Text.Pandoc.Readers.DokuWiki
 import Text.Pandoc.Readers.EPUB
 import Text.Pandoc.Readers.FB2
@@ -157,6 +159,7 @@ readers = [("native"       , TextReader readNative)
           ,("twiki"        , TextReader readTWiki)
           ,("tikiwiki"     , TextReader readTikiWiki)
           ,("docx"         , ByteStringReader readDocx)
+          ,("xlsx"         , ByteStringReader readXlsx)
           ,("odt"          , ByteStringReader readODT)
           ,("t2t"          , TextReader readTxt2Tags)
           ,("epub"         , ByteStringReader readEPUB)

diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs
@@ -24,51 +24,11 @@ module Text.Pandoc.Readers.Docx.Util (
                                       , extractChildren
                                       ) where
 
-import qualified Data.Text as T
-import Data.Text (Text)
-import Text.Pandoc.XML.Light
-import qualified Data.Map as M
 import Data.List (partition)
-
-type NameSpaces = M.Map Text Text
-
-elemToNameSpaces :: Element -> NameSpaces
-elemToNameSpaces = foldr (\(Attr qn val) ->
-                             case qn of
-                               QName s _ (Just "xmlns") -> M.insert s val
-                               _ -> id) mempty . elAttribs
-
-elemName :: NameSpaces -> Text -> Text -> QName
-elemName ns prefix name =
-  QName name (M.lookup prefix ns)
-             (if T.null prefix then Nothing else Just prefix)
-
-isElem :: NameSpaces -> Text -> Text -> Element -> Bool
-isElem ns prefix name element =
-  let ns' = ns <> elemToNameSpaces element
-  in qName (elName element) == name &&
-     qURI (elName element) == M.lookup prefix ns'
-
-findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
-findChildByName ns pref name el =
-  let ns' = ns <> elemToNameSpaces el
-  in  findChild (elemName ns' pref name) el
-
-findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element]
-findChildrenByName ns pref name el =
-  let ns' = ns <> elemToNameSpaces el
-  in  findChildren (elemName ns' pref name) el
-
--- | Like 'findChildrenByName', but searches descendants.
-findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
-findElementByName ns pref name el =
-  let ns' = ns <> elemToNameSpaces el
-  in  findElement (elemName ns' pref name) el
-
-findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
-findAttrByName ns pref name el =
-  let ns' = ns <> elemToNameSpaces el
-  in  findAttr (elemName ns' pref name) el
+import Text.Pandoc.XML.Light
+import Text.Pandoc.Readers.OOXML.Shared
+  (NameSpaces, elemName, isElem, elemToNameSpaces,
+   findChildByName, findChildrenByName, findElementByName, findAttrByName)
 
 
 -- | Removes child elements that satisfy a given condition.

diff --git a/src/Text/Pandoc/Readers/OOXML/Shared.hs b/src/Text/Pandoc/Readers/OOXML/Shared.hs
@@ -0,0 +1,95 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.OOXML.Shared
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Shared utilities for Office Open XML (OOXML) readers such as DOCX and XLSX.
+Provides EMU (English Metric Unit) conversion helpers and
+namespace-aware lookup of XML elements and attributes.
+-}
+module Text.Pandoc.Readers.OOXML.Shared
+  ( -- * EMU units and conversions
+    emusPerInch
+  , emuToInches
+  , inchesToEmu
+    -- * Namespaces and name-based lookup
+  , NameSpaces
+  , elemName
+  , elemToNameSpaces
+  , isElem
+  , findChildByName
+  , findChildrenByName
+  , findElementByName
+  , findAttrByName
+  ) where
+
+import qualified Data.Map as M
+import qualified Data.Text as T
+import Data.Text (Text)
+import Text.Pandoc.XML.Light
+
+-- | Maps an XML namespace prefix to its namespace URI.
+type NameSpaces = M.Map Text Text
+
+-- | Number of English Metric Units (EMUs) in one inch.
+-- OOXML measures dimensions in EMUs: 914400 EMUs = 1 inch.
+emusPerInch :: Integer
+emusPerInch = 914400
+
+-- | Express a length given in EMUs as a (fractional) number of inches.
+emuToInches :: Integer -> Double
+emuToInches = (/ fromIntegral emusPerInch) . fromIntegral
+
+-- | Express a length given in inches as EMUs, using 'round' on the product.
+inchesToEmu :: Double -> Integer
+inchesToEmu = round . (* fromIntegral emusPerInch)
+
+-- | Collect the @xmlns:prefix="uri"@ declarations found among an
+-- element's attributes into a prefix-to-URI map.
+elemToNameSpaces :: Element -> NameSpaces
+elemToNameSpaces el = foldr addDecl mempty (elAttribs el)
+  where addDecl (Attr (QName pfx _ (Just "xmlns")) uri) = M.insert pfx uri
+        addDecl _                                       = id
+
+-- | Build the 'QName' for a local name under a prefix: the URI is looked
+-- up in the namespace map, and an empty prefix is stored as 'Nothing'.
+elemName :: NameSpaces -> Text -> Text -> QName
+elemName ns prefix name = QName name uri prefix'
+  where uri     = M.lookup prefix ns
+        prefix' = if T.null prefix then Nothing else Just prefix
+
+-- | Test whether an element has the given local name and the namespace URI
+-- the prefix maps to (looked up in @ns@ plus its own xmlns declarations).
+isElem :: NameSpaces -> Text -> Text -> Element -> Bool
+isElem ns prefix name element = qName qn == name && qURI qn == expectedURI
+  where qn          = elName element
+        expectedURI = M.lookup prefix (ns <> elemToNameSpaces element)
+
+-- | First child element with the given prefix and local name, if any.
+-- Prefixes resolve against @ns@ plus the element's own xmlns declarations.
+findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
+findChildByName ns pref name el = findChild wanted el
+  where wanted = elemName (ns <> elemToNameSpaces el) pref name
+
+-- | Every child element with the given prefix and local name.
+-- Prefixes resolve against @ns@ plus the element's own xmlns declarations.
+findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element]
+findChildrenByName ns pref name el = findChildren wanted el
+  where wanted = elemName (ns <> elemToNameSpaces el) pref name
+
+-- | First element among the descendants with the given prefix and local name.
+-- Prefixes resolve against @ns@ plus the element's own xmlns declarations.
+findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
+findElementByName ns pref name el = findElement wanted el
+  where wanted = elemName (ns <> elemToNameSpaces el) pref name
+
+-- | Value of the attribute with the given prefix and local name, if present.
+-- Prefixes resolve against @ns@ plus the element's own xmlns declarations.
+findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
+findAttrByName ns pref name el = findAttr wanted el
+  where wanted = elemName (ns <> elemToNameSpaces el) pref name
diff --git a/src/Text/Pandoc/Readers/Xlsx.hs b/src/Text/Pandoc/Readers/Xlsx.hs
@@ -0,0 +1,40 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Xlsx
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Conversion of XLSX (Excel spreadsheet) documents to 'Pandoc' document.
+-}
+module Text.Pandoc.Readers.Xlsx (readXlsx) where
+
+import qualified Data.ByteString.Lazy as B
+import qualified Data.Text as T
+import Codec.Archive.Zip (toArchiveOrFail)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Class.PandocMonad (PandocMonad)
+import Text.Pandoc.Definition (Pandoc(..))
+import Text.Pandoc.Error (PandocError(..))
+import Text.Pandoc.Options (ReaderOptions)
+import Text.Pandoc.Readers.Xlsx.Parse (archiveToXlsx)
+import Text.Pandoc.Readers.Xlsx.Sheets (xlsxToOutput)
+
+-- | Convert the bytes of an XLSX file into a 'Pandoc' document.
+--
+-- Raises 'PandocParseError' when the bytes cannot be unpacked as an
+-- archive or when 'archiveToXlsx' rejects the archive's contents.
+readXlsx :: PandocMonad m => ReaderOptions -> B.ByteString -> m Pandoc
+readXlsx opts bytes = do
+  archive <- either unpackFailure return (toArchiveOrFail bytes)
+  workbook <- either parseFailure return (archiveToXlsx archive)
+  return $ uncurry Pandoc (xlsxToOutput opts workbook)
+  where
+    -- Both failure paths are reported as parse errors with a fixed prefix.
+    unpackFailure msg = throwError $ PandocParseError $
+      "Failed to unpack XLSX archive: " <> T.pack msg
+    parseFailure msg = throwError $ PandocParseError $
+      "Failed to parse XLSX: " <> msg
diff --git a/src/Text/Pandoc/Readers/Xlsx/Cells.hs b/src/Text/Pandoc/Readers/Xlsx/Cells.hs
@@ -0,0 +1,63 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+   Module      : Text.Pandoc.Readers.Xlsx.Cells
+   Copyright   : © 2025 Anton Antic
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Anton Antic <[email protected]>
+   Stability   : alpha
+   Portability : portable
+
+Cell types and parsing for XLSX.
+-}
+module Text.Pandoc.Readers.Xlsx.Cells
+  ( CellRef(..)
+  , XlsxCell(..)
+  , CellValue(..)
+  , parseCellRef
+  ) where
+
+import qualified Data.Text as T
+import Data.Text (Text)
+import Data.Char (ord, isAlpha)
+import Text.Read (readMaybe)
+
+-- | Position of a cell, as written in A1 notation.
+data CellRef = CellRef
+  { cellRefCol :: Int    -- ^ Column number, 1-based (A=1, B=2, ..., AA=27)
+  , cellRefRow :: Int    -- ^ Row number, 1-based
+  } deriving (Show, Eq, Ord)
+
+-- | Value held by a cell: text, a number, or nothing.
+data CellValue
+  = TextValue Text       -- ^ Textual content
+  | NumberValue Double   -- ^ Numeric content
+  | EmptyValue           -- ^ No value
+  deriving (Show, Eq)
+
+-- | A parsed cell: its position, its value, and bold/italic flags.
+data XlsxCell = XlsxCell
+  { cellRef :: CellRef       -- ^ Position of the cell
+  , cellValue :: CellValue   -- ^ Content of the cell
+  , cellBold :: Bool         -- ^ Bold flag (presumably from the cell style; TODO confirm)
+  , cellItalic :: Bool       -- ^ Italic flag (presumably from the cell style; TODO confirm)
+  } deriving (Show)
+
+-- | Parse an A1-style reference such as @B12@ into a 'CellRef'. The row must
+-- be plain decimal digits ('readMaybe' alone also accepts e.g. @(5)@ or @0x10@).
+parseCellRef :: Text -> Either Text CellRef
+parseCellRef ref = do
+  let (colStr, rowStr) = T.span isAlpha ref
+  row <- case readMaybe (T.unpack rowStr) of
+    Just r | r > 0, T.all (\c -> c >= '0' && c <= '9') rowStr -> Right r
+    _ -> Left $ "Invalid row: " <> rowStr
+  (\col -> CellRef col row) <$> parseColumn colStr
+
+-- | Parse column letters (A=1, Z=26, AA=27, ...); ASCII only, case-insensitive.
+parseColumn :: Text -> Either Text Int
+parseColumn colStr
+  | T.null colStr = Left "Empty column"
+  | T.any (not . asciiLetter) colStr = Left $ "Invalid column: " <> colStr
+  | otherwise = Right $ T.foldl' step 0 (T.toUpper colStr)
+  where asciiLetter c = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
+        step acc c = acc * 26 + (ord c - ord 'A' + 1)