Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions MANUAL.txt
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ header when requesting a document from a URL:
- `twiki` ([TWiki markup])
- `typst` ([typst])
- `vimwiki` ([Vimwiki])
- `xlsx` ([Excel spreadsheet][XLSX])
- `xml` (XML version of native AST)
- the path of a custom Lua reader, see [Custom readers and writers] below
:::
Expand Down Expand Up @@ -518,6 +519,7 @@ header when requesting a document from a URL:
[DokuWiki markup]: https://www.dokuwiki.org/dokuwiki
[ZimWiki markup]: https://zim-wiki.org/manual/Help/Wiki_Syntax.html
[XWiki markup]: https://www.xwiki.org/xwiki/bin/view/Documentation/UserGuide/Features/XWikiSyntax/
[XLSX]: https://en.wikipedia.org/wiki/Microsoft_Excel#File_formats
[Vimdoc]: https://vimhelp.org/helphelp.txt.html#help-writing
[TWiki markup]: https://twiki.org/cgi-bin/view/TWiki/TextFormattingRules
[TikiWiki markup]: https://doc.tiki.org/Wiki-Syntax-Text#The_Markup_Language_Wiki-Syntax
Expand Down
8 changes: 8 additions & 0 deletions pandoc.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,8 @@ extra-source-files:
test/odt/odt/*.odt
test/odt/markdown/*.md
test/odt/native/*.native
test/xlsx-reader/*.xlsx
test/xlsx-reader/*.native
test/pod-reader.pod
test/vimdoc/*.markdown
test/vimdoc/*.vimdoc
Expand Down Expand Up @@ -610,6 +612,7 @@ library
Text.Pandoc.Readers.TikiWiki,
Text.Pandoc.Readers.Txt2Tags,
Text.Pandoc.Readers.Docx,
Text.Pandoc.Readers.Xlsx,
Text.Pandoc.Readers.ODT,
Text.Pandoc.Readers.EPUB,
Text.Pandoc.Readers.Muse,
Expand Down Expand Up @@ -718,6 +721,10 @@ library
Text.Pandoc.Readers.Docx.Util,
Text.Pandoc.Readers.Docx.Symbols,
Text.Pandoc.Readers.Docx.Fields,
Text.Pandoc.Readers.OOXML.Shared,
Text.Pandoc.Readers.Xlsx.Parse,
Text.Pandoc.Readers.Xlsx.Cells,
Text.Pandoc.Readers.Xlsx.Sheets,
Text.Pandoc.Readers.HTML.Parsing,
Text.Pandoc.Readers.HTML.Table,
Text.Pandoc.Readers.HTML.TagCategories,
Expand Down Expand Up @@ -854,6 +861,7 @@ test-suite test-pandoc
Tests.Readers.RST
Tests.Readers.RTF
Tests.Readers.Docx
Tests.Readers.Xlsx
Tests.Readers.ODT
Tests.Readers.Txt2Tags
Tests.Readers.EPUB
Expand Down
3 changes: 3 additions & 0 deletions src/Text/Pandoc/Readers.hs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ module Text.Pandoc.Readers
Reader (..)
, readers
, readDocx
, readXlsx
, readODT
, readMarkdown
, readCommonMark
Expand Down Expand Up @@ -87,6 +88,7 @@ import Text.Pandoc.Readers.Markdown
import Text.Pandoc.Readers.Creole
import Text.Pandoc.Readers.DocBook
import Text.Pandoc.Readers.Docx
import Text.Pandoc.Readers.Xlsx
import Text.Pandoc.Readers.DokuWiki
import Text.Pandoc.Readers.EPUB
import Text.Pandoc.Readers.FB2
Expand Down Expand Up @@ -157,6 +159,7 @@ readers = [("native" , TextReader readNative)
,("twiki" , TextReader readTWiki)
,("tikiwiki" , TextReader readTikiWiki)
,("docx" , ByteStringReader readDocx)
,("xlsx" , ByteStringReader readXlsx)
,("odt" , ByteStringReader readODT)
,("t2t" , TextReader readTxt2Tags)
,("epub" , ByteStringReader readEPUB)
Expand Down
48 changes: 4 additions & 44 deletions src/Text/Pandoc/Readers/Docx/Util.hs
Original file line number Diff line number Diff line change
Expand Up @@ -24,51 +24,11 @@ module Text.Pandoc.Readers.Docx.Util (
, extractChildren
) where

import qualified Data.Text as T
import Data.Text (Text)
import Text.Pandoc.XML.Light
import qualified Data.Map as M
import Data.List (partition)

type NameSpaces = M.Map Text Text

elemToNameSpaces :: Element -> NameSpaces
elemToNameSpaces = foldr (\(Attr qn val) ->
case qn of
QName s _ (Just "xmlns") -> M.insert s val
_ -> id) mempty . elAttribs

elemName :: NameSpaces -> Text -> Text -> QName
elemName ns prefix name =
QName name (M.lookup prefix ns)
(if T.null prefix then Nothing else Just prefix)

isElem :: NameSpaces -> Text -> Text -> Element -> Bool
isElem ns prefix name element =
let ns' = ns <> elemToNameSpaces element
in qName (elName element) == name &&
qURI (elName element) == M.lookup prefix ns'

findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
findChildByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findChild (elemName ns' pref name) el

findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element]
findChildrenByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findChildren (elemName ns' pref name) el

-- | Like 'findChildrenByName', but searches descendants.
findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
findElementByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findElement (elemName ns' pref name) el

findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
findAttrByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findAttr (elemName ns' pref name) el
import Text.Pandoc.XML.Light
import Text.Pandoc.Readers.OOXML.Shared
(NameSpaces, elemName, isElem, elemToNameSpaces,
findChildByName, findChildrenByName, findElementByName, findAttrByName)


-- | Removes child elements that satisfy a given condition.
Expand Down
95 changes: 95 additions & 0 deletions src/Text/Pandoc/Readers/OOXML/Shared.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.OOXML.Shared
Copyright : © 2025 Anton Antic
License : GNU GPL, version 2 or above
Maintainer : Anton Antic <[email protected]>
Stability : alpha
Portability : portable
Shared utilities for Office Open XML (OOXML) readers (DOCX, PPTX).
Provides common functions for ZIP archive handling, XML parsing,
namespace management, and DrawingML parsing.
-}
module Text.Pandoc.Readers.OOXML.Shared
( -- * Constants
emusPerInch
, emuToInches
, inchesToEmu
-- * Types
, NameSpaces
, elemName
, elemToNameSpaces
, isElem
, findChildByName
, findChildrenByName
, findElementByName
, findAttrByName
) where

import qualified Data.Map as M
import qualified Data.Text as T
import Data.Text (Text)
import Text.Pandoc.XML.Light

-- | Type alias for namespace mappings
type NameSpaces = M.Map Text Text

-- | English Metric Units per inch
-- 1 inch = 914400 EMUs (used in OOXML for dimensions)
emusPerInch :: Integer
emusPerInch = 914400

-- | Convert EMUs to inches
emuToInches :: Integer -> Double
emuToInches n = fromIntegral n / fromIntegral emusPerInch

-- | Convert inches to EMUs
inchesToEmu :: Double -> Integer
inchesToEmu n = round (n * fromIntegral emusPerInch)

-- | Extract namespace declarations from element attributes
elemToNameSpaces :: Element -> NameSpaces
elemToNameSpaces = foldr (\(Attr qn val) ->
case qn of
QName s _ (Just "xmlns") -> M.insert s val
_ -> id) mempty . elAttribs

-- | Create a qualified name from namespace map, prefix, and local name
elemName :: NameSpaces -> Text -> Text -> QName
elemName ns prefix name =
QName name
(M.lookup prefix ns)
(if T.null prefix then Nothing else Just prefix)

-- | Check if element matches namespace prefix and local name
isElem :: NameSpaces -> Text -> Text -> Element -> Bool
isElem ns prefix name element =
let ns' = ns <> elemToNameSpaces element
in qName (elName element) == name &&
qURI (elName element) == M.lookup prefix ns'

-- | Find first child element matching namespace and name
findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
findChildByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findChild (elemName ns' pref name) el

-- | Find all children matching namespace and name
findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element]
findChildrenByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findChildren (elemName ns' pref name) el

-- | Find element anywhere in descendants matching namespace and name
findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
findElementByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findElement (elemName ns' pref name) el

-- | Find attribute value by namespace prefix and name
findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
findAttrByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findAttr (elemName ns' pref name) el
40 changes: 40 additions & 0 deletions src/Text/Pandoc/Readers/Xlsx.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.Xlsx
Copyright : © 2025 Anton Antic
License : GNU GPL, version 2 or above
Maintainer : Anton Antic <[email protected]>
Stability : alpha
Portability : portable
Conversion of XLSX (Excel spreadsheet) documents to 'Pandoc' document.
-}
module Text.Pandoc.Readers.Xlsx (readXlsx) where

import qualified Data.ByteString.Lazy as B
import qualified Data.Text as T
import Codec.Archive.Zip (toArchiveOrFail)
import Control.Monad.Except (throwError)
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import Text.Pandoc.Definition (Pandoc(..))
import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Options (ReaderOptions)
import Text.Pandoc.Readers.Xlsx.Parse (archiveToXlsx)
import Text.Pandoc.Readers.Xlsx.Sheets (xlsxToOutput)

-- | Read XLSX file into Pandoc AST
readXlsx :: PandocMonad m => ReaderOptions -> B.ByteString -> m Pandoc
readXlsx opts bytes =
case toArchiveOrFail bytes of
Right archive ->
case archiveToXlsx archive of
Right xlsx -> do
let (meta, blocks) = xlsxToOutput opts xlsx
return $ Pandoc meta blocks
Left err ->
throwError $ PandocParseError $ "Failed to parse XLSX: " <> err

Left err ->
throwError $ PandocParseError $
"Failed to unpack XLSX archive: " <> T.pack err
63 changes: 63 additions & 0 deletions src/Text/Pandoc/Readers/Xlsx/Cells.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.Xlsx.Cells
Copyright : © 2025 Anton Antic
License : GNU GPL, version 2 or above
Maintainer : Anton Antic <[email protected]>
Stability : alpha
Portability : portable
Cell types and parsing for XLSX.
-}
module Text.Pandoc.Readers.Xlsx.Cells
( CellRef(..)
, XlsxCell(..)
, CellValue(..)
, parseCellRef
) where

import qualified Data.Text as T
import Data.Text (Text)
import Data.Char (ord, isAlpha)
import Text.Read (readMaybe)

-- | Cell reference (A1 notation)
data CellRef = CellRef
{ cellRefCol :: Int -- 1-based (A=1, B=2, ..., AA=27)
, cellRefRow :: Int -- 1-based
} deriving (Show, Eq, Ord)

-- | Cell value types
data CellValue
= TextValue Text
| NumberValue Double
| EmptyValue
deriving (Show, Eq)

-- | Parsed cell
data XlsxCell = XlsxCell
{ cellRef :: CellRef
, cellValue :: CellValue
, cellBold :: Bool
, cellItalic :: Bool
} deriving (Show)

-- | Parse cell reference (A1 → CellRef)
parseCellRef :: Text -> Either Text CellRef
parseCellRef ref = do
let (colStr, rowStr) = T.span isAlpha ref

row <- case readMaybe (T.unpack rowStr) of
Just r | r > 0 -> Right r
_ -> Left $ "Invalid row: " <> rowStr

col <- parseColumn colStr

return $ CellRef col row

-- | Parse column (A=1, Z=26, AA=27, etc.)
parseColumn :: Text -> Either Text Int
parseColumn colStr
| T.null colStr = Left "Empty column"
| otherwise = Right $ T.foldl' (\acc c -> acc * 26 + (ord c - ord 'A' + 1)) 0 colStr
Loading