Let hOCR processors report whether they changed the xml tree, save the file
once in Processor.ProcessHocrXml instead of inside HocrHeaderFixer, and add
WordAlignmentFixer.

Review changes folded into this patch:
  * HocrHeaderFixer.Process: parenthesize the `??` fallback. `+` binds tighter
    than `??`, so the file-name fallback could never be selected.
  * Processor.ProcessHocrXml: dispose the XmlWriter with `using` so it is
    released even when Save throws.
  * WordAlignmentFixer.Process: skip lines without a parsable bbox or without
    characters (previously a crash / division by zero), tolerate words without
    a title attribute, keep pixels-per-character fractional so truncation does
    not accumulate along the line, enumerate the words once, and return true
    only when a bbox actually changed.

NOTE(review): this patch was rebuilt from a copy in which line breaks,
indentation and angle-bracket markup had been lost. Indentation (4 spaces
assumed), the whitespace-only -/+ pairs and the `index` hashes could not be
recovered exactly; confirm against the working tree and apply with
whitespace-tolerant matching. The csproj hunk at the end is incomplete (element
tags and trailing context are missing) and is reproduced as found.

diff --git a/HocrHeaderFixer.cs b/HocrHeaderFixer.cs
index beb2e07..74a58cf 100644
--- a/HocrHeaderFixer.cs
+++ b/HocrHeaderFixer.cs
@@ -19,20 +19,13 @@ public void Init()
         Console.WriteLine("Fixing hOCR file headers...");
     }
 
-    public void Process(string hocrFile, XDocument hocrXml)
+    public bool Process(string hocrFile, XDocument hocrXml)
     {
         XNamespace ns = "http://www.w3.org/1999/xhtml";
         var head = hocrXml.Element(ns + "html").Element(ns + "head");
-        head.Element(ns + "title").Value = "Image: " + FileName ?? Regex.Replace(Path.GetFileName(hocrFile), "_HOCR.shtml$", "_JP2.jpg");
+        // '+' binds tighter than '??', so the fallback has to be parenthesized to ever apply
+        head.Element(ns + "title").Value = "Image: " + (FileName ?? Regex.Replace(Path.GetFileName(hocrFile), "_HOCR.shtml$", "_JP2.jpg"));
         head.Add(new XElement(ns + "meta", new XAttribute("name", "ocr-system"), new XAttribute("content", HtrId.HasValue ? $"Transkribus-HtrId:{HtrId}" : "Transkribus")));
-        using var fileStream = File.Open(hocrFile, FileMode.Truncate);
-        var writer = XmlWriter.Create(fileStream, new XmlWriterSettings
-        {
-            // need to specify false here to stop it from emitting a byte order mark
-            Encoding = new UTF8Encoding(false),
-            Indent = true
-        });
-        hocrXml.Save(writer);
-        writer.Close();
+        return true;
     }
 }
\ No newline at end of file
diff --git a/IHocrXmlProcessor.cs b/IHocrXmlProcessor.cs
index 874c291..eed261f 100644
--- a/IHocrXmlProcessor.cs
+++ b/IHocrXmlProcessor.cs
@@ -2,6 +2,16 @@
 
 interface IHocrXmlProcessor
 {
+    /// <summary>
+    /// Used to set up any initial state, such as creating directories
+    /// </summary>
     void Init();
-    void Process(string hocrFile, XDocument hocrXml);
+
+    /// <summary>
+    /// Receives each hOCR xml tree for processing
+    /// </summary>
+    /// <param name="hocrFile">Path to the hOCR file</param>
+    /// <param name="hocrXml">Xml tree representing the hOCR file</param>
+    /// <returns>True if the xml tree was modified and needs to be saved back to the file.</returns>
+    bool Process(string hocrFile, XDocument hocrXml);
 }
diff --git a/OcrGenerator.cs b/OcrGenerator.cs
index 3f11e9f..e90858f 100644
--- a/OcrGenerator.cs
+++ b/OcrGenerator.cs
@@ -16,14 +16,14 @@ public void Init()
         Directory.CreateDirectory(OcrDirectory);
     }
 
-    public void Process(string hocrFile, XDocument hocrXml)
+    public bool Process(string hocrFile, XDocument hocrXml)
     {
         var text = new StringBuilder();
         XNamespace ns = "http://www.w3.org/1999/xhtml";
         foreach (var paragraph in hocrXml.Descendants(ns + "p"))
         {
-            var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line");
-            foreach (var line in lines)
+            var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line");
+            foreach (var line in lines)
             {
                 var words = line.Elements(ns + "span").Select(span => span.Value);
                 text.AppendJoin(' ', words);
@@ -33,5 +33,6 @@ public void Process(string hocrFile, XDocument hocrXml)
         }
         var ocrFile = Path.Join(OcrDirectory, Regex.Replace(Path.GetFileName(hocrFile), "_HOCR.shtml$", "_OCR.asc"));
         File.WriteAllText(ocrFile, text.ToString().Trim());
+        return false;
     }
 }
diff --git a/Processor.cs b/Processor.cs
index e161089..17145ac 100644
--- a/Processor.cs
+++ b/Processor.cs
@@ -2,6 +2,7 @@
 using System.Runtime.InteropServices;
 using System.Text;
 using System.Text.RegularExpressions;
+using System.Xml;
 using System.Xml.Linq;
 using Flurl.Http;
 
@@ -50,10 +51,12 @@ public async Task<byte[]> ProcessSinglePage(Uri fileUri, MicroservicePageOptions
         Directory.CreateDirectory(Jp2Directory);
         await File.WriteAllBytesAsync(Path.Join(Jp2Directory, Path.GetFileName(fileUri.LocalPath)), sourceFile);
         await ConvertJp2sToJpgs();
-        var page = await SendSinglePageToTranskribus(options);
+        var page = await SendSinglePageToTranskribus(options);
         await GetSinglePageTranskribusAltoXml(page);
         await ConvertAltoToHocr();
-        ProcessHocrXml(new HocrHeaderFixer(options.HtrId, Path.GetFileName(fileUri.LocalPath)));
+        ProcessHocrXml(
+            new HocrHeaderFixer(options.HtrId, Path.GetFileName(fileUri.LocalPath)),
+            new WordAlignmentFixer());
         var hocrFile = Directory.EnumerateFiles(HocrDirectory).Single();
         return await File.ReadAllBytesAsync(hocrFile);
     }
@@ -110,14 +113,14 @@ public async Task CheckProgress(IdCrudOptions options)
             DeleteDirectoryIfExists(OcrDirectory);
         }
     }
-
+
     public async Task CreateOcrDatastreamsFromHocr(OcrOptions options)
     {
         string pidFilePath = null;
         try
         {
-            pidFilePath = options.PidFile is null
-                ? await GetPagePids(options, options.Pid)
+            pidFilePath = options.PidFile is null
+                ? await GetPagePids(options, options.Pid)
                 : Path.GetFullPath(options.PidFile);
             await GetHocrDatastreams(options, pidFilePath);
             ProcessHocrXml(new OcrGenerator(OcrDirectory));
@@ -401,9 +404,22 @@ void ProcessHocrXml(params IHocrXmlProcessor[] processors)
         foreach (var hocrFile in Directory.EnumerateFiles(HocrDirectory))
         {
             var xml = XDocument.Load(hocrFile);
+            var xmlModified = false;
             foreach (var processor in processors)
             {
-                processor.Process(hocrFile, xml);
+                xmlModified |= processor.Process(hocrFile, xml);
+            }
+            if (xmlModified)
+            {
+                using var fileStream = File.Open(hocrFile, FileMode.Create);
+                // disposed (and thereby flushed) before fileStream, even if Save throws
+                using var writer = XmlWriter.Create(fileStream, new XmlWriterSettings
+                {
+                    // need to specify false here to stop it from emitting a byte order mark
+                    Encoding = new UTF8Encoding(false),
+                    Indent = true
+                });
+                xml.Save(writer);
             }
         }
     }
diff --git a/WordAlignmentFixer.cs b/WordAlignmentFixer.cs
new file mode 100644
index 0000000..e050ec3
--- /dev/null
+++ b/WordAlignmentFixer.cs
@@ -0,0 +1,82 @@
+using System.Globalization;
+using System.Text.RegularExpressions;
+using System.Xml.Linq;
+
+/// <summary>
+/// Spreads the words of each hOCR line across the line's bounding box,
+/// giving every word a width proportional to its character count.
+/// </summary>
+partial class WordAlignmentFixer : IHocrXmlProcessor
+{
+    public void Init()
+    {
+        Console.WriteLine("Fixing horizontal word alignment...");
+    }
+
+    /// <summary>
+    /// Rewrites the bbox of every word inside each ocr_line span of the document
+    /// </summary>
+    /// <param name="hocrFile">Path to the hOCR file (not read by this processor)</param>
+    /// <param name="hocrXml">Xml tree representing the hOCR file</param>
+    /// <returns>True if at least one word bbox was changed.</returns>
+    public bool Process(string hocrFile, XDocument hocrXml)
+    {
+        XNamespace ns = "http://www.w3.org/1999/xhtml";
+        var modified = false;
+        foreach (var paragraph in hocrXml.Descendants(ns + "p"))
+        {
+            var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line");
+            foreach (var line in lines)
+            {
+                // a line without a parsable bbox gives no reference frame, so its words are left alone
+                var lineMatch = BboxRegex().Match(line.Attribute("title")?.Value ?? "");
+                if (!lineMatch.Success)
+                {
+                    continue;
+                }
+                var lineBbox = lineMatch.Groups
+                    .Cast<Group>()
+                    .Skip(1)
+                    .Select(g => int.Parse(g.Value, CultureInfo.InvariantCulture))
+                    .ToArray();
+                // materialized because the words are summed, counted and then iterated
+                var words = line.Elements(ns + "span").ToList();
+                // # of characters = sum of word lengths + the spaces between
+                var lineCharLength = words.Sum(w => w.Value.Length) + words.Count - 1;
+                if (lineCharLength <= 0)
+                {
+                    // no words, or a single empty one: nothing to spread out and nothing to divide by
+                    continue;
+                }
+                var linePixLength = lineBbox[2] - lineBbox[0];
+                // kept fractional so the truncation error does not pile up towards the end of the line
+                var pixPerChar = (double)linePixLength / lineCharLength;
+                double currentPosition = lineBbox[0];
+                foreach (var word in words)
+                {
+                    var newLeft = (int)Math.Round(currentPosition);
+                    currentPosition += word.Value.Length * pixPerChar;
+                    var newRight = (int)Math.Round(currentPosition);
+                    // step over the space that separates this word from the next
+                    currentPosition += pixPerChar;
+                    var title = word.Attribute("title");
+                    if (title is null)
+                    {
+                        continue;
+                    }
+                    var newBbox = $"bbox {newLeft} {lineBbox[1]} {newRight} {lineBbox[3]}";
+                    var newTitle = BboxRegex().Replace(title.Value, newBbox);
+                    if (newTitle != title.Value)
+                    {
+                        title.SetValue(newTitle);
+                        modified = true;
+                    }
+                }
+            }
+        }
+        return modified;
+    }
+
+    [GeneratedRegex(@"bbox (\d+) (\d+) (\d+) (\d+)")]
+    private static partial Regex BboxRegex();
+}
\ No newline at end of file
diff --git a/transkribus-process.csproj b/transkribus-process.csproj
index b0e856c..d9f8d2f 100644
--- a/transkribus-process.csproj
+++ b/transkribus-process.csproj
@@ -5,7 +5,7 @@
     net8.0
     transkribus_process
     enable
-    1.1.0
+    1.2.0