diff --git a/HocrHeaderFixer.cs b/HocrHeaderFixer.cs
index beb2e07..74a58cf 100644
--- a/HocrHeaderFixer.cs
+++ b/HocrHeaderFixer.cs
@@ -19,20 +19,14 @@ public void Init()
Console.WriteLine("Fixing hOCR file headers...");
}
- public void Process(string hocrFile, XDocument hocrXml)
+ public bool Process(string hocrFile, XDocument hocrXml)
{
XNamespace ns = "http://www.w3.org/1999/xhtml";
var head = hocrXml.Element(ns + "html").Element(ns + "head");
- head.Element(ns + "title").Value = "Image: " + FileName ?? Regex.Replace(Path.GetFileName(hocrFile), "_HOCR.shtml$", "_JP2.jpg");
+ // The fallback must be parenthesised: '+' binds tighter than '??', so without the parentheses
+ // the concatenation is never null and the name derived from hocrFile would never be used.
+ head.Element(ns + "title").Value = "Image: " + (FileName ?? Regex.Replace(Path.GetFileName(hocrFile), "_HOCR.shtml$", "_JP2.jpg"));
head.Add(new XElement(ns + "meta", new XAttribute("name", "ocr-system"), new XAttribute("content", HtrId.HasValue ? $"Transkribus-HtrId:{HtrId}" : "Transkribus")));
- using var fileStream = File.Open(hocrFile, FileMode.Truncate);
- var writer = XmlWriter.Create(fileStream, new XmlWriterSettings
- {
- // need to specify false here to stop it from emitting a byte order mark
- Encoding = new UTF8Encoding(false),
- Indent = true
- });
- hocrXml.Save(writer);
- writer.Close();
+ return true;
}
}
\ No newline at end of file
diff --git a/IHocrXmlProcessor.cs b/IHocrXmlProcessor.cs
index 874c291..eed261f 100644
--- a/IHocrXmlProcessor.cs
+++ b/IHocrXmlProcessor.cs
@@ -2,6 +2,16 @@
interface IHocrXmlProcessor
{
+ /// <summary>
+ /// Used to set up any initial state, such as creating directories
+ /// </summary>
void Init();
- void Process(string hocrFile, XDocument hocrXml);
+
+ /// <summary>
+ /// Receives each hOCR xml tree for processing
+ /// </summary>
+ /// <param name="hocrFile">Path to the hOCR file</param>
+ /// <param name="hocrXml">Xml tree representing the hOCR file</param>
+ /// <returns>True if the xml tree was modified and needs to be saved back to the file.</returns>
+ bool Process(string hocrFile, XDocument hocrXml);
}
diff --git a/OcrGenerator.cs b/OcrGenerator.cs
index 3f11e9f..e90858f 100644
--- a/OcrGenerator.cs
+++ b/OcrGenerator.cs
@@ -16,14 +16,14 @@ public void Init()
Directory.CreateDirectory(OcrDirectory);
}
- public void Process(string hocrFile, XDocument hocrXml)
+ public bool Process(string hocrFile, XDocument hocrXml)
{
var text = new StringBuilder();
XNamespace ns = "http://www.w3.org/1999/xhtml";
foreach (var paragraph in hocrXml.Descendants(ns + "p"))
{
- var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line");
- foreach (var line in lines)
+ var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line");
+ foreach (var line in lines)
{
var words = line.Elements(ns + "span").Select(span => span.Value);
text.AppendJoin(' ', words);
@@ -33,5 +33,6 @@ public void Process(string hocrFile, XDocument hocrXml)
}
var ocrFile = Path.Join(OcrDirectory, Regex.Replace(Path.GetFileName(hocrFile), "_HOCR.shtml$", "_OCR.asc"));
File.WriteAllText(ocrFile, text.ToString().Trim());
+ return false;
}
}
diff --git a/Processor.cs b/Processor.cs
index e161089..17145ac 100644
--- a/Processor.cs
+++ b/Processor.cs
@@ -2,6 +2,7 @@
using System.Runtime.InteropServices;
using System.Text;
using System.Text.RegularExpressions;
+using System.Xml;
using System.Xml.Linq;
using Flurl.Http;
@@ -50,10 +51,12 @@ public async Task ProcessSinglePage(Uri fileUri, MicroservicePageOptions
Directory.CreateDirectory(Jp2Directory);
await File.WriteAllBytesAsync(Path.Join(Jp2Directory, Path.GetFileName(fileUri.LocalPath)), sourceFile);
await ConvertJp2sToJpgs();
- var page = await SendSinglePageToTranskribus(options);
+ var page = await SendSinglePageToTranskribus(options);
await GetSinglePageTranskribusAltoXml(page);
await ConvertAltoToHocr();
- ProcessHocrXml(new HocrHeaderFixer(options.HtrId, Path.GetFileName(fileUri.LocalPath)));
+ ProcessHocrXml(
+ new HocrHeaderFixer(options.HtrId, Path.GetFileName(fileUri.LocalPath)),
+ new WordAlignmentFixer());
var hocrFile = Directory.EnumerateFiles(HocrDirectory).Single();
return await File.ReadAllBytesAsync(hocrFile);
}
@@ -110,14 +113,14 @@ public async Task CheckProgress(IdCrudOptions options)
DeleteDirectoryIfExists(OcrDirectory);
}
}
-
+
public async Task CreateOcrDatastreamsFromHocr(OcrOptions options)
{
string pidFilePath = null;
try
{
- pidFilePath = options.PidFile is null
- ? await GetPagePids(options, options.Pid)
+ pidFilePath = options.PidFile is null
+ ? await GetPagePids(options, options.Pid)
: Path.GetFullPath(options.PidFile);
await GetHocrDatastreams(options, pidFilePath);
ProcessHocrXml(new OcrGenerator(OcrDirectory));
@@ -401,9 +404,22 @@ void ProcessHocrXml(params IHocrXmlProcessor[] processors)
foreach (var hocrFile in Directory.EnumerateFiles(HocrDirectory))
{
var xml = XDocument.Load(hocrFile);
+ var xmlModified = false;
foreach (var processor in processors)
{
- processor.Process(hocrFile, xml);
+ xmlModified |= processor.Process(hocrFile, xml);
+ }
+ if (xmlModified)
+ {
+ using var fileStream = File.Open(hocrFile, FileMode.Create);
+ var writer = XmlWriter.Create(fileStream, new XmlWriterSettings
+ {
+ // need to specify false here to stop it from emitting a byte order mark
+ Encoding = new UTF8Encoding(false),
+ Indent = true
+ });
+ xml.Save(writer);
+ writer.Close();
}
}
}
diff --git a/WordAlignmentFixer.cs b/WordAlignmentFixer.cs
new file mode 100644
index 0000000..e050ec3
--- /dev/null
+++ b/WordAlignmentFixer.cs
@@ -0,0 +1,107 @@
+using System.Globalization;
+using System.Text.RegularExpressions;
+using System.Xml.Linq;
+
+/// <summary>
+/// hOCR processor that spreads the word boxes of each <c>ocr_line</c> span across the line's own
+/// bounding box, giving every word a width proportional to its number of characters.
+/// </summary>
+partial class WordAlignmentFixer : IHocrXmlProcessor
+{
+    /// <summary>Announces this processing step on the console.</summary>
+    public void Init()
+    {
+        Console.WriteLine("Fixing horizontal word alignment...");
+    }
+
+    /// <summary>
+    /// Rewrites the <c>bbox</c> in the title of every word span found in the <c>ocr_line</c> spans of each paragraph.
+    /// </summary>
+    /// <param name="hocrFile">Path to the hOCR file (not used by this processor).</param>
+    /// <param name="hocrXml">Xml tree representing the hOCR file; word titles are updated in place.</param>
+    /// <returns>True if at least one word title was changed, so the tree needs to be saved.</returns>
+    public bool Process(string hocrFile, XDocument hocrXml)
+    {
+        XNamespace ns = "http://www.w3.org/1999/xhtml";
+        var modified = false;
+        var lines = hocrXml.Descendants(ns + "p")
+            .SelectMany(paragraph => paragraph.Elements(ns + "span"))
+            .Where(span => span.Attribute("class")?.Value == "ocr_line");
+        foreach (var line in lines)
+        {
+            modified |= RealignWords(line, ns);
+        }
+        return modified;
+    }
+
+    /// <summary>
+    /// Lays the child spans of one line out left to right inside the line's bbox. Lines without a
+    /// parsable bbox, and lines with no characters to distribute, are left untouched.
+    /// </summary>
+    /// <returns>True if any word title was changed.</returns>
+    private static bool RealignWords(XElement line, XNamespace ns)
+    {
+        // A line without a usable bbox is skipped rather than aborting the whole run.
+        if (!TryParseBbox(line.Attribute("title")?.Value ?? string.Empty, out var lineBbox))
+        {
+            return false;
+        }
+        // Materialised once: the words are summed, counted and then iterated.
+        var words = line.Elements(ns + "span").ToList();
+        // # of characters = sum of word lengths + the spaces between
+        var lineCharLength = words.Sum(w => w.Value.Length) + words.Count - 1;
+        if (lineCharLength <= 0)
+        {
+            // No words, or a single empty one: nothing to distribute, and the division below would fail.
+            return false;
+        }
+        // Fractional pixels per character. Integer division would truncate, leaving the last word
+        // short of the line's right edge and collapsing every word when there are more chars than pixels.
+        var pixPerChar = (double)(lineBbox[2] - lineBbox[0]) / lineCharLength;
+        var modified = false;
+        var charOffset = 0;
+        foreach (var word in words)
+        {
+            var newLeft = lineBbox[0] + (int)Math.Round(charOffset * pixPerChar);
+            charOffset += word.Value.Length;
+            var newRight = lineBbox[0] + (int)Math.Round(charOffset * pixPerChar);
+            // Account for the space that separates this word from the next one.
+            charOffset++;
+            var title = word.Attribute("title");
+            if (title is null)
+            {
+                continue;
+            }
+            var newBbox = FormattableString.Invariant($"bbox {newLeft} {lineBbox[1]} {newRight} {lineBbox[3]}");
+            var newTitle = BboxRegex().Replace(title.Value, newBbox);
+            if (newTitle != title.Value)
+            {
+                title.SetValue(newTitle);
+                modified = true;
+            }
+        }
+        return modified;
+    }
+
+    /// <summary>Reads the four bbox numbers from a title value; returns false if there is no bbox or a number does not fit an int.</summary>
+    private static bool TryParseBbox(string title, out int[] bbox)
+    {
+        bbox = new int[4];
+        var match = BboxRegex().Match(title);
+        if (!match.Success)
+        {
+            return false;
+        }
+        for (var i = 0; i < bbox.Length; i++)
+        {
+            if (!int.TryParse(match.Groups[i + 1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out bbox[i]))
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    [GeneratedRegex(@"bbox (\d+) (\d+) (\d+) (\d+)")]
+    private static partial Regex BboxRegex();
+}
\ No newline at end of file
diff --git a/transkribus-process.csproj b/transkribus-process.csproj
index b0e856c..d9f8d2f 100644
--- a/transkribus-process.csproj
+++ b/transkribus-process.csproj
@@ -5,7 +5,7 @@
net8.0
transkribus_process
enable
- 1.1.0
+ 1.2.0