Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 2 additions & 10 deletions HocrHeaderFixer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,12 @@ public void Init()
Console.WriteLine("Fixing hOCR file headers...");
}

/// <summary>
/// Rewrites the hOCR header: sets the title to "Image: " followed by the image file name and
/// appends an <c>ocr-system</c> meta element naming Transkribus (with the HTR id when one is set).
/// </summary>
/// <param name="hocrFile">Path to the hOCR file; used to derive the image name when <c>FileName</c> is null.</param>
/// <param name="hocrXml">Xml tree representing the hOCR file; modified in place.</param>
/// <returns>Always true, because the header is modified on every call.</returns>
public bool Process(string hocrFile, XDocument hocrXml)
{
    XNamespace ns = "http://www.w3.org/1999/xhtml";
    var head = hocrXml.Element(ns + "html").Element(ns + "head");

    // Resolve the fallback BEFORE concatenating: '+' binds tighter than '??', so writing
    // "Image: " + FileName ?? fallback would never use the fallback (the concatenation is never null).
    var imageName = FileName ?? Regex.Replace(Path.GetFileName(hocrFile), "_HOCR.shtml$", "_JP2.jpg");
    head.Element(ns + "title").Value = "Image: " + imageName;

    var ocrSystem = HtrId.HasValue ? $"Transkribus-HtrId:{HtrId}" : "Transkribus";
    head.Add(new XElement(ns + "meta", new XAttribute("name", "ocr-system"), new XAttribute("content", ocrSystem)));
    return true;
}
}
12 changes: 11 additions & 1 deletion IHocrXmlProcessor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@

/// <summary>
/// A processing step that is handed each hOCR xml tree in turn.
/// </summary>
interface IHocrXmlProcessor
{
/// <summary>
/// Used to set up any initial state, such as creating directories
/// </summary>
void Init();
void Process(string hocrFile, XDocument hocrXml);

/// <summary>
/// Receives each hOCR xml tree for processing
/// </summary>
/// <param name="hocrFile">Path to the hOCR file</param>
/// <param name="hocrXml">Xml tree representing the hOCR file</param>
/// <returns>True if the xml tree was modified and needs to be saved back to the file.</returns>
bool Process(string hocrFile, XDocument hocrXml);
}
7 changes: 4 additions & 3 deletions OcrGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ public void Init()
Directory.CreateDirectory(OcrDirectory);
}

public void Process(string hocrFile, XDocument hocrXml)
public bool Process(string hocrFile, XDocument hocrXml)
{
var text = new StringBuilder();
XNamespace ns = "http://www.w3.org/1999/xhtml";
foreach (var paragraph in hocrXml.Descendants(ns + "p"))
{
var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line");
foreach (var line in lines)
var lines = paragraph.Elements(ns + "span").Where(span => span.Attribute("class")?.Value == "ocr_line");
foreach (var line in lines)
{
var words = line.Elements(ns + "span").Select(span => span.Value);
text.AppendJoin(' ', words);
Expand All @@ -33,5 +33,6 @@ public void Process(string hocrFile, XDocument hocrXml)
}
var ocrFile = Path.Join(OcrDirectory, Regex.Replace(Path.GetFileName(hocrFile), "_HOCR.shtml$", "_OCR.asc"));
File.WriteAllText(ocrFile, text.ToString().Trim());
return false;
}
}
28 changes: 22 additions & 6 deletions Processor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
using System.Runtime.InteropServices;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
using Flurl.Http;

Expand Down Expand Up @@ -50,10 +51,12 @@ public async Task<byte[]> ProcessSinglePage(Uri fileUri, MicroservicePageOptions
Directory.CreateDirectory(Jp2Directory);
await File.WriteAllBytesAsync(Path.Join(Jp2Directory, Path.GetFileName(fileUri.LocalPath)), sourceFile);
await ConvertJp2sToJpgs();
var page = await SendSinglePageToTranskribus(options);
var page = await SendSinglePageToTranskribus(options);
await GetSinglePageTranskribusAltoXml(page);
await ConvertAltoToHocr();
ProcessHocrXml(new HocrHeaderFixer(options.HtrId, Path.GetFileName(fileUri.LocalPath)));
ProcessHocrXml(
new HocrHeaderFixer(options.HtrId, Path.GetFileName(fileUri.LocalPath)),
new WordAlignmentFixer());
var hocrFile = Directory.EnumerateFiles(HocrDirectory).Single();
return await File.ReadAllBytesAsync(hocrFile);
}
Expand Down Expand Up @@ -110,14 +113,14 @@ public async Task CheckProgress(IdCrudOptions options)
DeleteDirectoryIfExists(OcrDirectory);
}
}

public async Task CreateOcrDatastreamsFromHocr(OcrOptions options)
{
string pidFilePath = null;
try
{
pidFilePath = options.PidFile is null
? await GetPagePids(options, options.Pid)
pidFilePath = options.PidFile is null
? await GetPagePids(options, options.Pid)
: Path.GetFullPath(options.PidFile);
await GetHocrDatastreams(options, pidFilePath);
ProcessHocrXml(new OcrGenerator(OcrDirectory));
Expand Down Expand Up @@ -401,9 +404,22 @@ void ProcessHocrXml(params IHocrXmlProcessor[] processors)
foreach (var hocrFile in Directory.EnumerateFiles(HocrDirectory))
{
var xml = XDocument.Load(hocrFile);
var xmlModified = false;
foreach (var processor in processors)
{
processor.Process(hocrFile, xml);
xmlModified |= processor.Process(hocrFile, xml);
}
if (xmlModified)
{
using var fileStream = File.Open(hocrFile, FileMode.Create);
var writer = XmlWriter.Create(fileStream, new XmlWriterSettings
{
// need to specify false here to stop it from emitting a byte order mark
Encoding = new UTF8Encoding(false),
Indent = true
});
xml.Save(writer);
writer.Close();
}
}
}
Expand Down
46 changes: 46 additions & 0 deletions WordAlignmentFixer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
using System.Text.RegularExpressions;
using System.Xml.Linq;

/// <summary>
/// hOCR processor that redistributes the word bounding boxes of every <c>ocr_line</c> along the
/// line's own bounding box, giving each word a width proportional to its character count and
/// leaving one character's worth of space between consecutive words.
/// </summary>
partial class WordAlignmentFixer : IHocrXmlProcessor
{
    /// <summary>
    /// Announces this processing step on the console.
    /// </summary>
    public void Init()
    {
        Console.WriteLine("Fixing horizontal word alignment...");
    }

    /// <summary>
    /// Recomputes the horizontal extent of every word in every <c>ocr_line</c> of the document.
    /// Lines without a parsable bbox, without any characters, or with a non-positive width are left untouched.
    /// </summary>
    /// <param name="hocrFile">Path to the hOCR file (not used by this processor).</param>
    /// <param name="hocrXml">Xml tree representing the hOCR file; word titles are updated in place.</param>
    /// <returns>True if at least one word title was changed and the tree needs to be saved.</returns>
    public bool Process(string hocrFile, XDocument hocrXml)
    {
        XNamespace ns = "http://www.w3.org/1999/xhtml";
        var lines = hocrXml.Descendants(ns + "p")
            .SelectMany(paragraph => paragraph.Elements(ns + "span"))
            .Where(span => span.Attribute("class")?.Value == "ocr_line");

        var modified = false;
        foreach (var line in lines)
        {
            modified |= RealignWords(line, ns);
        }
        return modified;
    }

    /// <summary>
    /// Spreads the words of a single line across the line's bbox; returns true if any word title changed.
    /// </summary>
    private static bool RealignWords(XElement line, XNamespace ns)
    {
        if (!TryParseBbox(line.Attribute("title")?.Value, out var lineBbox))
        {
            return false;
        }

        // Materialize once: the words are counted, summed and iterated.
        var words = line.Elements(ns + "span").ToList();
        // # of characters = sum of word lengths + the spaces between
        var lineCharLength = words.Sum(w => w.Value.Length) + words.Count - 1;
        var linePixLength = lineBbox[2] - lineBbox[0];
        if (lineCharLength <= 0 || linePixLength <= 0)
        {
            // Nothing to distribute (no text, or a degenerate line box); also avoids dividing by zero.
            return false;
        }

        // Track the position as a double and round per edge so that truncation does not accumulate
        // and the last word ends exactly on the line's right edge.
        var pixPerChar = (double)linePixLength / lineCharLength;
        double position = lineBbox[0];
        var modified = false;
        foreach (var word in words)
        {
            var newLeft = (int)Math.Round(position);
            position += word.Value.Length * pixPerChar;
            var newRight = (int)Math.Round(position);
            // skip over the space that follows the word
            position += pixPerChar;

            var title = word.Attribute("title");
            if (title is null)
            {
                continue;
            }

            var newBbox = FormattableString.Invariant($"bbox {newLeft} {lineBbox[1]} {newRight} {lineBbox[3]}");
            var updatedTitle = BboxRegex().Replace(title.Value, newBbox);
            if (updatedTitle != title.Value)
            {
                title.SetValue(updatedTitle);
                modified = true;
            }
        }
        return modified;
    }

    /// <summary>
    /// Extracts the four bbox coordinates (left, top, right, bottom) from an hOCR title attribute value.
    /// </summary>
    private static bool TryParseBbox(string title, out int[] bbox)
    {
        bbox = new int[4];
        if (title is null)
        {
            return false;
        }

        var match = BboxRegex().Match(title);
        if (!match.Success)
        {
            return false;
        }

        for (var i = 0; i < bbox.Length; i++)
        {
            // Group 0 is the whole match; the coordinates are groups 1-4.
            var parsed = int.TryParse(
                match.Groups[i + 1].Value,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out bbox[i]);
            if (!parsed)
            {
                return false;
            }
        }
        return true;
    }

    [GeneratedRegex(@"bbox (\d+) (\d+) (\d+) (\d+)")]
    private static partial Regex BboxRegex();
}
2 changes: 1 addition & 1 deletion transkribus-process.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>transkribus_process</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Version>1.1.0</Version>
<Version>1.2.0</Version>
</PropertyGroup>

<ItemGroup>
Expand Down