|
| 1 | +using Microsoft.AspNetCore.Mvc; |
| 2 | +using OCR_with_Tesseract_in_Docker_on_Linux.Models; |
| 3 | +using Syncfusion.Drawing; |
| 4 | +using Syncfusion.OCRProcessor; |
| 5 | +using Syncfusion.Pdf.Graphics; |
| 6 | +using Syncfusion.Pdf.Parsing; |
| 7 | +using System.Diagnostics; |
| 8 | +using System.Xml.Linq; |
| 9 | + |
| 10 | +namespace OCR_with_Tesseract_in_Docker_on_Linux.Controllers |
| 11 | +{ |
/// <summary>
/// MVC controller that serves the default pages and exposes an action that runs OCR
/// on the PDF stored at <c>Data/Input.pdf</c> and returns the processed document.
/// </summary>
public class HomeController : Controller
{
    private readonly ILogger<HomeController> _logger;

    /// <summary>
    /// Creates the controller.
    /// </summary>
    /// <param name="logger">Logger supplied by dependency injection.</param>
    public HomeController(ILogger<HomeController> logger)
    {
        _logger = logger;
    }

    /// <summary>Renders the Index view.</summary>
    public IActionResult Index()
    {
        return View();
    }

    /// <summary>Renders the Privacy view.</summary>
    public IActionResult Privacy()
    {
        return View();
    }

    /// <summary>
    /// Runs OCR over <c>Data/Input.pdf</c> using <see cref="Tesseract5OCREngine"/> as the
    /// external engine and returns the resulting PDF as a download named <c>Sample.pdf</c>.
    /// </summary>
    /// <returns>A <see cref="FileStreamResult"/> with content type <c>application/pdf</c>.</returns>
    public IActionResult PerformOCR()
    {
        string docPath = Path.GetFullPath(@"Data/Input.pdf");

        // The output stream is deliberately NOT wrapped in a using block: the FileStreamResult
        // reads from it after this method has returned, and disposes it once the response has
        // been written. Disposing it here would hand MVC an already-closed stream.
        MemoryStream stream = new MemoryStream();
        try
        {
            // Declared first so it is disposed last, after the document and the processor.
            using (FileStream fileStream = new FileStream(docPath, FileMode.Open, FileAccess.Read))
            //Initialize the OCR processor.
            using (OCRProcessor processor = new OCRProcessor())
            {
                //Load a PDF document
                PdfLoadedDocument lDoc = new PdfLoadedDocument(fileStream);
                try
                {
                    //Set OCR language to process
                    processor.Settings.Language = Languages.English;
                    processor.ExternalEngine = new Tesseract5OCREngine();
                    //Process OCR by providing the PDF document.
                    processor.PerformOCR(lDoc);
                    //Save the document to memory stream
                    lDoc.Save(stream);
                }
                finally
                {
                    // Close the document even when OCR or saving fails.
                    lDoc.Close();
                }
            }
        }
        catch
        {
            // Nothing will consume the stream on failure, so release it before propagating.
            stream.Dispose();
            throw;
        }

        //Set the position as '0'
        stream.Position = 0;
        //Download the PDF document in the browser
        FileStreamResult fileStreamResult = new FileStreamResult(stream, "application/pdf");
        fileStreamResult.FileDownloadName = "Sample.pdf";
        return fileStreamResult;
    }

    /// <summary>
    /// Renders the error view with the current activity id, falling back to the
    /// request's trace identifier. Response caching is disabled for this action.
    /// </summary>
    [ResponseCache(Duration = 0, Location = ResponseCacheLocation.None, NoStore = true)]
    public IActionResult Error()
    {
        return View(new ErrorViewModel { RequestId = Activity.Current?.Id ?? HttpContext.TraceIdentifier });
    }
}
/// <summary>
/// <see cref="IOcrEngine"/> implementation that shells out to the <c>tesseract</c>
/// command-line tool, asks it for hOCR output, and converts that output into an
/// <see cref="OCRLayoutResult"/>.
/// </summary>
class Tesseract5OCREngine : IOcrEngine
{
    /// <summary>
    /// Runs Tesseract on the image contained in <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">Readable stream holding the image; its position is reset to 0, so it must also be seekable.</param>
    /// <returns>The recognized pages, lines and words together with the image dimensions.</returns>
    /// <exception cref="ArgumentException">The stream is null or not readable.</exception>
    /// <exception cref="InvalidOperationException">
    /// Tesseract exited with a non-zero code, produced no hOCR file, or produced an empty one.
    /// </exception>
    public OCRLayoutResult PerformOCR(Stream stream)
    {
        if (stream == null || !stream.CanRead)
        {
            throw new ArgumentException("Input stream is null or not readable for OCR.", nameof(stream));
        }

        stream.Position = 0;

        // Image dimensions are kept in locals (not instance state) so one engine
        // instance never carries values over from a previous call.
        float imageHeight;
        float imageWidth;
        using (MemoryStream tempMemStream = new MemoryStream())
        {
            stream.CopyTo(tempMemStream);
            tempMemStream.Position = 0;
            PdfTiffImage pdfTiffImage = new PdfTiffImage(tempMemStream);
            imageHeight = pdfTiffImage.Height;
            imageWidth = pdfTiffImage.Width;
        }

        string tempImageFile = Path.GetTempFileName();
        // The image path is also passed as Tesseract's output base, so the hOCR lands next to it.
        string tempHocrFile = tempImageFile + ".hocr";

        string hocrText;
        try
        {
            // Write stream to temp image file
            using (FileStream tempFileStream = new FileStream(tempImageFile, FileMode.Create, FileAccess.Write))
            {
                stream.Position = 0;
                stream.CopyTo(tempFileStream);
            }

            hocrText = RunTesseract(tempImageFile, tempHocrFile);
        }
        finally
        {
            // Clean up temp files on success and on every failure path.
            TryDelete(tempImageFile);
            TryDelete(tempHocrFile);
        }

        if (string.IsNullOrEmpty(hocrText))
        {
            throw new InvalidOperationException("HOCR text could not be generated or was empty.");
        }

        var ocrLayoutResult = new OCRLayoutResult();
        BuildOCRLayoutResult(ocrLayoutResult, hocrText, imageWidth, imageHeight);
        ocrLayoutResult.ImageWidth = imageWidth;
        ocrLayoutResult.ImageHeight = imageHeight;

        return ocrLayoutResult;
    }

    // Invokes the tesseract executable on imageFile and returns the contents of the hOCR file it writes.
    private static string RunTesseract(string imageFile, string hocrFile)
    {
        ProcessStartInfo startInfo = new ProcessStartInfo
        {
            FileName = "tesseract",
            Arguments = $"\"{imageFile}\" \"{imageFile}\" -l eng hocr",
            RedirectStandardError = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };

        using (Process process = new Process { StartInfo = startInfo })
        {
            process.Start();
            // Drain stderr before waiting so a full pipe buffer cannot block the child process.
            string errorOutput = process.StandardError.ReadToEnd();
            process.WaitForExit();

            if (process.ExitCode != 0)
            {
                throw new InvalidOperationException($"Tesseract process failed with exit code {process.ExitCode}. Error: {errorOutput}");
            }

            if (!File.Exists(hocrFile))
            {
                throw new InvalidOperationException("HOCR output file not found. Tesseract might have failed or not produced output.");
            }

            return File.ReadAllText(hocrFile);
        }
    }

    // Best-effort delete: a cleanup failure must not mask the exception that is already propagating.
    private static void TryDelete(string path)
    {
        try
        {
            if (File.Exists(path))
            {
                File.Delete(path);
            }
        }
        catch (IOException)
        {
        }
        catch (UnauthorizedAccessException)
        {
        }
    }

    /// <summary>
    /// Parses hOCR markup and appends one page per <c>ocr_page</c> div to <paramref name="ocr"/>,
    /// each holding the <c>ocr_line</c>/<c>ocr_header</c> spans as lines and their
    /// <c>ocrx_word</c> spans as words.
    /// </summary>
    /// <param name="ocr">Result object that receives the pages.</param>
    /// <param name="hOcrText">hOCR document text; must be well-formed XML.</param>
    /// <param name="imageWidth">Not used by this method; retained for signature compatibility.</param>
    /// <param name="imageHeight">Not used by this method; retained for signature compatibility.</param>
    void BuildOCRLayoutResult(OCRLayoutResult ocr, string hOcrText, float imageWidth, float imageHeight)
    {
        var doc = XDocument.Parse(hOcrText, LoadOptions.None);
        // Must be an XNamespace: with a plain string, `ns + "div"` is ordinary string
        // concatenation and does not yield the namespace-qualified element name.
        XNamespace ns = "http://www.w3.org/1999/xhtml";

        foreach (var pageElement in doc.Descendants(ns + "div").Where(d => d.Attribute("class")?.Value == "ocr_page"))
        {
            Page ocrPage = new Page();

            foreach (var lineElement in pageElement.Descendants(ns + "span")
                .Where(s => s.Attribute("class")?.Value == "ocr_line" || s.Attribute("class")?.Value == "ocr_header"))
            {
                Line ocrLine = new Line();

                foreach (var wordElement in lineElement.Descendants(ns + "span")
                    .Where(s => s.Attribute("class")?.Value == "ocrx_word"))
                {
                    Word ocrWord = new Word { Text = wordElement.Value };
                    var title = wordElement.Attribute("title")?.Value;

                    if (title != null)
                    {
                        // The first ';'-separated entry of the title is read as "bbox x0 y0 x1 y1".
                        string bboxString = title.Split(';')[0].Replace("bbox", "").Trim();
                        int[] coords = bboxString
                            .Split(' ', StringSplitOptions.RemoveEmptyEntries)
                            .Select(part => int.Parse(part, System.Globalization.CultureInfo.InvariantCulture))
                            .ToArray();

                        if (coords.Length == 4)
                        {
                            // Convert the two corner points into origin + size.
                            float x = coords[0];
                            float y = coords[1];
                            float width = coords[2] - coords[0];
                            float height = coords[3] - coords[1];
                            ocrWord.Rectangle = new RectangleF(x, y, width, height);
                        }
                    }

                    ocrLine.Add(ocrWord);
                }

                ocrPage.Add(ocrLine);
            }

            ocr.Add(ocrPage);
        }
    }
}
| 184 | +} |
0 commit comments