From 1bdf5f66241764b727675d96010541ecb02a6795 Mon Sep 17 00:00:00 2001 From: jgibbon Date: Tue, 27 Jan 2015 20:54:43 +0100 Subject: [PATCH 1/2] Replace temporary Files with tesseract stdin/stdout introduce Tesseract.processStream() to simplify usage with build tools and reduce I/O --- lib/tesseract.js | 120 ++++++++++++++++------------------------------ test/tesseract.js | 14 ++++++ 2 files changed, 54 insertions(+), 80 deletions(-) diff --git a/lib/tesseract.js b/lib/tesseract.js index 78f780d..edc3d95 100644 --- a/lib/tesseract.js +++ b/lib/tesseract.js @@ -4,16 +4,10 @@ * Module dependencies. */ var utils = require('./utils'); -var exec = require('child_process').exec; -var fs = require('fs'); -var tmpdir = require('os').tmpdir(); // let the os take care of removing zombie tmp files -var uuid = require('node-uuid'); -var path = require('path'); +var spawn = require('child_process').spawn; +var createReadStream = require('fs').createReadStream; var Tesseract = { - - tmpFiles: [], - /** * options default options passed to Tesseract binary * @type {Object} @@ -26,108 +20,74 @@ var Tesseract = { }, /** - * outputEncoding - * @type {String} + * Create Stream from path, then run processStream + * + * @param {String} image + * @param {Object} options to pass to Tesseract binary + * @param {Function} callback + * @returns {Stream} */ - outputEncoding: 'UTF-8', + process: function(image, options, callback) { + return this.processStream(createReadStream(image), options, callback); + }, + + /** * Runs Tesseract binary with options * - * @param {String} image + * @param {Stream} image * @param {Object} options to pass to Tesseract binary * @param {Function} callback + * @returns {Stream} */ - process: function(image, options, callback) { + processStream: function(image, options, callback) { if (typeof options === 'function') { callback = options; options = null; } - options = utils.merge(Tesseract.options, options); - // generate output file name - var output = path.resolve(tmpdir, 'node-tesseract-' + uuid.v4()); - - // add the tmp file to the list - Tesseract.tmpFiles.push(output); - // assemble tesseract command - var command = [options.binary, image, output]; + var command = '- -'; if (options.l !== null) { - command.push('-l ' + options.l); + command += ' -l ' + options.l; } if (options.psm !== null) { - command.push('-psm ' + options.psm); + command += ' -psm ' + options.psm; } if (options.config !== null) { - command.push(options.config); + command += ' ' + options.config; } - command = command.join(' '); - - var opts = options.env || {}; - - // Run the tesseract command - exec(command, opts, function(err) { - if (err) { - // Something went wrong executing the assembled command - callback(err, null); - return; - } - - var outputFile = output + '.txt'; - fs.readFile(outputFile, function(err, data) { - if (!err) { - // There was no error, so get the text - data = data.toString(Tesseract.outputEncoding); - } - fs.unlink(outputFile); - // remove the file from the tmpFiles array - var index = Tesseract.tmpFiles.indexOf(output); - if (~index) Tesseract.tmpFiles.splice(index, 1); - - callback(err, data); - }); // end reaFile - - }); // end exec - + var bin = spawn(Tesseract.options.binary, command.split(' ')), + body = '', + errbody = '', + done = false; + + image.pipe(bin.stdin); + + if (callback) { + bin.stdout.on('data', function(chunk) { + body += chunk; + }); + bin.stderr.on('data', function(chunk) { + errbody += chunk; + }); + bin.on('exit', function() { + callback(errbody, body); + }); + } + return bin.stdout; } }; - -function gc() { - for (var i = Tesseract.tmpFiles.length - 1; i >= 0; i--) { - try { - fs.unlinkSync(Tesseract.tmpFiles[i] + '.txt'); - } catch (err) {} - - var index = Tesseract.tmpFiles.indexOf(Tesseract.tmpFiles[i]); - if (~index) Tesseract.tmpFiles.splice(index, 1); - }; -} - -var version = process.versions.node.split('.').map(function(value) { - return parseInt(value, 10); -}); - -if (version[0] === 0 && (version[1] < 9 || version[1] === 9 && version[2] < 5)) { - process.addListener('uncaughtException', function _uncaughtExceptionThrown(err) { - gc(); - throw err; - }); -} - -// clean up the tmp files -process.addListener('exit', function _exit(code) { - gc(); -}); - /** * Module exports. */ module.exports.process = Tesseract.process; +module.exports.processStream = Tesseract.processStream; diff --git a/test/tesseract.js b/test/tesseract.js index bd64c53..45b386c 100644 --- a/test/tesseract.js +++ b/test/tesseract.js @@ -14,6 +14,20 @@ describe('process', function(){ done(); }); + }); + + it('should return the string "node-tesseract" when run with options', function(done){ + + var testImage = __dirname + '/test.png'; + + tesseract.process(testImage, { + psm:3, + l:'eng' + }, function(err, text) { + text.trim().should.equal('node-tesseract'); + done(); + }); + }) }) From 248253a80d9a486f34e31a5d2be8e078468d667a Mon Sep 17 00:00:00 2001 From: jgibbon Date: Tue, 27 Jan 2015 20:54:43 +0100 Subject: [PATCH 2/2] Use alternative tesseract parameters Tests failed on wercker while passing locally, so let's try "stdin stdout" instead of "- -". --- lib/tesseract.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tesseract.js b/lib/tesseract.js index edc3d95..9f8c7af 100644 --- a/lib/tesseract.js +++ b/lib/tesseract.js @@ -50,7 +50,7 @@ var Tesseract = { options = utils.merge(Tesseract.options, options); // assemble tesseract command - var command = '- -'; + var command = 'stdin stdout'; if (options.l !== null) { command += ' -l ' + options.l;