From f76cae3af6780ba30b32fd21c4048c39f510a418 Mon Sep 17 00:00:00 2001
From: Vladimir Rybas
Date: Fri, 13 Jan 2012 22:59:23 +0700
Subject: [PATCH 1/3] Timeout for PDF extraction from OpenOffice supported
document format
When extract_pdf() is run on a document longer than 400-500 pages,
JODConverter fails with the following exception:
Exception in thread "main" org.artofsolving.jodconverter.office.OfficeException:
task did not complete within timeout at org.artofsolving.jodconverter.office.PooledOfficeManager.execute...
---
lib/docsplit.rb | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/lib/docsplit.rb b/lib/docsplit.rb
index 5d0fa84..c53646a 100755
--- a/lib/docsplit.rb
+++ b/lib/docsplit.rb
@@ -62,6 +62,7 @@ def self.extract_images(pdfs, opts={})
# If the document is in an image format, use GraphicsMagick to extract the PDF.
def self.extract_pdf(docs, opts={})
out = opts[:output] || '.'
+ timeout = opts[:timeout] || 3600
FileUtils.mkdir_p out unless File.exists?(out)
[docs].flatten.each do |doc|
ext = File.extname(doc)
@@ -71,7 +72,7 @@ def self.extract_pdf(docs, opts={})
if GM_FORMATS.include?(`file -b --mime #{ESCAPE[doc]}`.strip.split(/[:;]\s+/)[0])
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
else
- options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ROOT}/vendor/conf/document-formats.js"
+ options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -t #{timeout} -r #{ROOT}/vendor/conf/document-formats.js"
run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
end
end
From 51b6f7d5fcd38ea554d0e20424c83fad4900bc04 Mon Sep 17 00:00:00 2001
From: Vladimir Rybas
Date: Fri, 13 Jan 2012 23:19:53 +0700
Subject: [PATCH 2/3] Command line parameter and documentation update for
timeout option
---
index.html | 4 +++-
lib/docsplit/command_line.rb | 3 +++
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/index.html b/index.html
index 001087b..3d556da 100755
--- a/index.html
+++ b/index.html
@@ -228,7 +228,9 @@ Usage
xls and so on, as well as html, odf, rtf, swf, svg, and wpd.
The first time that you convert a new file type, OpenOffice will lazy-load
the code that processes it — subsequent conversions will be much faster.
-
+
You can use the --timeout flag to raise or lower the number of seconds allowed before the conversion fails with a timeout error.
+ This is useful when converting huge documents. The default timeout is 1 hour (more than enough to convert a 1000-page document).
+
docsplit pdf documentation/*.html
diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb
index 8d48500..098b2ee 100755
--- a/lib/docsplit/command_line.rb
+++ b/lib/docsplit/command_line.rb
@@ -94,6 +94,9 @@ def parse_options
opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
@options[:clean] = false
end
+ opts.on('-t', '--timeout [SEC]', 'Timeout for PDF extraction from OpenOffice document format (default is 1 hour)') do |t|
+ @options[:timeout] = t
+ end
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
@options[:rolling] = true
end
From e52a17d6e99c2c4f26d8fe67aa0e6bdffbffc7ad Mon Sep 17 00:00:00 2001
From: Vladimir Rybas
Date: Wed, 19 Jun 2013 20:17:40 +0700
Subject: [PATCH 3/3] Fixed help message for `--timeout` option
---
lib/docsplit/command_line.rb | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb
index 098b2ee..22a9dad 100755
--- a/lib/docsplit/command_line.rb
+++ b/lib/docsplit/command_line.rb
@@ -94,7 +94,7 @@ def parse_options
opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
@options[:clean] = false
end
- opts.on('-t', '--timeout [SEC]', 'Timeout for PDF extraction from OpenOffice document format (default is 1 hour)') do |t|
+ opts.on('-t', '--timeout [SEC]', 'Timeout for PDF extraction from OpenOffice supported document format (default is 1 hour)') do |t|
@options[:timeout] = t
end
opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
@@ -119,4 +119,4 @@ def parse_options
end
-end
\ No newline at end of file
+end