From 81186fffc8fc96be92fd561ad6f09f073ca9599c Mon Sep 17 00:00:00 2001 From: Dean Malmgren Date: Fri, 31 Mar 2017 18:16:59 -0500 Subject: [PATCH 1/3] enabling autocomplete on extensions properly --- textract/cli.py | 10 +++++----- textract/parsers/__init__.py | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/textract/cli.py b/textract/cli.py index 525f70b0..47e1fe84 100644 --- a/textract/cli.py +++ b/textract/cli.py @@ -8,11 +8,13 @@ import pkgutil import sys import six +import re +import glob import argcomplete from . import VERSION -from .parsers import DEFAULT_ENCODING +from .parsers import DEFAULT_ENCODING, _get_available_extensions class AddToNamespaceAction(argparse.Action): @@ -62,10 +64,8 @@ def get_parser(): ) parser.add_argument( '--extension', type=str, default=None, - choices=_get_available_encodings(), - help='Specify the extension of the file (e.g., docx or pdf). ' - 'Extension can be also passed with the ' - 'leading . (e.g., .docx or .pdf).', + choices=_get_available_extensions(), + help='Specify the extension of the file.', ) parser.add_argument( '-m', '--method', default='', diff --git a/textract/parsers/__init__.py b/textract/parsers/__init__.py index 1d6b5826..0642e451 100644 --- a/textract/parsers/__init__.py +++ b/textract/parsers/__init__.py @@ -4,6 +4,8 @@ import os import importlib +import glob +import re from .. import exceptions @@ -22,6 +24,9 @@ # the command line interface DEFAULT_ENCODING = 'utf_8' +# filename format +_FILENAME_SUFFIX = '_parser' + def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs): """This is the core function used for extracting text. It routes the @@ -55,13 +60,14 @@ def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs): # to avoid conflicts with packages that are installed globally # (e.g. 
python's json module), all extension parser modules have # the _parser extension - rel_module = ext + '_parser' + rel_module = ext + _FILENAME_SUFFIX # If we can't import the module, the file extension isn't currently # supported try: filetype_module = importlib.import_module( - rel_module, 'textract.parsers') + rel_module, 'textract.parsers' + ) except ImportError: raise exceptions.ExtensionNotSupported(ext) @@ -69,3 +75,28 @@ def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs): parser = filetype_module.Parser() return parser.process(filename, encoding, **kwargs) + + +def _get_available_extensions(): + """Get a list of available file extensions to make it easy for + tab-completion and exception handling. + """ + extensions = [] + + # from filenames + parsers_dir = os.path.join(os.path.dirname(__file__)) + glob_filename = os.path.join(parsers_dir, "*" + _FILENAME_SUFFIX + ".py") + ext_re = re.compile(glob_filename.replace('*', "(?P<ext>\w+)")) + for filename in glob.glob(glob_filename): + ext_match = ext_re.match(filename) + ext = ext_match.groups()[0] + extensions.append(ext) + extensions.append('.' 
+ ext) + + # from relevant synonyms (don't use the '' synonym) + for ext in EXTENSION_SYNONYMS.keys(): + if ext: + extensions.append(ext) + extensions.append(ext.replace('.', '', 1)) + extensions.sort() + return extensions From 47b8dd1d9ace9eebdb10e5528c638a4b91864332 Mon Sep 17 00:00:00 2001 From: Dean Malmgren Date: Fri, 31 Mar 2017 18:23:41 -0500 Subject: [PATCH 2/3] improved error message with available extensions --- textract/exceptions.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/textract/exceptions.py b/textract/exceptions.py index dbecfe8d..3453cf71 100644 --- a/textract/exceptions.py +++ b/textract/exceptions.py @@ -16,11 +16,19 @@ class ExtensionNotSupported(CommandLineError): def __init__(self, ext): self.ext = ext + from .parsers import _get_available_extensions + available_extensions = [] + for e in _get_available_extensions(): + if e.startswith('.'): + available_extensions.append(e) + self.available_extensions_str = ', '.join(available_extensions) + def __str__(self): return self.render(( 'The filename extension %(ext)s is not yet supported by\n' 'textract. Please suggest this filename extension here:\n\n' - ' https://github.com/deanmalmgren/textract/issues\n' + ' https://github.com/deanmalmgren/textract/issues\n\n' + 'Available extensions include: %(available_extensions_str)s\n' )) From 0369c7d35670cc4bd4354779f759379949c44fa5 Mon Sep 17 00:00:00 2001 From: Dean Malmgren Date: Fri, 31 Mar 2017 18:25:56 -0500 Subject: [PATCH 3/3] updated changelog --- docs/changelog.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/changelog.rst b/docs/changelog.rst index 5d61e2b8..03c2d658 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -27,6 +27,7 @@ latest changes in development for next release * avoid unicode decode error with html parser (`#147`_ by `@suned`_) + * enabling autocomplete and improving error handling (`#149`_) 1.5.0 ----- @@ -328,3 +329,4 @@ latest changes in development for next release .. 
_#139: https://github.com/deanmalmgren/textract/issues/139 .. _#147: https://github.com/deanmalmgren/textract/issues/147 .. _#148: https://github.com/deanmalmgren/textract/issues/148 +.. _#149: https://github.com/deanmalmgren/textract/issues/149