diff --git a/docs/changelog.rst b/docs/changelog.rst index 8a729677..2d2fa3e4 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -32,6 +32,7 @@ latest changes in development for next release * avoid unicode decode error with html parser (`#147`_ by `@suned`_) + * enabling autocomplete and improving error handling (`#149`_) 1.5.0 ----- @@ -335,3 +336,4 @@ latest changes in development for next release .. _#146: https://github.com/deanmalmgren/textract/issues/146 .. _#147: https://github.com/deanmalmgren/textract/issues/147 .. _#148: https://github.com/deanmalmgren/textract/issues/148 +.. _#149: https://github.com/deanmalmgren/textract/issues/149 diff --git a/textract/cli.py b/textract/cli.py index 525f70b0..47e1fe84 100644 --- a/textract/cli.py +++ b/textract/cli.py @@ -8,11 +8,13 @@ import pkgutil import sys import six +import re +import glob import argcomplete from . import VERSION -from .parsers import DEFAULT_ENCODING +from .parsers import DEFAULT_ENCODING, _get_available_extensions class AddToNamespaceAction(argparse.Action): @@ -62,10 +64,8 @@ def get_parser(): ) parser.add_argument( '--extension', type=str, default=None, - choices=_get_available_encodings(), - help='Specify the extension of the file (e.g., docx or pdf). ' - 'Extension can be also passed with the ' - 'leading . 
(e.g., .docx or .pdf).', + choices=_get_available_extensions(), + help='Specify the extension of the file.', ) parser.add_argument( '-m', '--method', default='', diff --git a/textract/exceptions.py b/textract/exceptions.py index dbecfe8d..3453cf71 100644 --- a/textract/exceptions.py +++ b/textract/exceptions.py @@ -16,11 +16,19 @@ class ExtensionNotSupported(CommandLineError): def __init__(self, ext): self.ext = ext + from .parsers import _get_available_extensions + available_extensions = [] + for e in _get_available_extensions(): + if e.startswith('.'): + available_extensions.append(e) + self.available_extensions_str = ', '.join(available_extensions) + def __str__(self): return self.render(( 'The filename extension %(ext)s is not yet supported by\n' 'textract. Please suggest this filename extension here:\n\n' - ' https://github.com/deanmalmgren/textract/issues\n' + ' https://github.com/deanmalmgren/textract/issues\n\n' + 'Available extensions include: %(available_extensions_str)s\n' )) diff --git a/textract/parsers/__init__.py b/textract/parsers/__init__.py index 1d6b5826..0642e451 100644 --- a/textract/parsers/__init__.py +++ b/textract/parsers/__init__.py @@ -4,6 +4,8 @@ import os import importlib +import glob +import re from .. import exceptions @@ -22,6 +24,9 @@ # the command line interface DEFAULT_ENCODING = 'utf_8' +# filename format +_FILENAME_SUFFIX = '_parser' + def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs): """This is the core function used for extracting text. It routes the @@ -55,13 +60,14 @@ def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs): # to avoid conflicts with packages that are installed globally # (e.g. 
python's json module), all extension parser modules have # the _parser extension - rel_module = ext + '_parser' + rel_module = ext + _FILENAME_SUFFIX # If we can't import the module, the file extension isn't currently # supported try: filetype_module = importlib.import_module( - rel_module, 'textract.parsers') + rel_module, 'textract.parsers' + ) except ImportError: raise exceptions.ExtensionNotSupported(ext) @@ -69,3 +75,28 @@ def process(filename, encoding=DEFAULT_ENCODING, extension=None, **kwargs): parser = filetype_module.Parser() return parser.process(filename, encoding, **kwargs) + + +def _get_available_extensions(): + """Get a list of available file extensions to make it easy for + tab-completion and exception handling. + """ + extensions = [] + + # from filenames + parsers_dir = os.path.join(os.path.dirname(__file__)) + glob_filename = os.path.join(parsers_dir, "*" + _FILENAME_SUFFIX + ".py") + ext_re = re.compile(glob_filename.replace('*', "(?P<ext>\w+)")) + for filename in glob.glob(glob_filename): + ext_match = ext_re.match(filename) + ext = ext_match.groups()[0] + extensions.append(ext) + extensions.append('.' + ext) + + # from relevant synonyms (don't use the '' synonym) + for ext in EXTENSION_SYNONYMS.keys(): + if ext: + extensions.append(ext) + extensions.append(ext.replace('.', '', 1)) + extensions.sort() + return extensions