Skip to content

Commit 8f7f5ab

Browse files
cscanlin-kwhbosd
authored and committed
[IMP]: add support for file_bytes argument with managed_file_context()
1 parent e3c1115 commit 8f7f5ab

File tree

5 files changed

+171
-84
lines changed

5 files changed

+171
-84
lines changed

camelot/handlers.py

+97-54
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,26 @@
11
import multiprocessing as mp
2+
from contextlib import contextmanager
3+
import io
24
import os
35
import sys
46
from pathlib import Path
5-
from typing import Union
7+
from typing import Union, Any, IO, TypeVar
68

79
from pypdf import PdfReader
810
from pypdf import PdfWriter
9-
from pypdf._utils import StrByteType
1011

1112
from .core import TableList
1213
from .parsers import Lattice
1314
from .parsers import Stream
1415
from .utils import TemporaryDirectory
15-
from .utils import download_url
16+
from .utils import InvalidArguments
17+
from .utils import get_url_bytes
1618
from .utils import get_page_layout
1719
from .utils import get_rotation
1820
from .utils import get_text_objects
1921
from .utils import is_url
2022

23+
# Alias for the types accepted as the `filepath` argument: a path or URL
# string, an open file-like object, a pathlib.Path, or None (allowed when
# `file_bytes` is supplied instead).
# This has to be a plain Union alias: TypeVar() takes a string name as its
# first argument, so TypeVar(Union[...]) does not define a usable annotation.
FilePathType = Union[str, IO[Any], Path, None]
2124

2225
class PDFHandler:
2326
"""Handles all operations like temp directory creation, splitting
@@ -26,21 +29,35 @@ class PDFHandler:
2629
2730
Parameters
2831
----------
29-
filepath : str
30-
Filepath or URL of the PDF file.
32+
filepath : str | pathlib.Path, optional (default: None)
33+
Filepath or URL of the PDF file. Required if file_bytes is not given
3134
pages : str, optional (default: '1')
3235
Comma-separated page numbers.
3336
Example: '1,3,4' or '1,4-end' or 'all'.
3437
password : str, optional (default: None)
3538
Password for decryption.
39+
file_bytes : io.IOBase, optional (default: None)
40+
A file-like stream. Required if filepath is not given
3641
3742
"""
3843

39-
def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None):
44+
def __init__(self, filepath: FilePathType = None, pages="1", password=None, file_bytes=None):
4045
if is_url(filepath):
41-
filepath = download_url(filepath)
42-
self.filepath: Union[StrByteType, Path] = filepath
46+
file_bytes = get_url_bytes(filepath)
4347

48+
if not filepath and not file_bytes:
49+
raise InvalidArguments('Either `filepath` or `file_bytes` is required')
50+
if not filepath:
51+
# filepath must either be passed, or taken from the name attribute
52+
try:
53+
filepath = getattr(file_bytes, 'name')
54+
except AttributeError:
55+
msg = ('Either pass a `filepath`, or give the '
56+
'`file_bytes` argument a name attribute')
57+
raise InvalidArguments(msg)
58+
self.file_bytes = file_bytes # ok to be None
59+
60+
self.filepath = filepath
4461
if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"):
4562
raise NotImplementedError("File format not supported")
4663

@@ -52,13 +69,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
5269
self.password = self.password.encode("ascii")
5370
self.pages = self._get_pages(pages)
5471

72+
@contextmanager
73+
def managed_file_context(self):
74+
"""Reads from either the `filepath` or `file_bytes`
75+
attribute of this instance, to return a file-like object.
76+
Closes the file handle opened from `filepath` on exit or error; a caller-supplied `file_bytes` stream is left open.
77+
78+
Returns
79+
-------
80+
file_bytes : io.IOBase
81+
A readable, seekable, file-like object
82+
"""
83+
if self.file_bytes:
84+
# if we can't seek, write to a BytesIO object that can,
85+
# then seek to the beginning before yielding
86+
if not hasattr(self.file_bytes, 'seek'):
87+
self.file_bytes = io.BytesIO(self.file_bytes.read())
88+
self.file_bytes.seek(0)
89+
yield self.file_bytes
90+
else:
91+
with open(self.filepath, "rb") as file_bytes:
92+
yield file_bytes
93+
5594
def _get_pages(self, pages):
5695
"""Converts pages string to list of ints.
5796
5897
Parameters
5998
----------
60-
filepath : str
61-
Filepath or URL of the PDF file.
99+
managed_file_context : io.IOBase
100+
A readable, seekable, file-like object
62101
pages : str, optional (default: '1')
63102
Comma-separated page numbers.
64103
Example: '1,3,4' or '1,4-end' or 'all'.
@@ -74,74 +113,77 @@ def _get_pages(self, pages):
74113
if pages == "1":
75114
page_numbers.append({"start": 1, "end": 1})
76115
else:
77-
infile = PdfReader(self.filepath, strict=False)
116+
with self.managed_file_context() as f:
117+
infile = PdfReader(f, strict=False)
78118

79-
if infile.is_encrypted:
80-
infile.decrypt(self.password)
119+
if infile.is_encrypted:
120+
infile.decrypt(self.password)
81121

82-
if pages == "all":
83-
page_numbers.append({"start": 1, "end": len(infile.pages)})
84-
else:
85-
for r in pages.split(","):
86-
if "-" in r:
87-
a, b = r.split("-")
88-
if b == "end":
89-
b = len(infile.pages)
90-
page_numbers.append({"start": int(a), "end": int(b)})
91-
else:
92-
page_numbers.append({"start": int(r), "end": int(r)})
122+
if pages == "all":
123+
page_numbers.append({"start": 1, "end": len(infile.pages)})
124+
else:
125+
for r in pages.split(","):
126+
if "-" in r:
127+
a, b = r.split("-")
128+
if b == "end":
129+
b = len(infile.pages)
130+
page_numbers.append({"start": int(a), "end": int(b)})
131+
else:
132+
page_numbers.append({"start": int(r), "end": int(r)})
93133

94134
result = []
95135
for p in page_numbers:
96136
result.extend(range(p["start"], p["end"] + 1))
97137
return sorted(set(result))
98138

99-
def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
139+
def _save_page(self, page, temp):
100140
"""Saves specified page from PDF into a temporary directory.
101141
102142
Parameters
103143
----------
104-
filepath : str
105-
Filepath or URL of the PDF file.
144+
managed_file_context : io.IOBase
145+
A readable, seekable, file-like object
106146
page : int
107147
Page number.
108148
temp : str
109149
Tmp directory.
110150
111151
"""
112-
infile = PdfReader(filepath, strict=False)
113-
if infile.is_encrypted:
114-
infile.decrypt(self.password)
115-
fpath = os.path.join(temp, f"page-{page}.pdf")
116-
froot, fext = os.path.splitext(fpath)
117-
p = infile.pages[page - 1]
118-
outfile = PdfWriter()
119-
outfile.add_page(p)
120-
with open(fpath, "wb") as f:
121-
outfile.write(f)
122-
layout, dim = get_page_layout(fpath)
123-
# fix rotated PDF
124-
chars = get_text_objects(layout, ltype="char")
125-
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
126-
vertical_text = get_text_objects(layout, ltype="vertical_text")
127-
rotation = get_rotation(chars, horizontal_text, vertical_text)
128-
if rotation != "":
129-
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
130-
os.rename(fpath, fpath_new)
131-
instream = open(fpath_new, "rb")
132-
infile = PdfReader(instream, strict=False)
152+
153+
with self.managed_file_context() as fileobj:
154+
infile = PdfReader(fileobj, strict=False)
133155
if infile.is_encrypted:
134156
infile.decrypt(self.password)
157+
fpath = os.path.join(temp, f"page-{page}.pdf")
158+
froot, fext = os.path.splitext(fpath)
159+
p = infile.pages[page - 1]
135160
outfile = PdfWriter()
136-
p = infile.pages[0]
137-
if rotation == "anticlockwise":
138-
p.rotate(90)
139-
elif rotation == "clockwise":
140-
p.rotate(-90)
141161
outfile.add_page(p)
142162
with open(fpath, "wb") as f:
143163
outfile.write(f)
144-
instream.close()
164+
layout, dim = get_page_layout(fpath)
165+
# fix rotated PDF
166+
chars = get_text_objects(layout, ltype="char")
167+
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
168+
vertical_text = get_text_objects(layout, ltype="vertical_text")
169+
rotation = get_rotation(chars, horizontal_text, vertical_text)
170+
if rotation != "":
171+
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
172+
os.rename(fpath, fpath_new)
173+
instream = open(fpath_new, "rb")
174+
infile = PdfReader(instream, strict=False)
175+
if infile.is_encrypted:
176+
infile.decrypt(self.password)
177+
outfile = PdfWriter()
178+
p = infile.pages[0]
179+
if rotation == "anticlockwise":
180+
p.rotate(90)
181+
elif rotation == "clockwise":
182+
p.rotate(-90)
183+
outfile.add_page(p)
184+
with open(fpath, "wb") as f:
185+
outfile.write(f)
186+
instream.close()
145187

146188
def parse(
147189
self,
@@ -181,6 +223,7 @@ def parse(
181223
tables = []
182224
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
183225
with TemporaryDirectory() as tempdir:
226+
<<<<<<< HEAD
184227
cpu_count = mp.cpu_count()
185228
# Using multiprocessing only when cpu_count > 1 to prevent a stalling issue
186229
# when cpu_count is 1

camelot/io.py

+16-9
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,24 @@
11
import warnings
22
from pathlib import Path
3-
from typing import Union
43

5-
from pypdf._utils import StrByteType
4+
from .handlers import PDFHandler, FilePathType
65

7-
from .handlers import PDFHandler
8-
from .utils import remove_extra
9-
from .utils import validate_input
6+
from .utils import (
7+
InvalidArguments,
8+
validate_input,
9+
remove_extra,
10+
)
1011

1112

1213
def read_pdf(
13-
filepath: Union[StrByteType, Path],
14+
filepath: FilePathType = None,
1415
pages="1",
1516
password=None,
1617
flavor="lattice",
1718
suppress_stdout=False,
1819
parallel=False,
1920
layout_kwargs=None,
21+
file_bytes=None,
2022
**kwargs
2123
):
2224
"""Read PDF and return extracted tables.
@@ -26,8 +28,8 @@ def read_pdf(
2628
2729
Parameters
2830
----------
29-
filepath : str, Path, IO
30-
Filepath or URL of the PDF file.
31+
filepath : str | pathlib.Path, optional (default: None)
32+
Filepath or URL of the PDF file. Required if file_bytes is not given
3133
pages : str, optional (default: '1')
3234
Comma-separated page numbers.
3335
Example: '1,3,4' or '1,4-end' or 'all'.
@@ -40,6 +42,8 @@ def read_pdf(
4042
Print all logs and warnings.
4143
parallel : bool, optional (default: False)
4244
Process pages in parallel using all available cpu cores.
45+
file_bytes : io.IOBase, optional (default: None)
46+
A file-like stream. Required if filepath is not given
4347
layout_kwargs : dict, optional (default: {})
4448
A dict of `pdfminer.layout.LAParams
4549
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
@@ -115,12 +119,15 @@ def read_pdf(
115119
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
116120
)
117121

122+
if not filepath and not file_bytes:
123+
raise InvalidArguments('Either `filepath` or `file_bytes` is required')
124+
118125
with warnings.catch_warnings():
119126
if suppress_stdout:
120127
warnings.simplefilter("ignore")
121128

122129
validate_input(kwargs, flavor=flavor)
123-
p = PDFHandler(filepath, pages=pages, password=password)
130+
p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes)
124131
kwargs = remove_extra(kwargs, flavor=flavor)
125132
tables = p.parse(
126133
flavor=flavor,

camelot/utils.py

+20-20
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import os
1+
import io
22
import random
33
import re
44
import shutil
@@ -34,6 +34,10 @@
3434
_VALID_URLS.discard("")
3535

3636

37+
class InvalidArguments(Exception):
38+
pass
39+
40+
3741
# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
3842
def is_url(url):
3943
"""Check to see if a URL has a valid protocol.
@@ -64,34 +68,30 @@ def random_string(length):
6468
return ret
6569

6670

67-
def download_url(url):
68-
"""Download file from specified URL.
71+
def get_url_bytes(url):
72+
"""Get a stream of bytes for url
6973
7074
Parameters
7175
----------
7276
url : str or unicode
7377
7478
Returns
7579
-------
76-
filepath : str or unicode
77-
Temporary filepath.
80+
file_bytes : io.BytesIO
81+
a file-like object that can be read
7882
7983
"""
80-
filename = f"{random_string(6)}.pdf"
81-
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
82-
headers = {
83-
"User-Agent": "Mozilla/5.0",
84-
"Accept-Encoding": "gzip;q=1.0, deflate;q=0.9, br;q=0.8, compress;q=0.7, *;q=0.1"
85-
}
86-
request = Request(url, None, headers)
87-
obj = urlopen(request)
88-
content_type = obj.info().get_content_type()
89-
if content_type != "application/pdf":
90-
raise NotImplementedError("File format not supported")
91-
f.write(obj.read())
92-
filepath = os.path.join(os.path.dirname(f.name), filename)
93-
shutil.move(f.name, filepath)
94-
return filepath
84+
file_bytes = io.BytesIO()
85+
file_bytes.name = url
86+
headers = {"User-Agent": "Mozilla/5.0"}
87+
request = Request(url, data=None, headers=headers)
88+
obj = urlopen(request)
89+
content_type = obj.info().get_content_type()
90+
if content_type != "application/pdf":
91+
raise NotImplementedError("File format not supported")
92+
file_bytes.write(obj.read())
93+
file_bytes.seek(0)
94+
return file_bytes
9595

9696

9797
stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]

0 commit comments

Comments
 (0)