1
1
import multiprocessing as mp
2
+ from contextlib import contextmanager
3
+ import io
2
4
import os
3
5
import sys
4
6
from pathlib import Path
5
- from typing import Union
7
+ from typing import Union , Any , IO , TypeVar
6
8
7
9
from pypdf import PdfReader
8
10
from pypdf import PdfWriter
9
- from pypdf ._utils import StrByteType
10
11
11
12
from .core import TableList
12
13
from .parsers import Lattice
13
14
from .parsers import Stream
14
15
from .utils import TemporaryDirectory
15
- from .utils import download_url
16
+ from .utils import InvalidArguments
17
+ from .utils import get_url_bytes
16
18
from .utils import get_page_layout
17
19
from .utils import get_rotation
18
20
from .utils import get_text_objects
19
21
from .utils import is_url
20
22
23
# Accepted input types for ``PDFHandler(filepath=...)``: a path or URL string,
# an open file-like object, a ``pathlib.Path``, or ``None`` (when the PDF is
# supplied through ``file_bytes`` instead).
#
# This is a plain type alias, not a ``TypeVar``: ``TypeVar`` requires a string
# name as its first argument, so passing a ``Union`` there is an error.
FilePathType = Union[str, IO[Any], Path, None]
21
24
22
25
class PDFHandler :
23
26
"""Handles all operations like temp directory creation, splitting
@@ -26,21 +29,35 @@ class PDFHandler:
26
29
27
30
Parameters
28
31
----------
29
- filepath : str
30
- Filepath or URL of the PDF file.
32
+ filepath : str | pathlib.Path, optional (default: None)
33
+ Filepath or URL of the PDF file. Required if file_bytes is not given
31
34
pages : str, optional (default: '1')
32
35
Comma-separated page numbers.
33
36
Example: '1,3,4' or '1,4-end' or 'all'.
34
37
password : str, optional (default: None)
35
38
Password for decryption.
39
+ file_bytes : io.IOBase, optional (default: None)
40
+ A file-like stream. Required if filepath is not given
36
41
37
42
"""
38
43
39
- def __init__ (self , filepath : Union [ StrByteType , Path ] , pages = "1" , password = None ):
44
+ def __init__ (self , filepath : FilePathType = None , pages = "1" , password = None , file_bytes = None ):
40
45
if is_url (filepath ):
41
- filepath = download_url (filepath )
42
- self .filepath : Union [StrByteType , Path ] = filepath
46
+ file_bytes = get_url_bytes (filepath )
43
47
48
+ if not filepath and not file_bytes :
49
+ raise InvalidArguments ('Either `filepath` or `file_bytes` is required' )
50
+ if not filepath :
51
+ # filepath must either be passed, or taken from the name attribute
52
+ try :
53
+ filepath = getattr (file_bytes , 'name' )
54
+ except AttributeError :
55
+ msg = ('Either pass a `filepath`, or give the '
56
+ '`file_bytes` argument a name attribute' )
57
+ raise InvalidArguments (msg )
58
+ self .file_bytes = file_bytes # ok to be None
59
+
60
+ self .filepath = filepath
44
61
if isinstance (filepath , str ) and not filepath .lower ().endswith (".pdf" ):
45
62
raise NotImplementedError ("File format not supported" )
46
63
@@ -52,13 +69,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
52
69
self .password = self .password .encode ("ascii" )
53
70
self .pages = self ._get_pages (pages )
54
71
72
@contextmanager
def managed_file_context(self):
    """Yield a readable, seekable binary stream for this handler's PDF.

    Uses ``self.file_bytes`` when it is set; otherwise opens
    ``self.filepath`` in binary mode.

    A handle opened here from ``self.filepath`` is closed on exit or
    error. A stream held in ``self.file_bytes`` is *not* closed, so it
    can be reused by later calls; it is rewound to offset 0 before each
    use.

    Yields
    ------
    file_bytes : io.IOBase
        A readable, seekable, file-like object.
    """
    if self.file_bytes:
        stream = self.file_bytes
        # Every io.IOBase object has a ``seek`` attribute, even when the
        # stream cannot seek, so ask ``seekable()`` when it is available
        # and fall back to the attribute check for minimal file-likes.
        probe = getattr(stream, "seekable", None)
        can_seek = probe() if callable(probe) else hasattr(stream, "seek")
        if not can_seek:
            # Buffer the content once into memory and keep the buffer so
            # subsequent calls can re-read the same data.
            stream = self.file_bytes = io.BytesIO(stream.read())
        stream.seek(0)
        yield stream
    else:
        with open(self.filepath, "rb") as file_bytes:
            yield file_bytes
93
+
55
94
def _get_pages (self , pages ):
56
95
"""Converts pages string to list of ints.
57
96
58
97
Parameters
59
98
----------
60
- filepath : str
61
- Filepath or URL of the PDF file.
99
+ managed_file_context : io.IOBase
100
+ A readable, seekable, file-like object
62
101
pages : str, optional (default: '1')
63
102
Comma-separated page numbers.
64
103
Example: '1,3,4' or '1,4-end' or 'all'.
@@ -74,74 +113,77 @@ def _get_pages(self, pages):
74
113
if pages == "1" :
75
114
page_numbers .append ({"start" : 1 , "end" : 1 })
76
115
else :
77
- infile = PdfReader (self .filepath , strict = False )
116
+ with self .managed_file_context () as f :
117
+ infile = PdfReader (f , strict = False )
78
118
79
- if infile .is_encrypted :
80
- infile .decrypt (self .password )
119
+ if infile .is_encrypted :
120
+ infile .decrypt (self .password )
81
121
82
- if pages == "all" :
83
- page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
84
- else :
85
- for r in pages .split ("," ):
86
- if "-" in r :
87
- a , b = r .split ("-" )
88
- if b == "end" :
89
- b = len (infile .pages )
90
- page_numbers .append ({"start" : int (a ), "end" : int (b )})
91
- else :
92
- page_numbers .append ({"start" : int (r ), "end" : int (r )})
122
+ if pages == "all" :
123
+ page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
124
+ else :
125
+ for r in pages .split ("," ):
126
+ if "-" in r :
127
+ a , b = r .split ("-" )
128
+ if b == "end" :
129
+ b = len (infile .pages )
130
+ page_numbers .append ({"start" : int (a ), "end" : int (b )})
131
+ else :
132
+ page_numbers .append ({"start" : int (r ), "end" : int (r )})
93
133
94
134
result = []
95
135
for p in page_numbers :
96
136
result .extend (range (p ["start" ], p ["end" ] + 1 ))
97
137
return sorted (set (result ))
98
138
99
def _save_page(self, page, temp):
    """Saves specified page from PDF into a temporary directory.

    The page is written to ``page-<page>.pdf`` inside ``temp``. If
    ``get_rotation`` reports a non-empty rotation for the page's text,
    the saved file is rewritten with the page rotated by 90 degrees
    ("anticlockwise") or -90 degrees ("clockwise").

    Parameters
    ----------
    page : int
        Page number (1-based; ``page - 1`` is used as the index).
    temp : str
        Tmp directory.

    """
    fpath = os.path.join(temp, f"page-{page}.pdf")
    froot, fext = os.path.splitext(fpath)

    # The source PDF only needs to stay open until the single page has
    # been written out.
    with self.managed_file_context() as fileobj:
        infile = PdfReader(fileobj, strict=False)
        if infile.is_encrypted:
            infile.decrypt(self.password)
        outfile = PdfWriter()
        outfile.add_page(infile.pages[page - 1])
        with open(fpath, "wb") as f:
            outfile.write(f)

    layout, _ = get_page_layout(fpath)
    # fix rotated PDF
    chars = get_text_objects(layout, ltype="char")
    horizontal_text = get_text_objects(layout, ltype="horizontal_text")
    vertical_text = get_text_objects(layout, ltype="vertical_text")
    rotation = get_rotation(chars, horizontal_text, vertical_text)
    if rotation == "":
        return

    # Move the unrotated page aside, then write the rotated page back to
    # the original ``page-<n>.pdf`` path.
    fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
    os.rename(fpath, fpath_new)
    # ``with`` guarantees the handle is closed even if reading, rotating
    # or writing raises.
    with open(fpath_new, "rb") as instream:
        infile = PdfReader(instream, strict=False)
        if infile.is_encrypted:
            infile.decrypt(self.password)
        outfile = PdfWriter()
        p = infile.pages[0]
        if rotation == "anticlockwise":
            p.rotate(90)
        elif rotation == "clockwise":
            p.rotate(-90)
        outfile.add_page(p)
        with open(fpath, "wb") as f:
            outfile.write(f)
145
187
146
188
def parse (
147
189
self ,
@@ -181,6 +223,7 @@ def parse(
181
223
tables = []
182
224
parser = Lattice (** kwargs ) if flavor == "lattice" else Stream (** kwargs )
183
225
with TemporaryDirectory () as tempdir :
226
+ < << << << HEAD
184
227
cpu_count = mp .cpu_count ()
185
228
# Using multiprocessing only when cpu_count > 1 to prevent a stallness issue
186
229
# when cpu_count is 1
0 commit comments