Skip to content

Commit 707f92f

Browse files
authored
feat: improve caching mechanism for download_dir on ingest (Unstructured-IO#314)
* `unstructured-ingest` now uses a default `--download-dir` of `$HOME/.cache/unstructured/ingest` rather than a "tmp-ingest-" dir in the working directory.
* `unstructured-ingest` no longer re-downloads files when `--preserve-downloads` is used without `--download-dir`.
1 parent 95109db commit 707f92f

File tree

5 files changed

+40
-14
lines changed

5 files changed

+40
-14
lines changed

Diff for: .gitignore

+1-2
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,7 @@ dmypy.json
131131

132132
# ingest outputs
133133
/structured-output
134-
# ingest temporary files
135-
/tmp-ingest*
134+
136135
# suggested ingest mirror directory
137136
/mirror
138137

Diff for: CHANGELOG.md

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
1-
## 0.5.2-dev1
1+
## 0.5.2-dev0
22

33
### Enhancements
44

5+
* `unstructured-ingest` now uses a default `--download-dir` of `$HOME/.cache/unstructured/ingest`
6+
rather than a "tmp-ingest-" dir in the working directory.
7+
58
### Features
69

710
### Fixes
811

12+
* `unstructured-ingest` no longer re-downloads files when `--preserve-downloads`
13+
is used without `--download-dir`.
14+
915
## 0.5.1
1016

1117
### Enhancements

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.2-dev1" # pragma: no cover
1+
__version__ = "0.5.2-dev0" # pragma: no cover

Diff for: unstructured/ingest/connector/s3_connector.py

-1
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,6 @@ def initialize(self):
175175
raise ValueError(
176176
f"No objects found in {self.config.s3_url} -- response list object is {response}",
177177
)
178-
os.mkdir(self.config.download_dir)
179178

180179
def _list_objects(self):
181180
response = self.s3_cli.list_objects_v2(**self._list_objects_kwargs)

Diff for: unstructured/ingest/main.py

+31-9
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
#!/usr/bin/env python3
2+
import hashlib
23
import multiprocessing as mp
3-
import random
4-
import string
54
import sys
5+
from pathlib import Path
66

77
import click
88

@@ -147,17 +147,17 @@ def run(self):
147147
@click.option(
148148
"--re-download/--no-re-download",
149149
default=False,
150-
help="Re-download files from s3 even if they are already present in --download-dir.",
150+
help="Re-download files even if they are already present in --download-dir.",
151151
)
152152
@click.option(
153153
"--download-dir",
154-
help="Where s3 files are downloaded to, defaults to tmp-ingest-<6 random chars>.",
154+
help="Where files are downloaded to, defaults to `$HOME/.cache/unstructured/ingest/<SHA256>`.",
155155
)
156156
@click.option(
157157
"--preserve-downloads",
158158
is_flag=True,
159159
default=False,
160-
help="Preserve downloaded s3 files. Otherwise each file is removed after being processed "
160+
help="Preserve downloaded files. Otherwise each file is removed after being processed "
161161
"successfully.",
162162
)
163163
@click.option(
@@ -169,7 +169,7 @@ def run(self):
169169
"--reprocess",
170170
is_flag=True,
171171
default=False,
172-
help="Reprocess a downloaded file from s3 even if the relevant structured output .json file "
172+
help="Reprocess a downloaded file even if the relevant structured output .json file "
173173
"in --structured-output-dir already exists.",
174174
)
175175
@click.option(
@@ -204,9 +204,31 @@ def main(
204204
if not preserve_downloads and download_dir:
205205
print("Warning: not preserving downloaded files but --download_dir is specified")
206206
if not download_dir:
207-
download_dir = "tmp-ingest-" + "".join(
208-
random.choice(string.ascii_letters) for i in range(6)
209-
)
207+
cache_path = Path.home() / ".cache" / "unstructured" / "ingest"
208+
if not cache_path.exists():
209+
cache_path.mkdir(parents=True, exist_ok=True)
210+
if s3_url:
211+
hashed_dir_name = hashlib.sha256(s3_url.encode("utf-8"))
212+
elif github_url:
213+
hashed_dir_name = hashlib.sha256(
214+
f"{github_url}_{github_branch}".encode("utf-8"),
215+
)
216+
elif subreddit_name:
217+
hashed_dir_name = hashlib.sha256(
218+
subreddit_name.encode("utf-8"),
219+
)
220+
elif wikipedia_page_title:
221+
hashed_dir_name = hashlib.sha256(
222+
wikipedia_page_title.encode("utf-8"),
223+
)
224+
else:
225+
raise ValueError("No connector-specific option was specified!")
226+
download_dir = cache_path / hashed_dir_name.hexdigest()[:10]
227+
if preserve_downloads:
228+
print(
229+
f"Warning: preserving downloaded files but --download-dir is not specified,"
230+
f" using {download_dir}",
231+
)
210232
if s3_url:
211233
doc_connector = S3Connector(
212234
config=SimpleS3Config(

0 commit comments

Comments
 (0)