1
1
#!/usr/bin/env python3
2
+ import hashlib
2
3
import multiprocessing as mp
3
- import random
4
- import string
5
4
import sys
5
+ from pathlib import Path
6
6
7
7
import click
8
8
@@ -147,17 +147,17 @@ def run(self):
147
147
@click .option (
148
148
"--re-download/--no-re-download" ,
149
149
default = False ,
150
- help = "Re-download files from s3 even if they are already present in --download-dir." ,
150
+ help = "Re-download files even if they are already present in --download-dir." ,
151
151
)
152
152
@click .option (
153
153
"--download-dir" ,
154
- help = "Where s3 files are downloaded to, defaults to tmp-ingest-<6 random chars>." ,
154
+ help = "Where files are downloaded to, defaults to `$HOME/.cache/unstructured/ingest/<SHA256>`." ,
155
155
)
156
156
@click .option (
157
157
"--preserve-downloads" ,
158
158
is_flag = True ,
159
159
default = False ,
160
- help = "Preserve downloaded s3 files. Otherwise each file is removed after being processed "
160
+ help = "Preserve downloaded files. Otherwise each file is removed after being processed "
161
161
"successfully." ,
162
162
)
163
163
@click .option (
@@ -169,7 +169,7 @@ def run(self):
169
169
"--reprocess" ,
170
170
is_flag = True ,
171
171
default = False ,
172
- help = "Reprocess a downloaded file from s3 even if the relevant structured output .json file "
172
+ help = "Reprocess a downloaded file even if the relevant structured output .json file "
173
173
"in --structured-output-dir already exists." ,
174
174
)
175
175
@click .option (
@@ -204,9 +204,31 @@ def main(
204
204
if not preserve_downloads and download_dir :
205
205
print ("Warning: not preserving downloaded files but --download_dir is specified" )
206
206
if not download_dir :
207
- download_dir = "tmp-ingest-" + "" .join (
208
- random .choice (string .ascii_letters ) for i in range (6 )
209
- )
207
+ cache_path = Path .home () / ".cache" / "unstructured" / "ingest"
208
+ if not cache_path .exists ():
209
+ cache_path .mkdir (parents = True , exist_ok = True )
210
+ if s3_url :
211
+ hashed_dir_name = hashlib .sha256 (s3_url .encode ("utf-8" ))
212
+ elif github_url :
213
+ hashed_dir_name = hashlib .sha256 (
214
+ f"{github_url}_{github_branch}" .encode ("utf-8" ),
215
+ )
216
+ elif subreddit_name :
217
+ hashed_dir_name = hashlib .sha256 (
218
+ subreddit_name .encode ("utf-8" ),
219
+ )
220
+ elif wikipedia_page_title :
221
+ hashed_dir_name = hashlib .sha256 (
222
+ wikipedia_page_title .encode ("utf-8" ),
223
+ )
224
+ else :
225
+ raise ValueError ("No connector-specific option was specified!" )
226
+ download_dir = cache_path / hashed_dir_name .hexdigest ()[:10 ]
227
+ if preserve_downloads :
228
+ print (
229
+ f"Warning: preserving downloaded files but --download-dir is not specified,"
230
+ f" using {download_dir}" ,
231
+ )
210
232
if s3_url :
211
233
doc_connector = S3Connector (
212
234
config = SimpleS3Config (
0 commit comments