import_to_s3.py
# Compatible with Python 2.7.
# This script imports a directory structure into an S3 bucket. The directory
# structure inside `path` is uploaded to the S3 bucket root.
# Update the placeholders PATH_OF_DATA_DIRECTORY, ACCESS_KEY, SECRET_ACCESS_KEY
# and MY_BUCKET_NAME before running.
# Uploading in parallel like this is a fast way to import millions of small
# files into an S3 bucket.
import boto3
import os
import dill  # dill is used by pathos/multiprocess for serialization
from pathos.multiprocessing import Pool, cpu_count
from contextlib import closing

path = 'PATH_OF_DATA_DIRECTORY'

session = boto3.Session(
    aws_access_key_id='ACCESS_KEY',
    aws_secret_access_key='SECRET_ACCESS_KEY'
)
s3 = session.resource('s3')
bucket = s3.Bucket('MY_BUCKET_NAME')
parallel_worker = 20


def call_execute(full_path):
    # Upload a single file; the local `path` prefix is stripped so the object
    # key mirrors the directory structure relative to `path`.
    with open(full_path, 'rb') as data:
        bucket.put_object(Key=full_path[len(path) + 1:], Body=data)


def upload_files(path):
    # Walk the directory tree and yield the full path of every file found.
    for subdir, dirs, files in os.walk(path):
        for file in files:
            yield os.path.join(subdir, file)


if __name__ == "__main__":
    # workers = cpu_count()
    workers = parallel_worker
    # Worker processes rely on fork() inheriting the module-level boto3
    # session and bucket defined above.
    with closing(Pool(processes=workers)) as pool:
        pool.map(call_execute, upload_files(path))
        pool.terminate()
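
# A minimal usage sketch, assuming the placeholders above have been filled in
# and that boto3, dill and pathos are installed. The directory and file names
# below are hypothetical examples, not values from this repo:
#
#   pip install boto3 dill pathos
#   python import_to_s3.py
#
# With path = '/data/export' and a local file /data/export/2020/img/001.jpg,
# the object is uploaded under key '2020/img/001.jpg', because call_execute
# strips the leading '<path>/' prefix via full_path[len(path) + 1:], e.g.:
#
#   example_path = '/data/export'
#   example_file = '/data/export/2020/img/001.jpg'
#   assert example_file[len(example_path) + 1:] == '2020/img/001.jpg'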