
Commit 6b6c2c1

Merge pull request #42 from RachelTucker/PYTHONSDK-97
PYTHONSDK-97: Add functionality to specify a directory in put data call
2 parents f31224f + b4ec11d · commit 6b6c2c1

4 files changed: +501 -6 lines changed
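Most of the +501 lines are the new helper layer in ds3/ds3Helpers.py, shown in full below. A minimal usage sketch for the headline feature (putting a whole directory): the bucket name and source directory are placeholders, the endpoint and credentials are assumed to be configured for createClientFromEnv(), and only the Helper names come from the diff below.

import os
from ds3 import ds3, ds3Helpers

# Endpoint and credentials are read from the environment by createClientFromEnv();
# "my-bucket" and the source directory are illustrative placeholders.
client = ds3.createClientFromEnv()
helper = ds3Helpers.Helper(client=client)

# Archive an entire local directory. Object names mirror the layout relative
# to source_dir, and directories are recorded as size-0 placeholder objects.
job_ids = helper.put_all_objects_in_directory(source_dir="/data/photos", bucket="my-bucket")
print(job_ids)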

ds3/ds3.py

Lines changed: 3 additions & 4 deletions
@@ -15,7 +15,6 @@
 import os
 
 from abc import ABCMeta
-import posixpath
 from .ds3network import *
 
 
@@ -58,7 +57,7 @@ def __init__(self, name, size):
 
     def to_xml(self):
         xml_object = xmldom.Element('Object')
-        xml_object.set('Name', posixpath.normpath(self.name))
+        xml_object.set('Name', self.name)
         xml_object.set('Size', str(self.size))
         return xml_object
 
@@ -72,7 +71,7 @@ def __init__(self, name, length=None, offset=None, version_id=None):
 
     def to_xml(self):
         xml_object = xmldom.Element('Object')
-        xml_object.set('Name', posixpath.normpath(self.name))
+        xml_object.set('Name', self.name)
         if self.length is not None:
             xml_object.set('Length', str(self.length))
         if self.offset is not None:
@@ -2489,7 +2488,7 @@ def __init__(self):
 
 def parseModel(root, model):
 
-    if root.tag is 'Data':
+    if root.tag == 'Data':
         children = list(root.iter())
         if not children:
             return None
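Two small fixes above: object names are no longer run through posixpath.normpath, so they reach the object store exactly as supplied; normpath would, among other things, strip the trailing slash that the helper module below uses to mark directory placeholder objects. The parseModel change swaps an identity test on a string literal for a real equality test. A quick illustration of the normpath behaviour being avoided (paths are hypothetical):

import posixpath

# normpath rewrites names that should be sent verbatim:
posixpath.normpath('photos/2021/')   # -> 'photos/2021'  (trailing slash lost)
posixpath.normpath('photos/./2021')  # -> 'photos/2021'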

ds3/ds3Helpers.py

Lines changed: 285 additions & 0 deletions
@@ -0,0 +1,285 @@
# Copyright 2021 Spectra Logic Corporation. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use
# this file except in compliance with the License. A copy of the License is located at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.

import time
import concurrent.futures
from .ds3 import *
from os import walk, path
from typing import List, Set, Dict

from platform import system


class EmptyReader(object):
    @staticmethod
    def read(_):
        return None

    @staticmethod
    def close():
        return


class Blob(object):
    def __init__(self, name: str, length: int, offset: int):
        self.name = name
        self.length = length
        self.offset = offset

    def __eq__(self, other):
        if self.name == other.name and self.length == other.length and self.offset == other.offset:
            return True
        else:
            return False

    def __hash__(self):
        return hash((self.name, self.length, self.offset))


class HelperPutObject(object):
    def __init__(self, object_name: str, file_path: str, size: int):
        self.object_name = object_name
        self.file_path = file_path
        self.size = size

    def get_data_stream(self, offset: int):
        if self.size == 0:
            return EmptyReader()
        data_stream = open(self.file_path, "rb")
        data_stream.seek(offset, 0)
        return data_stream


class HelperGetObject(object):
    def __init__(self, object_name: str, destination_path: str, version_id: str = None):
        self.object_name = object_name
        self.destination_path = destination_path
        self.version_id = version_id

    def get_data_stream(self, offset: int):
        landing_dir = os.path.dirname(self.destination_path)
        if not os.path.exists(landing_dir):
            os.makedirs(name=landing_dir, exist_ok=True)

        fd = os.open(self.destination_path, os.O_CREAT | os.O_WRONLY)
        data_stream = os.fdopen(fd, 'wb')
        data_stream.seek(offset, 0)
        return data_stream


def file_path_to_object_store_name(file_path: str) -> str:
    if system().lower() == "windows":
        return file_path.replace('\\', '/')
    return file_path


def object_name_to_file_path(object_name: str) -> str:
    if system().lower() == "windows":
        return object_name.replace('/', '\\')
    return object_name


class Helper(object):
    def __init__(self, client: Client):
        self.client = client

    def put_objects(self, put_objects: List[HelperPutObject], bucket: str, max_threads: int = 5) -> str:
        ds3_put_objects: List[Ds3PutObject] = []
        put_objects_map: Dict[str, HelperPutObject] = dict()
        for entry in put_objects:
            ds3_put_objects.append(Ds3PutObject(name=entry.object_name, size=entry.size))
            put_objects_map[entry.object_name] = entry

        bulk_put = self.client.put_bulk_job_spectra_s3(
            PutBulkJobSpectraS3Request(bucket_name=bucket, object_list=ds3_put_objects))

        job_id = bulk_put.result['JobId']

        blob_set: Set[Blob] = set()
        for chunk in bulk_put.result['ObjectsList']:
            for blob in chunk['ObjectList']:
                name: str = blob['Name']
                length: int = int(blob['Length'])
                offset: int = int(blob['Offset'])
                cur_blob = Blob(name=name, length=length, offset=offset)
                blob_set.add(cur_blob)

        # send until all blobs have been transferred
        while len(blob_set) > 0:
            available_chunks = self.client.get_job_chunks_ready_for_client_processing_spectra_s3(
                GetJobChunksReadyForClientProcessingSpectraS3Request(job_id))

            chunks = available_chunks.result['ObjectsList']

            if len(chunks) <= 0:
                time.sleep(available_chunks.retryAfter)
                continue

            # retrieve all available blobs concurrently
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
                for chunk in chunks:
                    for blob in chunk['ObjectList']:
                        name: str = blob['Name']
                        length: int = int(blob['Length'])
                        offset: int = int(blob['Offset'])
                        cur_blob = Blob(name=name, length=length, offset=offset)

                        if cur_blob in blob_set:
                            blob_set.remove(cur_blob)
                            put_object = put_objects_map[cur_blob.name]

                            executor.submit(self.put_blob, bucket, put_object, cur_blob.length, cur_blob.offset, job_id)

        return job_id

    def put_blob(self, bucket: str, put_object: HelperPutObject, length: int, offset: int, job_id: str):
        stream = put_object.get_data_stream(offset)
        self.client.put_object(PutObjectRequest(bucket_name=bucket,
                                                object_name=put_object.object_name,
                                                length=length,
                                                stream=stream,
                                                offset=offset,
                                                job=job_id))
        stream.close()

    def put_all_objects_in_directory(self, source_dir: str, bucket: str, objects_per_bp_job: int = 1000,
                                     max_threads: int = 5) -> List[str]:
        obj_list: List[HelperPutObject] = list()
        job_list: List[str] = list()
        for root, dirs, files in walk(top=source_dir):
            for name in files:
                obj_path = path.join(root, name)
                obj_name = file_path_to_object_store_name(path.normpath(path.relpath(path=obj_path, start=source_dir)))
                size = os.path.getsize(obj_path)
                obj_list.append(HelperPutObject(object_name=obj_name, file_path=obj_path, size=size))
                if len(obj_list) >= objects_per_bp_job:
                    job_list.append(self.put_objects(obj_list, bucket, max_threads=max_threads))
                    obj_list = []

            for name in dirs:
                dir_path = path.join(root, name)
                dir_name = file_path_to_object_store_name(
                    path.join(path.normpath(path.relpath(path=dir_path, start=source_dir)), ""))
                obj_list.append(HelperPutObject(object_name=dir_name, file_path=dir_path, size=0))
                if len(obj_list) >= objects_per_bp_job:
                    job_list.append(self.put_objects(obj_list, bucket, max_threads=max_threads))
                    obj_list = []

        if len(obj_list) > 0:
            job_list.append(self.put_objects(obj_list, bucket, max_threads=max_threads))

        return job_list

    def get_objects(self, get_objects: List[HelperGetObject], bucket: str, max_threads: int = 5) -> str:
        ds3_get_objects: List[Ds3GetObject] = []
        get_objects_map: Dict[str, HelperGetObject] = dict()
        for entry in get_objects:
            ds3_get_objects.append(Ds3GetObject(name=entry.object_name, version_id=entry.version_id))
            get_objects_map[entry.object_name] = entry

        bulk_get = self.client.get_bulk_job_spectra_s3(GetBulkJobSpectraS3Request(bucket_name=bucket,
                                                                                  object_list=ds3_get_objects))

        job_id = bulk_get.result['JobId']

        blob_set: Set[Blob] = set()
        for chunk in bulk_get.result['ObjectsList']:
            for blob in chunk['ObjectList']:
                name: str = blob['Name']
                length: int = int(blob['Length'])
                offset: int = int(blob['Offset'])
                cur_blob = Blob(name=name, length=length, offset=offset)
                blob_set.add(cur_blob)

        # retrieve until all blobs have been transferred
        while len(blob_set) > 0:
            available_chunks = self.client.get_job_chunks_ready_for_client_processing_spectra_s3(
                GetJobChunksReadyForClientProcessingSpectraS3Request(job_id))

            chunks = available_chunks.result['ObjectsList']

            if len(chunks) <= 0:
                time.sleep(available_chunks.retryAfter)
                continue

            # retrieve all available blobs concurrently
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
                for chunk in chunks:
                    for blob in chunk['ObjectList']:
                        name: str = blob['Name']
                        length: int = int(blob['Length'])
                        offset: int = int(blob['Offset'])
                        cur_blob = Blob(name=name, length=length, offset=offset)

                        if cur_blob in blob_set:
                            blob_set.remove(cur_blob)
                            get_object = get_objects_map[cur_blob.name]

                            executor.submit(self.get_blob, bucket, get_object, offset, job_id)

        return job_id

    def get_blob(self, bucket: str, get_object: HelperGetObject, offset: int, job_id: str):
        stream = get_object.get_data_stream(offset)
        self.client.get_object(GetObjectRequest(bucket_name=bucket,
                                                object_name=get_object.object_name,
                                                stream=stream,
                                                offset=offset,
                                                job=job_id,
                                                version_id=get_object.version_id))
        stream.close()

    def get_all_files_in_bucket(self, destination_dir: str, bucket: str, objects_per_bp_job: int = 1000,
                                max_threads: int = 5) -> List[str]:
        truncated: str = 'true'
        marker = ""
        job_ids: List[str] = []
        while truncated.lower() == 'true':
            list_bucket = self.client.get_bucket(GetBucketRequest(bucket_name=bucket,
                                                                  max_keys=objects_per_bp_job,
                                                                  versions=False,
                                                                  marker=marker))

            get_objects: List[HelperGetObject] = []
            for bp_object in list_bucket.result['ContentsList']:
                is_latest: str = bp_object['IsLatest']
                if is_latest.lower() != 'true':
                    # only retrieve the latest version of objects
                    continue

                object_name: str = bp_object["Key"]
                object_destination = os.path.join(destination_dir, object_name_to_file_path(object_name))
                if object_name.endswith('/'):
                    os.makedirs(object_destination, exist_ok=True)
                else:
                    get_objects.append(HelperGetObject(object_name=object_name, destination_path=object_destination))

            for bp_object in list_bucket.result['VersionList']:
                is_latest: str = bp_object['IsLatest']
                if is_latest.lower() != 'true':
                    # only retrieve the latest version of objects
                    continue

                object_name: str = bp_object["Key"]
                object_destination = os.path.join(destination_dir, object_name_to_file_path(object_name))
                if object_name.endswith('/'):
                    os.makedirs(object_destination, exist_ok=True)
                else:
                    get_objects.append(HelperGetObject(object_name=object_name, destination_path=object_destination))

            if len(get_objects) > 0:
                job_id = self.get_objects(get_objects=get_objects, bucket=bucket, max_threads=max_threads)
                job_ids.append(job_id)

            truncated = list_bucket.result['IsTruncated']
            marker = list_bucket.result['NextMarker']

        return job_ids
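A sketch of the other helper entry points, with the same caveats as above (bucket name, paths, and files are hypothetical; only the classes and methods defined in this file are real):

import os
from ds3 import ds3, ds3Helpers

helper = ds3Helpers.Helper(client=ds3.createClientFromEnv())

# Put an explicit list of files; each object name is paired with a local path.
readme = "/data/docs/readme.txt"
put_list = [ds3Helpers.HelperPutObject(object_name="docs/readme.txt",
                                       file_path=readme,
                                       size=os.path.getsize(readme))]
put_job_id = helper.put_objects(put_list, bucket="my-bucket")

# Restore the latest version of every object in the bucket to a local directory;
# objects whose names end in '/' are recreated as directories.
get_job_ids = helper.get_all_files_in_bucket(destination_dir="/data/restore", bucket="my-bucket")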

ds3/ds3network.py

Lines changed: 4 additions & 2 deletions
@@ -217,7 +217,7 @@ def send_request(self, request):
 
         headers.update(amz_headers)
 
-        if request.body is not None and request.body is not "":
+        if request.body is not None and request.body != "":
             canonicalized_amz_header = self.canonicalized_amz_headers(amz_headers)
             headers['Content-Type'] = 'application/octet-stream'
             headers['Authorization'] = self.build_authorization(verb=request.http_verb,
@@ -261,6 +261,8 @@ def canonicalize_path(self, request_path, query_params):
             path += '?delete'
         if 'versioning' in query_params:
             path += '?versioning=' + str(query_params['versioning'])
+        if 'versions' in query_params:
+            path += '?versions=' + str(query_params['versions'])
         if 'uploads' in query_params:
             path += '?uploads'
             if query_params['uploads'] is not None:
@@ -289,7 +291,7 @@ def sign(self, key, contents):
         signer = hmac.new(key.encode('utf-8'), digestmod=sha1)
         signer.update(contents.encode('utf-8'))
         digest = signer.digest()
-        return base64.encodestring(digest).strip().decode('utf-8')
+        return base64.encodebytes(digest).strip().decode('utf-8')
 
     def normalize_string(self, url):
         return urllib.parse.quote(url)
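The network-layer changes are small compatibility and correctness fixes: base64.encodestring() was removed in Python 3.9 and encodebytes() is its drop-in replacement; the body check moves from 'is not ""' to '!= ""', which is what was meant and avoids the SyntaxWarning newer interpreters emit for literal identity tests; and the 'versions' query parameter now participates in the canonicalized path so signed requests that send it (such as the helper's GetBucketRequest calls) stay consistent. A minimal sketch of the signing step in isolation, mirroring sign() above; the key and string-to-sign are placeholders:

import base64
import hmac
from hashlib import sha1

key = 'placeholder-secret'            # illustrative only, not a real credential
contents = 'GET\n\n\n\n/my-bucket'    # illustrative string-to-sign

signer = hmac.new(key.encode('utf-8'), digestmod=sha1)
signer.update(contents.encode('utf-8'))
# encodebytes() produces the same base64 text encodestring() did on older Pythons.
signature = base64.encodebytes(signer.digest()).strip().decode('utf-8')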
