Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ocr/src/main/java/ocr/OcrProviderAmazon.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public OcrResponse extract(String inputFile) throws Exception {
S3Object s3Object =
S3Object.builder()
.bucket(inputFileInfo.getBucketInfo().getBucketName())
.name(inputFileInfo.getFileName())
.name(inputFileInfo.getPath())
.build();
doc = Document.builder().s3Object(s3Object).build();
} else {
Expand Down
2 changes: 1 addition & 1 deletion ocr/src/main/java/ocr/OcrProviderGoogle.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public OcrResponse extract(String inputFile) throws Exception {
"gs://"
+ inputFileInfo.getBucketInfo().getBucketName()
+ "/"
+ inputFileInfo.getFileName();
+ inputFileInfo.getPath();
GcsSource gcsSource = GcsSource.newBuilder().setUri(gsutilUrl).build();
inputConfig =
InputConfig.newBuilder().setMimeType("application/pdf").setGcsSource(gcsSource).build();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ private StartTranscriptionJobRequest startTranscriptionJobRequest(
String languageCode,
Collection<SubtitleFormat> subtitleFormats) {
String bucketName = input.getBucketInfo().getBucketName();
String fileName = input.getFileName();
String fileName = input.getPath();
String s3Uri = "s3://" + bucketName + "/" + fileName;
StartTranscriptionJobRequest.Builder builder =
StartTranscriptionJobRequest.builder()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ public SpeechRecognitionResponse recognizeSpeech(

private RecognitionAudio createGcsRecognitionAudio(FileInfo file) {
String bucket = file.getBucketInfo().getBucketName();
String key = file.getFileName();
String key = file.getPath();
String gcsUrl = "gs://" + bucket + "/" + key;
return RecognitionAudio.newBuilder().setUri(gcsUrl).build();
}
Expand Down
6 changes: 6 additions & 0 deletions storage/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@
<artifactId>shared</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>5.10.0</version>
<scope>test</scope>
</dependency>
</dependencies>

<build>
Expand Down
31 changes: 15 additions & 16 deletions storage/src/main/java/storage/BucketInfo.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,12 @@
@ToString
public class BucketInfo {

public static final String AWS_BUCKET_REGEX = "(http|https)://(.*).s3.(.*.)?amazonaws.com/?";
public static final String AWS_BUCKET_REGEX = "(http|https)://(.*).s3.(.*.)?amazonaws.com/?(.*)";
public static final String GCP_BUCKET_REGEX = "(http|https)://storage.cloud.google.com/(.*?)/(.*)";
public static final Pattern AWS_BUCKET_REGEX_PATTERN = Pattern.compile(AWS_BUCKET_REGEX);
public static final Pattern GCP_BUCKET_REGEX_PATTERN = Pattern.compile(GCP_BUCKET_REGEX);

private static final String AWS_DEFAULT_REGION_WITHOUT_NAME_IN_URL = "us-east-1";

private Provider provider; // AWS | GCP
private String region; // this is null for GCP, as the region is not included in the url
Expand All @@ -33,25 +37,23 @@ public static BucketInfo parse(String bucketUrl) {

/** Get provider from bucket URL. */
private static Provider getProvider(String bucketUrl) {
if (bucketUrl.matches(AWS_BUCKET_REGEX)) {
if (AWS_BUCKET_REGEX_PATTERN.matcher(bucketUrl).matches()) {
return Provider.AWS;
} else if (bucketUrl.matches(GCP_BUCKET_REGEX)) {
} else if (GCP_BUCKET_REGEX_PATTERN.matcher(bucketUrl).matches()) {
return Provider.GCP;
}
return null;
}

/** Get the location where the storage bucket resides. */
private static String getBucketRegion(String bucketUrl) {
if (bucketUrl.matches(AWS_BUCKET_REGEX)) {
if (AWS_BUCKET_REGEX_PATTERN.matcher(bucketUrl).matches()) {
// region is encoded in the storage url
Pattern p = null;
p = Pattern.compile(AWS_BUCKET_REGEX);
Matcher m = p.matcher(bucketUrl);
Matcher m = AWS_BUCKET_REGEX_PATTERN.matcher(bucketUrl);
if (m.find()) {
String region = m.group(3);
if (region == null || region.isEmpty() || region.isBlank()) {
return null;
return AWS_DEFAULT_REGION_WITHOUT_NAME_IN_URL;
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is still wrong: I found EU bucket URLs without a region specifier.

}
return region.substring(0, region.length() - 1);
}
Expand All @@ -60,17 +62,14 @@ private static String getBucketRegion(String bucketUrl) {
}

/** Get bucket name from bucket URL. */
private static String getBucketName(String buketUrl) {
Pattern p = null;
if (buketUrl.matches(AWS_BUCKET_REGEX)) {
p = Pattern.compile(AWS_BUCKET_REGEX);
Matcher m = p.matcher(buketUrl);
private static String getBucketName(String bucketUrl) {
if (AWS_BUCKET_REGEX_PATTERN.matcher(bucketUrl).matches()) {
Matcher m = AWS_BUCKET_REGEX_PATTERN.matcher(bucketUrl);
if (m.find()) {
return m.group(2);
}
} else if (buketUrl.matches(GCP_BUCKET_REGEX)) {
p = Pattern.compile(GCP_BUCKET_REGEX);
Matcher m = p.matcher(buketUrl);
} else if (GCP_BUCKET_REGEX_PATTERN.matcher(bucketUrl).matches()) {
Matcher m = GCP_BUCKET_REGEX_PATTERN.matcher(bucketUrl);
if (m.find()) {
return m.group(2);
}
Expand Down
48 changes: 37 additions & 11 deletions storage/src/main/java/storage/FileInfo.java
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
package storage;

import lombok.*;
import org.apache.commons.io.FilenameUtils;

import java.nio.file.FileSystems;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.*;
import org.apache.commons.io.FilenameUtils;

@Builder
@NoArgsConstructor
Expand All @@ -19,9 +18,24 @@ public class FileInfo {
public static final String GCP_FILE_REGEX = "(http|https)://storage.cloud.google.com/(.*?)/(.*)";

private boolean isLocal;
private String fileName;
@Deprecated private String fileName;

private String fileUrl;
private BucketInfo bucketInfo;
private String path;
private String name;

/**
 * Builds a {@link FileInfo} describing a file on the local file system.
 *
 * <p>The given location is normalized and converted to an absolute path; that string is stored
 * as both the file URL and the path. The value returned by {@link FilenameUtils#getName(String)}
 * for the original argument is stored as the name and, for backward compatibility, as the
 * deprecated file name.
 *
 * @param fileUrl location of the local file as passed in by the caller
 * @return a descriptor flagged as local, without bucket information
 */
private static FileInfo parseLocalFileUrl(String fileUrl) {
  String resolvedLocation =
      FileSystems.getDefault().getPath(fileUrl).normalize().toAbsolutePath().toString();
  // Derived from the raw argument, not from the resolved location.
  String baseName = FilenameUtils.getName(fileUrl);
  return FileInfo.builder()
      .isLocal(true)
      .fileUrl(resolvedLocation)
      .fileName(baseName)
      .name(baseName)
      .path(resolvedLocation)
      .build();
}

public static FileInfo parse(String fileUrl) {
if (isLocalFile(fileUrl)) {
Expand All @@ -31,25 +45,37 @@ public static FileInfo parse(String fileUrl) {
}
}

private static FileInfo parseLocalFileUrl(String fileUrl) {
String absolutePath =
FileSystems.getDefault().getPath(fileUrl).normalize().toAbsolutePath().toString();
String fileName = FilenameUtils.getName(fileUrl);
return FileInfo.builder().isLocal(true).fileUrl(absolutePath).fileName(fileName).build();
}

/**
 * Builds a {@link FileInfo} describing a file addressed by a cloud storage URL.
 *
 * <p>The bucket part of the URL is parsed into a {@link BucketInfo}. The value extracted by
 * {@code getFileName(fileUrl)} is stored as the path and, for backward compatibility, as the
 * deprecated file name; the name is what {@link FilenameUtils#getName(String)} returns for that
 * value. The URL itself is kept unchanged.
 *
 * @param fileUrl cloud storage URL of the file
 * @return a descriptor flagged as non-local, carrying the parsed bucket information
 */
private static FileInfo parseCloudStorageFileUrl(String fileUrl) {
  BucketInfo parsedBucket = BucketInfo.parse(getBucketUrl(fileUrl));
  String objectPath = getFileName(fileUrl);
  String baseName = FilenameUtils.getName(objectPath);
  return FileInfo.builder()
      .isLocal(false)
      .fileName(objectPath)
      .path(objectPath)
      .name(baseName)
      .fileUrl(fileUrl)
      .bucketInfo(parsedBucket)
      .build();
}

/**
 * Returns the legacy {@code fileName} value, whose meaning depends on where the file lives.
 *
 * <ul>
 *   <li>For local files it is the bare file name, e.g. {@code "/home/user1/file1.txt"} yields
 *       {@code "file1.txt"}.
 *   <li>For cloud files it is the path of the object, e.g. {@code
 *       "https://storage.cloud.google.com/region/folder1/file1.txt"} yields {@code
 *       "/folder1/file1.txt"} (example kept from the original author; whether the leading slash
 *       is really present depends on the URL parser, which is not shown here - TODO confirm).
 * </ul>
 *
 * <p>Use {@link #getName()} when the file name is wanted and {@link #getPath()} when the path
 * is wanted.
 *
 * @return the file name for local files, the object path for cloud files
 * @deprecated ambiguous; replaced by {@link #getName()} and {@link #getPath()}
 */
@Deprecated
public String getFileName() {
  return this.fileName;
}

/** Returns true if the file is not a cloud storage url. */
private static boolean isLocalFile(String fileUrl) {
if (!fileUrl.matches(AWS_FILE_REGEX)
Expand Down
32 changes: 18 additions & 14 deletions storage/src/main/java/storage/StorageProviderAmazon.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
package storage;

import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import shared.Credentials;
import software.amazon.awssdk.core.ResponseInputStream;
import software.amazon.awssdk.core.sync.RequestBody;
Expand All @@ -8,12 +14,6 @@
import software.amazon.awssdk.services.s3.model.*;
import software.amazon.awssdk.services.s3.paginators.ListObjectsV2Iterable;

import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.stream.Collectors;

public class StorageProviderAmazon implements StorageProvider {

private Credentials credentials;
Expand All @@ -30,7 +30,7 @@ public byte[] read(String fileUrl) throws Exception {
GetObjectRequest getObjectRequest =
GetObjectRequest.builder()
.bucket(fileInfo.getBucketInfo().getBucketName())
.key(fileInfo.getFileName())
.key(fileInfo.getPath())
.build();
ResponseInputStream<GetObjectResponse> response = s3.getObject(getObjectRequest);
byte[] data = response.readAllBytes();
Expand All @@ -46,7 +46,7 @@ public void write(byte[] data, String fileUrl) throws Exception {
PutObjectRequest objectRequest =
PutObjectRequest.builder()
.bucket(fileInfo.getBucketInfo().getBucketName())
.key(fileInfo.getFileName())
.key(fileInfo.getPath())
.build();
s3.putObject(objectRequest, RequestBody.fromByteBuffer(ByteBuffer.wrap(data)));
s3.close();
Expand All @@ -60,7 +60,7 @@ public boolean delete(String fileUrl) throws IOException {
DeleteObjectRequest deleteObjectRequest =
DeleteObjectRequest.builder()
.bucket(fileInfo.getBucketInfo().getBucketName())
.key(fileInfo.getFileName())
.key(fileInfo.getPath())
.build();
s3.deleteObject(deleteObjectRequest);
} catch (Exception e) {
Expand Down Expand Up @@ -122,20 +122,24 @@ public String getRegion(String bucketUrl) throws IOException {
s3.getBucketLocation(
GetBucketLocationRequest.builder().bucket(bucketInfo.getBucketName()).build());
String locationConstraint = response.locationConstraint().toString();
if (locationConstraint == "null") {
if (Objects.equals(locationConstraint, "null")) {
return "us-east-1";
}
return locationConstraint;
}

@Override
public List<String> listFiles(String bucketUrl) throws IOException {
BucketInfo bucketInfo = BucketInfo.parse(bucketUrl);
FileInfo fileInfo = FileInfo.parse(bucketUrl);
BucketInfo bucketInfo = fileInfo.getBucketInfo();
String region = getRegion(bucketUrl);
S3Client s3 = getAmazonS3Client(credentials, region);
ListObjectsRequest listObjects =
ListObjectsRequest.builder().bucket(bucketInfo.getBucketName()).build();
ListObjectsResponse res = s3.listObjects(listObjects);
ListObjectsRequest.Builder listObjectsRequestBuilder =
ListObjectsRequest.builder().bucket(bucketInfo.getBucketName());
if (fileInfo.getPath() != null && !fileInfo.getPath().isBlank()) {
listObjectsRequestBuilder.prefix(fileInfo.getPath());
}
ListObjectsResponse res = s3.listObjects(listObjectsRequestBuilder.build());
List<S3Object> objects = res.contents();
List<String> fileKeys = objects.stream().map(o -> o.key()).collect(Collectors.toList());
return fileKeys;
Expand Down
18 changes: 12 additions & 6 deletions storage/src/main/java/storage/StorageProviderGoogle.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@ public StorageProviderGoogle(Credentials credentials) {
public byte[] read(String fileUrl) throws Exception {
FileInfo fileInfo = FileInfo.parse(fileUrl);
Storage gcs = getGoogleCloudStorage(credentials);
return gcs.readAllBytes(fileInfo.getBucketInfo().getBucketName(), fileInfo.getFileName());
return gcs.readAllBytes(fileInfo.getBucketInfo().getBucketName(), fileInfo.getPath());
}

@Override
public void write(byte[] data, String fileUrl) throws Exception {
FileInfo fileInfo = FileInfo.parse(fileUrl);
Storage gcs = getGoogleCloudStorage(credentials);
BlobId blobId = BlobId.of(fileInfo.getBucketInfo().getBucketName(), fileInfo.getFileName());
BlobId blobId = BlobId.of(fileInfo.getBucketInfo().getBucketName(), fileInfo.getPath());
BlobInfo blobInfo = BlobInfo.newBuilder(blobId).build();
gcs.createFrom(blobInfo, new ByteArrayInputStream(data));
}
Expand All @@ -39,11 +39,11 @@ public void write(byte[] data, String fileUrl) throws Exception {
public boolean delete(String fileUrl) {
FileInfo fileInfo = FileInfo.parse(fileUrl);
Storage gcs = getGoogleCloudStorage(credentials);
Blob blob = gcs.get(fileInfo.getBucketInfo().getBucketName(), fileInfo.getFileName());
Blob blob = gcs.get(fileInfo.getBucketInfo().getBucketName(), fileInfo.getPath());
if (blob != null) {
Storage.BlobSourceOption precondition =
Storage.BlobSourceOption.generationMatch(blob.getGeneration());
gcs.delete(fileInfo.getBucketInfo().getBucketName(), fileInfo.getFileName(), precondition);
gcs.delete(fileInfo.getBucketInfo().getBucketName(), fileInfo.getPath(), precondition);
return true;
}
return false;
Expand Down Expand Up @@ -88,9 +88,15 @@ public String getRegion(String bucketUrl) throws IOException {

@Override
public List<String> listFiles(String bucketUrl) {
BucketInfo bucketInfo = BucketInfo.parse(bucketUrl);
FileInfo fileInfo = FileInfo.parse(bucketUrl);
BucketInfo bucketInfo = fileInfo.getBucketInfo();
Storage gcs = getGoogleCloudStorage(credentials);
Page<Blob> blobs = gcs.list(bucketInfo.getBucketName());
Page<Blob> blobs;
if (fileInfo.getPath() == null || fileInfo.getPath().isEmpty()) {
blobs = gcs.list(bucketInfo.getBucketName());
} else {
blobs = gcs.list(bucketInfo.getBucketName(), Storage.BlobListOption.prefix(fileInfo.getPath()));
}
List<String> fileKeys = new ArrayList<>();
for (Blob blob : blobs.iterateAll()) {
fileKeys.add(blob.getName());
Expand Down
59 changes: 59 additions & 0 deletions storage/src/test/java/storage/BucketInfoTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package storage;

import static org.junit.jupiter.api.Assertions.*;

import org.junit.jupiter.api.Test;
import shared.Provider;

/**
 * Checks that {@link BucketInfo#parse(String)} extracts bucket name, provider, bucket URL and
 * region from AWS and GCP storage URLs, with and without a trailing file part.
 */
class BucketInfoTest {

  /**
   * Parses {@code url} and verifies every field of the resulting {@link BucketInfo}.
   *
   * @param url storage URL handed to the parser; also the expected bucket URL
   * @param expectedName expected bucket name
   * @param expectedProvider expected cloud provider
   * @param expectedRegion expected region, or {@code null} when no region must be reported
   */
  private static void assertParsed(
      String url, String expectedName, Provider expectedProvider, String expectedRegion) {
    BucketInfo parsed = BucketInfo.parse(url);
    assertEquals(expectedName, parsed.getBucketName());
    assertEquals(expectedProvider, parsed.getProvider());
    assertEquals(url, parsed.getBucketUrl());
    if (expectedRegion == null) {
      assertNull(parsed.getRegion());
    } else {
      assertEquals(expectedRegion, parsed.getRegion());
    }
  }

  /** AWS bucket URL without a region part: the region is expected to be "us-east-1". */
  @Test
  public void shouldIdentifyBucketInfoFromSimpleAwsUrl() {
    assertParsed(
        "https://baasless-input-us-e1.s3.amazonaws.com",
        "baasless-input-us-e1",
        Provider.AWS,
        "us-east-1");
  }

  /** AWS bucket URL with an explicit region part. */
  @Test
  public void shouldIdentifyBucketInfoFromSimpleAwsUrlWithRegion() {
    assertParsed(
        "https://baasless-input-us-e1.s3.us-east-1.amazonaws.com",
        "baasless-input-us-e1",
        Provider.AWS,
        "us-east-1");
  }

  /** AWS URL that points at a file inside the bucket, without a region part. */
  @Test
  public void shouldIdentifyBucketInfoFromSimpleAwsFileUrl() {
    assertParsed(
        "https://baasless-input-us-e1.s3.amazonaws.com/sample-1.wav",
        "baasless-input-us-e1",
        Provider.AWS,
        "us-east-1");
  }

  /** GCP bucket URL: no region is expected. */
  @Test
  public void shouldIdentifyBucketNameFromSimpleGcpUrl() {
    assertParsed(
        "https://storage.cloud.google.com/europe-west1-intents/",
        "europe-west1-intents",
        Provider.GCP,
        null);
  }

  /** GCP URL that points at a file inside the bucket: no region is expected. */
  @Test
  public void shouldIdentifyBucketNameFromSimpleGcpFileUrl() {
    assertParsed(
        "https://storage.cloud.google.com/europe-west1-intents/sample-1.wav",
        "europe-west1-intents",
        Provider.GCP,
        null);
  }
}
Loading