Commit 096cd83

tests are passing
dpetzold committed Dec 16, 2024
1 parent 895708a
Showing 6 changed files with 84 additions and 21 deletions.
aws_log_parser/aws/s3.py (4 additions, 1 deletion)
@@ -30,7 +30,10 @@ def read_key(self, bucket, key, endswith=None):
         if self.aws_client.verbose:
             print(f"Reading s3://{bucket}/{key}")
         contents = self.client.get_object(Bucket=bucket, Key=key)
-        yield from FileIterator(BytesIO(contents["Body"]), endswith == ".gz")
+        yield from FileIterator(
+            fileobj=BytesIO(contents["Body"].iter_lines()),
+            gzipped=endswith == ".gz",
+        )
 
     def read_keys(self, bucket, prefix, endswith=None):
         for file in self.list_files(bucket, prefix, "LastModified"):
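
For orientation, a rough standalone equivalent of the new read path (the bucket, the key, and the use of read() rather than iter_lines() are assumptions for this sketch; the commit itself wraps the body's iter_lines(), which in the test mock below returns the fixture file's raw bytes):

    import boto3
    from io import BytesIO

    from aws_log_parser.io import FileIterator

    s3 = boto3.client("s3")
    response = s3.get_object(Bucket="my-bucket", Key="logs/access.log.gz")

    # Buffer the body in memory, then iterate it line by line.
    for line in FileIterator(
        fileobj=BytesIO(response["Body"].read()),
        gzipped=True,
    ):
        print(line)
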
aws_log_parser/interface.py (1 addition, 2 deletions)
@@ -103,8 +103,7 @@ def read_file(self, path):
         path = Path(path)
         if self.verbose:
             print(f"Reading file://{path}")
-        with path.open("rb") as fh:
-            yield from self.parse(FileIterator(fh, path.suffix == ".gz"))
+        yield from self.parse(FileIterator(path, gzipped=path.suffix == ".gz"))
 
     def read_files(self, pathname):
         """
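
With this change read_file no longer opens the file itself; FileIterator receives the Path and manages opening and closing during iteration. A minimal usage sketch (the AwsLogParser constructor is assumed from the project README):

    from aws_log_parser import AwsLogParser, LogType

    parser = AwsLogParser(log_type=LogType.CloudFront)

    # read_file yields parsed entries; the path is opened lazily by FileIterator.
    for entry in parser.read_file("test/data/cloudfront-multiple.log"):
        print(entry)
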
aws_log_parser/io.py (30 additions, 2 deletions)
@@ -1,10 +1,15 @@
+from contextlib import contextmanager
 from dataclasses import dataclass
+from pathlib import Path
 import io
 import gzip
+import typing
 
 
 @dataclass
 class FileIterator:
-    fileobj: io.IOBase
+    path: typing.Optional[Path] = None
+    fileobj: typing.Optional[io.IOBase] = None
     gzipped: bool = False
 
     def yield_gzipped(self, fh):
@@ -13,6 +18,29 @@ def yield_gzipped(self, fh):
     def yield_plain(self, fh):
         yield from [line.decode("utf-8") for line in fh]
 
+    @contextmanager
+    def open_path(self):
+        assert self.path
+        fh = self.path.open("rb")
+        try:
+            yield fh
+        finally:
+            fh.close()
+
+    @contextmanager
+    def open_gzip(self):
+        if self.fileobj:
+            yield gzip.GzipFile(fileobj=self.fileobj)
+        else:
+            with self.open_path() as fh:
+                yield gzip.GzipFile(fileobj=fh)
+
     def __iter__(self):
         yield_func = self.yield_gzipped if self.gzipped else self.yield_plain
-        yield from yield_func(self.fileobj)
+        open_func = self.open_gzip if self.gzipped else self.open_path
+
+        if not self.gzipped and self.fileobj:
+            yield from yield_func(self.fileobj)
+        else:
+            with open_func() as fh:
+                yield from yield_func(fh)
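
The net effect: FileIterator can now be built from either a Path, opened and closed lazily inside __iter__ via open_path(), or an already-open file object. A short sketch of both modes (file names hypothetical):

    from io import BytesIO
    from pathlib import Path

    from aws_log_parser.io import FileIterator

    # Path mode: open_path() manages the handle for the duration of iteration.
    for line in FileIterator(path=Path("access.log")):
        print(line)

    # Fileobj mode: gzipped bytes already in memory are wrapped by open_gzip().
    buf = BytesIO(Path("access.log.gz").read_bytes())
    for line in FileIterator(fileobj=buf, gzipped=True):
        print(line)

Only the plain, already-open case bypasses the context managers; every other combination goes through open_path or open_gzip, so handles opened from a Path are always closed.
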
Binary file added test/data/cloudfront-multiple.log.gz
test/test_interface.py (34 additions, 11 deletions)
@@ -14,12 +14,16 @@
 from aws_log_parser.aws.s3 import S3Service
 
 
+@dataclass
 class MockPaginator:
-    def paginate(self, **kwargs):
+    gzipped: bool = False
+
+    def paginate(self, **_):
+        suffix = ".gz" if self.gzipped else ""
         yield {
             "Contents": [
                 {
-                    "Key": "cloudfront-multiple.log",
+                    "Key": f"cloudfront-multiple.log{suffix}",
                     "LastModified": datetime.datetime(
                         2021, 11, 28, 3, 31, 56, tzinfo=tzutc()
                     ),
@@ -36,15 +40,19 @@ class MockStreamingFile:
     filename: str
 
     def iter_lines(self):
-        return open(self.filename, "rb").readlines()
+        return open(self.filename, "rb").read()
 
 
 @dataclass
 class MockS3Client:
-    def get_paginator(self, *args):
-        return MockPaginator()
+    gzipped: bool = False
 
-    def get_object(self, **kwargs):
-        return {"Body": MockStreamingFile("test/data/cloudfront-multiple.log")}
+    def get_paginator(self, *_):
+        return MockPaginator(self.gzipped)
+
+    def get_object(self, **_):
+        suffix = ".gz" if self.gzipped else ""
+        return {"Body": MockStreamingFile(f"test/data/cloudfront-multiple.log{suffix}")}
 
 
 @pytest.fixture
@@ -64,12 +72,27 @@ def test_parse_files(cloudfront_parser):
     assert len(list(entries)) == 6
 
 
-def test_parse_s3(monkeypatch, cloudfront_parser):
-    monkeypatch.setattr(S3Service, "client", MockS3Client())
+def test_parse_s3(monkeypatch, cloudfront_parser, gzipped=False):
+    monkeypatch.setattr(S3Service, "client", MockS3Client(gzipped=gzipped))
+    suffix = ".gz" if gzipped else ""
+
+    entries = cloudfront_parser.read_s3(
+        "bucket",
+        "key",
+        endswith=suffix,
+    )
+    assert len(list(entries)) == 6
+
+
+def test_parse_s3_gzipped(monkeypatch, cloudfront_parser):
+    gzipped = True
+    monkeypatch.setattr(S3Service, "client", MockS3Client(gzipped=gzipped))
+    suffix = ".gz" if gzipped else ""
 
     entries = cloudfront_parser.read_s3(
-        "aws-logs-test-data",
-        "cloudfront-multiple.log",
+        "bucket",
+        "key",
+        endswith=suffix,
     )
     assert len(list(entries)) == 6
 
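
The mocks cover exactly the slice of the boto3 surface that S3Service touches, so the tests never hit the network: get_paginator().paginate() for listing keys and get_object() for reads. A condensed sketch of the chain under test, using the mock classes from this diff:

    client = MockS3Client(gzipped=True)

    # Listing: the paginator yields one page whose key carries the .gz suffix.
    page = next(client.get_paginator("list_objects_v2").paginate(Bucket="bucket"))
    keys = [obj["Key"] for obj in page["Contents"]]  # ["cloudfront-multiple.log.gz"]

    # Reading: the mock body's iter_lines() returns the fixture's raw bytes,
    # which read_key wraps in BytesIO for FileIterator.
    raw = client.get_object(Bucket="bucket", Key=keys[0])["Body"].iter_lines()
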
test/test_io.py (15 additions, 5 deletions)
@@ -1,16 +1,26 @@
+from io import BytesIO
 from pathlib import Path
 from aws_log_parser.io import FileIterator
 
 
 def test_fileiterator_plain():
-
-    file_iterator = FileIterator(Path("test/data/cloudfront-multiple.log").open("rb"))
+    file_iterator = FileIterator(Path("test/data/cloudfront-multiple.log"))
     assert len(list(file_iterator)) == 8
 
 
-def test_fileiterator_gzipped():
+def test_fileiterator_gzipped_path():
+    file_iterator = FileIterator(
+        path=Path("test/data/loadbalancer_http2_entry.csv.gz"),
+        gzipped=True,
+    )
+    assert len(list(file_iterator)) == 1
+
+
+def test_fileiterator_gzipped_fileobj():
     file_iterator = FileIterator(
-        Path("test/data/loadbalancer_http2_entry.csv.gz").open("rb")
+        fileobj=BytesIO(
+            Path("test/data/loadbalancer_http2_entry.csv.gz").open("rb").read()
+        ),
         gzipped=True,
     )
-    assert len(list(file_iterator)) == 8
+    assert len(list(file_iterator)) == 1
