Commit 096cd83

tests are passing
dpetzold committed Dec 16, 2024
1 parent 895708a
Showing 6 changed files with 84 additions and 21 deletions.
aws_log_parser/aws/s3.py (4 additions, 1 deletion)
@@ -30,7 +30,10 @@ def read_key(self, bucket, key, endswith=None):
         if self.aws_client.verbose:
             print(f"Reading s3://{bucket}/{key}")
         contents = self.client.get_object(Bucket=bucket, Key=key)
-        yield from FileIterator(BytesIO(contents["Body"]), endswith == ".gz")
+        yield from FileIterator(
+            fileobj=BytesIO(contents["Body"].iter_lines()),
+            gzipped=endswith == ".gz",
+        )
 
     def read_keys(self, bucket, prefix, endswith=None):
         for file in self.list_files(bucket, prefix, "LastModified"):
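
For orientation, a rough standalone equivalent of the new read path (the bucket, the key, and the use of read() rather than iter_lines() are assumptions for this sketch; the commit itself wraps the body's iter_lines(), which in the test mock below returns the fixture file's raw bytes):

    import boto3
    from io import BytesIO

    from aws_log_parser.io import FileIterator

    s3 = boto3.client("s3")
    response = s3.get_object(Bucket="my-bucket", Key="logs/access.log.gz")

    # Buffer the body in memory, then iterate it line by line.
    for line in FileIterator(
        fileobj=BytesIO(response["Body"].read()),
        gzipped=True,
    ):
        print(line)
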
aws_log_parser/interface.py (1 addition, 2 deletions)
@@ -103,8 +103,7 @@ def read_file(self, path):
         path = Path(path)
         if self.verbose:
             print(f"Reading file://{path}")
-        with path.open("rb") as fh:
-            yield from self.parse(FileIterator(fh, path.suffix == ".gz"))
+        yield from self.parse(FileIterator(path, gzipped=path.suffix == ".gz"))
 
     def read_files(self, pathname):
         """
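
With this change read_file no longer opens the file itself; FileIterator receives the Path and manages opening and closing during iteration. A minimal usage sketch (the AwsLogParser constructor is assumed from the project README):

    from aws_log_parser import AwsLogParser, LogType

    parser = AwsLogParser(log_type=LogType.CloudFront)

    # read_file yields parsed entries; the path is opened lazily by FileIterator.
    for entry in parser.read_file("test/data/cloudfront-multiple.log"):
        print(entry)
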
aws_log_parser/io.py (30 additions, 2 deletions)
@@ -1,10 +1,15 @@
+from contextlib import contextmanager
 from dataclasses import dataclass
+from pathlib import Path
 import io
 import gzip
+import typing
 
 
 @dataclass
 class FileIterator:
-    fileobj: io.IOBase
+    path: typing.Optional[Path] = None
+    fileobj: typing.Optional[io.IOBase] = None
     gzipped: bool = False
 
     def yield_gzipped(self, fh):
@@ -13,6 +18,29 @@ def yield_gzipped(self, fh):
     def yield_plain(self, fh):
         yield from [line.decode("utf-8") for line in fh]
 
+    @contextmanager
+    def open_path(self):
+        assert self.path
+        fh = self.path.open("rb")
+        try:
+            yield fh
+        finally:
+            fh.close()
+
+    @contextmanager
+    def open_gzip(self):
+        if self.fileobj:
+            yield gzip.GzipFile(fileobj=self.fileobj)
+        else:
+            with self.open_path() as fh:
+                yield gzip.GzipFile(fileobj=fh)
+
     def __iter__(self):
         yield_func = self.yield_gzipped if self.gzipped else self.yield_plain
-        yield from yield_func(self.fileobj)
+        open_func = self.open_gzip if self.gzipped else self.open_path
+
+        if not self.gzipped and self.fileobj:
+            yield from yield_func(self.fileobj)
+        else:
+            with open_func() as fh:
+                yield from yield_func(fh)
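
The net effect: FileIterator can now be built from either a Path, opened and closed lazily inside __iter__ via open_path(), or an already-open file object. A short sketch of both modes (file names hypothetical):

    from io import BytesIO
    from pathlib import Path

    from aws_log_parser.io import FileIterator

    # Path mode: open_path() manages the handle for the duration of iteration.
    for line in FileIterator(path=Path("access.log")):
        print(line)

    # Fileobj mode: gzipped bytes already in memory are wrapped by open_gzip().
    buf = BytesIO(Path("access.log.gz").read_bytes())
    for line in FileIterator(fileobj=buf, gzipped=True):
        print(line)

Only the plain, already-open case bypasses the context managers; every other combination goes through open_path or open_gzip, so handles opened from a Path are always closed.
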
Binary file added test/data/cloudfront-multiple.log.gz
test/test_interface.py (34 additions, 11 deletions)
@@ -14,12 +14,16 @@
 from aws_log_parser.aws.s3 import S3Service
 
 
+@dataclass
 class MockPaginator:
-    def paginate(self, **kwargs):
+    gzipped: bool = False
+
+    def paginate(self, **_):
+        suffix = ".gz" if self.gzipped else ""
         yield {
             "Contents": [
                 {
-                    "Key": "cloudfront-multiple.log",
+                    "Key": f"cloudfront-multiple.log{suffix}",
                     "LastModified": datetime.datetime(
                         2021, 11, 28, 3, 31, 56, tzinfo=tzutc()
                     ),
@@ -36,15 +40,19 @@ class MockStreamingFile:
     filename: str
 
     def iter_lines(self):
-        return open(self.filename, "rb").readlines()
+        return open(self.filename, "rb").read()
 
 
 @dataclass
 class MockS3Client:
-    def get_paginator(self, *args):
-        return MockPaginator()
+    gzipped: bool = False
 
-    def get_object(self, **kwargs):
-        return {"Body": MockStreamingFile("test/data/cloudfront-multiple.log")}
+    def get_paginator(self, *_):
+        return MockPaginator(self.gzipped)
+
+    def get_object(self, **_):
+        suffix = ".gz" if self.gzipped else ""
+        return {"Body": MockStreamingFile(f"test/data/cloudfront-multiple.log{suffix}")}
 
 
 @pytest.fixture
@@ -64,12 +72,27 @@ def test_parse_files(cloudfront_parser):
     assert len(list(entries)) == 6
 
 
-def test_parse_s3(monkeypatch, cloudfront_parser):
-    monkeypatch.setattr(S3Service, "client", MockS3Client())
+def test_parse_s3(monkeypatch, cloudfront_parser, gzipped=False):
+    monkeypatch.setattr(S3Service, "client", MockS3Client(gzipped=gzipped))
+    suffix = ".gz" if gzipped else ""
+
+    entries = cloudfront_parser.read_s3(
+        "bucket",
+        "key",
+        endswith=suffix,
+    )
+    assert len(list(entries)) == 6
+
+
+def test_parse_s3_gzipped(monkeypatch, cloudfront_parser):
+    gzipped = True
+    monkeypatch.setattr(S3Service, "client", MockS3Client(gzipped=gzipped))
+    suffix = ".gz" if gzipped else ""
 
     entries = cloudfront_parser.read_s3(
-        "aws-logs-test-data",
-        "cloudfront-multiple.log",
+        "bucket",
+        "key",
+        endswith=suffix,
     )
     assert len(list(entries)) == 6
 
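
The mocks cover exactly the slice of the boto3 surface that S3Service touches, so the tests never hit the network: get_paginator().paginate() for listing keys and get_object() for reads. A condensed sketch of the chain under test, using the mock classes from this diff:

    client = MockS3Client(gzipped=True)

    # Listing: the paginator yields one page whose key carries the .gz suffix.
    page = next(client.get_paginator("list_objects_v2").paginate(Bucket="bucket"))
    keys = [obj["Key"] for obj in page["Contents"]]  # ["cloudfront-multiple.log.gz"]

    # Reading: the mock body's iter_lines() returns the fixture's raw bytes,
    # which read_key wraps in BytesIO for FileIterator.
    raw = client.get_object(Bucket="bucket", Key=keys[0])["Body"].iter_lines()
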
test/test_io.py (15 additions, 5 deletions)
@@ -1,16 +1,26 @@
+from io import BytesIO
 from pathlib import Path
 from aws_log_parser.io import FileIterator
 
 
 def test_fileiterator_plain():
-
-    file_iterator = FileIterator(Path("test/data/cloudfront-multiple.log").open("rb"))
+    file_iterator = FileIterator(Path("test/data/cloudfront-multiple.log"))
     assert len(list(file_iterator)) == 8
 
 
-def test_fileiterator_gzipped():
+def test_fileiterator_gzipped_path():
+    file_iterator = FileIterator(
+        path=Path("test/data/loadbalancer_http2_entry.csv.gz"),
+        gzipped=True,
+    )
+    assert len(list(file_iterator)) == 1
+
+
+def test_fileiterator_gzipped_fileobj():
     file_iterator = FileIterator(
-        Path("test/data/loadbalancer_http2_entry.csv.gz").open("rb")
+        fileobj=BytesIO(
+            Path("test/data/loadbalancer_http2_entry.csv.gz").open("rb").read()
+        ),
         gzipped=True,
     )
-    assert len(list(file_iterator)) == 8
+    assert len(list(file_iterator)) == 1
