Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AvroTensorDataset] Add more py test to cover various scenarios #1795

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions tests/test_atds_avro/benchmark/test_atds_autotuning_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
# ==============================================================================
"""ATDS benchmark with autotuning."""

import pytest
import tensorflow as tf

from tests.test_atds_avro.utils.data_source import DataSource
from tests.test_atds_avro.utils.data_source_registry import LARGE_NUM_RECORDS
from tests.test_atds_avro.utils.atds_benchmark_utils import (
run_atds_benchmark_from_data_source,
)
from tests.test_atds_avro.utils.benchmark_utils import MIXED_TYPES_SCENARIO

# Parameter grid for the autotuning benchmark: every batch size is paired with
# every parallelism setting; shuffle buffer size (1024) and codec ("deflate")
# are held constant.
BATCH_SIZES = [2**exponent for exponent in range(3, 11)]  # 8, 16, ..., 1024
PARALLELISM = [*range(1, 7), tf.data.AUTOTUNE]
PARAMS = [
    (size, 1024, "deflate", workers)
    for size in BATCH_SIZES
    for workers in PARALLELISM
]


@pytest.mark.benchmark(group="autotuning")
@pytest.mark.parametrize(
    ["batch_size", "shuffle_buffer_size", "codec", "parallelism"], PARAMS
)
def test_autotuning(batch_size, shuffle_buffer_size, codec, parallelism, benchmark):
    """Run the ATDS benchmark for one (batch size, parallelism) combination.

    The parameter tuples come from the module-level ``PARAMS`` grid. Each case
    builds a ``DataSource`` over ``MIXED_TYPES_SCENARIO`` with
    ``LARGE_NUM_RECORDS`` records and hands it to
    ``run_atds_benchmark_from_data_source`` with ``rounds=10``.
    """
    # Keyword options are forwarded unchanged to the benchmark helper.
    benchmark_options = {
        "parallelism": parallelism,
        "codec": codec,
        "shuffle_buffer_size": shuffle_buffer_size,
        "rounds": 10,
    }
    run_atds_benchmark_from_data_source(
        DataSource(scenario=MIXED_TYPES_SCENARIO, num_records=LARGE_NUM_RECORDS),
        batch_size,
        benchmark,
        **benchmark_options,
    )
Comment on lines +27 to +54
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The file name says that you are testing the autotune functionality. However, I see similar tests in tests/test_atds_avro/benchmark/test_atds_parallelism_benchmark.py as well. Maybe the tests can be combined?

107 changes: 107 additions & 0 deletions tests/test_atds_avro/benchmark/test_atds_parallelism_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
# ==============================================================================
"""ATDS benchmark with parallelism."""

import pytest
import tensorflow as tf

from tests.test_atds_avro.utils.data_source import DataSource
from tests.test_atds_avro.utils.data_source_registry import LARGE_NUM_RECORDS
from tests.test_atds_avro.utils.atds_benchmark_utils import (
run_atds_benchmark_from_data_source,
)
from tests.test_atds_avro.utils.benchmark_utils import MIXED_TYPES_SCENARIO


@pytest.mark.benchmark(group="parallelism")
@pytest.mark.parametrize(
    ["batch_size", "shuffle_buffer_size", "codec", "parallelism"],
    [
        # Codec varies slowest, parallelism (1..6) fastest, so the generated
        # cases keep the order of the original hand-written table.
        (128, 1024, codec_name, workers)
        for codec_name in ("null", "deflate", "snappy")
        for workers in range(1, 7)
    ],
)
def test_parallelism(batch_size, shuffle_buffer_size, codec, parallelism, benchmark):
    """Run the ATDS benchmark for one (codec, parallelism) combination.

    Batch size (128) and shuffle buffer size (1024) are the same in every
    case. Each case builds a ``DataSource`` over ``MIXED_TYPES_SCENARIO`` with
    ``LARGE_NUM_RECORDS`` records and hands it to
    ``run_atds_benchmark_from_data_source`` with ``rounds=10``.
    """
    benchmark_options = {
        "parallelism": parallelism,
        "codec": codec,
        "shuffle_buffer_size": shuffle_buffer_size,
        "rounds": 10,
    }
    run_atds_benchmark_from_data_source(
        DataSource(scenario=MIXED_TYPES_SCENARIO, num_records=LARGE_NUM_RECORDS),
        batch_size,
        benchmark,
        **benchmark_options,
    )


@pytest.mark.benchmark(group="parallelism")
@pytest.mark.parametrize(
    ["batch_size", "shuffle_buffer_size", "parallelism", "interleave"],
    [
        (size, 1024, workers, interleave_workers)
        for size in (32, 128)
        for workers, interleave_workers in (
            # Fixed settings: parallelism * interleave == 6 in every pair.
            (1, 6),
            (2, 3),
            (3, 2),
            (6, 1),
            # Autotuned parallelism against each interleave setting.
            (tf.data.AUTOTUNE, 1),
            (tf.data.AUTOTUNE, 2),
            (tf.data.AUTOTUNE, 3),
            (tf.data.AUTOTUNE, 6),
        )
    ],
)
def test_parallelism_with_interleave(
    batch_size, shuffle_buffer_size, parallelism, interleave, benchmark
):
    """Run the ATDS benchmark for one (parallelism, interleave) combination.

    The data source is split into 6 partitions and always uses the "deflate"
    codec. ``interleave`` is forwarded as ``interleave_parallelism``. Unlike
    the other benchmarks in this module, no ``rounds`` value is passed, so the
    helper's own default applies.
    """
    partitioned_source = DataSource(
        scenario=MIXED_TYPES_SCENARIO, num_records=LARGE_NUM_RECORDS, partitions=6
    )
    benchmark_options = {
        "parallelism": parallelism,
        "interleave_parallelism": interleave,
        "codec": "deflate",
        "shuffle_buffer_size": shuffle_buffer_size,
    }
    run_atds_benchmark_from_data_source(
        partitioned_source, batch_size, benchmark, **benchmark_options
    )
37 changes: 37 additions & 0 deletions tests/test_atds_avro/benchmark/test_codec_atds_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
# ==============================================================================
"""ATDS benchmark with different codecs."""

import pytest

from tests.test_atds_avro.utils.data_source import DataSource
from tests.test_atds_avro.utils.data_source_registry import SMALL_NUM_RECORDS
from tests.test_atds_avro.utils.atds_benchmark_utils import (
run_atds_benchmark_from_data_source,
)
from tests.test_atds_avro.utils.benchmark_utils import MIXED_TYPES_SCENARIO


@pytest.mark.benchmark(group="codec")
@pytest.mark.parametrize(
    ["batch_size", "codec"],
    [(128, codec_name) for codec_name in ("null", "deflate", "snappy")],
)
def test_codec(batch_size, codec, benchmark):
    """Run the ATDS benchmark once per codec at a fixed batch size of 128.

    Each case builds a ``DataSource`` over ``MIXED_TYPES_SCENARIO`` with
    ``SMALL_NUM_RECORDS`` records and hands it to
    ``run_atds_benchmark_from_data_source`` with the selected codec.
    """
    source = DataSource(scenario=MIXED_TYPES_SCENARIO, num_records=SMALL_NUM_RECORDS)
    run_atds_benchmark_from_data_source(
        source,
        batch_size,
        benchmark,
        codec=codec,
    )
Comment on lines +17 to +37
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same issue as before: I see similar tests in tests/test_atds_avro/benchmark/test_atds_parallelism_benchmark.py as well. Maybe the tests can be combined?

108 changes: 108 additions & 0 deletions tests/test_atds_avro/benchmark/test_mixed_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
# ==============================================================================
"""ATDS benchmark for schema with mixed data types."""

import glob
import os
import pytest
import tensorflow as tf

from tests.test_atds_avro.utils.data_source import DataSource
from tests.test_atds_avro.utils.data_source_registry import SMALL_NUM_RECORDS
from tests.test_atds_avro.utils.generator.tensor_generator import (
IntTensorGenerator,
FloatTensorGenerator,
WordTensorGenerator,
)
from tests.test_atds_avro.utils.generator.sparse_tensor_generator import (
FloatSparseTensorGenerator,
ValueDistribution,
)
from tests.test_atds_avro.utils.atds_writer import ATDSWriter
from tests.test_atds_avro.utils.benchmark_utils import benchmark_func
from tests.test_atds_avro.utils.atds_benchmark_utils import (
get_dataset,
get_features_from_data_source,
)


@pytest.mark.benchmark(group="mixed")
def test_mixed_benchmark_data():
    """Benchmark reading a schema that mixes dense, sparse, int and string features.

    The scenario is written with ``ATDSWriter`` as 10 partitions of
    ``SMALL_NUM_RECORDS`` records, read back through ``get_dataset``,
    unbatched, and passed to ``benchmark_func``.
    """
    float32 = tf.dtypes.float32

    # Each helper returns a fresh generator, so every feature owns its own
    # instance.
    def sparse_float(size):
        return FloatSparseTensorGenerator(
            tf.SparseTensorSpec([size], float32), ValueDistribution.SINGLE_VALUE
        )

    def dense_float(shape):
        return FloatTensorGenerator(tf.TensorSpec(shape, float32))

    def scalar_int(dtype):
        return IntTensorGenerator(tf.TensorSpec([], dtype))

    def scalar_word():
        return WordTensorGenerator(tf.TensorSpec([], tf.dtypes.string), avg_length=24)

    # Insertion order is kept as-is in case the writer derives the schema
    # field order from it.
    scenario = {
        "sparse_1d_float_small_1": sparse_float(3),
        "sparse_1d_float_large": sparse_float(50001),
        "dense_0d_float": dense_float([]),
        "dense_1d_float_large_1": dense_float([200]),
        "dense_0d_int_1": scalar_int(tf.dtypes.int32),
        "sparse_1d_float_medium_1": sparse_float(10),
        "dense_1d_float_large_2": dense_float([200]),
        "dense_1d_float_small_1": dense_float([2]),
        "dense_1d_float_large_3": dense_float([200]),
        "dense_1d_float_small_2": dense_float([2]),
        "dense_1d_float_small_3": dense_float([2]),
        "sparse_1d_float_medium_2": sparse_float(51),
        "sparse_1d_float_small_2": sparse_float(3),
        "dense_1d_float_large_4": dense_float([200]),
        "dense_1d_float_small_4": dense_float([1]),
        "dense_0d_string_1": scalar_word(),
        "dense_0d_int_2": scalar_int(tf.dtypes.int32),
        "dense_0d_string_2": scalar_word(),
        "dense_0d_long": scalar_int(tf.dtypes.int64),
    }

    data_source = DataSource(
        scenario=scenario, num_records=SMALL_NUM_RECORDS, partitions=10
    )

    # NOTE(review): the read and benchmark are kept inside the writer context
    # on the assumption that the written files only live as long as the
    # context does -- confirm against ATDSWriter.
    with ATDSWriter() as writer:
        output_dir = writer.write(data_source)
        input_files = glob.glob(os.path.join(output_dir, f"*.{writer.extension}"))
        features = get_features_from_data_source(writer, data_source)
        benchmark_func(get_dataset(input_files, features).unbatch())
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
# ==============================================================================
"""ATDS benchmark with multiple files."""

import pytest

from tests.test_atds_avro.utils.data_source import DataSource
from tests.test_atds_avro.utils.data_source_registry import (
LARGE_NUM_RECORDS,
MULTIPLE_PARTITION,
)
from tests.test_atds_avro.utils.atds_benchmark_utils import (
run_atds_benchmark_from_data_source,
)
from tests.test_atds_avro.utils.benchmark_utils import MIXED_TYPES_SCENARIO


@pytest.mark.benchmark(group="multi_partition")
@pytest.mark.parametrize(["batch_size", "partitions"], [(128, MULTIPLE_PARTITION)])
def test_multiple_partitions(batch_size, partitions, benchmark):
    """Run the ATDS benchmark over a data source split into several files.

    Builds a ``DataSource`` over ``MIXED_TYPES_SCENARIO`` with
    ``LARGE_NUM_RECORDS`` records and ``partitions`` partitions, then hands it
    to ``run_atds_benchmark_from_data_source`` with the helper's defaults.
    """
    source_options = {
        "scenario": MIXED_TYPES_SCENARIO,
        "num_records": LARGE_NUM_RECORDS,
        "partitions": partitions,
    }
    run_atds_benchmark_from_data_source(
        DataSource(**source_options), batch_size, benchmark
    )
Loading