Skip to content

Commit 232fced

Browse files
committed
Fix bin pack chunk size for tiny target file size
1 parent 352b1cb commit 232fced

File tree

2 files changed

+7
-1
lines changed

2 files changed

+7
-1
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2681,7 +2681,7 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[list[
26812681
from pyiceberg.utils.bin_packing import PackingIterator
26822682

26832683
avg_row_size_bytes = tbl.nbytes / tbl.num_rows
2684	-	target_rows_per_file = target_file_size // avg_row_size_bytes
2684	+	target_rows_per_file = max(1, int(target_file_size / avg_row_size_bytes))
26852685
batches = tbl.to_batches(max_chunksize=target_rows_per_file)
26862686
bin_packed_record_batches = PackingIterator(
26872687
items=batches,

tests/io/test_pyarrow.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2248,6 +2248,12 @@ def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None:
22482248
assert len(list(bin_packed)) == 5
22492249

22502250

def test_bin_pack_arrow_table_target_size_smaller_than_row(arrow_table_with_null: pa.Table) -> None:
    """Bin-packing with a target file size smaller than one row must not fail.

    With ``target_file_size=1`` the computed rows-per-file would floor to 0
    without the ``max(1, ...)`` guard; this verifies each row lands in its
    own bin and that no rows are dropped or duplicated.
    """
    bin_packed = list(bin_pack_arrow_table(arrow_table_with_null, target_file_size=1))
    # One bin per row: the 1-byte target forces single-row batches.
    assert len(bin_packed) == arrow_table_with_null.num_rows
    # Conservation check: the packed batches cover every input row exactly once.
    assert sum(batch.num_rows for bin_ in bin_packed for batch in bin_) == arrow_table_with_null.num_rows
22512257
def test_schema_mismatch_type(table_schema_simple: Schema) -> None:
22522258
other_schema = pa.schema(
22532259
(

0 commit comments

Comments (0)