Skip to content

Commit ab6b190

Browse files
authored
Remove unneeded partitioning (#1417)
1 parent 85b2053 commit ab6b190

File tree

1 file changed

+11
-9
lines changed

1 file changed

+11
-9
lines changed

dev/provision.py

Lines changed: 11 additions & 9 deletions
Original file line number · Diff line number · Diff line change
@@ -22,7 +22,17 @@
2222
from pyiceberg.schema import Schema
2323
from pyiceberg.types import FixedType, NestedField, UUIDType
2424

25-
spark = SparkSession.builder.getOrCreate()
25+
# The configuration is important, otherwise we get many small
26+
# parquet files with a single row. When a positional delete
27+
# hits the Parquet file with one row, the parquet file gets
28+
# dropped instead of having a merge-on-read delete file.
29+
spark = (
30+
SparkSession
31+
.builder
32+
.config("spark.sql.shuffle.partitions", "1")
33+
.config("spark.default.parallelism", "1")
34+
.getOrCreate()
35+
)
2636

2737
catalogs = {
2838
'rest': load_catalog(
@@ -120,10 +130,6 @@
120130
"""
121131
)
122132

123-
# Partitioning is not really needed, but there is a bug:
124-
# https://github.com/apache/iceberg/pull/7685
125-
spark.sql(f"ALTER TABLE {catalog_name}.default.test_positional_mor_deletes ADD PARTITION FIELD years(dt) AS dt_years")
126-
127133
spark.sql(
128134
f"""
129135
INSERT INTO {catalog_name}.default.test_positional_mor_deletes
@@ -168,10 +174,6 @@
168174
"""
169175
)
170176

171-
# Partitioning is not really needed, but there is a bug:
172-
# https://github.com/apache/iceberg/pull/7685
173-
spark.sql(f"ALTER TABLE {catalog_name}.default.test_positional_mor_double_deletes ADD PARTITION FIELD years(dt) AS dt_years")
174-
175177
spark.sql(
176178
f"""
177179
INSERT INTO {catalog_name}.default.test_positional_mor_double_deletes

0 commit comments

Comments (0)