
Pyarrow data type, default to small type and fix large type override #1859

Open · wants to merge 6 commits into base: main
16 changes: 8 additions & 8 deletions mkdocs/docs/api.md
@@ -418,7 +418,7 @@ This produces the following result with `tbl.scan().to_arrow()`:

```python
pyarrow.Table
-city: large_string
+city: string
lat: double
long: double
----
@@ -476,7 +476,7 @@ This produces the following result with `tbl.scan().to_arrow()`:

```python
pyarrow.Table
-city: large_string
+city: string
lat: double
long: double
----
@@ -957,14 +957,14 @@ split_offsets: list<item: int64>
equality_ids: list<item: int32>
child 0, item: int32
sort_order_id: int32
-readable_metrics: struct<city: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: large_string, upper_bound: large_string> not null, lat: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null, long: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null>
+readable_metrics: struct<city: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: string, upper_bound: string> not null, lat: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null, long: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null>
child 0, city: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: string, upper_bound: string> not null
child 0, column_size: int64
child 1, value_count: int64
child 2, null_value_count: int64
child 3, nan_value_count: int64
-child 4, lower_bound: large_string
-child 5, upper_bound: large_string
+child 4, lower_bound: string
+child 5, upper_bound: string
child 1, lat: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double> not null
child 0, column_size: int64
child 1, value_count: int64
@@ -998,7 +998,7 @@ equality_ids:[[[],[]]]
sort_order_id:[[[],[]]]
readable_metrics: [
-- is_valid: all not null
--- child 0 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: large_string, upper_bound: large_string>
+-- child 0 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: string, upper_bound: string>
-- is_valid: all not null
-- child 0 type: int64
[140]
@@ -1008,9 +1008,9 @@ readable_metrics: [
[0]
-- child 3 type: int64
[null]
--- child 4 type: large_string
+-- child 4 type: string
["Amsterdam"]
--- child 5 type: large_string
+-- child 5 type: string
["San Francisco"]
-- child 1 type: struct<column_size: int64, value_count: int64, null_value_count: int64, nan_value_count: int64, lower_bound: double, upper_bound: double>
-- is_valid: all not null
2 changes: 1 addition & 1 deletion mkdocs/docs/configuration.md
@@ -199,7 +199,7 @@ PyIceberg uses [S3FileSystem](https://arrow.apache.org/docs/python/generated/pya

| Key | Example | Description |
| ------------------------------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| pyarrow.use-large-types-on-read | True | Use large PyArrow types i.e. [large_string](https://arrow.apache.org/docs/python/generated/pyarrow.large_string.html), [large_binary](https://arrow.apache.org/docs/python/generated/pyarrow.large_binary.html) and [large_list](https://arrow.apache.org/docs/python/generated/pyarrow.large_list.html) field types on table scans. The default value is True. |
+| pyarrow.use-large-types-on-read | False | Force large PyArrow types i.e. [large_string](https://arrow.apache.org/docs/python/generated/pyarrow.large_string.html), [large_binary](https://arrow.apache.org/docs/python/generated/pyarrow.large_binary.html) and [large_list](https://arrow.apache.org/docs/python/generated/pyarrow.large_list.html) field types on table scans. The default value is False. |

<!-- markdown-link-check-enable-->

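For illustration, a minimal sketch of turning this override on for a loaded table. The catalog name and table identifier below are hypothetical; the property key and the `tbl.io.properties` pattern come from this PR's docs and tests.

```python
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")            # hypothetical catalog name
tbl = catalog.load_table("default.cities")   # hypothetical table identifier

# Small Arrow types (string, binary, list) are now the scan default;
# setting the flag to "True" opts back in to the large_* variants.
tbl.io.properties["pyarrow.use-large-types-on-read"] = "True"
arrow_table = tbl.scan().to_arrow()
```

Note that the pyiceberg/io/pyarrow.py changes below also deprecate this property, with removal planned for 0.11.0.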
12 changes: 6 additions & 6 deletions pyiceberg/io/pyarrow.py
@@ -625,7 +625,7 @@ def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:

def list(self, list_type: ListType, element_result: pa.DataType) -> pa.DataType:
element_field = self.field(list_type.element_field, element_result)
-return pa.large_list(value_type=element_field)
+return pa.list_(value_type=element_field)
Contributor

I'm not convinced that we need to change this. We use `schema_to_pyarrow` in many places:

  • `Schema.as_arrow()`: this can be problematic when people already allocate buffers that are larger than what fits in the small ones.
  • `_ConvertToArrowExpression.{visit_in,visit_not_in}`: I checked manually, and it looks like we can mix large and normal types here :) (see the cast sketch right after this list).
  • `ArrowProjectionVisitor` has an issue similar to what you've described in "Arrow: Infer the types when reading" #1669 (comment). I think the other way around is also an issue: if you promote a `large_string`, it would now produce a `binary` and not a `large_binary`.
  • `ArrowScan.to_table()` will return the schema when there is no data; both small and large are okay.
  • `DataScan.to_arrow_batch_reader()`: I think we should always update to the large type. Since this is streaming, we don't know upfront if the small buffers are big enough, therefore it is safe to go with the large ones.
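
A minimal sketch of why mixing works, assuming the data fits 32-bit offsets: `string` and `large_string` hold the same values and differ only in offset width, so they cast back and forth losslessly.

```python
import pyarrow as pa

# string uses int32 offsets, large_string uses int64 offsets; while the data
# fits 32-bit offsets the two representations round-trip without loss.
small = pa.array(["Amsterdam", "Drachten"], type=pa.string())
large = small.cast(pa.large_string())
assert large.cast(pa.string()).equals(small)
```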

Contributor Author

@Fokko Just coming back to this PR. Is there a reason why we'd want to default to `large_list`?

The difference between `list_` and `large_list` is the number of elements supported by the list. According to the `large_list` docs:

> Unless you need to represent data larger than 2**31 elements, you should prefer list_().

2**31 is 2_147_483_648; 2 billion items in a single list seems pretty rare.

I did a small experiment; this works with `list_`:

```python
import pyarrow as pa
import numpy as np

size = 2**31 - 2
pa.array([np.zeros(size, dtype=np.int8)], type=pa.list_(pa.int8()))
```

but this will crash Python and would require `large_list`:

```python
import pyarrow as pa
import numpy as np

size = 2**31 - 1
pa.array([np.zeros(size, dtype=np.int8)], type=pa.list_(pa.int8()))
```


def map(self, map_type: MapType, key_result: pa.DataType, value_result: pa.DataType) -> pa.DataType:
key_field = self.field(map_type.key_field, key_result)
@@ -675,7 +675,7 @@ def visit_timestamptz_ns(self, _: TimestamptzNanoType) -> pa.DataType:
return pa.timestamp(unit="ns", tz="UTC")

def visit_string(self, _: StringType) -> pa.DataType:
-return pa.large_string()
+return pa.string()

def visit_uuid(self, _: UUIDType) -> pa.DataType:
return pa.binary(16)
@@ -684,7 +684,7 @@ def visit_unknown(self, _: UnknownType) -> pa.DataType:
return pa.null()

def visit_binary(self, _: BinaryType) -> pa.DataType:
-return pa.large_binary()
+return pa.binary()


def _convert_scalar(value: Any, iceberg_type: IcebergType) -> pa.scalar:
@@ -1612,7 +1612,7 @@ def _table_from_scan_task(task: FileScanTask) -> pa.Table:
removed_in="0.11.0",
help_message=f"Property `{PYARROW_USE_LARGE_TYPES_ON_READ}` will be removed.",
)
-result = result.cast(arrow_schema)
+result = result.cast(_pyarrow_schema_ensure_large_types(arrow_schema))

if self._limit is not None:
return result.slice(0, self._limit)
@@ -1718,8 +1718,8 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
target_schema = schema_to_pyarrow(
promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids
)
-if self._use_large_types is False:
-    target_schema = _pyarrow_schema_ensure_small_types(target_schema)
+if self._use_large_types is True:
+    target_schema = _pyarrow_schema_ensure_large_types(target_schema)
return values.cast(target_schema)
elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
if field.field_type == TimestampType():
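
To make the override concrete, here is a rough sketch of the promotion it applies. This is an illustration only, not the real `_pyarrow_schema_ensure_large_types`, which presumably also recurses into structs and maps (omitted here).

```python
import pyarrow as pa

def ensure_large(dt: pa.DataType) -> pa.DataType:
    # Promote variable-width small types to their 64-bit-offset counterparts;
    # everything else passes through unchanged.
    if pa.types.is_string(dt):
        return pa.large_string()
    if pa.types.is_binary(dt):
        return pa.large_binary()
    if pa.types.is_list(dt):
        return pa.large_list(ensure_large(dt.value_type))
    return dt

schema = pa.schema([("city", pa.string()), ("tags", pa.list_(pa.string()))])
promoted = pa.schema([pa.field(f.name, ensure_large(f.type), nullable=f.nullable) for f in schema])
# promoted now reads: city: large_string, tags: large_list<item: large_string>
```
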
10 changes: 5 additions & 5 deletions tests/catalog/test_sql.py
@@ -404,7 +404,7 @@ def test_write_pyarrow_schema(catalog: SqlCatalog, table_identifier: Identifier)
],
schema=pa.schema(
[
-pa.field("foo", pa.large_string(), nullable=True),
+pa.field("foo", pa.string(), nullable=True),
pa.field("bar", pa.int32(), nullable=False),
pa.field("baz", pa.bool_(), nullable=True),
pa.field("large", pa.large_string(), nullable=True),
@@ -1462,7 +1462,7 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None:
{
"foo": ["a", None, "z"],
},
-schema=pa.schema([pa.field("foo", pa.large_string(), nullable=True)]),
+schema=pa.schema([pa.field("foo", pa.string(), nullable=True)]),
)

tbl = catalog.create_table(identifier=identifier, schema=pa_table.schema, properties={"format-version": str(format_version)})
@@ -1474,7 +1474,7 @@ def test_write_and_evolve(catalog: SqlCatalog, format_version: int) -> None:
},
schema=pa.schema(
[
-pa.field("foo", pa.large_string(), nullable=True),
+pa.field("foo", pa.string(), nullable=True),
pa.field("bar", pa.int32(), nullable=True),
]
),
@@ -1514,7 +1514,7 @@ def test_create_table_transaction(catalog: SqlCatalog, format_version: int) -> N
{
"foo": ["a", None, "z"],
},
-schema=pa.schema([pa.field("foo", pa.large_string(), nullable=True)]),
+schema=pa.schema([pa.field("foo", pa.string(), nullable=True)]),
)

pa_table_with_column = pa.Table.from_pydict(
@@ -1524,7 +1524,7 @@ def test_create_table_transaction(catalog: SqlCatalog, format_version: int) -> N
},
schema=pa.schema(
[
-pa.field("foo", pa.large_string(), nullable=True),
+pa.field("foo", pa.string(), nullable=True),
pa.field("bar", pa.int32(), nullable=True),
]
),
6 changes: 3 additions & 3 deletions tests/conftest.py
@@ -2510,8 +2510,8 @@ def pa_schema() -> "pa.Schema":
return pa.schema(
[
("bool", pa.bool_()),
-("string", pa.large_string()),
-("string_long", pa.large_string()),
+("string", pa.string()),
+("string_long", pa.string()),
("int", pa.int32()),
("long", pa.int64()),
("float", pa.float32()),
@@ -2525,7 +2525,7 @@ def pa_schema() -> "pa.Schema":
# ("time", pa.time64("us")),
# Not natively supported by Arrow
# ("uuid", pa.fixed(16)),
-("binary", pa.large_binary()),
+("binary", pa.binary()),
("fixed", pa.binary(16)),
]
)
15 changes: 9 additions & 6 deletions tests/integration/test_reads.py
@@ -872,9 +872,12 @@ def test_table_scan_keep_types(catalog: Catalog) -> None:


@pytest.mark.integration
+@pytest.mark.filterwarnings(
+    "ignore:Deprecated in 0.10.0, will be removed in 0.11.0. Property `pyarrow.use-large-types-on-read` will be removed.:DeprecationWarning"
+)
@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
-def test_table_scan_override_with_small_types(catalog: Catalog) -> None:
-    identifier = "default.test_table_scan_override_with_small_types"
+def test_table_scan_override_with_large_types(catalog: Catalog) -> None:
+    identifier = "default.test_table_scan_override_with_large_types"
arrow_table = pa.Table.from_arrays(
[
pa.array(["a", "b", "c"]),
@@ -900,15 +903,15 @@ def test_table_scan_override_with_small_types(catalog: Catalog) -> None:
with tbl.update_schema() as update_schema:
update_schema.update_column("string-to-binary", BinaryType())

-tbl.io.properties[PYARROW_USE_LARGE_TYPES_ON_READ] = "False"
+tbl.io.properties[PYARROW_USE_LARGE_TYPES_ON_READ] = "True"
result_table = tbl.scan().to_arrow()

expected_schema = pa.schema(
[
-pa.field("string", pa.string()),
+pa.field("string", pa.large_string()),
pa.field("string-to-binary", pa.large_binary()),
-pa.field("binary", pa.binary()),
-pa.field("list", pa.list_(pa.string())),
+pa.field("binary", pa.large_binary()),
+pa.field("list", pa.large_list(pa.large_string())),
]
)
assert result_table.schema.equals(expected_schema)
2 changes: 1 addition & 1 deletion tests/integration/test_writes/test_partitioned_writes.py
@@ -891,7 +891,7 @@ def test_unsupported_transform(

with pytest.raises(
ValueError,
-match="FeatureUnsupported => Unsupported data type for truncate transform: LargeBinary",
+match="FeatureUnsupported => Unsupported data type for truncate transform: Binary",
):
tbl.append(arrow_table_with_null)
