Skip to content

Commit 808df3d

Browse files
committed
GH-48254: [Python][Parquet] Support extension types in read_schema
1 parent 2a89d03 commit 808df3d

2 files changed

Lines changed: 43 additions & 10 deletions

File tree

python/pyarrow/parquet/core.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -265,9 +265,9 @@ class ParquetFile:
265265
page_checksum_verification : bool, default False
266266
If True, verify the checksum for each page read from the file.
267267
arrow_extensions_enabled : bool, default True
268-
If True, read Parquet logical types as Arrow extension types where possible,
269-
(e.g., read JSON as the canonical `arrow.json` extension type or UUID as
270-
the canonical `arrow.uuid` extension type).
268+
If True, read Parquet logical types as Arrow extension types where
269+
possible (e.g., read JSON as the canonical `arrow.json` extension type
270+
or UUID as the canonical `arrow.uuid` extension type).
271271
272272
Examples
273273
--------
@@ -2372,7 +2372,7 @@ def write_metadata(schema, where, metadata_collector=None, filesystem=None,
23722372

23732373

23742374
def read_metadata(where, memory_map=False, decryption_properties=None,
2375-
filesystem=None):
2375+
filesystem=None, arrow_extensions_enabled=True):
23762376
"""
23772377
Read FileMetaData from footer of a single Parquet file.
23782378
@@ -2387,6 +2387,10 @@ def read_metadata(where, memory_map=False, decryption_properties=None,
23872387
If nothing passed, will be inferred based on path.
23882388
The path is first looked up in the local on-disk filesystem; otherwise
23892389
it will be parsed as a URI to determine the filesystem.
2390+
arrow_extensions_enabled : bool, default True
2391+
If True, read Parquet logical types as Arrow extension types where
2392+
possible (e.g., UUID as the canonical `arrow.uuid` extension type).
2393+
If False, use the underlying storage types instead.
23902394
23912395
Returns
23922396
-------
@@ -2416,13 +2420,17 @@ def read_metadata(where, memory_map=False, decryption_properties=None,
24162420
file_ctx = where = filesystem.open_input_file(where)
24172421

24182422
with file_ctx:
2419-
file = ParquetFile(where, memory_map=memory_map,
2420-
decryption_properties=decryption_properties)
2423+
file = ParquetFile(
2424+
where,
2425+
memory_map=memory_map,
2426+
decryption_properties=decryption_properties,
2427+
arrow_extensions_enabled=arrow_extensions_enabled,
2428+
)
24212429
return file.metadata
24222430

24232431

24242432
def read_schema(where, memory_map=False, decryption_properties=None,
2425-
filesystem=None):
2433+
filesystem=None, arrow_extensions_enabled=True):
24262434
"""
24272435
Read effective Arrow schema from Parquet file metadata.
24282436
@@ -2437,6 +2445,9 @@ def read_schema(where, memory_map=False, decryption_properties=None,
24372445
If nothing passed, will be inferred based on path.
24382446
The path is first looked up in the local on-disk filesystem; otherwise
24392447
it will be parsed as a URI to determine the filesystem.
2448+
arrow_extensions_enabled : bool, default True
2449+
If True, read Parquet logical types as Arrow extension types where
2450+
possible (e.g., UUID as the canonical `arrow.uuid` extension type).
If False, use the underlying storage types instead.
24402451
24412452
Returns
24422453
-------
@@ -2462,9 +2473,12 @@ def read_schema(where, memory_map=False, decryption_properties=None,
24622473

24632474
with file_ctx:
24642475
file = ParquetFile(
2465-
where, memory_map=memory_map,
2466-
decryption_properties=decryption_properties)
2467-
return file.schema.to_arrow_schema()
2476+
where,
2477+
memory_map=memory_map,
2478+
decryption_properties=decryption_properties,
2479+
arrow_extensions_enabled=arrow_extensions_enabled,
2480+
)
2481+
return file.schema_arrow
24682482

24692483

24702484
__all__ = (

python/pyarrow/tests/parquet/test_metadata.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -814,3 +814,22 @@ def msg(c):
814814

815815
with pytest.raises(TypeError, match=msg("FileMetaData")):
816816
pq.FileMetaData()
817+
818+
819+
def test_read_schema_uuid_extension_type(tmp_path):
820+
data = [
821+
b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb',
822+
b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf',
823+
None,
824+
]
825+
table = pa.table([pa.array(data, type=pa.uuid())], names=["ext"])
826+
827+
file_path = tmp_path / "uuid.parquet"
828+
file_path_str = str(file_path)
829+
pq.write_table(table, file_path_str, store_schema=False)
830+
831+
schema_default = pq.read_schema(file_path_str)
832+
assert schema_default.field("ext").type == pa.uuid()
833+
834+
schema_disabled = pq.read_schema(file_path_str, arrow_extensions_enabled=False)
835+
assert schema_disabled.field("ext").type == pa.binary(16)

0 commit comments

Comments
 (0)