|
5 | 5 |
|
6 | 6 | import pandas as pd
|
7 | 7 | import pyarrow as pa
|
| 8 | +import pyarrow.fs |
8 | 9 | import pyarrow.parquet as pq
|
9 | 10 | from upath import UPath
|
10 | 11 |
|
|
13 | 14 | from ..series.utils import table_to_struct_array
|
14 | 15 | from .core import NestedFrame
|
15 | 16 |
|
| 17 | +# Use a smaller block size for FSSPEC filesystems; it usually helps with parquet reads |
| 18 | +FSSPEC_BLOCK_SIZE = 32 * 1024 |
| 19 | + |
16 | 20 |
|
17 | 21 | def read_parquet(
|
18 | 22 | data: str | UPath | bytes,
|
@@ -96,10 +100,19 @@ def read_parquet(
|
96 | 100 | # If `data` is a file-like object or a sequence, pass it directly to pyarrow
|
97 | 101 | table = pq.read_table(data, columns=columns, **kwargs)
|
98 | 102 | else:
|
99 |
| - # Otherwise, treat `data` as a file path and use UPath |
100 |
| - path = UPath(data) |
101 |
| - filesystem = kwargs.pop("filesystem", path.fs) |
102 |
| - table = pq.read_table(path.path, columns=columns, filesystem=filesystem, **kwargs) |
| 103 | + # Try creating pyarrow-native filesystem |
| 104 | + try: |
| 105 | + fs, path = pa.fs.FileSystem.from_uri(data) |
| 106 | + except (TypeError, pa.ArrowInvalid): |
| 107 | + # Otherwise, treat `data` as a URI for an fsspec-supported filesystem and use UPath |
| 108 | + upath = UPath(data) |
| 109 | + # Use smaller block size for better performance |
| 110 | + if upath.protocol in ("http", "https"): |
| 111 | + upath = UPath(upath, block_size=FSSPEC_BLOCK_SIZE) |
| 112 | + path = upath.path |
| 113 | + fs = upath.fs |
| 114 | + filesystem = kwargs.pop("filesystem", fs) |
| 115 | + table = pq.read_table(path, columns=columns, filesystem=filesystem, **kwargs) |
103 | 116 |
|
104 | 117 | # Resolve partial loading of nested structures
|
105 | 118 | # Using pyarrow to avoid naming conflicts from partial loading ("flux" vs "lc.flux")
|
|
0 commit comments