From 83a17742c8c92e99b922848b874fb757c90f025a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Sep 2025 23:54:10 +0000 Subject: [PATCH 1/4] Initial plan From b158e905dcb7d42d070074bd9bb426fc697d1793 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 24 Sep 2025 00:09:16 +0000 Subject: [PATCH 2/4] Add fsspec optimization support via open_file_options parameter Co-authored-by: gitosaurus <6794831+gitosaurus@users.noreply.github.com> --- src/nested_pandas/nestedframe/io.py | 70 ++++++++++++++++++++-- tests/nested_pandas/nestedframe/test_io.py | 59 ++++++++++++++++++ 2 files changed, 124 insertions(+), 5 deletions(-) diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index 23343101..b21f1a6c 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -49,7 +49,9 @@ def read_parquet( autocast_list: bool, default=True If True, automatically cast list columns to nested columns with NestedDType. kwargs: dict - Keyword arguments passed to `pyarrow.parquet.read_table` + Keyword arguments passed to `pyarrow.parquet.read_table`. Special handling: + - `open_file_options`: dict of options for fsspec filesystem optimization. + Commonly used for remote storage performance (e.g., {"precache_options": {"method": "parquet"}}). Returns ------- @@ -93,13 +95,21 @@ def read_parquet( elif isinstance(reject_nesting, str): reject_nesting = [reject_nesting] + # Extract open_file_options if present, as they need special handling + open_file_options = kwargs.pop("open_file_options", None) + # First load through pyarrow # If `filesystem` is specified - use it if kwargs.get("filesystem") is not None: + # Apply open_file_options to the existing filesystem if provided + if open_file_options is not None: + filesystem = kwargs["filesystem"] + filesystem = _apply_open_file_options_to_filesystem(filesystem, open_file_options, data) + kwargs["filesystem"] = filesystem table = pq.read_table(data, columns=columns, **kwargs) # Otherwise convert with a special function else: - data, filesystem = _transform_read_parquet_data_arg(data) + data, filesystem = _transform_read_parquet_data_arg(data, open_file_options) table = pq.read_table(data, filesystem=filesystem, columns=columns, **kwargs) # Resolve partial loading of nested structures @@ -160,14 +170,53 @@ def read_parquet( return from_pyarrow(table, reject_nesting=reject_nesting, autocast_list=autocast_list) -def _transform_read_parquet_data_arg(data): +def _apply_open_file_options_to_filesystem(filesystem, open_file_options, data_path): + """Apply open_file_options to an existing filesystem by creating a new enhanced filesystem. + + Parameters + ---------- + filesystem : pyarrow.fs.FileSystem or fsspec filesystem + The existing filesystem object + open_file_options : dict + Options to apply to the filesystem (e.g., precache_options) + data_path : str + The data path, used to determine the appropriate filesystem type + + Returns + ------- + Enhanced filesystem with the new options applied + """ + # For PyArrow filesystems, we need to convert to fsspec approach + if hasattr(filesystem, 'type_name'): # PyArrow filesystem + # Convert back to UPath approach for consistency + try: + from upath import UPath + upath = UPath(data_path, **open_file_options) + return upath.fs + except Exception: + # If conversion fails, return original filesystem + return filesystem + + # For fsspec filesystems, try to create a new one with combined options + try: + existing_options = getattr(filesystem, 'storage_options', {}) + combined_options = {**existing_options, **open_file_options} + # Create new filesystem of the same type with enhanced options + filesystem_class = type(filesystem) + return filesystem_class(**combined_options) + except Exception: + # If enhancement fails, return original filesystem + return filesystem + + +def _transform_read_parquet_data_arg(data, open_file_options=None): """Transform `data` argument of read_parquet to pq.read_parquet's `source` and `filesystem`""" # Check if a list, run the function recursively and check that filesystems are all the same if isinstance(data, list): paths = [] first_fs = None for i, d in enumerate(data): - path, fs = _transform_read_parquet_data_arg(d) + path, fs = _transform_read_parquet_data_arg(d, open_file_options) paths.append(path) if i == 0: first_fs = fs @@ -181,6 +230,11 @@ def _transform_read_parquet_data_arg(data): return data, None # Check if `data` is a UPath and use it if isinstance(data, UPath): + if open_file_options is not None: + # Combine existing UPath options with new open_file_options + combined_options = {**data.storage_options, **open_file_options} + enhanced_upath = UPath(data, **combined_options) + return enhanced_upath.path, enhanced_upath.fs return data.path, data.fs # Check if `data` is a Path (Path is a superclass for UPath, so this order of checks) if isinstance(data, Path): @@ -206,7 +260,13 @@ def _transform_read_parquet_data_arg(data): return upath.path, None # If HTTP, change the default UPath object to use a smaller block size if upath.protocol in ("http", "https"): - upath = UPath(upath, block_size=FSSPEC_BLOCK_SIZE) + base_options = {"block_size": FSSPEC_BLOCK_SIZE} + if open_file_options is not None: + base_options.update(open_file_options) + upath = UPath(upath, **base_options) + elif open_file_options is not None: + # For non-HTTP protocols, apply open_file_options if provided + upath = UPath(upath, **open_file_options) return upath.path, upath.fs diff --git a/tests/nested_pandas/nestedframe/test_io.py b/tests/nested_pandas/nestedframe/test_io.py index 24b56103..e1a275ba 100644 --- a/tests/nested_pandas/nestedframe/test_io.py +++ b/tests/nested_pandas/nestedframe/test_io.py @@ -399,3 +399,62 @@ def test__transform_read_parquet_data_arg(): "https://data.lsdb.io/hats/gaia_dr3/gaia/dataset/Norder=2/Dir=0/Npix=0.parquet", ] ) + + +def test_read_parquet_with_open_file_options(): + """Test that read_parquet accepts and handles open_file_options correctly.""" + # Test with local file + local_path = "tests/test_data/nested.parquet" + + # Test basic open_file_options acceptance + open_file_options = {"precache_options": {"method": "parquet"}} + nf1 = read_parquet(local_path, open_file_options=open_file_options) + + # Should work identically to version without options for local files + nf2 = read_parquet(local_path) + + # Data should be the same + assert len(nf1) == len(nf2) + assert list(nf1.columns) == list(nf2.columns) + assert nf1.nested_columns == nf2.nested_columns + + # Test with additional kwargs + nf3 = read_parquet( + local_path, + columns=["a", "nested.flux"], + open_file_options={"precache_options": {"method": "parquet"}}, + use_threads=True + ) + + assert len(nf3) == len(nf1) + assert "a" in nf3.columns + assert "nested" in nf3.columns + + +def test_transform_read_parquet_data_arg_with_open_file_options(): + """Test _transform_read_parquet_data_arg handles open_file_options.""" + # Test backward compatibility + local_path = "tests/test_data/nested.parquet" + + # Single argument (original) + path1, fs1 = _transform_read_parquet_data_arg(local_path) + + # Two arguments with None + path2, fs2 = _transform_read_parquet_data_arg(local_path, None) + + assert path1 == path2 == local_path + assert fs1 == fs2 is None + + # With UPath and options + local_upath = UPath(local_path) + path3, fs3 = _transform_read_parquet_data_arg(local_upath) + + open_file_options = {"precache_options": {"method": "parquet"}, "block_size": 1024} + path4, fs4 = _transform_read_parquet_data_arg(local_upath, open_file_options) + + assert path3 == path4 == local_path + # fs4 should have the additional options + assert hasattr(fs4, 'storage_options') + assert 'precache_options' in fs4.storage_options + assert fs4.storage_options['precache_options']['method'] == 'parquet' + assert fs4.storage_options['block_size'] == 1024 From 0e1eb746d063b5b8b6e8785155323e61867c032f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Sep 2025 18:28:04 +0000 Subject: [PATCH 3/4] Replace open_file_options approach with fsspec.parquet optimization Co-authored-by: gitosaurus <6794831+gitosaurus@users.noreply.github.com> --- benchmarks/benchmarks.py | 23 +++ src/nested_pandas/nestedframe/io.py | 164 +++++++++++++-------- tests/nested_pandas/nestedframe/test_io.py | 57 +++---- 3 files changed, 155 insertions(+), 89 deletions(-) diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index 747e9eb6..575f40b2 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -254,3 +254,26 @@ def time_run(self): def peakmem_run(self): """Benchmark the memory usage of read_parquet(self.path, columns=self.columns)""" self.run() + + +class ReadFewColumnsHTTPSWithOptimization: + """Benchmark read_parquet("https://", columns=[...]) with fsspec optimization""" + + path = "https://data.lsdb.io/hats/gaia_dr3/gaia/dataset/Norder=2/Dir=0/Npix=0.parquet" + columns = ["_healpix_29", "ra", "astrometric_primary_flag"] + + def run(self): + """Run the benchmark with fsspec optimization.""" + _ = read_parquet( + self.path, + columns=self.columns, + open_file_options={"precache_options": {"method": "parquet"}} + ) + + def time_run(self): + """Benchmark the runtime with fsspec optimization""" + self.run() + + def peakmem_run(self): + """Benchmark the memory usage with fsspec optimization""" + self.run() diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index b21f1a6c..89dadf91 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -51,7 +51,9 @@ def read_parquet( kwargs: dict Keyword arguments passed to `pyarrow.parquet.read_table`. Special handling: - `open_file_options`: dict of options for fsspec filesystem optimization. - Commonly used for remote storage performance (e.g., {"precache_options": {"method": "parquet"}}). + For remote storage (S3, GCS, HTTP), these options can improve performance. + Common example: {"precache_options": {"method": "parquet"}} enables + intelligent precaching for better remote access performance. Returns ------- @@ -95,21 +97,19 @@ def read_parquet( elif isinstance(reject_nesting, str): reject_nesting = [reject_nesting] - # Extract open_file_options if present, as they need special handling + # Check if optimization options are provided for remote storage open_file_options = kwargs.pop("open_file_options", None) # First load through pyarrow + # If optimization options are provided and data is remote, use fsspec.parquet for better performance + if open_file_options is not None and _should_use_fsspec_optimization(data, kwargs.get("filesystem")): + table = _read_with_fsspec_optimization(data, open_file_options, columns, kwargs) # If `filesystem` is specified - use it - if kwargs.get("filesystem") is not None: - # Apply open_file_options to the existing filesystem if provided - if open_file_options is not None: - filesystem = kwargs["filesystem"] - filesystem = _apply_open_file_options_to_filesystem(filesystem, open_file_options, data) - kwargs["filesystem"] = filesystem + elif kwargs.get("filesystem") is not None: table = pq.read_table(data, columns=columns, **kwargs) # Otherwise convert with a special function else: - data, filesystem = _transform_read_parquet_data_arg(data, open_file_options) + data, filesystem = _transform_read_parquet_data_arg(data) table = pq.read_table(data, filesystem=filesystem, columns=columns, **kwargs) # Resolve partial loading of nested structures @@ -170,53 +170,14 @@ def read_parquet( return from_pyarrow(table, reject_nesting=reject_nesting, autocast_list=autocast_list) -def _apply_open_file_options_to_filesystem(filesystem, open_file_options, data_path): - """Apply open_file_options to an existing filesystem by creating a new enhanced filesystem. - - Parameters - ---------- - filesystem : pyarrow.fs.FileSystem or fsspec filesystem - The existing filesystem object - open_file_options : dict - Options to apply to the filesystem (e.g., precache_options) - data_path : str - The data path, used to determine the appropriate filesystem type - - Returns - ------- - Enhanced filesystem with the new options applied - """ - # For PyArrow filesystems, we need to convert to fsspec approach - if hasattr(filesystem, 'type_name'): # PyArrow filesystem - # Convert back to UPath approach for consistency - try: - from upath import UPath - upath = UPath(data_path, **open_file_options) - return upath.fs - except Exception: - # If conversion fails, return original filesystem - return filesystem - - # For fsspec filesystems, try to create a new one with combined options - try: - existing_options = getattr(filesystem, 'storage_options', {}) - combined_options = {**existing_options, **open_file_options} - # Create new filesystem of the same type with enhanced options - filesystem_class = type(filesystem) - return filesystem_class(**combined_options) - except Exception: - # If enhancement fails, return original filesystem - return filesystem - - -def _transform_read_parquet_data_arg(data, open_file_options=None): +def _transform_read_parquet_data_arg(data): """Transform `data` argument of read_parquet to pq.read_parquet's `source` and `filesystem`""" # Check if a list, run the function recursively and check that filesystems are all the same if isinstance(data, list): paths = [] first_fs = None for i, d in enumerate(data): - path, fs = _transform_read_parquet_data_arg(d, open_file_options) + path, fs = _transform_read_parquet_data_arg(d) paths.append(path) if i == 0: first_fs = fs @@ -230,11 +191,6 @@ def _transform_read_parquet_data_arg(data, open_file_options=None): return data, None # Check if `data` is a UPath and use it if isinstance(data, UPath): - if open_file_options is not None: - # Combine existing UPath options with new open_file_options - combined_options = {**data.storage_options, **open_file_options} - enhanced_upath = UPath(data, **combined_options) - return enhanced_upath.path, enhanced_upath.fs return data.path, data.fs # Check if `data` is a Path (Path is a superclass for UPath, so this order of checks) if isinstance(data, Path): @@ -260,13 +216,7 @@ def _transform_read_parquet_data_arg(data, open_file_options=None): return upath.path, None # If HTTP, change the default UPath object to use a smaller block size if upath.protocol in ("http", "https"): - base_options = {"block_size": FSSPEC_BLOCK_SIZE} - if open_file_options is not None: - base_options.update(open_file_options) - upath = UPath(upath, **base_options) - elif open_file_options is not None: - # For non-HTTP protocols, apply open_file_options if provided - upath = UPath(upath, **open_file_options) + upath = UPath(upath, block_size=FSSPEC_BLOCK_SIZE) return upath.path, upath.fs @@ -351,3 +301,93 @@ def _cast_list_cols_to_nested(df): if pa.types.is_list(dtype.pyarrow_dtype): df[col] = pack_lists(df[[col]]) return df + + +def _should_use_fsspec_optimization(data, explicit_filesystem): + """Determine if fsspec optimization should be used. + + Parameters + ---------- + data : str, Path, UPath, or file-like object + The data source + explicit_filesystem : filesystem or None + Explicitly provided filesystem + + Returns + ------- + bool + True if fsspec optimization should be used for this data source + """ + # Don't use optimization if explicit filesystem is provided + if explicit_filesystem is not None: + return False + + # Don't use for file-like objects + if hasattr(data, "read"): + return False + + # For UPath objects, check if they're remote (check before Path since UPath inherits from Path) + if isinstance(data, UPath): + return data.protocol not in ("", "file") + + # Don't use for Path objects (local files) + if isinstance(data, Path): + return False + + # For strings, check if they look like remote URLs + if isinstance(data, str): + return data.startswith(("http://", "https://", "s3://", "gs://", "gcs://", "azure://", "adl://")) + + return False + + +def _read_with_fsspec_optimization(data, open_file_options, columns, kwargs): + """Read parquet using fsspec optimization for better remote storage performance. + + Parameters + ---------- + data : str, UPath, or path-like + Path to the parquet file + open_file_options : dict + Options for fsspec filesystem optimization (e.g., precache_options) + columns : list or None + Columns to read + kwargs : dict + Additional kwargs for reading + + Returns + ------- + pyarrow.Table + The loaded table + """ + try: + import fsspec.parquet + except ImportError: + # Fall back to regular method if fsspec.parquet not available + data_converted, filesystem = _transform_read_parquet_data_arg(data) + return pq.read_table(data_converted, filesystem=filesystem, columns=columns, **kwargs) + + # Convert UPath to string if needed + if isinstance(data, UPath): + path_str = str(data) + # Combine UPath storage options with open_file_options + storage_options = {**data.storage_options, **open_file_options} + else: + path_str = str(data) + storage_options = open_file_options + + # Use fsspec.parquet.open_parquet_file for optimized access + try: + with fsspec.parquet.open_parquet_file( + path_str, + columns=columns, + storage_options=storage_options, + engine="pyarrow" + ) as parquet_file: + # Read the table using PyArrow with the optimized file handle + table = pq.read_table(parquet_file, columns=columns, **kwargs) + return table + except Exception: + # Fall back to regular method if optimization fails + data_converted, filesystem = _transform_read_parquet_data_arg(data) + return pq.read_table(data_converted, filesystem=filesystem, columns=columns, **kwargs) diff --git a/tests/nested_pandas/nestedframe/test_io.py b/tests/nested_pandas/nestedframe/test_io.py index e1a275ba..adbee84e 100644 --- a/tests/nested_pandas/nestedframe/test_io.py +++ b/tests/nested_pandas/nestedframe/test_io.py @@ -401,9 +401,9 @@ def test__transform_read_parquet_data_arg(): ) -def test_read_parquet_with_open_file_options(): - """Test that read_parquet accepts and handles open_file_options correctly.""" - # Test with local file +def test_read_parquet_with_fsspec_optimization(): + """Test that read_parquet handles open_file_options using fsspec optimization.""" + # Test with local file (should not use fsspec optimization) local_path = "tests/test_data/nested.parquet" # Test basic open_file_options acceptance @@ -431,30 +431,33 @@ def test_read_parquet_with_open_file_options(): assert "nested" in nf3.columns -def test_transform_read_parquet_data_arg_with_open_file_options(): - """Test _transform_read_parquet_data_arg handles open_file_options.""" - # Test backward compatibility - local_path = "tests/test_data/nested.parquet" - - # Single argument (original) - path1, fs1 = _transform_read_parquet_data_arg(local_path) - - # Two arguments with None - path2, fs2 = _transform_read_parquet_data_arg(local_path, None) +def test_fsspec_optimization_path_detection(): + """Test _should_use_fsspec_optimization correctly identifies remote paths.""" + from nested_pandas.nestedframe.io import _should_use_fsspec_optimization + from pathlib import Path - assert path1 == path2 == local_path - assert fs1 == fs2 is None - - # With UPath and options - local_upath = UPath(local_path) - path3, fs3 = _transform_read_parquet_data_arg(local_upath) + # Test cases that should NOT use optimization + assert not _should_use_fsspec_optimization("local_file.parquet", None) + assert not _should_use_fsspec_optimization("/path/to/file.parquet", None) + assert not _should_use_fsspec_optimization(Path("local_file.parquet"), None) + assert not _should_use_fsspec_optimization(UPath("local_file.parquet"), None) + assert not _should_use_fsspec_optimization("https://example.com/file.parquet", "some_filesystem") - open_file_options = {"precache_options": {"method": "parquet"}, "block_size": 1024} - path4, fs4 = _transform_read_parquet_data_arg(local_upath, open_file_options) + # Test file-like object + import io + assert not _should_use_fsspec_optimization(io.BytesIO(b"test"), None) - assert path3 == path4 == local_path - # fs4 should have the additional options - assert hasattr(fs4, 'storage_options') - assert 'precache_options' in fs4.storage_options - assert fs4.storage_options['precache_options']['method'] == 'parquet' - assert fs4.storage_options['block_size'] == 1024 + # Test cases that SHOULD use optimization + assert _should_use_fsspec_optimization("https://example.com/file.parquet", None) + assert _should_use_fsspec_optimization("s3://bucket/file.parquet", None) + assert _should_use_fsspec_optimization("gs://bucket/file.parquet", None) + assert _should_use_fsspec_optimization(UPath("https://example.com/file.parquet"), None) + assert _should_use_fsspec_optimization(UPath("s3://bucket/file.parquet"), None) + + +def test_docstring_includes_fsspec_options(): + """Test that the docstring mentions the new fsspec optimization options.""" + docstring = read_parquet.__doc__ + assert "open_file_options" in docstring + assert "precache_options" in docstring + assert "fsspec" in docstring From 17960f87972058d09090f2d0aeb17571df6df6bb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 7 Oct 2025 22:23:35 +0000 Subject: [PATCH 4/4] Remove open_file_options parameter and make fsspec optimization automatic for remote resources Co-authored-by: gitosaurus <6794831+gitosaurus@users.noreply.github.com> --- benchmarks/benchmarks.py | 19 ++++++------- src/nested_pandas/nestedframe/io.py | 30 +++++++++------------ tests/nested_pandas/nestedframe/test_io.py | 31 +++++++--------------- 3 files changed, 33 insertions(+), 47 deletions(-) diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index 575f40b2..9e0ef0d7 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -257,23 +257,24 @@ def peakmem_run(self): class ReadFewColumnsHTTPSWithOptimization: - """Benchmark read_parquet("https://", columns=[...]) with fsspec optimization""" + """Benchmark read_parquet("https://", columns=[...]) + + Note: fsspec optimization is now automatic for remote URLs, + so this benchmark is equivalent to ReadFewColumnsHTTPS. + Kept for historical comparison purposes. + """ path = "https://data.lsdb.io/hats/gaia_dr3/gaia/dataset/Norder=2/Dir=0/Npix=0.parquet" columns = ["_healpix_29", "ra", "astrometric_primary_flag"] def run(self): - """Run the benchmark with fsspec optimization.""" - _ = read_parquet( - self.path, - columns=self.columns, - open_file_options={"precache_options": {"method": "parquet"}} - ) + """Run the benchmark (fsspec optimization is automatic for remote URLs).""" + _ = read_parquet(self.path, columns=self.columns) def time_run(self): - """Benchmark the runtime with fsspec optimization""" + """Benchmark the runtime with automatic fsspec optimization""" self.run() def peakmem_run(self): - """Benchmark the memory usage with fsspec optimization""" + """Benchmark the memory usage with automatic fsspec optimization""" self.run() diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index 89dadf91..2c6a2790 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -49,11 +49,7 @@ def read_parquet( autocast_list: bool, default=True If True, automatically cast list columns to nested columns with NestedDType. kwargs: dict - Keyword arguments passed to `pyarrow.parquet.read_table`. Special handling: - - `open_file_options`: dict of options for fsspec filesystem optimization. - For remote storage (S3, GCS, HTTP), these options can improve performance. - Common example: {"precache_options": {"method": "parquet"}} enables - intelligent precaching for better remote access performance. + Keyword arguments passed to `pyarrow.parquet.read_table` Returns ------- @@ -61,6 +57,11 @@ def read_parquet( Notes ----- + For remote storage (S3, GCS, HTTP/HTTPS), this function automatically uses + fsspec.parquet.open_parquet_file for optimized access with intelligent + precaching, which can significantly improve performance compared to standard + PyArrow reading. + pyarrow supports partial loading of nested structures from parquet, for example ```pd.read_parquet("data.parquet", columns=["nested.a"])``` will load the "a" column of the "nested" column. Standard pandas/pyarrow @@ -97,13 +98,10 @@ def read_parquet( elif isinstance(reject_nesting, str): reject_nesting = [reject_nesting] - # Check if optimization options are provided for remote storage - open_file_options = kwargs.pop("open_file_options", None) - # First load through pyarrow - # If optimization options are provided and data is remote, use fsspec.parquet for better performance - if open_file_options is not None and _should_use_fsspec_optimization(data, kwargs.get("filesystem")): - table = _read_with_fsspec_optimization(data, open_file_options, columns, kwargs) + # If data is remote, use fsspec.parquet for better performance + if _should_use_fsspec_optimization(data, kwargs.get("filesystem")): + table = _read_with_fsspec_optimization(data, columns, kwargs) # If `filesystem` is specified - use it elif kwargs.get("filesystem") is not None: table = pq.read_table(data, columns=columns, **kwargs) @@ -341,15 +339,13 @@ def _should_use_fsspec_optimization(data, explicit_filesystem): return False -def _read_with_fsspec_optimization(data, open_file_options, columns, kwargs): +def _read_with_fsspec_optimization(data, columns, kwargs): """Read parquet using fsspec optimization for better remote storage performance. Parameters ---------- data : str, UPath, or path-like Path to the parquet file - open_file_options : dict - Options for fsspec filesystem optimization (e.g., precache_options) columns : list or None Columns to read kwargs : dict @@ -370,11 +366,11 @@ def _read_with_fsspec_optimization(data, open_file_options, columns, kwargs): # Convert UPath to string if needed if isinstance(data, UPath): path_str = str(data) - # Combine UPath storage options with open_file_options - storage_options = {**data.storage_options, **open_file_options} + # Use UPath storage options for fsspec + storage_options = data.storage_options if data.storage_options else None else: path_str = str(data) - storage_options = open_file_options + storage_options = None # Use fsspec.parquet.open_parquet_file for optimized access try: diff --git a/tests/nested_pandas/nestedframe/test_io.py b/tests/nested_pandas/nestedframe/test_io.py index adbee84e..94fefef4 100644 --- a/tests/nested_pandas/nestedframe/test_io.py +++ b/tests/nested_pandas/nestedframe/test_io.py @@ -402,33 +402,23 @@ def test__transform_read_parquet_data_arg(): def test_read_parquet_with_fsspec_optimization(): - """Test that read_parquet handles open_file_options using fsspec optimization.""" + """Test that read_parquet automatically uses fsspec optimization for remote files.""" # Test with local file (should not use fsspec optimization) local_path = "tests/test_data/nested.parquet" - # Test basic open_file_options acceptance - open_file_options = {"precache_options": {"method": "parquet"}} - nf1 = read_parquet(local_path, open_file_options=open_file_options) - - # Should work identically to version without options for local files - nf2 = read_parquet(local_path) - - # Data should be the same - assert len(nf1) == len(nf2) - assert list(nf1.columns) == list(nf2.columns) - assert nf1.nested_columns == nf2.nested_columns + # Test basic reading - local files should work as before + nf1 = read_parquet(local_path) # Test with additional kwargs - nf3 = read_parquet( + nf2 = read_parquet( local_path, columns=["a", "nested.flux"], - open_file_options={"precache_options": {"method": "parquet"}}, use_threads=True ) - assert len(nf3) == len(nf1) - assert "a" in nf3.columns - assert "nested" in nf3.columns + assert len(nf2) <= len(nf1) # filtered columns + assert "a" in nf2.columns + assert "nested" in nf2.columns def test_fsspec_optimization_path_detection(): @@ -455,9 +445,8 @@ def test_fsspec_optimization_path_detection(): assert _should_use_fsspec_optimization(UPath("s3://bucket/file.parquet"), None) -def test_docstring_includes_fsspec_options(): - """Test that the docstring mentions the new fsspec optimization options.""" +def test_docstring_includes_fsspec_notes(): + """Test that the docstring mentions the automatic fsspec optimization.""" docstring = read_parquet.__doc__ - assert "open_file_options" in docstring - assert "precache_options" in docstring assert "fsspec" in docstring + assert "remote" in docstring.lower()