Feature/249 support geoparquet #254

Open · wants to merge 11 commits into main
1 change: 1 addition & 0 deletions docs/get_started.qmd
@@ -77,6 +77,7 @@ Above, we saved the data as a CSV, but you can choose another option depending on
- `type = "arrow"` uses `to_feather()` from pandas to create an Arrow/Feather file.
- `type = "joblib"` uses `joblib.dump()` to create a binary Python data file, such as for storing a trained model. See the [joblib docs](https://joblib.readthedocs.io/en/latest/) for more information.
- `type = "json"` uses `json.dump()` to create a JSON file. Pretty much every programming language can read JSON files, but they only work well for nested lists.
- `type = "geoparquet"` uses `to_parquet()` from [geopandas](https://github.com/geopandas/geopandas) to create a [GeoParquet](https://github.com/opengeospatial/geoparquet) file, which is a specialized Parquet format for geospatial data.

Note that when the data lives elsewhere, pins takes care of downloading and caching so that it's only re-downloaded when needed.
That said, most boards transmit pins over HTTP, and this is going to be slow and possibly unreliable for very large pins.
29 changes: 26 additions & 3 deletions pins/_adaptors.py
@@ -8,17 +8,23 @@
from typing_extensions import TypeAlias

if TYPE_CHECKING:
import geopandas as gpd
import pandas as pd

PandasDataFrame: TypeAlias = pd.DataFrame
GeoPandasGeoDataFrame: TypeAlias = gpd.GeoDataFrame
DataFrame: TypeAlias = PandasDataFrame | GeoPandasGeoDataFrame


class AbstractPandasFrame(AbstractBackend):
_backends = [("pandas", "DataFrame")]


class AbstractGeoPandasFrame(AbstractPandasFrame):
_backends = [("geopandas", "GeoDataFrame")]


AbstractDF: TypeAlias = AbstractPandasFrame | AbstractGeoPandasFrame


class Adaptor:
@@ -142,12 +148,29 @@ def write_feather(self, file: str) -> None:
self._d.to_feather(file)


class GeoPandasAdaptor(PandasAdaptor):
_d: ClassVar[GeoPandasGeoDataFrame] # type: ignore[reportIncompatibleVariableOverride]

def __init__(self, data: AbstractGeoPandasFrame) -> None:
super().__init__(data)

@property
def df_type(self) -> str:
# Overrides PandasAdaptor.df_type so pin previews report "GeoDataFrame"
return "GeoDataFrame"

def head(self, n: int) -> GeoPandasAdaptor:
return GeoPandasAdaptor(self._d.head(n))


@overload
def create_adaptor(obj: DataFrame) -> DFAdaptor: ...
@overload
def create_adaptor(obj: Any) -> Adaptor: ...
def create_adaptor(obj: Any | DataFrame) -> Adaptor | DFAdaptor:
if isinstance(obj, AbstractPandasFrame):
if isinstance(obj, AbstractGeoPandasFrame):
return GeoPandasAdaptor(obj)
elif isinstance(obj, AbstractPandasFrame):
return PandasAdaptor(obj)
elif isinstance(obj, Adaptor):
return obj
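
The order of the `isinstance` checks matters: `GeoDataFrame` subclasses `pandas.DataFrame`, so testing the pandas case first would route geo frames to the plain `PandasAdaptor`. A quick sketch of the dispatch (assumes both pandas and geopandas are installed):

```python
import geopandas as gpd
import pandas as pd

from pins._adaptors import GeoPandasAdaptor, PandasAdaptor, create_adaptor

gdf = gpd.GeoDataFrame({"geometry": gpd.points_from_xy([0], [0])})
assert isinstance(create_adaptor(gdf), GeoPandasAdaptor)  # subclass branch wins
assert isinstance(create_adaptor(pd.DataFrame({"x": [1]})), PandasAdaptor)
```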
2 changes: 1 addition & 1 deletion pins/boards.py
@@ -358,7 +358,7 @@ def pin_write(
Pin name.
type:
File type used to save `x` to disk. May be "csv", "arrow", "parquet",
"joblib", or "json".
"joblib", "json", or "geoparquet".
title:
A title for the pin; most important for shared boards so that others
can understand what the pin contains. If omitted, a brief description
14 changes: 14 additions & 0 deletions pins/drivers.py
@@ -92,6 +92,16 @@ def load_data(

return pd.read_csv(f)

elif meta.type == "geoparquet":
try:
import geopandas as gpd
except ModuleNotFoundError:
raise ModuleNotFoundError(
'The "geopandas" package is required to read "geoparquet" type files.'
) from None

return gpd.read_parquet(f)

elif meta.type == "joblib":
import joblib

@@ -139,6 +149,8 @@ def save_data(
if apply_suffix:
if pin_type == "file":
suffix = "".join(Path(obj).suffixes)
elif pin_type == "geoparquet":
suffix = ".parquet"
else:
suffix = f".{pin_type}"
else:
@@ -162,6 +174,8 @@
raise NotImplementedError(msg)
elif pin_type == "parquet":
adaptor.write_parquet(final_name)
elif pin_type == "geoparquet":
adaptor.write_parquet(final_name)
elif pin_type == "joblib":
adaptor.write_joblib(final_name)
elif pin_type == "json":
26 changes: 26 additions & 0 deletions pins/tests/test_drivers.py
@@ -3,6 +3,7 @@
from pathlib import Path

import fsspec
import geopandas as gpd
import pandas as pd
import pytest

@@ -37,6 +38,10 @@ class D:
[
(pd.DataFrame({"x": [1, 2]}), "somename: a pinned 2 x 1 DataFrame"),
(pd.DataFrame({"x": [1], "y": [2]}), "somename: a pinned 1 x 2 DataFrame"),
(
gpd.GeoDataFrame({"x": [1], "geometry": [None]}),
"somename: a pinned 1 x 2 GeoDataFrame",
),
(ExC(), "somename: a pinned ExC object"),
(ExC().D(), "somename: a pinned ExC.D object"),
([1, 2, 3], "somename: a pinned list object"),
@@ -79,6 +84,27 @@ def test_driver_roundtrip(tmp_path: Path, type_):
assert df.equals(obj)


def test_driver_geoparquet_roundtrip(tmp_path):
gdf = gpd.GeoDataFrame(
{"x": [1, 2, 3], "geometry": gpd.points_from_xy([1, 2, 3], [1, 2, 3])}
)

fname = "some_gdf"
full_file = f"{fname}.parquet"

p_obj = tmp_path / fname
res_fname = save_data(gdf, p_obj, "geoparquet")

assert Path(res_fname).name == full_file

meta = MetaRaw(full_file, "geoparquet", "my_pin")
obj = load_data(meta, fsspec.filesystem("file"), tmp_path, allow_pickle_read=True)

assert gdf.equals(obj)
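
Beyond `DataFrame.equals`, one could check that the written file really carries GeoParquet metadata; the GeoParquet spec stores it under the `geo` key of the Parquet schema metadata. A sketch (not part of this PR), assuming pyarrow is available:

```python
import pyarrow.parquet as pq

schema = pq.read_schema(res_fname)
assert b"geo" in (schema.metadata or {})  # "geo" key holds the GeoParquet spec JSON
```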


@pytest.mark.parametrize(
"type_",
[
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -43,7 +43,8 @@ check = [
"pyright==1.1.372", # Pinned; manually sync with .github/workflows/code-checks.yml
"ruff==0.5.4", # Pinned; manually sync with pre-commit-config.yaml
"types-appdirs",
"databricks-sdk"
"databricks-sdk",
"geopandas",
]
databricks = ["databricks-sdk"]
doc = [
@@ -65,6 +66,7 @@ test = [
"pytest-dotenv",
"pytest-parallel",
"s3fs",
"geopandas>=0.8.0", # At 0.8.0, the GeoParquet format was introduced.
"rdata",
"databricks-sdk",
]
17 changes: 17 additions & 0 deletions requirements/dev.txt
@@ -62,6 +62,8 @@ cachetools==5.5.2
# via google-auth
certifi==2025.4.26
# via
# pyogrio
# pyproj
# requests
# sphobjinv
cffi==1.17.1
@@ -124,6 +126,8 @@ fsspec==2025.5.1
# s3fs
gcsfs==2025.5.1
# via pins (pyproject.toml)
geopandas==1.0.1
# via pins (pyproject.toml)
google-api-core==2.25.0
# via
# google-cloud-core
@@ -247,8 +251,12 @@ nodeenv==1.9.1
numpy==2.2.6
# via
# fastparquet
# geopandas
# pandas
# pyarrow
# pyogrio
# rdata
# shapely
# xarray
oauthlib==3.2.2
# via requests-oauthlib
@@ -257,13 +265,16 @@ packaging==25.0
# black
# build
# fastparquet
# geopandas
# ipykernel
# pyogrio
# pytest
# pytest-cases
# xarray
pandas==2.2.3
# via
# fastparquet
# geopandas
# pins (pyproject.toml)
# rdata
# xarray
@@ -331,6 +342,10 @@ pyjwt==2.10.1
# via
# msal
# pyjwt
pyogrio==0.9.0
# via geopandas
pyproj==3.6.1
# via geopandas
pyproject-hooks==1.2.0
# via
# build
@@ -401,6 +416,8 @@ ruff==0.5.4
# via pins (pyproject.toml)
s3fs==2025.5.1
# via pins (pyproject.toml)
shapely==2.0.5
# via geopandas
six==1.17.0
# via
# azure-core