Skip to content

Commit 6d6ebfb

Browse files
authored
BUG: fix writing fids to e.g. GPKG file with use_arrow (#511)
1 parent 07416f4 commit 6d6ebfb

File tree

3 files changed

+66
-0
lines changed

3 files changed

+66
-0
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
### Bug fixes
1010

1111
- Fix WKB writing on big-endian systems (#497).
12+
- Fix writing fids to e.g. GPKG file with use_arrow (#511).
1213

1314
### Packaging
1415

pyogrio/_io.pyx

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2741,6 +2741,18 @@ cdef create_fields_from_arrow_schema(
27412741
IF CTE_GDAL_VERSION < (3, 8, 0):
27422742
raise RuntimeError("Need GDAL>=3.8 for Arrow write support")
27432743

2744+
# Some formats store the FID explicitly in a real column, e.g. GPKG.
2745+
# For those formats, OGR_L_GetFIDColumn will return the column name used
2746+
# for this and otherwise it returns "". GDAL typically also provides a
2747+
# layer creation option to overrule the column name to be used as FID
2748+
# column. When a column with the appropriate name is present in the data,
2749+
# GDAL will automatically use it for the FID. Reference:
2750+
# https://gdal.org/en/stable/tutorials/vector_api_tut.html#writing-to-ogr-using-the-arrow-c-data-interface
2751+
# Hence, the column should not be created as an ordinary field as well.
2752+
# Doing so triggers a bug in GDAL < 3.10.1:
2753+
# https://github.com/OSGeo/gdal/issues/11527#issuecomment-2556092722
2754+
fid_column = get_string(OGR_L_GetFIDColumn(destLayer))
2755+
27442756
# The schema object is a struct type where each child is a column.
27452757
cdef ArrowSchema* child
27462758
for i in range(schema.n_children):
@@ -2752,6 +2764,17 @@ cdef create_fields_from_arrow_schema(
27522764
# Don't create property for geometry column
27532765
if get_string(child.name) == geometry_name or is_arrow_geometry_field(child):
27542766
continue
2767+
2768+
# Don't create property for column that will already be used as FID
2769+
# Note: it seems that GDAL initially uses a case-sensitive check of the
2770+
# FID column, but then falls back to case-insensitive matching via
2771+
# the "ordinary" field being added. So, the check here needs to be
2772+
# case-sensitive so the column is still added as regular column if the
2773+
# casing isn't matched, otherwise the column is simply "lost".
2774+
# Note2: in the non-arrow path, the FID column is also treated
2775+
# case-insensitive, so this is consistent with that.
2776+
if fid_column != "" and get_string(child.name) == fid_column:
2777+
continue
27552778

27562779
if not OGR_L_CreateFieldFromArrowSchema(destLayer, child, options):
27572780
exc = check_last_error()

pyogrio/tests/test_geopandas_io.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -993,6 +993,48 @@ def test_write_csv_encoding(tmp_path, encoding):
993993
assert csv_bytes == csv_pyogrio_bytes
994994

995995

996+
@pytest.mark.parametrize(
997+
"ext, fid_column, fid_param_value",
998+
[
999+
(".gpkg", "fid", None),
1000+
(".gpkg", "FID", None),
1001+
(".sqlite", "ogc_fid", None),
1002+
(".gpkg", "fid_custom", "fid_custom"),
1003+
(".gpkg", "FID_custom", "fid_custom"),
1004+
(".sqlite", "ogc_fid_custom", "ogc_fid_custom"),
1005+
],
1006+
)
1007+
@pytest.mark.requires_arrow_write_api
1008+
def test_write_custom_fids(tmp_path, ext, fid_column, fid_param_value, use_arrow):
1009+
"""Test to specify FIDs to save when writing to a file.
1010+
1011+
Saving custom FIDs is only supported for formats that actually store the FID, like
1012+
e.g. GPKG and SQLite. The fid_column name check is case-insensitive.
1013+
1014+
Typically, GDAL supports using a custom FID column for these file formats via a
1015+
`FID` layer creation option, which is also tested here. If `fid_param_value` is
1016+
specified (not None), an `fid` parameter is passed to `write_dataframe`, causing
1017+
GDAL to use the column name specified for the FID.
1018+
"""
1019+
input_gdf = gp.GeoDataFrame(
1020+
{fid_column: [5]}, geometry=[shapely.Point(0, 0)], crs="epsg:4326"
1021+
)
1022+
kwargs = {}
1023+
if fid_param_value is not None:
1024+
kwargs["fid"] = fid_param_value
1025+
path = tmp_path / f"test{ext}"
1026+
1027+
write_dataframe(input_gdf, path, use_arrow=use_arrow, **kwargs)
1028+
1029+
assert path.exists()
1030+
output_gdf = read_dataframe(path, fid_as_index=True, use_arrow=use_arrow)
1031+
output_gdf = output_gdf.reset_index()
1032+
1033+
# pyogrio always sets "fid" as index name with `fid_as_index`
1034+
expected_gdf = input_gdf.rename(columns={fid_column: "fid"})
1035+
assert_geodataframe_equal(output_gdf, expected_gdf)
1036+
1037+
9961038
@pytest.mark.parametrize("ext", ALL_EXTS)
9971039
@pytest.mark.requires_arrow_write_api
9981040
def test_write_dataframe(tmp_path, naturalearth_lowres, ext, use_arrow):

0 commit comments

Comments
 (0)