Skip to content

change catalog default warehouse location to not use hive-style warehouse location #2059

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions pyiceberg/catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,6 +923,20 @@ def _resolve_table_location(self, location: Optional[str], database_name: str, t
return location.rstrip("/")

def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str:
"""Return the default warehouse location using the convention of `warehousePath/databaseName/tableName`."""
database_properties = self.load_namespace_properties(database_name)
if database_location := database_properties.get(LOCATION):
database_location = database_location.rstrip("/")
return f"{database_location}/{table_name}"

if warehouse_path := self.properties.get(WAREHOUSE_LOCATION):
warehouse_path = warehouse_path.rstrip("/")
return f"{warehouse_path}/{database_name}/{table_name}"

raise ValueError("No default path is set, please specify a location when creating a table")

def _get_hive_style_warehouse_location(self, database_name: str, table_name: str) -> str:
"""Return the default warehouse location following the Hive convention of `warehousePath/databaseName.db/tableName`."""
database_properties = self.load_namespace_properties(database_name)
if database_location := database_properties.get(LOCATION):
database_location = database_location.rstrip("/")
Expand Down
4 changes: 4 additions & 0 deletions pyiceberg/catalog/dynamodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -652,6 +652,10 @@ def _convert_dynamo_table_item_to_iceberg_table(self, dynamo_table_item: Dict[st
catalog=self,
)

def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like we're repeating this a couple of times, should we move this into pyiceberg/catalog/__init__.py?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good call, done

"""Override the default warehouse location to follow Hive-style conventions."""
return self._get_hive_style_warehouse_location(database_name, table_name)


def _get_create_table_item(database_name: str, table_name: str, properties: Properties, metadata_location: str) -> Dict[str, Any]:
current_timestamp_ms = str(round(time() * 1000))
Expand Down
4 changes: 4 additions & 0 deletions pyiceberg/catalog/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,3 +808,7 @@ def view_exists(self, identifier: Union[str, Identifier]) -> bool:
@staticmethod
def __is_iceberg_table(table: TableTypeDef) -> bool:
return table.get("Parameters", {}).get(TABLE_TYPE, "").lower() == ICEBERG

def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str:
"""Override the default warehouse location to follow Hive-style conventions."""
return self._get_hive_style_warehouse_location(database_name, table_name)
4 changes: 4 additions & 0 deletions pyiceberg/catalog/hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -790,3 +790,7 @@ def update_namespace_properties(

def drop_view(self, identifier: Union[str, Identifier]) -> None:
raise NotImplementedError

def _get_default_warehouse_location(self, database_name: str, table_name: str) -> str:
"""Override the default warehouse location to follow Hive-style conventions."""
return self._get_hive_style_warehouse_location(database_name, table_name)
30 changes: 15 additions & 15 deletions tests/catalog/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,21 +72,21 @@ def catalog_name() -> str:

@pytest.fixture(name="random_table_identifier")
def fixture_random_table_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier:
os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True)
os.makedirs(f"{warehouse}/{database_name}/{table_name}/metadata/", exist_ok=True)
return database_name, table_name


@pytest.fixture(name="another_random_table_identifier")
def fixture_another_random_table_identifier(warehouse: Path, database_name: str, table_name: str) -> Identifier:
database_name = database_name + "_new"
table_name = table_name + "_new"
os.makedirs(f"{warehouse}/{database_name}.db/{table_name}/metadata/", exist_ok=True)
os.makedirs(f"{warehouse}/{database_name}/{table_name}/metadata/", exist_ok=True)
return database_name, table_name


@pytest.fixture(name="random_hierarchical_identifier")
def fixture_random_hierarchical_identifier(warehouse: Path, hierarchical_namespace_name: str, table_name: str) -> Identifier:
os.makedirs(f"{warehouse}/{hierarchical_namespace_name}.db/{table_name}/metadata/", exist_ok=True)
os.makedirs(f"{warehouse}/{hierarchical_namespace_name}/{table_name}/metadata/", exist_ok=True)
return Catalog.identifier_to_tuple(".".join((hierarchical_namespace_name, table_name)))


Expand All @@ -96,7 +96,7 @@ def fixture_another_random_hierarchical_identifier(
) -> Identifier:
hierarchical_namespace_name = hierarchical_namespace_name + "_new"
table_name = table_name + "_new"
os.makedirs(f"{warehouse}/{hierarchical_namespace_name}.db/{table_name}/metadata/", exist_ok=True)
os.makedirs(f"{warehouse}/{hierarchical_namespace_name}/{table_name}/metadata/", exist_ok=True)
return Catalog.identifier_to_tuple(".".join((hierarchical_namespace_name, table_name)))


Expand All @@ -115,7 +115,7 @@ def catalog_memory(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog,
@pytest.fixture(scope="module")
def catalog_sqlite(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]:
props = {
"uri": f"sqlite:////{warehouse}/sql-catalog.db",
"uri": f"sqlite:////{warehouse}/sql-catalog",
"warehouse": f"file://{warehouse}",
}
catalog = SqlCatalog(catalog_name, **props)
Expand All @@ -126,7 +126,7 @@ def catalog_sqlite(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog,

@pytest.fixture(scope="module")
def catalog_uri(warehouse: Path) -> str:
return f"sqlite:////{warehouse}/sql-catalog.db"
return f"sqlite:////{warehouse}/sql-catalog"


@pytest.fixture(scope="module")
Expand All @@ -137,7 +137,7 @@ def alchemy_engine(catalog_uri: str) -> Engine:
@pytest.fixture(scope="module")
def catalog_sqlite_without_rowcount(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]:
props = {
"uri": f"sqlite:////{warehouse}/sql-catalog.db",
"uri": f"sqlite:////{warehouse}/sql-catalog",
"warehouse": f"file://{warehouse}",
}
catalog = SqlCatalog(catalog_name, **props)
Expand All @@ -150,7 +150,7 @@ def catalog_sqlite_without_rowcount(catalog_name: str, warehouse: Path) -> Gener
@pytest.fixture(scope="module")
def catalog_sqlite_fsspec(catalog_name: str, warehouse: Path) -> Generator[SqlCatalog, None, None]:
props = {
"uri": f"sqlite:////{warehouse}/sql-catalog.db",
"uri": f"sqlite:////{warehouse}/sql-catalog",
"warehouse": f"file://{warehouse}",
PY_IO_IMPL: FSSPEC_FILE_IO,
}
Expand All @@ -176,7 +176,7 @@ def test_creation_with_echo_parameter(catalog_name: str, warehouse: Path) -> Non

for echo_param, expected_echo_value in test_cases:
props = {
"uri": f"sqlite:////{warehouse}/sql-catalog.db",
"uri": f"sqlite:////{warehouse}/sql-catalog",
"warehouse": f"file://{warehouse}",
}
# None is for default value
Expand All @@ -199,7 +199,7 @@ def test_creation_with_pool_pre_ping_parameter(catalog_name: str, warehouse: Pat

for pool_pre_ping_param, expected_pool_pre_ping_value in test_cases:
props = {
"uri": f"sqlite:////{warehouse}/sql-catalog.db",
"uri": f"sqlite:////{warehouse}/sql-catalog",
"warehouse": f"file://{warehouse}",
}
# None is for default value
Expand All @@ -219,7 +219,7 @@ def test_creation_from_impl(catalog_name: str, warehouse: Path) -> None:
catalog_name,
**{
"py-catalog-impl": "pyiceberg.catalog.sql.SqlCatalog",
"uri": f"sqlite:////{warehouse}/sql-catalog.db",
"uri": f"sqlite:////{warehouse}/sql-catalog",
"warehouse": f"file://{warehouse}",
},
),
Expand Down Expand Up @@ -493,7 +493,7 @@ def test_create_table_with_given_location_removes_trailing_slash(
identifier_tuple = Catalog.identifier_to_tuple(table_identifier)
namespace = Catalog.namespace_from(table_identifier)
table_name = Catalog.table_name_from(identifier_tuple)
location = f"file://{warehouse}/{catalog.name}.db/{table_name}-given"
location = f"file://{warehouse}/{catalog.name}/{table_name}-given"
catalog.create_namespace(namespace)
catalog.create_table(table_identifier, table_schema_nested, location=f"{location}/")
table = catalog.load_table(table_identifier)
Expand Down Expand Up @@ -1235,7 +1235,7 @@ def test_load_namespace_properties(catalog: SqlCatalog, namespace: str) -> None:
warehouse_location = "/test/location"
test_properties = {
"comment": "this is a test description",
"location": f"{warehouse_location}/{namespace}.db",
"location": f"{warehouse_location}/{namespace}",
"test_property1": "1",
"test_property2": "2",
"test_property3": "3",
Expand Down Expand Up @@ -1286,7 +1286,7 @@ def test_update_namespace_properties(catalog: SqlCatalog, namespace: str) -> Non
warehouse_location = "/test/location"
test_properties = {
"comment": "this is a test description",
"location": f"{warehouse_location}/{namespace}.db",
"location": f"{warehouse_location}/{namespace}",
"test_property1": "1",
"test_property2": "2",
"test_property3": "3",
Expand All @@ -1306,7 +1306,7 @@ def test_update_namespace_properties(catalog: SqlCatalog, namespace: str) -> Non
"comment": "updated test description",
"test_property4": "4",
"test_property5": "5",
"location": f"{warehouse_location}/{namespace}.db",
"location": f"{warehouse_location}/{namespace}",
}


Expand Down
4 changes: 2 additions & 2 deletions tests/cli/test_console.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,7 @@ def test_location(catalog: InMemoryCatalog) -> None:
runner = CliRunner()
result = runner.invoke(run, ["location", "default.my_table"])
assert result.exit_code == 0
assert result.output == f"""{catalog._warehouse_location}/default.db/my_table\n"""
assert result.output == f"""{catalog._warehouse_location}/default/my_table\n"""


def test_location_does_not_exists(catalog: InMemoryCatalog) -> None:
Expand Down Expand Up @@ -700,7 +700,7 @@ def test_json_location(catalog: InMemoryCatalog) -> None:
runner = CliRunner()
result = runner.invoke(run, ["--output=json", "location", "default.my_table"])
assert result.exit_code == 0
assert result.output == f'"{catalog._warehouse_location}/default.db/my_table"\n'
assert result.output == f'"{catalog._warehouse_location}/default/my_table"\n'


def test_json_location_does_not_exists(catalog: InMemoryCatalog) -> None:
Expand Down