Skip to content

Experiment: Auto-generated configuration reference #2743

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: devel
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions dlt/common/configuration/specs/base_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ def configspec(
) -> Callable[[Type[TAnyClass]], Type[TAnyClass]]: ...


KNOWN_CONFIG_SPEC_CLASSES = set()

@dataclass_transform(eq_default=False, field_specifiers=(dataclasses.Field, dataclasses.field))
def configspec(
cls: Optional[Type[Any]] = None, init: bool = True
Expand All @@ -187,6 +189,8 @@ def configspec(
"""

def wrap(cls: Type[TAnyClass]) -> Type[TAnyClass]:
global KNOWN_CONFIG_SPEC_CLASSES
KNOWN_CONFIG_SPEC_CLASSES.add(cls)
cls.__hint_resolvers__ = {} # type: ignore[attr-defined]
is_context = issubclass(cls, _F_ContainerInjectableContext)
# if type does not derive from BaseConfiguration then derive it
Expand Down
6 changes: 5 additions & 1 deletion dlt/common/destination/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,10 +147,14 @@ def from_normalized_mapping(
class DestinationClientConfiguration(BaseConfiguration):
destination_type: Annotated[str, NotResolved()] = dataclasses.field(
default=None, init=False, repr=False, compare=False
) # which destination to load data to
)
"""Type of this destination, e.g. `postgres` or `duckdb`"""
credentials: Optional[CredentialsConfiguration] = None
"""Credentials for this destination"""
destination_name: Optional[str] = None # name of the destination
"""Name of the destination, e.g. `my_postgres` or `my_duckdb`, will be the same as destination_type if not set"""
environment: Optional[str] = None
"""Environment of the destination, e.g. `dev` or `prod`"""

def fingerprint(self) -> str:
"""Returns a destination fingerprint which is a hash of selected configuration fields. ie. host in case of connection string"""
Expand Down
2 changes: 2 additions & 0 deletions dlt/destinations/impl/athena/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

@configspec
class AthenaClientConfiguration(DestinationClientDwhWithStagingConfiguration):
"""Configuration for the Athena destination"""

destination_type: Final[str] = dataclasses.field(default="athena", init=False, repr=False, compare=False) # type: ignore[misc]
query_result_bucket: str = None
credentials: AwsCredentials = None
Expand Down
2 changes: 1 addition & 1 deletion dlt/destinations/impl/clickhouse/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class ClickHouseClientConfiguration(DestinationClientDwhWithStagingConfiguration
dataset_table_separator: str = "___"
"""Separator for dataset table names, defaults to '___', i.e. 'database.dataset___table'."""
table_engine_type: Optional[TTableEngineType] = "merge_tree"
"""The default table engine to use. Defaults to 'merge_tree'. Other implemented options are 'shared_merge_tree' and 'replicated_merge_tree'."""
"""The default table engine to use. Defaults to `merge_tree`. Other implemented options are `shared_merge_tree` and `replicated_merge_tree`."""
dataset_sentinel_table_name: str = "dlt_sentinel_table"
"""Special table to mark dataset as existing"""
staging_use_https: bool = True
Expand Down
1,204 changes: 1,204 additions & 0 deletions docs/website/docs/reference/configuration.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions docs/website/sidebars.js
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,7 @@ const sidebars = {
items: [
'reference/command-line-interface',
'reference/telemetry',
'reference/configuration',
'dlt-ecosystem/staging',
{
type: 'category',
Expand Down
201 changes: 201 additions & 0 deletions tools/update_config_reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
# NOTE: experimental tool to update the config docs
from __future__ import annotations

import ast
import dataclasses
import inspect
import textwrap
import types
from typing import Annotated, Any, Final, Literal, Union, get_args, get_origin, get_type_hints

import dlt
from dlt.common.configuration.specs.base_configuration import KNOWN_CONFIG_SPEC_CLASSES, BaseConfiguration
from dlt.common.destination.reference import Destination


# Destination of the generated markdown reference, relative to the repo root
OUTPUT_FILE = "docs/website/docs/reference/configuration.md"

OUTPUT_HEADER = """---
title: Configuration Reference
description: Reference of all configuration options available in dlt
keywords: [configuration, reference]
---

# Configuration Reference

This page contains a reference of most configuration options and objects available in DLT.

"""

# Spec classes that should not appear in the generated reference.
# NOTE: a missing comma between entries silently merges two adjacent string
# literals into one bogus name, so neither class gets excluded.
EXCLUDED_CLASSES = [
    "ExtractorConfiguration",
    "PipeIteratorConfiguration",
    "BufferedDataWriterConfiguration",
    "PipelineConfiguration",
]

# List of logical configuration groups, grouped by common base classes.
# Groups are rendered in order; each spec lands in the first group whose
# base class appears in its MRO, so the catch-all group must come last.
CONFIG_GROUPS = [
    {
        "name": "Destination Configurations",
        "base_classes": [
            "DestinationClientConfiguration",
        ],
    },
    {
        "name": "Credential Configurations",
        "base_classes": [
            "CredentialsConfiguration",
        ],
    },
    {
        "name": "All other Configurations",
        "base_classes": [
            "BaseConfiguration",
        ],
    },
]

def clean_type(tp: Any) -> str:

# "primitive" types
if tp in {int, float, str, bool, complex, bytes, type}:
return tp.__name__

# unwrap final, annotated, union, literal
origin = get_origin(tp)
args = get_args(tp)

if origin in {Final, Annotated}:
return clean_type(args[0])

if origin in {Union, Literal}:
non_none_args = [arg for arg in args if arg is not type(None)]
if len(non_none_args) == 1:
return clean_type(non_none_args[0]) # Strip Optional
return " | ".join(clean_type(arg) for arg in non_none_args)

# convert classes to links
if isinstance(tp, type) and issubclass(tp, BaseConfiguration):
return f"[{tp.__name__}](#{tp.__name__.lower()})"

# fallback to string
return str(tp)

def extract_config_properties(cls):
# Get source code of the class
source = inspect.getsource(cls)
tree = ast.parse(source)

# Prepare field type hints
type_hints = get_type_hints(cls)
result = {}

# Track docstrings found after each field definition
doc_map = {}

lines = source.splitlines()

for node in ast.walk(tree):
if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
field_name = node.target.id
doc = None

# Try to find docstring immediately after the field
next_line_index = node.end_lineno
if next_line_index and next_line_index < len(lines):
next_line = lines[next_line_index].strip()
if next_line.startswith('"""') or next_line.startswith("'''"):
doc = next_line.strip('"""').strip("'''").strip()

doc_map[field_name] = doc

# Combine with dataclass field info
for field in dataclasses.fields(cls):
if field.name.startswith("_"):
continue
doc = doc_map.get(field.name) or ""
doc = doc.replace("<", "").replace(">", "")

type = type_hints.get(field.name, 'Unknown')
type = clean_type(type)
type = str(type).replace("<", "").replace(">", "")
result[field.name] = {
"name": field.name,
"type": type,
"doc": doc
}

return result


def extract_config_info(cls: type) -> dict:
    """Extracts information about a config spec class.

    Returns a dict with name, module, description, MRO base names and field
    properties, or None when the class source cannot be loaded (e.g. for
    dynamically generated classes).
    """
    try:
        # dataclasses without an explicit docstring get an auto-generated one
        # that starts with the class name (its signature) — treat that as "no doc".
        # cls.__doc__ may also be None, so guard before calling startswith.
        doc = cls.__doc__
        description = doc if doc and not doc.startswith(cls.__name__) else None
        return {
            "name": cls.__name__,
            "module": cls.__module__,
            "description": description,
            "bases": [b.__name__ for b in cls.__mro__],
            "properties": extract_config_properties(cls),
        }
    except OSError as e:
        # inspect.getsource raises OSError when the source is unavailable
        print(f"Error processing {cls.__name__}: {e}")
        return None

if __name__ == "__main__":
    print("Collecting config docs...")

    # all parsed config specs, keyed by class name
    all_config_specs = {}

    # names of specs already rendered into some group
    rendered_config_specs = set()

    # get specs in stable alphabetical order so the output is deterministic
    found_config_specs = sorted(KNOWN_CONFIG_SPEC_CLASSES, key=lambda x: x.__name__)

    # process all specs
    for cls in found_config_specs:
        print("Processing", cls.__name__)
        try:
            if cls.__name__ not in EXCLUDED_CLASSES and (spec := extract_config_info(cls)):
                all_config_specs[cls.__name__] = spec
        except (IndentationError, NameError) as e:
            # classes whose source cannot be parsed/resolved in isolation are skipped
            print(f"Error processing {cls.__name__}: {e}")

    # for each spec, inherit missing field docstrings from superclasses;
    # keep searching up the MRO until a base actually provides a non-empty doc
    for name, spec in all_config_specs.items():
        for prop in spec["properties"].values():
            if not prop["doc"]:
                for base in spec["bases"]:
                    if base in all_config_specs and prop["name"] in all_config_specs[base]["properties"]:
                        doc = all_config_specs[base]["properties"][prop["name"]]["doc"]
                        if doc:
                            prop["doc"] = doc
                            break

    # render the defined groups; each spec goes into the first group
    # (in CONFIG_GROUPS order) whose base class appears in its MRO
    lines = []
    for group in CONFIG_GROUPS:
        lines.append(f"## {group['name']}")
        for name, spec in all_config_specs.items():
            if name in rendered_config_specs:
                continue
            for base in spec["bases"]:
                if base in group["base_classes"]:
                    rendered_config_specs.add(name)
                    lines.append(f"### {name}")
                    # description may be None when the class has no docstring —
                    # avoid writing the literal string "None" into the page
                    lines.append(spec["description"] or "")
                    lines.append("")
                    for prop in spec["properties"].values():
                        lines.append(
                            f"* **`{prop['name']}`** - _{prop['type']}_ <br /> {prop['doc']}"
                        )
                    lines.append("")
                    break

    # write to output
    with open(OUTPUT_FILE, "w") as f:
        f.write(OUTPUT_HEADER)
        f.write("\n".join(lines))
Loading