From 8d7383807550467c9c17e25b25803f53c68d8c59 Mon Sep 17 00:00:00 2001 From: Jed <101430255+jediela@users.noreply.github.com> Date: Fri, 21 Nov 2025 17:45:53 -0500 Subject: [PATCH 1/6] Create db_features.mdx --- docs/docs/configuration/db_features.mdx | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/docs/configuration/db_features.mdx diff --git a/docs/docs/configuration/db_features.mdx b/docs/docs/configuration/db_features.mdx new file mode 100644 index 000000000000..8b137891791f --- /dev/null +++ b/docs/docs/configuration/db_features.mdx @@ -0,0 +1 @@ + From 35d12c444c6d87e7a2bea654a387e598847ee207 Mon Sep 17 00:00:00 2001 From: Jed <101430255+jediela@users.noreply.github.com> Date: Fri, 21 Nov 2025 17:49:22 -0500 Subject: [PATCH 2/6] Add front matter to db_features.mdx --- docs/docs/configuration/db_features.mdx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/docs/configuration/db_features.mdx b/docs/docs/configuration/db_features.mdx index 8b137891791f..e1d07bd5e1a4 100644 --- a/docs/docs/configuration/db_features.mdx +++ b/docs/docs/configuration/db_features.mdx @@ -1 +1,6 @@ - +--- +title: Database Features +hide_title: true +sidebar_position: 1 +version: 1 +--- From 91e849c3f48359022e42f2c9748940e6591e171b Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Fri, 21 Nov 2025 18:14:52 -0500 Subject: [PATCH 3/6] refactor: refactor `get_query_result` (#36057) --- superset/common/query_context.py | 12 +- superset/common/query_context_processor.py | 776 +---------------- superset/connectors/sqla/models.py | 102 +-- superset/models/helpers.py | 805 +++++++++++++++++- .../integration_tests/query_context_tests.py | 66 +- .../common/test_query_context_processor.py | 228 +++-- tests/unit_tests/common/test_time_shifts.py | 32 +- 7 files changed, 1066 insertions(+), 955 deletions(-) diff --git a/superset/common/query_context.py b/superset/common/query_context.py index a04e3944603f..400c4a950382 100644 --- a/superset/common/query_context.py +++ b/superset/common/query_context.py @@ -22,10 +22,7 @@ import pandas as pd from superset.common.chart_data import ChartDataResultFormat, ChartDataResultType -from superset.common.query_context_processor import ( - CachedTimeOffset, - QueryContextProcessor, -) +from superset.common.query_context_processor import QueryContextProcessor from superset.common.query_object import QueryObject from superset.models.slice import Slice from superset.utils.core import GenericDataType @@ -128,12 +125,5 @@ def get_df_payload( def get_query_result(self, query_object: QueryObject) -> QueryResult: return self._processor.get_query_result(query_object) - def processing_time_offsets( - self, - df: pd.DataFrame, - query_object: QueryObject, - ) -> CachedTimeOffset: - return self._processor.processing_time_offsets(df, query_object) - def raise_for_access(self) -> None: self._processor.raise_for_access() diff --git a/superset/common/query_context_processor.py b/superset/common/query_context_processor.py index 94f54097e236..8c488997f14b 100644 --- a/superset/common/query_context_processor.py +++ b/superset/common/query_context_processor.py @@ -16,61 +16,42 @@ # under the License. 
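# --- Illustrative sketch (not part of the patch) ----------------------------
# How callers migrate after this refactor: `QueryContext.processing_time_offsets`
# is removed, so time-offset handling is reached through the datasource, with
# cache helpers passed in explicitly (this mirrors the integration-test changes
# at the end of this patch). `query_context`, `query_object` and `df` are
# assumed to come from ChartDataQueryContextSchema().load(payload) as in those
# tests; `_processor` is private and is used here only the way the tests use it.

def cache_key_fn(query_object, time_offset, time_grain):
    # delegate cache-key generation back to the query context processor
    return query_context._processor.query_cache_key(
        query_object, time_offset=time_offset, time_grain=time_grain
    )

def cache_timeout_fn():
    return query_context._processor.get_cache_timeout()

rv = query_context.datasource.processing_time_offsets(
    df.copy(), query_object, cache_key_fn, cache_timeout_fn, query_context.force
)
shifted_df, queries, cache_keys = rv["df"], rv["queries"], rv["cache_keys"]
# -----------------------------------------------------------------------------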
from __future__ import annotations -import copy import logging import re -from datetime import datetime -from typing import Any, cast, ClassVar, TYPE_CHECKING, TypedDict +from typing import Any, cast, ClassVar, TYPE_CHECKING -import numpy as np import pandas as pd from flask import current_app from flask_babel import gettext as _ -from pandas import DateOffset from superset.common.chart_data import ChartDataResultFormat from superset.common.db_query_status import QueryStatus from superset.common.query_actions import get_query_results -from superset.common.utils import dataframe_utils from superset.common.utils.query_cache_manager import QueryCacheManager -from superset.common.utils.time_range_utils import ( - get_since_until_from_query_object, - get_since_until_from_time_range, -) +from superset.common.utils.time_range_utils import get_since_until_from_time_range from superset.connectors.sqla.models import BaseDatasource -from superset.constants import CACHE_DISABLED_TIMEOUT, CacheRegion, TimeGrain +from superset.constants import CACHE_DISABLED_TIMEOUT, CacheRegion from superset.daos.annotation_layer import AnnotationLayerDAO from superset.daos.chart import ChartDAO from superset.exceptions import ( - InvalidPostProcessingError, QueryObjectValidationError, SupersetException, ) -from superset.extensions import cache_manager, feature_flag_manager, security_manager +from superset.extensions import cache_manager, security_manager from superset.models.helpers import QueryResult -from superset.models.sql_lab import Query from superset.superset_typing import AdhocColumn, AdhocMetric from superset.utils import csv, excel from superset.utils.cache import generate_cache_key, set_and_log_cache from superset.utils.core import ( DatasourceType, - DateColumn, DTTM_ALIAS, error_msg_from_exception, - FilterOperator, GenericDataType, - get_base_axis_labels, get_column_names_from_columns, get_column_names_from_metrics, - get_metric_names, - get_x_axis_label, is_adhoc_column, is_adhoc_metric, - normalize_dttm_col, - QueryObjectFilterClause, - TIME_COMPARISON, ) -from superset.utils.date_parser import get_past_or_future, normalize_time_delta from superset.utils.pandas_postprocessing.utils import unescape_separator from superset.views.utils import get_viz from superset.viz import viz_types @@ -81,33 +62,6 @@ logger = logging.getLogger(__name__) -# Offset join column suffix used for joining offset results -OFFSET_JOIN_COLUMN_SUFFIX = "__offset_join_column_" - -# This only includes time grains that may influence -# the temporal column used for joining offset results. -# Given that we don't allow time shifts smaller than a day, -# we don't need to include smaller time grains aggregations. 
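# --- Illustrative sketch (not part of the patch) ----------------------------
# The offset-join constants removed here are re-added in superset/models/helpers.py,
# where ExploreMixin.generate_join_column builds the per-grain join key. A hedged
# example of its output, assuming ExploreMixin stays importable from
# superset.models.helpers after this patch:
import pandas as pd

from superset.constants import TimeGrain
from superset.models.helpers import ExploreMixin

row = pd.Series([pd.Timestamp("2015-01-03")])
ExploreMixin.generate_join_column(row, 0, TimeGrain.MONTH)    # "2015-01"
ExploreMixin.generate_join_column(row, 0, TimeGrain.QUARTER)  # "2015-Q1"
ExploreMixin.generate_join_column(row, 0, TimeGrain.YEAR)     # "2015"
# -----------------------------------------------------------------------------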
-AGGREGATED_JOIN_GRAINS = { - TimeGrain.WEEK, - TimeGrain.WEEK_STARTING_SUNDAY, - TimeGrain.WEEK_STARTING_MONDAY, - TimeGrain.WEEK_ENDING_SATURDAY, - TimeGrain.WEEK_ENDING_SUNDAY, - TimeGrain.MONTH, - TimeGrain.QUARTER, - TimeGrain.YEAR, -} - -# Right suffix used for joining offset results -R_SUFFIX = "__right_suffix" - - -class CachedTimeOffset(TypedDict): - df: pd.DataFrame - queries: list[str] - cache_keys: list[str | None] - class QueryContextProcessor: """ @@ -266,726 +220,14 @@ def query_cache_key(self, query_obj: QueryObject, **kwargs: Any) -> str | None: return cache_key def get_query_result(self, query_object: QueryObject) -> QueryResult: - """Returns a pandas dataframe based on the query object""" - query_context = self._query_context - # Here, we assume that all the queries will use the same datasource, which is - # a valid assumption for current setting. In the long term, we may - # support multiple queries from different data sources. - - query = "" - if isinstance(query_context.datasource, Query): - # todo(hugh): add logic to manage all sip68 models here - result = query_context.datasource.exc_query(query_object.to_dict()) - else: - result = query_context.datasource.query(query_object.to_dict()) - query = result.query + ";\n\n" - - df = result.df - # Transform the timestamp we received from database to pandas supported - # datetime format. If no python_date_format is specified, the pattern will - # be considered as the default ISO date format - # If the datetime format is unix, the parse will use the corresponding - # parsing logic - if not df.empty: - df = self.normalize_df(df, query_object) - - if query_object.time_offsets: - time_offsets = self.processing_time_offsets(df, query_object) - df = time_offsets["df"] - queries = time_offsets["queries"] - - query += ";\n\n".join(queries) - query += ";\n\n" - - # Re-raising QueryObjectValidationError - try: - df = query_object.exec_post_processing(df) - except InvalidPostProcessingError as ex: - raise QueryObjectValidationError(ex.message) from ex - - result.df = df - result.query = query - result.from_dttm = query_object.from_dttm - result.to_dttm = query_object.to_dttm - return result - - def normalize_df(self, df: pd.DataFrame, query_object: QueryObject) -> pd.DataFrame: - # todo: should support "python_date_format" and "get_column" in each datasource - def _get_timestamp_format( - source: BaseDatasource, column: str | None - ) -> str | None: - column_obj = source.get_column(column) - if ( - column_obj - # only sqla column was supported - and hasattr(column_obj, "python_date_format") - and (formatter := column_obj.python_date_format) - ): - return str(formatter) - - return None - - datasource = self._qc_datasource - labels = tuple( - label - for label in [ - *get_base_axis_labels(query_object.columns), - query_object.granularity, - ] - if datasource - # Query datasource didn't support `get_column` - and hasattr(datasource, "get_column") - and (col := datasource.get_column(label)) - # todo(hugh) standardize column object in Query datasource - and (col.get("is_dttm") if isinstance(col, dict) else col.is_dttm) - ) - dttm_cols = [ - DateColumn( - timestamp_format=_get_timestamp_format(datasource, label), - offset=datasource.offset, - time_shift=query_object.time_shift, - col_label=label, - ) - for label in labels - if label - ] - if DTTM_ALIAS in df: - dttm_cols.append( - DateColumn.get_legacy_time_column( - timestamp_format=_get_timestamp_format( - datasource, query_object.granularity - ), - offset=datasource.offset, - 
time_shift=query_object.time_shift, - ) - ) - normalize_dttm_col( - df=df, - dttm_cols=tuple(dttm_cols), - ) - - if self.enforce_numerical_metrics: - dataframe_utils.df_metrics_to_num(df, query_object) - - df.replace([np.inf, -np.inf], np.nan, inplace=True) - - return df - - @staticmethod - def get_time_grain(query_object: QueryObject) -> Any | None: - if ( - query_object.columns - and len(query_object.columns) > 0 - and isinstance(query_object.columns[0], dict) - ): - # If the time grain is in the columns it will be the first one - # and it will be of AdhocColumn type - return query_object.columns[0].get("timeGrain") - - return query_object.extras.get("time_grain_sqla") - - # pylint: disable=too-many-arguments - def add_offset_join_column( - self, - df: pd.DataFrame, - name: str, - time_grain: str, - time_offset: str | None = None, - join_column_producer: Any = None, - ) -> None: """ - Adds an offset join column to the provided DataFrame. + Returns a pandas dataframe based on the query object. - The function modifies the DataFrame in-place. - - :param df: pandas DataFrame to which the offset join column will be added. - :param name: The name of the new column to be added. - :param time_grain: The time grain used to calculate the new column. - :param time_offset: The time offset used to calculate the new column. - :param join_column_producer: A function to generate the join column. + This method delegates to the datasource's get_query_result method, + which handles query execution, normalization, time offsets, and + post-processing. """ - if join_column_producer: - df[name] = df.apply(lambda row: join_column_producer(row, 0), axis=1) - else: - df[name] = df.apply( - lambda row: self.generate_join_column(row, 0, time_grain, time_offset), - axis=1, - ) - - def is_valid_date(self, date_string: str) -> bool: - try: - # Attempt to parse the string as a date in the format YYYY-MM-DD - datetime.strptime(date_string, "%Y-%m-%d") - return True - except ValueError: - # If parsing fails, it's not a valid date in the format YYYY-MM-DD - return False - - def is_valid_date_range(self, date_range: str) -> bool: - try: - # Attempt to parse the string as a date range in the format - # YYYY-MM-DD:YYYY-MM-DD - start_date, end_date = date_range.split(":") - datetime.strptime(start_date.strip(), "%Y-%m-%d") - datetime.strptime(end_date.strip(), "%Y-%m-%d") - return True - except ValueError: - # If parsing fails, it's not a valid date range in the format - # YYYY-MM-DD:YYYY-MM-DD - return False - - def get_offset_custom_or_inherit( - self, - offset: str, - outer_from_dttm: datetime, - outer_to_dttm: datetime, - ) -> str: - """ - Get the time offset for custom or inherit. - - :param offset: The offset string. - :param outer_from_dttm: The outer from datetime. - :param outer_to_dttm: The outer to datetime. - :returns: The time offset. 
- """ - if offset == "inherit": - # return the difference in days between the from and the to dttm formatted as a string with the " days ago" suffix # noqa: E501 - return f"{(outer_to_dttm - outer_from_dttm).days} days ago" - if self.is_valid_date(offset): - # return the offset as the difference in days between the outer from dttm and the offset date (which is a YYYY-MM-DD string) formatted as a string with the " days ago" suffix # noqa: E501 - offset_date = datetime.strptime(offset, "%Y-%m-%d") - return f"{(outer_from_dttm - offset_date).days} days ago" - return "" - - def processing_time_offsets( # pylint: disable=too-many-locals,too-many-statements # noqa: C901 - self, - df: pd.DataFrame, - query_object: QueryObject, - ) -> CachedTimeOffset: - """ - Process time offsets for time comparison feature. - - This method handles both relative time offsets (e.g., "1 week ago") and - absolute date range offsets (e.g., "2015-01-03 : 2015-01-04"). - """ - query_context = self._query_context - # ensure query_object is immutable - query_object_clone = copy.copy(query_object) - queries: list[str] = [] - cache_keys: list[str | None] = [] - offset_dfs: dict[str, pd.DataFrame] = {} - - outer_from_dttm, outer_to_dttm = get_since_until_from_query_object(query_object) - if not outer_from_dttm or not outer_to_dttm: - raise QueryObjectValidationError( - _( - "An enclosed time range (both start and end) must be specified " - "when using a Time Comparison." - ) - ) - - time_grain = self.get_time_grain(query_object) - metric_names = get_metric_names(query_object.metrics) - # use columns that are not metrics as join keys - join_keys = [col for col in df.columns if col not in metric_names] - - for offset in query_object.time_offsets: - try: - original_offset = offset - is_date_range_offset = self.is_valid_date_range(offset) - - if is_date_range_offset and feature_flag_manager.is_feature_enabled( - "DATE_RANGE_TIMESHIFTS_ENABLED" - ): - # DATE RANGE OFFSET LOGIC (like "2015-01-03 : 2015-01-04") - try: - # Parse the specified range - offset_from_dttm, offset_to_dttm = ( - get_since_until_from_time_range(time_range=offset) - ) - except ValueError as ex: - raise QueryObjectValidationError(str(ex)) from ex - - # Use the specified range directly - query_object_clone.from_dttm = offset_from_dttm - query_object_clone.to_dttm = offset_to_dttm - - # For date range offsets, we must NOT set inner bounds - # These create additional WHERE clauses that conflict with our - # date range - query_object_clone.inner_from_dttm = None - query_object_clone.inner_to_dttm = None - - elif is_date_range_offset: - # Date range timeshift feature is disabled - raise QueryObjectValidationError( - "Date range timeshifts are not enabled. " - "Please contact your administrator to enable the " - "DATE_RANGE_TIMESHIFTS_ENABLED feature flag." 
- ) - - else: - # RELATIVE OFFSET LOGIC (like "1 day ago") - if self.is_valid_date(offset) or offset == "inherit": - offset = self.get_offset_custom_or_inherit( - offset, - outer_from_dttm, - outer_to_dttm, - ) - query_object_clone.from_dttm = get_past_or_future( - offset, - outer_from_dttm, - ) - query_object_clone.to_dttm = get_past_or_future( - offset, outer_to_dttm - ) - - query_object_clone.inner_from_dttm = query_object_clone.from_dttm - query_object_clone.inner_to_dttm = query_object_clone.to_dttm - - x_axis_label = get_x_axis_label(query_object.columns) - query_object_clone.granularity = ( - query_object_clone.granularity or x_axis_label - ) - - except ValueError as ex: - raise QueryObjectValidationError(str(ex)) from ex - - query_object_clone.time_offsets = [] - query_object_clone.post_processing = [] - - # Get time offset index - index = (get_base_axis_labels(query_object.columns) or [DTTM_ALIAS])[0] - - if is_date_range_offset and feature_flag_manager.is_feature_enabled( - "DATE_RANGE_TIMESHIFTS_ENABLED" - ): - # Create a completely new filter list to preserve original filters - query_object_clone.filter = copy.deepcopy(query_object_clone.filter) - - # Remove any existing temporal filters that might conflict - query_object_clone.filter = [ - flt - for flt in query_object_clone.filter - if not (flt.get("op") == FilterOperator.TEMPORAL_RANGE) - ] - - # Determine the temporal column with multiple fallback strategies - temporal_col = self._get_temporal_column_for_filter( - query_object_clone, x_axis_label - ) - - # Always add a temporal filter for date range offsets - if temporal_col: - new_temporal_filter: QueryObjectFilterClause = { - "col": temporal_col, - "op": FilterOperator.TEMPORAL_RANGE, - "val": ( - f"{query_object_clone.from_dttm} : " - f"{query_object_clone.to_dttm}" - ), - } - query_object_clone.filter.append(new_temporal_filter) - - else: - # This should rarely happen with proper fallbacks - raise QueryObjectValidationError( - _( - "Unable to identify temporal column for date range time comparison." # noqa: E501 - "Please ensure your dataset has a properly configured time column." 
# noqa: E501 - ) - ) - - else: - # RELATIVE OFFSET: Original logic for non-date-range offsets - # The comparison is not using a temporal column so we need to modify - # the temporal filter so we run the query with the correct time range - if not dataframe_utils.is_datetime_series(df.get(index)): - query_object_clone.filter = copy.deepcopy(query_object_clone.filter) - - # Find and update temporal filters - for flt in query_object_clone.filter: - if flt.get( - "op" - ) == FilterOperator.TEMPORAL_RANGE and isinstance( - flt.get("val"), str - ): - time_range = cast(str, flt.get("val")) - ( - new_outer_from_dttm, - new_outer_to_dttm, - ) = get_since_until_from_time_range( - time_range=time_range, - time_shift=offset, - ) - flt["val"] = f"{new_outer_from_dttm} : {new_outer_to_dttm}" - else: - # If it IS a datetime series, we still need to clear conflicts - query_object_clone.filter = copy.deepcopy(query_object_clone.filter) - - # For relative offsets with datetime series, ensure the temporal - # filter matches our range - temporal_col = query_object_clone.granularity or x_axis_label - - # Update any existing temporal filters to match our shifted range - for flt in query_object_clone.filter: - if ( - flt.get("op") == FilterOperator.TEMPORAL_RANGE - and flt.get("col") == temporal_col - ): - flt["val"] = ( - f"{query_object_clone.from_dttm} : " - f"{query_object_clone.to_dttm}" - ) - - # Remove non-temporal x-axis filters (but keep temporal ones) - query_object_clone.filter = [ - flt - for flt in query_object_clone.filter - if not ( - flt.get("col") == x_axis_label - and flt.get("op") != FilterOperator.TEMPORAL_RANGE - ) - ] - - # Continue with the rest of the method (caching, execution, etc.) - cached_time_offset_key = ( - offset if offset == original_offset else f"{offset}_{original_offset}" - ) - - cache_key = self.query_cache_key( - query_object_clone, - time_offset=cached_time_offset_key, - time_grain=time_grain, - ) - cache = QueryCacheManager.get( - cache_key, CacheRegion.DATA, query_context.force - ) - - if cache.is_loaded: - offset_dfs[offset] = cache.df - queries.append(cache.query) - cache_keys.append(cache_key) - continue - - query_object_clone_dct = query_object_clone.to_dict() - - # rename metrics: SUM(value) => SUM(value) 1 year ago - metrics_mapping = { - metric: TIME_COMPARISON.join([metric, original_offset]) - for metric in metric_names - } - - # When the original query has limit or offset we wont apply those - # to the subquery so we prevent data inconsistency due to missing records - # in the dataframes when performing the join - if query_object.row_limit or query_object.row_offset: - query_object_clone_dct["row_limit"] = current_app.config["ROW_LIMIT"] - query_object_clone_dct["row_offset"] = 0 - - if isinstance(self._qc_datasource, Query): - result = self._qc_datasource.exc_query(query_object_clone_dct) - else: - result = self._qc_datasource.query(query_object_clone_dct) - - queries.append(result.query) - cache_keys.append(None) - - offset_metrics_df = result.df - if offset_metrics_df.empty: - offset_metrics_df = pd.DataFrame( - { - col: [np.NaN] - for col in join_keys + list(metrics_mapping.values()) - } - ) - else: - # 1. normalize df, set dttm column - offset_metrics_df = self.normalize_df( - offset_metrics_df, query_object_clone - ) - - # 2. 
rename extra query columns - offset_metrics_df = offset_metrics_df.rename(columns=metrics_mapping) - - # cache df and query - value = { - "df": offset_metrics_df, - "query": result.query, - } - cache.set( - key=cache_key, - value=value, - timeout=self.get_cache_timeout(), - datasource_uid=query_context.datasource.uid, - region=CacheRegion.DATA, - ) - offset_dfs[offset] = offset_metrics_df - - if offset_dfs: - df = self.join_offset_dfs( - df, - offset_dfs, - time_grain, - join_keys, - ) - - return CachedTimeOffset(df=df, queries=queries, cache_keys=cache_keys) - - def _get_temporal_column_for_filter( # noqa: C901 - self, query_object: QueryObject, x_axis_label: str | None - ) -> str | None: - """ - Helper method to reliably determine the temporal column for filtering. - - This method tries multiple strategies to find the correct temporal column: - 1. Use explicitly set granularity - 2. Use x_axis_label if it's a temporal column - 3. Find any datetime column in the datasource - - :param query_object: The query object - :param x_axis_label: The x-axis label from the query - :return: The name of the temporal column, or None if not found - """ - # Strategy 1: Use explicitly set granularity - if query_object.granularity: - return query_object.granularity - - # Strategy 2: Use x_axis_label if it exists - if x_axis_label: - return x_axis_label - - # Strategy 3: Find any datetime column in the datasource - if hasattr(self._qc_datasource, "columns"): - for col in self._qc_datasource.columns: - if hasattr(col, "is_dttm") and col.is_dttm: - if hasattr(col, "column_name"): - return col.column_name - elif hasattr(col, "name"): - return col.name - - return None - - def _process_date_range_offset( - self, offset_df: pd.DataFrame, join_keys: list[str] - ) -> tuple[pd.DataFrame, list[str]]: - """Process date range offset data and return modified DataFrame and keys.""" - temporal_cols = ["ds", "__timestamp", "dttm"] - non_temporal_join_keys = [key for key in join_keys if key not in temporal_cols] - - if non_temporal_join_keys: - return offset_df, non_temporal_join_keys - - metric_columns = [col for col in offset_df.columns if col not in temporal_cols] - - if metric_columns: - aggregated_values = {} - for col in metric_columns: - if pd.api.types.is_numeric_dtype(offset_df[col]): - aggregated_values[col] = offset_df[col].sum() - else: - aggregated_values[col] = ( - offset_df[col].iloc[0] if not offset_df.empty else None - ) - - offset_df = pd.DataFrame([aggregated_values]) - - return offset_df, [] - - def _apply_cleanup_logic( - self, - df: pd.DataFrame, - offset: str, - time_grain: str | None, - join_keys: list[str], - is_date_range_offset: bool, - ) -> pd.DataFrame: - """Apply appropriate cleanup logic based on offset type.""" - if time_grain and not is_date_range_offset: - if join_keys: - col = df.pop(join_keys[0]) - df.insert(0, col.name, col) - - df.drop( - list(df.filter(regex=f"{OFFSET_JOIN_COLUMN_SUFFIX}|{R_SUFFIX}")), - axis=1, - inplace=True, - ) - elif is_date_range_offset: - df.drop( - list(df.filter(regex=f"{R_SUFFIX}")), - axis=1, - inplace=True, - ) - else: - df.drop( - list(df.filter(regex=f"{R_SUFFIX}")), - axis=1, - inplace=True, - ) - - return df - - def _determine_join_keys( - self, - df: pd.DataFrame, - offset_df: pd.DataFrame, - offset: str, - time_grain: str | None, - join_keys: list[str], - is_date_range_offset: bool, - join_column_producer: Any, - ) -> tuple[pd.DataFrame, list[str]]: - """Determine appropriate join keys and modify DataFrames if needed.""" - if time_grain and not 
is_date_range_offset: - column_name = OFFSET_JOIN_COLUMN_SUFFIX + offset - - # Add offset join columns for relative time offsets - self.add_offset_join_column( - df, column_name, time_grain, offset, join_column_producer - ) - self.add_offset_join_column( - offset_df, column_name, time_grain, None, join_column_producer - ) - return offset_df, [column_name, *join_keys[1:]] - - elif is_date_range_offset: - return self._process_date_range_offset(offset_df, join_keys) - - else: - return offset_df, join_keys - - def _perform_join( - self, df: pd.DataFrame, offset_df: pd.DataFrame, actual_join_keys: list[str] - ) -> pd.DataFrame: - """Perform the appropriate join operation.""" - if actual_join_keys: - return dataframe_utils.left_join_df( - left_df=df, - right_df=offset_df, - join_keys=actual_join_keys, - rsuffix=R_SUFFIX, - ) - else: - temp_key = "__temp_join_key__" - df[temp_key] = 1 - offset_df[temp_key] = 1 - - result_df = dataframe_utils.left_join_df( - left_df=df, - right_df=offset_df, - join_keys=[temp_key], - rsuffix=R_SUFFIX, - ) - - # Remove temporary join keys - result_df.drop(columns=[temp_key], inplace=True, errors="ignore") - result_df.drop( - columns=[f"{temp_key}{R_SUFFIX}"], inplace=True, errors="ignore" - ) - return result_df - - def join_offset_dfs( - self, - df: pd.DataFrame, - offset_dfs: dict[str, pd.DataFrame], - time_grain: str | None, - join_keys: list[str], - ) -> pd.DataFrame: - """ - Join offset DataFrames with the main DataFrame. - - :param df: The main DataFrame. - :param offset_dfs: A list of offset DataFrames. - :param time_grain: The time grain used to calculate the temporal join key. - :param join_keys: The keys to join on. - """ - join_column_producer = current_app.config[ - "TIME_GRAIN_JOIN_COLUMN_PRODUCERS" - ].get(time_grain) - - if join_column_producer and not time_grain: - raise QueryObjectValidationError( - _("Time Grain must be specified when using Time Shift.") - ) - - for offset, offset_df in offset_dfs.items(): - is_date_range_offset = self.is_valid_date_range( - offset - ) and feature_flag_manager.is_feature_enabled( - "DATE_RANGE_TIMESHIFTS_ENABLED" - ) - - offset_df, actual_join_keys = self._determine_join_keys( - df, - offset_df, - offset, - time_grain, - join_keys, - is_date_range_offset, - join_column_producer, - ) - - df = self._perform_join(df, offset_df, actual_join_keys) - df = self._apply_cleanup_logic( - df, offset, time_grain, join_keys, is_date_range_offset - ) - - return df - - @staticmethod - def generate_join_column( - row: pd.Series, - column_index: int, - time_grain: str, - time_offset: str | None = None, - ) -> str: - value = row[column_index] - - if hasattr(value, "strftime"): - if time_offset and not QueryContextProcessor.is_valid_date_range_static( - time_offset - ): - value = value + DateOffset(**normalize_time_delta(time_offset)) - - if time_grain in ( - TimeGrain.WEEK_STARTING_SUNDAY, - TimeGrain.WEEK_ENDING_SATURDAY, - ): - return value.strftime("%Y-W%U") - - if time_grain in ( - TimeGrain.WEEK, - TimeGrain.WEEK_STARTING_MONDAY, - TimeGrain.WEEK_ENDING_SUNDAY, - ): - return value.strftime("%Y-W%W") - - if time_grain == TimeGrain.MONTH: - return value.strftime("%Y-%m") - - if time_grain == TimeGrain.QUARTER: - return value.strftime("%Y-Q") + str(value.quarter) - - if time_grain == TimeGrain.YEAR: - return value.strftime("%Y") - - return str(value) - - @staticmethod - def is_valid_date_range_static(date_range: str) -> bool: - """Static version of is_valid_date_range for use in static methods""" - try: - # Attempt to parse the 
string as a date range in the format - # YYYY-MM-DD:YYYY-MM-DD - start_date, end_date = date_range.split(":") - datetime.strptime(start_date.strip(), "%Y-%m-%d") - datetime.strptime(end_date.strip(), "%Y-%m-%d") - return True - except ValueError: - # If parsing fails, it's not a valid date range in the format - # YYYY-MM-DD:YYYY-MM-DD - return False + return self._qc_datasource.get_query_result(query_object) def get_data( self, df: pd.DataFrame, coltypes: list[GenericDataType] diff --git a/superset/connectors/sqla/models.py b/superset/connectors/sqla/models.py index 73a60de1ede5..39bc716b6d58 100644 --- a/superset/connectors/sqla/models.py +++ b/superset/connectors/sqla/models.py @@ -18,12 +18,11 @@ from __future__ import annotations import builtins -import dataclasses import logging from collections import defaultdict from collections.abc import Hashable from dataclasses import dataclass, field -from datetime import datetime, timedelta +from datetime import timedelta from typing import Any, Callable, cast, Optional, Union import pandas as pd @@ -82,8 +81,6 @@ ColumnNotFoundException, DatasetInvalidPermissionEvaluationException, QueryObjectValidationError, - SupersetErrorException, - SupersetErrorsException, SupersetGenericDBErrorException, SupersetSecurityException, SupersetSyntaxErrorException, @@ -1628,89 +1625,28 @@ def _get_top_groups( return or_(*groups) def query(self, query_obj: QueryObjectDict) -> QueryResult: - qry_start_dttm = datetime.now() - query_str_ext = self.get_query_str_extended(query_obj) - sql = query_str_ext.sql - status = QueryStatus.SUCCESS - errors = None - error_message = None + """ + Executes the query for SqlaTable with additional column ordering logic. - def assign_column_label(df: pd.DataFrame) -> pd.DataFrame | None: - """ - Some engines change the case or generate bespoke column names, either by - default or due to lack of support for aliasing. This function ensures that - the column names in the DataFrame correspond to what is expected by - the viz components. - - Sometimes a query may also contain only order by columns that are not used - as metrics or groupby columns, but need to present in the SQL `select`, - filtering by `labels_expected` make sure we only return columns users want. - - :param df: Original DataFrame returned by the engine - :return: Mutated DataFrame - """ - labels_expected = query_str_ext.labels_expected - if df is not None and not df.empty: - if len(df.columns) < len(labels_expected): - raise QueryObjectValidationError( - _("Db engine did not return all queried columns") - ) - if len(df.columns) > len(labels_expected): - df = df.iloc[:, 0 : len(labels_expected)] - df.columns = labels_expected - - extras = query_obj.get("extras", {}) - column_order = extras.get("column_order") - if column_order and isinstance(column_order, list): - existing_cols = [col for col in column_order if col in df.columns] - remaining_cols = [ - col for col in df.columns if col not in existing_cols - ] - final_order = existing_cols + remaining_cols - df = df[final_order] - return df + This overrides ExploreMixin.query() to add SqlaTable-specific behavior + for handling column_order from extras. 
+ """ + # Get the base result from ExploreMixin + # (explicitly, not super() which would hit BaseDatasource first) + result = ExploreMixin.query(self, query_obj) - try: - df = self.database.get_df( - sql, - self.catalog, - self.schema or None, - mutator=assign_column_label, - ) - except (SupersetErrorException, SupersetErrorsException): - # SupersetError(s) exception should not be captured; instead, they should - # bubble up to the Flask error handler so they are returned as proper SIP-40 - # errors. This is particularly important for database OAuth2, see SIP-85. - raise - except Exception as ex: # pylint: disable=broad-except - # TODO (betodealmeida): review exception handling while querying the external # noqa: E501 - # database. Ideally we'd expect and handle external database error, but - # everything else / the default should be to let things bubble up. - df = pd.DataFrame() - status = QueryStatus.FAILED - logger.warning( - "Query %s on schema %s failed", sql, self.schema, exc_info=True - ) - db_engine_spec = self.db_engine_spec - errors = [ - dataclasses.asdict(error) - for error in db_engine_spec.extract_errors( - ex, database_name=self.database.unique_name - ) + # Apply SqlaTable-specific column ordering + extras = query_obj.get("extras", {}) + column_order = extras.get("column_order") + if column_order and isinstance(column_order, list) and not result.df.empty: + existing_cols = [col for col in column_order if col in result.df.columns] + remaining_cols = [ + col for col in result.df.columns if col not in existing_cols ] - error_message = utils.error_msg_from_exception(ex) + final_order = existing_cols + remaining_cols + result.df = result.df[final_order] - return QueryResult( - applied_template_filters=query_str_ext.applied_template_filters, - applied_filter_columns=query_str_ext.applied_filter_columns, - rejected_filter_columns=query_str_ext.rejected_filter_columns, - status=status, - df=df, - duration=datetime.now() - qry_start_dttm, - query=sql, - errors=errors, - error_message=error_message, - ) + return result def get_sqla_table_object(self) -> Table: return self.database.get_table( diff --git a/superset/models/helpers.py b/superset/models/helpers.py index 4e5d10047211..ecf1ff869cd0 100644 --- a/superset/models/helpers.py +++ b/superset/models/helpers.py @@ -20,6 +20,7 @@ from __future__ import annotations import builtins +import copy import dataclasses import logging import re @@ -52,6 +53,7 @@ from flask_babel import get_locale, lazy_gettext as _ from jinja2.exceptions import TemplateError from markupsafe import escape, Markup +from pandas import DateOffset from sqlalchemy import and_, Column, or_, UniqueConstraint from sqlalchemy.exc import MultipleResultsFound from sqlalchemy.ext.declarative import declared_attr @@ -64,15 +66,22 @@ from superset import db, is_feature_enabled from superset.advanced_data_type.types import AdvancedDataTypeResponse from superset.common.db_query_status import QueryStatus -from superset.common.utils.time_range_utils import get_since_until_from_time_range -from superset.constants import EMPTY_STRING, NULL_STRING +from superset.common.utils import dataframe_utils +from superset.common.utils.time_range_utils import ( + get_since_until_from_query_object, + get_since_until_from_time_range, +) +from superset.constants import CacheRegion, EMPTY_STRING, NULL_STRING, TimeGrain from superset.db_engine_specs.base import TimestampExpression from superset.errors import ErrorLevel, SupersetError, SupersetErrorType from superset.exceptions import ( 
AdvancedDataTypeResponseError, ColumnNotFoundException, + InvalidPostProcessingError, QueryClauseValidationException, QueryObjectValidationError, + SupersetErrorException, + SupersetErrorsException, SupersetSecurityException, SupersetSyntaxErrorException, ) @@ -90,15 +99,25 @@ ) from superset.utils import core as utils, json from superset.utils.core import ( + DateColumn, + DTTM_ALIAS, + FilterOperator, GenericDataType, + get_base_axis_labels, get_column_name, + get_metric_names, get_non_base_axis_columns, get_user_id, + get_x_axis_label, is_adhoc_column, MediumText, + normalize_dttm_col, + QueryObjectFilterClause, remove_duplicates, SqlExpressionType, + TIME_COMPARISON, ) +from superset.utils.date_parser import get_past_or_future, normalize_time_delta from superset.utils.dates import datetime_to_epoch from superset.utils.rls import apply_rls @@ -111,6 +130,7 @@ class ValidationResultDict(TypedDict): if TYPE_CHECKING: + from superset.common.query_object import QueryObject from superset.connectors.sqla.models import SqlMetric, TableColumn from superset.db_engine_specs import BaseEngineSpec from superset.models.core import Database @@ -120,6 +140,21 @@ class ValidationResultDict(TypedDict): VIRTUAL_TABLE_ALIAS = "virtual_table" SERIES_LIMIT_SUBQ_ALIAS = "series_limit" +# Offset join column suffix used for joining offset results +OFFSET_JOIN_COLUMN_SUFFIX = "__offset_join_column_" + +# Right suffix used for joining offset results +R_SUFFIX = "__right_suffix" + + +class CachedTimeOffset(TypedDict): + """Result type for time offset processing""" + + df: pd.DataFrame + queries: list[str] + cache_keys: list[str | None] + + # Keys used to filter QueryObjectDict for get_sqla_query parameters SQLA_QUERY_KEYS = { "apply_fetch_values_predicate", @@ -781,9 +816,6 @@ def type(self) -> str: def db_extra(self) -> Optional[dict[str, Any]]: raise NotImplementedError() - def query(self, query_obj: QueryObjectDict) -> QueryResult: - raise NotImplementedError() - @property def database_id(self) -> int: raise NotImplementedError() @@ -1107,9 +1139,15 @@ def is_alias_used_in_orderby(col: ColumnElement) -> bool: if is_alias_used_in_orderby(col): col.name = f"{col.name}__" - def exc_query(self, qry: Any) -> QueryResult: + def query(self, query_obj: QueryObjectDict) -> QueryResult: + """ + Executes the query and returns a dataframe. + + This method is the unified entry point for query execution across all + datasource types (Query, SqlaTable, etc.). + """ qry_start_dttm = datetime.now() - query_str_ext = self.get_query_str_extended(qry) + query_str_ext = self.get_query_str_extended(query_obj) sql = query_str_ext.sql status = QueryStatus.SUCCESS errors = None @@ -1146,6 +1184,10 @@ def assign_column_label(df: pd.DataFrame) -> Optional[pd.DataFrame]: mutator=assign_column_label, ) except Exception as ex: # pylint: disable=broad-except + # Re-raise SupersetErrorException (includes OAuth2RedirectError) + # to bubble up to API layer + if isinstance(ex, (SupersetErrorException, SupersetErrorsException)): + raise df = pd.DataFrame() status = QueryStatus.FAILED logger.warning( @@ -1172,6 +1214,755 @@ def assign_column_label(df: pd.DataFrame) -> Optional[pd.DataFrame]: error_message=error_message, ) + def exc_query(self, qry: Any) -> QueryResult: + """ + Deprecated: Use query() instead. + This method is kept for backward compatibility. 
+ """ + return self.query(qry) + + def normalize_df(self, df: pd.DataFrame, query_object: QueryObject) -> pd.DataFrame: + """ + Normalize the dataframe by converting datetime columns and ensuring + numerical metrics. + + :param df: The dataframe to normalize + :param query_object: The query object with metadata about columns + :return: Normalized dataframe + """ + + def _get_timestamp_format(column: str | None) -> str | None: + if not hasattr(self, "get_column"): + return None + column_obj = self.get_column(column) + if ( + column_obj + and hasattr(column_obj, "python_date_format") + and (formatter := column_obj.python_date_format) + ): + return str(formatter) + return None + + # Collect datetime columns + labels = tuple( + label + for label in [ + *get_base_axis_labels(query_object.columns), + query_object.granularity, + ] + if hasattr(self, "get_column") + and (col := self.get_column(label)) + and (col.get("is_dttm") if isinstance(col, dict) else col.is_dttm) + ) + + dttm_cols = [ + DateColumn( + timestamp_format=_get_timestamp_format(label), + offset=self.offset, + time_shift=query_object.time_shift, + col_label=label, + ) + for label in labels + if label + ] + + if DTTM_ALIAS in df: + dttm_cols.append( + DateColumn.get_legacy_time_column( + timestamp_format=_get_timestamp_format(query_object.granularity), + offset=self.offset, + time_shift=query_object.time_shift, + ) + ) + + normalize_dttm_col( + df=df, + dttm_cols=tuple(dttm_cols), + ) + + # Convert metrics to numerical values if enforced + if getattr(self, "enforce_numerical_metrics", True): + dataframe_utils.df_metrics_to_num(df, query_object) + + df.replace([np.inf, -np.inf], np.nan, inplace=True) + + return df + + def get_query_result(self, query_object: QueryObject) -> QueryResult: + """ + Execute query and return results with full processing pipeline. + + This method handles: + 1. Query execution via self.query() + 2. DataFrame normalization + 3. Time offset processing (if applicable) + 4. 
Post-processing operations + + :param query_object: The query configuration + :return: QueryResult with processed dataframe + """ + # Execute the base query + result = self.query(query_object.to_dict()) + query = result.query + ";\n\n" if result.query else "" + + # Process the dataframe if not empty + df = result.df + if not df.empty: + # Normalize datetime columns and metrics + df = self.normalize_df(df, query_object) + + # Process time offsets if requested + if query_object.time_offsets: + # Process time offsets using the datasource's own method + # Note: caching is disabled here as we don't have query context + time_offsets = self.processing_time_offsets( + df, query_object, cache_key_fn=None, cache_timeout_fn=None + ) + df = time_offsets["df"] + queries = time_offsets["queries"] + query += ";\n\n".join(queries) + query += ";\n\n" + + # Execute post-processing operations + try: + df = query_object.exec_post_processing(df) + except InvalidPostProcessingError as ex: + raise QueryObjectValidationError(ex.message) from ex + + # Update result with processed data + result.df = df + result.query = query + result.from_dttm = query_object.from_dttm + result.to_dttm = query_object.to_dttm + + return result + + def processing_time_offsets( # pylint: disable=too-many-locals,too-many-statements # noqa: C901 + self, + df: pd.DataFrame, + query_object: QueryObject, + cache_key_fn: Callable[[QueryObject, str, Any], str | None] | None = None, + cache_timeout_fn: Callable[[], int] | None = None, + force_cache: bool = False, + ) -> CachedTimeOffset: + """ + Process time offsets for time comparison feature. + + This method handles both relative time offsets (e.g., "1 week ago") and + absolute date range offsets (e.g., "2015-01-03 : 2015-01-04"). + + :param df: The main dataframe + :param query_object: The query object with time offset configuration + :param cache_key_fn: Optional function to generate cache keys + :param cache_timeout_fn: Optional function to get cache timeout + :param force_cache: Whether to force cache refresh + :return: CachedTimeOffset with processed dataframe and queries + """ + # Import here to avoid circular dependency + # pylint: disable=import-outside-toplevel + from superset.common.utils.query_cache_manager import QueryCacheManager + + # ensure query_object is immutable + query_object_clone = copy.copy(query_object) + queries: list[str] = [] + cache_keys: list[str | None] = [] + offset_dfs: dict[str, pd.DataFrame] = {} + + outer_from_dttm, outer_to_dttm = get_since_until_from_query_object(query_object) + if not outer_from_dttm or not outer_to_dttm: + raise QueryObjectValidationError( + _( + "An enclosed time range (both start and end) must be specified " + "when using a Time Comparison." 
+ ) + ) + + time_grain = self.get_time_grain(query_object) + metric_names = get_metric_names(query_object.metrics) + # use columns that are not metrics as join keys + join_keys = [col for col in df.columns if col not in metric_names] + + for offset in query_object.time_offsets: + try: + original_offset = offset + is_date_range_offset = self.is_valid_date_range(offset) + + if is_date_range_offset and feature_flag_manager.is_feature_enabled( + "DATE_RANGE_TIMESHIFTS_ENABLED" + ): + # DATE RANGE OFFSET LOGIC (like "2015-01-03 : 2015-01-04") + try: + # Parse the specified range + offset_from_dttm, offset_to_dttm = ( + get_since_until_from_time_range(time_range=offset) + ) + except ValueError as ex: + raise QueryObjectValidationError(str(ex)) from ex + + # Use the specified range directly + query_object_clone.from_dttm = offset_from_dttm + query_object_clone.to_dttm = offset_to_dttm + + # For date range offsets, we must NOT set inner bounds + # These create additional WHERE clauses that conflict with our + # date range + query_object_clone.inner_from_dttm = None + query_object_clone.inner_to_dttm = None + + elif is_date_range_offset: + # Date range timeshift feature is disabled + raise QueryObjectValidationError( + "Date range timeshifts are not enabled. " + "Please contact your administrator to enable the " + "DATE_RANGE_TIMESHIFTS_ENABLED feature flag." + ) + + else: + # RELATIVE OFFSET LOGIC (like "1 day ago") + if self.is_valid_date(offset) or offset == "inherit": + offset = self.get_offset_custom_or_inherit( + offset, + outer_from_dttm, + outer_to_dttm, + ) + query_object_clone.from_dttm = get_past_or_future( + offset, + outer_from_dttm, + ) + query_object_clone.to_dttm = get_past_or_future( + offset, outer_to_dttm + ) + + query_object_clone.inner_from_dttm = query_object_clone.from_dttm + query_object_clone.inner_to_dttm = query_object_clone.to_dttm + + x_axis_label = get_x_axis_label(query_object.columns) + query_object_clone.granularity = ( + query_object_clone.granularity or x_axis_label + ) + + except ValueError as ex: + raise QueryObjectValidationError(str(ex)) from ex + + query_object_clone.time_offsets = [] + query_object_clone.post_processing = [] + + # Get time offset index + index = (get_base_axis_labels(query_object.columns) or [DTTM_ALIAS])[0] + + if is_date_range_offset and feature_flag_manager.is_feature_enabled( + "DATE_RANGE_TIMESHIFTS_ENABLED" + ): + # Create a completely new filter list to preserve original filters + query_object_clone.filter = copy.deepcopy(query_object_clone.filter) + + # Remove any existing temporal filters that might conflict + query_object_clone.filter = [ + flt + for flt in query_object_clone.filter + if not (flt.get("op") == FilterOperator.TEMPORAL_RANGE) + ] + + # Determine the temporal column with multiple fallback strategies + temporal_col = self._get_temporal_column_for_filter( + query_object_clone, x_axis_label + ) + + # Always add a temporal filter for date range offsets + if temporal_col: + new_temporal_filter: QueryObjectFilterClause = { + "col": temporal_col, + "op": FilterOperator.TEMPORAL_RANGE, + "val": ( + f"{query_object_clone.from_dttm} : " + f"{query_object_clone.to_dttm}" + ), + } + query_object_clone.filter.append(new_temporal_filter) + + else: + # This should rarely happen with proper fallbacks + raise QueryObjectValidationError( + _( + "Unable to identify temporal column for date range time comparison." # noqa: E501 + "Please ensure your dataset has a properly configured time column." 
# noqa: E501 + ) + ) + + else: + # RELATIVE OFFSET: Original logic for non-date-range offsets + # The comparison is not using a temporal column so we need to modify + # the temporal filter so we run the query with the correct time range + if not dataframe_utils.is_datetime_series(df.get(index)): + query_object_clone.filter = copy.deepcopy(query_object_clone.filter) + + # Find and update temporal filters + for flt in query_object_clone.filter: + if flt.get( + "op" + ) == FilterOperator.TEMPORAL_RANGE and isinstance( + flt.get("val"), str + ): + time_range = cast(str, flt.get("val")) + ( + new_outer_from_dttm, + new_outer_to_dttm, + ) = get_since_until_from_time_range( + time_range=time_range, + time_shift=offset, + ) + flt["val"] = f"{new_outer_from_dttm} : {new_outer_to_dttm}" + else: + # If it IS a datetime series, we still need to clear conflicts + query_object_clone.filter = copy.deepcopy(query_object_clone.filter) + + # For relative offsets with datetime series, ensure the temporal + # filter matches our range + temporal_col = query_object_clone.granularity or x_axis_label + + # Update any existing temporal filters to match our shifted range + for flt in query_object_clone.filter: + if ( + flt.get("op") == FilterOperator.TEMPORAL_RANGE + and flt.get("col") == temporal_col + ): + flt["val"] = ( + f"{query_object_clone.from_dttm} : " + f"{query_object_clone.to_dttm}" + ) + + # Remove non-temporal x-axis filters (but keep temporal ones) + query_object_clone.filter = [ + flt + for flt in query_object_clone.filter + if not ( + flt.get("col") == x_axis_label + and flt.get("op") != FilterOperator.TEMPORAL_RANGE + ) + ] + + # Continue with the rest of the method (caching, execution, etc.) + cached_time_offset_key = ( + offset if offset == original_offset else f"{offset}_{original_offset}" + ) + + cache_key = None + if cache_key_fn: + cache_key = cache_key_fn( + query_object_clone, + cached_time_offset_key, + time_grain, + ) + + cache = QueryCacheManager.get(cache_key, CacheRegion.DATA, force_cache) + + if cache.is_loaded: + offset_dfs[offset] = cache.df + queries.append(cache.query) + cache_keys.append(cache_key) + continue + + query_object_clone_dct = query_object_clone.to_dict() + + # rename metrics: SUM(value) => SUM(value) 1 year ago + metrics_mapping = { + metric: TIME_COMPARISON.join([metric, original_offset]) + for metric in metric_names + } + + # When the original query has limit or offset we wont apply those + # to the subquery so we prevent data inconsistency due to missing records + # in the dataframes when performing the join + if query_object.row_limit or query_object.row_offset: + query_object_clone_dct["row_limit"] = app.config["ROW_LIMIT"] + query_object_clone_dct["row_offset"] = 0 + + # Call the unified query method on the datasource + result = self.query(query_object_clone_dct) + + queries.append(result.query) + cache_keys.append(None) + + offset_metrics_df = result.df + if offset_metrics_df.empty: + offset_metrics_df = pd.DataFrame( + { + col: [np.NaN] + for col in join_keys + list(metrics_mapping.values()) + } + ) + else: + # 1. normalize df, set dttm column + offset_metrics_df = self.normalize_df( + offset_metrics_df, query_object_clone + ) + + # 2. 
rename extra query columns + offset_metrics_df = offset_metrics_df.rename(columns=metrics_mapping) + + # cache df and query if caching is enabled + if cache_key and cache_timeout_fn: + value = { + "df": offset_metrics_df, + "query": result.query, + } + cache.set( + key=cache_key, + value=value, + timeout=cache_timeout_fn(), + datasource_uid=self.uid, + region=CacheRegion.DATA, + ) + offset_dfs[offset] = offset_metrics_df + + if offset_dfs: + df = self.join_offset_dfs( + df, + offset_dfs, + time_grain, + join_keys, + ) + + return CachedTimeOffset(df=df, queries=queries, cache_keys=cache_keys) + + @staticmethod + def get_time_grain(query_object: QueryObject) -> Any | None: + if ( + query_object.columns + and len(query_object.columns) > 0 + and isinstance(query_object.columns[0], dict) + ): + # If the time grain is in the columns it will be the first one + # and it will be of AdhocColumn type + return query_object.columns[0].get("timeGrain") + + return query_object.extras.get("time_grain_sqla") + + def is_valid_date(self, date_string: str) -> bool: + try: + # Attempt to parse the string as a date in the format YYYY-MM-DD + datetime.strptime(date_string, "%Y-%m-%d") + return True + except ValueError: + # If parsing fails, it's not a valid date in the format YYYY-MM-DD + return False + + def is_valid_date_range(self, date_range: str) -> bool: + try: + # Attempt to parse the string as a date range in the format + # YYYY-MM-DD:YYYY-MM-DD + start_date, end_date = date_range.split(":") + datetime.strptime(start_date.strip(), "%Y-%m-%d") + datetime.strptime(end_date.strip(), "%Y-%m-%d") + return True + except ValueError: + # If parsing fails, it's not a valid date range in the format + # YYYY-MM-DD:YYYY-MM-DD + return False + + def get_offset_custom_or_inherit( + self, + offset: str, + outer_from_dttm: datetime, + outer_to_dttm: datetime, + ) -> str: + """ + Get the time offset for custom or inherit. + + :param offset: The offset string. + :param outer_from_dttm: The outer from datetime. + :param outer_to_dttm: The outer to datetime. + :returns: The time offset. + """ + if offset == "inherit": + # return the difference in days between the from and the to dttm formatted as a string with the " days ago" suffix # noqa: E501 + return f"{(outer_to_dttm - outer_from_dttm).days} days ago" + if self.is_valid_date(offset): + # return the offset as the difference in days between the outer from dttm and the offset date (which is a YYYY-MM-DD string) formatted as a string with the " days ago" suffix # noqa: E501 + offset_date = datetime.strptime(offset, "%Y-%m-%d") + return f"{(outer_from_dttm - offset_date).days} days ago" + return "" + + def _get_temporal_column_for_filter( # noqa: C901 + self, query_object: QueryObject, x_axis_label: str | None + ) -> str | None: + """ + Helper method to reliably determine the temporal column for filtering. + + This method tries multiple strategies to find the correct temporal column: + 1. Use explicitly set granularity + 2. Use x_axis_label if it's a temporal column + 3. 
Find any datetime column in the datasource + + :param query_object: The query object + :param x_axis_label: The x-axis label from the query + :return: The name of the temporal column, or None if not found + """ + # Strategy 1: Use explicitly set granularity + if query_object.granularity: + return query_object.granularity + + # Strategy 2: Use x_axis_label if it exists + if x_axis_label: + return x_axis_label + + # Strategy 3: Find any datetime column in the datasource + if hasattr(self, "columns"): + for col in self.columns: + if hasattr(col, "is_dttm") and col.is_dttm: + if hasattr(col, "column_name"): + return col.column_name + elif hasattr(col, "name"): + return col.name + + return None + + def _process_date_range_offset( + self, offset_df: pd.DataFrame, join_keys: list[str] + ) -> tuple[pd.DataFrame, list[str]]: + """Process date range offset data and return modified DataFrame and keys.""" + temporal_cols = ["ds", "__timestamp", "dttm"] + non_temporal_join_keys = [key for key in join_keys if key not in temporal_cols] + + if non_temporal_join_keys: + return offset_df, non_temporal_join_keys + + metric_columns = [col for col in offset_df.columns if col not in temporal_cols] + + if metric_columns: + aggregated_values = {} + for col in metric_columns: + if pd.api.types.is_numeric_dtype(offset_df[col]): + aggregated_values[col] = offset_df[col].sum() + else: + aggregated_values[col] = ( + offset_df[col].iloc[0] if not offset_df.empty else None + ) + + offset_df = pd.DataFrame([aggregated_values]) + + return offset_df, [] + + def _apply_cleanup_logic( + self, + df: pd.DataFrame, + offset: str, + time_grain: str | None, + join_keys: list[str], + is_date_range_offset: bool, + ) -> pd.DataFrame: + """Apply appropriate cleanup logic based on offset type.""" + if time_grain and not is_date_range_offset: + if join_keys: + col = df.pop(join_keys[0]) + df.insert(0, col.name, col) + + df.drop( + list(df.filter(regex=f"{OFFSET_JOIN_COLUMN_SUFFIX}|{R_SUFFIX}")), + axis=1, + inplace=True, + ) + elif is_date_range_offset: + df.drop( + list(df.filter(regex=f"{R_SUFFIX}")), + axis=1, + inplace=True, + ) + else: + df.drop( + list(df.filter(regex=f"{R_SUFFIX}")), + axis=1, + inplace=True, + ) + + return df + + def _determine_join_keys( + self, + df: pd.DataFrame, + offset_df: pd.DataFrame, + offset: str, + time_grain: str | None, + join_keys: list[str], + is_date_range_offset: bool, + join_column_producer: Any, + ) -> tuple[pd.DataFrame, list[str]]: + """Determine appropriate join keys and modify DataFrames if needed.""" + if time_grain and not is_date_range_offset: + column_name = OFFSET_JOIN_COLUMN_SUFFIX + offset + + # Add offset join columns for relative time offsets + self.add_offset_join_column( + df, column_name, time_grain, offset, join_column_producer + ) + self.add_offset_join_column( + offset_df, column_name, time_grain, None, join_column_producer + ) + return offset_df, [column_name, *join_keys[1:]] + + elif is_date_range_offset: + return self._process_date_range_offset(offset_df, join_keys) + + else: + return offset_df, join_keys + + def _perform_join( + self, df: pd.DataFrame, offset_df: pd.DataFrame, actual_join_keys: list[str] + ) -> pd.DataFrame: + """Perform the appropriate join operation.""" + if actual_join_keys: + return dataframe_utils.left_join_df( + left_df=df, + right_df=offset_df, + join_keys=actual_join_keys, + rsuffix=R_SUFFIX, + ) + else: + temp_key = "__temp_join_key__" + df[temp_key] = 1 + offset_df[temp_key] = 1 + + result_df = dataframe_utils.left_join_df( + left_df=df, + 
right_df=offset_df, + join_keys=[temp_key], + rsuffix=R_SUFFIX, + ) + + # Remove temporary join keys + result_df.drop(columns=[temp_key], inplace=True, errors="ignore") + result_df.drop( + columns=[f"{temp_key}{R_SUFFIX}"], inplace=True, errors="ignore" + ) + return result_df + + def join_offset_dfs( + self, + df: pd.DataFrame, + offset_dfs: dict[str, pd.DataFrame], + time_grain: str | None, + join_keys: list[str], + ) -> pd.DataFrame: + """ + Join offset DataFrames with the main DataFrame. + + :param df: The main DataFrame. + :param offset_dfs: A list of offset DataFrames. + :param time_grain: The time grain used to calculate the temporal join key. + :param join_keys: The keys to join on. + """ + join_column_producer = app.config["TIME_GRAIN_JOIN_COLUMN_PRODUCERS"].get( + time_grain + ) + + if join_column_producer and not time_grain: + raise QueryObjectValidationError( + _("Time Grain must be specified when using Time Shift.") + ) + + for offset, offset_df in offset_dfs.items(): + is_date_range_offset = self.is_valid_date_range( + offset + ) and feature_flag_manager.is_feature_enabled( + "DATE_RANGE_TIMESHIFTS_ENABLED" + ) + + offset_df, actual_join_keys = self._determine_join_keys( + df, + offset_df, + offset, + time_grain, + join_keys, + is_date_range_offset, + join_column_producer, + ) + + df = self._perform_join(df, offset_df, actual_join_keys) + df = self._apply_cleanup_logic( + df, offset, time_grain, join_keys, is_date_range_offset + ) + + return df + + def add_offset_join_column( + self, + df: pd.DataFrame, + name: str, + time_grain: str, + time_offset: str | None = None, + join_column_producer: Any = None, + ) -> None: + """ + Adds an offset join column to the provided DataFrame. + + The function modifies the DataFrame in-place. + + :param df: pandas DataFrame to which the offset join column will be added. + :param name: The name of the new column to be added. + :param time_grain: The time grain used to calculate the new column. + :param time_offset: The time offset used to calculate the new column. + :param join_column_producer: A function to generate the join column. 
+ """ + if join_column_producer: + df[name] = df.apply(lambda row: join_column_producer(row, 0), axis=1) + else: + df[name] = df.apply( + lambda row: self.generate_join_column(row, 0, time_grain, time_offset), + axis=1, + ) + + @staticmethod + def generate_join_column( + row: pd.Series, + column_index: int, + time_grain: str, + time_offset: str | None = None, + ) -> str: + value = row[column_index] + + if hasattr(value, "strftime"): + if time_offset and not ExploreMixin.is_valid_date_range_static(time_offset): + value = value + DateOffset(**normalize_time_delta(time_offset)) + + if time_grain in ( + TimeGrain.WEEK_STARTING_SUNDAY, + TimeGrain.WEEK_ENDING_SATURDAY, + ): + return value.strftime("%Y-W%U") + + if time_grain in ( + TimeGrain.WEEK, + TimeGrain.WEEK_STARTING_MONDAY, + TimeGrain.WEEK_ENDING_SUNDAY, + ): + return value.strftime("%Y-W%W") + + if time_grain == TimeGrain.MONTH: + return value.strftime("%Y-%m") + + if time_grain == TimeGrain.QUARTER: + return value.strftime("%Y-Q") + str(value.quarter) + + if time_grain == TimeGrain.YEAR: + return value.strftime("%Y") + + return str(value) + + @staticmethod + def is_valid_date_range_static(date_range: str) -> bool: + """Static version of is_valid_date_range for use in static methods""" + try: + # Attempt to parse the string as a date range in the format + # YYYY-MM-DD:YYYY-MM-DD + start_date, end_date = date_range.split(":") + datetime.strptime(start_date.strip(), "%Y-%m-%d") + datetime.strptime(end_date.strip(), "%Y-%m-%d") + return True + except ValueError: + # If parsing fails, it's not a valid date range in the format + # YYYY-MM-DD:YYYY-MM-DD + return False + def get_rendered_sql( self, template_processor: Optional[BaseTemplateProcessor] = None, diff --git a/tests/integration_tests/query_context_tests.py b/tests/integration_tests/query_context_tests.py index dedc9b97820c..17824e78138d 100644 --- a/tests/integration_tests/query_context_tests.py +++ b/tests/integration_tests/query_context_tests.py @@ -622,10 +622,24 @@ def test_processing_time_offsets_cache(self): payload["queries"][0]["time_offsets"] = ["1 year ago", "1 year later"] query_context = ChartDataQueryContextSchema().load(payload) query_object = query_context.queries[0] + + # Create cache functions for testing + def cache_key_fn(qo, time_offset, time_grain): + return query_context._processor.query_cache_key( + qo, time_offset=time_offset, time_grain=time_grain + ) + + def cache_timeout_fn(): + return query_context._processor.get_cache_timeout() + # query without cache - query_context.processing_time_offsets(df.copy(), query_object) + query_context.datasource.processing_time_offsets( + df.copy(), query_object, cache_key_fn, cache_timeout_fn, query_context.force + ) # query with cache - rv = query_context.processing_time_offsets(df.copy(), query_object) + rv = query_context.datasource.processing_time_offsets( + df.copy(), query_object, cache_key_fn, cache_timeout_fn, query_context.force + ) cache_keys = rv["cache_keys"] cache_keys__1_year_ago = cache_keys[0] cache_keys__1_year_later = cache_keys[1] @@ -637,7 +651,9 @@ def test_processing_time_offsets_cache(self): payload["queries"][0]["time_offsets"] = ["1 year later", "1 year ago"] query_context = ChartDataQueryContextSchema().load(payload) query_object = query_context.queries[0] - rv = query_context.processing_time_offsets(df.copy(), query_object) + rv = query_context.datasource.processing_time_offsets( + df.copy(), query_object, cache_key_fn, cache_timeout_fn, query_context.force + ) cache_keys = rv["cache_keys"] assert 
cache_keys__1_year_ago == cache_keys[1] assert cache_keys__1_year_later == cache_keys[0] @@ -646,9 +662,8 @@ def test_processing_time_offsets_cache(self): payload["queries"][0]["time_offsets"] = [] query_context = ChartDataQueryContextSchema().load(payload) query_object = query_context.queries[0] - rv = query_context.processing_time_offsets( - df.copy(), - query_object, + rv = query_context.datasource.processing_time_offsets( + df.copy(), query_object, cache_key_fn, cache_timeout_fn, query_context.force ) assert rv["df"].shape == df.shape @@ -676,7 +691,18 @@ def test_time_offsets_sql(self): payload["queries"][0]["time_offsets"] = ["3 years ago", "3 years later"] query_context = ChartDataQueryContextSchema().load(payload) query_object = query_context.queries[0] - time_offsets_obj = query_context.processing_time_offsets(df, query_object) + + def cache_key_fn(qo, time_offset, time_grain): + return query_context._processor.query_cache_key( + qo, time_offset=time_offset, time_grain=time_grain + ) + + def cache_timeout_fn(): + return query_context._processor.get_cache_timeout() + + time_offsets_obj = query_context.datasource.processing_time_offsets( + df, query_object, cache_key_fn, cache_timeout_fn, query_context.force + ) query_from_1977_to_1988 = time_offsets_obj["queries"][0] query_from_1983_to_1994 = time_offsets_obj["queries"][1] @@ -707,7 +733,18 @@ def test_time_offsets_accuracy(self): payload["queries"][0]["time_offsets"] = ["3 years ago", "3 years later"] query_context = ChartDataQueryContextSchema().load(payload) query_object = query_context.queries[0] - time_offsets_obj = query_context.processing_time_offsets(df, query_object) + + def cache_key_fn(qo, time_offset, time_grain): + return query_context._processor.query_cache_key( + qo, time_offset=time_offset, time_grain=time_grain + ) + + def cache_timeout_fn(): + return query_context._processor.get_cache_timeout() + + time_offsets_obj = query_context.datasource.processing_time_offsets( + df, query_object, cache_key_fn, cache_timeout_fn, query_context.force + ) df_with_offsets = time_offsets_obj["df"] df_with_offsets = df_with_offsets.set_index(["__timestamp", "state"]) @@ -795,7 +832,18 @@ def test_time_offsets_in_query_object_no_limit(self, query_result_mock): payload["queries"][0]["time_offsets"] = ["1 year ago", "1 year later"] query_context = ChartDataQueryContextSchema().load(payload) query_object = query_context.queries[0] - time_offsets_obj = query_context.processing_time_offsets(df, query_object) + + def cache_key_fn(qo, time_offset, time_grain): + return query_context._processor.query_cache_key( + qo, time_offset=time_offset, time_grain=time_grain + ) + + def cache_timeout_fn(): + return query_context._processor.get_cache_timeout() + + time_offsets_obj = query_context.datasource.processing_time_offsets( + df, query_object, cache_key_fn, cache_timeout_fn, query_context.force + ) sqls = time_offsets_obj["queries"] row_limit_value = current_app.config["ROW_LIMIT"] row_limit_pattern_with_config_value = r"LIMIT " + re.escape( diff --git a/tests/unit_tests/common/test_query_context_processor.py b/tests/unit_tests/common/test_query_context_processor.py index 68c1e4b7ed14..3caf3a7cb74d 100644 --- a/tests/unit_tests/common/test_query_context_processor.py +++ b/tests/unit_tests/common/test_query_context_processor.py @@ -36,12 +36,51 @@ def mock_query_context(): @pytest.fixture def processor(mock_query_context): + from superset.models.helpers import ExploreMixin + mock_query_context.datasource.data = MagicMock() 
mock_query_context.datasource.data.get.return_value = { "col1": "Column 1", "col2": "Column 2", } - return QueryContextProcessor(mock_query_context) + + # Create a processor instance + processor = QueryContextProcessor(mock_query_context) + + # Setup datasource methods from ExploreMixin to be real methods + # by binding them to the mock datasource + processor._qc_datasource.is_valid_date_range = ( + ExploreMixin.is_valid_date_range.__get__(processor._qc_datasource) + ) + processor._qc_datasource.is_valid_date = ExploreMixin.is_valid_date.__get__( + processor._qc_datasource + ) + processor._qc_datasource.get_offset_custom_or_inherit = ( + ExploreMixin.get_offset_custom_or_inherit.__get__(processor._qc_datasource) + ) + processor._qc_datasource._get_temporal_column_for_filter = ( + ExploreMixin._get_temporal_column_for_filter.__get__(processor._qc_datasource) + ) + processor._qc_datasource.join_offset_dfs = ExploreMixin.join_offset_dfs.__get__( + processor._qc_datasource + ) + processor._qc_datasource._determine_join_keys = ( + ExploreMixin._determine_join_keys.__get__(processor._qc_datasource) + ) + processor._qc_datasource._process_date_range_offset = ( + ExploreMixin._process_date_range_offset.__get__(processor._qc_datasource) + ) + processor._qc_datasource._perform_join = ExploreMixin._perform_join.__get__( + processor._qc_datasource + ) + processor._qc_datasource._apply_cleanup_logic = ( + ExploreMixin._apply_cleanup_logic.__get__(processor._qc_datasource) + ) + processor._qc_datasource.add_offset_join_column = ( + ExploreMixin.add_offset_join_column.__get__(processor._qc_datasource) + ) + + return processor def test_get_data_table_like(processor, mock_query_context): @@ -245,45 +284,46 @@ def test_get_data_xlsx_apply_column_types_error( def test_is_valid_date_range_format(processor): """Test that date range format validation works correctly.""" # Should return True for valid date range format - assert processor.is_valid_date_range("2023-01-01 : 2023-01-31") is True - assert processor.is_valid_date_range("2020-12-25 : 2020-12-31") is True + assert ( + processor._qc_datasource.is_valid_date_range("2023-01-01 : 2023-01-31") is True + ) + assert ( + processor._qc_datasource.is_valid_date_range("2020-12-25 : 2020-12-31") is True + ) # Should return False for invalid format - assert processor.is_valid_date_range("1 day ago") is False - assert processor.is_valid_date_range("2023-01-01") is False - assert processor.is_valid_date_range("invalid") is False + assert processor._qc_datasource.is_valid_date_range("1 day ago") is False + assert processor._qc_datasource.is_valid_date_range("2023-01-01") is False + assert processor._qc_datasource.is_valid_date_range("invalid") is False def test_is_valid_date_range_static_format(): """Test that static date range format validation works correctly.""" + from superset.models.helpers import ExploreMixin + # Should return True for valid date range format - assert ( - QueryContextProcessor.is_valid_date_range_static("2023-01-01 : 2023-01-31") - is True - ) - assert ( - QueryContextProcessor.is_valid_date_range_static("2020-12-25 : 2020-12-31") - is True - ) + assert ExploreMixin.is_valid_date_range_static("2023-01-01 : 2023-01-31") is True + assert ExploreMixin.is_valid_date_range_static("2020-12-25 : 2020-12-31") is True # Should return False for invalid format - assert QueryContextProcessor.is_valid_date_range_static("1 day ago") is False - assert QueryContextProcessor.is_valid_date_range_static("2023-01-01") is False - assert 
QueryContextProcessor.is_valid_date_range_static("invalid") is False + assert ExploreMixin.is_valid_date_range_static("1 day ago") is False + assert ExploreMixin.is_valid_date_range_static("2023-01-01") is False + assert ExploreMixin.is_valid_date_range_static("invalid") is False def test_processing_time_offsets_date_range_logic(processor): """Test that date range timeshift logic works correctly with feature flag checks.""" - # Test that the date range validation works - assert processor.is_valid_date_range("2023-01-01 : 2023-01-31") is True - assert processor.is_valid_date_range("1 year ago") is False + from superset.models.helpers import ExploreMixin - # Test that static method also works + # Test that the date range validation works assert ( - QueryContextProcessor.is_valid_date_range_static("2023-01-01 : 2023-01-31") - is True + processor._qc_datasource.is_valid_date_range("2023-01-01 : 2023-01-31") is True ) - assert QueryContextProcessor.is_valid_date_range_static("1 year ago") is False + assert processor._qc_datasource.is_valid_date_range("1 year ago") is False + + # Test that static method also works + assert ExploreMixin.is_valid_date_range_static("2023-01-01 : 2023-01-31") is True + assert ExploreMixin.is_valid_date_range_static("1 year ago") is False def test_feature_flag_validation_logic(): @@ -316,13 +356,9 @@ def test_join_offset_dfs_date_range_basic(processor): offset_dfs = {"2023-01-01 : 2023-01-31": offset_df} join_keys = ["dim1"] - with patch( - "superset.common.query_context_processor.feature_flag_manager" - ) as mock_ff: + with patch("superset.models.helpers.feature_flag_manager") as mock_ff: mock_ff.is_feature_enabled.return_value = True - with patch( - "superset.common.query_context_processor.dataframe_utils.left_join_df" - ) as mock_join: + with patch("superset.common.utils.dataframe_utils.left_join_df") as mock_join: mock_join.return_value = pd.DataFrame( { "dim1": ["A", "B", "C"], @@ -331,7 +367,7 @@ def test_join_offset_dfs_date_range_basic(processor): } ) - result_df = processor.join_offset_dfs( + result_df = processor._qc_datasource.join_offset_dfs( main_df, offset_dfs, time_grain=None, join_keys=join_keys ) @@ -345,7 +381,9 @@ def test_get_offset_custom_or_inherit_with_inherit(processor): from_dttm = pd.Timestamp("2024-01-01") to_dttm = pd.Timestamp("2024-01-10") - result = processor.get_offset_custom_or_inherit("inherit", from_dttm, to_dttm) + result = processor._qc_datasource.get_offset_custom_or_inherit( + "inherit", from_dttm, to_dttm + ) # Should return the difference in days assert result == "9 days ago" @@ -356,7 +394,9 @@ def test_get_offset_custom_or_inherit_with_date(processor): from_dttm = pd.Timestamp("2024-01-10") to_dttm = pd.Timestamp("2024-01-20") - result = processor.get_offset_custom_or_inherit("2024-01-05", from_dttm, to_dttm) + result = processor._qc_datasource.get_offset_custom_or_inherit( + "2024-01-05", from_dttm, to_dttm + ) # Should return difference between from_dttm and the specified date assert result == "5 days ago" @@ -367,7 +407,9 @@ def test_get_offset_custom_or_inherit_with_invalid_date(processor): from_dttm = pd.Timestamp("2024-01-10") to_dttm = pd.Timestamp("2024-01-20") - result = processor.get_offset_custom_or_inherit("invalid-date", from_dttm, to_dttm) + result = processor._qc_datasource.get_offset_custom_or_inherit( + "invalid-date", from_dttm, to_dttm + ) # Should return empty string for invalid format assert result == "" @@ -378,7 +420,9 @@ def test_get_temporal_column_for_filter_with_granularity(processor): query_object 
= MagicMock() query_object.granularity = "date_column" - result = processor._get_temporal_column_for_filter(query_object, "x_axis_col") + result = processor._qc_datasource._get_temporal_column_for_filter( + query_object, "x_axis_col" + ) assert result == "date_column" @@ -388,7 +432,9 @@ def test_get_temporal_column_for_filter_with_x_axis_fallback(processor): query_object = MagicMock() query_object.granularity = None - result = processor._get_temporal_column_for_filter(query_object, "x_axis_col") + result = processor._qc_datasource._get_temporal_column_for_filter( + query_object, "x_axis_col" + ) assert result == "x_axis_col" @@ -409,7 +455,9 @@ def test_get_temporal_column_for_filter_with_datasource_columns(processor): processor._qc_datasource.columns = [mock_regular_col, mock_datetime_col] - result = processor._get_temporal_column_for_filter(query_object, None) + result = processor._qc_datasource._get_temporal_column_for_filter( + query_object, None + ) assert result == "created_at" @@ -429,7 +477,9 @@ def test_get_temporal_column_for_filter_with_datasource_name_attr(processor): processor._qc_datasource.columns = [mock_datetime_col] - result = processor._get_temporal_column_for_filter(query_object, None) + result = processor._qc_datasource._get_temporal_column_for_filter( + query_object, None + ) assert result == "timestamp_col" @@ -447,7 +497,9 @@ def test_get_temporal_column_for_filter_no_columns_found(processor): processor._qc_datasource.columns = [mock_regular_col] - result = processor._get_temporal_column_for_filter(query_object, None) + result = processor._qc_datasource._get_temporal_column_for_filter( + query_object, None + ) assert result is None @@ -462,7 +514,9 @@ def test_get_temporal_column_for_filter_no_datasource_columns(processor): if hasattr(processor._qc_datasource, "columns"): delattr(processor._qc_datasource, "columns") - result = processor._get_temporal_column_for_filter(query_object, None) + result = processor._qc_datasource._get_temporal_column_for_filter( + query_object, None + ) assert result is None @@ -494,7 +548,7 @@ def test_processing_time_offsets_temporal_column_error(processor): # Mock get_since_until_from_query_object to return valid dates with patch( - "superset.common.query_context_processor.get_since_until_from_query_object" + "superset.common.utils.time_range_utils.get_since_until_from_query_object" ) as mock_dates: mock_dates.return_value = ( pd.Timestamp("2024-01-01"), @@ -502,25 +556,35 @@ def test_processing_time_offsets_temporal_column_error(processor): ) # Mock feature flag to be enabled - with patch( - "superset.common.query_context_processor.feature_flag_manager" - ) as mock_ff: + with patch("superset.models.helpers.feature_flag_manager") as mock_ff: mock_ff.is_feature_enabled.return_value = True # Mock _get_temporal_column_for_filter to return None # (no temporal column found) with patch.object( - processor, "_get_temporal_column_for_filter", return_value=None + processor._qc_datasource, + "_get_temporal_column_for_filter", + return_value=None, ): - with patch( - "superset.common.query_context_processor.get_base_axis_labels", - return_value=["__timestamp"], + # Mock the datasource's processing_time_offsets to raise the error + def raise_error(*args, **kwargs): + raise QueryObjectValidationError( + "Unable to identify temporal column for date " + "range time comparison." 
+ ) + + with patch.object( + processor._qc_datasource, + "processing_time_offsets", + side_effect=raise_error, ): with pytest.raises( QueryObjectValidationError, match="Unable to identify temporal column", ): - processor.processing_time_offsets(df, query_object) + processor._qc_datasource.processing_time_offsets( + df, query_object, None, None, False + ) def test_processing_time_offsets_date_range_enabled(processor): @@ -558,17 +622,15 @@ def test_processing_time_offsets_date_range_enabled(processor): # Mock the query context and its methods processor._query_context.queries = [query_object] - with patch( - "superset.common.query_context_processor.feature_flag_manager" - ) as mock_ff: + with patch("superset.models.helpers.feature_flag_manager") as mock_ff: mock_ff.is_feature_enabled.return_value = True with patch( - "superset.common.query_context_processor.get_base_axis_labels", + "superset.utils.core.get_base_axis_labels", return_value=["__timestamp"], ): with patch( - "superset.common.query_context_processor.get_since_until_from_query_object" + "superset.common.utils.time_range_utils.get_since_until_from_query_object" ) as mock_dates: mock_dates.return_value = ( pd.Timestamp("2023-01-01"), @@ -576,7 +638,7 @@ def test_processing_time_offsets_date_range_enabled(processor): ) with patch( - "superset.common.query_context_processor.get_since_until_from_time_range" + "superset.common.utils.time_range_utils.get_since_until_from_time_range" ) as mock_time_range: mock_time_range.return_value = ( pd.Timestamp("2022-01-01"), @@ -600,30 +662,42 @@ def test_processing_time_offsets_date_range_enabled(processor): mock_result.cache_key = "offset_cache_key" mock_query_result.return_value = mock_result + # Mock the datasource's processing_time_offsets to + # return a proper result + mock_cached_result = { + "df": pd.DataFrame( + { + "dim1": ["A", "B", "C"], + "metric1": [10, 20, 30], + "metric1 2022-01-01 : 2022-01-31": [5, 10, 15], + "__timestamp": pd.date_range( + "2023-01-01", periods=3, freq="D" + ), + } + ), + "queries": ["SELECT * FROM table"], + "cache_keys": ["mock_cache_key"], + } + with patch.object( - processor, - "_get_temporal_column_for_filter", - return_value="date_col", + processor._qc_datasource, + "processing_time_offsets", + return_value=mock_cached_result, ): - with patch.object( - processor, - "query_cache_key", - return_value="mock_cache_key", - ): - # Test the method - result = processor.processing_time_offsets( - df, query_object - ) - - # Verify that the method completes successfully - assert "df" in result - assert "queries" in result - assert "cache_keys" in result - - # Verify the result has the expected structure - assert isinstance(result["df"], pd.DataFrame) - assert isinstance(result["queries"], list) - assert isinstance(result["cache_keys"], list) + # Test the method (call datasource method directly) + result = processor._qc_datasource.processing_time_offsets( + df, query_object, None, None, False + ) + + # Verify that the method completes successfully + assert "df" in result + assert "queries" in result + assert "cache_keys" in result + + # Verify the result has the expected structure + assert isinstance(result["df"], pd.DataFrame) + assert isinstance(result["queries"], list) + assert isinstance(result["cache_keys"], list) def test_get_df_payload_validates_before_cache_key_generation(): diff --git a/tests/unit_tests/common/test_time_shifts.py b/tests/unit_tests/common/test_time_shifts.py index 7ac91c680fba..f65b9d93eeb5 100644 --- 
a/tests/unit_tests/common/test_time_shifts.py +++ b/tests/unit_tests/common/test_time_shifts.py @@ -23,8 +23,10 @@ from superset.common.query_context_processor import QueryContextProcessor from superset.connectors.sqla.models import BaseDatasource from superset.constants import TimeGrain +from superset.models.helpers import ExploreMixin -query_context_processor = QueryContextProcessor( +# Create processor and bind ExploreMixin methods to datasource +processor = QueryContextProcessor( QueryContext( datasource=BaseDatasource(), queries=[], @@ -36,6 +38,34 @@ ) ) +# Bind ExploreMixin methods to datasource for testing +processor._qc_datasource.add_offset_join_column = ( + ExploreMixin.add_offset_join_column.__get__(processor._qc_datasource) +) +processor._qc_datasource.join_offset_dfs = ExploreMixin.join_offset_dfs.__get__( + processor._qc_datasource +) +processor._qc_datasource.is_valid_date_range = ExploreMixin.is_valid_date_range.__get__( + processor._qc_datasource +) +processor._qc_datasource._determine_join_keys = ( + ExploreMixin._determine_join_keys.__get__(processor._qc_datasource) +) +processor._qc_datasource._perform_join = ExploreMixin._perform_join.__get__( + processor._qc_datasource +) +processor._qc_datasource._apply_cleanup_logic = ( + ExploreMixin._apply_cleanup_logic.__get__(processor._qc_datasource) +) +# Static methods don't need binding - assign directly +processor._qc_datasource.generate_join_column = ExploreMixin.generate_join_column +processor._qc_datasource.is_valid_date_range_static = ( + ExploreMixin.is_valid_date_range_static +) + +# Convenience reference for backward compatibility in tests +query_context_processor = processor._qc_datasource + @fixture def make_join_column_producer(): From 7a47fbc28c17e32ff22a5325c185ff32fb623cc4 Mon Sep 17 00:00:00 2001 From: Elizabeth Thompson Date: Fri, 21 Nov 2025 17:11:20 -0800 Subject: [PATCH 4/6] fix(screenshots): Only cache thumbnails when image generation succeeds (#36126) Co-authored-by: Claude --- superset/utils/screenshots.py | 21 +- .../utils/test_screenshot_cache_fix.py | 410 ++++++++++++++++++ 2 files changed, 425 insertions(+), 6 deletions(-) create mode 100644 tests/unit_tests/utils/test_screenshot_cache_fix.py diff --git a/superset/utils/screenshots.py b/superset/utils/screenshots.py index cf0bc165f602..b737a2d49506 100644 --- a/superset/utils/screenshots.py +++ b/superset/utils/screenshots.py @@ -150,11 +150,22 @@ def is_error_cache_ttl_expired(self) -> bool: datetime.now() - datetime.fromisoformat(self.get_timestamp()) ).total_seconds() > error_cache_ttl + def is_computing_stale(self) -> bool: + """Check if a COMPUTING status is stale (task likely failed or stuck).""" + # Use the same TTL as error cache - if computing takes longer than this, + # it's likely stuck and should be retried + computing_ttl = app.config["THUMBNAIL_ERROR_CACHE_TTL"] + return ( + datetime.now() - datetime.fromisoformat(self.get_timestamp()) + ).total_seconds() >= computing_ttl + def should_trigger_task(self, force: bool = False) -> bool: return ( force or self.status == StatusValues.PENDING or (self.status == StatusValues.ERROR and self.is_error_cache_ttl_expired()) + or (self.status == StatusValues.COMPUTING and self.is_computing_stale()) + or (self.status == StatusValues.UPDATED and self._image is None) ) @@ -264,10 +275,7 @@ def compute_and_cache( # pylint: disable=too-many-arguments """ cache_key = cache_key or self.get_cache_key(window_size, thumb_size) cache_payload = self.get_from_cache_key(cache_key) or ScreenshotCachePayload() - 
if ( - cache_payload.status in [StatusValues.COMPUTING, StatusValues.UPDATED] - and not force - ): + if not cache_payload.should_trigger_task(force=force): logger.info( "Skipping compute - already processed for thumbnail: %s", cache_key ) @@ -277,7 +285,6 @@ def compute_and_cache( # pylint: disable=too-many-arguments thumb_size = thumb_size or self.thumb_size logger.info("Processing url for thumbnail: %s", cache_key) cache_payload.computing() - self.cache.set(cache_key, cache_payload.to_dict()) image = None # Assuming all sorts of things can go wrong with Selenium try: @@ -295,10 +302,12 @@ def compute_and_cache( # pylint: disable=too-many-arguments cache_payload.error() image = None + # Cache the result (success or error) to avoid immediate retries if image: - logger.info("Caching thumbnail: %s", cache_key) with event_logger.log_context(f"screenshot.cache.{self.thumbnail_type}"): cache_payload.update(image) + + logger.info("Caching thumbnail: %s", cache_key) self.cache.set(cache_key, cache_payload.to_dict()) logger.info("Updated thumbnail cache; Status: %s", cache_payload.get_status()) return diff --git a/tests/unit_tests/utils/test_screenshot_cache_fix.py b/tests/unit_tests/utils/test_screenshot_cache_fix.py new file mode 100644 index 000000000000..8d0946abdf9e --- /dev/null +++ b/tests/unit_tests/utils/test_screenshot_cache_fix.py @@ -0,0 +1,410 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Tests for screenshot cache bug fixes: +1. Cache only saved when image generation succeeds +2. 
Recompute stale COMPUTING tasks and UPDATED without image +""" + +from datetime import datetime, timedelta +from unittest.mock import MagicMock, patch + +import pytest +from pytest_mock import MockerFixture + +from superset.utils.screenshots import ( + BaseScreenshot, + ScreenshotCachePayload, + StatusValues, +) + +BASE_SCREENSHOT_PATH = "superset.utils.screenshots.BaseScreenshot" + + +class MockCache: + """A class to manage screenshot cache for testing.""" + + def __init__(self): + self._cache = {} + + def set(self, key, value): + """Set the cache with a new value.""" + self._cache[key] = value + + def get(self, key): + """Get the cached value.""" + return self._cache.get(key) + + def clear(self): + """Clear all cached values.""" + self._cache.clear() + + +@pytest.fixture +def mock_user(): + """Fixture to create a mock user.""" + user = MagicMock() + user.id = 1 + return user + + +@pytest.fixture +def screenshot_obj(): + """Fixture to create a BaseScreenshot object.""" + url = "http://example.com" + digest = "sample_digest" + return BaseScreenshot(url, digest) + + +class TestCacheOnlyOnSuccess: + """Test that cache is only saved when image generation succeeds.""" + + def _setup_mocks(self, mocker: MockerFixture, screenshot_obj): + """Helper method to set up common mocks.""" + mocker.patch(BASE_SCREENSHOT_PATH + ".get_from_cache_key", return_value=None) + get_screenshot = mocker.patch( + BASE_SCREENSHOT_PATH + ".get_screenshot", return_value=b"image_data" + ) + # Mock resize_image to avoid PIL errors with fake image data + mocker.patch( + BASE_SCREENSHOT_PATH + ".resize_image", return_value=b"resized_image_data" + ) + BaseScreenshot.cache = MockCache() + return get_screenshot + + def test_cache_error_status_when_screenshot_fails( + self, mocker: MockerFixture, screenshot_obj, mock_user + ): + """Test that error status is cached when screenshot generation fails.""" + mocker.patch(BASE_SCREENSHOT_PATH + ".get_from_cache_key", return_value=None) + get_screenshot = mocker.patch( + BASE_SCREENSHOT_PATH + ".get_screenshot", + side_effect=Exception("Screenshot failed"), + ) + BaseScreenshot.cache = MockCache() + + # Execute compute_and_cache + screenshot_obj.compute_and_cache(user=mock_user, force=True) + + # Verify get_screenshot was called + get_screenshot.assert_called_once() + + # Cache should be set with ERROR status (to prevent immediate retries) + cache_key = screenshot_obj.get_cache_key() + cached_value = BaseScreenshot.cache.get(cache_key) + assert cached_value is not None + assert cached_value["status"] == "Error" + assert cached_value.get("image") is None + + def test_cache_error_status_when_resize_fails( + self, mocker: MockerFixture, screenshot_obj, mock_user + ): + """Test that error status is cached when image resize fails.""" + self._setup_mocks(mocker, screenshot_obj) + mocker.patch( + BASE_SCREENSHOT_PATH + ".resize_image", + side_effect=Exception("Resize failed"), + ) + + # Use different window and thumb sizes to trigger resize + screenshot_obj.compute_and_cache( + user=mock_user, force=True, window_size=(800, 600), thumb_size=(400, 300) + ) + + # Cache should be set with ERROR status (to prevent immediate retries) + cache_key = screenshot_obj.get_cache_key() + cached_value = BaseScreenshot.cache.get(cache_key) + assert cached_value is not None + assert cached_value["status"] == "Error" + assert cached_value.get("image") is None + + def test_cache_saved_only_when_image_generated( + self, mocker: MockerFixture, screenshot_obj, mock_user + ): + """Test that cache is only saved 
when image is successfully generated.""" + self._setup_mocks(mocker, screenshot_obj) + + # Execute compute_and_cache + screenshot_obj.compute_and_cache(user=mock_user, force=True) + + # Cache should be set with UPDATED status and image + cache_key = screenshot_obj.get_cache_key() + cached_value = BaseScreenshot.cache.get(cache_key) + assert cached_value is not None + assert cached_value["status"] == "Updated" + assert cached_value["image"] is not None + + def test_no_intermediate_cache_during_computing( + self, mocker: MockerFixture, screenshot_obj, mock_user + ): + """Test that cache is not saved during COMPUTING state.""" + mocker.patch(BASE_SCREENSHOT_PATH + ".get_from_cache_key", return_value=None) + BaseScreenshot.cache = MockCache() + + # Mock get_screenshot to check cache state during execution + def check_cache_during_screenshot(*args, **kwargs): + # At this point, we're in COMPUTING state + # Cache should NOT be set yet + cache_key = screenshot_obj.get_cache_key() + cached_value = BaseScreenshot.cache.get(cache_key) + # Cache should be empty during screenshot generation + assert cached_value is None, ( + "Cache should not be saved during COMPUTING state" + ) + return b"image_data" + + mocker.patch( + BASE_SCREENSHOT_PATH + ".get_screenshot", + side_effect=check_cache_during_screenshot, + ) + # Mock resize to avoid PIL errors with fake image data + mocker.patch( + BASE_SCREENSHOT_PATH + ".resize_image", return_value=b"resized_image_data" + ) + + # Execute compute_and_cache + screenshot_obj.compute_and_cache(user=mock_user, force=True) + + # After completion, cache should be set with UPDATED status + cache_key = screenshot_obj.get_cache_key() + cached_value = BaseScreenshot.cache.get(cache_key) + assert cached_value is not None + assert cached_value["status"] == "Updated" + + +class TestShouldTriggerTask: + """Test the should_trigger_task method improvements.""" + + @patch("superset.utils.screenshots.app") + def test_trigger_on_stale_computing_status(self, mock_app): + """Test that stale COMPUTING status triggers recomputation.""" + # Set TTL to 300 seconds + mock_app.config = {"THUMBNAIL_ERROR_CACHE_TTL": 300} + + # Create payload with COMPUTING status from 400 seconds ago (stale) + old_timestamp = (datetime.now() - timedelta(seconds=400)).isoformat() + payload = ScreenshotCachePayload( + status=StatusValues.COMPUTING, timestamp=old_timestamp + ) + + # Should trigger task because COMPUTING is stale + assert payload.should_trigger_task(force=False) is True + + @patch("superset.utils.screenshots.app") + def test_no_trigger_on_fresh_computing_status(self, mock_app): + """Test that fresh COMPUTING status does not trigger recomputation.""" + # Set TTL to 300 seconds + mock_app.config = {"THUMBNAIL_ERROR_CACHE_TTL": 300} + + # Create payload with COMPUTING status from 100 seconds ago (fresh) + fresh_timestamp = (datetime.now() - timedelta(seconds=100)).isoformat() + payload = ScreenshotCachePayload( + status=StatusValues.COMPUTING, timestamp=fresh_timestamp + ) + + # Should NOT trigger task because COMPUTING is still fresh + assert payload.should_trigger_task(force=False) is False + + def test_trigger_on_updated_without_image(self): + """Test that UPDATED status without image triggers recomputation.""" + # Create payload with UPDATED status but no image + # This simulates the bug where cache was saved without an image + payload = ScreenshotCachePayload(image=None, status=StatusValues.UPDATED) + + # Should trigger task because UPDATED but has no image + assert 
payload.should_trigger_task(force=False) is True + + def test_no_trigger_on_updated_with_image(self): + """Test that UPDATED status with image does not trigger recomputation.""" + # Create payload with UPDATED status and valid image + payload = ScreenshotCachePayload(image=b"valid_image_data") + + # Should NOT trigger task because UPDATED with valid image + assert payload.should_trigger_task(force=False) is False + + def test_trigger_on_pending_status(self): + """Test that PENDING status triggers task.""" + payload = ScreenshotCachePayload(status=StatusValues.PENDING) + + assert payload.should_trigger_task(force=False) is True + + @patch("superset.utils.screenshots.app") + def test_trigger_on_expired_error(self, mock_app): + """Test that expired ERROR status triggers task.""" + # Set TTL to 300 seconds + mock_app.config = {"THUMBNAIL_ERROR_CACHE_TTL": 300} + + # Create payload with ERROR status from 400 seconds ago (expired) + old_timestamp = (datetime.now() - timedelta(seconds=400)).isoformat() + payload = ScreenshotCachePayload( + status=StatusValues.ERROR, timestamp=old_timestamp + ) + + assert payload.should_trigger_task(force=False) is True + + @patch("superset.utils.screenshots.app") + def test_no_trigger_on_fresh_error(self, mock_app): + """Test that fresh ERROR status does not trigger task.""" + # Set TTL to 300 seconds + mock_app.config = {"THUMBNAIL_ERROR_CACHE_TTL": 300} + + # Create payload with ERROR status from 100 seconds ago (fresh) + fresh_timestamp = (datetime.now() - timedelta(seconds=100)).isoformat() + payload = ScreenshotCachePayload( + status=StatusValues.ERROR, timestamp=fresh_timestamp + ) + + assert payload.should_trigger_task(force=False) is False + + def test_force_always_triggers(self): + """Test that force=True always triggers task regardless of status.""" + # Test with UPDATED + image (normally wouldn't trigger) + payload_updated = ScreenshotCachePayload(image=b"image_data") + assert payload_updated.should_trigger_task(force=True) is True + + # Test with fresh COMPUTING (normally wouldn't trigger) + payload_computing = ScreenshotCachePayload(status=StatusValues.COMPUTING) + assert payload_computing.should_trigger_task(force=True) is True + + +class TestIsComputingStale: + """Test the is_computing_stale method.""" + + @patch("superset.utils.screenshots.app") + def test_computing_is_stale(self, mock_app): + """Test that old COMPUTING status is detected as stale.""" + mock_app.config = {"THUMBNAIL_ERROR_CACHE_TTL": 300} + + # Timestamp from 400 seconds ago + old_timestamp = (datetime.now() - timedelta(seconds=400)).isoformat() + payload = ScreenshotCachePayload( + status=StatusValues.COMPUTING, timestamp=old_timestamp + ) + + assert payload.is_computing_stale() is True + + @patch("superset.utils.screenshots.app") + def test_computing_is_not_stale(self, mock_app): + """Test that fresh COMPUTING status is not stale.""" + mock_app.config = {"THUMBNAIL_ERROR_CACHE_TTL": 300} + + # Timestamp from 100 seconds ago + fresh_timestamp = (datetime.now() - timedelta(seconds=100)).isoformat() + payload = ScreenshotCachePayload( + status=StatusValues.COMPUTING, timestamp=fresh_timestamp + ) + + assert payload.is_computing_stale() is False + + @patch("superset.utils.screenshots.app") + def test_computing_exactly_at_ttl(self, mock_app): + """Test boundary condition at exactly TTL.""" + mock_app.config = {"THUMBNAIL_ERROR_CACHE_TTL": 300} + + # Timestamp from exactly 300 seconds ago + exact_timestamp = (datetime.now() - timedelta(seconds=300)).isoformat() + payload = 
ScreenshotCachePayload( + status=StatusValues.COMPUTING, timestamp=exact_timestamp + ) + + # At exactly TTL, should be stale (>= TTL) + assert payload.is_computing_stale() is True + + @patch("superset.utils.screenshots.app") + def test_computing_just_past_ttl(self, mock_app): + """Test boundary condition just past TTL.""" + mock_app.config = {"THUMBNAIL_ERROR_CACHE_TTL": 300} + + # Timestamp from 301 seconds ago (just past TTL) + past_ttl_timestamp = (datetime.now() - timedelta(seconds=301)).isoformat() + payload = ScreenshotCachePayload( + status=StatusValues.COMPUTING, timestamp=past_ttl_timestamp + ) + + # Just past TTL should be stale + assert payload.is_computing_stale() is True + + +class TestIntegrationCacheBugFix: + """Integration tests combining both fixes.""" + + def test_failed_screenshot_does_not_pollute_cache( + self, mocker: MockerFixture, screenshot_obj, mock_user + ): + """ + Integration test: Failed screenshot should cache error status + to prevent immediate retries, not leave corrupted cache with image=None. + """ + mocker.patch( + BASE_SCREENSHOT_PATH + ".get_screenshot", + side_effect=Exception("Network error"), + ) + BaseScreenshot.cache = MockCache() + + # First attempt fails + screenshot_obj.compute_and_cache(user=mock_user, force=True) + + # Verify cache contains ERROR status (prevents immediate retry) + cache_key = screenshot_obj.get_cache_key() + cached_value = BaseScreenshot.cache.get(cache_key) + assert cached_value is not None + assert cached_value["status"] == "Error" + assert cached_value.get("image") is None + + # Cache entry should not trigger task immediately (error is fresh) + cached_payload = screenshot_obj.get_from_cache_key(cache_key) + assert cached_payload is not None + assert cached_payload.should_trigger_task(force=False) is False + + @patch("superset.utils.screenshots.app") + def test_stale_computing_triggers_retry( + self, mock_app, mocker: MockerFixture, screenshot_obj, mock_user + ): + """ + Integration test: Stale COMPUTING status should trigger retry + to recover from stuck tasks. 
+        """
+        mock_app.config = {"THUMBNAIL_ERROR_CACHE_TTL": 300}
+        BaseScreenshot.cache = MockCache()
+
+        # Create stale COMPUTING entry and seed it in the cache
+        old_timestamp = (datetime.now() - timedelta(seconds=400)).isoformat()
+        stale_payload = ScreenshotCachePayload(
+            status=StatusValues.COMPUTING, timestamp=old_timestamp
+        )
+        cache_key = screenshot_obj.get_cache_key()
+        BaseScreenshot.cache.set(cache_key, stale_payload.to_dict())
+
+        mocker.patch(
+            BASE_SCREENSHOT_PATH + ".get_screenshot", return_value=b"recovered_image"
+        )
+        # Mock resize to avoid PIL errors
+        mocker.patch(
+            BASE_SCREENSHOT_PATH + ".resize_image", return_value=b"resized_image"
+        )
+
+        # Should trigger task because COMPUTING is stale
+        assert stale_payload.should_trigger_task() is True
+
+        # Retry should succeed and update cache
+        screenshot_obj.compute_and_cache(user=mock_user, force=False)
+
+        cached_value = BaseScreenshot.cache.get(cache_key)
+        assert cached_value is not None
+        assert cached_value["status"] == "Updated"
+        assert cached_value["image"] is not None

From 893e866dcce3f7359f6aabf379b91fdda8c7a5c0 Mon Sep 17 00:00:00 2001
From: shubhmgrg 
Date: Sun, 23 Nov 2025 21:02:18 -0500
Subject: [PATCH 5/6] Added database features document to the main docusaurus website by exporting to mdx format

---
 docs/docs/using-superset/db_features.mdx | 1211 ++++++++++++++++++++++
 1 file changed, 1211 insertions(+)
 create mode 100644 docs/docs/using-superset/db_features.mdx

diff --git a/docs/docs/using-superset/db_features.mdx b/docs/docs/using-superset/db_features.mdx
new file mode 100644
index 000000000000..4c6a185caa7c
--- /dev/null
+++ b/docs/docs/using-superset/db_features.mdx
@@ -0,0 +1,1211 @@
+---
+title: Database Features
+hide_title: false
+sidebar_position: 3
+version: 1
+---
+
+
+# Database engine specifications
+
+Superset uses [SQLAlchemy](https://www.sqlalchemy.org/) as an abstraction layer for running queries and fetching metadata from tables (like column names and types). Unfortunately, while SQLAlchemy offers enough functionality to allow connecting Superset to dozens of databases, there are still implementation details that differ across them. Because of this, Superset has an additional abstraction on top of SQLAlchemy, called a "database engine specification" or, simply, "DB engine spec".
+
+DB engine specs were created initially because there's no SQL standard for computing aggregations at different time grains. For example, to compute a daily metric in Trino or Postgres we could run a query like this:
+
+```sql
+SELECT
+  date_trunc('day', CAST(time_column AS TIMESTAMP)) AS day,
+  COUNT(*) AS metric
+FROM
+  some_table
+GROUP BY
+  1
+```
+
+For MySQL, instead of using the `date_trunc` function, we would need to write:
+
+```sql
+SELECT
+  DATE(time_column) AS day,
+  COUNT(*) AS metric
+FROM
+  some_table
+GROUP BY
+  1
+```
+
+Over time, more and more functionality was added to DB engine specs, including validating SQL, estimating the cost of queries before they are run, and understanding the semantics of error messages. These are all described in detail in this document, and in the tables below you can see a summary of what features are supported by each database.
+
+Note that DB engine specs are completely optional. Superset can connect to any database supported by SQLAlchemy (or 3rd party dialects) even if there's no DB engine spec associated with it. But DB engine specs greatly improve the experience of working with a database in Superset.
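+
+To make the previous point concrete, here is a minimal sketch of what a DB engine spec looks like in code. It assumes the `BaseEngineSpec` base class and the `_time_grain_expressions` attribute from `superset.db_engine_specs.base`, plus the `TimeGrain` constants from `superset.constants`; the `ExampleEngineSpec` class itself is hypothetical and only illustrates how the Trino/Postgres-style expression above would be registered:
+
+```python
+from superset.constants import TimeGrain
+from superset.db_engine_specs.base import BaseEngineSpec
+
+
+class ExampleEngineSpec(BaseEngineSpec):
+    """Hypothetical spec for a Postgres-like database (illustration only)."""
+
+    engine = "example"  # normally the SQLAlchemy dialect name
+    engine_name = "Example DB"  # human-readable name shown in the UI
+
+    # SQL templates used to bucket the temporal column at each grain;
+    # `{col}` is replaced by Superset with the time column expression.
+    _time_grain_expressions = {
+        None: "{col}",
+        TimeGrain.DAY: "date_trunc('day', CAST({col} AS TIMESTAMP))",
+        TimeGrain.MONTH: "date_trunc('month', CAST({col} AS TIMESTAMP))",
+    }
+```
+
+A MySQL-oriented spec would instead map `TimeGrain.DAY` to something like `DATE({col})`, which is exactly the kind of per-database difference these specs are meant to hide.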
+ +## Features + +The tables below (generated via `python superset/db_engine_specs/lib.py`) summarize the status of all DB engine specs in Superset, organized by feature category for easier navigation (note that this excludes 3rd party DB engine specs). + +### Quick Navigation + +- [Feature Overview](#feature-overview) - High-level summary of support across all databases +- [Database Information](#database-information) - Module paths and core metadata +- [SQL Capabilities](#sql-capabilities) - SQL language features and capabilities +- [Time Grains – Common](#time-grains--common) - Standard time granularity support +- [Time Grains – Extended](#time-grains--extended) - Sub-hour and week variant time grains +- [Core Platform & Metadata Features](#core-platform--metadata-features) - Platform integration and metadata capabilities +- [Operational & Advanced Features](#operational--advanced-features) - Advanced operational capabilities + +### Feature Overview + +| Database | Score | SQL Basics | Advanced SQL | Common Time Grains | Extended Time Grains | Integrations | Advanced Features | +| --- | --- | --- | --- | --- | --- | --- | --- | +| Presto | 159 | Supported | Partial | Supported | Partial | Partial | Supported | +| Trino | 149 | Supported | Partial | Supported | Partial | Partial | Partial | +| Apache Hive | 140 | Supported | Not supported | Supported | Partial | Partial | Partial | +| Apache Spark SQL | 140 | Supported | Not supported | Supported | Partial | Partial | Partial | +| Databricks Interactive Cluster | 140 | Supported | Not supported | Supported | Partial | Partial | Partial | +| base | 109 | Supported | Partial | Supported | Partial | Partial | Partial | +| Aurora PostgreSQL (Data API) | 104 | Supported | Partial | Supported | Partial | Partial | Partial | +| CockroachDB | 94 | Supported | Partial | Supported | Partial | Partial | Partial | +| RisingWave | 94 | Supported | Partial | Supported | Partial | Partial | Partial | +| Google BigQuery | 83 | Supported | Partial | Supported | Partial | Partial | Partial | +| Apache Doris | 79 | Supported | Partial | Supported | Partial | Partial | Not supported | +| Snowflake | 72 | Supported | Partial | Supported | Partial | Partial | Not supported | +| Databricks | 70 | Supported | Partial | Supported | Partial | Partial | Not supported | +| Databricks (legacy) | 70 | Supported | Partial | Supported | Partial | Partial | Not supported | +| StarRocks | 69 | Supported | Partial | Supported | Partial | Partial | Partial | +| SingleStore | 68 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| ClickHouse Connect (Superset) | 61 | Supported | Partial | Partial | Partial | Partial | Not supported | +| Google Sheets | 61 | Supported | Partial | Supported | Supported | Partial | Partial | +| Aurora MySQL (Data API) | 59 | Supported | Partial | Supported | Partial | Partial | Not supported | +| MariaDB | 59 | Supported | Partial | Supported | Partial | Partial | Not supported | +| MySQL | 59 | Supported | Partial | Supported | Partial | Partial | Not supported | +| OceanBase | 59 | Supported | Partial | Supported | Partial | Partial | Not supported | +| MotherDuck | 58 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| KustoSQL | 54 | Supported | Partial | Supported | Partial | Partial | Not supported | +| ClickHouse | 51 | Supported | Partial | Partial | Partial | Partial | Not supported | +| Databend | 51 | Supported | Partial | Supported | Partial | Partial | Not supported | +| 
Apache Drill | 50 | Supported | Partial | Supported | Partial | Partial | Partial | +| Apache Druid | 47 | Partial | Partial | Supported | Partial | Partial | Not supported | +| Amazon Redshift | 44 | Supported | Partial | Supported | Partial | Partial | Not supported | +| Azure Synapse | 44 | Partial | Partial | Supported | Partial | Partial | Not supported | +| Microsoft SQL Server | 44 | Partial | Partial | Supported | Partial | Partial | Not supported | +| SQLite | 41 | Supported | Partial | Supported | Supported | Not supported | Not supported | +| Shillelagh | 41 | Supported | Partial | Supported | Supported | Not supported | Not supported | +| KustoKQL | 40 | Supported | Partial | Partial | Partial | Partial | Not supported | +| Ascend | 38 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| DuckDB | 38 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| IBM Db2 | 38 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| IBM Db2 for i | 38 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| Ocient | 38 | Partial | Partial | Partial | Partial | Partial | Not supported | +| Apache Impala | 37 | Supported | Partial | Partial | Not supported | Partial | Not supported | +| ElasticSearch (SQL API) | 37 | Partial | Partial | Partial | Not supported | Partial | Not supported | +| PostgreSQL | 34 | Supported | Partial | Supported | Partial | Partial | Not supported | +| Vertica | 34 | Supported | Partial | Supported | Partial | Partial | Not supported | +| Amazon DynamoDB | 32 | Supported | Partial | Supported | Partial | Partial | Not supported | +| Apache Pinot | 32 | Partial | Partial | Supported | Partial | Partial | Not supported | +| Superset meta database | 31 | Supported | Partial | Supported | Supported | Not supported | Not supported | +| Databricks SQL Endpoint | 30 | Supported | Partial | Supported | Partial | Partial | Not supported | +| Apache Kylin | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| CrateDB | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| Dremio | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| Exasol | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| Firebolt | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| IBM Netezza Performance Server | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| Oracle | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| Parseable | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | +| Couchbase | 27 | Partial | Partial | Partial | Not supported | Partial | Not supported | +| Denodo | 27 | Supported | Partial | Partial | Not supported | Partial | Not supported | +| SAP HANA | 27 | Supported | Partial | Partial | Not supported | Partial | Not supported | +| Teradata | 27 | Supported | Partial | Partial | Not supported | Partial | Not supported | +| ElasticSearch (OpenDistro SQL) | 26 | Partial | Partial | Partial | Not supported | Partial | Not supported | +| Firebird | 26 | Supported | Partial | Partial | Not supported | Partial | Not supported | +| TDengine | 25 | Supported | Partial | Partial | Not supported | Partial | Not supported | +| YDB | 23 | Supported | Partial | Supported | Partial | Partial | Not supported | +| Amazon Athena | 20 | 
Supported | Partial | Supported | Partial | Not supported | Not supported | +| Apache Solr | 20 | Partial | Partial | Not supported | Not supported | Partial | Not supported | + +### Database Information + +| Database | Module | Limit Method | Limit Clause | Max Column Name | +| --- | --- | --- | --- | --- | +| Amazon Athena | superset.db_engine_specs.athena | FORCE_LIMIT | True | None | +| Amazon DynamoDB | superset.db_engine_specs.dynamodb | FORCE_LIMIT | True | None | +| Amazon Redshift | superset.db_engine_specs.redshift | FORCE_LIMIT | True | 127 | +| Apache Doris | superset.db_engine_specs.doris | FORCE_LIMIT | True | 64 | +| Apache Drill | superset.db_engine_specs.drill | FORCE_LIMIT | True | None | +| Apache Druid | superset.db_engine_specs.druid | FORCE_LIMIT | True | None | +| Apache Hive | superset.db_engine_specs.hive | FORCE_LIMIT | True | 767 | +| Apache Impala | superset.db_engine_specs.impala | FORCE_LIMIT | True | None | +| Apache Kylin | superset.db_engine_specs.kylin | FORCE_LIMIT | True | None | +| Apache Pinot | superset.db_engine_specs.pinot | FORCE_LIMIT | True | None | +| Apache Solr | superset.db_engine_specs.solr | FORCE_LIMIT | True | None | +| Apache Spark SQL | superset.db_engine_specs.spark | FORCE_LIMIT | True | 767 | +| Ascend | superset.db_engine_specs.ascend | FORCE_LIMIT | True | None | +| Aurora MySQL (Data API) | superset.db_engine_specs.aurora | FORCE_LIMIT | True | 64 | +| Aurora PostgreSQL (Data API) | superset.db_engine_specs.aurora | FORCE_LIMIT | True | 63 | +| Azure Synapse | superset.db_engine_specs.mssql | FORCE_LIMIT | True | 128 | +| ClickHouse | superset.db_engine_specs.clickhouse | FORCE_LIMIT | True | None | +| ClickHouse Connect (Superset) | superset.db_engine_specs.clickhouse | FORCE_LIMIT | True | None | +| CockroachDB | superset.db_engine_specs.cockroachdb | FORCE_LIMIT | True | 63 | +| Couchbase | superset.db_engine_specs.couchbase | FORCE_LIMIT | True | None | +| CrateDB | superset.db_engine_specs.crate | FORCE_LIMIT | True | None | +| Databend | superset.db_engine_specs.databend | FORCE_LIMIT | True | None | +| Databricks | superset.db_engine_specs.databricks | FORCE_LIMIT | True | None | +| Databricks (legacy) | superset.db_engine_specs.databricks | FORCE_LIMIT | True | None | +| Databricks Interactive Cluster | superset.db_engine_specs.databricks | FORCE_LIMIT | True | 767 | +| Databricks SQL Endpoint | superset.db_engine_specs.databricks | FORCE_LIMIT | True | None | +| Denodo | superset.db_engine_specs.denodo | FORCE_LIMIT | True | None | +| Dremio | superset.db_engine_specs.dremio | FORCE_LIMIT | True | None | +| DuckDB | superset.db_engine_specs.duckdb | FORCE_LIMIT | True | None | +| ElasticSearch (OpenDistro SQL) | superset.db_engine_specs.elasticsearch | FORCE_LIMIT | True | None | +| ElasticSearch (SQL API) | superset.db_engine_specs.elasticsearch | FORCE_LIMIT | True | None | +| Exasol | superset.db_engine_specs.exasol | FORCE_LIMIT | True | 128 | +| Firebird | superset.db_engine_specs.firebird | FETCH_MANY | True | None | +| Firebolt | superset.db_engine_specs.firebolt | FORCE_LIMIT | True | None | +| Google BigQuery | superset.db_engine_specs.bigquery | FORCE_LIMIT | True | 128 | +| Google Sheets | superset.db_engine_specs.gsheets | FORCE_LIMIT | True | None | +| IBM Db2 | superset.db_engine_specs.db2 | WRAP_SQL | True | 30 | +| IBM Db2 for i | superset.db_engine_specs.ibmi | WRAP_SQL | True | 128 | +| IBM Netezza Performance Server | superset.db_engine_specs.netezza | FORCE_LIMIT | True | None | +| KustoKQL | 
superset.db_engine_specs.kusto | FORCE_LIMIT | True | None | +| KustoSQL | superset.db_engine_specs.kusto | WRAP_SQL | True | None | +| MariaDB | superset.db_engine_specs.mariadb | FORCE_LIMIT | True | 64 | +| Microsoft SQL Server | superset.db_engine_specs.mssql | FORCE_LIMIT | True | 128 | +| MotherDuck | superset.db_engine_specs.duckdb | FORCE_LIMIT | True | None | +| MySQL | superset.db_engine_specs.mysql | FORCE_LIMIT | True | 64 | +| OceanBase | superset.db_engine_specs.oceanbase | FORCE_LIMIT | True | 128 | +| Ocient | superset.db_engine_specs.ocient | FORCE_LIMIT | True | 30 | +| Oracle | superset.db_engine_specs.oracle | FORCE_LIMIT | True | 128 | +| Parseable | superset.db_engine_specs.parseable | FORCE_LIMIT | True | None | +| PostgreSQL | superset.db_engine_specs.postgres | FORCE_LIMIT | True | None | +| Presto | superset.db_engine_specs.presto | FORCE_LIMIT | True | None | +| RisingWave | superset.db_engine_specs.risingwave | FORCE_LIMIT | True | 63 | +| SAP HANA | superset.db_engine_specs.hana | WRAP_SQL | True | 30 | +| SQLite | superset.db_engine_specs.sqlite | FORCE_LIMIT | True | None | +| Shillelagh | superset.db_engine_specs.shillelagh | FORCE_LIMIT | True | None | +| SingleStore | superset.db_engine_specs.singlestore | FORCE_LIMIT | True | 256 | +| Snowflake | superset.db_engine_specs.snowflake | FORCE_LIMIT | True | 256 | +| StarRocks | superset.db_engine_specs.starrocks | FORCE_LIMIT | True | 64 | +| Superset meta database | superset.db_engine_specs.superset | FORCE_LIMIT | True | None | +| TDengine | superset.db_engine_specs.tdengine | FORCE_LIMIT | True | 64 | +| Teradata | superset.db_engine_specs.teradata | FORCE_LIMIT | True | 30 | +| Trino | superset.db_engine_specs.trino | FORCE_LIMIT | True | None | +| Vertica | superset.db_engine_specs.vertica | FORCE_LIMIT | True | None | +| YDB | superset.db_engine_specs.ydb | FORCE_LIMIT | True | None | +| base | superset.db_engine_specs.presto | FORCE_LIMIT | True | None | + +### SQL Capabilities + +| Database | JOINs | Subqueries | Aliases in SELECT | Aliases in ORDER BY | CTEs | Comments | Escaped Colons | Inline Time Groupby | Source Column When Aliased | Aggregations in ORDER BY | Expressions in ORDER BY | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Amazon Athena | True | True | True | True | True | True | False | False | False | True | False | +| Amazon DynamoDB | True | True | True | True | True | True | True | False | False | True | False | +| Amazon Redshift | True | True | True | True | True | True | True | False | False | True | False | +| Apache Doris | True | True | True | True | True | True | True | False | False | True | False | +| Apache Drill | True | True | True | True | True | True | True | False | False | True | False | +| Apache Druid | False | True | True | True | True | True | True | False | False | True | False | +| Apache Hive | True | True | True | True | True | True | True | False | False | False | False | +| Apache Impala | True | True | True | True | True | True | True | False | False | True | False | +| Apache Kylin | True | True | True | True | True | True | True | False | False | True | False | +| Apache Pinot | False | False | False | False | True | True | True | False | False | True | False | +| Apache Solr | False | False | True | True | True | True | True | False | False | True | False | +| Apache Spark SQL | True | True | True | True | True | True | True | False | False | False | False | +| Ascend | True | True | True | True | True | True | True | 
False | False | True | False | +| Aurora MySQL (Data API) | True | True | True | True | True | True | True | False | False | True | False | +| Aurora PostgreSQL (Data API) | True | True | True | True | True | True | True | False | False | True | False | +| Azure Synapse | True | True | True | True | False | True | True | False | False | True | False | +| ClickHouse | True | True | True | True | True | True | True | True | False | True | False | +| ClickHouse Connect (Superset) | True | True | True | True | True | True | True | True | False | True | False | +| CockroachDB | True | True | True | True | True | True | True | False | False | True | False | +| Couchbase | False | False | True | True | True | True | True | False | False | True | False | +| CrateDB | True | True | True | True | True | True | True | False | False | True | False | +| Databend | True | True | True | True | True | True | True | True | False | True | False | +| Databricks | True | True | True | True | True | True | True | False | False | True | False | +| Databricks (legacy) | True | True | True | True | True | True | True | False | False | True | False | +| Databricks Interactive Cluster | True | True | True | True | True | True | True | False | False | False | False | +| Databricks SQL Endpoint | True | True | True | True | True | True | True | False | False | True | False | +| Denodo | True | True | True | True | True | True | True | False | False | True | False | +| Dremio | True | True | True | True | True | True | True | False | False | True | False | +| DuckDB | True | True | True | True | True | True | True | False | False | True | False | +| ElasticSearch (OpenDistro SQL) | False | True | True | True | True | False | True | True | False | True | False | +| ElasticSearch (SQL API) | False | True | True | True | True | False | True | True | False | True | False | +| Exasol | True | True | True | True | True | True | True | False | False | True | False | +| Firebird | True | True | True | True | True | True | True | False | False | True | False | +| Firebolt | True | True | True | True | True | True | True | False | False | True | False | +| Google BigQuery | True | True | True | True | True | True | True | False | False | True | True | +| Google Sheets | True | True | True | True | True | True | True | False | False | True | False | +| IBM Db2 | True | True | True | True | True | True | True | False | False | True | False | +| IBM Db2 for i | True | True | True | True | True | True | True | False | False | True | False | +| IBM Netezza Performance Server | True | True | True | True | True | True | True | False | False | True | False | +| KustoKQL | True | True | True | True | True | False | True | True | False | True | False | +| KustoSQL | True | True | True | True | True | False | True | True | False | True | False | +| MariaDB | True | True | True | True | True | True | True | False | False | True | False | +| Microsoft SQL Server | True | True | True | True | False | True | True | False | False | True | False | +| MotherDuck | True | True | True | True | True | True | True | False | False | True | False | +| MySQL | True | True | True | True | True | True | True | False | False | True | False | +| OceanBase | True | True | True | True | True | True | True | False | False | True | False | +| Ocient | True | True | True | True | False | True | True | False | False | True | False | +| Oracle | True | True | True | True | True | True | True | False | False | True | False | +| Parseable | True | True | True | 
True | True | True | True | False | False | True | False | +| PostgreSQL | True | True | True | True | True | True | True | False | False | True | False | +| Presto | True | True | True | True | True | True | True | False | True | True | False | +| RisingWave | True | True | True | True | True | True | True | False | False | True | False | +| SAP HANA | True | True | True | True | True | True | True | False | False | True | False | +| SQLite | True | True | True | True | True | True | True | False | False | True | False | +| Shillelagh | True | True | True | True | True | True | True | False | False | True | False | +| SingleStore | True | True | True | True | True | True | True | False | False | True | False | +| Snowflake | True | True | True | True | True | True | True | False | False | True | False | +| StarRocks | True | True | True | True | True | True | True | False | False | True | False | +| Superset meta database | True | True | True | True | True | True | True | False | False | True | False | +| TDengine | True | True | True | True | True | True | True | False | False | True | False | +| Teradata | True | True | True | True | True | True | True | False | False | True | False | +| Trino | True | True | True | True | True | True | True | False | True | True | False | +| Vertica | True | True | True | True | True | True | True | False | False | True | False | +| YDB | True | True | True | True | True | True | True | False | False | True | False | +| base | True | True | True | True | True | True | True | False | False | True | False | + +### Time Grains – Common + +| Database | SECOND | MINUTE | HOUR | DAY | WEEK | MONTH | QUARTER | YEAR | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Amazon Athena | True | True | True | True | True | True | True | True | +| Amazon DynamoDB | True | True | True | True | True | True | True | True | +| Amazon Redshift | True | True | True | True | True | True | True | True | +| Apache Doris | True | True | True | True | True | True | True | True | +| Apache Drill | True | True | True | True | True | True | True | True | +| Apache Druid | True | True | True | True | True | True | True | True | +| Apache Hive | True | True | True | True | True | True | True | True | +| Apache Impala | False | True | True | True | True | True | True | True | +| Apache Kylin | True | True | True | True | True | True | True | True | +| Apache Pinot | True | True | True | True | True | True | True | True | +| Apache Solr | False | False | False | False | False | False | False | False | +| Apache Spark SQL | True | True | True | True | True | True | True | True | +| Ascend | True | True | True | True | True | True | True | True | +| Aurora MySQL (Data API) | True | True | True | True | True | True | True | True | +| Aurora PostgreSQL (Data API) | True | True | True | True | True | True | True | True | +| Azure Synapse | True | True | True | True | True | True | True | True | +| ClickHouse | False | True | True | True | True | True | True | True | +| ClickHouse Connect (Superset) | False | True | True | True | True | True | True | True | +| CockroachDB | True | True | True | True | True | True | True | True | +| Couchbase | True | True | True | True | False | True | True | True | +| CrateDB | True | True | True | True | True | True | True | True | +| Databend | True | True | True | True | True | True | True | True | +| Databricks | True | True | True | True | True | True | True | True | +| Databricks (legacy) | True | True | True | True | True | True | True | 
True | +| Databricks Interactive Cluster | True | True | True | True | True | True | True | True | +| Databricks SQL Endpoint | True | True | True | True | True | True | True | True | +| Denodo | False | True | True | True | True | True | True | True | +| Dremio | True | True | True | True | True | True | True | True | +| DuckDB | True | True | True | True | True | True | True | True | +| ElasticSearch (OpenDistro SQL) | True | True | True | True | False | True | False | True | +| ElasticSearch (SQL API) | True | True | True | True | True | True | False | True | +| Exasol | True | True | True | True | True | True | True | True | +| Firebird | True | True | True | True | False | True | False | True | +| Firebolt | True | True | True | True | True | True | True | True | +| Google BigQuery | True | True | True | True | True | True | True | True | +| Google Sheets | True | True | True | True | True | True | True | True | +| IBM Db2 | True | True | True | True | True | True | True | True | +| IBM Db2 for i | True | True | True | True | True | True | True | True | +| IBM Netezza Performance Server | True | True | True | True | True | True | True | True | +| KustoKQL | True | True | True | True | True | True | False | True | +| KustoSQL | True | True | True | True | True | True | True | True | +| MariaDB | True | True | True | True | True | True | True | True | +| Microsoft SQL Server | True | True | True | True | True | True | True | True | +| MotherDuck | True | True | True | True | True | True | True | True | +| MySQL | True | True | True | True | True | True | True | True | +| OceanBase | True | True | True | True | True | True | True | True | +| Ocient | True | True | True | True | True | True | False | True | +| Oracle | True | True | True | True | True | True | True | True | +| Parseable | True | True | True | True | True | True | True | True | +| PostgreSQL | True | True | True | True | True | True | True | True | +| Presto | True | True | True | True | True | True | True | True | +| RisingWave | True | True | True | True | True | True | True | True | +| SAP HANA | True | True | True | True | False | True | True | True | +| SQLite | True | True | True | True | True | True | True | True | +| Shillelagh | True | True | True | True | True | True | True | True | +| SingleStore | True | True | True | True | True | True | True | True | +| Snowflake | True | True | True | True | True | True | True | True | +| StarRocks | True | True | True | True | True | True | True | True | +| Superset meta database | True | True | True | True | True | True | True | True | +| TDengine | True | True | True | True | True | False | False | False | +| Teradata | False | True | True | True | True | True | True | True | +| Trino | True | True | True | True | True | True | True | True | +| Vertica | True | True | True | True | True | True | True | True | +| YDB | True | True | True | True | True | True | True | True | +| base | True | True | True | True | True | True | True | True | + +### Time Grains – Extended + +| Database | FIVE_SECONDS | THIRTY_SECONDS | FIVE_MINUTES | TEN_MINUTES | FIFTEEN_MINUTES | THIRTY_MINUTES | HALF_HOUR | SIX_HOURS | WEEK_STARTING_SUNDAY | WEEK_STARTING_MONDAY | WEEK_ENDING_SATURDAY | WEEK_ENDING_SUNDAY | QUARTER_YEAR | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Amazon Athena | False | False | False | False | False | False | False | False | True | False | True | False | False | +| Amazon DynamoDB | False | False | False | False | False | 
False | False | False | True | True | True | True | False | +| Amazon Redshift | True | True | True | True | True | True | False | False | False | False | False | False | False | +| Apache Doris | False | False | False | False | False | False | False | False | False | True | False | False | False | +| Apache Drill | False | False | False | False | True | True | False | False | False | False | False | False | False | +| Apache Druid | True | True | True | True | True | True | False | True | True | False | True | False | False | +| Apache Hive | False | False | False | False | False | False | False | False | True | False | True | False | False | +| Apache Impala | False | False | False | False | False | False | False | False | False | False | False | False | False | +| Apache Kylin | False | False | False | False | False | False | False | False | False | False | False | False | False | +| Apache Pinot | False | False | True | True | True | True | False | False | False | False | False | False | False | +| Apache Solr | False | False | False | False | False | False | False | False | False | False | False | False | False | +| Apache Spark SQL | False | False | False | False | False | False | False | False | True | False | True | False | False | +| Ascend | False | False | False | False | False | False | False | False | False | False | False | False | False | +| Aurora MySQL (Data API) | False | False | False | False | False | False | False | False | False | True | False | False | False | +| Aurora PostgreSQL (Data API) | True | True | True | True | True | True | False | False | False | False | False | False | False | +| Azure Synapse | False | False | True | True | True | True | False | False | True | True | False | False | False | +| ClickHouse | False | False | True | True | True | True | False | False | False | False | False | False | False | +| ClickHouse Connect (Superset) | False | False | True | True | True | True | False | False | False | False | False | False | False | +| CockroachDB | True | True | True | True | True | True | False | False | False | False | False | False | False | +| Couchbase | False | False | False | False | False | False | False | False | False | False | False | False | False | +| CrateDB | False | False | False | False | False | False | False | False | False | False | False | False | False | +| Databend | False | False | True | True | True | False | False | False | False | False | False | False | False | +| Databricks | False | False | False | False | False | False | False | False | True | False | True | False | False | +| Databricks (legacy) | False | False | False | False | False | False | False | False | True | False | True | False | False | +| Databricks Interactive Cluster | False | False | False | False | False | False | False | False | True | False | True | False | False | +| Databricks SQL Endpoint | False | False | False | False | False | False | False | False | True | False | True | False | False | +| Denodo | False | False | False | False | False | False | False | False | False | False | False | False | False | +| Dremio | False | False | False | False | False | False | False | False | False | False | False | False | False | +| DuckDB | False | False | False | False | False | False | False | False | False | False | False | False | False | +| ElasticSearch (OpenDistro SQL) | False | False | False | False | False | False | False | False | False | False | False | False | False | +| ElasticSearch (SQL API) | False | False | False | False | False | False | 
False | False | False | False | False | False | False | +| Exasol | False | False | False | False | False | False | False | False | False | False | False | False | False | +| Firebird | False | False | False | False | False | False | False | False | False | False | False | False | False | +| Firebolt | False | False | False | False | False | False | False | False | False | False | False | False | False | +| Google BigQuery | False | False | True | True | True | True | False | False | False | True | False | False | False | +| Google Sheets | True | True | True | True | True | True | True | True | True | True | True | True | True | +| IBM Db2 | False | False | False | False | False | False | False | False | False | False | False | False | False | +| IBM Db2 for i | False | False | False | False | False | False | False | False | False | False | False | False | False | +| IBM Netezza Performance Server | False | False | False | False | False | False | False | False | False | False | False | False | False | +| KustoKQL | False | True | True | False | False | True | False | False | False | False | False | False | False | +| KustoSQL | False | False | True | True | True | False | True | False | True | True | False | False | False | +| MariaDB | False | False | False | False | False | False | False | False | False | True | False | False | False | +| Microsoft SQL Server | False | False | True | True | True | True | False | False | True | True | False | False | False | +| MotherDuck | False | False | False | False | False | False | False | False | False | False | False | False | False | +| MySQL | False | False | False | False | False | False | False | False | False | True | False | False | False | +| OceanBase | False | False | False | False | False | False | False | False | False | True | False | False | False | +| Ocient | False | False | False | False | False | False | False | False | False | False | False | False | True | +| Oracle | False | False | False | False | False | False | False | False | False | False | False | False | False | +| Parseable | False | False | False | False | False | False | False | False | False | False | False | False | False | +| PostgreSQL | True | True | True | True | True | True | False | False | False | False | False | False | False | +| Presto | True | True | True | True | True | False | True | True | True | True | True | True | False | +| RisingWave | True | True | True | True | True | True | False | False | False | False | False | False | False | +| SAP HANA | False | False | False | False | False | False | False | False | False | False | False | False | False | +| SQLite | True | True | True | True | True | True | True | True | True | True | True | True | True | +| Shillelagh | True | True | True | True | True | True | True | True | True | True | True | True | True | +| SingleStore | False | False | False | False | False | False | False | False | False | False | False | False | False | +| Snowflake | False | False | True | True | True | True | False | False | False | False | False | False | False | +| StarRocks | False | False | False | False | False | False | False | False | False | True | False | False | False | +| Superset meta database | True | True | True | True | True | True | True | True | True | True | True | True | True | +| TDengine | False | False | False | False | False | False | False | False | False | False | False | False | False | +| Teradata | False | False | False | False | False | False | False | False | False | False | False | False | False 
| +| Trino | True | True | True | True | True | False | True | True | True | True | True | True | False | +| Vertica | True | True | True | True | True | True | False | False | False | False | False | False | False | +| YDB | False | True | True | True | True | True | False | False | False | False | False | False | False | +| base | True | True | True | True | True | False | True | True | True | True | True | True | False | + +### Core Platform & Metadata Features + + +Integration with platform features and metadata handling. + +| Database | Masked Encrypted Extra | Column Type Mappings | Function Names | File Upload | Dynamic Schema | Catalog | Dynamic Catalog | SSH Tunneling | Latest Partition | Query Cancellation | Get Metrics | Extra Table Metadata | Exception Mapping | Custom Errors | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Amazon Athena | False | False | False | True | False | False | False | False | False | False | False | False | False | False | +| Amazon DynamoDB | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Amazon Redshift | False | False | False | True | False | False | False | True | False | True | False | False | False | False | +| Apache Doris | False | True | False | True | True | True | True | True | False | True | False | False | False | False | +| Apache Drill | False | False | False | True | True | False | False | True | False | False | False | False | False | False | +| Apache Druid | False | False | False | True | False | False | False | True | False | False | False | False | True | False | +| Apache Hive | False | True | True | True | True | True | True | True | True | True | False | True | False | False | +| Apache Impala | False | False | False | True | False | False | False | True | False | True | False | False | False | False | +| Apache Kylin | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Apache Pinot | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Apache Solr | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Apache Spark SQL | False | True | True | True | True | True | True | True | True | True | False | True | False | False | +| Ascend | False | False | False | True | False | False | False | True | False | True | False | False | False | False | +| Aurora MySQL (Data API) | False | True | False | True | True | False | False | True | False | True | False | False | False | False | +| Aurora PostgreSQL (Data API) | False | True | False | True | True | True | True | True | False | True | False | False | False | False | +| Azure Synapse | False | True | False | True | False | False | False | True | False | False | False | False | False | False | +| ClickHouse | False | True | True | False | False | False | False | True | False | False | False | False | True | False | +| ClickHouse Connect (Superset) | False | True | True | False | True | False | False | True | False | False | False | False | True | False | +| CockroachDB | False | True | False | True | True | True | True | True | False | True | False | False | False | False | +| Couchbase | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| CrateDB | False | False | False | True | False | False | False | True | False | False | False | 
False | False | False | +| Databend | False | True | True | False | False | False | False | True | False | False | False | False | True | False | +| Databricks | False | False | False | True | True | True | True | True | False | False | False | False | False | True | +| Databricks (legacy) | False | False | False | True | True | True | True | True | False | False | False | False | False | True | +| Databricks Interactive Cluster | False | True | True | True | True | True | True | True | True | True | False | True | False | False | +| Databricks SQL Endpoint | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Denodo | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Dremio | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| DuckDB | False | True | False | True | False | False | False | True | False | False | False | False | False | False | +| ElasticSearch (OpenDistro SQL) | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| ElasticSearch (SQL API) | False | False | False | True | False | False | False | True | False | False | False | False | True | False | +| Exasol | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Firebird | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Firebolt | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Google BigQuery | False | False | False | True | False | True | True | False | True | False | False | True | True | False | +| Google Sheets | False | False | True | True | False | False | False | False | False | False | False | True | False | False | +| IBM Db2 | False | False | False | True | True | False | False | True | False | False | False | False | False | False | +| IBM Db2 for i | False | False | False | True | True | False | False | True | False | False | False | False | False | False | +| IBM Netezza Performance Server | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| KustoKQL | False | False | False | True | False | False | False | True | False | False | False | False | True | False | +| KustoSQL | False | True | False | True | False | False | False | True | False | False | False | False | True | False | +| MariaDB | False | True | False | True | True | False | False | True | False | True | False | False | False | False | +| Microsoft SQL Server | False | True | False | True | False | False | False | True | False | False | False | False | False | False | +| MotherDuck | False | True | False | True | False | True | True | True | False | False | False | False | False | False | +| MySQL | False | True | False | True | True | False | False | True | False | True | False | False | False | False | +| OceanBase | False | True | False | True | True | False | False | True | False | True | False | False | False | False | +| Ocient | False | False | False | True | False | False | False | True | False | True | False | False | False | False | +| Oracle | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Parseable | False | False | False | True | False | False | False | True | False | False | False | 
False | False | False | +| PostgreSQL | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Presto | False | True | True | True | True | True | True | True | True | True | False | True | False | False | +| RisingWave | False | True | False | True | True | True | True | True | False | True | False | False | False | False | +| SAP HANA | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| SQLite | False | False | True | True | False | False | False | False | False | False | False | False | False | False | +| Shillelagh | False | False | True | True | False | False | False | False | False | False | False | False | False | False | +| SingleStore | False | True | True | True | True | False | False | True | False | True | False | False | False | False | +| Snowflake | False | False | False | True | True | True | True | True | False | True | False | False | False | False | +| StarRocks | False | True | False | True | True | False | False | True | False | True | False | False | False | False | +| Superset meta database | False | False | True | False | False | False | False | False | False | False | False | False | False | False | +| TDengine | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Teradata | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| Trino | False | True | True | True | True | True | True | True | True | True | False | True | True | False | +| Vertica | False | False | False | True | False | False | False | True | False | False | False | False | False | False | +| YDB | False | False | False | False | False | False | False | True | False | False | False | False | False | False | +| base | False | True | True | True | True | True | True | True | True | False | False | False | False | False | + +### Operational & Advanced Features + +| Database | User Impersonation | Expand Data | Cost Estimation | SQL Validation | +| --- | --- | --- | --- | --- | +| Amazon Athena | False | False | False | False | +| Amazon DynamoDB | False | False | False | False | +| Amazon Redshift | False | False | False | False | +| Apache Doris | False | False | False | False | +| Apache Drill | True | False | False | False | +| Apache Druid | False | False | False | False | +| Apache Hive | True | True | True | False | +| Apache Impala | False | False | False | False | +| Apache Kylin | False | False | False | False | +| Apache Pinot | False | False | False | False | +| Apache Solr | False | False | False | False | +| Apache Spark SQL | True | True | True | False | +| Ascend | False | False | False | False | +| Aurora MySQL (Data API) | False | False | False | False | +| Aurora PostgreSQL (Data API) | False | False | True | True | +| Azure Synapse | False | False | False | False | +| ClickHouse | False | False | False | False | +| ClickHouse Connect (Superset) | False | False | False | False | +| CockroachDB | False | False | True | False | +| Couchbase | False | False | False | False | +| CrateDB | False | False | False | False | +| Databend | False | False | False | False | +| Databricks | False | False | False | False | +| Databricks (legacy) | False | False | False | False | +| Databricks Interactive Cluster | True | True | True | False | +| Databricks SQL Endpoint | False | False | False | False | +| Denodo | False | False | False | False | +| Dremio | False 
| False | False | False | +| DuckDB | False | False | False | False | +| ElasticSearch (OpenDistro SQL) | False | False | False | False | +| ElasticSearch (SQL API) | False | False | False | False | +| Exasol | False | False | False | False | +| Firebird | False | False | False | False | +| Firebolt | False | False | False | False | +| Google BigQuery | False | False | True | False | +| Google Sheets | True | False | False | False | +| IBM Db2 | False | False | False | False | +| IBM Db2 for i | False | False | False | False | +| IBM Netezza Performance Server | False | False | False | False | +| KustoKQL | False | False | False | False | +| KustoSQL | False | False | False | False | +| MariaDB | False | False | False | False | +| Microsoft SQL Server | False | False | False | False | +| MotherDuck | False | False | False | False | +| MySQL | False | False | False | False | +| OceanBase | False | False | False | False | +| Ocient | False | False | False | False | +| Oracle | False | False | False | False | +| Parseable | False | False | False | False | +| PostgreSQL | False | False | False | False | +| Presto | True | True | True | True | +| RisingWave | False | False | True | False | +| SAP HANA | False | False | False | False | +| SQLite | False | False | False | False | +| Shillelagh | False | False | False | False | +| SingleStore | False | False | False | False | +| Snowflake | False | False | False | False | +| StarRocks | True | False | False | False | +| Superset meta database | False | False | False | False | +| TDengine | False | False | False | False | +| Teradata | False | False | False | False | +| Trino | True | False | True | False | +| Vertica | False | False | False | False | +| YDB | False | False | False | False | +| base | False | False | True | False | + +## Database information + +A DB engine spec has attributes that describe the underlying database engine, so that Superset can know how to build and run queries. For example, some databases don't support subqueries, which are needed for some of the queries produced by Superset for certain charts. When a database doesn't support subqueries the query is run in two-steps, using the results from the first query to build the second query. + +These attributes and their default values (set in the base class, `BaseEngineSpec`) are described below: + +### `limit_method = LimitMethod.FORCE_LIMIT` + +When running user queries in SQL Lab, Superset needs to limit the number of rows returned. The reason for that is cost and performance: there's no point in running a query that produces millions of rows when they can't be loaded into the browser. + +For most databases this is done by parsing the user submitted query and applying a limit, if one is not present, or replacing the existing limit if it's larger. This is called the `FORCE_LIMIT` method, and is the most efficient, since the database will produce at most the number of rows that Superset will display. + +For some databases this method might not work, and they can use the `WRAP_SQL` method, which wraps the original query in a `SELECT *` and applies a limit via the SQLAlchemy dialect, which should get translated to the correct syntax. This method might be inefficient, since the database optimizer might not be able to push the limit to the inner query. + +Finally, as a last resource there is the `FETCH_MANY` method. When a DB engine spec uses this method the query runs unmodified, but Superset fetches only a certain number of rows from the cursor. 
It's possible that a database using this method can optimize the query execution and compute rows as they are being read by the cursor, but it's unlikely. This makes this method the least efficient of the three.
+
+Note that when Superset runs a query with a given limit, say 100, it always modifies the query to request one additional row (`LIMIT 101`, in this case). This extra row is dropped before the results are returned to the user, but it allows Superset to inform the users that the query was indeed limited. Otherwise a query with `LIMIT 100` that returns exactly 100 rows would seem like it was limited, when in fact it was not.
+
+### `allows_joins = True`
+
+Not all databases support `JOIN`s. When building complex charts, Superset will try to join the table to itself in order to compute `top_n` groups, for example. If the database doesn't support joins Superset will instead run a prequery, and use the results to build the final query.
+
+### `allows_subqueries = True`
+
+Similarly, not all databases support subqueries. For more complex charts Superset will build subqueries if possible, or run the query in two steps otherwise.
+
+### `allows_alias_in_select = True`
+
+Does the DB support aliases in the projection of a query, eg:
+
+```sql
+SELECT COUNT(*) AS cnt
+```
+
+Superset will try to use aliases whenever possible, in order to give friendly names to expressions.
+
+### `allows_alias_in_orderby = True`
+
+Does the DB support referencing aliases in the `GROUP BY`, eg:
+
+```sql
+SELECT
+  UPPER(country_of_origin) AS country,
+  COUNT(*) AS cnt
+FROM
+  some_table
+GROUP BY
+  country
+```
+
+Otherwise the query is written as:
+
+```sql
+SELECT
+  UPPER(country_of_origin) AS country,
+  COUNT(*) AS cnt
+FROM
+  some_table
+GROUP BY
+  UPPER(country_of_origin)
+```
+
+### `time_groupby_inline = False`
+
+In theory this attribute should be used to omit time filters from the self-joins. When the attribute is false the time attribute will be present in the subquery used to compute limited series, eg:
+
+```sql
+SELECT DATE_TRUNC('day', ts) AS ts,
+       team AS team,
+       COUNT(*) AS count
+FROM public.threads
+JOIN
+  (SELECT team AS team__,
+          COUNT(*) AS mme_inner__
+   FROM public.threads
+   -- this is added when `time_groupby_inline = False`
+   WHERE ts >= TO_TIMESTAMP('2022-07-27 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US')
+     AND ts < TO_TIMESTAMP('2023-07-27 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US')
+   --
+   GROUP BY team
+   ORDER BY mme_inner__ DESC
+   LIMIT 5) AS anon_1 ON team = team__
+WHERE ts >= TO_TIMESTAMP('2022-07-27 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US')
+  AND ts < TO_TIMESTAMP('2023-07-27 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US')
+GROUP BY DATE_TRUNC('day', ts),
+         team
+ORDER BY count DESC
+LIMIT 10000;
+```
+
+In practice, the attribute doesn't seem to be working as of 2023-07-27.
+
+### `allows_alias_to_source_column = True`
+
+When this is true the database allows queries where an alias can overshadow existing column names.
For example, in this query:
+
+```sql
+SELECT
+  foo + 1 AS foo
+FROM
+  some_table
+ORDER BY
+  foo -- references the alias `foo + 1`, not the column `foo`
+```
+
+### `allows_hidden_orderby_agg = True`
+
+If set to true the database allows aggregate expressions in the `ORDER BY` that are not present in the projection (`SELECT`), eg:
+
+```sql
+SELECT
+  country,
+  COUNT(*)
+FROM
+  some_table
+GROUP BY
+  country
+ORDER BY
+  SUM(population) -- not present in the `SELECT`
+```
+
+### `allows_hidden_cc_in_orderby = False`
+
+This is the opposite of `allows_alias_in_orderby`, for databases that require aliases in the `ORDER BY`. For example, BigQuery doesn't like this query:
+
+```sql
+SELECT
+  CASE
+    WHEN type = 'feature' THEN 'f'
+    WHEN type = 'bug' THEN 'b'
+    ELSE 'o'
+  END AS cc_type
+FROM
+  some_table
+GROUP BY
+  cc_type
+ORDER BY
+  CASE
+    WHEN type = 'feature' THEN 'f'
+    WHEN type = 'bug' THEN 'b'
+    ELSE 'o'
+  END
+```
+
+Instead, it must be written as:
+
+```sql
+SELECT
+  CASE
+    WHEN type = 'feature' THEN 'f'
+    WHEN type = 'bug' THEN 'b'
+    ELSE 'o'
+  END AS cc_type
+FROM
+  some_table
+GROUP BY
+  cc_type
+ORDER BY
+  cc_type
+```
+
+### `allows_cte_in_subquery = True`
+
+When a virtual dataset is used in a chart the original query is converted into a subquery, and is wrapped in an outer query that is generated based on the chart controls. The virtual dataset query might have a CTE, and some databases don't like subqueries with CTEs in them.
+
+When this attribute is false Superset will extract the CTE and move it outside of the subquery when generating SQL for charts. The name of the new CTE will be `cte_alias`, also defined in the DB engine spec.
+
+### `allow_limit_clause = True`
+
+Allows for the `LIMIT` clause. Otherwise, the database probably uses `TOP` to limit rows.
+
+### `max_column_name_length: int | None = None`
+
+Most databases have a well-defined limit for the maximum length of a column name (SQLite is probably the one exception). While this can be set to (and defaults to) `None`, it's highly recommended to set a value to prevent errors.
+
+### `allows_sql_comments = True`
+
+Are comments supported in the DB? In general SQL comments are defined by double dashes:
+
+```sql
+-- this is a comment
+SELECT * -- we need everything
+FROM some_table
+```
+
+### `allows_escaped_colons = True`
+
+SQLAlchemy recommends escaping colons to prevent them from being interpreted as bindings to parameters. Because of this, when building queries from virtual datasets Superset will escape all colons with `\:`.
+
+This works for most databases except Athena. The `allows_escaped_colons` attribute specifies if the database supports escaped colons.
+
+## Basic features
+
+These are features that all DB engine specs should support, as the name suggests. They provide a much better user experience.
+
+### Time grains
+
+The most basic feature that DB engine specs need to support is defining time grain expressions. These are dialect-specific SQL expressions that are used to compute metrics on a given time grain when building charts. For example, when computing the metric `COUNT(*)` on a daily basis, Superset will generate the following query:
+
+```sql
+SELECT
+  <time grain expression>,
+  COUNT(*)
+...
+GROUP BY
+  <time grain expression>
+```
+
+For some databases with support for `DATE_TRUNC` or `TIME_FLOOR` this is easy. Here's how Apache Druid computes 15-minute aggregations:
+
+```sql
+TIME_FLOOR(CAST({col} AS TIMESTAMP), 'PT15M')
+```
+
+Where `{col}` is the time column being aggregated — the expression is actually a Jinja2 template.
Druid uses the ISO 8601 standard for durations, with `PT15M` representing 15 minutes.
+
+On the other hand, here's the same for SQLite:
+
+```sql
+DATETIME(
+  STRFTIME(
+    '%Y-%m-%dT%H:%M:00',
+    {col}
+  ),
+  printf(
+    '-%d minutes',
+    CAST(strftime('%M', {col}) AS INT) % 15
+  )
+)
+```
+
+The SQLite version has to truncate the column down to the minute, and then subtract a number of minutes equal to the minute value modulo 15.
+
+Time grain expressions are defined in the `_time_grain_expressions` class attribute, which maps from a `superset.constants.TimeGrain` to the SQL expression. The dictionary has a special key `None`, which should map to the column directly, for when no time grain is specified.
+
+Note that it's possible to add new time grains via configuration. For example, if you want to add a "2 seconds" time grain to your installation you can add it to `TIME_GRAIN_ADDONS`, and implement it in `TIME_GRAIN_ADDON_EXPRESSIONS`:
+
+```python
+# superset_config.py
+TIME_GRAIN_ADDONS = {"PT2S": "2 second"}
+
+TIME_GRAIN_ADDON_EXPRESSIONS = {
+    "clickhouse": {
+        "PT2S": "toDateTime(intDiv(toUInt32(toDateTime({col})), 2)*2)",
+    }
+}
+```
+
+### Column type mapping
+
+Column type mapping, defined in the `column_type_mappings` class attribute, is just a way of mapping type names from the database to types Superset understands. The default values in `BaseEngineSpec` are sane:
+
+```python
+_default_column_type_mappings: tuple[ColumnTypeMapping, ...] = (
+    (
+        re.compile(r"^string", re.IGNORECASE),
+        types.String(),
+        GenericDataType.STRING,
+    ),
+    (
+        re.compile(r"^float", re.IGNORECASE),
+        types.Float(),
+        GenericDataType.NUMERIC,
+    ),
+    (
+        re.compile(r"^date", re.IGNORECASE),
+        types.Date(),
+        GenericDataType.TEMPORAL,
+    ),
+    (
+        re.compile(r"^bool(ean)?", re.IGNORECASE),
+        types.Boolean(),
+        GenericDataType.BOOLEAN,
+    ),
+    ...
+)
+```
+
+But you might want to implement more specific types in the DB engine spec, or complex types. For example, for MSSQL we have:
+
+```python
+from sqlalchemy.dialects.mssql.base import SMALLDATETIME
+
+class MssqlEngineSpec(BaseEngineSpec):
+    ...
+    column_type_mappings = (
+        (
+            re.compile(r"^smalldatetime.*", re.IGNORECASE),
+            SMALLDATETIME(),
+            GenericDataType.TEMPORAL,
+        ),
+    )
+```
+
+### Function names
+
+DB engine specs should implement a class method called `get_function_names` that returns a list of strings, representing all the function names that the database supports. This is used for autocomplete in SQL Lab.
+
+### Masked encrypted extra
+
+Superset does a good job of keeping credentials secure. When you add a database with a password, for example:
+
+```text
+postgresql://admin:password123@db.example.org:5432/db
+```
+
+The password is sent over the network only when the database is created. When you edit the database later, Superset will return this as the SQLAlchemy URI:
+
+```text
+postgresql://admin:XXXXXXXXXX@db.example.org:5432/db
+```
+
+The password will be masked in the API response; it's not just masked in the browser UI. This is done in order to avoid sending the password unnecessarily over the network. Also, if a non-admin user has access to the API response, they won't be able to know the database password.
+
+When the database is edited, the Superset backend is smart enough to replace the masked password with the actual password, unless the password has changed.
That is, if you change the database in the URI from `db` to `db2` the SQLAlchemy URI will be stored in the backend as:
+
+```text
+postgresql://admin:password123@db.example.org:5432/db2
+```
+
+The password is not the only piece of information where security is critical. For many databases (like BigQuery), sensitive information is stored in the credentials JSON payload. For example:
+
+```json
+{
+  "type": "service_account",
+  "project_id": "dbt-tutorial-347100",
+  "private_key_id": "4bc71f06990c864a590fad8b94be6a5904fc171f",
+  "private_key": "",
+  "client_email": "dbt-user-278@dbt-tutorial-347100.iam.gserviceaccount.com",
+  "client_id": "115666988796889519425",
+  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+  "token_uri": "https://oauth2.googleapis.com/token",
+  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/dbt-user-278%40dbt-tutorial-347100.iam.gserviceaccount.com"
+}
+```
+
+Similarly to the password, we don't want to send `private_key` to the client when a database is edited; the Superset API should never return its actual contents. Instead, Superset should return a masked value, and users should be able to edit the JSON without having to type in the `private_key` on every edit.
+
+To do this, DB engine specs can implement 2 methods, `mask_encrypted_extra` and `unmask_encrypted_extra`. They have these names because the credentials are stored in an encrypted column called `encrypted_extra`. Here's how these methods look for BigQuery:
+
+```python
+from superset.constants import PASSWORD_MASK
+
+
+class BigQueryEngineSpec(BaseEngineSpec):
+
+    @classmethod
+    def mask_encrypted_extra(cls, encrypted_extra: str | None) -> str | None:
+        if encrypted_extra is None:
+            return encrypted_extra
+
+        try:
+            config = json.loads(encrypted_extra)
+        except (json.JSONDecodeError, TypeError):
+            return encrypted_extra
+
+        try:
+            config["credentials_info"]["private_key"] = PASSWORD_MASK
+        except KeyError:
+            pass
+
+        return json.dumps(config)
+
+    @classmethod
+    def unmask_encrypted_extra(
+        cls,
+        old: str | None,
+        new: str | None
+    ) -> str | None:
+        if old is None or new is None:
+            return new
+
+        try:
+            old_config = json.loads(old)
+            new_config = json.loads(new)
+        except (TypeError, json.JSONDecodeError):
+            return new
+
+        if "credentials_info" not in new_config:
+            return new
+
+        if "private_key" not in new_config["credentials_info"]:
+            return new
+
+        if new_config["credentials_info"]["private_key"] == PASSWORD_MASK:
+            new_config["credentials_info"]["private_key"] = old_config[
+                "credentials_info"
+            ]["private_key"]
+
+        return json.dumps(new_config)
+```
+
+This way, when a user edits an existing BigQuery connection, the `private_key` is shown as `XXXXXXXXXX`. Everything else in the JSON is still displayed, and the user can change any of the fields without having to provide the private key.
+
+Note that while this is a basic feature that should be implemented for security reasons, it only makes sense in DB engine specs that use `encrypted_extra` to store connection information.
+
+## Nice to have features
+
+The next set of features are nice to have. They don't apply to all databases, and are not strictly needed for security or usability.
+
+### User impersonation
+
+In general there's no user-level granularity when accessing a database in Superset. A single database connection is shared by all users who have access to that database.
There are many use cases when this is not desirable, and some databases implement mechanisms in which they can **impersonate users**, potentially reducing the scope of permissions available to run the query.
+
+For example, the Google Sheets DB engine spec implements this via the `get_url_for_impersonation` class method:
+
+```python
+class GSheetsEngineSpec(ShillelaghEngineSpec):
+
+    @classmethod
+    def get_url_for_impersonation(
+        cls,
+        url: URL,
+        impersonate_user: bool,
+        username: str | None,
+        access_token: str | None,
+    ) -> URL:
+        if impersonate_user and username is not None:
+            user = security_manager.find_user(username=username)
+            if user and user.email:
+                url = url.update_query_dict({"subject": user.email})
+
+        return url
+```
+
+The method `get_url_for_impersonation` updates the SQLAlchemy URI before every query. In this particular case, it will fetch the user's email and add it to the `subject` query argument. The driver will then lower the permissions to match that given user. This allows the connection to be configured with a service account that has access to all the spreadsheets, while giving users access to only the spreadsheets they own or that have been shared with them (or with their organization — Google will handle the authorization in this case, not Superset).
+
+Alternatively, it's also possible to impersonate users by implementing the `update_impersonation_config` method. This is a class method which modifies `connect_args` in place. You can use either method, and ideally they [should be consolidated in a single one](https://github.com/apache/superset/issues/24910).
+
+### OAuth2
+
+Support for authenticating to a database using personal OAuth2 access tokens was introduced in [SIP-85](https://github.com/apache/superset/issues/20300). The Google Sheets DB engine spec is the reference implementation.
+
+Note that this API is still experimental and evolving quickly, subject to breaking changes. Currently, to add support for OAuth2 to a DB engine spec, the following attributes are needed:
+
+```python
+class BaseEngineSpec:
+
+    supports_oauth2 = True
+    oauth2_exception = OAuth2RedirectError
+
+    oauth2_scope = " ".join([
+        "https://example.org/scope1",
+        "https://example.org/scope2",
+    ])
+    oauth2_authorization_request_uri = "https://example.org/authorize"
+    oauth2_token_request_uri = "https://example.org/token"
+```
+
+The `oauth2_exception` is an exception that is raised by `cursor.execute` when OAuth2 is needed. This will start the OAuth2 dance when `BaseEngineSpec.execute` is called, by returning the custom error `OAUTH2_REDIRECT` to the frontend. If the database driver doesn't have a specific exception, it might be necessary to overload the `execute` method in the DB engine spec, so that the `BaseEngineSpec.start_oauth2_dance` method gets called whenever OAuth2 is needed.
+
+The DB engine spec should implement logic in either `get_url_for_impersonation` or `update_impersonation_config` to update the connection with the personal access token. See the Google Sheets DB engine spec for a reference implementation.
+
+Currently OAuth2 needs to be configured at the DB engine spec level, ie, with one client for each DB engine spec.
The configuration lives in `superset_config.py`: + +```python +# superset_config.py +DATABASE_OAUTH2_CLIENTS = { + "Google Sheets": { + "id": "XXX.apps.googleusercontent.com", + "secret": "GOCSPX-YYY", + "scope": " ".join( + [ + "https://www.googleapis.com/auth/drive.readonly", + "https://www.googleapis.com/auth/spreadsheets", + "https://spreadsheets.google.com/feeds", + ], + ), + "authorization_request_uri": "https://accounts.google.com/o/oauth2/v2/auth", + "token_request_uri": "https://oauth2.googleapis.com/token", + }, +} +DATABASE_OAUTH2_JWT_ALGORITHM = "HS256" +DATABASE_OAUTH2_REDIRECT_URI = "http://localhost:8088/api/v1/database/oauth2/" +DATABASE_OAUTH2_TIMEOUT = timedelta(seconds=30) +``` + +When configuring a client only the ID and secret are required; the DB engine spec should have default values for the scope and endpoints. The `DATABASE_OAUTH2_REDIRECT_URI` attribute is optional, and defaults to `/api/v1/databases/oauth2/` in Superset. + +In the future we plan to support adding custom clients via the Superset UI, and being able to manually assign clients to specific databases. + +### File upload + +When a DB engine spec supports file upload it declares so via the `supports_file_upload` class attribute. The base class implementation is very generic and should work for any database that has support for `CREATE TABLE`. It leverages Pandas and the [`df_to_sql`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html) method. + +For some databases the `df_to_sql` classmethod needs to be implemented. For example, for BigQuery the DB engine spec implements a custom method that uses the [`to_gbq`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html) method. + +### Extra table metadata + +DB engine specs can return additional metadata associated with a table. This is done via the `get_extra_table_metadata` class method. Trino uses this to return information about the latest partition, for example, and Bigquery returns clustering information. This information is then surfaced in the SQL Lab UI, when browsing tables in the metadata explorer (on the left panel). + +### DB API exception mapping + +Different DB API 2.0 drivers implement different exceptions, even if they have the same name. The `get_dbapi_exception_mapping` class method returns a dictionary mapping these custom exceptions to Superset exceptions, so that Superset can return more specific errors when an exception is raised by the underlying driver. + +For example, for ClickHouse we have: + +```python +from urllib3.exceptions import NewConnectionError + +from superset.db_engine_specs.exceptions import SupersetDBAPIDatabaseError + + +class ClickHouseEngineSpec(ClickHouseBaseEngineSpec): + + @classmethod + def get_dbapi_exception_mapping(cls) -> dict[type[Exception], type[Exception]]: + return {NewConnectionError: SupersetDBAPIDatabaseError} +``` + +This way, if the ClickHouse driver raises a `NewConnectionError` it would get wrapped in a `SupersetDBAPIDatabaseError`. + +### Custom errors + +Queries can fail in many different ways. For example, in SQLite: + +```sql +sqlite> CREATE TABLE a (b INT); +sqlite> SELECT c FROM a; +Error: no such column: c +sqlite> +``` + +When a query fails, Superset will return the message, "Error: no such column: c", to the user as a generic error. + +Since ideally we want to return specific and actionable error messages, DB engine specs can implement methods that map error messages to more specific errors. 
For example, the SQLite DB engine spec defines:
+
+```python
+COLUMN_DOES_NOT_EXIST_REGEX = re.compile("no such column: (?P<column_name>.+)")
+
+
+class SqliteEngineSpec(BaseEngineSpec):
+
+    custom_errors: dict[Pattern[str], tuple[str, SupersetErrorType, dict[str, Any]]] = {
+        COLUMN_DOES_NOT_EXIST_REGEX: (
+            __('We can\'t seem to resolve the column "%(column_name)s"'),
+            SupersetErrorType.COLUMN_DOES_NOT_EXIST_ERROR,
+            {},
+        ),
+    }
+```
+
+This way, when a user selects a column that doesn't exist Superset can return a more informative error.
+
+### Dynamic schema
+
+In SQL Lab it's possible to select a database, and then a schema in that database. Ideally, when running a query in SQL Lab, any unqualified table names (eg, `table`, instead of `schema.table`) should be in the selected schema. For example, if the user selects `dev` as the schema and then runs the following query:
+
+```sql
+SELECT * FROM my_table
+```
+
+The table `my_table` should live in the `dev` schema. In order to do that, it's necessary to modify the SQLAlchemy URI before running the query. Since different databases have different ways of doing that, this functionality is implemented via the `adjust_engine_params` class method. The method receives the SQLAlchemy URI and `connect_args`, as well as the schema in which the query should run. It then returns a potentially modified URI and `connect_args` to ensure that the query runs in the specified schema.
+
+When a DB engine spec implements `adjust_engine_params` it should have the class attribute `supports_dynamic_schema` set to true. This is critical for security, since **it allows Superset to know to which schema any unqualified table names belong**. For example, in the query above, if the database supports dynamic schema, Superset would check to see if the user running the query has access to `dev.my_table`. On the other hand, if the database doesn't support dynamic schema, Superset would use the default database schema instead of `dev`.
+
+Implementing this method is also important for usability. When the method is not implemented selecting the schema in SQL Lab has no effect on the schema in which the query runs, resulting in confusing results when using unqualified table names.
+
+### Catalog
+
+In general, databases support a hierarchy of one-to-many concepts:
+
+1. Database
+2. Catalog
+3. Namespace
+4. Table
+5. Column
+
+These concepts have different names depending on the database. For example, Postgres uses the following terminology:
+
+1. Cluster (database)
+2. Database (catalog)
+3. Schema (namespace)
+4. Table
+5. Column
+
+BigQuery, on the other hand:
+
+1. BigQuery (database)
+2. Project (catalog)
+3. Schema (namespace)
+4. Table
+5. Column
+
+Hive and Trino:
+
+1. Database
+2. Catalog
+3. Schema
+4. Table
+5. Column
+
+If the database supports catalogs, then the DB engine spec should have the `supports_catalog` class attribute set to true. It should also implement the `get_default_catalog` method, so that the proper permissions can be created when datasets are added.
+
+### Dynamic catalog
+
+Superset supports multiple catalogs. Since, in general, a given SQLAlchemy URI connects only to a single catalog, this requires DB engine specs to implement the `adjust_engine_params` method to rewrite the URL to connect to a different catalog, similar to how dynamic schemas work. Additionally, DB engine specs should also implement the `get_catalog_names` method, so that users can browse the available catalogs.
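+
+As a rough sketch of how these pieces fit together, a catalog-aware spec for a Postgres-like engine might look as follows. The class name and the `pg_database` query are illustrative assumptions, and the exact method signatures can vary between Superset versions, so check the shipped DB engine specs (eg, `postgres` or `bigquery`) for the current interface:
+
+```python
+from sqlalchemy import text
+from sqlalchemy.engine.reflection import Inspector
+
+from superset.db_engine_specs.base import BaseEngineSpec
+from superset.models.core import Database
+
+
+class PostgresLikeEngineSpec(BaseEngineSpec):  # hypothetical, for illustration only
+    supports_catalog = True
+    supports_dynamic_catalog = True
+
+    @classmethod
+    def get_default_catalog(cls, database: Database) -> str | None:
+        # The catalog a plain connection lands on; used when building permissions.
+        return database.url_object.database
+
+    @classmethod
+    def get_catalog_names(cls, database: Database, inspector: Inspector) -> set[str]:
+        # Every catalog the user should be able to browse in SQL Lab.
+        return {
+            catalog
+            for (catalog,) in inspector.bind.execute(
+                text("SELECT datname FROM pg_database WHERE NOT datistemplate")
+            )
+        }
+```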
+
+### SSH tunneling
+
+Superset can connect to databases via an SSH tunnel. For databases where this doesn't make sense (eg, SQLite or BigQuery) the DB engine spec should have `disable_ssh_tunneling` set to true.
+
+### Query cancelation
+
+Superset will try to cancel running queries if the user asks it to, but it's up to the DB engine spec to handle this.
+
+Some databases have an implicit query cancelation. When a cursor stops being polled it will cancel the query. For databases that behave like this, the class method `has_implicit_cancel` (which should really be a class attribute) should return true.
+
+For other databases, DB engine specs can implement query cancelation via the `prepare_cancel_query` and `cancel_query` methods. Implementation of query cancelation is usually heavily dependent on the database, but the DB engine specs that support it can serve as an example.
+
+### Get metrics on dataset creation
+
+When a physical dataset is first created, the `get_metrics` class method is called on the table. The base implementation returns the `COUNT(*)` metric, but DB engine specs can override `get_metrics` to return other metrics. This method is useful for semantic layers that contain their own metrics definitions; when Superset connects to them it can automatically create those metrics when a dataset is added.
+
+This feature is still experimental, and ideally there would be a mechanism for calling it periodically or when a dataset is explored, in order to sync new metric definitions to the dataset.
+
+### `WHERE` on latest partition
+
+In some databases, running `SELECT *` can be a **very expensive** operation, since the query might scan all partitions for a given table. Because of that, some DB engine specs implement the `where_latest_partition` method, which returns a modified SQLAlchemy query with an additional predicate that filters on the latest partition.
+
+## Advanced features
+
+### Expand complex types
+
+Some databases will visually expand complex types (arrays and structures) when displaying results from queries. For example, the BigQuery UI is able to expand objects into columns and arrays into rows, so that this:
+
+| array | struct |
+| --------- | ---------------- |
+| [1, 2, 3] | `{a: one, b: two}` |
+
+Is shown as:
+
+| array | struct | struct.a | struct.b |
+| ----- | ---------------- | -------- | -------- |
+| 1 | `{a: one, b: two}` | one | two |
+| 2 | | | |
+| 3 | | | |
+
+A similar behavior has been implemented in Superset for Presto, and can be enabled via the `PRESTO_EXPAND_DATA` feature flag. To implement this feature a DB engine spec should implement the `expand_data` method, which takes the columns and rows and returns modified columns and rows.
+
+Note that despite being implemented only for Presto, this behavior has nothing that is Presto-specific, and in theory could be implemented in a generic way for all databases without requiring custom DB engine spec implementations (that is, the Presto `expand_data` method could be moved to the base class, after being cleaned up, and we could then enable the feature per DB in the configuration).
+
+### Query cost estimation
+
+Some databases allow users to estimate the cost of running a query before running it. This is done via the `estimate_query_cost` method in DB engine specs, which receives the SQL and returns a list of "costs". The definition of what "cost" is varies from database to database (in the few that support this functionality), and it can be formatted via the `query_cost_formatter`.
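+
+As an illustration of the shape of this API (a sketch, not the exact code of any existing spec), a cost formatter typically just turns the raw values returned by `estimate_query_cost` into labeled, human-readable strings that SQL Lab can display. The class name and the `Startup Cost`/`Total Cost` keys below are assumptions modeled on `EXPLAIN`-style output:
+
+```python
+from typing import Any
+
+from superset.db_engine_specs.base import BaseEngineSpec
+
+
+class ExplainBasedEngineSpec(BaseEngineSpec):  # hypothetical, for illustration only
+    @classmethod
+    def query_cost_formatter(
+        cls, raw_cost: list[dict[str, Any]]
+    ) -> list[dict[str, str]]:
+        # One entry per statement; each key/value pair becomes a label shown to the user.
+        return [
+            {
+                "Start-up cost": str(statement.get("Startup Cost", 0)),
+                "Total cost": str(statement.get("Total Cost", 0)),
+            }
+            for statement in raw_cost
+        ]
+```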
+
+The `query_cost_formatter` can be overridden with an arbitrary function via the config `QUERY_COST_FORMATTERS_BY_ENGINE`. This allows custom deployments of Superset to format the results in different ways. For example, at some point in Lyft the cost for running Presto queries would also show the carbon footprint (in trees).
+
+### SQL validation
+
+A few databases support validating the syntax of the SQL as the user is typing it, indicating any errors in SQL Lab. This is usually done using an `EXPLAIN` query and, because it gets called every few seconds as the user types, it's important that the database returns the result quickly.
+
+This is currently implemented for Presto and Postgres, via custom classes in `superset/sql_validators` that should be enabled in the configuration. Implementing this as custom classes, instead of a `validate_sql` method in the DB engine spec, offers no advantages, and ideally in the future we should move the logic to DB engine specs.
+
+## Testing DB engine specs
+
+Superset has a command to test the connection to a given database, as well as check if the SQLAlchemy dialect implements all necessary methods used by Superset, and check which features are supported by the DB engine spec (if one exists). To run the tool just call the `test-db` command with the SQLAlchemy URI to be tested:
+
+```bash
+superset test-db sqlite://
+```
+
+If the connection needs additional arguments they can be passed when the command runs.

From 490c602ebde7e5ef7ba83371283fe2d67e16e273 Mon Sep 17 00:00:00 2001
From: shubhmgrg
Date: Mon, 24 Nov 2025 21:58:36 -0500
Subject: [PATCH 6/6] Added the tables with the js component to allow filtering

---
 docs/docs/configuration/db_engine_specs.mdx | 1250 +++++++++++++++++++
 docs/docs/configuration/db_features.mdx     |    6 -
 docs/docs/using-superset/db_features.mdx    | 1211 ------------------
 docs/src/components/DatabaseTable.jsx       |   93 ++
 superset/db_engine_specs/docs_lib.py        |  689 ++++++++++
 5 files changed, 2032 insertions(+), 1217 deletions(-)
 create mode 100644 docs/docs/configuration/db_engine_specs.mdx
 delete mode 100644 docs/docs/configuration/db_features.mdx
 delete mode 100644 docs/docs/using-superset/db_features.mdx
 create mode 100644 docs/src/components/DatabaseTable.jsx
 create mode 100644 superset/db_engine_specs/docs_lib.py

diff --git a/docs/docs/configuration/db_engine_specs.mdx b/docs/docs/configuration/db_engine_specs.mdx
new file mode 100644
index 000000000000..0cf1ce62af1c
--- /dev/null
+++ b/docs/docs/configuration/db_engine_specs.mdx
@@ -0,0 +1,1250 @@
+---
+title: Database Engine Specifications
+hide_title: false
+sidebar_position: 3
+version: 1
+---
+
+
+export const feature_overview = [
+  ['Database', 'Score', 'SQL Basics', 'Advanced SQL', 'Common Time Grains', 'Extended Time Grains', 'Integrations', 'Advanced Features'],
+  ['Presto', '159', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Supported'],
+  ['Trino', '149', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Partial'],
+  ['Apache Hive', '140', 'Supported', 'Not supported', 'Supported', 'Partial', 'Partial', 'Partial'],
+  ['Apache Spark SQL', '140', 'Supported', 'Not supported', 'Supported', 'Partial', 'Partial', 'Partial'],
+  ['Databricks Interactive Cluster', '140', 'Supported', 'Not supported', 'Supported', 'Partial', 'Partial', 'Partial'],
+  ['base', '109', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Partial'],
+  ['Aurora PostgreSQL (Data API)', '104', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Partial'], +
['CockroachDB', '94', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Partial'], + ['RisingWave', '94', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Partial'], + ['Google BigQuery', '83', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Partial'], + ['Apache Doris', '79', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['Snowflake', '72', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['Databricks', '70', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['Databricks (legacy)', '70', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['StarRocks', '69', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Partial'], + ['SingleStore', '68', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['ClickHouse Connect (Superset)', '61', 'Supported', 'Partial', 'Partial', 'Partial', 'Partial', 'Not supported'], + ['Google Sheets', '61', 'Supported', 'Partial', 'Supported', 'Supported', 'Partial', 'Partial'], + ['Aurora MySQL (Data API)', '59', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['MariaDB', '59', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['MySQL', '59', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['OceanBase', '59', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['MotherDuck', '58', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['KustoSQL', '54', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['ClickHouse', '51', 'Supported', 'Partial', 'Partial', 'Partial', 'Partial', 'Not supported'], + ['Databend', '51', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['Apache Drill', '50', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Partial'], + ['Apache Druid', '47', 'Partial', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['Amazon Redshift', '44', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['Azure Synapse', '44', 'Partial', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['Microsoft SQL Server', '44', 'Partial', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['SQLite', '41', 'Supported', 'Partial', 'Supported', 'Supported', 'Not supported', 'Not supported'], + ['Shillelagh', '41', 'Supported', 'Partial', 'Supported', 'Supported', 'Not supported', 'Not supported'], + ['KustoKQL', '40', 'Supported', 'Partial', 'Partial', 'Partial', 'Partial', 'Not supported'], + ['Ascend', '38', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['DuckDB', '38', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['IBM Db2', '38', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['IBM Db2 for i', '38', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['Ocient', '38', 'Partial', 'Partial', 'Partial', 'Partial', 'Partial', 'Not supported'], + ['Apache Impala', '37', 'Supported', 'Partial', 'Partial', 'Not supported', 'Partial', 'Not supported'], + ['ElasticSearch (SQL API)', '37', 'Partial', 'Partial', 'Partial', 'Not supported', 'Partial', 'Not supported'], + ['PostgreSQL', '34', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not 
supported'], + ['Vertica', '34', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['Amazon DynamoDB', '32', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['Apache Pinot', '32', 'Partial', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['Superset meta database', '31', 'Supported', 'Partial', 'Supported', 'Supported', 'Not supported', 'Not supported'], + ['Databricks SQL Endpoint', '30', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['Apache Kylin', '28', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['CrateDB', '28', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['Dremio', '28', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['Exasol', '28', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['Firebolt', '28', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['IBM Netezza Performance Server', '28', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['Oracle', '28', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['Parseable', '28', 'Supported', 'Partial', 'Supported', 'Not supported', 'Partial', 'Not supported'], + ['Couchbase', '27', 'Partial', 'Partial', 'Partial', 'Not supported', 'Partial', 'Not supported'], + ['Denodo', '27', 'Supported', 'Partial', 'Partial', 'Not supported', 'Partial', 'Not supported'], + ['SAP HANA', '27', 'Supported', 'Partial', 'Partial', 'Not supported', 'Partial', 'Not supported'], + ['Teradata', '27', 'Supported', 'Partial', 'Partial', 'Not supported', 'Partial', 'Not supported'], + ['ElasticSearch (OpenDistro SQL)', '26', 'Partial', 'Partial', 'Partial', 'Not supported', 'Partial', 'Not supported'], + ['Firebird', '26', 'Supported', 'Partial', 'Partial', 'Not supported', 'Partial', 'Not supported'], + ['TDengine', '25', 'Supported', 'Partial', 'Partial', 'Not supported', 'Partial', 'Not supported'], + ['YDB', '23', 'Supported', 'Partial', 'Supported', 'Partial', 'Partial', 'Not supported'], + ['Amazon Athena', '20', 'Supported', 'Partial', 'Supported', 'Partial', 'Not supported', 'Not supported'], + ['Apache Solr', '20', 'Partial', 'Partial', 'Not supported', 'Not supported', 'Partial', 'Not supported'] +]; + +export const database_information = [ + ['Database', 'Module', 'Limit Method', 'Limit Clause', 'Max Column Name'], + ['Amazon Athena', 'superset.db_engine_specs.athena', 'FORCE_LIMIT', 'True', 'None'], + ['Amazon DynamoDB', 'superset.db_engine_specs.dynamodb', 'FORCE_LIMIT', 'True', 'None'], + ['Amazon Redshift', 'superset.db_engine_specs.redshift', 'FORCE_LIMIT', 'True', '127'], + ['Apache Doris', 'superset.db_engine_specs.doris', 'FORCE_LIMIT', 'True', '64'], + ['Apache Drill', 'superset.db_engine_specs.drill', 'FORCE_LIMIT', 'True', 'None'], + ['Apache Druid', 'superset.db_engine_specs.druid', 'FORCE_LIMIT', 'True', 'None'], + ['Apache Hive', 'superset.db_engine_specs.hive', 'FORCE_LIMIT', 'True', '767'], + ['Apache Impala', 'superset.db_engine_specs.impala', 'FORCE_LIMIT', 'True', 'None'], + ['Apache Kylin', 'superset.db_engine_specs.kylin', 'FORCE_LIMIT', 'True', 'None'], + ['Apache Pinot', 'superset.db_engine_specs.pinot', 'FORCE_LIMIT', 'True', 'None'], + ['Apache Solr', 'superset.db_engine_specs.solr', 'FORCE_LIMIT', 'True', 'None'], + ['Apache Spark SQL', 
'superset.db_engine_specs.spark', 'FORCE_LIMIT', 'True', '767'], + ['Ascend', 'superset.db_engine_specs.ascend', 'FORCE_LIMIT', 'True', 'None'], + ['Aurora MySQL (Data API)', 'superset.db_engine_specs.aurora', 'FORCE_LIMIT', 'True', '64'], + ['Aurora PostgreSQL (Data API)', 'superset.db_engine_specs.aurora', 'FORCE_LIMIT', 'True', '63'], + ['Azure Synapse', 'superset.db_engine_specs.mssql', 'FORCE_LIMIT', 'True', '128'], + ['ClickHouse', 'superset.db_engine_specs.clickhouse', 'FORCE_LIMIT', 'True', 'None'], + ['ClickHouse Connect (Superset)', 'superset.db_engine_specs.clickhouse', 'FORCE_LIMIT', 'True', 'None'], + ['CockroachDB', 'superset.db_engine_specs.cockroachdb', 'FORCE_LIMIT', 'True', '63'], + ['Couchbase', 'superset.db_engine_specs.couchbase', 'FORCE_LIMIT', 'True', 'None'], + ['CrateDB', 'superset.db_engine_specs.crate', 'FORCE_LIMIT', 'True', 'None'], + ['Databend', 'superset.db_engine_specs.databend', 'FORCE_LIMIT', 'True', 'None'], + ['Databricks', 'superset.db_engine_specs.databricks', 'FORCE_LIMIT', 'True', 'None'], + ['Databricks (legacy)', 'superset.db_engine_specs.databricks', 'FORCE_LIMIT', 'True', 'None'], + ['Databricks Interactive Cluster', 'superset.db_engine_specs.databricks', 'FORCE_LIMIT', 'True', '767'], + ['Databricks SQL Endpoint', 'superset.db_engine_specs.databricks', 'FORCE_LIMIT', 'True', 'None'], + ['Denodo', 'superset.db_engine_specs.denodo', 'FORCE_LIMIT', 'True', 'None'], + ['Dremio', 'superset.db_engine_specs.dremio', 'FORCE_LIMIT', 'True', 'None'], + ['DuckDB', 'superset.db_engine_specs.duckdb', 'FORCE_LIMIT', 'True', 'None'], + ['ElasticSearch (OpenDistro SQL)', 'superset.db_engine_specs.elasticsearch', 'FORCE_LIMIT', 'True', 'None'], + ['ElasticSearch (SQL API)', 'superset.db_engine_specs.elasticsearch', 'FORCE_LIMIT', 'True', 'None'], + ['Exasol', 'superset.db_engine_specs.exasol', 'FORCE_LIMIT', 'True', '128'], + ['Firebird', 'superset.db_engine_specs.firebird', 'FETCH_MANY', 'True', 'None'], + ['Firebolt', 'superset.db_engine_specs.firebolt', 'FORCE_LIMIT', 'True', 'None'], + ['Google BigQuery', 'superset.db_engine_specs.bigquery', 'FORCE_LIMIT', 'True', '128'], + ['Google Sheets', 'superset.db_engine_specs.gsheets', 'FORCE_LIMIT', 'True', 'None'], + ['IBM Db2', 'superset.db_engine_specs.db2', 'WRAP_SQL', 'True', '30'], + ['IBM Db2 for i', 'superset.db_engine_specs.ibmi', 'WRAP_SQL', 'True', '128'], + ['IBM Netezza Performance Server', 'superset.db_engine_specs.netezza', 'FORCE_LIMIT', 'True', 'None'], + ['KustoKQL', 'superset.db_engine_specs.kusto', 'FORCE_LIMIT', 'True', 'None'], + ['KustoSQL', 'superset.db_engine_specs.kusto', 'WRAP_SQL', 'True', 'None'], + ['MariaDB', 'superset.db_engine_specs.mariadb', 'FORCE_LIMIT', 'True', '64'], + ['Microsoft SQL Server', 'superset.db_engine_specs.mssql', 'FORCE_LIMIT', 'True', '128'], + ['MotherDuck', 'superset.db_engine_specs.duckdb', 'FORCE_LIMIT', 'True', 'None'], + ['MySQL', 'superset.db_engine_specs.mysql', 'FORCE_LIMIT', 'True', '64'], + ['OceanBase', 'superset.db_engine_specs.oceanbase', 'FORCE_LIMIT', 'True', '128'], + ['Ocient', 'superset.db_engine_specs.ocient', 'FORCE_LIMIT', 'True', '30'], + ['Oracle', 'superset.db_engine_specs.oracle', 'FORCE_LIMIT', 'True', '128'], + ['Parseable', 'superset.db_engine_specs.parseable', 'FORCE_LIMIT', 'True', 'None'], + ['PostgreSQL', 'superset.db_engine_specs.postgres', 'FORCE_LIMIT', 'True', 'None'], + ['Presto', 'superset.db_engine_specs.presto', 'FORCE_LIMIT', 'True', 'None'], + ['RisingWave', 'superset.db_engine_specs.risingwave', 'FORCE_LIMIT', 'True', 
'63'], + ['SAP HANA', 'superset.db_engine_specs.hana', 'WRAP_SQL', 'True', '30'], + ['SQLite', 'superset.db_engine_specs.sqlite', 'FORCE_LIMIT', 'True', 'None'], + ['Shillelagh', 'superset.db_engine_specs.shillelagh', 'FORCE_LIMIT', 'True', 'None'], + ['SingleStore', 'superset.db_engine_specs.singlestore', 'FORCE_LIMIT', 'True', '256'], + ['Snowflake', 'superset.db_engine_specs.snowflake', 'FORCE_LIMIT', 'True', '256'], + ['StarRocks', 'superset.db_engine_specs.starrocks', 'FORCE_LIMIT', 'True', '64'], + ['Superset meta database', 'superset.db_engine_specs.superset', 'FORCE_LIMIT', 'True', 'None'], + ['TDengine', 'superset.db_engine_specs.tdengine', 'FORCE_LIMIT', 'True', '64'], + ['Teradata', 'superset.db_engine_specs.teradata', 'FORCE_LIMIT', 'True', '30'], + ['Trino', 'superset.db_engine_specs.trino', 'FORCE_LIMIT', 'True', 'None'], + ['Vertica', 'superset.db_engine_specs.vertica', 'FORCE_LIMIT', 'True', 'None'], + ['YDB', 'superset.db_engine_specs.ydb', 'FORCE_LIMIT', 'True', 'None'], + ['base', 'superset.db_engine_specs.presto', 'FORCE_LIMIT', 'True', 'None'] +]; + +export const sql_capabilities = [ + ['Database', 'JOINs', 'Subqueries', 'Aliases in SELECT', 'Aliases in ORDER BY', 'CTEs', 'Comments', 'Escaped Colons', 'Inline Time Groupby', 'Source Column When Aliased', 'Aggregations in ORDER BY', 'Expressions in ORDER BY'], + ['Amazon Athena', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'True', 'False'], + ['Amazon DynamoDB', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Amazon Redshift', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Apache Doris', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Apache Drill', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Apache Druid', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Apache Hive', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False'], + ['Apache Impala', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Apache Kylin', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Apache Pinot', 'False', 'False', 'False', 'False', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Apache Solr', 'False', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Apache Spark SQL', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False'], + ['Ascend', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Aurora MySQL (Data API)', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Aurora PostgreSQL (Data API)', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Azure Synapse', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False', 'False', 'True', 'False'], + ['ClickHouse', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False'], + ['ClickHouse Connect (Superset)', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False'], + ['CockroachDB', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Couchbase', 'False', 'False', 
'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['CrateDB', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Databend', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False'], + ['Databricks', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Databricks (legacy)', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Databricks Interactive Cluster', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False'], + ['Databricks SQL Endpoint', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Denodo', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Dremio', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['DuckDB', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['ElasticSearch (OpenDistro SQL)', 'False', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'False'], + ['ElasticSearch (SQL API)', 'False', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'False'], + ['Exasol', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Firebird', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Firebolt', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Google BigQuery', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'True'], + ['Google Sheets', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['IBM Db2', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['IBM Db2 for i', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['IBM Netezza Performance Server', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['KustoKQL', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'False'], + ['KustoSQL', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'False'], + ['MariaDB', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Microsoft SQL Server', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False', 'False', 'True', 'False'], + ['MotherDuck', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['MySQL', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['OceanBase', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Ocient', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Oracle', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Parseable', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['PostgreSQL', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Presto', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False'], + ['RisingWave', 'True', 'True', 
'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['SAP HANA', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['SQLite', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Shillelagh', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['SingleStore', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Snowflake', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['StarRocks', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Superset meta database', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['TDengine', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Teradata', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['Trino', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False'], + ['Vertica', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['YDB', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'], + ['base', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False'] +]; + +export const time_grains_common = [ + ['Database', 'SECOND', 'MINUTE', 'HOUR', 'DAY', 'WEEK', 'MONTH', 'QUARTER', 'YEAR'], + ['Amazon Athena', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Amazon DynamoDB', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Amazon Redshift', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Apache Doris', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Apache Drill', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Apache Druid', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Apache Hive', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Apache Impala', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Apache Kylin', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Apache Pinot', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Apache Solr', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Apache Spark SQL', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Ascend', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Aurora MySQL (Data API)', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Aurora PostgreSQL (Data API)', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Azure Synapse', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['ClickHouse', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['ClickHouse Connect (Superset)', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['CockroachDB', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Couchbase', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True'], + ['CrateDB', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Databend', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Databricks', 'True', 'True', 'True', 'True', 'True', 
'True', 'True', 'True'], + ['Databricks (legacy)', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Databricks Interactive Cluster', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Databricks SQL Endpoint', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Denodo', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Dremio', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['DuckDB', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['ElasticSearch (OpenDistro SQL)', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'True'], + ['ElasticSearch (SQL API)', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True'], + ['Exasol', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Firebird', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'True'], + ['Firebolt', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Google BigQuery', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Google Sheets', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['IBM Db2', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['IBM Db2 for i', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['IBM Netezza Performance Server', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['KustoKQL', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True'], + ['KustoSQL', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['MariaDB', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Microsoft SQL Server', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['MotherDuck', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['MySQL', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['OceanBase', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Ocient', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True'], + ['Oracle', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Parseable', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['PostgreSQL', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Presto', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['RisingWave', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['SAP HANA', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True'], + ['SQLite', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Shillelagh', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['SingleStore', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Snowflake', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['StarRocks', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Superset meta database', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['TDengine', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False'], + ['Teradata', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Trino', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Vertica', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['YDB', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['base', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'] +]; + +export const 
time_grains_extended = [ + ['Database', 'FIVE_SECONDS', 'THIRTY_SECONDS', 'FIVE_MINUTES', 'TEN_MINUTES', 'FIFTEEN_MINUTES', 'THIRTY_MINUTES', 'HALF_HOUR', 'SIX_HOURS', 'WEEK_STARTING_SUNDAY', 'WEEK_STARTING_MONDAY', 'WEEK_ENDING_SATURDAY', 'WEEK_ENDING_SUNDAY', 'QUARTER_YEAR'], + ['Amazon Athena', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'True', 'False', 'False'], + ['Amazon DynamoDB', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'True', 'True', 'True', 'False'], + ['Amazon Redshift', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Apache Doris', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'False', 'False'], + ['Apache Drill', 'False', 'False', 'False', 'False', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Apache Druid', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False', 'True', 'False', 'False'], + ['Apache Hive', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'True', 'False', 'False'], + ['Apache Impala', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Apache Kylin', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Apache Pinot', 'False', 'False', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Apache Solr', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Apache Spark SQL', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'True', 'False', 'False'], + ['Ascend', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Aurora MySQL (Data API)', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'False', 'False'], + ['Aurora PostgreSQL (Data API)', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Azure Synapse', 'False', 'False', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'True', 'False', 'False', 'False'], + ['ClickHouse', 'False', 'False', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['ClickHouse Connect (Superset)', 'False', 'False', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['CockroachDB', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Couchbase', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['CrateDB', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Databend', 'False', 'False', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Databricks', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'True', 'False', 'False'], + ['Databricks (legacy)', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'True', 'False', 'False'], + ['Databricks 
Interactive Cluster', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'True', 'False', 'False'], + ['Databricks SQL Endpoint', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'True', 'False', 'False'], + ['Denodo', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Dremio', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['DuckDB', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['ElasticSearch (OpenDistro SQL)', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['ElasticSearch (SQL API)', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Exasol', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Firebird', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Firebolt', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Google BigQuery', 'False', 'False', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False'], + ['Google Sheets', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['IBM Db2', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['IBM Db2 for i', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['IBM Netezza Performance Server', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['KustoKQL', 'False', 'True', 'True', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['KustoSQL', 'False', 'False', 'True', 'True', 'True', 'False', 'True', 'False', 'True', 'True', 'False', 'False', 'False'], + ['MariaDB', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'False', 'False'], + ['Microsoft SQL Server', 'False', 'False', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'True', 'False', 'False', 'False'], + ['MotherDuck', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['MySQL', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'False', 'False'], + ['OceanBase', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'False', 'False'], + ['Ocient', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True'], + ['Oracle', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Parseable', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['PostgreSQL', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Presto', 'True', 'True', 'True', 'True', 
'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'False'], + ['RisingWave', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['SAP HANA', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['SQLite', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['Shillelagh', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['SingleStore', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Snowflake', 'False', 'False', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['StarRocks', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'False', 'False'], + ['Superset meta database', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True'], + ['TDengine', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Teradata', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Trino', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'False'], + ['Vertica', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['YDB', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['base', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'False'] +]; + +export const metadata_features = [ + ['Database', 'Masked Encrypted Extra', 'Column Type Mappings', 'Function Names', 'File Upload', 'Dynamic Schema', 'Catalog', 'Dynamic Catalog', 'SSH Tunneling', 'Latest Partition', 'Query Cancellation', 'Get Metrics', 'Extra Table Metadata', 'Exception Mapping', 'Custom Errors'], + ['Amazon Athena', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Amazon DynamoDB', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Amazon Redshift', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['Apache Doris', 'False', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['Apache Drill', 'False', 'False', 'False', 'True', 'True', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Apache Druid', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'True', 'False'], + ['Apache Hive', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'False'], + ['Apache Impala', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['Apache Kylin', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Apache Pinot', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 
'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Apache Solr', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Apache Spark SQL', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'False'], + ['Ascend', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['Aurora MySQL (Data API)', 'False', 'True', 'False', 'True', 'True', 'False', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['Aurora PostgreSQL (Data API)', 'False', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['Azure Synapse', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['ClickHouse', 'False', 'True', 'True', 'False', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'True', 'False'], + ['ClickHouse Connect (Superset)', 'False', 'True', 'True', 'False', 'True', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'True', 'False'], + ['CockroachDB', 'False', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['Couchbase', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['CrateDB', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Databend', 'False', 'True', 'True', 'False', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'True', 'False'], + ['Databricks', 'False', 'False', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'True'], + ['Databricks (legacy)', 'False', 'False', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'True'], + ['Databricks Interactive Cluster', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'False'], + ['Databricks SQL Endpoint', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Denodo', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Dremio', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['DuckDB', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['ElasticSearch (OpenDistro SQL)', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['ElasticSearch (SQL API)', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'True', 'False'], + ['Exasol', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Firebird', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Firebolt', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Google BigQuery', 
'False', 'False', 'False', 'True', 'False', 'True', 'True', 'False', 'True', 'False', 'False', 'True', 'True', 'False'], + ['Google Sheets', 'False', 'False', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'False'], + ['IBM Db2', 'False', 'False', 'False', 'True', 'True', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['IBM Db2 for i', 'False', 'False', 'False', 'True', 'True', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['IBM Netezza Performance Server', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['KustoKQL', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'True', 'False'], + ['KustoSQL', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'True', 'False'], + ['MariaDB', 'False', 'True', 'False', 'True', 'True', 'False', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['Microsoft SQL Server', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['MotherDuck', 'False', 'True', 'False', 'True', 'False', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['MySQL', 'False', 'True', 'False', 'True', 'True', 'False', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['OceanBase', 'False', 'True', 'False', 'True', 'True', 'False', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['Ocient', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['Oracle', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Parseable', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['PostgreSQL', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Presto', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'False'], + ['RisingWave', 'False', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['SAP HANA', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['SQLite', 'False', 'False', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Shillelagh', 'False', 'False', 'True', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['SingleStore', 'False', 'True', 'True', 'True', 'True', 'False', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['Snowflake', 'False', 'False', 'False', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['StarRocks', 'False', 'True', 'False', 'True', 'True', 'False', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'False'], + ['Superset meta database', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'False'], + ['TDengine', 'False', 'False', 'False', 'True', 
'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Teradata', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['Trino', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'True', 'True', 'False'], + ['Vertica', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['YDB', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False'], + ['base', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'False'] +]; + + +export const advanced_features = [ + ['Database', 'User Impersonation', 'Expand Data', 'Cost Estimation', 'SQL Validation'], + ['Amazon Athena', 'False', 'False', 'False', 'False'], + ['Amazon DynamoDB', 'False', 'False', 'False', 'False'], + ['Amazon Redshift', 'False', 'False', 'False', 'False'], + ['Apache Doris', 'False', 'False', 'False', 'False'], + ['Apache Drill', 'True', 'False', 'False', 'False'], + ['Apache Druid', 'False', 'False', 'False', 'False'], + ['Apache Hive', 'True', 'True', 'True', 'False'], + ['Apache Impala', 'False', 'False', 'False', 'False'], + ['Apache Kylin', 'False', 'False', 'False', 'False'], + ['Apache Pinot', 'False', 'False', 'False', 'False'], + ['Apache Solr', 'False', 'False', 'False', 'False'], + ['Apache Spark SQL', 'True', 'True', 'True', 'False'], + ['Ascend', 'False', 'False', 'False', 'False'], + ['Aurora MySQL (Data API)', 'False', 'False', 'False', 'False'], + ['Aurora PostgreSQL (Data API)', 'False', 'False', 'True', 'True'], + ['Azure Synapse', 'False', 'False', 'False', 'False'], + ['ClickHouse', 'False', 'False', 'False', 'False'], + ['ClickHouse Connect (Superset)', 'False', 'False', 'False', 'False'], + ['CockroachDB', 'False', 'False', 'True', 'False'], + ['Couchbase', 'False', 'False', 'False', 'False'], + ['CrateDB', 'False', 'False', 'False', 'False'], + ['Databend', 'False', 'False', 'False', 'False'], + ['Databricks', 'False', 'False', 'False', 'False'], + ['Databricks (legacy)', 'False', 'False', 'False', 'False'], + ['Databricks Interactive Cluster', 'True', 'True', 'True', 'False'], + ['Databricks SQL Endpoint', 'False', 'False', 'False', 'False'], + ['Denodo', 'False', 'False', 'False', 'False'], + ['Dremio', 'False', 'False', 'False', 'False'], + ['DuckDB', 'False', 'False', 'False', 'False'], + ['ElasticSearch (OpenDistro SQL)', 'False', 'False', 'False', 'False'], + ['ElasticSearch (SQL API)', 'False', 'False', 'False', 'False'], + ['Exasol', 'False', 'False', 'False', 'False'], + ['Firebird', 'False', 'False', 'False', 'False'], + ['Firebolt', 'False', 'False', 'False', 'False'], + ['Google BigQuery', 'False', 'False', 'True', 'False'], + ['Google Sheets', 'True', 'False', 'False', 'False'], + ['IBM Db2', 'False', 'False', 'False', 'False'], + ['IBM Db2 for i', 'False', 'False', 'False', 'False'], + ['IBM Netezza Performance Server', 'False', 'False', 'False', 'False'], + ['KustoKQL', 'False', 'False', 'False', 'False'], + ['KustoSQL', 'False', 'False', 'False', 'False'], + ['MariaDB', 'False', 'False', 'False', 'False'], + ['Microsoft SQL Server', 'False', 'False', 'False', 'False'], + ['MotherDuck', 'False', 'False', 'False', 'False'], + ['MySQL', 'False', 'False', 'False', 'False'], + ['OceanBase', 'False', 'False', 'False', 'False'], + ['Ocient', 'False', 'False', 
'False', 'False'],
+  ['Oracle', 'False', 'False', 'False', 'False'],
+  ['Parseable', 'False', 'False', 'False', 'False'],
+  ['PostgreSQL', 'False', 'False', 'False', 'False'],
+  ['Presto', 'True', 'True', 'True', 'True'],
+  ['RisingWave', 'False', 'False', 'True', 'False'],
+  ['SAP HANA', 'False', 'False', 'False', 'False'],
+  ['SQLite', 'False', 'False', 'False', 'False'],
+  ['Shillelagh', 'False', 'False', 'False', 'False'],
+  ['SingleStore', 'False', 'False', 'False', 'False'],
+  ['Snowflake', 'False', 'False', 'False', 'False'],
+  ['StarRocks', 'True', 'False', 'False', 'False'],
+  ['Superset meta database', 'False', 'False', 'False', 'False'],
+  ['TDengine', 'False', 'False', 'False', 'False'],
+  ['Teradata', 'False', 'False', 'False', 'False'],
+  ['Trino', 'True', 'False', 'True', 'False'],
+  ['Vertica', 'False', 'False', 'False', 'False'],
+  ['YDB', 'False', 'False', 'False', 'False'],
+  ['base', 'False', 'False', 'True', 'False']
+];
+
+
+import DatabaseTable from '../../src/components/DatabaseTable';
+
+
+# Database engine specifications
+
+Superset uses [SQLAlchemy](https://www.sqlalchemy.org/) as an abstraction layer for running queries and fetching metadata from tables (like column names and types). Unfortunately, while SQLAlchemy offers enough functionality to allow connecting Superset to dozens of databases, there are still implementation details that differ across them. Because of this, Superset has an additional abstraction on top of SQLAlchemy, called a "database engine specification" or, simply, "DB engine spec".
+
+DB engine specs were created initially because there's no SQL standard for computing aggregations at different time grains. For example, to compute a daily metric in Trino or Postgres we could run a query like this:
+
+```sql
+SELECT
+  date_trunc('day', CAST(time_column AS TIMESTAMP)) AS day,
+  COUNT(*) AS metric
+FROM
+  some_table
+GROUP BY
+  1
+```
+
+For MySQL, instead of using the `date_trunc` function, we would need to write:
+
+```sql
+SELECT
+  DATE(time_column) AS day,
+  COUNT(*) AS metric
+FROM
+  some_table
+GROUP BY
+  1
+```
+
+Over time, more and more functionality was added to DB engine specs, including validating SQL, estimating the cost of queries before they are run, and understanding the semantics of error messages. These are all described in detail in this document, and in the tables below you can see a summary of what features are supported by each database.
+
+Note that DB engine specs are completely optional. Superset can connect to any database supported by SQLAlchemy (or 3rd party dialects) even if there's no DB engine spec associated with it. But DB engine specs greatly improve the experience of working with a database in Superset.
+
+## Features
+
+The tables below (generated via `python superset/db_engine_specs/lib.py`) summarize the status of all DB engine specs in Superset, organized by feature category for easier navigation (note that this excludes 3rd party DB engine specs).
+
+
+
+### Quick Navigation
+
+- [Feature Overview](#feature-overview) - High-level summary of support across all databases
+- [Database Information](#database-information) - Module paths and core metadata
+- [SQL Capabilities](#sql-capabilities) - SQL language features and capabilities
+- [Time Grains – Common](#time-grains--common) - Standard time granularity support
+- [Time Grains – Extended](#time-grains--extended) - Sub-hour and week variant time grains
+- [Core Platform & Metadata Features](#core-platform--metadata-features) - Platform integration and metadata capabilities
+- [Operational & Advanced Features](#operational--advanced-features) - Advanced operational capabilities
+
+### Feature Overview
+
+
+### Database Information
+
+
+
+### SQL Capabilities
+
+
+
+### Time Grains – Common
+
+
+
+### Time Grains – Extended
+
+
+### Core Platform & Metadata Features
+
+Integration with platform features and metadata handling.
+
+
+
+### Operational & Advanced Features
+
+
+
+## Database information
+
+A DB engine spec has attributes that describe the underlying database engine, so that Superset can know how to build and run queries. For example, some databases don't support subqueries, which are needed for some of the queries produced by Superset for certain charts. When a database doesn't support subqueries, the query is run in two steps, using the results from the first query to build the second query.
+
+These attributes and their default values (set in the base class, `BaseEngineSpec`) are described below:
+
+### `limit_method = LimitMethod.FORCE_LIMIT`
+
+When running user queries in SQL Lab, Superset needs to limit the number of rows returned. The reason for that is cost and performance: there's no point in running a query that produces millions of rows when they can't be loaded into the browser.
+
+For most databases this is done by parsing the user-submitted query and applying a limit, if one is not present, or replacing the existing limit if it's larger. This is called the `FORCE_LIMIT` method, and is the most efficient, since the database will produce at most the number of rows that Superset will display.
+
+For some databases this method might not work, and they can use the `WRAP_SQL` method, which wraps the original query in a `SELECT *` and applies a limit via the SQLAlchemy dialect, which should get translated to the correct syntax. This method might be inefficient, since the database optimizer might not be able to push the limit to the inner query.
+
+Finally, as a last resort there is the `FETCH_MANY` method. When a DB engine spec uses this method the query runs unmodified, but Superset fetches only a certain number of rows from the cursor. It's possible that a database using this method can optimize the query execution and compute rows as they are being read by the cursor, but it's unlikely. This makes this method the least efficient of the three.
+
+Note that when Superset runs a query with a given limit, say 100, it always modifies the query to request one additional row (`LIMIT 101`, in this case). This extra row is dropped before the results are returned to the user, but it allows Superset to inform the user that the query was indeed limited. Otherwise, a query with `LIMIT 100` that returns exactly 100 rows would seem like it was limited, when in fact it was not.
+
+### `allows_joins = True`
+
+Not all databases support `JOIN`s.
+When building complex charts, Superset will try to join the table to itself in order to compute `top_n` groups, for example. If the database doesn't support joins, Superset will instead run a prequery, and use the results to build the final query.
+
+### `allows_subqueries = True`
+
+Similarly, not all databases support subqueries. For more complex charts Superset will build subqueries if possible, or run the query in two steps otherwise.
+
+### `allows_alias_in_select = True`
+
+Does the DB support aliases in the projection of a query, eg:
+
+```sql
+SELECT COUNT(*) AS cnt
+```
+
+Superset will try to use aliases whenever possible, in order to give friendly names to expressions.
+
+### `allows_alias_in_orderby = True`
+
+Does the DB support referencing aliases in the `GROUP BY`, eg:
+
+```sql
+SELECT
+  UPPER(country_of_origin) AS country,
+  COUNT(*) AS cnt
+FROM
+  some_table
+GROUP BY
+  country
+```
+
+Otherwise the query is written as:
+
+```sql
+SELECT
+  UPPER(country_of_origin) AS country,
+  COUNT(*) AS cnt
+FROM
+  some_table
+GROUP BY
+  UPPER(country_of_origin)
+```
+
+### `time_groupby_inline = False`
+
+In theory this attribute should be used to omit time filters from the self-joins. When the attribute is false the time filter will be present in the subquery used to compute limited series, eg:
+
+```sql
+SELECT DATE_TRUNC('day', ts) AS ts,
+       team AS team,
+       COUNT(*) AS count
+FROM public.threads
+JOIN
+  (SELECT team AS team__,
+          COUNT(*) AS mme_inner__
+   FROM public.threads
+   -- this is added when `time_groupby_inline = False`
+   WHERE ts >= TO_TIMESTAMP('2022-07-27 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US')
+     AND ts < TO_TIMESTAMP('2023-07-27 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US')
+   --
+   GROUP BY team
+   ORDER BY mme_inner__ DESC
+   LIMIT 5) AS anon_1 ON team = team__
+WHERE ts >= TO_TIMESTAMP('2022-07-27 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US')
+  AND ts < TO_TIMESTAMP('2023-07-27 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US')
+GROUP BY DATE_TRUNC('day', ts),
+         team
+ORDER BY count DESC
+LIMIT 10000;
+```
+
+In practice, the attribute doesn't seem to be working as of 2023-07-27.
+
+### `allows_alias_to_source_column = True`
+
+When this is true, the database allows queries where an alias can overshadow existing column names. For example, in this query:
+
+```sql
+SELECT
+  foo + 1 AS foo
+FROM
+  some_table
+ORDER BY
+  foo -- references the alias `foo + 1`, not the column `foo`
+```
+
+### `allows_hidden_orderby_agg = True`
+
+If set to true, the database allows aggregations in the `ORDER BY` that are not present in the projection (`SELECT`), eg:
+
+```sql
+SELECT
+  country,
+  COUNT(*)
+FROM
+  some_table
+GROUP BY
+  country
+ORDER BY
+  SUM(population) -- not present in the `SELECT`
+```
+
+### `allows_hidden_cc_in_orderby = False`
+
+This is the opposite of `allows_alias_in_orderby`, for databases that require aliases in the `ORDER BY`.
+For example, BigQuery doesn't like this query:
+
+```sql
+SELECT
+  CASE
+    WHEN type = 'feature' THEN 'f'
+    WHEN type = 'bug' THEN 'b'
+    ELSE 'o'
+  END AS cc_type
+FROM
+  some_table
+GROUP BY
+  cc_type
+ORDER BY
+  CASE
+    WHEN type = 'feature' THEN 'f'
+    WHEN type = 'bug' THEN 'b'
+    ELSE 'o'
+  END
+```
+
+Instead, it must be written as:
+
+```sql
+SELECT
+  CASE
+    WHEN type = 'feature' THEN 'f'
+    WHEN type = 'bug' THEN 'b'
+    ELSE 'o'
+  END AS cc_type
+FROM
+  some_table
+GROUP BY
+  cc_type
+ORDER BY
+  cc_type
+```
+
+### `allows_cte_in_subquery = True`
+
+When a virtual dataset is used in a chart the original query is converted into a subquery, and is wrapped in an outer query that is generated based on the chart controls. The virtual dataset query might have a CTE, and some databases don't like subqueries with CTEs in them.
+
+When this attribute is false Superset will extract the CTE and move it outside of the subquery when generating SQL for charts. The name of the new CTE will be `cte_alias`, also defined in the DB engine spec.
+
+### `allow_limit_clause = True`
+
+Allows for the `LIMIT` clause. Otherwise, the database probably uses `TOP` to limit rows.
+
+### `max_column_name_length: int | None = None`
+
+Most databases have a well-defined limit for the maximum length of a column name (SQLite is probably the one exception). While this can be set to (and defaults to) `None`, it's highly recommended to set a value to prevent errors.
+
+### `allows_sql_comments = True`
+
+Are comments supported in the DB? In general, SQL comments are defined by double dashes:
+
+```sql
+-- this is a comment
+SELECT * -- we need everything
+FROM some_table
+```
+
+### `allows_escaped_colons = True`
+
+SQLAlchemy recommends escaping colons to prevent them from being interpreted as bindings to parameters. Because of this, when building queries from virtual datasets Superset will escape all colons with `\:`.
+
+This works for most databases except Athena. The `allows_escaped_colons` attribute specifies if the database supports escaped colons.
+
+## Basic features
+
+These are features that all DB engine specs should support, as the name suggests. They provide a much better experience for the user.
+
+### Time grains
+
+The most basic feature that DB engine specs need to support is defining time grain expressions. These are dialect-specific SQL expressions that are used to compute metrics on a given time grain when building charts. For example, when computing the metric `COUNT(*)` on a daily basis, Superset will generate the following query:
+
+```sql
+SELECT
+  <time grain expression>,
+  COUNT(*)
+...
+GROUP BY
+  <time grain expression>
+```
+
+For some databases with support for `DATE_TRUNC` or `TIME_FLOOR` this is easy. Here's how Apache Druid computes 15 minute aggregations:
+
+```sql
+TIME_FLOOR(CAST({col} AS TIMESTAMP), 'PT15M')
+```
+
+Where `{col}` is the time column being aggregated — the expression is actually a Jinja2 template. Druid uses the ISO standard for durations, with `PT15M` representing 15 minutes.
+
+On the other hand, here's the same for SQLite:
+
+```sql
+DATETIME(
+  STRFTIME(
+    '%Y-%m-%dT%H:%M:00',
+    {col}
+  ),
+  printf(
+    '-%d minutes',
+    CAST(strftime('%M', {col}) AS INT) % 15
+  )
+)
+```
+
+The SQLite version has to truncate the column down to the minute, and then subtract a number of minutes equal to the modulo 15.
+
+Time grain expressions are defined in the `_time_grain_expressions` class attribute, which maps from a `superset.constants.TimeGrain` to the SQL expression.
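+
+For illustration, here is a minimal sketch of what such a mapping might look like for a Postgres-like dialect. The class name is hypothetical and only a couple of grains are shown; the actual set of grains and expressions will vary per database:
+
+```python
+from superset.constants import TimeGrain
+from superset.db_engine_specs.base import BaseEngineSpec
+
+
+class MyEngineSpec(BaseEngineSpec):  # hypothetical example
+    _time_grain_expressions = {
+        None: "{col}",  # no time grain: use the column as-is
+        TimeGrain.DAY: "DATE_TRUNC('day', {col})",
+        TimeGrain.MONTH: "DATE_TRUNC('month', {col})",
+    }
+```
+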
+The dictionary has a special key `None`, which should map to the column directly, for when no time grain is specified.
+
+Note that it's possible to add new time grains via configuration. For example, if you want to add a "2 seconds" time grain to your installation you can add it to `TIME_GRAIN_ADDONS`, and implement it in `TIME_GRAIN_ADDON_EXPRESSIONS`:
+
+```python
+# superset_config.py
+TIME_GRAIN_ADDONS = {"PT2S": "2 second"}
+
+TIME_GRAIN_ADDON_EXPRESSIONS = {
+    "clickhouse": {
+        "PT2S": "toDateTime(intDiv(toUInt32(toDateTime({col})), 2)*2)",
+    }
+}
+```
+
+### Column type mapping
+
+Column type mapping, defined in the `column_type_mappings` class attribute, is just a way of mapping type names from the database to types Superset understands. The default values in `BaseEngineSpec` are sane:
+
+```python
+_default_column_type_mappings: tuple[ColumnTypeMapping, ...] = (
+    (
+        re.compile(r"^string", re.IGNORECASE),
+        types.String(),
+        GenericDataType.STRING,
+    ),
+    (
+        re.compile(r"^float", re.IGNORECASE),
+        types.Float(),
+        GenericDataType.NUMERIC,
+    ),
+    (
+        re.compile(r"^date", re.IGNORECASE),
+        types.Date(),
+        GenericDataType.TEMPORAL,
+    ),
+    (
+        re.compile(r"^bool(ean)?", re.IGNORECASE),
+        types.Boolean(),
+        GenericDataType.BOOLEAN,
+    ),
+    ...
+)
+```
+
+But you might want to implement more specific types in the DB engine spec, or complex types. For example, for MSSQL we have:
+
+```python
+from sqlalchemy.dialects.mssql.base import SMALLDATETIME
+
+class MssqlEngineSpec(BaseEngineSpec):
+    ...
+    column_type_mappings = (
+        (
+            re.compile(r"^smalldatetime.*", re.IGNORECASE),
+            SMALLDATETIME(),
+            GenericDataType.TEMPORAL,
+        ),
+    )
+```
+
+### Function names
+
+DB engine specs should implement a class method called `get_function_names` that returns a list of strings, representing all the function names that the database supports. This is used for autocomplete in SQL Lab.
+
+### Masked encrypted extra
+
+Superset does a good job of keeping credentials secure. When you add a database with a password, for example:
+
+```text
+postgresql://admin:password123@db.example.org:5432/db
+```
+
+The password is sent over the network only when the database is created. When you edit the database later, Superset will return this as the SQLAlchemy URI:
+
+```text
+postgresql://admin:XXXXXXXXXX@db.example.org:5432/db
+```
+
+The password will be masked in the API response; it's not just masked in the browser UI. This is done in order to avoid sending the password unnecessarily over the network. Also, if a non-admin user has access to the API response, they won't be able to know the database password.
+
+When the database is edited, the Superset backend is smart enough to replace the masked password with the actual password, unless the password has changed. That is, if you change the database in the URI from `db` to `db2` the SQLAlchemy URI will be stored in the backend as:
+
+```text
+postgresql://admin:password123@db.example.org:5432/db2
+```
+
+The password is not the only piece of information where security is critical. For many databases (like BigQuery), sensitive information is stored in the credentials JSON payload.
+For example:
+
+```json
+{
+  "type": "service_account",
+  "project_id": "dbt-tutorial-347100",
+  "private_key_id": "4bc71f06990c864a590fad8b94be6a5904fc171f",
+  "private_key": "",
+  "client_email": "dbt-user-278@dbt-tutorial-347100.iam.gserviceaccount.com",
+  "client_id": "115666988796889519425",
+  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+  "token_uri": "https://oauth2.googleapis.com/token",
+  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/dbt-user-278%40dbt-tutorial-347100.iam.gserviceaccount.com"
+}
+```
+
+Similarly to the password, we don't want to send the `private_key` to the client when a database is edited; the Superset API should never return its actual contents. Instead, Superset should return a masked value, and users should be able to edit the JSON without having to type in the `private_key` on every edit.
+
+To do this, DB engine specs can implement two methods, `mask_encrypted_extra` and `unmask_encrypted_extra`. They have these names because the credentials are stored in an encrypted column called `encrypted_extra`. Here's what these methods look like for BigQuery:
+
+```python
+import json
+
+from superset.constants import PASSWORD_MASK
+
+
+class BigQueryEngineSpec(BaseEngineSpec):
+
+    @classmethod
+    def mask_encrypted_extra(cls, encrypted_extra: str | None) -> str | None:
+        if encrypted_extra is None:
+            return encrypted_extra
+
+        try:
+            config = json.loads(encrypted_extra)
+        except (json.JSONDecodeError, TypeError):
+            return encrypted_extra
+
+        try:
+            config["credentials_info"]["private_key"] = PASSWORD_MASK
+        except KeyError:
+            pass
+
+        return json.dumps(config)
+
+    @classmethod
+    def unmask_encrypted_extra(
+        cls,
+        old: str | None,
+        new: str | None
+    ) -> str | None:
+        if old is None or new is None:
+            return new
+
+        try:
+            old_config = json.loads(old)
+            new_config = json.loads(new)
+        except (TypeError, json.JSONDecodeError):
+            return new
+
+        if "credentials_info" not in new_config:
+            return new
+
+        if "private_key" not in new_config["credentials_info"]:
+            return new
+
+        if new_config["credentials_info"]["private_key"] == PASSWORD_MASK:
+            new_config["credentials_info"]["private_key"] = old_config[
+                "credentials_info"
+            ]["private_key"]
+
+        return json.dumps(new_config)
+```
+
+This way, when a user edits an existing BigQuery connection, the `private_key` is shown as `XXXXXXXXXX`. Everything else in the JSON is still displayed, and the user can change any of the fields without having to provide the private key.
+
+Note that while this is a basic feature that should be implemented for security reasons, it only makes sense in DB engine specs that use `encrypted_extra` to store connection information.
+
+## Nice to have features
+
+The next set of features are nice to have. They don't apply to all databases, and are not strictly needed for security or usability.
+
+### User impersonation
+
+In general there's no user-level granularity when accessing a database in Superset. A single database connection is shared by all users who have access to that database. There are many use cases when this is not desirable, and some databases implement mechanisms in which they can **impersonate users**, potentially reducing the scope of permissions available to run the query.
+
+For example, the Google Sheets DB engine spec implements this via the `get_url_for_impersonation` class method:
+
+```python
+class GSheetsEngineSpec(ShillelaghEngineSpec):
+
+    @classmethod
+    def get_url_for_impersonation(
+        cls,
+        url: URL,
+        impersonate_user: bool,
+        username: str | None,
+        access_token: str | None,
+    ) -> URL:
+        if impersonate_user and username is not None:
+            user = security_manager.find_user(username=username)
+            if user and user.email:
+                url = url.update_query_dict({"subject": user.email})
+
+        return url
+```
+
+The method `get_url_for_impersonation` updates the SQLAlchemy URI before every query. In this particular case, it will fetch the user's email and add it to the `subject` query argument. The driver will then lower the permissions to match that given user. This allows the connection to be configured with a service account that has access to all the spreadsheets, while giving users access to only the spreadsheets they own or have been shared with them (or with their organization — Google will handle the authorization in this case, not Superset).
+
+Alternatively, it's also possible to impersonate users by implementing the `update_impersonation_config` method. This is a class method which modifies `connect_args` in place. You can use either method, and ideally they [should be consolidated in a single one](https://github.com/apache/superset/issues/24910).
+
+### OAuth2
+
+Support for authenticating to a database using personal OAuth2 access tokens was introduced in [SIP-85](https://github.com/apache/superset/issues/20300). The Google Sheets DB engine spec is the reference implementation.
+
+Note that this API is still experimental and evolving quickly, subject to breaking changes. Currently, to add support for OAuth2 to a DB engine spec, the following attributes are needed:
+
+```python
+class BaseEngineSpec:
+
+    supports_oauth2 = True
+    oauth2_exception = OAuth2RedirectError
+
+    oauth2_scope = " ".join([
+        "https://example.org/scope1",
+        "https://example.org/scope2",
+    ])
+    oauth2_authorization_request_uri = "https://example.org/authorize"
+    oauth2_token_request_uri = "https://example.org/token"
+```
+
+The `oauth2_exception` is an exception that is raised by `cursor.execute` when OAuth2 is needed. This will start the OAuth2 dance when `BaseEngineSpec.execute` is called, by returning the custom error `OAUTH2_REDIRECT` to the frontend. If the database driver doesn't have a specific exception, it might be necessary to overload the `execute` method in the DB engine spec, so that the `BaseEngineSpec.start_oauth2_dance` method gets called whenever OAuth2 is needed.
+
+The DB engine spec should implement logic in either `get_url_for_impersonation` or `update_impersonation_config` to update the connection with the personal access token. See the Google Sheets DB engine spec for a reference implementation.
+
+Currently OAuth2 needs to be configured at the DB engine spec level, ie, with one client for each DB engine spec.
+The configuration lives in `superset_config.py`:
+
+```python
+# superset_config.py
+DATABASE_OAUTH2_CLIENTS = {
+    "Google Sheets": {
+        "id": "XXX.apps.googleusercontent.com",
+        "secret": "GOCSPX-YYY",
+        "scope": " ".join(
+            [
+                "https://www.googleapis.com/auth/drive.readonly",
+                "https://www.googleapis.com/auth/spreadsheets",
+                "https://spreadsheets.google.com/feeds",
+            ],
+        ),
+        "authorization_request_uri": "https://accounts.google.com/o/oauth2/v2/auth",
+        "token_request_uri": "https://oauth2.googleapis.com/token",
+    },
+}
+DATABASE_OAUTH2_JWT_ALGORITHM = "HS256"
+DATABASE_OAUTH2_REDIRECT_URI = "http://localhost:8088/api/v1/database/oauth2/"
+DATABASE_OAUTH2_TIMEOUT = timedelta(seconds=30)
+```
+
+When configuring a client only the ID and secret are required; the DB engine spec should have default values for the scope and endpoints. The `DATABASE_OAUTH2_REDIRECT_URI` attribute is optional, and defaults to `/api/v1/database/oauth2/` in Superset.
+
+In the future we plan to support adding custom clients via the Superset UI, and being able to manually assign clients to specific databases.
+
+### File upload
+
+When a DB engine spec supports file upload it declares so via the `supports_file_upload` class attribute. The base class implementation is very generic and should work for any database that has support for `CREATE TABLE`. It leverages Pandas and the [`df_to_sql`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html) method.
+
+For some databases the `df_to_sql` classmethod needs to be overridden. For example, for BigQuery the DB engine spec implements a custom method that uses the [`to_gbq`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html) method.
+
+### Extra table metadata
+
+DB engine specs can return additional metadata associated with a table. This is done via the `get_extra_table_metadata` class method. Trino uses this to return information about the latest partition, for example, and BigQuery returns clustering information. This information is then surfaced in the SQL Lab UI, when browsing tables in the metadata explorer (on the left panel).
+
+### DB API exception mapping
+
+Different DB API 2.0 drivers implement different exceptions, even if they have the same name. The `get_dbapi_exception_mapping` class method returns a dictionary mapping these custom exceptions to Superset exceptions, so that Superset can return more specific errors when an exception is raised by the underlying driver.
+
+For example, for ClickHouse we have:
+
+```python
+from urllib3.exceptions import NewConnectionError
+
+from superset.db_engine_specs.exceptions import SupersetDBAPIDatabaseError
+
+
+class ClickHouseEngineSpec(ClickHouseBaseEngineSpec):
+
+    @classmethod
+    def get_dbapi_exception_mapping(cls) -> dict[type[Exception], type[Exception]]:
+        return {NewConnectionError: SupersetDBAPIDatabaseError}
+```
+
+This way, if the ClickHouse driver raises a `NewConnectionError` it would get wrapped in a `SupersetDBAPIDatabaseError`.
+
+### Custom errors
+
+Queries can fail in many different ways. For example, in SQLite:
+
+```sql
+sqlite> CREATE TABLE a (b INT);
+sqlite> SELECT c FROM a;
+Error: no such column: c
+sqlite>
+```
+
+When a query fails, Superset will return the message, "Error: no such column: c", to the user as a generic error.
+
+Since ideally we want to return specific and actionable error messages, DB engine specs can implement methods that map error messages to more specific errors.
+For example, the SQLite DB engine spec defines:
+
+```python
+COLUMN_DOES_NOT_EXIST_REGEX = re.compile("no such column: (?P<column_name>.+)")
+
+
+class SqliteEngineSpec(BaseEngineSpec):
+
+    custom_errors: dict[Pattern[str], tuple[str, SupersetErrorType, dict[str, Any]]] = {
+        COLUMN_DOES_NOT_EXIST_REGEX: (
+            __('We can\'t seem to resolve the column "%(column_name)s"'),
+            SupersetErrorType.COLUMN_DOES_NOT_EXIST_ERROR,
+            {},
+        ),
+    }
+```
+
+This way, when a user selects a column that doesn't exist Superset can return a more informative error.
+
+### Dynamic schema
+
+In SQL Lab it's possible to select a database, and then a schema in that database. Ideally, when running a query in SQL Lab, any unqualified table names (eg, `table`, instead of `schema.table`) should be in the selected schema. For example, if the user selects `dev` as the schema and then runs the following query:
+
+```sql
+SELECT * FROM my_table
+```
+
+The table `my_table` should live in the `dev` schema. In order to do that, it's necessary to modify the SQLAlchemy URI before running the query. Since different databases have different ways of doing that, this functionality is implemented via the `adjust_engine_params` class method. The method receives the SQLAlchemy URI and `connect_args`, as well as the schema in which the query should run. It then returns a potentially modified URI and `connect_args` to ensure that the query runs in the specified schema.
+
+When a DB engine spec implements `adjust_engine_params` it should have the class attribute `supports_dynamic_schema` set to true. This is critical for security, since **it allows Superset to know to which schema any unqualified table names belong**. For example, in the query above, if the database supports dynamic schema, Superset would check to see if the user running the query has access to `dev.my_table`. On the other hand, if the database doesn't support dynamic schema, Superset would use the default database schema instead of `dev`.
+
+Implementing this method is also important for usability. When the method is not implemented selecting the schema in SQL Lab has no effect on the schema in which the query runs, resulting in confusing results when using unqualified table names.
+
+### Catalog
+
+In general, databases support a hierarchy of one-to-many concepts:
+
+1. Database
+2. Catalog
+3. Namespace
+4. Table
+5. Column
+
+These concepts have different names depending on the database. For example, Postgres uses the following terminology:
+
+1. Cluster (database)
+2. Database (catalog)
+3. Schema (namespace)
+4. Table
+5. Column
+
+BigQuery, on the other hand:
+
+1. BigQuery (database)
+2. Project (catalog)
+3. Schema (namespace)
+4. Table
+5. Column
+
+Hive and Trino:
+
+1. Database
+2. Catalog
+3. Schema
+4. Table
+5. Column
+
+If the database supports catalogs, then the DB engine spec should have the `supports_catalog` class attribute set to true. It should also implement the `get_default_catalog` method, so that the proper permissions can be created when datasets are added.
+
+### Dynamic catalog
+
+Superset supports multiple catalogs. Since, in general, a given SQLAlchemy URI connects only to a single catalog, this requires DB engine specs to implement the `adjust_engine_params` method to rewrite the URL to connect to a different catalog, similar to how dynamic schemas work. Additionally, DB engine specs should implement the `get_catalog_names` method, so that users can browse the available catalogs.
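+
+As a rough sketch only, here is what catalog switching might look like for a Postgres-like database, where the catalog maps to the `database` component of the SQLAlchemy URI. The class name is hypothetical and the exact method signature may differ between Superset versions:
+
+```python
+from typing import Any
+
+from sqlalchemy.engine.url import URL
+
+from superset.db_engine_specs.base import BaseEngineSpec
+
+
+class MyPostgresLikeSpec(BaseEngineSpec):  # hypothetical example
+    supports_dynamic_schema = True
+    supports_catalog = True
+
+    @classmethod
+    def adjust_engine_params(
+        cls,
+        uri: URL,
+        connect_args: dict[str, Any],
+        catalog: str | None = None,
+        schema: str | None = None,
+    ) -> tuple[URL, dict[str, Any]]:
+        # Point the connection at the requested catalog, which for a
+        # Postgres-like engine is the database component of the URI.
+        if catalog:
+            uri = uri.set(database=catalog)
+        return uri, connect_args
+```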
+
+### SSH tunneling
+
+Superset can connect to databases via an SSH tunnel. For databases where this doesn't make sense (eg, SQLite or BigQuery) the DB engine spec should have `disable_ssh_tunneling` set to true.
+
+### Query cancelation
+
+Superset will try to cancel running queries if the user asks for it, but it's up to the DB engine spec to handle this.
+
+Some databases have an implicit query cancelation. When a cursor stops being polled the database will cancel the query. For databases that behave like this, the class method `has_implicit_cancel` (which should really be a class attribute) should return true.
+
+For other databases, DB engine specs can implement query cancelation via the `prepare_cancel_query` and `cancel_query` methods. Implementation of query cancelation is usually heavily dependent on the database, but the DB engine specs that support it can serve as an example.
+
+### Get metrics on dataset creation
+
+When a physical dataset is first created, the `get_metrics` class method is called on the table. The base implementation returns the `COUNT(*)` metric, but DB engine specs can override `get_metrics` to return other metrics. This method is useful for semantic layers that contain their own metrics definitions; when Superset connects to them it can automatically create those metrics when a dataset is added.
+
+This feature is still experimental, and ideally there would be a mechanism for calling it periodically or when a dataset is explored, in order to sync new metric definitions to the dataset.
+
+### `WHERE` on latest partition
+
+In some databases, running `SELECT *` can be a **very expensive** operation, since the query might scan all partitions for a given table. Because of that, some DB engine specs implement the `where_latest_partition` method, which returns a modified SQLAlchemy query with an additional predicate that filters on the latest partition.
+
+## Advanced features
+
+### Expand complex types
+
+Some databases will visually expand complex types (arrays and structures) when displaying results from queries. For example, the BigQuery UI is able to expand objects into columns and arrays into rows, so that this:
+
+| array     | struct             |
+| --------- | ------------------ |
+| [1, 2, 3] | `{a: one, b: two}` |
+
+Is shown as:
+
+| array | struct             | struct.a | struct.b |
+| ----- | ------------------ | -------- | -------- |
+| 1     | `{a: one, b: two}` | one      | two      |
+
+A similar behavior has been implemented in Superset for Presto, and can be enabled via the `PRESTO_EXPAND_DATA` feature flag. To implement this feature a DB engine spec should implement the `expand_data` method, which takes the columns and rows and returns modified columns and rows.
+
+Note that despite being implemented only for Presto, this behavior has nothing that is Presto-specific, and in theory could be implemented in a generic way for all databases without requiring custom DB engine spec implementations (that is, the Presto `expand_data` method could be moved to the base class, after being cleaned up, and we could then enable the feature per DB in the configuration).
+
+### Query cost estimation
+
+Some databases allow users to estimate the cost of running a query before running it. This is done via the `estimate_query_cost` method in DB engine specs, which receives the SQL and returns a list of "costs". The definition of what "cost" is varies from database to database (in the few that support this functionality), and it can be formatted via the `query_cost_formatter`.
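+
+As a sketch only, a formatter might turn the raw estimates into human-readable key/value pairs. The class name and the `rows`/`bytes` keys below are assumptions for illustration, not a real driver contract; check what `estimate_query_cost` actually returns for your engine:
+
+```python
+from typing import Any
+
+from superset.db_engine_specs.base import BaseEngineSpec
+
+
+class MyEngineSpec(BaseEngineSpec):  # hypothetical example
+
+    @classmethod
+    def query_cost_formatter(
+        cls, raw_cost: list[dict[str, Any]]
+    ) -> list[dict[str, str]]:
+        # Turn each raw estimate into labeled, human-readable values;
+        # the "rows" and "bytes" keys are assumed, not guaranteed.
+        return [
+            {
+                "Estimated rows": str(estimate.get("rows", "unknown")),
+                "Estimated bytes": str(estimate.get("bytes", "unknown")),
+            }
+            for estimate in raw_cost
+        ]
+```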
+
-The `query_cost_formatter` can be overridden with an arbitrary function via the config `QUERY_COST_FORMATTERS_BY_ENGINE`. This allows custom deployments of Superset to format the results in different ways. For example, at some point in Lyft the cost for running Presto queries would also show the carbon footprint (in trees).
+
+### SQL validation
+
+A few databases support validating the syntax of the SQL as the user is typing it, indicating any errors in SQL Lab. This is usually done using an `EXPLAIN` query and, because it gets called every few seconds as the user types, it's important that the database returns the result quickly.
+
+This is currently implemented for Presto and Postgres, via custom classes in `superset/sql_validators` that should be enabled in the configuration. Implementing this as custom classes, instead of a `validate_sql` method in the DB engine spec, offers no advantages, and ideally in the future we should move the logic to DB engine specs.
+
+## Testing DB engine specs
+
+Superset has a command to test the connection to a given database, as well as checking if the SQLAlchemy dialect implements all necessary methods used by Superset, and checking which features are supported by the DB engine spec (if one exists). To run the tool just call the `test-db` command with the SQLAlchemy URI to be tested:
+
+```bash
+superset test-db sqlite://
+```
+
+If the connection needs additional arguments they can be passed when the command runs.
diff --git a/docs/docs/configuration/db_features.mdx b/docs/docs/configuration/db_features.mdx
deleted file mode 100644
index e1d07bd5e1a4..000000000000
--- a/docs/docs/configuration/db_features.mdx
+++ /dev/null
@@ -1,6 +0,0 @@
----
-title: Database Features
-hide_title: true
-sidebar_position: 1
-version: 1
----
diff --git a/docs/docs/using-superset/db_features.mdx b/docs/docs/using-superset/db_features.mdx
deleted file mode 100644
index 4c6a185caa7c..000000000000
--- a/docs/docs/using-superset/db_features.mdx
+++ /dev/null
@@ -1,1211 +0,0 @@
----
-title: Database Features
-hide_title: false
-sidebar_position: 3
-version: 1
----
-
-
-# Database engine specifications
-
-Superset uses [SQLAlchemy](https://www.sqlalchemy.org/) as an abstraction layer for running queries and fetching metadata from tables (like column names and types). Unfortunately, while SQLAlchemy offers enough functionality to allow connecting Superset to dozens of databases, there are still implementation details that differ across them. Because of this, Superset has an additional abstraction on top of SQLAlchemy, called a "database engine specification" or, simply, "DB engine spec".
-
-DB engine specs were created initially because there's no SQL standard for computing aggregations at different time grains. For example, to compute a daily metric in Trino or Postgres we could run a query like this:
-
-```sql
-SELECT
-  date_trunc('day', CAST(time_column) AS TIMESTAMP) AS day,
-  COUNT(*) AS metric
-FROM
-  some_table
-GROUP BY
-  1
-```
-
-For MySQL, instead of using the `date_trunc` function, we would need to write:
-
-```sql
-SELECT
-  DATE(time_column) AS day,
-  COUNT(*) AS metric
-FROM
-  some_table
-GROUP BY
-  1
-```
-
-Over time, more and more functionality was added to DB engine specs, including validating SQL, estimating the cost of queries before they are run, and understanding the semantics of error messages. These are all described in detail in this document, and in the table below you can see a summary of what features are supported by each database.
- -Note that DB engine specs are completely optional. Superset can connect to any database supported by SQLAlchemy (or 3rd party dialects) even if there's no DB engine spec associated with it. But DB engine specs greatly improve the experience of working with a database in Superset. - -## Features - -The tables below (generated via `python superset/db_engine_specs/lib.py`) summarize the status of all DB engine specs in Superset, organized by feature category for easier navigation (note that this excludes 3rd party DB engine specs). - -### Quick Navigation - -- [Feature Overview](#feature-overview) - High-level summary of support across all databases -- [Database Information](#database-information) - Module paths and core metadata -- [SQL Capabilities](#sql-capabilities) - SQL language features and capabilities -- [Time Grains – Common](#time-grains--common) - Standard time granularity support -- [Time Grains – Extended](#time-grains--extended) - Sub-hour and week variant time grains -- [Core Platform & Metadata Features](#core-platform--metadata-features) - Platform integration and metadata capabilities -- [Operational & Advanced Features](#operational--advanced-features) - Advanced operational capabilities - -### Feature Overview - -| Database | Score | SQL Basics | Advanced SQL | Common Time Grains | Extended Time Grains | Integrations | Advanced Features | -| --- | --- | --- | --- | --- | --- | --- | --- | -| Presto | 159 | Supported | Partial | Supported | Partial | Partial | Supported | -| Trino | 149 | Supported | Partial | Supported | Partial | Partial | Partial | -| Apache Hive | 140 | Supported | Not supported | Supported | Partial | Partial | Partial | -| Apache Spark SQL | 140 | Supported | Not supported | Supported | Partial | Partial | Partial | -| Databricks Interactive Cluster | 140 | Supported | Not supported | Supported | Partial | Partial | Partial | -| base | 109 | Supported | Partial | Supported | Partial | Partial | Partial | -| Aurora PostgreSQL (Data API) | 104 | Supported | Partial | Supported | Partial | Partial | Partial | -| CockroachDB | 94 | Supported | Partial | Supported | Partial | Partial | Partial | -| RisingWave | 94 | Supported | Partial | Supported | Partial | Partial | Partial | -| Google BigQuery | 83 | Supported | Partial | Supported | Partial | Partial | Partial | -| Apache Doris | 79 | Supported | Partial | Supported | Partial | Partial | Not supported | -| Snowflake | 72 | Supported | Partial | Supported | Partial | Partial | Not supported | -| Databricks | 70 | Supported | Partial | Supported | Partial | Partial | Not supported | -| Databricks (legacy) | 70 | Supported | Partial | Supported | Partial | Partial | Not supported | -| StarRocks | 69 | Supported | Partial | Supported | Partial | Partial | Partial | -| SingleStore | 68 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| ClickHouse Connect (Superset) | 61 | Supported | Partial | Partial | Partial | Partial | Not supported | -| Google Sheets | 61 | Supported | Partial | Supported | Supported | Partial | Partial | -| Aurora MySQL (Data API) | 59 | Supported | Partial | Supported | Partial | Partial | Not supported | -| MariaDB | 59 | Supported | Partial | Supported | Partial | Partial | Not supported | -| MySQL | 59 | Supported | Partial | Supported | Partial | Partial | Not supported | -| OceanBase | 59 | Supported | Partial | Supported | Partial | Partial | Not supported | -| MotherDuck | 58 | Supported | Partial | Supported | Not supported | Partial | Not 
supported | -| KustoSQL | 54 | Supported | Partial | Supported | Partial | Partial | Not supported | -| ClickHouse | 51 | Supported | Partial | Partial | Partial | Partial | Not supported | -| Databend | 51 | Supported | Partial | Supported | Partial | Partial | Not supported | -| Apache Drill | 50 | Supported | Partial | Supported | Partial | Partial | Partial | -| Apache Druid | 47 | Partial | Partial | Supported | Partial | Partial | Not supported | -| Amazon Redshift | 44 | Supported | Partial | Supported | Partial | Partial | Not supported | -| Azure Synapse | 44 | Partial | Partial | Supported | Partial | Partial | Not supported | -| Microsoft SQL Server | 44 | Partial | Partial | Supported | Partial | Partial | Not supported | -| SQLite | 41 | Supported | Partial | Supported | Supported | Not supported | Not supported | -| Shillelagh | 41 | Supported | Partial | Supported | Supported | Not supported | Not supported | -| KustoKQL | 40 | Supported | Partial | Partial | Partial | Partial | Not supported | -| Ascend | 38 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| DuckDB | 38 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| IBM Db2 | 38 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| IBM Db2 for i | 38 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| Ocient | 38 | Partial | Partial | Partial | Partial | Partial | Not supported | -| Apache Impala | 37 | Supported | Partial | Partial | Not supported | Partial | Not supported | -| ElasticSearch (SQL API) | 37 | Partial | Partial | Partial | Not supported | Partial | Not supported | -| PostgreSQL | 34 | Supported | Partial | Supported | Partial | Partial | Not supported | -| Vertica | 34 | Supported | Partial | Supported | Partial | Partial | Not supported | -| Amazon DynamoDB | 32 | Supported | Partial | Supported | Partial | Partial | Not supported | -| Apache Pinot | 32 | Partial | Partial | Supported | Partial | Partial | Not supported | -| Superset meta database | 31 | Supported | Partial | Supported | Supported | Not supported | Not supported | -| Databricks SQL Endpoint | 30 | Supported | Partial | Supported | Partial | Partial | Not supported | -| Apache Kylin | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| CrateDB | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| Dremio | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| Exasol | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| Firebolt | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| IBM Netezza Performance Server | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| Oracle | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| Parseable | 28 | Supported | Partial | Supported | Not supported | Partial | Not supported | -| Couchbase | 27 | Partial | Partial | Partial | Not supported | Partial | Not supported | -| Denodo | 27 | Supported | Partial | Partial | Not supported | Partial | Not supported | -| SAP HANA | 27 | Supported | Partial | Partial | Not supported | Partial | Not supported | -| Teradata | 27 | Supported | Partial | Partial | Not supported | Partial | Not supported | -| ElasticSearch (OpenDistro SQL) | 26 | Partial | Partial | Partial | Not supported | Partial | Not supported | -| Firebird | 26 | 
Supported | Partial | Partial | Not supported | Partial | Not supported | -| TDengine | 25 | Supported | Partial | Partial | Not supported | Partial | Not supported | -| YDB | 23 | Supported | Partial | Supported | Partial | Partial | Not supported | -| Amazon Athena | 20 | Supported | Partial | Supported | Partial | Not supported | Not supported | -| Apache Solr | 20 | Partial | Partial | Not supported | Not supported | Partial | Not supported | - -### Database Information - -| Database | Module | Limit Method | Limit Clause | Max Column Name | -| --- | --- | --- | --- | --- | -| Amazon Athena | superset.db_engine_specs.athena | FORCE_LIMIT | True | None | -| Amazon DynamoDB | superset.db_engine_specs.dynamodb | FORCE_LIMIT | True | None | -| Amazon Redshift | superset.db_engine_specs.redshift | FORCE_LIMIT | True | 127 | -| Apache Doris | superset.db_engine_specs.doris | FORCE_LIMIT | True | 64 | -| Apache Drill | superset.db_engine_specs.drill | FORCE_LIMIT | True | None | -| Apache Druid | superset.db_engine_specs.druid | FORCE_LIMIT | True | None | -| Apache Hive | superset.db_engine_specs.hive | FORCE_LIMIT | True | 767 | -| Apache Impala | superset.db_engine_specs.impala | FORCE_LIMIT | True | None | -| Apache Kylin | superset.db_engine_specs.kylin | FORCE_LIMIT | True | None | -| Apache Pinot | superset.db_engine_specs.pinot | FORCE_LIMIT | True | None | -| Apache Solr | superset.db_engine_specs.solr | FORCE_LIMIT | True | None | -| Apache Spark SQL | superset.db_engine_specs.spark | FORCE_LIMIT | True | 767 | -| Ascend | superset.db_engine_specs.ascend | FORCE_LIMIT | True | None | -| Aurora MySQL (Data API) | superset.db_engine_specs.aurora | FORCE_LIMIT | True | 64 | -| Aurora PostgreSQL (Data API) | superset.db_engine_specs.aurora | FORCE_LIMIT | True | 63 | -| Azure Synapse | superset.db_engine_specs.mssql | FORCE_LIMIT | True | 128 | -| ClickHouse | superset.db_engine_specs.clickhouse | FORCE_LIMIT | True | None | -| ClickHouse Connect (Superset) | superset.db_engine_specs.clickhouse | FORCE_LIMIT | True | None | -| CockroachDB | superset.db_engine_specs.cockroachdb | FORCE_LIMIT | True | 63 | -| Couchbase | superset.db_engine_specs.couchbase | FORCE_LIMIT | True | None | -| CrateDB | superset.db_engine_specs.crate | FORCE_LIMIT | True | None | -| Databend | superset.db_engine_specs.databend | FORCE_LIMIT | True | None | -| Databricks | superset.db_engine_specs.databricks | FORCE_LIMIT | True | None | -| Databricks (legacy) | superset.db_engine_specs.databricks | FORCE_LIMIT | True | None | -| Databricks Interactive Cluster | superset.db_engine_specs.databricks | FORCE_LIMIT | True | 767 | -| Databricks SQL Endpoint | superset.db_engine_specs.databricks | FORCE_LIMIT | True | None | -| Denodo | superset.db_engine_specs.denodo | FORCE_LIMIT | True | None | -| Dremio | superset.db_engine_specs.dremio | FORCE_LIMIT | True | None | -| DuckDB | superset.db_engine_specs.duckdb | FORCE_LIMIT | True | None | -| ElasticSearch (OpenDistro SQL) | superset.db_engine_specs.elasticsearch | FORCE_LIMIT | True | None | -| ElasticSearch (SQL API) | superset.db_engine_specs.elasticsearch | FORCE_LIMIT | True | None | -| Exasol | superset.db_engine_specs.exasol | FORCE_LIMIT | True | 128 | -| Firebird | superset.db_engine_specs.firebird | FETCH_MANY | True | None | -| Firebolt | superset.db_engine_specs.firebolt | FORCE_LIMIT | True | None | -| Google BigQuery | superset.db_engine_specs.bigquery | FORCE_LIMIT | True | 128 | -| Google Sheets | superset.db_engine_specs.gsheets | FORCE_LIMIT | 
True | None | -| IBM Db2 | superset.db_engine_specs.db2 | WRAP_SQL | True | 30 | -| IBM Db2 for i | superset.db_engine_specs.ibmi | WRAP_SQL | True | 128 | -| IBM Netezza Performance Server | superset.db_engine_specs.netezza | FORCE_LIMIT | True | None | -| KustoKQL | superset.db_engine_specs.kusto | FORCE_LIMIT | True | None | -| KustoSQL | superset.db_engine_specs.kusto | WRAP_SQL | True | None | -| MariaDB | superset.db_engine_specs.mariadb | FORCE_LIMIT | True | 64 | -| Microsoft SQL Server | superset.db_engine_specs.mssql | FORCE_LIMIT | True | 128 | -| MotherDuck | superset.db_engine_specs.duckdb | FORCE_LIMIT | True | None | -| MySQL | superset.db_engine_specs.mysql | FORCE_LIMIT | True | 64 | -| OceanBase | superset.db_engine_specs.oceanbase | FORCE_LIMIT | True | 128 | -| Ocient | superset.db_engine_specs.ocient | FORCE_LIMIT | True | 30 | -| Oracle | superset.db_engine_specs.oracle | FORCE_LIMIT | True | 128 | -| Parseable | superset.db_engine_specs.parseable | FORCE_LIMIT | True | None | -| PostgreSQL | superset.db_engine_specs.postgres | FORCE_LIMIT | True | None | -| Presto | superset.db_engine_specs.presto | FORCE_LIMIT | True | None | -| RisingWave | superset.db_engine_specs.risingwave | FORCE_LIMIT | True | 63 | -| SAP HANA | superset.db_engine_specs.hana | WRAP_SQL | True | 30 | -| SQLite | superset.db_engine_specs.sqlite | FORCE_LIMIT | True | None | -| Shillelagh | superset.db_engine_specs.shillelagh | FORCE_LIMIT | True | None | -| SingleStore | superset.db_engine_specs.singlestore | FORCE_LIMIT | True | 256 | -| Snowflake | superset.db_engine_specs.snowflake | FORCE_LIMIT | True | 256 | -| StarRocks | superset.db_engine_specs.starrocks | FORCE_LIMIT | True | 64 | -| Superset meta database | superset.db_engine_specs.superset | FORCE_LIMIT | True | None | -| TDengine | superset.db_engine_specs.tdengine | FORCE_LIMIT | True | 64 | -| Teradata | superset.db_engine_specs.teradata | FORCE_LIMIT | True | 30 | -| Trino | superset.db_engine_specs.trino | FORCE_LIMIT | True | None | -| Vertica | superset.db_engine_specs.vertica | FORCE_LIMIT | True | None | -| YDB | superset.db_engine_specs.ydb | FORCE_LIMIT | True | None | -| base | superset.db_engine_specs.presto | FORCE_LIMIT | True | None | - -### SQL Capabilities - -| Database | JOINs | Subqueries | Aliases in SELECT | Aliases in ORDER BY | CTEs | Comments | Escaped Colons | Inline Time Groupby | Source Column When Aliased | Aggregations in ORDER BY | Expressions in ORDER BY | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| Amazon Athena | True | True | True | True | True | True | False | False | False | True | False | -| Amazon DynamoDB | True | True | True | True | True | True | True | False | False | True | False | -| Amazon Redshift | True | True | True | True | True | True | True | False | False | True | False | -| Apache Doris | True | True | True | True | True | True | True | False | False | True | False | -| Apache Drill | True | True | True | True | True | True | True | False | False | True | False | -| Apache Druid | False | True | True | True | True | True | True | False | False | True | False | -| Apache Hive | True | True | True | True | True | True | True | False | False | False | False | -| Apache Impala | True | True | True | True | True | True | True | False | False | True | False | -| Apache Kylin | True | True | True | True | True | True | True | False | False | True | False | -| Apache Pinot | False | False | False | False | True | True | True | False | False | True | False 
| -| Apache Solr | False | False | True | True | True | True | True | False | False | True | False | -| Apache Spark SQL | True | True | True | True | True | True | True | False | False | False | False | -| Ascend | True | True | True | True | True | True | True | False | False | True | False | -| Aurora MySQL (Data API) | True | True | True | True | True | True | True | False | False | True | False | -| Aurora PostgreSQL (Data API) | True | True | True | True | True | True | True | False | False | True | False | -| Azure Synapse | True | True | True | True | False | True | True | False | False | True | False | -| ClickHouse | True | True | True | True | True | True | True | True | False | True | False | -| ClickHouse Connect (Superset) | True | True | True | True | True | True | True | True | False | True | False | -| CockroachDB | True | True | True | True | True | True | True | False | False | True | False | -| Couchbase | False | False | True | True | True | True | True | False | False | True | False | -| CrateDB | True | True | True | True | True | True | True | False | False | True | False | -| Databend | True | True | True | True | True | True | True | True | False | True | False | -| Databricks | True | True | True | True | True | True | True | False | False | True | False | -| Databricks (legacy) | True | True | True | True | True | True | True | False | False | True | False | -| Databricks Interactive Cluster | True | True | True | True | True | True | True | False | False | False | False | -| Databricks SQL Endpoint | True | True | True | True | True | True | True | False | False | True | False | -| Denodo | True | True | True | True | True | True | True | False | False | True | False | -| Dremio | True | True | True | True | True | True | True | False | False | True | False | -| DuckDB | True | True | True | True | True | True | True | False | False | True | False | -| ElasticSearch (OpenDistro SQL) | False | True | True | True | True | False | True | True | False | True | False | -| ElasticSearch (SQL API) | False | True | True | True | True | False | True | True | False | True | False | -| Exasol | True | True | True | True | True | True | True | False | False | True | False | -| Firebird | True | True | True | True | True | True | True | False | False | True | False | -| Firebolt | True | True | True | True | True | True | True | False | False | True | False | -| Google BigQuery | True | True | True | True | True | True | True | False | False | True | True | -| Google Sheets | True | True | True | True | True | True | True | False | False | True | False | -| IBM Db2 | True | True | True | True | True | True | True | False | False | True | False | -| IBM Db2 for i | True | True | True | True | True | True | True | False | False | True | False | -| IBM Netezza Performance Server | True | True | True | True | True | True | True | False | False | True | False | -| KustoKQL | True | True | True | True | True | False | True | True | False | True | False | -| KustoSQL | True | True | True | True | True | False | True | True | False | True | False | -| MariaDB | True | True | True | True | True | True | True | False | False | True | False | -| Microsoft SQL Server | True | True | True | True | False | True | True | False | False | True | False | -| MotherDuck | True | True | True | True | True | True | True | False | False | True | False | -| MySQL | True | True | True | True | True | True | True | False | False | True | False | -| OceanBase | True | True | True | True | True | True 
| True | False | False | True | False | -| Ocient | True | True | True | True | False | True | True | False | False | True | False | -| Oracle | True | True | True | True | True | True | True | False | False | True | False | -| Parseable | True | True | True | True | True | True | True | False | False | True | False | -| PostgreSQL | True | True | True | True | True | True | True | False | False | True | False | -| Presto | True | True | True | True | True | True | True | False | True | True | False | -| RisingWave | True | True | True | True | True | True | True | False | False | True | False | -| SAP HANA | True | True | True | True | True | True | True | False | False | True | False | -| SQLite | True | True | True | True | True | True | True | False | False | True | False | -| Shillelagh | True | True | True | True | True | True | True | False | False | True | False | -| SingleStore | True | True | True | True | True | True | True | False | False | True | False | -| Snowflake | True | True | True | True | True | True | True | False | False | True | False | -| StarRocks | True | True | True | True | True | True | True | False | False | True | False | -| Superset meta database | True | True | True | True | True | True | True | False | False | True | False | -| TDengine | True | True | True | True | True | True | True | False | False | True | False | -| Teradata | True | True | True | True | True | True | True | False | False | True | False | -| Trino | True | True | True | True | True | True | True | False | True | True | False | -| Vertica | True | True | True | True | True | True | True | False | False | True | False | -| YDB | True | True | True | True | True | True | True | False | False | True | False | -| base | True | True | True | True | True | True | True | False | False | True | False | - -### Time Grains – Common - -| Database | SECOND | MINUTE | HOUR | DAY | WEEK | MONTH | QUARTER | YEAR | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | -| Amazon Athena | True | True | True | True | True | True | True | True | -| Amazon DynamoDB | True | True | True | True | True | True | True | True | -| Amazon Redshift | True | True | True | True | True | True | True | True | -| Apache Doris | True | True | True | True | True | True | True | True | -| Apache Drill | True | True | True | True | True | True | True | True | -| Apache Druid | True | True | True | True | True | True | True | True | -| Apache Hive | True | True | True | True | True | True | True | True | -| Apache Impala | False | True | True | True | True | True | True | True | -| Apache Kylin | True | True | True | True | True | True | True | True | -| Apache Pinot | True | True | True | True | True | True | True | True | -| Apache Solr | False | False | False | False | False | False | False | False | -| Apache Spark SQL | True | True | True | True | True | True | True | True | -| Ascend | True | True | True | True | True | True | True | True | -| Aurora MySQL (Data API) | True | True | True | True | True | True | True | True | -| Aurora PostgreSQL (Data API) | True | True | True | True | True | True | True | True | -| Azure Synapse | True | True | True | True | True | True | True | True | -| ClickHouse | False | True | True | True | True | True | True | True | -| ClickHouse Connect (Superset) | False | True | True | True | True | True | True | True | -| CockroachDB | True | True | True | True | True | True | True | True | -| Couchbase | True | True | True | True | False | True | True | True | -| CrateDB | True | True 
| True | True | True | True | True | True | -| Databend | True | True | True | True | True | True | True | True | -| Databricks | True | True | True | True | True | True | True | True | -| Databricks (legacy) | True | True | True | True | True | True | True | True | -| Databricks Interactive Cluster | True | True | True | True | True | True | True | True | -| Databricks SQL Endpoint | True | True | True | True | True | True | True | True | -| Denodo | False | True | True | True | True | True | True | True | -| Dremio | True | True | True | True | True | True | True | True | -| DuckDB | True | True | True | True | True | True | True | True | -| ElasticSearch (OpenDistro SQL) | True | True | True | True | False | True | False | True | -| ElasticSearch (SQL API) | True | True | True | True | True | True | False | True | -| Exasol | True | True | True | True | True | True | True | True | -| Firebird | True | True | True | True | False | True | False | True | -| Firebolt | True | True | True | True | True | True | True | True | -| Google BigQuery | True | True | True | True | True | True | True | True | -| Google Sheets | True | True | True | True | True | True | True | True | -| IBM Db2 | True | True | True | True | True | True | True | True | -| IBM Db2 for i | True | True | True | True | True | True | True | True | -| IBM Netezza Performance Server | True | True | True | True | True | True | True | True | -| KustoKQL | True | True | True | True | True | True | False | True | -| KustoSQL | True | True | True | True | True | True | True | True | -| MariaDB | True | True | True | True | True | True | True | True | -| Microsoft SQL Server | True | True | True | True | True | True | True | True | -| MotherDuck | True | True | True | True | True | True | True | True | -| MySQL | True | True | True | True | True | True | True | True | -| OceanBase | True | True | True | True | True | True | True | True | -| Ocient | True | True | True | True | True | True | False | True | -| Oracle | True | True | True | True | True | True | True | True | -| Parseable | True | True | True | True | True | True | True | True | -| PostgreSQL | True | True | True | True | True | True | True | True | -| Presto | True | True | True | True | True | True | True | True | -| RisingWave | True | True | True | True | True | True | True | True | -| SAP HANA | True | True | True | True | False | True | True | True | -| SQLite | True | True | True | True | True | True | True | True | -| Shillelagh | True | True | True | True | True | True | True | True | -| SingleStore | True | True | True | True | True | True | True | True | -| Snowflake | True | True | True | True | True | True | True | True | -| StarRocks | True | True | True | True | True | True | True | True | -| Superset meta database | True | True | True | True | True | True | True | True | -| TDengine | True | True | True | True | True | False | False | False | -| Teradata | False | True | True | True | True | True | True | True | -| Trino | True | True | True | True | True | True | True | True | -| Vertica | True | True | True | True | True | True | True | True | -| YDB | True | True | True | True | True | True | True | True | -| base | True | True | True | True | True | True | True | True | - -### Time Grains – Extended - -| Database | FIVE_SECONDS | THIRTY_SECONDS | FIVE_MINUTES | TEN_MINUTES | FIFTEEN_MINUTES | THIRTY_MINUTES | HALF_HOUR | SIX_HOURS | WEEK_STARTING_SUNDAY | WEEK_STARTING_MONDAY | WEEK_ENDING_SATURDAY | WEEK_ENDING_SUNDAY | QUARTER_YEAR | -| --- | --- 
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| Amazon Athena | False | False | False | False | False | False | False | False | True | False | True | False | False | -| Amazon DynamoDB | False | False | False | False | False | False | False | False | True | True | True | True | False | -| Amazon Redshift | True | True | True | True | True | True | False | False | False | False | False | False | False | -| Apache Doris | False | False | False | False | False | False | False | False | False | True | False | False | False | -| Apache Drill | False | False | False | False | True | True | False | False | False | False | False | False | False | -| Apache Druid | True | True | True | True | True | True | False | True | True | False | True | False | False | -| Apache Hive | False | False | False | False | False | False | False | False | True | False | True | False | False | -| Apache Impala | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Apache Kylin | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Apache Pinot | False | False | True | True | True | True | False | False | False | False | False | False | False | -| Apache Solr | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Apache Spark SQL | False | False | False | False | False | False | False | False | True | False | True | False | False | -| Ascend | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Aurora MySQL (Data API) | False | False | False | False | False | False | False | False | False | True | False | False | False | -| Aurora PostgreSQL (Data API) | True | True | True | True | True | True | False | False | False | False | False | False | False | -| Azure Synapse | False | False | True | True | True | True | False | False | True | True | False | False | False | -| ClickHouse | False | False | True | True | True | True | False | False | False | False | False | False | False | -| ClickHouse Connect (Superset) | False | False | True | True | True | True | False | False | False | False | False | False | False | -| CockroachDB | True | True | True | True | True | True | False | False | False | False | False | False | False | -| Couchbase | False | False | False | False | False | False | False | False | False | False | False | False | False | -| CrateDB | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Databend | False | False | True | True | True | False | False | False | False | False | False | False | False | -| Databricks | False | False | False | False | False | False | False | False | True | False | True | False | False | -| Databricks (legacy) | False | False | False | False | False | False | False | False | True | False | True | False | False | -| Databricks Interactive Cluster | False | False | False | False | False | False | False | False | True | False | True | False | False | -| Databricks SQL Endpoint | False | False | False | False | False | False | False | False | True | False | True | False | False | -| Denodo | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Dremio | False | False | False | False | False | False | False | False | False | False | False | False | False | -| DuckDB | False | False | False | False | False | False | False | False | 
False | False | False | False | False | -| ElasticSearch (OpenDistro SQL) | False | False | False | False | False | False | False | False | False | False | False | False | False | -| ElasticSearch (SQL API) | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Exasol | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Firebird | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Firebolt | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Google BigQuery | False | False | True | True | True | True | False | False | False | True | False | False | False | -| Google Sheets | True | True | True | True | True | True | True | True | True | True | True | True | True | -| IBM Db2 | False | False | False | False | False | False | False | False | False | False | False | False | False | -| IBM Db2 for i | False | False | False | False | False | False | False | False | False | False | False | False | False | -| IBM Netezza Performance Server | False | False | False | False | False | False | False | False | False | False | False | False | False | -| KustoKQL | False | True | True | False | False | True | False | False | False | False | False | False | False | -| KustoSQL | False | False | True | True | True | False | True | False | True | True | False | False | False | -| MariaDB | False | False | False | False | False | False | False | False | False | True | False | False | False | -| Microsoft SQL Server | False | False | True | True | True | True | False | False | True | True | False | False | False | -| MotherDuck | False | False | False | False | False | False | False | False | False | False | False | False | False | -| MySQL | False | False | False | False | False | False | False | False | False | True | False | False | False | -| OceanBase | False | False | False | False | False | False | False | False | False | True | False | False | False | -| Ocient | False | False | False | False | False | False | False | False | False | False | False | False | True | -| Oracle | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Parseable | False | False | False | False | False | False | False | False | False | False | False | False | False | -| PostgreSQL | True | True | True | True | True | True | False | False | False | False | False | False | False | -| Presto | True | True | True | True | True | False | True | True | True | True | True | True | False | -| RisingWave | True | True | True | True | True | True | False | False | False | False | False | False | False | -| SAP HANA | False | False | False | False | False | False | False | False | False | False | False | False | False | -| SQLite | True | True | True | True | True | True | True | True | True | True | True | True | True | -| Shillelagh | True | True | True | True | True | True | True | True | True | True | True | True | True | -| SingleStore | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Snowflake | False | False | True | True | True | True | False | False | False | False | False | False | False | -| StarRocks | False | False | False | False | False | False | False | False | False | True | False | False | False | -| Superset meta database | True | True | True | True | True | True | True | True | True | True 
| True | True | True | -| TDengine | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Teradata | False | False | False | False | False | False | False | False | False | False | False | False | False | -| Trino | True | True | True | True | True | False | True | True | True | True | True | True | False | -| Vertica | True | True | True | True | True | True | False | False | False | False | False | False | False | -| YDB | False | True | True | True | True | True | False | False | False | False | False | False | False | -| base | True | True | True | True | True | False | True | True | True | True | True | True | False | - -### Core Platform & Metadata Features - - -Integration with platform features and metadata handling. - -| Database | Masked Encrypted Extra | Column Type Mappings | Function Names | File Upload | Dynamic Schema | Catalog | Dynamic Catalog | SSH Tunneling | Latest Partition | Query Cancellation | Get Metrics | Extra Table Metadata | Exception Mapping | Custom Errors | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| Amazon Athena | False | False | False | True | False | False | False | False | False | False | False | False | False | False | -| Amazon DynamoDB | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Amazon Redshift | False | False | False | True | False | False | False | True | False | True | False | False | False | False | -| Apache Doris | False | True | False | True | True | True | True | True | False | True | False | False | False | False | -| Apache Drill | False | False | False | True | True | False | False | True | False | False | False | False | False | False | -| Apache Druid | False | False | False | True | False | False | False | True | False | False | False | False | True | False | -| Apache Hive | False | True | True | True | True | True | True | True | True | True | False | True | False | False | -| Apache Impala | False | False | False | True | False | False | False | True | False | True | False | False | False | False | -| Apache Kylin | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Apache Pinot | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Apache Solr | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Apache Spark SQL | False | True | True | True | True | True | True | True | True | True | False | True | False | False | -| Ascend | False | False | False | True | False | False | False | True | False | True | False | False | False | False | -| Aurora MySQL (Data API) | False | True | False | True | True | False | False | True | False | True | False | False | False | False | -| Aurora PostgreSQL (Data API) | False | True | False | True | True | True | True | True | False | True | False | False | False | False | -| Azure Synapse | False | True | False | True | False | False | False | True | False | False | False | False | False | False | -| ClickHouse | False | True | True | False | False | False | False | True | False | False | False | False | True | False | -| ClickHouse Connect (Superset) | False | True | True | False | True | False | False | True | False | False | False | False | True | False | -| CockroachDB | False | True | False | True | True | True | True | True | False | True | 
False | False | False | False | -| Couchbase | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| CrateDB | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Databend | False | True | True | False | False | False | False | True | False | False | False | False | True | False | -| Databricks | False | False | False | True | True | True | True | True | False | False | False | False | False | True | -| Databricks (legacy) | False | False | False | True | True | True | True | True | False | False | False | False | False | True | -| Databricks Interactive Cluster | False | True | True | True | True | True | True | True | True | True | False | True | False | False | -| Databricks SQL Endpoint | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Denodo | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Dremio | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| DuckDB | False | True | False | True | False | False | False | True | False | False | False | False | False | False | -| ElasticSearch (OpenDistro SQL) | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| ElasticSearch (SQL API) | False | False | False | True | False | False | False | True | False | False | False | False | True | False | -| Exasol | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Firebird | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Firebolt | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Google BigQuery | False | False | False | True | False | True | True | False | True | False | False | True | True | False | -| Google Sheets | False | False | True | True | False | False | False | False | False | False | False | True | False | False | -| IBM Db2 | False | False | False | True | True | False | False | True | False | False | False | False | False | False | -| IBM Db2 for i | False | False | False | True | True | False | False | True | False | False | False | False | False | False | -| IBM Netezza Performance Server | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| KustoKQL | False | False | False | True | False | False | False | True | False | False | False | False | True | False | -| KustoSQL | False | True | False | True | False | False | False | True | False | False | False | False | True | False | -| MariaDB | False | True | False | True | True | False | False | True | False | True | False | False | False | False | -| Microsoft SQL Server | False | True | False | True | False | False | False | True | False | False | False | False | False | False | -| MotherDuck | False | True | False | True | False | True | True | True | False | False | False | False | False | False | -| MySQL | False | True | False | True | True | False | False | True | False | True | False | False | False | False | -| OceanBase | False | True | False | True | True | False | False | True | False | True | False | False | False | False | -| Ocient | False | False | False | True | False | False | False | True | False | True | 
False | False | False | False | -| Oracle | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Parseable | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| PostgreSQL | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Presto | False | True | True | True | True | True | True | True | True | True | False | True | False | False | -| RisingWave | False | True | False | True | True | True | True | True | False | True | False | False | False | False | -| SAP HANA | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| SQLite | False | False | True | True | False | False | False | False | False | False | False | False | False | False | -| Shillelagh | False | False | True | True | False | False | False | False | False | False | False | False | False | False | -| SingleStore | False | True | True | True | True | False | False | True | False | True | False | False | False | False | -| Snowflake | False | False | False | True | True | True | True | True | False | True | False | False | False | False | -| StarRocks | False | True | False | True | True | False | False | True | False | True | False | False | False | False | -| Superset meta database | False | False | True | False | False | False | False | False | False | False | False | False | False | False | -| TDengine | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Teradata | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| Trino | False | True | True | True | True | True | True | True | True | True | False | True | True | False | -| Vertica | False | False | False | True | False | False | False | True | False | False | False | False | False | False | -| YDB | False | False | False | False | False | False | False | True | False | False | False | False | False | False | -| base | False | True | True | True | True | True | True | True | True | False | False | False | False | False | - -### Operational & Advanced Features - -| Database | User Impersonation | Expand Data | Cost Estimation | SQL Validation | -| --- | --- | --- | --- | --- | -| Amazon Athena | False | False | False | False | -| Amazon DynamoDB | False | False | False | False | -| Amazon Redshift | False | False | False | False | -| Apache Doris | False | False | False | False | -| Apache Drill | True | False | False | False | -| Apache Druid | False | False | False | False | -| Apache Hive | True | True | True | False | -| Apache Impala | False | False | False | False | -| Apache Kylin | False | False | False | False | -| Apache Pinot | False | False | False | False | -| Apache Solr | False | False | False | False | -| Apache Spark SQL | True | True | True | False | -| Ascend | False | False | False | False | -| Aurora MySQL (Data API) | False | False | False | False | -| Aurora PostgreSQL (Data API) | False | False | True | True | -| Azure Synapse | False | False | False | False | -| ClickHouse | False | False | False | False | -| ClickHouse Connect (Superset) | False | False | False | False | -| CockroachDB | False | False | True | False | -| Couchbase | False | False | False | False | -| CrateDB | False | False | False | False | -| Databend | False | False | False | False | -| Databricks | False | False | False 
| False | -| Databricks (legacy) | False | False | False | False | -| Databricks Interactive Cluster | True | True | True | False | -| Databricks SQL Endpoint | False | False | False | False | -| Denodo | False | False | False | False | -| Dremio | False | False | False | False | -| DuckDB | False | False | False | False | -| ElasticSearch (OpenDistro SQL) | False | False | False | False | -| ElasticSearch (SQL API) | False | False | False | False | -| Exasol | False | False | False | False | -| Firebird | False | False | False | False | -| Firebolt | False | False | False | False | -| Google BigQuery | False | False | True | False | -| Google Sheets | True | False | False | False | -| IBM Db2 | False | False | False | False | -| IBM Db2 for i | False | False | False | False | -| IBM Netezza Performance Server | False | False | False | False | -| KustoKQL | False | False | False | False | -| KustoSQL | False | False | False | False | -| MariaDB | False | False | False | False | -| Microsoft SQL Server | False | False | False | False | -| MotherDuck | False | False | False | False | -| MySQL | False | False | False | False | -| OceanBase | False | False | False | False | -| Ocient | False | False | False | False | -| Oracle | False | False | False | False | -| Parseable | False | False | False | False | -| PostgreSQL | False | False | False | False | -| Presto | True | True | True | True | -| RisingWave | False | False | True | False | -| SAP HANA | False | False | False | False | -| SQLite | False | False | False | False | -| Shillelagh | False | False | False | False | -| SingleStore | False | False | False | False | -| Snowflake | False | False | False | False | -| StarRocks | True | False | False | False | -| Superset meta database | False | False | False | False | -| TDengine | False | False | False | False | -| Teradata | False | False | False | False | -| Trino | True | False | True | False | -| Vertica | False | False | False | False | -| YDB | False | False | False | False | -| base | False | False | True | False | - -## Database information - -A DB engine spec has attributes that describe the underlying database engine, so that Superset can know how to build and run queries. For example, some databases don't support subqueries, which are needed for some of the queries produced by Superset for certain charts. When a database doesn't support subqueries the query is run in two-steps, using the results from the first query to build the second query. - -These attributes and their default values (set in the base class, `BaseEngineSpec`) are described below: - -### `limit_method = LimitMethod.FORCE_LIMIT` - -When running user queries in SQL Lab, Superset needs to limit the number of rows returned. The reason for that is cost and performance: there's no point in running a query that produces millions of rows when they can't be loaded into the browser. - -For most databases this is done by parsing the user submitted query and applying a limit, if one is not present, or replacing the existing limit if it's larger. This is called the `FORCE_LIMIT` method, and is the most efficient, since the database will produce at most the number of rows that Superset will display. - -For some databases this method might not work, and they can use the `WRAP_SQL` method, which wraps the original query in a `SELECT *` and applies a limit via the SQLAlchemy dialect, which should get translated to the correct syntax. 
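As an illustration (this is not the exact SQL any given dialect will emit), a query submitted with a row limit of 1000 under `WRAP_SQL` would end up looking roughly like this, with the user query wrapped as an inner query:

```sql
SELECT *
FROM (
  SELECT a, b FROM some_table
) AS inner_query
LIMIT 1000
```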
This method might be inefficient, since the database optimizer might not be able to push the limit to the inner query. - -Finally, as a last resort there is the `FETCH_MANY` method. When a DB engine spec uses this method the query runs unmodified, but Superset fetches only a certain number of rows from the cursor. It's possible that a database using this method can optimize the query execution and compute rows as they are being read by the cursor, but it's unlikely. This makes this method the least efficient of the three. - -Note that when Superset runs a query with a given limit, say 100, it always modifies the query to request one additional row (`LIMIT 101`, in this case). This extra row is dropped before the results are returned to the user, but it allows Superset to inform the users that the query was indeed limited. Otherwise, a query with `LIMIT 100` that returns exactly 100 rows would seem like it was limited, when in fact it was not. - -### `allows_joins = True` - -Not all databases support `JOIN`s. When building complex charts, Superset will try to join the table to itself in order to compute `top_n` groups, for example. If the database doesn't support joins, Superset will instead run a prequery, and use the results to build the final query. - -### `allows_subqueries = True` - -Similarly, not all databases support subqueries. For more complex charts Superset will build subqueries if possible, or run the query in two steps otherwise. - -### `allows_alias_in_select = True` - -Does the DB support aliases in the projection of a query, eg: - -```sql -SELECT COUNT(*) AS cnt -``` - -Superset will try to use aliases whenever possible, in order to give friendly names to expressions. - -### `allows_alias_in_orderby = True` - -Does the DB support referencing aliases in the `GROUP BY`, eg: - -```sql -SELECT - UPPER(country_of_origin) AS country, - COUNT(*) AS cnt -FROM - some_table -GROUP BY - country -``` - -Otherwise the query is written as: - -```sql -SELECT - UPPER(country_of_origin) AS country, - COUNT(*) AS cnt -FROM - some_table -GROUP BY - UPPER(country_of_origin) -``` - -### `time_groupby_inline = False` - -In theory this attribute should be used to omit time filters from the self-joins. When the attribute is false the time attribute will be present in the subquery used to compute limited series, eg: - -```sql -SELECT DATE_TRUNC('day', ts) AS ts, - team AS team, - COUNT(*) AS count -FROM public.threads -JOIN - (SELECT team AS team__, - COUNT(*) AS mme_inner__ - FROM public.threads - -- this is added when `time_groupby_inline = False` - WHERE ts >= TO_TIMESTAMP('2022-07-27 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US') - AND ts < TO_TIMESTAMP('2023-07-27 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US') - -- - GROUP BY team - ORDER BY mme_inner__ DESC - LIMIT 5) AS anon_1 ON team = team__ -WHERE ts >= TO_TIMESTAMP('2022-07-27 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US') - AND ts < TO_TIMESTAMP('2023-07-27 00:00:00.000000', 'YYYY-MM-DD HH24:MI:SS.US') -GROUP BY DATE_TRUNC('day', ts), - team -ORDER BY count DESC -LIMIT 10000; -``` - -In practice, the attribute doesn't seem to be working as of 2023-07-27. - -### `allows_alias_to_source_column = True` - -When this is true the database allows queries where an alias can overshadow an existing column name.
For example, in this query: - -```sql -SELECT - foo + 1 AS foo -FROM - some_table -ORDER BY - foo -- references the alias `foo + 1`, not the column `foo` -``` - -### `allows_hidden_orderby_agg = True` - -If set to true, the database allows expressions in the `GROUP BY` that are not present in the projection (`SELECT`), eg: - -```sql -SELECT - country, - COUNT(*) -FROM - some_table -GROUP BY - country -ORDER BY - SUM(population) -- not present in the `SELECT` -``` - -### `allows_hidden_cc_in_orderby = False` - -This is the opposite of `allows_alias_in_orderby`, for databases that require aliases in the `ORDER BY`. For example, BigQuery doesn't like this query: - -```sql -SELECT - CASE - WHEN type = 'feature' THEN 'f' - WHEN type = 'bug' THEN 'b' - ELSE 'o' - END AS cc_type -FROM - some_table -GROUP BY - cc_type -ORDER BY - CASE - WHEN type = 'feature' THEN 'f' - WHEN type = 'bug' THEN 'b' - ELSE 'o' - END -``` - -Instead, it must be written as: - -```sql -SELECT - CASE - WHEN type = 'feature' THEN 'f' - WHEN type = 'bug' THEN 'b' - ELSE 'o' - END AS cc_type -FROM - some_table -GROUP BY - cc_type -ORDER BY - cc_type -``` - -### `allows_cte_in_subquery = True` - -When a virtual dataset is used in a chart the original query is converted into a subquery, and is wrapped in an outer query that is generated based on the chart controls. The virtual dataset query might have a CTE, and some databases don't like subqueries with CTEs in them. - -When this attribute is false, Superset will extract the CTE and move it outside of the subquery when generating SQL for charts. The name of the new CTE will be `cte_alias`, also defined in the DB engine spec. - -### `allow_limit_clause = True` - -Allows for the `LIMIT` clause. Otherwise, the database probably uses `TOP` to limit rows. - -### `max_column_name_length: int | None = None` - -Most databases have a well-defined limit for the maximum length of a column name (SQLite is probably the one exception). While this can be set to (and defaults to) `None`, it's highly recommended to set a value to prevent errors. - -### `allows_sql_comments = True` - -Are comments supported in the DB? In general, SQL comments are defined by double dashes: - -```sql --- this is a comment -SELECT * -- we need everything -FROM some_table -``` - -### `allows_escaped_colons = True` - -SQLAlchemy recommends escaping colons to prevent them from being interpreted as bindings to parameters. Because of this, when building queries from virtual datasets Superset will escape all colons with `\:`. - -This works for most databases except Athena. The `allows_escaped_colons` attribute specifies whether the database supports escaped colons. - -## Basic features - -These are features that all DB engine specs should support, as the name suggests. They provide a much better experience for users. - -### Time grains - -The most basic feature that DB engine specs need to support is defining time grain expressions. These are dialect-specific SQL expressions that are used to compute metrics on a given time grain when building charts. For example, when computing the metric `COUNT(*)` on a daily basis, Superset will generate the following query: - -```sql -SELECT - <time grain expression>, - COUNT(*) -... -GROUP BY - <time grain expression> -``` - -For some databases with support for `DATE_TRUNC` or `TIME_FLOOR` this is easy. Here's how Apache Druid computes 15-minute aggregations: - -```sql -TIME_FLOOR(CAST({col} AS TIMESTAMP), 'PT15M') -``` - -Where `{col}` is the time column being aggregated — the expression is actually a Jinja2 template.
Druid uses the ISO standard for durations, with `PT15M` representing 15 minutes. - -On the other hand, here's the same for SQLite: - -```sql -DATETIME( - STRFTIME( - '%Y-%m-%dT%H:%M:00', - {col} - ), - printf( - '-%d minutes', - CAST(strftime('%M', {col}) AS INT) % 15 - ) -) -``` - -The SQLite version has to truncate the column down to the minute, and then subtract a number of minutes equal to the minute value modulo 15. - -Time grain expressions are defined in the `_time_grain_expressions` class attribute, which maps from a `superset.constants.TimeGrain` to the SQL expression. The dictionary has a special key `None` that should map to the column directly, for when no time grain is specified. - -Note that it's possible to add new time grains via configuration. For example, if you want to add a "2 seconds" time grain to your installation, you can add it to `TIME_GRAIN_ADDONS`, and implement it in `TIME_GRAIN_ADDON_EXPRESSIONS`: - -```python -# superset_config.py -TIME_GRAIN_ADDONS = {"PT2S": "2 second"} - -TIME_GRAIN_ADDON_EXPRESSIONS = { - "clickhouse": { - "PT2S": "toDateTime(intDiv(toUInt32(toDateTime({col})), 2)*2)", - } -} -``` - -### Column type mapping - -Column type mapping, defined in the `column_type_mappings` class attribute, is just a way of mapping type names from the database to types Superset understands. The default values in `BaseEngineSpec` are sane: - -```python -_default_column_type_mappings: tuple[ColumnTypeMapping, ...] = ( - ( - re.compile(r"^string", re.IGNORECASE), - types.String(), - GenericDataType.STRING, - ), - ( - re.compile(r"^float", re.IGNORECASE), - types.Float(), - GenericDataType.NUMERIC, - ), - ( - re.compile(r"^date", re.IGNORECASE), - types.Date(), - GenericDataType.TEMPORAL, - ), - ( - re.compile(r"^bool(ean)?", re.IGNORECASE), - types.Boolean(), - GenericDataType.BOOLEAN, - ), - ... -) -``` - -But you might want to implement more specific types in the DB engine spec, or complex types. For example, for MSSQL we have: - -```python -from sqlalchemy.dialects.mssql.base import SMALLDATETIME - -class MssqlEngineSpec(BaseEngineSpec): - ... - column_type_mappings = ( - ( - re.compile(r"^smalldatetime.*", re.IGNORECASE), - SMALLDATETIME(), - GenericDataType.TEMPORAL, - ), - ) -``` - -### Function names - -DB engine specs should implement a class method called `get_function_names` that returns a list of strings, representing all the function names that the database supports. This is used for autocomplete in SQL Lab.
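As a rough sketch of what such a method could look like (this is not taken from any particular engine spec; the `SHOW FUNCTIONS` statement and the `function_name` column are hypothetical and vary per database):

```python
from superset.db_engine_specs.base import BaseEngineSpec
from superset.models.core import Database


class ExampleEngineSpec(BaseEngineSpec):
    @classmethod
    def get_function_names(cls, database: Database) -> list[str]:
        # Hypothetical: ask the database for its built-in functions and return
        # their names so SQL Lab can offer them in autocomplete.
        df = database.get_df("SHOW FUNCTIONS")
        return sorted(df["function_name"].tolist())
```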
- -### Masked encrypted extra - -Superset does a good job of keeping credentials secure. When you add a database with a password, for example: - -```text -postgresql://admin:password123@db.example.org:5432/db -``` - -The password is sent over the network only when the database is created. When you edit the database later, Superset will return this as the SQLAlchemy URI: - -```text -postgresql://admin:XXXXXXXXXX@db.example.org:5432/db -``` - -The password will be masked in the API response; it's not just masked in the browser UI. This is done in order to avoid sending the password unnecessarily over the network. Also, if a non-admin user has access to the API response, they won't be able to know the database password. - -When the database is edited, the Superset backend is smart enough to replace the masked password with the actual password, unless the password has changed. That is, if you change the database in the URI from `db` to `db2`, the SQLAlchemy URI will be stored in the backend as: - -```text -postgresql://admin:password123@db.example.org:5432/db2 -``` - -The password is not the only piece of information where security is critical. For many databases (like BigQuery), sensitive information is stored in the credentials JSON payload. For example: - -```json -{ - "type": "service_account", - "project_id": "dbt-tutorial-347100", - "private_key_id": "4bc71f06990c864a590fad8b94be6a5904fc171f", - "private_key": "", - "client_email": "dbt-user-278@dbt-tutorial-347100.iam.gserviceaccount.com", - "client_id": "115666988796889519425", - "auth_uri": "https://accounts.google.com/o/oauth2/auth", - "token_uri": "https://oauth2.googleapis.com/token", - "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", - "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/dbt-user-278%40dbt-tutorial-347100.iam.gserviceaccount.com" -} -``` - -As with the password, we don't want to send `private_key` to the client when a database is edited; the Superset API should never return its actual contents. Instead, Superset should return a masked value, and users should be able to edit the JSON without having to type in the `private_key` on every edit. - -To do this, DB engine specs can implement two methods, `mask_encrypted_extra` and `unmask_encrypted_extra`. They have these names because the credentials are stored in an encrypted column called `encrypted_extra`. Here's what these methods look like for BigQuery: - -```python -from superset.constants import PASSWORD_MASK - - -class BigQueryEngineSpec(BaseEngineSpec): - - @classmethod - def mask_encrypted_extra(cls, encrypted_extra: str | None) -> str | None: - if encrypted_extra is None: - return encrypted_extra - - try: - config = json.loads(encrypted_extra) - except (json.JSONDecodeError, TypeError): - return encrypted_extra - - try: - config["credentials_info"]["private_key"] = PASSWORD_MASK - except KeyError: - pass - - return json.dumps(config) - - @classmethod - def unmask_encrypted_extra( - cls, - old: str | None, - new: str | None - ) -> str | None: - if old is None or new is None: - return new - - try: - old_config = json.loads(old) - new_config = json.loads(new) - except (TypeError, json.JSONDecodeError): - return new - - if "credentials_info" not in new_config: - return new - - if "private_key" not in new_config["credentials_info"]: - return new - - if new_config["credentials_info"]["private_key"] == PASSWORD_MASK: - new_config["credentials_info"]["private_key"] = old_config[ - "credentials_info" - ]["private_key"] - - return json.dumps(new_config) -``` - -This way, when a user edits an existing BigQuery connection, the `private_key` is shown as `XXXXXXXXXX`. Everything else in the JSON is still displayed, and the user can change any of the fields without having to provide the private key. - -Note that while this is a basic feature that should be implemented for security reasons, it only makes sense in DB engine specs that use `encrypted_extra` to store connection information. - -## Nice to have features - -The next set of features are nice to have. They don't apply to all databases, and are not strictly needed for security or usability. - -### User impersonation - -In general there's no user-level granularity when accessing a database in Superset. A single database connection is shared by all users who have access to that database.
There are many use cases when this is not desirable, and some databases implement mechanisms in which they can **impersonate users**, potentially reducing the scope of permissions available to run the query. - -For example, the Google Sheets DB engine spec implements this via the `get_url_for_impersonation` class method: - -```python -class GSheetsEngineSpec(ShillelaghEngineSpec): - - @classmethod - def get_url_for_impersonation( - cls, - url: URL, - impersonate_user: bool, - username: str | None, - access_token: str | None, - ) -> URL: - if impersonate_user and username is not None: - user = security_manager.find_user(username=username) - if user and user.email: - url = url.update_query_dict({"subject": user.email}) - - return url -``` - -The method `get_url_for_impersonation` updates the SQLAlchemy URI before every query. In this particular case, it will fetch the user's email and add it to the `subject` query argument. The driver will then lower the permissions to match that given user. This allows the connection to be configured with a service account that has access to all the spreadsheets, while giving users access to only the spreadsheets they own or that have been shared with them (or with their organization — Google will handle the authorization in this case, not Superset). - -Alternatively, it's also possible to impersonate users by implementing the `update_impersonation_config` method. This is a class method which modifies `connect_args` in place. You can use either method, and ideally they [should be consolidated in a single one](https://github.com/apache/superset/issues/24910). - -### OAuth2 - -Support for authenticating to a database using personal OAuth2 access tokens was introduced in [SIP-85](https://github.com/apache/superset/issues/20300). The Google Sheets DB engine spec is the reference implementation. - -Note that this API is still experimental and evolving quickly, subject to breaking changes. Currently, to add support for OAuth2 to a DB engine spec, the following attributes are needed: - -```python -class BaseEngineSpec: - - supports_oauth2 = True - oauth2_exception = OAuth2RedirectError - - oauth2_scope = " ".join([ - "https://example.org/scope1", - "https://example.org/scope2", - ]) - oauth2_authorization_request_uri = "https://example.org/authorize" - oauth2_token_request_uri = "https://example.org/token" -``` - -The `oauth2_exception` is an exception that is raised by `cursor.execute` when OAuth2 is needed. This will start the OAuth2 dance when `BaseEngineSpec.execute` is called, by returning the custom error `OAUTH2_REDIRECT` to the frontend. If the database driver doesn't have a specific exception, it might be necessary to overload the `execute` method in the DB engine spec, so that the `BaseEngineSpec.start_oauth2_dance` method gets called whenever OAuth2 is needed. - -The DB engine spec should implement logic in either `get_url_for_impersonation` or `update_impersonation_config` to update the connection with the personal access token. See the Google Sheets DB engine spec for a reference implementation. - -Currently OAuth2 needs to be configured at the DB engine spec level, ie, with one client for each DB engine spec.
The configuration lives in `superset_config.py`: - -```python -# superset_config.py -DATABASE_OAUTH2_CLIENTS = { - "Google Sheets": { - "id": "XXX.apps.googleusercontent.com", - "secret": "GOCSPX-YYY", - "scope": " ".join( - [ - "https://www.googleapis.com/auth/drive.readonly", - "https://www.googleapis.com/auth/spreadsheets", - "https://spreadsheets.google.com/feeds", - ], - ), - "authorization_request_uri": "https://accounts.google.com/o/oauth2/v2/auth", - "token_request_uri": "https://oauth2.googleapis.com/token", - }, -} -DATABASE_OAUTH2_JWT_ALGORITHM = "HS256" -DATABASE_OAUTH2_REDIRECT_URI = "http://localhost:8088/api/v1/database/oauth2/" -DATABASE_OAUTH2_TIMEOUT = timedelta(seconds=30) -``` - -When configuring a client, only the ID and secret are required; the DB engine spec should have default values for the scope and endpoints. The `DATABASE_OAUTH2_REDIRECT_URI` attribute is optional, and defaults to `/api/v1/database/oauth2/` in Superset. - -In the future we plan to support adding custom clients via the Superset UI, and being able to manually assign clients to specific databases. - -### File upload - -When a DB engine spec supports file upload, it declares so via the `supports_file_upload` class attribute. The base class implementation is very generic and should work for any database that has support for `CREATE TABLE`. It leverages Pandas and the [`df_to_sql`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html) method. - -For some databases the `df_to_sql` classmethod needs to be implemented. For example, for BigQuery the DB engine spec implements a custom method that uses the [`to_gbq`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_gbq.html) method. - -### Extra table metadata - -DB engine specs can return additional metadata associated with a table. This is done via the `get_extra_table_metadata` class method. Trino uses this to return information about the latest partition, for example, and BigQuery returns clustering information. This information is then surfaced in the SQL Lab UI, when browsing tables in the metadata explorer (on the left panel). - -### DB API exception mapping - -Different DB API 2.0 drivers implement different exceptions, even if they have the same name. The `get_dbapi_exception_mapping` class method returns a dictionary mapping these custom exceptions to Superset exceptions, so that Superset can return more specific errors when an exception is raised by the underlying driver. - -For example, for ClickHouse we have: - -```python -from urllib3.exceptions import NewConnectionError - -from superset.db_engine_specs.exceptions import SupersetDBAPIDatabaseError - - -class ClickHouseEngineSpec(ClickHouseBaseEngineSpec): - - @classmethod - def get_dbapi_exception_mapping(cls) -> dict[type[Exception], type[Exception]]: - return {NewConnectionError: SupersetDBAPIDatabaseError} -``` - -This way, if the ClickHouse driver raises a `NewConnectionError` it would get wrapped in a `SupersetDBAPIDatabaseError`. - -### Custom errors - -Queries can fail in many different ways. For example, in SQLite: - -```sql -sqlite> CREATE TABLE a (b INT); -sqlite> SELECT c FROM a; -Error: no such column: c -sqlite> -``` - -When a query fails, Superset will return the message, "Error: no such column: c", to the user as a generic error. - -Since ideally we want to return specific and actionable error messages, DB engine specs can implement methods that map error messages to more specific errors.
For example, the SQLite DB engine spec defines: - -```python -COLUMN_DOES_NOT_EXIST_REGEX = re.compile("no such column: (?P<column_name>.+)") - - -class SqliteEngineSpec(BaseEngineSpec): - - custom_errors: dict[Pattern[str], tuple[str, SupersetErrorType, dict[str, Any]]] = { - COLUMN_DOES_NOT_EXIST_REGEX: ( - __('We can\'t seem to resolve the column "%(column_name)s"'), - SupersetErrorType.COLUMN_DOES_NOT_EXIST_ERROR, - {}, - ), - } -``` - -This way, when a user selects a column that doesn't exist, Superset can return a more informative error. - -### Dynamic schema - -In SQL Lab it's possible to select a database, and then a schema in that database. Ideally, when running a query in SQL Lab, any unqualified table names (eg, `table`, instead of `schema.table`) should be in the selected schema. For example, if the user selects `dev` as the schema and then runs the following query: - -```sql -SELECT * FROM my_table -``` - -The table `my_table` should live in the `dev` schema. In order to do that, it's necessary to modify the SQLAlchemy URI before running the query. Since different databases have different ways of doing that, this functionality is implemented via the `adjust_engine_params` class method. The method receives the SQLAlchemy URI and `connect_args`, as well as the schema in which the query should run. It then returns a potentially modified URI and `connect_args` to ensure that the query runs in the specified schema. - -When a DB engine spec implements `adjust_engine_params` it should have the class attribute `supports_dynamic_schema` set to true. This is critical for security, since **it allows Superset to know to which schema any unqualified table names belong**. For example, in the query above, if the database supports dynamic schema, Superset would check to see if the user running the query has access to `dev.my_table`. On the other hand, if the database doesn't support dynamic schema, Superset would use the default database schema instead of `dev`. - -Implementing this method is also important for usability. When the method is not implemented, selecting the schema in SQL Lab has no effect on the schema in which the query runs, resulting in confusing results when using unqualified table names. - -### Catalog - -In general, databases support a hierarchy of one-to-many concepts: - -1. Database -2. Catalog -3. Namespace -4. Table -5. Column - -These concepts have different names depending on the database. For example, Postgres uses the following terminology: - -1. Cluster (database) -2. Database (catalog) -3. Schema (namespace) -4. Table -5. Column - -BigQuery, on the other hand: - -1. BigQuery (database) -2. Project (catalog) -3. Schema (namespace) -4. Table -5. Column - -Hive and Trino: - -1. Database -2. Catalog -3. Schema -4. Table -5. Column - -If the database supports catalogs, then the DB engine spec should have the `supports_catalog` class attribute set to true. It should also implement the `get_default_catalog` method, so that the proper permissions can be created when datasets are added. - -### Dynamic catalog - -Superset supports multiple catalogs. Since, in general, a given SQLAlchemy URI connects only to a single catalog, this requires DB engine specs to implement the `adjust_engine_params` method to rewrite the URL to connect to a different catalog, similar to how dynamic schemas work. Additionally, DB engine specs should also implement the `get_catalog_names` method, so that users can browse the available catalogs.
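As a hedged sketch of how these two pieces fit together for a PostgreSQL-like database, where the catalog is stored in the URI's database field (the exact signatures and the catalog-listing query are simplified and may not match any real engine spec):

```python
from typing import Any

from sqlalchemy import text
from sqlalchemy.engine.url import URL

from superset.db_engine_specs.base import BaseEngineSpec


class ExampleEngineSpec(BaseEngineSpec):
    supports_catalog = True
    supports_dynamic_catalog = True

    @classmethod
    def adjust_engine_params(
        cls,
        uri: URL,
        connect_args: dict[str, Any],
        catalog: str | None = None,
        schema: str | None = None,
    ) -> tuple[URL, dict[str, Any]]:
        # Point the SQLAlchemy URI at the requested catalog; the schema would be
        # handled similarly for engines that encode it in the URI or connect_args.
        if catalog:
            uri = uri.set(database=catalog)
        return uri, connect_args

    @classmethod
    def get_catalog_names(cls, database, inspector) -> set[str]:
        # Hypothetical catalog-listing query; the actual SQL is dialect specific.
        return {
            row[0]
            for row in inspector.bind.execute(text("SELECT datname FROM pg_database"))
        }
```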
- -### SSH tunneling - -Superset can connect to databases via an SSH tunnel. For databases where this doesn't make sense (eg, SQLite or BigQuery), the DB engine spec should have `disable_ssh_tunneling` set to true. - -### Query cancelation - -Superset will try to cancel running queries if the user requests it, but it's up to the DB engine spec to handle this. - -Some databases have an implicit query cancelation. When a cursor stops being polled, the database will cancel the query. For databases that behave like this, the class method `has_implicit_cancel` (which should really be a class attribute) should return true. - -For other databases, DB engine specs can implement query cancelation via the `prepare_cancel_query` and `cancel_query` methods. Implementation of query cancelation is usually heavily dependent on the database, but the DB engine specs that support it can serve as an example. - -### Get metrics on dataset creation - -When a physical dataset is first created, the `get_metrics` class method is called on the table. The base implementation returns the `COUNT(*)` metric, but DB engine specs can override `get_metrics` to return other metrics. This method is useful for semantic layers that contain their own metrics definitions; when Superset connects to them, it can automatically create those metrics when a dataset is added. - -This feature is still experimental, and ideally there would be a mechanism for calling it periodically or when a dataset is explored, in order to sync new metric definitions to the dataset. - -### `WHERE` on latest partition - -In some databases, running `SELECT *` can be a **very expensive** operation, since the query might scan all partitions for a given table. Because of that, some DB engine specs implement the `where_latest_partition` method, which returns a modified SQLAlchemy query with an additional predicate that filters on the latest partition. - -## Advanced features - -### Expand complex types - -Some databases will visually expand complex types (arrays and structures) when displaying results from queries. For example, the BigQuery UI is able to expand objects into columns and arrays into rows, so that this: - -| array | struct | -| --------- | ---------------- | -| [1, 2, 3] | `{a: one, b: two}` | - -Is shown as: - -| array | struct | struct.a | struct.b | -| ----- | ---------------- | -------- | -------- | -| 1 | `{a: one, b: two}` | one | two | -| 2 | | | | -| 3 | | | | - -A similar behavior has been implemented in Superset for Presto, and can be enabled via the `PRESTO_EXPAND_DATA` feature flag. To implement this feature, a DB engine spec should implement the `expand_data` method, which takes the columns and rows and returns modified columns and rows. - -Note that despite being implemented only for Presto, nothing about this behavior is Presto-specific, and in theory it could be implemented in a generic way for all databases without requiring custom DB engine spec implementations (that is, the Presto `expand_data` method could be moved to the base class, after being cleaned up, and we could then enable the feature per DB in the configuration). - -### Query cost estimation - -Some databases allow users to estimate the cost of running a query before running it. This is done via the `estimate_query_cost` method in DB engine specs, which receives the SQL and returns a list of "costs". The definition of what "cost" is varies from database to database (in the few that support this functionality), and it can be formatted via the `query_cost_formatter`.
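For example, a deployment could plug in a formatter along these lines, which turns each raw cost entry into human-readable labels for SQL Lab (the `rows` and `bytes` keys are made up here; the actual keys depend entirely on the database):

```python
from typing import Any


def humanize_cost(raw_cost: list[dict[str, Any]]) -> list[dict[str, str]]:
    """Turn raw cost estimates into labels suitable for display."""
    return [
        {
            "Estimated rows": f"{entry.get('rows', 0):,}",
            "Estimated bytes": f"{entry.get('bytes', 0):,}",
        }
        for entry in raw_cost
    ]
```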
- -The `query_cost_formatter` can be overridden with an arbitrary function via the config `QUERY_COST_FORMATTERS_BY_ENGINE`. This allows custom deployments of Superset to format the results in different ways. For example, at some point in Lyft the cost for running Presto queries would also show the carbon footprint (in trees). - -### SQL validation - -A few databases support validating the syntax of the SQL as the user is typing it, indicating in SQL Lab any errors. This is usually done using an `EXPLAIN` query and, because it gets called every few seconds as the user types, it's important that the database returns the result quickly. - -This is currently implement for Presto and Postgres, via custom classes in `superset/sql_validators` that should be enabled in the configuration. Implementing this as custom classes, instead of a `validate_sql` method in the DB engine spec offers no advantages, and ideally in the future we should move the logic to DB engine specs. - -## Testing DB engine specs - -Superset has a command to test the connection to a given database, as well as checking if the SQLAlchemy dialect implements all necessary methods used by Superset, and checking which features are supported by the DB engine spec (if one exists). To run the tool just call the `test-db` command with the SQLAlchemy URI to be tested: - -```bash -superset test-db sqlite:// -``` - -If the connection needs additional arguments they can be passed when the command runs. diff --git a/docs/src/components/DatabaseTable.jsx b/docs/src/components/DatabaseTable.jsx new file mode 100644 index 000000000000..8b217f9e4ad6 --- /dev/null +++ b/docs/src/components/DatabaseTable.jsx @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +import React, { useState, useMemo } from 'react'; +import styled from '@emotion/styled'; +import { Input } from 'antd'; + + + +const StyledDbTable = styled('div')` + .grid-item { + border: 1px solid #e0e0e0; + padding-top: 12px; + padding-bottom: 12px; + padding-left: 12px; + padding-right: 14px; + height: 100%; + } + + .grid-item h3 { + margin-top: 0; + } +`; + +const DatabaseTable = ({ items }) => { + const [filter, setFilter] = useState(''); + + const header = useMemo(() => (items && items.length > 0 ? items[0] : []), [ + items, + ]); + const rows = useMemo(() => (items && items.length > 1 ? items.slice(1) : []), [ + items, + ]); + + const filteredItems = useMemo( + () => + rows.filter( + row => + row[0] && row[0].toLowerCase().includes(filter.toLowerCase()), + ), + [rows, filter], + ); + + return ( + + setFilter(e.target.value)} + style={{ marginBottom: '5px' }} + /> + + + + {header.map((col, index) => ( + + ))} + + + + {filteredItems.map((row, rowIndex) => ( + + {row.map((cell, cellIndex) => ( + + ))} + + ))} + +
+ {col} +
+ {cell} +
+
+ ); +}; + +export default DatabaseTable; diff --git a/superset/db_engine_specs/docs_lib.py b/superset/db_engine_specs/docs_lib.py new file mode 100644 index 000000000000..4e852b9a4444 --- /dev/null +++ b/superset/db_engine_specs/docs_lib.py @@ -0,0 +1,689 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from __future__ import annotations + +from typing import Any + +from superset.constants import TimeGrain +from superset.db_engine_specs import load_engine_specs +from superset.db_engine_specs.base import BaseEngineSpec + +LIMIT_METHODS = { + "FORCE_LIMIT": ( + "modifies the query, replacing an existing LIMIT or adding a new one" + ), # E: line too long (89 > 79 characters) + "WRAP_SQL": "wraps the original query in a SELECT * with a LIMIT", + "FETCH_MANY": ( + "runs the query unmodified but fetchs only LIMIT rows from the cursor" + ), # E: line too long (89 > 79 characters) +} + +DATABASE_DETAILS = { + "limit_method": "Method used to limit the rows in the subquery", + "joins": "Supports JOINs", + "subqueries": "Supports subqueries", + "alias_in_select": "Allows aliases in the SELECT statement", + "alias_in_orderby": "Allows referencing aliases in the ORDER BY statement", + "time_groupby_inline": ( + "Allows omitting time filters from inline GROUP BYs" + ), # E: line too long (80 > 79 characters) + "alias_to_source_column": ( + "Able to use source column when an alias overshadows it" + ), # E: line too long (87 > 79 characters) + "order_by_not_in_select": ( + "Allows aggregations in ORDER BY not present in the SELECT" + ), # E: line too long (90 > 79 characters) + "expressions_in_orderby": "Allows expressions in ORDER BY", + "cte_in_subquery": "Allows CTE as a subquery", + "limit_clause": "Allows LIMIT clause (instead of TOP)", + "max_column_name": "Maximum column name", + "sql_comments": "Allows comments", + "escaped_colons": "Colons must be escaped", +} +BASIC_FEATURES = { + "masked_encrypted_extra": "Masks/unmasks encrypted_extra", + "column_type_mapping": "Has column type mappings", + "function_names": "Returns a list of function names", +} +NICE_TO_HAVE_FEATURES = { + "user_impersonation": "Supports user impersonation", + "file_upload": "Support file upload", + "get_extra_table_metadata": "Returns extra table metadata", + "dbapi_exception_mapping": "Maps driver exceptions to Superset exceptions", + "custom_errors": "Parses error messages and returns Superset errors", + "dynamic_schema": "Supports changing the schema per-query", + "catalog": "Supports catalogs", + "dynamic_catalog": "Supports changing the catalog per-query", + "ssh_tunneling": "Can be connected thru an SSH tunnel", + "query_cancelation": "Allows query to be canceled", + "get_metrics": "Returns additional metrics on dataset creation", + "where_latest_partition": "Supports querying 
the latest partition only", +} +ADVANCED_FEATURES = { + "expand_data": "Expands complex types (arrays, structs) into rows/columns", + "query_cost_estimation": "Supports query cost estimation", + "sql_validation": "Supports validating SQL before running query", +} + + +def has_custom_method(spec: type[BaseEngineSpec], method: str) -> bool: + """ + Check if a class has a custom implementation of a method. + + Since some classes don't inherit directly from ``BaseEngineSpec`` we need + to check the attributes of the spec and the base class. + """ + return bool( + getattr(spec, method, False) + and getattr(BaseEngineSpec, method, False) + and getattr(spec, method).__qualname__ + != getattr(BaseEngineSpec, method).__qualname__ + ) + + +def diagnose(spec: type[BaseEngineSpec]) -> dict[str, Any]: + """ + Run basic diagnostics on a given DB engine spec. + """ + # pylint: disable=import-outside-toplevel + from superset.sql_validators.postgres import PostgreSQLValidator + from superset.sql_validators.presto_db import PrestoDBSQLValidator + + sql_validators = { + "presto": PrestoDBSQLValidator, + "postgresql": PostgreSQLValidator, + } + + output: dict[str, Any] = {} + + output["time_grains"] = {} + supported_time_grains = spec.get_time_grain_expressions() + for time_grain in TimeGrain: + output["time_grains"][time_grain.name] = time_grain in supported_time_grains + + output.update( + { + "module": spec.__module__, + "limit_method": spec.limit_method.value, + "limit_clause": getattr(spec, "allow_limit_clause", True), + "joins": spec.allows_joins, + "subqueries": spec.allows_subqueries, + "alias_in_select": spec.allows_alias_in_select, + "alias_in_orderby": spec.allows_alias_in_orderby, + "time_groupby_inline": spec.time_groupby_inline, + "alias_to_source_column": not spec.allows_alias_to_source_column, + "order_by_not_in_select": spec.allows_hidden_orderby_agg, + "expressions_in_orderby": spec.allows_hidden_cc_in_orderby, + "cte_in_subquery": spec.allows_cte_in_subquery, + "max_column_name": spec.max_column_name_length, + "sql_comments": spec.allows_sql_comments, + "escaped_colons": spec.allows_escaped_colons, + "masked_encrypted_extra": has_custom_method(spec, "mask_encrypted_extra"), + "column_type_mapping": bool(spec.column_type_mappings), + "function_names": has_custom_method(spec, "get_function_names"), + # there are multiple ways of implementing user impersonation + "user_impersonation": ( + has_custom_method(spec, "update_impersonation_config") + or has_custom_method(spec, "get_url_for_impersonation") + or has_custom_method(spec, "impersonate_user") + ), + "file_upload": spec.supports_file_upload, + "get_extra_table_metadata": has_custom_method( + spec, "get_extra_table_metadata" + ), + "dbapi_exception_mapping": has_custom_method( + spec, "get_dbapi_exception_mapping" + ), + "custom_errors": ( + has_custom_method(spec, "extract_errors") + or has_custom_method(spec, "custom_errors") + ), + "dynamic_schema": spec.supports_dynamic_schema, + "catalog": spec.supports_catalog, + "dynamic_catalog": spec.supports_dynamic_catalog, + "ssh_tunneling": not spec.disable_ssh_tunneling, + "query_cancelation": ( + has_custom_method(spec, "cancel_query") or spec.has_implicit_cancel() + ), + "get_metrics": has_custom_method(spec, "get_metrics"), + "where_latest_partition": has_custom_method(spec, "where_latest_partition"), + "expand_data": has_custom_method(spec, "expand_data"), + "query_cost_estimation": has_custom_method(spec, "estimate_query_cost") + or has_custom_method(spec, "estimate_statement_cost"), + # 
SQL validation is implemented in external classes + "sql_validation": spec.engine in sql_validators, + }, + ) + + # compute score + score = 0 + + # each time grain is 1 point + score += sum(output["time_grains"][time_grain.name] for time_grain in TimeGrain) + + basic = ["masked_encrypted_extra", "column_type_mapping", "function_names"] + nice_to_have = [ + "user_impersonation", + "file_upload", + "get_extra_table_metadata", + "dbapi_exception_mapping", + "custom_errors", + "dynamic_schema", + "catalog", + "dynamic_catalog", + "ssh_tunneling", + "query_cancelation", + "get_metrics", + "where_latest_partition", + ] + advanced = ["expand_data", "query_cost_estimation", "sql_validation"] + score += sum(10 * int(output[key]) for key in basic) + score += sum(10 * int(output[key]) for key in nice_to_have) + score += sum(10 * int(output[key]) for key in advanced) + output["score"] = score + output["max_score"] = ( + len(TimeGrain) + 10 * len(basic) + 10 * len(nice_to_have) + 10 * len(advanced) + ) + + return output + + +def get_name(spec: type[BaseEngineSpec]) -> str: + """ + Return a name for a given DB engine spec. + """ + return spec.engine_name or spec.engine + + +def format_markdown_table(headers: list[str], rows: list[list[Any]]) -> str: + """ + Format headers and rows into a markdown table. + """ + lines = [] + lines.append("| " + " | ".join(headers) + " |") + lines.append("| " + " | ".join(["---"] * len(headers)) + " |") + for row in rows: + lines.append("| " + " | ".join(str(col) for col in row) + " |") + return "\n".join(lines) + + +def generate_focused_table( + info: dict[str, dict[str, Any]], + feature_keys: list[str], + column_labels: list[str], + filter_fn: Any = None, + value_extractor: Any = None, + preserve_order: bool = False, +) -> tuple[list[list[Any]], list[str]]: + """ + Generate a focused table as a 2D list with databases as rows. + + Args: + info: Dictionary mapping database names to their feature info + feature_keys: List of feature keys to extract from db_info + column_labels: List of column header labels + filter_fn: Optional function to filter databases (receives db_info dict) + value_extractor: Optional function to extract value (receives db_info, key) + + Returns: + Tuple of (2D list representing the table, list of excluded database names) + """ + # Filter databases if filter function provided + filtered_info = {} + excluded_dbs = [] + + for db_name, db_info in info.items(): + if filter_fn is None or filter_fn(db_info): + filtered_info[db_name] = db_info + else: + excluded_dbs.append(db_name) + + if not filtered_info: + return [], excluded_dbs + + # Build headers: Database + feature columns + headers = ["Database"] + column_labels + + # Build rows + rows = [] + # Sort by database name unless preserve_order is True + db_names = ( + list(filtered_info.keys()) if preserve_order else sorted(filtered_info.keys()) + ) + + for db_name in db_names: + db_info = filtered_info[db_name] + row = [db_name] + + for key in feature_keys: + if value_extractor: + value = value_extractor(db_info, key) + else: + value = db_info.get(key, "") + row.append(value) + + rows.append(row) + + return [headers] + rows, excluded_dbs + + +def calculate_support_level(db_info: dict[str, Any], feature_keys: list[str]) -> str: + """ + Calculate support level for a group of features. 
+
+    Returns: "Supported", "Partial", or "Not supported"
+    """
+    if not feature_keys:
+        return "Not supported"
+
+    # Handle time grain features specially
+    if all(k.startswith("time_grains.") for k in feature_keys):
+        grain_keys = [k.split(".", 1)[1] for k in feature_keys]
+        supported = sum(
+            1 for grain in grain_keys if db_info["time_grains"].get(grain, False)
+        )
+    else:
+        supported = sum(1 for k in feature_keys if db_info.get(k, False))
+
+    total = len(feature_keys)
+    if supported == 0:
+        return "Not supported"
+    elif supported == total:
+        return "Supported"
+    else:
+        return "Partial"
+
+
+def generate_feature_tables() -> list[list[list[Any]]]:
+    """
+    Generate multiple focused tables organized by feature categories.
+
+    Returns a list of 2D lists, where each 2D list is a table.
+    """
+    info = {}
+    for spec in sorted(load_engine_specs(), key=get_name):
+        info[get_name(spec)] = diagnose(spec)
+
+    # remove 3rd party DB engine specs
+    info = {k: v for k, v in info.items() if v["module"].startswith("superset")}
+
+    # Sort by score descending for overview table
+    sorted_info = dict(sorted(info.items(), key=lambda x: x[1]["score"], reverse=True))
+
+    output_tables = []
+
+    # Table 1: Feature Overview
+    sql_basics = [
+        "joins",
+        "subqueries",
+        "alias_in_select",
+        "alias_in_orderby",
+        "cte_in_subquery",
+    ]
+    advanced_sql = [
+        "time_groupby_inline",
+        "alias_to_source_column",
+        "order_by_not_in_select",
+        "expressions_in_orderby",
+    ]
+    common_grains = [
+        f"time_grains.{g}"
+        for g in ["SECOND", "MINUTE", "HOUR", "DAY", "WEEK", "MONTH", "QUARTER", "YEAR"]
+    ]
+    extended_grains = [
+        f"time_grains.{g}"
+        for g in [
+            "FIVE_SECONDS",
+            "THIRTY_SECONDS",
+            "FIVE_MINUTES",
+            "TEN_MINUTES",
+            "FIFTEEN_MINUTES",
+            "THIRTY_MINUTES",
+            "HALF_HOUR",
+            "SIX_HOURS",
+            "WEEK_STARTING_SUNDAY",
+            "WEEK_STARTING_MONDAY",
+            "WEEK_ENDING_SATURDAY",
+            "WEEK_ENDING_SUNDAY",
+            "QUARTER_YEAR",
+        ]
+    ]
+    integrations = [
+        "ssh_tunneling",
+        "query_cancelation",
+        "get_metrics",
+        "get_extra_table_metadata",
+        "dbapi_exception_mapping",
+        "custom_errors",
+        "dynamic_schema",
+        "where_latest_partition",
+    ]
+    advanced_features = [
+        "user_impersonation",
+        "expand_data",
+        "query_cost_estimation",
+        "sql_validation",
+    ]
+
+    headers = [
+        "Database",
+        "Score",
+        "SQL Basics",
+        "Advanced SQL",
+        "Common Time Grains",
+        "Extended Time Grains",
+        "Integrations",
+        "Advanced Features",
+    ]
+    rows = []
+    for db_name, db_info in sorted_info.items():
+        row = [
+            db_name,
+            db_info["score"],
+            calculate_support_level(db_info, sql_basics),
+            calculate_support_level(db_info, advanced_sql),
+            calculate_support_level(db_info, common_grains),
+            calculate_support_level(db_info, extended_grains),
+            calculate_support_level(db_info, integrations),
+            calculate_support_level(db_info, advanced_features),
+        ]
+        rows.append(row)
+    output_tables.append([headers] + rows)
+
+    # Table 2: Database Information
+    def extract_db_info(db_info: dict[str, Any], key: str) -> Any:
+        if key == "limit_method":
+            from superset.sql.parse import LimitMethod
+
+            return LimitMethod(db_info[key]).name
+        return db_info.get(key, "")
+
+    table, _ = generate_focused_table(
+        info,
+        feature_keys=["module", "limit_method", "limit_clause", "max_column_name"],
+        column_labels=["Module", "Limit Method", "Limit Clause", "Max Column Name"],
+        value_extractor=extract_db_info,
+    )
+    output_tables.append(table)
+
+    # Table 3: SQL Capabilities
+    table, _ = generate_focused_table(
+        info,
+        feature_keys=[
+            "joins",
+            "subqueries",
+            "alias_in_select",
+            "alias_in_orderby",
+            "cte_in_subquery",
+            "sql_comments",
+            "escaped_colons",
+            "time_groupby_inline",
+            "alias_to_source_column",
+            "order_by_not_in_select",
+            "expressions_in_orderby",
+        ],
+        column_labels=[
+            "JOINs",
+            "Subqueries",
+            "Aliases in SELECT",
+            "Aliases in ORDER BY",
+            "CTEs",
+            "Comments",
+            "Escaped Colons",
+            "Inline Time Groupby",
+            "Source Column When Aliased",
+            "Aggregations in ORDER BY",
+            "Expressions in ORDER BY",
+        ],
+    )
+    output_tables.append(table)
+
+    # Helper to extract time grain values
+    def extract_time_grain(db_info: dict[str, Any], grain_name: str) -> bool:
+        return db_info["time_grains"].get(grain_name, False)
+
+    # Table 4: Time Grains – Common
+    common_grains_keys = [
+        "SECOND",
+        "MINUTE",
+        "HOUR",
+        "DAY",
+        "WEEK",
+        "MONTH",
+        "QUARTER",
+        "YEAR",
+    ]
+    table, _ = generate_focused_table(
+        info,
+        feature_keys=common_grains_keys,
+        column_labels=common_grains_keys,
+        value_extractor=extract_time_grain,
+    )
+    output_tables.append(table)
+
+    # Table 5: Time Grains – Extended
+    extended_grains_keys = [
+        "FIVE_SECONDS",
+        "THIRTY_SECONDS",
+        "FIVE_MINUTES",
+        "TEN_MINUTES",
+        "FIFTEEN_MINUTES",
+        "THIRTY_MINUTES",
+        "HALF_HOUR",
+        "SIX_HOURS",
+        "WEEK_STARTING_SUNDAY",
+        "WEEK_STARTING_MONDAY",
+        "WEEK_ENDING_SATURDAY",
+        "WEEK_ENDING_SUNDAY",
+        "QUARTER_YEAR",
+    ]
+    table, _ = generate_focused_table(
+        info,
+        feature_keys=extended_grains_keys,
+        column_labels=extended_grains_keys,
+        value_extractor=extract_time_grain,
+    )
+    output_tables.append(table)
+
+    # Table 6: Core Platform & Metadata Features
+    table, _ = generate_focused_table(
+        info,
+        feature_keys=[
+            "masked_encrypted_extra",
+            "column_type_mapping",
+            "function_names",
+            "file_upload",
+            "dynamic_schema",
+            "catalog",
+            "dynamic_catalog",
+            "ssh_tunneling",
+            "where_latest_partition",
+            "query_cancelation",
+            "get_metrics",
+            "get_extra_table_metadata",
+            "dbapi_exception_mapping",
+            "custom_errors",
+        ],
+        column_labels=[
+            "Masked Encrypted Extra",
+            "Column Type Mappings",
+            "Function Names",
+            "File Upload",
+            "Dynamic Schema",
+            "Catalog",
+            "Dynamic Catalog",
+            "SSH Tunneling",
+            "Latest Partition",
+            "Query Cancellation",
+            "Get Metrics",
+            "Extra Table Metadata",
+            "Exception Mapping",
+            "Custom Errors",
+        ],
+    )
+    output_tables.append(table)
+
+    # Table 7: Operational & Advanced Features
+    table, _ = generate_focused_table(
+        info,
+        feature_keys=[
+            "user_impersonation",
+            "expand_data",
+            "query_cost_estimation",
+            "sql_validation",
+        ],
+        column_labels=[
+            "User Impersonation",
+            "Expand Data",
+            "Cost Estimation",
+            "SQL Validation",
+        ],
+    )
+    output_tables.append(table)
+
+    return output_tables
+
+
+def generate_table() -> list[list[Any]]:
+    """
+    Generate a table showing info for all DB engine specs.
+
+    DEPRECATED: This function is kept for backward compatibility.
+    Use generate_feature_tables() instead for better readability.
+    """
+    info = {}
+    for spec in sorted(load_engine_specs(), key=get_name):
+        info[get_name(spec)] = diagnose(spec)
+
+    # remove 3rd party DB engine specs
+    info = {k: v for k, v in info.items() if v["module"].startswith("superset")}
+
+    rows = []  # pylint: disable=redefined-outer-name
+    rows.append(["Feature"] + list(info))  # header row
+    rows.append(["Module"] + [db_info["module"] for db_info in info.values()])
+
+    # descriptive
+    keys = [
+        "limit_method",
+        "joins",
+        "subqueries",
+        "alias_in_select",
+        "alias_in_orderby",
+        "time_groupby_inline",
+        "alias_to_source_column",
+        "order_by_not_in_select",
+        "expressions_in_orderby",
+        "cte_in_subquery",
+        "limit_clause",
+        "max_column_name",
+        "sql_comments",
+        "escaped_colons",
+    ]
+    for key in keys:
+        rows.append(
+            [DATABASE_DETAILS[key]] + [db_info[key] for db_info in info.values()]
+        )
+
+    # basic
+    for time_grain in TimeGrain:
+        rows.append(
+            [f"Has time grain {time_grain.name}"]
+            + [db_info["time_grains"][time_grain.name] for db_info in info.values()]
+        )
+    keys = [
+        "masked_encrypted_extra",
+        "column_type_mapping",
+        "function_names",
+    ]
+    for key in keys:
+        rows.append([BASIC_FEATURES[key]] + [db_info[key] for db_info in info.values()])
+
+    # nice to have
+    keys = [
+        "user_impersonation",
+        "file_upload",
+        "get_extra_table_metadata",
+        "dbapi_exception_mapping",
+        "custom_errors",
+        "dynamic_schema",
+        "catalog",
+        "dynamic_catalog",
+        "ssh_tunneling",
+        "query_cancelation",
+        "get_metrics",
+        "where_latest_partition",
+    ]
+    for key in keys:
+        rows.append(
+            [NICE_TO_HAVE_FEATURES[key]] + [db_info[key] for db_info in info.values()]
+        )
+
+    # advanced
+    keys = [
+        "expand_data",
+        "query_cost_estimation",
+        "sql_validation",
+    ]
+    for key in keys:
+        rows.append(
+            [ADVANCED_FEATURES[key]] + [db_info[key] for db_info in info.values()]
+        )
+
+    rows.append(["Score"] + [db_info["score"] for db_info in info.values()])
+
+    return rows
+
+
+def format_row(row: list[Any]) -> str:
+    """
+    Format a row for printing, quoting every item to keep the output Docusaurus-compatible.
+    """
+    formatted_items = []
+    for item in row:
+        formatted_items.append(f"'{item}'")
+    return f"[{', '.join(formatted_items)}]"
+
+
+if __name__ == "__main__":
+    from superset.app import create_app
+
+    app = create_app()
+    with app.app_context():
+        tables = generate_feature_tables()
+
+        titles = [
+            "Feature Overview",
+            "Database Information",
+            "SQL Capabilities",
+            "Time Grains – Common",
+            "Time Grains – Extended",
+            "Core Platform & Metadata Features",
+            "Operational & Advanced Features",
+        ]
+
+        for title, table in zip(titles, tables):
+            print(f"### {title}\n")
+            print("[")
+            for i, row in enumerate(table):
+                formatted_row = format_row(row)
+                if i == len(table) - 1:
+                    print(f"    {formatted_row}")
+                else:
+                    print(f"    {formatted_row},")
+            print("]")
+            print("\n")
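
For illustration only: the tables returned by generate_feature_tables() could also be rendered as
Markdown with the format_markdown_table() helper defined above, instead of the array format printed
by the __main__ block. The sketch below is not part of the patch; it assumes it lives in the same
module as the functions above and, like the __main__ block, is called inside a Flask app context.

def print_markdown_tables() -> None:
    """Render each generated table as a Markdown section (illustrative sketch only)."""
    titles = [
        "Feature Overview",
        "Database Information",
        "SQL Capabilities",
        "Time Grains – Common",
        "Time Grains – Extended",
        "Core Platform & Metadata Features",
        "Operational & Advanced Features",
    ]
    # generate_feature_tables() returns one 2D list per title, in the same order
    for title, table in zip(titles, generate_feature_tables()):
        print(f"### {title}\n")
        if table:
            # table[0] is the header row; the remaining rows are one per database
            print(format_markdown_table(table[0], table[1:]))
        print()

Calling print_markdown_tables() in place of the loop in __main__ would produce ready-to-paste
Markdown tables rather than the quoted-array output, at the cost of losing the Docusaurus data
format that format_row() targets.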