diff --git a/README.md b/README.md index 4de5831..8a98684 100644 --- a/README.md +++ b/README.md @@ -110,23 +110,28 @@ benchmark configurations using supported modules.** ## Support Matrix -### setup / installation +### Setup / Installation -| system | local | aws | docker | gcp | azure | -|------------|-------|---------------|--------|-----|-------| -| Exasol | ✗ | ✓1 | ✗ | ✗ | ✗ | -| ClickHouse | ✗ | ✓1 | ✗ | ✗ | ✗ | +| system | local | aws | docker | gcp | azure | +|------------|-------|-----|--------|-----|-------| +| Exasol | ✗ | ✓ | ✗ | ✗ | ✗ | +| ClickHouse | ✗ | ✓ | ✗ | ✗ | ✗ | -Notes: +### Workloads -1. Only single-node deployments supported at this time. + +[clickbench]: benchkit/workloads/clickbench/README.md "ClickHouse ClickBench" +[estuary]: benchkit/workloads/estuary/README.md "Estuary Warehouse Report" -### "tpch" workload +| system | tpch | [clickbench] | [estuary] | +|------------|------|--------------|-----------------| +| Exasol | ✓ | ✓ | ✓(1) | +| ClickHouse | ✓ | ✓ | ✗(1) | + +Notes: + +1. Work in Progress -| system | local | aws | docker | gcp | azure | -|------------|-------|-----|--------|-----|-------| -| Exasol | ✗ | ✓ | ✗ | ✗ | ✗ | -| ClickHouse | ✗ | ✓ | ✗ | ✗ | ✗ | ## Documentation diff --git a/benchkit/common/__init__.py b/benchkit/common/__init__.py index e69de29..89dc356 100644 --- a/benchkit/common/__init__.py +++ b/benchkit/common/__init__.py @@ -0,0 +1,2 @@ +from .file_management import download_file_to_storage +from .markers import exclude_from_package diff --git a/benchkit/common/file_management.py b/benchkit/common/file_management.py new file mode 100644 index 0000000..5d4d07b --- /dev/null +++ b/benchkit/common/file_management.py @@ -0,0 +1,13 @@ +from pathlib import Path + + +def download_file_to_storage(url: str, target: Path) -> None: + """Download data from a given URL to given target path""" + import requests + + with ( + requests.get(url, allow_redirects=True, stream=True) as handle, + open(target, "wb") as file, + ): + handle.raise_for_status() + file.write(handle.content) diff --git a/benchkit/common/markers.py b/benchkit/common/markers.py index 48b7859..13ff446 100644 --- a/benchkit/common/markers.py +++ b/benchkit/common/markers.py @@ -27,5 +27,5 @@ def exclude_from_package(obj: T) -> T: ... def install(self) -> bool: ... 
return self._install_database() """ - obj._exclude_from_package = True # type: ignore + obj._exclude_from_package = True # type: ignore[attr-defined] return obj diff --git a/benchkit/package/creator.py b/benchkit/package/creator.py index 2916cfc..19455ff 100644 --- a/benchkit/package/creator.py +++ b/benchkit/package/creator.py @@ -5,17 +5,16 @@ import shutil import zipfile from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, cast +from typing import Any, Literal, cast from jinja2 import Environment, FileSystemLoader from rich.console import Console -if TYPE_CHECKING: - from ..systems.base import SystemUnderTest - from ..util import ensure_directory - from .code_minimizer import CodeMinimizer - from .formatter import PackageFormatter - from .import_cleaner import ImportCleaner +from ..util import ensure_directory +from .code_minimizer import CodeMinimizer +from .formatter import PackageFormatter +from .import_cleaner import ImportCleaner +from ..systems.base import SystemUnderTest console = Console() @@ -352,9 +351,6 @@ def _copy_minimal_framework_files(self) -> None: "run": [ "parsers.py" ], # Only parsers, no __init__.py to avoid import issues - "package": [ - "markers.py", # imported by systems/base - ], "systems": None, # Copy all - needed for database connections "workloads": None, # Copy all - needed for workload execution "common": None, diff --git a/benchkit/run/runner.py b/benchkit/run/runner.py index 12f361c..60db368 100644 --- a/benchkit/run/runner.py +++ b/benchkit/run/runner.py @@ -6,20 +6,25 @@ from collections.abc import Callable from dataclasses import dataclass, field from pathlib import Path -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal import pandas as pd from rich.console import Console -from ..common.markers import exclude_from_package +from benchkit.common import exclude_from_package +from benchkit.util import ensure_directory, load_json, save_json + from ..debug import is_debug_enabled from ..systems import create_system -from ..systems.base import SystemUnderTest -from ..util import ensure_directory, load_json, save_json from ..workloads import create_workload from .parallel_executor import ParallelExecutor from .parsers import normalize_runs +if TYPE_CHECKING: + from benchkit.systems import SystemUnderTest + from benchkit.workloads import Workload + + console = Console() @@ -409,7 +414,7 @@ def _execute_phase( phase: PhaseConfig, context: ExecutionContext, force: bool = False, - workload: Any = None, + workload: "Workload | None" = None, ) -> bool | list[dict[str, Any]]: """ Universal phase executor handling setup, load, and query phases. @@ -514,7 +519,7 @@ def _build_phase_tasks( context: ExecutionContext, force: bool, package_path: Path | None, - workload: Any, + workload: "Workload", ) -> dict[str, Callable[[], TaskResult]]: """ Build task callables for each system in the benchmark. @@ -650,7 +655,7 @@ def _get_system_for_context( self, system_config: dict[str, Any], context: ExecutionContext, - ) -> SystemUnderTest: + ) -> "SystemUnderTest": """ Create system instance configured for the execution context. 
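
The runner now type-hints systems and workloads with quoted names and imports them only under `TYPE_CHECKING`. A minimal, self-contained sketch of that forward-reference pattern follows; the `run_phase()` function is illustrative only and not part of the runner:

```python
# Minimal sketch of the TYPE_CHECKING / forward-reference pattern used above
# to avoid import cycles. run_phase() is illustrative, not benchkit code.
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by static type checkers, never at runtime, so importing
    # benchkit.workloads here cannot create a circular import.
    from benchkit.workloads import Workload


def run_phase(workload: "Workload | None" = None) -> bool:
    # The quoted annotation stays a string at runtime, so the name Workload
    # does not need to be importable when this module is loaded.
    return workload is not None
```
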
@@ -736,11 +741,11 @@ def _get_system_for_context( @exclude_from_package def _setup_operation( self, - system: SystemUnderTest, + system: "SystemUnderTest", system_config: dict[str, Any], instance_manager: Any, package_path: Path | None, # unused for setup - workload: Any, # unused for setup + workload: "Workload", # unused for setup ) -> tuple[bool, dict[str, Any]]: """ Execute setup operation for a single system. @@ -849,11 +854,11 @@ def _setup_operation( def _load_operation( self, - system: SystemUnderTest, + system: "SystemUnderTest", system_config: dict[str, Any], instance_manager: Any, package_path: Path | None, - workload: Any, + workload: "Workload", ) -> tuple[bool, dict[str, Any]]: """ Execute load operation for a single system. @@ -902,11 +907,11 @@ def _load_operation( def _query_operation( self, - system: SystemUnderTest, + system: "SystemUnderTest", system_config: dict[str, Any], instance_manager: Any, package_path: Path | None, - workload: Any, + workload: "Workload", ) -> tuple[bool, list[dict[str, Any]]]: """ Execute query operation for a single system. @@ -1044,7 +1049,7 @@ def _create_system_for_local_execution( system_config: dict[str, Any], instance_manager: Any, output_callback: Callable[[str], None] | None = None, - ) -> SystemUnderTest: + ) -> "SystemUnderTest": """Create system object configured for local-to-remote execution. IMPORTANT: Uses the PUBLIC IP from the cloud instance manager to enable @@ -1483,17 +1488,12 @@ def _prepare_storage_phase(self) -> bool: return True def _execute_queries( - self, system: Any, workload: Any + self, system: "SystemUnderTest", workload: "Workload" ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: """Execute benchmark queries with timing and monitoring.""" - query_names = self.config["workload"]["queries"]["include"] runs_per_query = self.config["workload"]["runs_per_query"] warmup_runs = self.config["workload"]["warmup_runs"] - # If no queries specified, use workload's default (all queries) - if not query_names: - query_names = workload.queries_to_include - # Extract multiuser configuration multiuser_config = self.config["workload"].get("multiuser") or {} num_streams = 1 @@ -1508,7 +1508,7 @@ def _execute_queries( # Execute queries result_dict = workload.run_workload( system=system, - query_names=query_names, + query_names=workload.get_included_queries(), runs_per_query=runs_per_query, warmup_runs=warmup_runs, num_streams=num_streams, @@ -1570,7 +1570,7 @@ def _save_setup_summary( def _load_setup_summary_to_system( self, - system: SystemUnderTest, + system: "SystemUnderTest", system_name: str, executor: "ParallelExecutor | None" = None, ) -> None: diff --git a/benchkit/systems/base.py b/benchkit/systems/base.py index 375758d..2de44ce 100644 --- a/benchkit/systems/base.py +++ b/benchkit/systems/base.py @@ -7,13 +7,15 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal -from benchkit.common.markers import exclude_from_package +from benchkit.common import exclude_from_package + +from ..util import safe_command if TYPE_CHECKING: # avoid cyclic dependency problems - from ..util import safe_command from ..workloads import Workload + TableOperation = Literal[ "DEFAULT", "OPTIMIZE TABLE", @@ -70,6 +72,8 @@ def __init__( # Workload configuration for dynamic tuning self.workload_config = workload_config or {} + # default schema for queries + self.schema: str | None = None # Command recording for report reproduction self.setup_commands: list[dict[str, Any]] = [] @@ -421,6 +425,64 @@ def 
load_data_from_iterable( """ pass + def load_data_from_url_with_download( + self, + schema_name: str, + table_name: str, + data_url: str | list[str], + /, + extension: str = ".csv", + **kwargs: Any, + ) -> bool: + """ + Download given resources to host filesystem + and then load files into database using `load_data()` above + """ + from benchkit.common import download_file_to_storage + + local_storage: Path = self.data_dir or Path("/var/tmp") + downloaded_files: list[Path] = [] + file_part: int = 0 + ## download files locally + url_list: list[str] = [data_url] if isinstance(data_url, str) else data_url + for url in url_list: + file_name: str = f"{table_name}_{file_part}{extension}" + target_path: Path = local_storage / file_name + if target_path.exists(): + self._log(f"Download: reusing existing file {target_path}") + downloaded_files.append(target_path) + continue + self._log(f"Downloading {file_name} from {url}") + try: + download_file_to_storage(url, target_path) + downloaded_files.append(target_path) + except Exception as e: + self._log(f"Error downloading {file_name}: {e}") + return False + + ## then, import files one by one + for file in downloaded_files: + if not self.load_data(table_name, file, schema=schema_name): + return False + return True + + def load_data_from_url( + self, + schema_name: str, + table_name: str, + data_url: str | list[str], + /, + extension: str = ".csv", + **kwargs: Any, + ) -> bool: + """ + Load table data from a URL or a set of URLs. + Default implementation downloads data to local storage and then imports the downloaded file(s) using load_data() + """ + return self.load_data_from_url_with_download( + schema_name, table_name, data_url, extension=extension, **kwargs + ) + @abstractmethod def execute_query( self, @@ -1774,6 +1836,62 @@ def estimate_execution_time( """ return timedelta(minutes=5) + # noinspection PyMethodMayBeStatic + def split_sql_statements(self, sql: str) -> list[str]: + """ + Split SQL script into individual statements. 
+ Method is not static in case some system uses different syntax for splitting + + Handles: + - Semicolon-separated statements + - SQL comments (-- and /* */) + - Empty lines + + Returns: + List of individual SQL statements + """ + statements = [] + current_statement = [] + in_comment = False + + for line in sql.split("\n"): + stripped = line.strip() + + # Skip SQL comments + if stripped.startswith("--"): + continue + + # Handle multi-line comments + if "/*" in stripped: + in_comment = True + if "*/" in stripped: + in_comment = False + continue + if in_comment: + continue + + # Skip empty lines + if not stripped: + continue + + # Check if line ends with semicolon (statement terminator) + if stripped.endswith(";"): + # Add the line without semicolon to current statement + current_statement.append(stripped[:-1]) + # Join and add to statements list + statements.append("\n".join(current_statement)) + # Reset for next statement + current_statement = [] + else: + # Add line to current statement + current_statement.append(stripped) + + # Add any remaining statement (for scripts without trailing semicolon) + if current_statement: + statements.append("\n".join(current_statement)) + + return statements + def get_system_class(system_kind: str) -> type | None: """ diff --git a/benchkit/systems/clickhouse.py b/benchkit/systems/clickhouse.py index e3ca7e3..488f5d9 100644 --- a/benchkit/systems/clickhouse.py +++ b/benchkit/systems/clickhouse.py @@ -15,11 +15,11 @@ from benchkit.common.markers import exclude_from_package +from ..util import Timer from .base import SystemUnderTest, TableOperation if TYPE_CHECKING: # avoid cyclic dependency problems - from ..util import Timer from ..workloads import Workload @@ -1490,6 +1490,79 @@ def load_data_from_iterable( ) -> bool: raise NotImplementedError("clickhouse.load_data_from_iterable") + def load_data_from_url( + self, + schema_name: str, + table_name: str, + data_url: str | list[str], + /, + extension: str = ".csv", + **kwargs: Any, + ) -> bool: + if isinstance(data_url, list): + raise NotImplementedError("Loading multiple URLs into clickhouse") + + try: + self._log(f"Loading {data_url} into {table_name}...") + + import_query: str = ( + f"INSERT INTO {schema_name}.{table_name} SELECT * FROM url('{data_url}', CSV)" + ) + + if clickhouse_connect: + # Use clickhouse-connect for verification + client = self._get_client() + if not client: + self._log("Error: failed to connect to clickhouse system") + return False + + import_res = client.query(import_query) + self._log(f"{import_res}") + count_result = client.query( + f"SELECT COUNT(*) FROM {schema_name}.{table_name}" + ) + row_count = count_result.result_rows[0][0] + + else: + import_cmd = ( + f"clickhouse-client " + f"--user={self.username} " + f"--password={self.password} " + f"--query='{import_query}'" + ) + + # Execute the import + result = self.execute_command(import_cmd, timeout=3600.0, record=False) + + if not result.get("success", False): + self._log( + f"Failed to load data: {result.get('stderr', 'Unknown error')}" + ) + return False + + # Verify data was loaded by counting rows + count_cmd = ( + f"clickhouse-client " + f"--user={self.username} " + f"--password={self.password} " + f'--query="SELECT COUNT(*) FROM {schema_name}.{table_name} FORMAT TSV"' + ) + count_result = self.execute_command( + count_cmd, timeout=60.0, record=False + ) + if count_result.get("success", False): + row_count = int(count_result.get("stdout", "0").strip()) + else: + self._log("Warning: Could not verify row count") + return 
True # Assume success if import succeeded + + self._log(f"Successfully loaded {row_count:,} rows into {table_name}") + return True + + except Exception as e: + self._log(f"Failed to load data into {table_name}: {e}") + return False + def execute_query( self, query: str, diff --git a/benchkit/systems/exasol.py b/benchkit/systems/exasol.py index 8f7717a..cd46a77 100644 --- a/benchkit/systems/exasol.py +++ b/benchkit/systems/exasol.py @@ -11,11 +11,11 @@ from benchkit.common.markers import exclude_from_package +from ..util import Timer from .base import SystemUnderTest if TYPE_CHECKING: # avoid cyclic dependency problems - from ..util import Timer from ..workloads import Workload @@ -2109,3 +2109,53 @@ def _cleanup_disturbing_services(self, play_id: str) -> bool: return removed > 0 except Exception: return False + + def load_data_from_url( + self, + schema_name: str, + table_name: str, + data_url: str | list[str], + /, + extension: str = ".csv", + **kwargs: Any, + ) -> bool: + conn = None + + # split URL into bases and file names + data_sources: dict[Path, list[str]] = {} + for url in [data_url] if isinstance(data_url, str) else data_url: + p = Path(url) + prefix: Path = p.parent + if prefix not in data_sources: + data_sources[prefix] = [p.name] + else: + data_sources[prefix].append(p.name) + + try: + conn = self._get_connection() + if not conn: + return False + if not self._schema_created: + if self._schema_exists(conn, schema_name): + self._schema_created = True + conn.execute(f"OPEN SCHEMA {schema_name}") + + self._log(f"Loading {data_url} into {table_name}...") + base_sql = f"IMPORT INTO {schema_name}.{table_name} FROM CSV AT " + for host, files in data_sources.items(): + base_sql += f"'{host}' " + " ".join([f"FILE '{f}'" for f in files]) + + conn.execute(base_sql) + + # Verify data was loaded + result = conn.execute(f"SELECT COUNT(*) FROM {table_name}") + row_count = result.fetchone()[0] + self._log(f"Successfully loaded {row_count:,} rows into {table_name}") + return True + + except Exception as e: + self._log(f"Failed to load data into {table_name}: {e}") + return False + finally: + if conn: + conn.close() diff --git a/benchkit/workloads/__init__.py b/benchkit/workloads/__init__.py index d5533a6..605dad3 100644 --- a/benchkit/workloads/__init__.py +++ b/benchkit/workloads/__init__.py @@ -1,13 +1,15 @@ """Benchmark workloads.""" from .base import Workload +from .clickbench import Clickbench from .estuary import Estuary from .tpch import TPCH # Workload factory mapping -WORKLOAD_IMPLEMENTATIONS = { +WORKLOAD_IMPLEMENTATIONS: dict[str, type[Workload]] = { "tpch": TPCH, "estuary": Estuary, + "clickbench": Clickbench, } @@ -32,4 +34,11 @@ def create_workload(config: dict) -> Workload: return WORKLOAD_IMPLEMENTATIONS[name](config) -__all__ = ["Workload", "TPCH", "Estuary", "create_workload", "WORKLOAD_IMPLEMENTATIONS"] +__all__ = [ + "Workload", + "TPCH", + "Estuary", + "Clickbench", + "create_workload", + "WORKLOAD_IMPLEMENTATIONS", +] diff --git a/benchkit/workloads/base.py b/benchkit/workloads/base.py index d298a21..a2eb9bc 100644 --- a/benchkit/workloads/base.py +++ b/benchkit/workloads/base.py @@ -5,9 +5,12 @@ from abc import ABC, abstractmethod from collections.abc import Callable from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import timedelta from pathlib import Path from typing import Any +from jinja2 import Environment, FileSystemLoader + from ..systems.base import SystemUnderTest @@ -19,6 +22,40 @@ def __init__(self, config: dict[str, Any]): 
self.scale_factor = config.get("scale_factor", 1) self.config = config self.data_dir = Path(f"data/{self.name}/sf{self.scale_factor}") + self.workload_dir = ( + Path(__file__).parent.parent.parent / "workloads" / self.name + ) + self.template_env: Environment | None = None + # Store system for template resolution + self._current_system: SystemUnderTest | None = None + + self.data_format = config.get("data_format", "tbl") # TPC-H standard format + self.variant = config.get("variant", "official") # Query variant to use + self.generator: str = config.get("generator", "") + + self.system_variants = ( + config.get("system_variants") or {} + ) # Per-system variant overrides + + # Determine which queries to include based on include/exclude logic + queries_config = config.get("queries", {}) + self.queries_include = queries_config.get("include", []) + self.queries_exclude = queries_config.get("exclude", []) + + def get_template_env(self) -> Environment: + """Get the workload's jinja2 template environment""" + if not self.template_env: + self.template_env = Environment( + loader=FileSystemLoader( + [ + self.workload_dir / "queries", + self.workload_dir / "setup", + ] + ), + trim_blocks=True, + lstrip_blocks=True, + ) + return self.template_env def display_name(self) -> str: """Return user-friendly display name for workload""" @@ -56,7 +93,6 @@ def generate_data(self, output_dir: Path) -> bool: """ pass - @abstractmethod def create_schema(self, system: SystemUnderTest) -> bool: """ Create database schema and tables for the workload. @@ -67,7 +103,20 @@ def create_schema(self, system: SystemUnderTest) -> bool: Returns: True if schema creation successful, False otherwise """ - pass + # Use workload specific schema name like "tpch_sf100" + schema: str = self.get_schema_name() + + # First, create the schema using the system's method + if hasattr(system, "create_schema"): + print(f"Creating schema '{schema}'...") + if not system.create_schema(schema): + print(f"Failed to create schema '{schema}'") + return False + print(f"✓ Schema '{schema}' created successfully") + + # Then create the tables using the templated script + print(f"Creating tables for {self.name}") # TODO: display_name() + return self.execute_setup_script(system, "create_tables.sql") @abstractmethod def load_data(self, system: SystemUnderTest) -> bool: @@ -82,7 +131,6 @@ def load_data(self, system: SystemUnderTest) -> bool: """ pass - @abstractmethod def get_queries(self, system: SystemUnderTest | None = None) -> dict[str, str]: """ Get the benchmark queries. 
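
The `queries_include` / `queries_exclude` lists captured in `__init__` above drive query selection, applied by `get_included_queries()` further down. A detached sketch of that filtering, with illustrative TPC-H-style names not tied to a concrete workload:

```python
# Detached sketch of the include/exclude selection; names are illustrative.
def select_queries(all_queries: list[str],
                   include: list[str],
                   exclude: list[str]) -> list[str]:
    if include:
        # An explicit include list wins and keeps the canonical query order.
        return [q for q in all_queries if q in include]
    if exclude:
        # Otherwise run everything except the excluded queries.
        return [q for q in all_queries if q not in exclude]
    return all_queries


if __name__ == "__main__":
    names = [f"Q{i:02d}" for i in range(1, 23)]            # Q01 .. Q22
    print(select_queries(names, [], ["Q17", "Q20"]))       # all but Q17/Q20
    print(select_queries(names, ["Q01", "Q06"], ["Q17"]))  # include wins
```
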
@@ -93,7 +141,26 @@ def get_queries(self, system: SystemUnderTest | None = None) -> dict[str, str]: Returns: Dictionary mapping query names to SQL text """ - pass + # Use provided system or stored system + target_system = system or self._current_system + if target_system is None: + raise ValueError( + "System must be provided either as parameter or stored from previous call" + ) + + # Store system for future template resolution + self._current_system = target_system + + # Get and log the variant being used for this system + variant = self._get_query_variant_for_system(target_system) + if variant != "official": + print(f"Loading '{variant}' variant queries for {target_system.kind}") + + queries = {} + for query_name in self.get_included_queries(): + queries[query_name] = self._get_query_sql(query_name, target_system) + + return queries @abstractmethod def get_all_query_names(self) -> list[str]: @@ -105,7 +172,19 @@ def get_all_query_names(self) -> list[str]: """ pass - @abstractmethod + def get_included_queries(self) -> list[str]: + all_queries = self.get_all_query_names() + + if self.queries_include: + # If include is specified, use only those queries + return [q for q in all_queries if q in self.queries_include] + elif self.queries_exclude: + # If exclude is specified, use all queries except excluded ones + return [q for q in all_queries if q not in self.queries_exclude] + else: + # If neither is specified, use all queries + return all_queries + def run_query( self, system: SystemUnderTest, query_name: str, query_sql: str ) -> dict[str, Any]: @@ -120,7 +199,11 @@ def run_query( Returns: Query execution result dictionary """ - pass + # Substitute schema name in query + schema_name = self.get_schema_name() + formatted_sql = query_sql.format(schema=schema_name) + + return system.execute_query(formatted_sql, query_name=query_name) def prepare(self, system: SystemUnderTest) -> bool: """ @@ -244,7 +327,7 @@ def _run_workload_sequential( Returns: Dictionary with 'measured' and 'warmup' keys containing results """ - all_queries = self.get_queries() + all_queries = self.get_queries(system) measured_results = [] warmup_results = [] @@ -317,7 +400,7 @@ def _run_workload_multiuser( Returns: Dictionary with 'measured' and 'warmup' keys containing results """ - all_queries = self.get_queries() + all_queries = self.get_queries(system) warmup_results = [] print(f"Running multiuser workload with {num_streams} concurrent streams") @@ -603,6 +686,138 @@ def get_workload_info(self) -> dict[str, Any]: "schema_name": self.get_schema_name(), } + def execute_setup_script(self, system: SystemUnderTest, script_name: str) -> bool: + """Execute a templated setup script by rendering the jinja2 template and splitting into individual statements.""" + try: + # Get system extra config for conditional features + system_extra = {} + if hasattr(system, "setup_config"): + system_extra = system.setup_config.get("extra", {}) + + # Load and render the template + template = self.get_template_env().get_template(script_name) + + # Get node_count and cluster for multinode support + node_count = getattr(system, "node_count", 1) + cluster = getattr(system, "cluster_name", "benchmark_cluster") + + rendered_sql = template.render( + system_kind=system.kind, + scale_factor=self.scale_factor, + schema=self.get_schema_name(), + system_extra=system_extra, + node_count=node_count, + cluster=cluster, + ) + + # Split SQL into individual statements and execute them one by one + statements = system.split_sql_statements(rendered_sql) + + for idx, 
statement in enumerate(statements): + # Skip empty statements + if not statement.strip(): + continue + + # Calculate dynamic timeout for OPTIMIZE operations + # These can take a long time for large tables + timeout = self.calculate_statement_timeout(statement, system) + + # Execute each statement individually with calculated timeout + result = system.execute_query( + statement, + query_name=f"setup_{script_name.replace('.sql', '')}_{idx+1}", + timeout=int(timeout.total_seconds()), + ) + + if not result["success"]: + print( + f"Failed to execute statement {idx+1} in {script_name}: {result.get('error', 'Unknown error')}" + ) + print(f"Statement was: {statement[:200]}...") + return False + + return True + + except Exception as e: + print(f"Error executing setup script {script_name}: {e}") + return False + + def _get_query_variant_for_system(self, system: SystemUnderTest) -> str: + """ + Determine which query variant to use for a given system. + + Args: + system: System under test + + Returns: + Variant name to use for this system + """ + # Check if system has a specific variant override + if self.system_variants and system.name in self.system_variants: + return str(self.system_variants[system.name]) + # Otherwise use global variant + return str(self.variant) + + def _get_query_sql(self, query_name: str, system: SystemUnderTest) -> str: + """ + Get SQL text for a specific TPC-H query with variant and templates resolved. + + Priority order for loading queries: + 1. variants/{variant}/{system_kind}/{query_name}.sql (system-specific variant) + 2. variants/{variant}/{query_name}.sql (generic variant) + 3. {query_name}.sql (default/official with inline conditionals) + """ + try: + variant = self._get_query_variant_for_system(system) + + # Build priority-ordered list of query paths + query_paths = [ + f"variants/{variant}/{system.kind}/{query_name}.sql", + f"variants/{variant}/{query_name}.sql", + f"{query_name}.sql", + ] + + template = None + + # Try each path in order until one succeeds + for path in query_paths: + try: + template = self.get_template_env().get_template(path) + break + except Exception: + continue + + if template is None: + raise FileNotFoundError( + f"Query {query_name} not found in any variant path" + ) + + # Get system extra config for conditional features + system_extra = {} + if hasattr(system, "setup_config"): + system_extra = system.setup_config.get("extra", {}) + + # Render template with variant context + rendered_sql = template.render( + system_kind=system.kind, + scale_factor=self.scale_factor, + schema=self.get_schema_name(), + variant=variant, + system_extra=system_extra, + ) + + return rendered_sql + + except Exception as e: + print(f"Error loading query {query_name}: {e}") + return f"-- Error loading query {query_name}: {e}" + + def calculate_statement_timeout( + self, statement: str, system: SystemUnderTest + ) -> timedelta: + """Default implementation: 5 minutes for any statement""" + return timedelta(minutes=5) + def get_workload_class(workload_name: str) -> type | None: """ diff --git a/benchkit/workloads/clickbench/README.md b/benchkit/workloads/clickbench/README.md new file mode 100644 index 0000000..19eff61 --- /dev/null +++ b/benchkit/workloads/clickbench/README.md @@ -0,0 +1,8 @@ +# ClickBench + +## Abstract + +ClickBench is a benchmark developed by ClickHouse, hosted at [GitHub](https://github.com/ClickHouse/ClickBench), +including an online result browser at [https://benchmark.clickhouse.com](https://benchmark.clickhouse.com). 
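
Returning briefly to the query lookup added in `benchkit/workloads/base.py` above: `_get_query_sql()` tries a system-specific variant first, then a generic variant, then the official query file. A minimal sketch of that priority order, assuming a local `./queries` directory and made-up render variables (`demo_schema`); the real method takes its search paths from `get_template_env()` and renders additional context:

```python
# Minimal sketch of the priority-ordered template lookup performed by
# _get_query_sql(). Paths, schema name and render variables are illustrative.
from jinja2 import Environment, FileSystemLoader, TemplateNotFound

env = Environment(
    loader=FileSystemLoader(["./queries"]),
    trim_blocks=True,
    lstrip_blocks=True,
)


def resolve_query(query_name: str, variant: str, system_kind: str) -> str:
    candidates = [
        f"variants/{variant}/{system_kind}/{query_name}.sql",  # system-specific variant
        f"variants/{variant}/{query_name}.sql",                # generic variant
        f"{query_name}.sql",                                   # official default
    ]
    for path in candidates:
        try:
            template = env.get_template(path)
        except TemplateNotFound:
            continue
        return template.render(
            system_kind=system_kind,
            schema="demo_schema",
            variant=variant,
        )
    raise FileNotFoundError(f"Query {query_name} not found in any variant path")
```
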
+ +The benchmark encompasses 43 queries of different complexities on a single table of nearly 100 million rows. diff --git a/benchkit/workloads/clickbench/__init__.py b/benchkit/workloads/clickbench/__init__.py new file mode 100644 index 0000000..edacbdf --- /dev/null +++ b/benchkit/workloads/clickbench/__init__.py @@ -0,0 +1 @@ +from .clickbench import Clickbench diff --git a/benchkit/workloads/clickbench/clickbench.py b/benchkit/workloads/clickbench/clickbench.py new file mode 100644 index 0000000..6c3060c --- /dev/null +++ b/benchkit/workloads/clickbench/clickbench.py @@ -0,0 +1,46 @@ +from pathlib import Path +from typing import Any + +from benchkit.systems import SystemUnderTest +from benchkit.workloads import Workload + + +class Clickbench(Workload): + + CSV_SOURCE_URL: str = "https://datasets.clickhouse.com/hits_compatible/hits.csv.gz" + + def __init__(self, config: dict[str, Any]): + super().__init__(config) + # override default from base + self.data_format = config.get("data_format", "csv") + + def display_name(self) -> str: + # no scale factor + return "ClickBench" + + def generate_data(self, output_dir: Path) -> bool: + """TODO: download (and decompress?) data from web server""" + if self.generator == "download": + # is easily implemented, all components are there in common.file_management and systems.base + raise NotImplementedError("downloading data before load") + return True + + def load_data(self, system: SystemUnderTest) -> bool: + """Load data into the single benchmark table""" + system.schema = self.get_schema_name() + return system.load_data_from_url( + self.get_schema_name(), "hits", self.CSV_SOURCE_URL + ) + + def get_all_query_names(self) -> list[str]: + # uppercase Q, and query number with no leading zeroes + return [f"Q{n}" for n in range(43)] + + def get_schema_name(self) -> str: + return "clickbench" + + def estimate_filesystem_usage_gb(self, system: SystemUnderTest) -> int: + if system.SUPPORTS_STREAMLOAD: + return 0 + # if system.SUPPORTS_LOAD_FROM_COMPRESSED: + return 90 diff --git a/workloads/estuary/README.md b/benchkit/workloads/estuary/README.md similarity index 98% rename from workloads/estuary/README.md rename to benchkit/workloads/estuary/README.md index 8a8c8a7..439d8fb 100644 --- a/workloads/estuary/README.md +++ b/benchkit/workloads/estuary/README.md @@ -33,7 +33,7 @@ This is a benchmark using the TPC-H schema and data, but different queries. | Microsoft Fabric | DW3000c | ? | 0.99 | $340 | 343 min | | Microsoft Fabric | DW1500c | ? | 0.48 | $290 | 604 min | | Microsoft Fabric | DW500c | ? | x | $236 | - | -| BigQuery | serveless | ? | 15.64 | $241 | 15 min | +| BigQuery | serverless | ? 
| 15.64 | $241 | 15 min | Notes: diff --git a/benchkit/workloads/estuary/estuary.py b/benchkit/workloads/estuary/estuary.py index 3dbcc65..f5d5672 100644 --- a/benchkit/workloads/estuary/estuary.py +++ b/benchkit/workloads/estuary/estuary.py @@ -1,16 +1,11 @@ from pathlib import Path from typing import Any -from jinja2 import Environment, FileSystemLoader - from benchkit.systems import SystemUnderTest -from benchkit.workloads.tpch import TPCH - +from benchkit.workloads import Workload -## TODO -- refactoring #14 -class Estuary(TPCH): - """Inherits all but very few methods from the TPC-H benchmark""" +class Estuary(Workload): @classmethod def get_python_dependencies(cls) -> list[str]: """Return Python packages required for Estuary workload.""" @@ -19,21 +14,10 @@ def get_python_dependencies(cls) -> list[str]: def __init__(self, config: dict[str, Any]): super().__init__(config) - # Override workload folders - self.workload_dir = ( - Path(__file__).parent.parent.parent / "workloads" / "estuary" - ) - self.template_env = Environment( - loader=FileSystemLoader( - [self.workload_dir / "queries", self.workload_dir / "setup"] - ), - trim_blocks=True, - lstrip_blocks=True, - ) - + # arbitrary restriction introduced by the row calculation in load_data and the broken Faker setup assert ( - 1 <= self.scale_factor <= 1000 - ), "estuary benchmark only supports scale factors 1 to 1000" + 1 <= self.scale_factor <= 100 + ), "estuary benchmark currently only supports scale factors 1 to 100" def get_workload_description(self) -> dict[str, Any]: """Return Estuary workload description.""" @@ -119,7 +103,20 @@ def estimate_filesystem_usage_gb(self, system: SystemUnderTest) -> int: """ Estuary workload uses streaming import where possible, in which case it does not require local storage. 
+ Otherwise, same code as TPC-H (although not 100% correct, as Estuary loads table-by-table) """ if system.SUPPORTS_STREAMLOAD: return 0 - return super().estimate_filesystem_usage_gb(system) + + def scale_multiplier(sf: float) -> float: + # 2.0 at very small sf (≈1–10), ~1.6 at 30, →1.3 for sf ≥ 100 + # f(sf) = 1.3 + 0.7 / (1 + (sf/K)^p), with K≈26.8537, p≈2.5966 + if sf <= 10: + return 2.0 + val = 1.3 + 0.7 / (1.0 + (sf / 26.853725639548) ** 2.5965770266157073) + return float(max(1.3, min(val, 2.0))) + + def estimate_gb(sf: float) -> int: + return int(max(sf * scale_multiplier(sf), 3.0)) + + return estimate_gb(float(self.scale_factor)) diff --git a/benchkit/workloads/tpch.py b/benchkit/workloads/tpch.py index d7d108e..a97ab95 100644 --- a/benchkit/workloads/tpch.py +++ b/benchkit/workloads/tpch.py @@ -6,8 +6,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any -from jinja2 import Environment, FileSystemLoader - +from ..systems.base import SystemUnderTest from ..util import safe_command from .base import Workload @@ -26,43 +25,6 @@ def get_python_dependencies(cls) -> list[str]: def __init__(self, config: dict[str, Any]): super().__init__(config) - self.data_format = config.get("data_format", "tbl") # TPC-H standard format - self.variant = config.get("variant", "official") # Query variant to use - self.system_variants = ( - config.get("system_variants") or {} - ) # Per-system variant overrides - - # Determine which queries to include based on include/exclude logic - queries_config = config.get("queries", {}) - queries_include = queries_config.get("include", []) - queries_exclude = queries_config.get("exclude", []) - - all_queries = list(self.get_all_query_names()) - - if queries_include: - # If include is specified, use only those queries - self.queries_to_include = queries_include - elif queries_exclude: - # If exclude is specified, use all queries except excluded ones - self.queries_to_include = [ - q for q in all_queries if q not in queries_exclude - ] - else: - # If neither is specified, use all queries - self.queries_to_include = all_queries - - # Setup template environment for TPC-H workload - self.workload_dir = Path(__file__).parent.parent.parent / "workloads" / "tpch" - self.template_env = Environment( - loader=FileSystemLoader( - [self.workload_dir / "queries", self.workload_dir / "setup"] - ), - trim_blocks=True, - lstrip_blocks=True, - ) - - # Store system for template resolution - self._current_system: SystemUnderTest | None = None def generate_data(self, output_dir: Path) -> bool: """Generate TPC-H data using tpchgen-cli (modern Python approach).""" @@ -117,111 +79,20 @@ def _generate_with_tpchgen_cli(self, output_dir: Path) -> bool: print(f"Data generation failed: {e}") return False - def create_schema(self, system: SystemUnderTest) -> bool: - """Create TPC-H schema and tables using templated setup scripts.""" - # Use TPC-H specific schema name (e.g., tpch_sf100) - schema = self.get_schema_name() - - # First, create the schema using the system's method - if hasattr(system, "create_schema"): - print(f"Creating schema '{schema}'...") - if not system.create_schema(schema): - print(f"Failed to create schema '{schema}'") - return False - print(f"✓ Schema '{schema}' created successfully") - - # Then create the tables using the templated script - print("Creating TPC-H tables...") - return self._execute_setup_script(system, "create_tables.sql") - def create_indexes(self, system: SystemUnderTest) -> bool: """Create TPC-H indexes using templated setup scripts.""" - 
return self._execute_setup_script(system, "create_indexes.sql") + return self.execute_setup_script(system, "create_indexes.sql") def analyze_tables(self, system: SystemUnderTest) -> bool: """Analyze TPC-H tables using templated setup scripts.""" - return self._execute_setup_script(system, "analyze_tables.sql") - - def _execute_setup_script(self, system: SystemUnderTest, script_name: str) -> bool: - """Execute a templated setup script by splitting into individual statements.""" - try: - # Get system extra config for conditional features - system_extra = {} - if hasattr(system, "setup_config"): - system_extra = system.setup_config.get("extra", {}) - - # Load and render the template - template = self.template_env.get_template(script_name) - - # Get node_count and cluster for multinode support - node_count = getattr(system, "node_count", 1) - cluster = getattr(system, "cluster_name", "benchmark_cluster") - - rendered_sql = template.render( - system_kind=system.kind, - scale_factor=self.scale_factor, - schema=self.get_schema_name(), - system_extra=system_extra, - node_count=node_count, - cluster=cluster, - ) - - # Split SQL into individual statements and execute them one by one - statements = self._split_sql_statements(rendered_sql) - - for idx, statement in enumerate(statements): - # Skip empty statements - if not statement.strip(): - continue - - # Calculate dynamic timeout for OPTIMIZE operations - # These can take a long time for large tables - timeout: int = int( - self._calculate_statement_timeout(statement, system).total_seconds() - ) - - # Execute each statement individually with calculated timeout - result = system.execute_query( - statement, - query_name=f"setup_{script_name.replace('.sql', '')}_{idx+1}", - timeout=timeout, - ) - - if not result["success"]: - print( - f"Failed to execute statement {idx+1} in {script_name}: {result.get('error', 'Unknown error')}" - ) - print(f"Statement was: {statement[:200]}...") - return False + return self.execute_setup_script(system, "analyze_tables.sql") - return True - - except Exception as e: - print(f"Error executing setup script {script_name}: {e}") - return False - - def _calculate_statement_timeout( + def calculate_statement_timeout( self, statement: str, system: SystemUnderTest ) -> timedelta: - """ - Calculate dynamic timeout for a SQL statement based on scale factor and node count. - - For OPTIMIZE TABLE operations on large datasets, the default timeout is often - insufficient. This method calculates appropriate timeouts based on - - expected table/data size - - the operation - - the system executing the operation (kind and node count) - - Args: - statement: SQL statement to execute - system: System under test, with its own characteristics - - Returns: - Recommended statement timeout - """ - # Check if this is an OPTIMIZE operation (ClickHouse specific) statement_upper = statement.upper().strip() + if "OPTIMIZE TABLE" in statement_upper: # Base timeout for OPTIMIZE: 10 minutes per SF1 # Scale factor 100 = ~1000 minutes base for lineitem @@ -258,61 +129,6 @@ def _calculate_statement_timeout( return system.estimate_execution_time("DEFAULT", self.scale_factor) - # note: method is not not static in case systems use different split semantics - def _split_sql_statements(self, sql: str) -> list[str]: - """ - Split SQL script into individual statements. 
- - Handles: - - Semicolon-separated statements - - SQL comments (-- and /* */) - - Empty lines - - Returns: - List of individual SQL statements - """ - statements = [] - current_statement = [] - in_comment = False - - for line in sql.split("\n"): - stripped = line.strip() - - # Skip SQL comments - if stripped.startswith("--"): - continue - - # Handle multi-line comments - if "/*" in stripped: - in_comment = True - if "*/" in stripped: - in_comment = False - continue - if in_comment: - continue - - # Skip empty lines - if not stripped: - continue - - # Check if line ends with semicolon (statement terminator) - if stripped.endswith(";"): - # Add the line without semicolon to current statement - current_statement.append(stripped[:-1]) - # Join and add to statements list - statements.append("\n".join(current_statement)) - # Reset for next statement - current_statement = [] - else: - # Add line to current statement - current_statement.append(stripped) - - # Add any remaining statement (for scripts without trailing semicolon) - if current_statement: - statements.append("\n".join(current_statement)) - - return statements - def prepare(self, system: SystemUnderTest) -> bool: """Complete TPC-H setup process: generate data, create tables, load data, create indexes, analyze tables.""" print("Setting up TPC-H workload...") @@ -398,30 +214,6 @@ def load_data(self, system: SystemUnderTest) -> bool: return True - def get_queries(self, system: SystemUnderTest | None = None) -> dict[str, str]: - """Get TPC-H queries with templates and variants resolved for the target system.""" - # Use provided system or stored system - target_system = system or self._current_system - if target_system is None: - raise ValueError( - "System must be provided either as parameter or stored from previous call" - ) - - # Store system for future template resolution - self._current_system = target_system - - # Get and log the variant being used for this system - variant = self._get_query_variant_for_system(target_system) - if variant != "official": - print(f"Loading '{variant}' variant queries for {target_system.kind}") - - queries = {} - for query_name in self.get_all_query_names(): - if query_name in self.queries_to_include: - queries[query_name] = self._get_query_sql(query_name, target_system) - - return queries - def run_workload( self, system: SystemUnderTest, @@ -459,16 +251,6 @@ def run_workload( return result_dict - def run_query( - self, system: SystemUnderTest, query_name: str, query_sql: str - ) -> dict[str, Any]: - """Execute a TPC-H query.""" - # Substitute schema name in query - schema_name = self.get_schema_name() - formatted_sql = query_sql.format(schema=schema_name) - - return system.execute_query(formatted_sql, query_name=query_name) - def get_workload_description(self) -> dict[str, Any]: """Return TPC-H workload description.""" return { @@ -640,7 +422,7 @@ def get_rendered_setup_scripts(self, system: SystemUnderTest) -> dict[str, str]: for script_name in ["create_tables", "create_indexes", "analyze_tables"]: try: - template = self.template_env.get_template(f"{script_name}.sql") + template = self.get_template_env().get_template(f"{script_name}.sql") system_extra = {} if hasattr(system, "setup_config"): system_extra = system.setup_config.get("extra", {}) @@ -668,78 +450,6 @@ def get_all_query_names(self) -> list[str]: """Get list of all TPC-H query names.""" return [f"Q{i:02d}" for i in range(1, 23)] # Q01 through Q22 - # Note: Table DDLs are now handled by templated setup scripts - - def 
_get_query_variant_for_system(self, system: SystemUnderTest) -> str: - """ - Determine which query variant to use for a given system. - - Args: - system: System under test - - Returns: - Variant name to use for this system - """ - # Check if system has a specific variant override - if self.system_variants and system.name in self.system_variants: - return str(self.system_variants[system.name]) - # Otherwise use global variant - return str(self.variant) - - def _get_query_sql(self, query_name: str, system: SystemUnderTest) -> str: - """ - Get SQL text for a specific TPC-H query with variant and templates resolved. - - Priority order for loading queries: - 1. variants/{variant}/{system_kind}/{query_name}.sql (system-specific variant) - 2. variants/{variant}/{query_name}.sql (generic variant) - 3. {query_name}.sql (default/official with inline conditionals) - """ - try: - variant = self._get_query_variant_for_system(system) - - # Build priority-ordered list of query paths - query_paths = [ - f"variants/{variant}/{system.kind}/{query_name}.sql", - f"variants/{variant}/{query_name}.sql", - f"{query_name}.sql", - ] - - template = None - - # Try each path in order until one succeeds - for path in query_paths: - try: - template = self.template_env.get_template(path) - break - except Exception: - continue - - if template is None: - raise FileNotFoundError( - f"Query {query_name} not found in any variant path" - ) - - # Get system extra config for conditional features - system_extra = {} - if hasattr(system, "setup_config"): - system_extra = system.setup_config.get("extra", {}) - - # Render template with variant context - rendered_sql = template.render( - system_kind=system.kind, - scale_factor=self.scale_factor, - schema=self.get_schema_name(), - variant=variant, - system_extra=system_extra, - ) - - return rendered_sql - - except Exception as e: - print(f"Error loading query {query_name}: {e}") - return f"-- Error loading query {query_name}: {e}" - def get_schema_name(self) -> str: """Get the schema name for TPC-H workload.""" # ClickHouse uses 'database', Exasol uses 'schema' diff --git a/configs/clickbench_exa_vs_ch.yaml b/configs/clickbench_exa_vs_ch.yaml new file mode 100644 index 0000000..9e81cd4 --- /dev/null +++ b/configs/clickbench_exa_vs_ch.yaml @@ -0,0 +1,82 @@ +title: "ClickBench comparison for Exasol vs ClickHouse, single user" +author: "Stefan Reich, Principal Database Consultant at Exasol AG" + +# Execution configuration (optional) +execution: + parallel: true # Enable parallel execution of systems (default: false) + max_workers: 2 # Max concurrent systems (default: number of systems) + +env: + mode: "aws" # aws, gcp, azure, or local + region: "eu-west-1" + allow_external_database_access: true # Allow external access to database ports + # Instance configuration per system + instances: + exasol: + instance_type: "c6a.8xlarge" # 32 vCPUs, 64GB RAM, $1.224 per hour (+ EBS) + disk: + type: "ebs" # Use local NVMe for best performance + size_gb: 150 + label: "sr-bench-exa" + clickhouse: + instance_type: "c6a.8xlarge" # 32 vCPUs, 64GB RAM, $1.224 per hour (+ EBS) + disk: + type: "ebs" + size_gb: 150 + label: "sr-bench-ch" + os_image: "ubuntu-22.04" + ssh_key_name: "sr-testkey" # AWS key pair name (without .pem extension) + ssh_private_key_path: "~/.ssh/sr-testkey.pem" # Local private key file path + +systems: + # Baseline: Exasol with original queries (first system is baseline) + - name: "exasol" + kind: "exasol" + version: "2025.1.8" # Latest Exasol version + setup: + method: "installer" # 
Native c4 installation + use_additional_disk: true # Auto-detect and use additional NVMe disk + c4_version: "4.28.5" # Latest c4 version + host_addrs: "$EXASOL_PRIVATE_IP" # Will be resolved from infrastructure + host_external_addrs: "$EXASOL_PUBLIC_IP" + image_password: "exasol123" + db_password: "exasol456" + license_file: "configs/exasol.license" # License file path + ## current code NEEDS this schema name here + schema: "clickbench" + db_mem_size: 51200 # RAM in MB + extra: + optimizer_mode: "analytical" + # Exasol database parameters for analytical workload optimization + db_params: + - "-useQueryCache=0" + - name: "clickhouse" + kind: "clickhouse" + version: "25.10.2.65" # Latest stable ClickHouse version + setup: + method: "native" # Native APT installation + use_additional_disk: true # Auto-detect and use additional NVMe disk + data_dir: "/data/clickhouse" + host: "$CLICKHOUSE_PUBLIC_IP" # Use public IP for external access + port: 8123 # HTTP port for clickhouse-connect + username: "default" + password: "clickhouse123" # Password for benchmark user + database: "clickbench" + extra: + memory_limit: "50g" # Memory limit for ClickHouse + max_threads: "32" # Match instance vCPUs + max_memory_usage: "50000000000" # ~128GB in bytes + +workload: + name: "clickbench" + runs_per_query: 5 # Number of measured runs + warmup_runs: 1 # Warmup runs before measurement + scale_factor: 1 # ClickBench does not have a scale factor, but the template check expects one + generator: "url-import" # fallback to "url-download" if url-import is not supported by system + format: "csv" + +report: + show_boxplots: true + show_latency_cdf: true + show_bar_chart: true + show_heatmap: true diff --git a/configs/clickbench_exa_vs_ch_mu.yaml b/configs/clickbench_exa_vs_ch_mu.yaml new file mode 100644 index 0000000..2413c75 --- /dev/null +++ b/configs/clickbench_exa_vs_ch_mu.yaml @@ -0,0 +1,87 @@ +title: "ClickBench comparison for Exasol vs ClickHouse, 10 users" +author: "Stefan Reich, Principal Database Consultant at Exasol AG" + +# Execution configuration (optional) +execution: + parallel: true # Enable parallel execution of systems (default: false) + max_workers: 2 # Max concurrent systems (default: number of systems) + +env: + mode: "aws" # aws, gcp, azure, or local + region: "eu-west-1" + allow_external_database_access: true # Allow external access to database ports + # Instance configuration per system + instances: + exasol: + instance_type: "c6a.8xlarge" # 32 vCPUs, 64GB RAM, $1.224 per hour (+ EBS) + disk: + type: "ebs" # Use local NVMe for best performance + size_gb: 150 + label: "sr-bench-exa" + clickhouse: + instance_type: "c6a.8xlarge" # 32 vCPUs, 64GB RAM, $1.224 per hour (+ EBS) + disk: + type: "ebs" + size_gb: 150 + label: "sr-bench-ch" + os_image: "ubuntu-22.04" + ssh_key_name: "sr-testkey" # AWS key pair name (without .pem extension) + ssh_private_key_path: "~/.ssh/sr-testkey.pem" # Local private key file path + +systems: + # Baseline: Exasol with original queries (first system is baseline) + - name: "exasol" + kind: "exasol" + version: "2025.1.8" # Latest Exasol version + setup: + method: "installer" # Native c4 installation + use_additional_disk: true # Auto-detect and use additional NVMe disk + c4_version: "4.28.5" # Latest c4 version + host_addrs: "$EXASOL_PRIVATE_IP" # Will be resolved from infrastructure + host_external_addrs: "$EXASOL_PUBLIC_IP" + image_password: "exasol123" + db_password: "exasol456" + license_file: "configs/exasol.license" # License file path + ## current code NEEDS this 
schema name here + schema: "clickbench" + db_mem_size: 51200 # RAM in MB + extra: + optimizer_mode: "analytical" + # Exasol database parameters for analytical workload optimization + db_params: + - "-useQueryCache=0" + - name: "clickhouse" + kind: "clickhouse" + version: "25.10.2.65" # Latest stable ClickHouse version + setup: + method: "native" # Native APT installation + use_additional_disk: true # Auto-detect and use additional NVMe disk + data_dir: "/data/clickhouse" + host: "$CLICKHOUSE_PUBLIC_IP" # Use public IP for external access + port: 8123 # HTTP port for clickhouse-connect + username: "default" + password: "clickhouse123" # Password for benchmark user + database: "clickbench" + extra: + memory_limit: "50g" # Memory limit for ClickHouse + max_threads: "32" # Match instance vCPUs + max_memory_usage: "50000000000" # ~128GB in bytes + +workload: + name: "clickbench" + runs_per_query: 5 # Number of measured runs + warmup_runs: 1 # Warmup runs before measurement + scale_factor: 1 # ClickBench does not have a scale factor, but the template check expects one + generator: "url-import" # fallback to "url-download" if url-import is not supported by system + format: "csv" + multiuser: + enabled: true + num_streams: 10 + randomize: true + random_seed: 64738 + +report: + show_boxplots: true + show_latency_cdf: true + show_bar_chart: true + show_heatmap: true diff --git a/configs/test_exasol_estuary_10g.yaml b/configs/wip_exasol_estuary_10g.yaml similarity index 98% rename from configs/test_exasol_estuary_10g.yaml rename to configs/wip_exasol_estuary_10g.yaml index 217423e..1a15b61 100644 --- a/configs/test_exasol_estuary_10g.yaml +++ b/configs/wip_exasol_estuary_10g.yaml @@ -1,5 +1,6 @@ title: "Exasol Standalone for estuary warehouse benchmark" author: "Stefan Reich, Principal Database Consultant at Exasol AG" +note: "WORK IN PROGRESS -- not stable yet" # Execution configuration (optional) execution: diff --git a/pyproject.toml b/pyproject.toml index a9bd912..6df63ef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,8 @@ dev = [ "types-Markdown>=3.0.0", "pandas-stubs>=2.0.0", "boto3-stubs[sts,ec2]>=1.26.0", + "types-requests", + "types-PyYAML", # Database drivers for testing framework (not needed in generated packages) "pyexasol>=0.25.0", "clickhouse-connect>=0.6.0", diff --git a/templates/package/cli.py.j2 b/templates/package/cli.py.j2 index f27a769..c5384fc 100644 --- a/templates/package/cli.py.j2 +++ b/templates/package/cli.py.j2 @@ -16,6 +16,7 @@ console = Console() def load_workload_config(config_path: str) -> dict[str, Any]: """Load workload config without full validation (minimal fields only).""" + config: dict with open(config_path, "r") as f: config = yaml.safe_load(f) return config diff --git a/templates/package/workload_runner.py.j2 b/templates/package/workload_runner.py.j2 index c958ac0..43d78cc 100644 --- a/templates/package/workload_runner.py.j2 +++ b/templates/package/workload_runner.py.j2 @@ -63,14 +63,11 @@ def execute_queries(config: dict[str, Any], output_dir: Path, force: bool = Fals random_seed = multiuser_config.get("random_seed", None) console.print(f"[dim]Multiuser mode: {num_streams} streams, randomize={randomize}, seed={random_seed}[/dim]") - # Get query names from workload object (which handles include/exclude logic) - query_names = workload.queries_to_include if hasattr(workload, 'queries_to_include') else [] - # Execute queries only (data should already be loaded) console.print(f"[dim]Running queries...[/dim]") try: result_dict = 
workload.run_workload( - system, query_names, runs_per_query, warmup_runs, + system, workload.get_included_queries(), runs_per_query, warmup_runs, num_streams, randomize, random_seed ) # Handle dict return format (measured and warmup results) diff --git a/tests/test_downloading.py b/tests/test_downloading.py new file mode 100644 index 0000000..5b8ff4d --- /dev/null +++ b/tests/test_downloading.py @@ -0,0 +1,65 @@ +from pathlib import Path + +# noinspection PyUnusedImports +import pytest +from requests.exceptions import ConnectionError, HTTPError, MissingSchema + +from benchkit.common import download_file_to_storage + + +def get_temp_file_name() -> Path: + import tempfile + + with tempfile.NamedTemporaryFile() as f: + return Path(f.name) + + +def test_bad_url(): + with pytest.raises(MissingSchema): + download_file_to_storage("hello world", Path("data.csv")) + + +def test_no_server(): + with pytest.raises(ConnectionError): + download_file_to_storage( + "https://localhost:123/data.csv.gz", Path("data.csv.gz") + ) + + +def test_no_file(): + with pytest.raises(HTTPError) as e: + download_file_to_storage( + "https://exasol.com/data_no_such_file.tgz", Path("none") + ) + assert "404" in str(e.value) + + +def test_no_access(): + with pytest.raises(HTTPError) as e: + download_file_to_storage( + "https://x-up.s3.amazonaws.com/releases/c4/linux/x86_64/no_such_version/c4", + Path("xxx"), + ) + assert "403" in str(e.value) + + +def test_good_file(): + from hashlib import file_digest + + target: Path = get_temp_file_name() + assert not target.exists() + try: + download_file_to_storage( + "https://github.githubassets.com/favicons/favicon.png", target + ) + assert target.exists() + assert target.stat().st_size == 958 + with open(target, "rb") as file: + x = file_digest(file, "sha256") + assert ( + x.hexdigest() + == "74cf90ac2fe6624ab1056cacea11cf7ed4f8bef54bbb0e869638013bba45bc08" + ) + + finally: + target.unlink(missing_ok=True) diff --git a/tests/test_workload_interface.py b/tests/test_workload_interface.py index 466d072..5e49518 100644 --- a/tests/test_workload_interface.py +++ b/tests/test_workload_interface.py @@ -89,16 +89,16 @@ def test_timeout_calculation( workload: TPCH = TPCH({"name": "tpch", "scale_factor": scale_factor}) system.node_count = node_count - assert workload._calculate_statement_timeout( + assert workload.calculate_statement_timeout( "OPTIMIZE TABLE ORDERS", system ) == timedelta(seconds=orders_timeout_seconds) - assert workload._calculate_statement_timeout( + assert workload.calculate_statement_timeout( "SELECT * FROM ORDERS", system ) == timedelta(minutes=5), "should be default timeout" assert ( timedelta(minutes=5) - <= workload._calculate_statement_timeout("MATERIALIZE STATISTICS", system) + <= workload.calculate_statement_timeout("MATERIALIZE STATISTICS", system) <= timedelta(hours=1) ), "timeout should be within bounds" diff --git a/workloads/clickbench/queries/Q0.sql b/workloads/clickbench/queries/Q0.sql new file mode 100644 index 0000000..d0bc616 --- /dev/null +++ b/workloads/clickbench/queries/Q0.sql @@ -0,0 +1,2 @@ +SELECT COUNT(*) FROM hits; + diff --git a/workloads/clickbench/queries/Q1.sql b/workloads/clickbench/queries/Q1.sql new file mode 100644 index 0000000..a4f6026 --- /dev/null +++ b/workloads/clickbench/queries/Q1.sql @@ -0,0 +1,2 @@ +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; + diff --git a/workloads/clickbench/queries/Q10.sql b/workloads/clickbench/queries/Q10.sql new file mode 100644 index 0000000..9f04931 --- /dev/null +++ 
b/workloads/clickbench/queries/Q10.sql @@ -0,0 +1,10 @@ +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u +FROM hits +{% if system_kind == 'exasol' %} +WHERE MobilePhoneModel IS NOT NULL +{% else %} +WHERE MobilePhoneModel <> '' +{% endif %} +GROUP BY MobilePhoneModel +ORDER BY u DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q11.sql b/workloads/clickbench/queries/Q11.sql new file mode 100644 index 0000000..0500953 --- /dev/null +++ b/workloads/clickbench/queries/Q11.sql @@ -0,0 +1,9 @@ +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits +{% if system_kind == 'exasol' %} +WHERE MobilePhoneModel IS NOT NULL +{% else %} +WHERE MobilePhoneModel <> '' +{% endif %} +GROUP BY MobilePhone, MobilePhoneModel +ORDER BY u DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q12.sql b/workloads/clickbench/queries/Q12.sql new file mode 100644 index 0000000..90ae056 --- /dev/null +++ b/workloads/clickbench/queries/Q12.sql @@ -0,0 +1,10 @@ +SELECT SearchPhrase, COUNT(*) AS c +FROM hits +{% if system_kind == 'exasol' %} +WHERE SearchPhrase IS NOT NULL +{% else %} +WHERE SearchPhrase <> '' +{% endif %} +GROUP BY SearchPhrase +ORDER BY c DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q13.sql b/workloads/clickbench/queries/Q13.sql new file mode 100644 index 0000000..6ae1a5d --- /dev/null +++ b/workloads/clickbench/queries/Q13.sql @@ -0,0 +1,10 @@ +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u +FROM hits +{% if system_kind == 'exasol' %} +WHERE SearchPhrase IS NOT NULL +{% else %} +WHERE SearchPhrase <> '' +{% endif %} +GROUP BY SearchPhrase +ORDER BY u DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q14.sql b/workloads/clickbench/queries/Q14.sql new file mode 100644 index 0000000..77634c5 --- /dev/null +++ b/workloads/clickbench/queries/Q14.sql @@ -0,0 +1,10 @@ +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c +FROM hits +{% if system_kind == 'exasol' %} +WHERE SearchPhrase IS NOT NULL +{% else %} +WHERE SearchPhrase <> '' +{% endif %} +GROUP BY SearchEngineID, SearchPhrase +ORDER BY c DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q15.sql b/workloads/clickbench/queries/Q15.sql new file mode 100644 index 0000000..136570c --- /dev/null +++ b/workloads/clickbench/queries/Q15.sql @@ -0,0 +1,5 @@ +SELECT UserID, COUNT(*) +FROM hits +GROUP BY UserID +ORDER BY COUNT(*) DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q16.sql b/workloads/clickbench/queries/Q16.sql new file mode 100644 index 0000000..d2cacff --- /dev/null +++ b/workloads/clickbench/queries/Q16.sql @@ -0,0 +1,5 @@ +SELECT UserID, SearchPhrase, COUNT(*) +FROM hits +GROUP BY UserID, SearchPhrase +ORDER BY COUNT(*) DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q17.sql b/workloads/clickbench/queries/Q17.sql new file mode 100644 index 0000000..8bb3e68 --- /dev/null +++ b/workloads/clickbench/queries/Q17.sql @@ -0,0 +1,4 @@ +SELECT UserID, SearchPhrase, COUNT(*) +FROM hits +GROUP BY UserID, SearchPhrase LIMIT 10; + diff --git a/workloads/clickbench/queries/Q18.sql b/workloads/clickbench/queries/Q18.sql new file mode 100644 index 0000000..0b7718a --- /dev/null +++ b/workloads/clickbench/queries/Q18.sql @@ -0,0 +1,5 @@ +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) +FROM hits +GROUP BY UserID, extract(minute FROM EventTime), SearchPhrase +ORDER BY COUNT(*) DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q19.sql b/workloads/clickbench/queries/Q19.sql new file mode 100644 index 0000000..0d01c7a --- /dev/null +++ 
b/workloads/clickbench/queries/Q19.sql @@ -0,0 +1,2 @@ +SELECT UserID FROM hits WHERE UserID = 435090932899640449; + diff --git a/workloads/clickbench/queries/Q2.sql b/workloads/clickbench/queries/Q2.sql new file mode 100644 index 0000000..22bad77 --- /dev/null +++ b/workloads/clickbench/queries/Q2.sql @@ -0,0 +1,2 @@ +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; + diff --git a/workloads/clickbench/queries/Q20.sql b/workloads/clickbench/queries/Q20.sql new file mode 100644 index 0000000..7cc5b51 --- /dev/null +++ b/workloads/clickbench/queries/Q20.sql @@ -0,0 +1,2 @@ +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; + diff --git a/workloads/clickbench/queries/Q21.sql b/workloads/clickbench/queries/Q21.sql new file mode 100644 index 0000000..00807f9 --- /dev/null +++ b/workloads/clickbench/queries/Q21.sql @@ -0,0 +1,11 @@ +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c +FROM hits +WHERE URL LIKE '%google%' +{% if system_kind == 'exasol' %} +AND SearchPhrase IS NOT NULL +{% else %} +AND SearchPhrase <> '' +{% endif %} +GROUP BY SearchPhrase +ORDER BY c DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q22.sql b/workloads/clickbench/queries/Q22.sql new file mode 100644 index 0000000..2b2f167 --- /dev/null +++ b/workloads/clickbench/queries/Q22.sql @@ -0,0 +1,11 @@ +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) +FROM hits +WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' +{% if system_kind == 'exasol' %} +AND SearchPhrase IS NOT NULL +{% else %} +AND SearchPhrase <> '' +{% endif %} +GROUP BY SearchPhrase +ORDER BY c DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q23.sql b/workloads/clickbench/queries/Q23.sql new file mode 100644 index 0000000..599b096 --- /dev/null +++ b/workloads/clickbench/queries/Q23.sql @@ -0,0 +1,2 @@ +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; + diff --git a/workloads/clickbench/queries/Q24.sql b/workloads/clickbench/queries/Q24.sql new file mode 100644 index 0000000..a9d8955 --- /dev/null +++ b/workloads/clickbench/queries/Q24.sql @@ -0,0 +1,8 @@ +SELECT SearchPhrase FROM hits +{% if system_kind == 'exasol' %} +WHERE SearchPhrase IS NOT NULL +{% else %} +WHERE SearchPhrase <> '' +{% endif %} +ORDER BY EventTime LIMIT 10; + diff --git a/workloads/clickbench/queries/Q25.sql b/workloads/clickbench/queries/Q25.sql new file mode 100644 index 0000000..4581dd9 --- /dev/null +++ b/workloads/clickbench/queries/Q25.sql @@ -0,0 +1,8 @@ +SELECT SearchPhrase FROM hits +{% if system_kind == 'exasol' %} +WHERE SearchPhrase IS NOT NULL +{% else %} +WHERE SearchPhrase <> '' +{% endif %} +ORDER BY SearchPhrase LIMIT 10; + diff --git a/workloads/clickbench/queries/Q26.sql b/workloads/clickbench/queries/Q26.sql new file mode 100644 index 0000000..212138e --- /dev/null +++ b/workloads/clickbench/queries/Q26.sql @@ -0,0 +1,8 @@ +SELECT SearchPhrase FROM hits +{% if system_kind == 'exasol' %} +WHERE SearchPhrase IS NOT NULL +{% else %} +WHERE SearchPhrase <> '' +{% endif %} +ORDER BY EventTime, SearchPhrase LIMIT 10; + diff --git a/workloads/clickbench/queries/Q27.sql b/workloads/clickbench/queries/Q27.sql new file mode 100644 index 0000000..5218fac --- /dev/null +++ b/workloads/clickbench/queries/Q27.sql @@ -0,0 +1,11 @@ +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c +FROM hits +{% if system_kind == 'exasol' %} +WHERE URL IS NOT NULL +{% else %} +WHERE URL <> '' +{% endif %} +GROUP BY CounterID +HAVING COUNT(*) > 100000 +ORDER BY l DESC LIMIT 25; + diff --git 
a/workloads/clickbench/queries/Q28.sql b/workloads/clickbench/queries/Q28.sql new file mode 100644 index 0000000..50b4d9c --- /dev/null +++ b/workloads/clickbench/queries/Q28.sql @@ -0,0 +1,11 @@ +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) +FROM hits +{% if system_kind == 'exasol' %} +WHERE Referer IS NOT NULL +{% else %} +WHERE Referer <> '' +{% endif %} +GROUP BY REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') +HAVING COUNT(*) > 100000 +ORDER BY l DESC LIMIT 25; + diff --git a/workloads/clickbench/queries/Q29.sql b/workloads/clickbench/queries/Q29.sql new file mode 100644 index 0000000..62f4f3c --- /dev/null +++ b/workloads/clickbench/queries/Q29.sql @@ -0,0 +1,3 @@ +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) +FROM hits; + diff --git a/workloads/clickbench/queries/Q3.sql b/workloads/clickbench/queries/Q3.sql new file mode 100644 index 0000000..deef647 --- /dev/null +++ b/workloads/clickbench/queries/Q3.sql @@ -0,0 +1,2 @@ +SELECT AVG(UserID) FROM hits; + diff --git a/workloads/clickbench/queries/Q30.sql b/workloads/clickbench/queries/Q30.sql new file 
mode 100644 index 0000000..31fe84b --- /dev/null +++ b/workloads/clickbench/queries/Q30.sql @@ -0,0 +1,10 @@ +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) +FROM hits +{% if system_kind == 'exasol' %} +WHERE SearchPhrase IS NOT NULL +{% else %} +WHERE SearchPhrase <> '' +{% endif %} +GROUP BY SearchEngineID, ClientIP +ORDER BY c DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q31.sql b/workloads/clickbench/queries/Q31.sql new file mode 100644 index 0000000..00bbebd --- /dev/null +++ b/workloads/clickbench/queries/Q31.sql @@ -0,0 +1,10 @@ +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) +FROM hits +{% if system_kind == 'exasol' %} +WHERE SearchPhrase IS NOT NULL +{% else %} +WHERE SearchPhrase <> '' +{% endif %} +GROUP BY WatchID, ClientIP +ORDER BY c DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q32.sql b/workloads/clickbench/queries/Q32.sql new file mode 100644 index 0000000..68e399d --- /dev/null +++ b/workloads/clickbench/queries/Q32.sql @@ -0,0 +1,2 @@ +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q33.sql b/workloads/clickbench/queries/Q33.sql new file mode 100644 index 0000000..ba10b7a --- /dev/null +++ b/workloads/clickbench/queries/Q33.sql @@ -0,0 +1,2 @@ +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q34.sql b/workloads/clickbench/queries/Q34.sql new file mode 100644 index 0000000..e36a81f --- /dev/null +++ b/workloads/clickbench/queries/Q34.sql @@ -0,0 +1,2 @@ +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q35.sql b/workloads/clickbench/queries/Q35.sql new file mode 100644 index 0000000..c37e5dd --- /dev/null +++ b/workloads/clickbench/queries/Q35.sql @@ -0,0 +1,5 @@ +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c +FROM hits +GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 +ORDER BY c DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q36.sql b/workloads/clickbench/queries/Q36.sql new file mode 100644 index 0000000..e5686ac --- /dev/null +++ b/workloads/clickbench/queries/Q36.sql @@ -0,0 +1,11 @@ +SELECT URL, COUNT(*) AS PageViews +FROM hits +WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 +{% if system_kind == 'exasol' %} +AND URL IS NOT NULL +{% else %} +AND URL <> '' +{% endif %} +GROUP BY URL +ORDER BY PageViews DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q37.sql b/workloads/clickbench/queries/Q37.sql new file mode 100644 index 0000000..4529a6f --- /dev/null +++ b/workloads/clickbench/queries/Q37.sql @@ -0,0 +1,11 @@ +SELECT Title, COUNT(*) AS PageViews +FROM hits +WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 +{% if system_kind == 'exasol' %} +AND Title IS NOT NULL +{% else %} +AND Title <> '' +{% endif %} +GROUP BY Title +ORDER BY PageViews DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q38.sql b/workloads/clickbench/queries/Q38.sql new file mode 100644 index 0000000..2111856 --- /dev/null +++ b/workloads/clickbench/queries/Q38.sql @@ -0,0 +1,6 @@ +SELECT URL, COUNT(*) AS PageViews +FROM hits +WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND 
IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 +GROUP BY URL +ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; + diff --git a/workloads/clickbench/queries/Q39.sql b/workloads/clickbench/queries/Q39.sql new file mode 100644 index 0000000..47112c9 --- /dev/null +++ b/workloads/clickbench/queries/Q39.sql @@ -0,0 +1,6 @@ +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews +FROM hits +WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 +GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END, URL +ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; + diff --git a/workloads/clickbench/queries/Q4.sql b/workloads/clickbench/queries/Q4.sql new file mode 100644 index 0000000..fa54f20 --- /dev/null +++ b/workloads/clickbench/queries/Q4.sql @@ -0,0 +1,2 @@ +SELECT COUNT(DISTINCT UserID) FROM hits; + diff --git a/workloads/clickbench/queries/Q40.sql b/workloads/clickbench/queries/Q40.sql new file mode 100644 index 0000000..66b41db --- /dev/null +++ b/workloads/clickbench/queries/Q40.sql @@ -0,0 +1,6 @@ +SELECT URLHash, EventDate, COUNT(*) AS PageViews +FROM hits +WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 +GROUP BY URLHash, EventDate +ORDER BY PageViews DESC LIMIT 10 OFFSET 100; + diff --git a/workloads/clickbench/queries/Q41.sql b/workloads/clickbench/queries/Q41.sql new file mode 100644 index 0000000..54935b4 --- /dev/null +++ b/workloads/clickbench/queries/Q41.sql @@ -0,0 +1,6 @@ +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews +FROM hits +WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 +GROUP BY WindowClientWidth, WindowClientHeight +ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; + diff --git a/workloads/clickbench/queries/Q42.sql b/workloads/clickbench/queries/Q42.sql new file mode 100644 index 0000000..060741f --- /dev/null +++ b/workloads/clickbench/queries/Q42.sql @@ -0,0 +1,6 @@ +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews +FROM hits +WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 +GROUP BY DATE_TRUNC('minute', EventTime) +ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; + diff --git a/workloads/clickbench/queries/Q5.sql b/workloads/clickbench/queries/Q5.sql new file mode 100644 index 0000000..7e8f7cd --- /dev/null +++ b/workloads/clickbench/queries/Q5.sql @@ -0,0 +1,2 @@ +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; + diff --git a/workloads/clickbench/queries/Q6.sql b/workloads/clickbench/queries/Q6.sql new file mode 100644 index 0000000..18f4f88 --- /dev/null +++ b/workloads/clickbench/queries/Q6.sql @@ -0,0 +1,2 @@ +SELECT MIN(EventDate), MAX(EventDate) FROM hits; + diff --git a/workloads/clickbench/queries/Q7.sql b/workloads/clickbench/queries/Q7.sql new file mode 100644 index 0000000..d1faebd --- /dev/null +++ b/workloads/clickbench/queries/Q7.sql @@ -0,0 +1,6 @@ +SELECT AdvEngineID, COUNT(*) +FROM hits +WHERE AdvEngineID <> 0 +GROUP BY AdvEngineID +ORDER BY COUNT(*) DESC; + diff --git a/workloads/clickbench/queries/Q8.sql b/workloads/clickbench/queries/Q8.sql new file mode 100644 index 0000000..67de842 
--- /dev/null +++ b/workloads/clickbench/queries/Q8.sql @@ -0,0 +1,5 @@ +SELECT RegionID, COUNT(DISTINCT UserID) AS u +FROM hits +GROUP BY RegionID +ORDER BY u DESC LIMIT 10; + diff --git a/workloads/clickbench/queries/Q9.sql b/workloads/clickbench/queries/Q9.sql new file mode 100644 index 0000000..2e88978 --- /dev/null +++ b/workloads/clickbench/queries/Q9.sql @@ -0,0 +1,5 @@ +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) +FROM hits +GROUP BY RegionID +ORDER BY c DESC LIMIT 10; + diff --git a/workloads/clickbench/setup/create_tables.sql b/workloads/clickbench/setup/create_tables.sql new file mode 100644 index 0000000..1568974 --- /dev/null +++ b/workloads/clickbench/setup/create_tables.sql @@ -0,0 +1,237 @@ +-- ClickBench Table Creation Script +-- Creates the ClickBench hits table with appropriate data types and storage options for each supported database system + +{% if system_kind == 'exasol' %} +-- Exasol table creation +CREATE OR REPLACE TABLE {{ schema }}.hits +( + WatchID BIGINT NOT NULL, + JavaEnable SMALLINT NOT NULL, + Title VARCHAR(2000000), + GoodEvent SMALLINT NOT NULL, + EventTime TIMESTAMP NOT NULL, + EventDate Date NOT NULL, + CounterID INTEGER NOT NULL, + ClientIP INTEGER NOT NULL, + RegionID INTEGER NOT NULL, + UserID BIGINT NOT NULL, + CounterClass SMALLINT NOT NULL, + OS SMALLINT NOT NULL, + UserAgent SMALLINT NOT NULL, + URL VARCHAR(2000000), + Referer VARCHAR(2000000), + IsRefresh SMALLINT NOT NULL, + RefererCategoryID SMALLINT NOT NULL, + RefererRegionID INTEGER NOT NULL, + URLCategoryID SMALLINT NOT NULL, + URLRegionID INTEGER NOT NULL, + ResolutionWidth SMALLINT NOT NULL, + ResolutionHeight SMALLINT NOT NULL, + ResolutionDepth SMALLINT NOT NULL, + FlashMajor SMALLINT NOT NULL, + FlashMinor SMALLINT NOT NULL, + FlashMinor2 VARCHAR(2000000), + NetMajor SMALLINT NOT NULL, + NetMinor SMALLINT NOT NULL, + UserAgentMajor SMALLINT NOT NULL, + UserAgentMinor VARCHAR(255), + CookieEnable SMALLINT NOT NULL, + JavascriptEnable SMALLINT NOT NULL, + IsMobile SMALLINT NOT NULL, + MobilePhone SMALLINT NOT NULL, + MobilePhoneModel VARCHAR(2000000), + Params VARCHAR(2000000), + IPNetworkID INTEGER NOT NULL, + TraficSourceID SMALLINT NOT NULL, + SearchEngineID SMALLINT NOT NULL, + SearchPhrase VARCHAR(2000000), + AdvEngineID SMALLINT NOT NULL, + IsArtifical SMALLINT NOT NULL, + WindowClientWidth SMALLINT NOT NULL, + WindowClientHeight SMALLINT NOT NULL, + ClientTimeZone SMALLINT NOT NULL, + ClientEventTime TIMESTAMP NOT NULL, + SilverlightVersion1 SMALLINT NOT NULL, + SilverlightVersion2 SMALLINT NOT NULL, + SilverlightVersion3 INTEGER NOT NULL, + SilverlightVersion4 SMALLINT NOT NULL, + PageCharset VARCHAR(2000000), + CodeVersion INTEGER NOT NULL, + IsLink SMALLINT NOT NULL, + IsDownload SMALLINT NOT NULL, + IsNotBounce SMALLINT NOT NULL, + FUniqID BIGINT NOT NULL, + OriginalURL VARCHAR(2000000), + HID INTEGER NOT NULL, + IsOldCounter SMALLINT NOT NULL, + IsEvent SMALLINT NOT NULL, + IsParameter SMALLINT NOT NULL, + DontCountHits SMALLINT NOT NULL, + WithHash SMALLINT NOT NULL, + HitColor CHAR NOT NULL, + LocalEventTime TIMESTAMP NOT NULL, + Age SMALLINT NOT NULL, + Sex SMALLINT NOT NULL, + Income SMALLINT NOT NULL, + Interests SMALLINT NOT NULL, + Robotness SMALLINT NOT NULL, + RemoteIP INTEGER NOT NULL, + WindowName INTEGER NOT NULL, + OpenerName INTEGER NOT NULL, + HistoryLength SMALLINT NOT NULL, + BrowserLanguage VARCHAR(2000000), + BrowserCountry VARCHAR(2000000), + SocialNetwork VARCHAR(2000000), + SocialAction VARCHAR(2000000), + HTTPError
SMALLINT NOT NULL, + SendTiming INTEGER NOT NULL, + DNSTiming INTEGER NOT NULL, + ConnectTiming INTEGER NOT NULL, + ResponseStartTiming INTEGER NOT NULL, + ResponseEndTiming INTEGER NOT NULL, + FetchTiming INTEGER NOT NULL, + SocialSourceNetworkID SMALLINT NOT NULL, + SocialSourcePage VARCHAR(2000000), + ParamPrice BIGINT NOT NULL, + ParamOrderID VARCHAR(2000000), + ParamCurrency VARCHAR(2000000), + ParamCurrencyID SMALLINT NOT NULL, + OpenstatServiceName VARCHAR(2000000), + OpenstatCampaignID VARCHAR(2000000), + OpenstatAdID VARCHAR(2000000), + OpenstatSourceID VARCHAR(2000000), + UTMSource VARCHAR(2000000), + UTMMedium VARCHAR(2000000), + UTMCampaign VARCHAR(2000000), + UTMContent VARCHAR(2000000), + UTMTerm VARCHAR(2000000), + FromTag VARCHAR(2000000), + HasGCLID SMALLINT NOT NULL, + RefererHash BIGINT NOT NULL, + URLHash BIGINT NOT NULL, + CLID INTEGER NOT NULL, + PRIMARY KEY (CounterID, EventDate, UserID, EventTime, WatchID), + PARTITION BY EventDate +); +{% elif system_kind == 'clickhouse' %} +-- ClickHouse table creation +{% if node_count > 1 %} +-- MULTINODE CLUSTER: Creating local tables on all nodes + distributed tables +{{ UNSUPPORTED_SYSTEM_KIND_ERROR_FOR[system_kind] }} +{% else %} +-- Single node setup +CREATE TABLE hits +( + WatchID BIGINT NOT NULL, + JavaEnable SMALLINT NOT NULL, + Title TEXT NOT NULL, + GoodEvent SMALLINT NOT NULL, + EventTime TIMESTAMP NOT NULL, + EventDate Date NOT NULL, + CounterID INTEGER NOT NULL, + ClientIP INTEGER NOT NULL, + RegionID INTEGER NOT NULL, + UserID BIGINT NOT NULL, + CounterClass SMALLINT NOT NULL, + OS SMALLINT NOT NULL, + UserAgent SMALLINT NOT NULL, + URL TEXT NOT NULL, + Referer TEXT NOT NULL, + IsRefresh SMALLINT NOT NULL, + RefererCategoryID SMALLINT NOT NULL, + RefererRegionID INTEGER NOT NULL, + URLCategoryID SMALLINT NOT NULL, + URLRegionID INTEGER NOT NULL, + ResolutionWidth SMALLINT NOT NULL, + ResolutionHeight SMALLINT NOT NULL, + ResolutionDepth SMALLINT NOT NULL, + FlashMajor SMALLINT NOT NULL, + FlashMinor SMALLINT NOT NULL, + FlashMinor2 TEXT NOT NULL, + NetMajor SMALLINT NOT NULL, + NetMinor SMALLINT NOT NULL, + UserAgentMajor SMALLINT NOT NULL, + UserAgentMinor VARCHAR(255) NOT NULL, + CookieEnable SMALLINT NOT NULL, + JavascriptEnable SMALLINT NOT NULL, + IsMobile SMALLINT NOT NULL, + MobilePhone SMALLINT NOT NULL, + MobilePhoneModel TEXT NOT NULL, + Params TEXT NOT NULL, + IPNetworkID INTEGER NOT NULL, + TraficSourceID SMALLINT NOT NULL, + SearchEngineID SMALLINT NOT NULL, + SearchPhrase TEXT NOT NULL, + AdvEngineID SMALLINT NOT NULL, + IsArtifical SMALLINT NOT NULL, + WindowClientWidth SMALLINT NOT NULL, + WindowClientHeight SMALLINT NOT NULL, + ClientTimeZone SMALLINT NOT NULL, + ClientEventTime TIMESTAMP NOT NULL, + SilverlightVersion1 SMALLINT NOT NULL, + SilverlightVersion2 SMALLINT NOT NULL, + SilverlightVersion3 INTEGER NOT NULL, + SilverlightVersion4 SMALLINT NOT NULL, + PageCharset TEXT NOT NULL, + CodeVersion INTEGER NOT NULL, + IsLink SMALLINT NOT NULL, + IsDownload SMALLINT NOT NULL, + IsNotBounce SMALLINT NOT NULL, + FUniqID BIGINT NOT NULL, + OriginalURL TEXT NOT NULL, + HID INTEGER NOT NULL, + IsOldCounter SMALLINT NOT NULL, + IsEvent SMALLINT NOT NULL, + IsParameter SMALLINT NOT NULL, + DontCountHits SMALLINT NOT NULL, + WithHash SMALLINT NOT NULL, + HitColor CHAR NOT NULL, + LocalEventTime TIMESTAMP NOT NULL, + Age SMALLINT NOT NULL, + Sex SMALLINT NOT NULL, + Income SMALLINT NOT NULL, + Interests SMALLINT NOT NULL, + Robotness SMALLINT NOT NULL, + RemoteIP INTEGER NOT NULL, + WindowName INTEGER NOT 
NULL, + OpenerName INTEGER NOT NULL, + HistoryLength SMALLINT NOT NULL, + BrowserLanguage TEXT NOT NULL, + BrowserCountry TEXT NOT NULL, + SocialNetwork TEXT NOT NULL, + SocialAction TEXT NOT NULL, + HTTPError SMALLINT NOT NULL, + SendTiming INTEGER NOT NULL, + DNSTiming INTEGER NOT NULL, + ConnectTiming INTEGER NOT NULL, + ResponseStartTiming INTEGER NOT NULL, + ResponseEndTiming INTEGER NOT NULL, + FetchTiming INTEGER NOT NULL, + SocialSourceNetworkID SMALLINT NOT NULL, + SocialSourcePage TEXT NOT NULL, + ParamPrice BIGINT NOT NULL, + ParamOrderID TEXT NOT NULL, + ParamCurrency TEXT NOT NULL, + ParamCurrencyID SMALLINT NOT NULL, + OpenstatServiceName TEXT NOT NULL, + OpenstatCampaignID TEXT NOT NULL, + OpenstatAdID TEXT NOT NULL, + OpenstatSourceID TEXT NOT NULL, + UTMSource TEXT NOT NULL, + UTMMedium TEXT NOT NULL, + UTMCampaign TEXT NOT NULL, + UTMContent TEXT NOT NULL, + UTMTerm TEXT NOT NULL, + FromTag TEXT NOT NULL, + HasGCLID SMALLINT NOT NULL, + RefererHash BIGINT NOT NULL, + URLHash BIGINT NOT NULL, + CLID INTEGER NOT NULL, + PRIMARY KEY (CounterID, EventDate, UserID, EventTime, WatchID) +); +{% endif %} +{% else %} +{{ UNSUPPORTED_SYSTEM_KIND_ERROR_FOR[system_kind] }} +{% endif %} + +