@api_view(["POST"])
@parser_classes([JSONParser])
@api_error_handler
def create_table(request: Request) -> Response:
  """Create a table from a file based on the provided parameters.

  This API endpoint creates a table in the specified database using the
  file data as the source. It can handle different file formats, storage
  formats, and SQL dialects.

  Args:
    request: Request object containing parameters for table creation

  Returns:
    Response containing the SQL query for table creation and metadata:
    - sql: The SQL query to create the table
    - table_name: The name of the created table
    - database_name: The database name
    - column_count: The number of columns
  """

  form = CreateTableSerializer(data=request.data)

  if not form.is_valid():
    return Response(form.errors, status=status.HTTP_400_BAD_REQUEST)

  params = form.validated_data

  # Optional parameters and the fallback used when the serializer did not
  # supply a value (fields declared with a serializer default are always
  # present; the rest may be absent from validated_data).
  optional_fields = {
    "has_header": False,
    "partition_columns": [],
    "comment": "",
    "external": False,
    "external_path": "",
    "table_format": "text",
    "is_transactional": False,
    "is_iceberg": False,
    "is_insert_only": False,
    "primary_keys": [],
    "sheet_name": None,
    "field_separator": None,
    "quote_char": None,
    "record_separator": None,
    "load_data": True,
  }
  table_kwargs = {name: params.get(name, fallback) for name, fallback in optional_fields.items()}

  try:
    result = operations.create_table(
      file_path=params["file_path"],
      file_type=params["file_type"],
      import_type=params["import_type"],
      sql_dialect=params["sql_dialect"],
      database_name=params["database_name"],
      table_name=params["table_name"],
      columns=params["columns"],
      # Remote imports need the request's filesystem handle; local ones do not.
      fs=request.fs if params["import_type"] == "remote" else None,
      **table_kwargs,
    )

    return Response(result, status=status.HTTP_200_OK)

  except ValueError as e:
    return Response({"error": str(e)}, status=status.HTTP_400_BAD_REQUEST)
  except Exception as e:
    LOG.exception(f"Error creating table: {e}", exc_info=True)
    return Response({"error": str(e)}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
def create_table(
  file_path: str,
  file_type: str,
  import_type: str,
  sql_dialect: str,
  database_name: str,
  table_name: str,
  columns: List[Dict],
  has_header: bool = False,
  partition_columns: Optional[List[Dict]] = None,
  comment: str = "",
  external: bool = False,
  external_path: str = "",
  table_format: str = "text",
  is_transactional: bool = False,
  is_iceberg: bool = False,
  is_insert_only: bool = False,
  primary_keys: Optional[List[str]] = None,
  sheet_name: Optional[str] = None,
  field_separator: Optional[str] = ",",
  quote_char: Optional[str] = '"',
  record_separator: Optional[str] = "\n",
  load_data: bool = True,
  fs=None,
) -> Dict[str, Any]:
  """Create a table from a file based on the provided parameters.

  Args:
    file_path: Path to the source file to import data from
    file_type: Type of file format (csv, tsv, excel, delimiter_format)
    import_type: Type of import ("local" or "remote")
    sql_dialect: Target SQL dialect for table creation
    database_name: Database in which to create the table
    table_name: Name of the table to create
    columns: List of column definitions (each with name, type, etc.)
    has_header: Whether the file has a header row
    partition_columns: Optional list of partition column definitions
    comment: Optional table comment
    external: Whether to create an external table
    external_path: Location path for external tables
    table_format: Table storage format (text, parquet, orc, avro, kudu, iceberg)
    is_transactional: Whether to create a transactional table
    is_iceberg: Whether to create an Iceberg table
    is_insert_only: Whether the transactional table is insert-only
    primary_keys: Optional list of primary key column names (Kudu)
    sheet_name: Sheet name for Excel files
    field_separator: Field separator character for delimited files
    quote_char: Quote character for delimited files
    record_separator: Record separator character for delimited files
    load_data: Whether to load the file data into the new table
    fs: Filesystem handle for remote imports, or None for local files

  Returns:
    Dict with the generated SQL and table metadata (sql, table_name,
    database_name, column_count).

  Raises:
    NotImplementedError: Always, until the implementation lands.
  """
  # The API layer expects a dict result; a stub that silently returns None
  # would surface to clients as an HTTP 200 with an empty body. Fail loudly
  # instead so the caller gets an explicit error until this is implemented.
  raise NotImplementedError("importer.operations.create_table is not implemented yet")
class CreateTableSerializer(serializers.Serializer):
  """Serializer for table creation request validation.

  This serializer validates the parameters required for creating a SQL table from a file.

  Attributes:
    file_path: Path to the source file to import data from
    file_type: Type of file format (csv, tsv, excel, delimiter_format)
    import_type: Type of import (local or remote)
    sql_dialect: Target SQL dialect for table creation
    database_name: The database name in which to create the table
    table_name: The name of the table to create
    has_header: Whether the file has a header row
    columns: List of column definitions (each with name, type, etc.)
    partition_columns: List of partition column definitions (optional)
    external: Whether to create an external table (optional)
    external_path: External table location path (optional, required if external=True)
    table_format: Table storage format (text, parquet, orc, avro, kudu, iceberg) (optional)
    field_separator: Field separator character (required for delimited files)
    quote_char: Quote character (required for delimited files)
    record_separator: Record separator character (required for delimited files)
    sheet_name: Sheet name for Excel files (required when file_type is excel)
    comment: Table comment (optional)
    is_transactional: Whether to create a transactional table (optional, Hive only)
    is_iceberg: Whether to create an Iceberg table (optional)
    is_insert_only: Whether the transactional table is insert-only (optional)
    primary_keys: List of primary key column names (optional, required for Kudu tables)
  """

  # Source file information
  file_path = serializers.CharField(required=True, help_text="Path to the file to import data from")
  file_type = serializers.ChoiceField(
    choices=["csv", "tsv", "excel", "delimiter_format"], required=True, help_text="Type of file (csv, tsv, excel, delimiter_format)"
  )
  import_type = serializers.ChoiceField(
    choices=["local", "remote"], required=True, help_text="Whether the file is local or on a remote filesystem"
  )

  # Target table information
  sql_dialect = serializers.ChoiceField(
    choices=["hive", "impala", "trino", "phoenix", "sparksql"], required=True, help_text="SQL dialect for creating the table"
  )
  database_name = serializers.CharField(required=True, help_text="Database name where the table will be created")
  table_name = serializers.CharField(required=True, help_text="Name of the table to create")

  # Data format information
  has_header = serializers.BooleanField(default=False, help_text="Whether the file has a header row")

  # Column definitions
  columns = serializers.ListField(
    child=serializers.DictField(), required=True, help_text="List of column definitions with name, type, etc."
  )

  # Optional parameters.
  # NOTE: `default=list` (a callable) instead of `default=[]` — DRF evaluates
  # callable defaults per use, so every validation gets a fresh list rather
  # than all requests sharing (and potentially mutating) one module-level list.
  partition_columns = serializers.ListField(
    child=serializers.DictField(), required=False, default=list, help_text="List of partition column definitions"
  )
  comment = serializers.CharField(required=False, allow_blank=True, default="", help_text="Table comment")

  # Table storage options
  external = serializers.BooleanField(default=False, help_text="Whether to create an external table")
  external_path = serializers.CharField(required=False, allow_blank=True, help_text="Location path for external tables")
  table_format = serializers.ChoiceField(
    choices=["text", "parquet", "orc", "avro", "kudu", "iceberg"], default="text", help_text="Storage format for the table"
  )

  # Hive/Impala specific options
  is_transactional = serializers.BooleanField(default=False, help_text="Whether to create a transactional table (Hive)")
  is_insert_only = serializers.BooleanField(default=False, help_text="Whether the transactional table is insert-only")
  is_iceberg = serializers.BooleanField(default=False, help_text="Whether to create an Iceberg table")

  # Kudu specific options (callable default for the same shared-mutable reason as above)
  primary_keys = serializers.ListField(
    child=serializers.CharField(), required=False, default=list, help_text="List of primary key column names (required for Kudu tables)"
  )

  # Excel-specific fields
  sheet_name = serializers.CharField(required=False, help_text="Sheet name for Excel files")

  # Delimited file-specific fields
  field_separator = serializers.CharField(required=False, help_text="Field separator character")
  quote_char = serializers.CharField(required=False, help_text="Quote character")
  record_separator = serializers.CharField(required=False, help_text="Record separator character")

  # Additional options
  load_data = serializers.BooleanField(default=True, help_text="Whether to load data from the file into the table")

  def validate(self, data):
    """Validate the complete data set with interdependent field validation."""

    # Validate Excel-specific parameters
    if data.get("file_type") == "excel" and not data.get("sheet_name"):
      raise serializers.ValidationError({"sheet_name": "Sheet name is required for Excel files."})

    # Validate delimited file-specific parameters
    if data.get("file_type") in ["csv", "tsv", "delimiter_format"]:
      if not data.get("field_separator"):
        # If not provided, set default value based on file type
        if data.get("file_type") == "csv":
          data["field_separator"] = ","
        elif data.get("file_type") == "tsv":
          data["field_separator"] = "\t"
        else:
          raise serializers.ValidationError({"field_separator": "Field separator is required for delimited files"})

      if not data.get("quote_char"):
        data["quote_char"] = '"'  # Default quote character

      if not data.get("record_separator"):
        data["record_separator"] = "\n"  # Default record separator

    # Validate external table parameters
    if data.get("external") and not data.get("external_path"):
      raise serializers.ValidationError({"external_path": "External path is required for external tables."})

    # Validate Kudu table parameters
    if data.get("table_format") == "kudu" and not data.get("primary_keys"):
      raise serializers.ValidationError({"primary_keys": "Primary keys are required for Kudu tables."})

    # Validate transaction table parameters
    if data.get("is_transactional") and data.get("sql_dialect") not in ["hive", "impala"]:
      raise serializers.ValidationError({"is_transactional": "Transactional tables are only supported in Hive and Impala."})

    # Validate Iceberg table parameters
    if data.get("is_iceberg") and data.get("sql_dialect") not in ["hive", "impala", "sparksql"]:
      raise serializers.ValidationError({"is_iceberg": "Iceberg tables are only supported in Hive, Impala, and SparkSQL."})

    return data