diff --git a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java index 5d912351ab0c..40ed5ee01aa2 100644 --- a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java +++ b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java @@ -143,7 +143,8 @@ Type convertType(TypeInfo typeInfo) { int listId = id++; Type listType = convertType(listTypeInfo.getListElementTypeInfo()); return Types.ListType.ofOptional(listId, listType); - case UNION: + case VARIANT: + return Types.VariantType.get(); default: throw new IllegalArgumentException("Unknown type " + typeInfo.getCategory()); } diff --git a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java index 32eb2519ba7f..383883166527 100644 --- a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java +++ b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java @@ -323,6 +323,8 @@ public static String convertToTypeString(Type type) { case MAP: final Types.MapType mapType = type.asMapType(); return String.format("map<%s,%s>", convert(mapType.keyType()), convert(mapType.valueType())); + case VARIANT: + return "variant"; default: throw new UnsupportedOperationException(type + " is not supported"); } diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/Deserializer.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/Deserializer.java index 87dc5b6f549e..dbafe176f6bd 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/Deserializer.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/Deserializer.java @@ -19,6 +19,8 @@ package org.apache.iceberg.mr.hive; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.List; import java.util.Map; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; @@ -27,6 +29,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructField; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.variant.Variant; import org.apache.iceberg.Schema; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; @@ -35,11 +38,13 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.schema.SchemaWithPartnerVisitor; import org.apache.iceberg.types.Type.PrimitiveType; +import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.ListType; import org.apache.iceberg.types.Types.MapType; import org.apache.iceberg.types.Types.NestedField; import org.apache.iceberg.types.Types.StructType; - +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.VariantValue; class Deserializer { private final FieldDeserializer fieldDeserializer; @@ -164,6 +169,26 @@ public FieldDeserializer list(ListType listTypeInfo, ObjectInspectorPair pair, F }; } + @Override + public FieldDeserializer variant(Types.VariantType variantType, ObjectInspectorPair pair) { + return variantObj -> { + if (variantObj == null) { + return null; + } + // Extract data from the struct representation + StructObjectInspector variantOI = (StructObjectInspector) 
pair.sourceInspector(); + Variant variant = Variant.from(variantOI.getStructFieldsDataAsList(variantObj)); + + VariantMetadata metadata = VariantMetadata.from( + ByteBuffer.wrap(variant.getMetadata()).order(ByteOrder.LITTLE_ENDIAN)); + + VariantValue value = VariantValue.from(metadata, + ByteBuffer.wrap(variant.getValue()).order(ByteOrder.LITTLE_ENDIAN)); + + return org.apache.iceberg.variants.Variant.of(metadata, value); + }; + } + @Override public FieldDeserializer map(MapType mapType, ObjectInspectorPair pair, FieldDeserializer keyDeserializer, FieldDeserializer valueDeserializer) { diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergObjectInspector.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergObjectInspector.java index 625f8f65d296..0951e30128a0 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergObjectInspector.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergObjectInspector.java @@ -152,4 +152,8 @@ public ObjectInspector struct(Types.StructType structType, List return new IcebergRecordObjectInspector(structType, fieldObjectInspectors); } + @Override + public ObjectInspector variant(Types.VariantType variantType) { + return IcebergVariantObjectInspector.get(); + } } diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergVariantObjectInspector.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergVariantObjectInspector.java new file mode 100644 index 000000000000..192d4b25bd02 --- /dev/null +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/serde/objectinspector/IcebergVariantObjectInspector.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.mr.hive.serde.objectinspector; + +import java.nio.ByteBuffer; +import java.util.List; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.VariantObjectInspector; +import org.apache.iceberg.variants.Variant; + +/** + * ObjectInspector for Iceberg's Variant type in Hive. + *
+ * <p>
+ * This ObjectInspector enables Hive to work with Iceberg's Variant type, which stores + * polymorphic data in a single column. Variant types are particularly useful for + * semi-structured data like JSON where the actual type may vary per row. + *
+ * <p>
+ * The ObjectInspector exposes each Variant as a Hive struct with two binary fields:
+ * <ul>
+ *   <li>{@code metadata} - the serialized variant metadata bytes</li>
+ *   <li>{@code value} - the serialized variant value bytes</li>
+ * </ul>
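+ *
+ * <p>For illustration, the two fields can be used to rebuild an Iceberg variant in the same
+ * way the Hive {@code Deserializer} in this patch does. Sketch only; {@code fields} stands for
+ * the list returned by {@link #getStructFieldsDataAsList(Object)}:
+ * <pre>{@code
+ * byte[] metadataBytes = (byte[]) fields.get(0);   // "metadata"
+ * byte[] valueBytes = (byte[]) fields.get(1);      // "value"
+ * VariantMetadata metadata = VariantMetadata.from(
+ *     ByteBuffer.wrap(metadataBytes).order(ByteOrder.LITTLE_ENDIAN));
+ * VariantValue value = VariantValue.from(metadata,
+ *     ByteBuffer.wrap(valueBytes).order(ByteOrder.LITTLE_ENDIAN));
+ * Variant rebuilt = Variant.of(metadata, value);
+ * }</pre>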
+ */ +public final class IcebergVariantObjectInspector extends VariantObjectInspector { + + private static final ObjectInspector INSTANCE = new IcebergVariantObjectInspector(); + + private IcebergVariantObjectInspector() { + } + + public static ObjectInspector get() { + return INSTANCE; + } + + @Override + public Object getStructFieldData(Object data, StructField fieldRef) { + if (data == null) { + return null; + } + Variant variant = (Variant) data; + MyField field = (MyField) fieldRef; + + switch (field.getFieldID()) { + case 0: // "metadata" field (binary) + ByteBuffer metadata = ByteBuffer.allocate(variant.metadata().sizeInBytes()); + variant.metadata().writeTo(metadata, 0); + return metadata.array(); + case 1: // "value" field (binary) + ByteBuffer value = ByteBuffer.allocate(variant.value().sizeInBytes()); + variant.value().writeTo(value, 0); + return value.array(); + default: + throw new IllegalArgumentException("Unknown field position: " + field.getFieldID()); + } + } + + @Override + public List getStructFieldsDataAsList(Object data) { + if (data == null) { + return null; + } + Variant variant = (Variant) data; + ByteBuffer metadata = ByteBuffer.allocate(variant.metadata().sizeInBytes()); + variant.metadata().writeTo(metadata, 0); + + ByteBuffer value = ByteBuffer.allocate(variant.value().sizeInBytes()); + variant.value().writeTo(value, 0); + + // Return the data for our fields in the correct order: metadata, value + return List.of(metadata.array(), value.array()); + } +} diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/ParquetSchemaFieldNameVisitor.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/ParquetSchemaFieldNameVisitor.java index 37b089e7b5e9..acf600811f7c 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/ParquetSchemaFieldNameVisitor.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/ParquetSchemaFieldNameVisitor.java @@ -114,6 +114,17 @@ public Type list(Types.ListType iList, GroupType array, Type element) { return array; } + @Override + public Type variant(Types.VariantType iVariant, GroupType variant, Type result) { + if (variant.getId() != null) { + typesById.put(variant.getId().intValue(), variant); + } + // Add the variant field name to the column names list + appendToColNamesList(variant instanceof MessageType, variant.getName()); + + return variant; + } + @Override public Type map(Types.MapType iMap, GroupType map, Type key, Type value) { if (map.getId() != null) { diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveFileWriterFactory.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveFileWriterFactory.java index a24ff33bf4bc..588a7b30d3e7 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveFileWriterFactory.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/writer/HiveFileWriterFactory.java @@ -77,7 +77,7 @@ protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { @Override protected void configureDataWrite(Parquet.DataWriteBuilder builder) { - builder.createWriterFunc(GenericParquetWriter::buildWriter); + builder.createWriterFunc(GenericParquetWriter::create); } @Override @@ -87,7 +87,7 @@ protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { @Override protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - 
builder.createWriterFunc(GenericParquetWriter::buildWriter); + builder.createWriterFunc(GenericParquetWriter::create); } @Override diff --git a/iceberg/iceberg-handler/src/test/queries/negative/variant_type_partition_column.q b/iceberg/iceberg-handler/src/test/queries/negative/variant_type_partition_column.q new file mode 100644 index 000000000000..1c9eddf73d4a --- /dev/null +++ b/iceberg/iceberg-handler/src/test/queries/negative/variant_type_partition_column.q @@ -0,0 +1,5 @@ +CREATE EXTERNAL TABLE variant_test_partition ( + id INT, + data VARIANT +) PARTITIONED BY spec (data) +STORED BY ICEBERG tblproperties('format-version'='3'); \ No newline at end of file diff --git a/iceberg/iceberg-handler/src/test/queries/positive/variant_type.q b/iceberg/iceberg-handler/src/test/queries/positive/variant_type.q new file mode 100644 index 000000000000..a6a10532dcc2 --- /dev/null +++ b/iceberg/iceberg-handler/src/test/queries/positive/variant_type.q @@ -0,0 +1,212 @@ +-- Mask random uuid +--! qt:replace:/(\s+'uuid'=')\S+('\s*)/$1#Masked#$2/ +-- Mask random snapshot id +--! qt:replace:/('current-snapshot-id'=')\d+/$1#SnapshotId#/ +-- Mask current-snapshot-timestamp-ms +--! qt:replace:/('current-snapshot-timestamp-ms'=')\d+/$1#Masked#/ + +-- Create test table +CREATE EXTERNAL TABLE variant_test_basic ( + id INT, + data VARIANT +) STORED BY ICEBERG tblproperties('format-version'='3'); + +-- Insert primitive types +INSERT INTO variant_test_basic VALUES +(1, parse_json('null')), +(2, parse_json('true')), +(3, parse_json('false')), +(4, parse_json('42')), +(5, parse_json('3.14')), +(6, parse_json('"hello world"')); + +-- Retrieve and verify +SELECT id, to_json(data) as json_data FROM variant_test_basic ORDER BY id; + +-- Create table for complex structures +CREATE EXTERNAL TABLE variant_test_complex ( + id INT, + data VARIANT +) STORED BY ICEBERG tblproperties('format-version'='3');; + +-- Insert complex JSON structures +INSERT INTO variant_test_complex VALUES +(1, parse_json('{"name": "John", "age": 30, "active": true}')), +(2, parse_json('{"nested": {"level1": {"level2": "deep"}}, "array": [1, 2, 3]}')), +(3, parse_json('["apple", "banana", "cherry"]')), +(4, parse_json('{"mixed": [1, "text", true, null, {"key": "value"}]}')), +(5, parse_json('{"empty_obj": {}, "empty_array": [], "null_val": null}')); + +-- Retrieve and verify +SELECT id, to_json(data) as json_data FROM variant_test_complex ORDER BY id; + +-- Create table for edge cases +CREATE EXTERNAL TABLE variant_test_edge_cases ( + id INT, + data VARIANT +) STORED BY ICEBERG tblproperties('format-version'='3');; + +-- Insert edge cases +INSERT INTO variant_test_edge_cases VALUES +(1, parse_json('{"very_long_string": "This is a very long string that should test the string encoding limits and ensure proper handling of large text content in variant types"}')), +(2, parse_json('{"large_number": 123456789012345}')), +(3, parse_json('{"decimal_value": 123.456789}')), +(4, parse_json('{"special_chars": "Hello\\tWorld\\nNew Line! 
\\"Quoted\\""}')), +(5, parse_json('{"unicode": "Hello δΈ–η•Œ 🌍"}')), +(6, parse_json('{"deep_nesting": {"level1": {"level2": {"level3": {"level4": "deep"}}}}}')); + +-- Retrieve and verify +SELECT id, to_json(data) as json_data FROM variant_test_edge_cases ORDER BY id; + +-- Create table for multiple operations +CREATE TABLE variant_test_operations ( + id INT, + metadata VARIANT, + payload VARIANT +) STORED BY ICEBERG tblproperties('format-version'='3'); + +-- Insert data with multiple variant columns +INSERT INTO variant_test_operations VALUES +(1, + parse_json('{"timestamp": "2023-01-01", "version": "1.0"}'), + parse_json('{"user": "john_doe", "actions": ["login", "view", "logout"]}') +), +(2, + parse_json('{"timestamp": "2023-01-02", "version": "1.1"}'), + parse_json('{"user": "jane_smith", "actions": ["login", "edit", "save", "logout"]}') +); + +-- Complex queries with variant data +SELECT + id, + to_json(metadata) as metadata_json, + to_json(payload) as payload_json +FROM variant_test_operations +ORDER BY id; + +-- Test null values +SELECT variant_get(data, '$') as result FROM variant_test_basic WHERE id = 1; + +-- Test boolean true +SELECT variant_get(data, '$') as result FROM variant_test_basic WHERE id = 2; + +-- Test boolean false +SELECT variant_get(data, '$') as result FROM variant_test_basic WHERE id = 3; + +-- Test object field access +SELECT variant_get(data, '$.name') as name FROM variant_test_complex WHERE id = 1; + +SELECT variant_get(data, '$.age') as age FROM variant_test_complex WHERE id = 1; + +SELECT variant_get(data, '$.active') as active FROM variant_test_complex WHERE id = 1; + +-- Test nested object access +SELECT variant_get(data, '$.nested.level1.level2') as deep_value FROM variant_test_complex WHERE id = 2; + +-- Test array access +SELECT variant_get(data, '$[0]') as first_element FROM variant_test_complex WHERE id = 3; + +SELECT variant_get(data, '$[1]') as second_element FROM variant_test_complex WHERE id = 3; + +SELECT variant_get(data, '$[2]') as third_element FROM variant_test_complex WHERE id = 3; + +-- Test mixed array access +SELECT variant_get(data, '$.mixed[0]') as first_mixed FROM variant_test_complex WHERE id = 4; + +SELECT variant_get(data, '$.mixed[1]') as second_mixed FROM variant_test_complex WHERE id = 4; + +SELECT variant_get(data, '$.mixed[2]') as third_mixed FROM variant_test_complex WHERE id = 4; + +SELECT variant_get(data, '$.mixed[3]') as fourth_mixed FROM variant_test_complex WHERE id = 4; + +SELECT variant_get(data, '$.mixed[4].key') as nested_key FROM variant_test_complex WHERE id = 4; + +-- Test empty structures +SELECT variant_get(data, '$.empty_obj') as empty_obj FROM variant_test_complex WHERE id = 5; + +SELECT variant_get(data, '$.empty_array') as empty_array FROM variant_test_complex WHERE id = 5; + +SELECT variant_get(data, '$.null_val') as null_val FROM variant_test_complex WHERE id = 5; + +-- Test long string +SELECT variant_get(data, '$.very_long_string') as long_string FROM variant_test_edge_cases WHERE id = 1; + +-- Test large number +SELECT variant_get(data, '$.large_number') as large_num FROM variant_test_edge_cases WHERE id = 2; + +-- Test decimal value +SELECT variant_get(data, '$.decimal_value') as decimal_val FROM variant_test_edge_cases WHERE id = 3; + +-- Test special characters +SELECT variant_get(data, '$.special_chars') as special_chars FROM variant_test_edge_cases WHERE id = 4; + +-- Test unicode +SELECT variant_get(data, '$.unicode') as unicode_str FROM variant_test_edge_cases WHERE id = 5; + +-- Test deep 
nesting +SELECT variant_get(data, '$.deep_nesting.level1.level2.level3.level4') as deep_value FROM variant_test_edge_cases WHERE id = 6; + +-- Test type casting with primitive values +SELECT + variant_get(data, '$', 'string') as as_string, + variant_get(data, '$', 'int') as as_int, + variant_get(data, '$', 'double') as as_double, + variant_get(data, '$', 'boolean') as as_boolean +FROM variant_test_basic WHERE id = 4; + +-- Test type casting with string values +SELECT + variant_get(data, '$', 'string') as as_string, + variant_get(data, '$', 'int') as as_int, -- Should be null + variant_get(data, '$', 'double') as as_double -- Should be null +FROM variant_test_basic WHERE id = 6; + +-- Test type casting with object fields +SELECT + variant_get(data, '$.age', 'string') as age_string, + variant_get(data, '$.age', 'int') as age_int, + variant_get(data, '$.age', 'double') as age_double +FROM variant_test_complex WHERE id = 1; + +-- Validate complex structures +SELECT + id, + variant_get(data, '$.name') as name, + variant_get(data, '$.age') as age, + variant_get(data, '$.active') as active +FROM variant_test_complex +WHERE id = 1; + +-- Validate array access +SELECT + id, + variant_get(data, '$[0]') as elem0, + variant_get(data, '$[1]') as elem1, + variant_get(data, '$[2]') as elem2 +FROM variant_test_complex +WHERE id = 3; + +-- try with AVRO table +CREATE EXTERNAL TABLE variant_test_basic_avro ( + id INT, + data VARIANT +) STORED BY ICEBERG stored as avro tblproperties('format-version'='3'); + +-- Insert timestamp types +INSERT INTO variant_test_basic_avro VALUES +(7, parse_json('"2023-01-01T12:00:00.123456Z"')), +(8, parse_json('"2023-01-01T12:00:00.123456"')), +(9, parse_json('"2023-01-01T12:00:00.123456789Z"')), +(10, parse_json('"2023-01-01T12:00:00.123456789"')), +(11, parse_json('"12:30:45.123456"')), +(12, parse_json('"2023-12-25"')); + +-- Retrieve and verify timestamps +SELECT id, to_json(data) as json_data FROM variant_test_basic_avro; + +-- Add a variant type column to an existing table +ALTER TABLE variant_test_basic ADD COLUMNS (extra_info VARIANT); +INSERT INTO variant_test_basic VALUES +(7, parse_json('{"key": "value"}'), parse_json('{"additional": "info"}')); + +select id, to_json(data), to_json(extra_info) from variant_test_basic where id = 7; \ No newline at end of file diff --git a/iceberg/iceberg-handler/src/test/results/negative/variant_type_partition_column.q.out b/iceberg/iceberg-handler/src/test/results/negative/variant_type_partition_column.q.out new file mode 100644 index 000000000000..5f1355c80cf1 --- /dev/null +++ b/iceberg/iceberg-handler/src/test/results/negative/variant_type_partition_column.q.out @@ -0,0 +1,18 @@ +PREHOOK: query: CREATE EXTERNAL TABLE variant_test_partition ( + id INT, + data VARIANT +) PARTITIONED BY spec (data) +STORED BY ICEBERG tblproperties('format-version'='3') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@variant_test_partition +FAILED: Execution Error, return code 40000 from org.apache.hadoop.hive.ql.ddl.DDLTask. org.apache.iceberg.exceptions.ValidationException: Cannot partition by non-primitive source field: variant +PREHOOK: query: CREATE EXTERNAL TABLE variant_test_partition ( + id INT, + data VARIANT +) PARTITIONED BY spec (data) +STORED BY ICEBERG tblproperties('format-version'='3') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@variant_test_partition +FAILED: Execution Error, return code 40000 from org.apache.hadoop.hive.ql.ddl.DDLTask. 
org.apache.iceberg.exceptions.ValidationException: Cannot partition by non-primitive source field: variant diff --git a/iceberg/iceberg-handler/src/test/results/positive/variant_type.q.out b/iceberg/iceberg-handler/src/test/results/positive/variant_type.q.out new file mode 100644 index 000000000000..c71778f5c260 --- /dev/null +++ b/iceberg/iceberg-handler/src/test/results/positive/variant_type.q.out @@ -0,0 +1,588 @@ +PREHOOK: query: CREATE EXTERNAL TABLE variant_test_basic ( + id INT, + data VARIANT +) STORED BY ICEBERG tblproperties('format-version'='3') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@variant_test_basic +POSTHOOK: query: CREATE EXTERNAL TABLE variant_test_basic ( + id INT, + data VARIANT +) STORED BY ICEBERG tblproperties('format-version'='3') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@variant_test_basic +PREHOOK: query: INSERT INTO variant_test_basic VALUES +(1, parse_json('null')), +(2, parse_json('true')), +(3, parse_json('false')), +(4, parse_json('42')), +(5, parse_json('3.14')), +(6, parse_json('"hello world"')) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@variant_test_basic +POSTHOOK: query: INSERT INTO variant_test_basic VALUES +(1, parse_json('null')), +(2, parse_json('true')), +(3, parse_json('false')), +(4, parse_json('42')), +(5, parse_json('3.14')), +(6, parse_json('"hello world"')) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@variant_test_basic +PREHOOK: query: SELECT id, to_json(data) as json_data FROM variant_test_basic ORDER BY id +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_basic +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT id, to_json(data) as json_data FROM variant_test_basic ORDER BY id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_basic +POSTHOOK: Output: hdfs://### HDFS PATH ### +1 null +2 true +3 false +4 42 +5 3.14 +6 "hello world" +PREHOOK: query: CREATE EXTERNAL TABLE variant_test_complex ( + id INT, + data VARIANT +) STORED BY ICEBERG tblproperties('format-version'='3') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@variant_test_complex +POSTHOOK: query: CREATE EXTERNAL TABLE variant_test_complex ( + id INT, + data VARIANT +) STORED BY ICEBERG tblproperties('format-version'='3') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@variant_test_complex +PREHOOK: query: INSERT INTO variant_test_complex VALUES +(1, parse_json('{"name": "John", "age": 30, "active": true}')), +(2, parse_json('{"nested": {"level1": {"level2": "deep"}}, "array": [1, 2, 3]}')), +(3, parse_json('["apple", "banana", "cherry"]')), +(4, parse_json('{"mixed": [1, "text", true, null, {"key": "value"}]}')), +(5, parse_json('{"empty_obj": {}, "empty_array": [], "null_val": null}')) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@variant_test_complex +POSTHOOK: query: INSERT INTO variant_test_complex VALUES +(1, parse_json('{"name": "John", "age": 30, "active": true}')), +(2, parse_json('{"nested": {"level1": {"level2": "deep"}}, "array": [1, 2, 3]}')), +(3, parse_json('["apple", "banana", "cherry"]')), +(4, parse_json('{"mixed": [1, "text", true, null, {"key": "value"}]}')), +(5, parse_json('{"empty_obj": {}, "empty_array": [], "null_val": null}')) +POSTHOOK: type: QUERY +POSTHOOK: Input: 
_dummy_database@_dummy_table +POSTHOOK: Output: default@variant_test_complex +PREHOOK: query: SELECT id, to_json(data) as json_data FROM variant_test_complex ORDER BY id +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT id, to_json(data) as json_data FROM variant_test_complex ORDER BY id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +1 {"active":true,"age":30,"name":"John"} +2 {"array":[1,2,3],"nested":{"level1":{"level2":"deep"}}} +3 ["apple","banana","cherry"] +4 {"mixed":[1,"text",true,null,{"key":"value"}]} +5 {"empty_array":[],"empty_obj":{},"null_val":null} +PREHOOK: query: CREATE EXTERNAL TABLE variant_test_edge_cases ( + id INT, + data VARIANT +) STORED BY ICEBERG tblproperties('format-version'='3') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@variant_test_edge_cases +POSTHOOK: query: CREATE EXTERNAL TABLE variant_test_edge_cases ( + id INT, + data VARIANT +) STORED BY ICEBERG tblproperties('format-version'='3') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@variant_test_edge_cases +PREHOOK: query: INSERT INTO variant_test_edge_cases VALUES +(1, parse_json('{"very_long_string": "This is a very long string that should test the string encoding limits and ensure proper handling of large text content in variant types"}')), +(2, parse_json('{"large_number": 123456789012345}')), +(3, parse_json('{"decimal_value": 123.456789}')), +(4, parse_json('{"special_chars": "Hello\\tWorld\\nNew Line! \\"Quoted\\""}')), +(5, parse_json('{"unicode": "Hello δΈ–η•Œ 🌍"}')), +(6, parse_json('{"deep_nesting": {"level1": {"level2": {"level3": {"level4": "deep"}}}}}')) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@variant_test_edge_cases +POSTHOOK: query: INSERT INTO variant_test_edge_cases VALUES +(1, parse_json('{"very_long_string": "This is a very long string that should test the string encoding limits and ensure proper handling of large text content in variant types"}')), +(2, parse_json('{"large_number": 123456789012345}')), +(3, parse_json('{"decimal_value": 123.456789}')), +(4, parse_json('{"special_chars": "Hello\\tWorld\\nNew Line! \\"Quoted\\""}')), +(5, parse_json('{"unicode": "Hello δΈ–η•Œ 🌍"}')), +(6, parse_json('{"deep_nesting": {"level1": {"level2": {"level3": {"level4": "deep"}}}}}')) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@variant_test_edge_cases +PREHOOK: query: SELECT id, to_json(data) as json_data FROM variant_test_edge_cases ORDER BY id +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_edge_cases +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT id, to_json(data) as json_data FROM variant_test_edge_cases ORDER BY id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_edge_cases +POSTHOOK: Output: hdfs://### HDFS PATH ### +1 {"very_long_string":"This is a very long string that should test the string encoding limits and ensure proper handling of large text content in variant types"} +2 {"large_number":123456789012345} +3 {"decimal_value":123.456789} +4 {"special_chars":"Hello\tWorld\nNew Line! 
\"Quoted\""} +5 {"unicode":"Hello δΈ–η•Œ 🌍"} +6 {"deep_nesting":{"level1":{"level2":{"level3":{"level4":"deep"}}}}} +PREHOOK: query: CREATE TABLE variant_test_operations ( + id INT, + metadata VARIANT, + payload VARIANT +) STORED BY ICEBERG tblproperties('format-version'='3') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@variant_test_operations +POSTHOOK: query: CREATE TABLE variant_test_operations ( + id INT, + metadata VARIANT, + payload VARIANT +) STORED BY ICEBERG tblproperties('format-version'='3') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@variant_test_operations +PREHOOK: query: INSERT INTO variant_test_operations VALUES +(1, + parse_json('{"timestamp": "2023-01-01", "version": "1.0"}'), + parse_json('{"user": "john_doe", "actions": ["login", "view", "logout"]}') +), +(2, + parse_json('{"timestamp": "2023-01-02", "version": "1.1"}'), + parse_json('{"user": "jane_smith", "actions": ["login", "edit", "save", "logout"]}') +) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@variant_test_operations +POSTHOOK: query: INSERT INTO variant_test_operations VALUES +(1, + parse_json('{"timestamp": "2023-01-01", "version": "1.0"}'), + parse_json('{"user": "john_doe", "actions": ["login", "view", "logout"]}') +), +(2, + parse_json('{"timestamp": "2023-01-02", "version": "1.1"}'), + parse_json('{"user": "jane_smith", "actions": ["login", "edit", "save", "logout"]}') +) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@variant_test_operations +PREHOOK: query: SELECT + id, + to_json(metadata) as metadata_json, + to_json(payload) as payload_json +FROM variant_test_operations +ORDER BY id +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_operations +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT + id, + to_json(metadata) as metadata_json, + to_json(payload) as payload_json +FROM variant_test_operations +ORDER BY id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_operations +POSTHOOK: Output: hdfs://### HDFS PATH ### +1 {"timestamp":"2023-01-01","version":"1.0"} {"actions":["login","view","logout"],"user":"john_doe"} +2 {"timestamp":"2023-01-02","version":"1.1"} {"actions":["login","edit","save","logout"],"user":"jane_smith"} +PREHOOK: query: SELECT variant_get(data, '$') as result FROM variant_test_basic WHERE id = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_basic +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$') as result FROM variant_test_basic WHERE id = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_basic +POSTHOOK: Output: hdfs://### HDFS PATH ### +NULL +PREHOOK: query: SELECT variant_get(data, '$') as result FROM variant_test_basic WHERE id = 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_basic +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$') as result FROM variant_test_basic WHERE id = 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_basic +POSTHOOK: Output: hdfs://### HDFS PATH ### +true +PREHOOK: query: SELECT variant_get(data, '$') as result FROM variant_test_basic WHERE id = 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_basic +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$') as result FROM variant_test_basic WHERE id = 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: 
default@variant_test_basic +POSTHOOK: Output: hdfs://### HDFS PATH ### +false +PREHOOK: query: SELECT variant_get(data, '$.name') as name FROM variant_test_complex WHERE id = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.name') as name FROM variant_test_complex WHERE id = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +John +PREHOOK: query: SELECT variant_get(data, '$.age') as age FROM variant_test_complex WHERE id = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.age') as age FROM variant_test_complex WHERE id = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +30 +PREHOOK: query: SELECT variant_get(data, '$.active') as active FROM variant_test_complex WHERE id = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.active') as active FROM variant_test_complex WHERE id = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +true +PREHOOK: query: SELECT variant_get(data, '$.nested.level1.level2') as deep_value FROM variant_test_complex WHERE id = 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.nested.level1.level2') as deep_value FROM variant_test_complex WHERE id = 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +deep +PREHOOK: query: SELECT variant_get(data, '$[0]') as first_element FROM variant_test_complex WHERE id = 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$[0]') as first_element FROM variant_test_complex WHERE id = 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +apple +PREHOOK: query: SELECT variant_get(data, '$[1]') as second_element FROM variant_test_complex WHERE id = 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$[1]') as second_element FROM variant_test_complex WHERE id = 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +banana +PREHOOK: query: SELECT variant_get(data, '$[2]') as third_element FROM variant_test_complex WHERE id = 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$[2]') as third_element FROM variant_test_complex WHERE id = 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +cherry +PREHOOK: query: SELECT variant_get(data, '$.mixed[0]') as first_mixed FROM variant_test_complex WHERE id = 4 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.mixed[0]') as first_mixed FROM variant_test_complex WHERE id = 4 +POSTHOOK: type: QUERY +POSTHOOK: 
Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +1 +PREHOOK: query: SELECT variant_get(data, '$.mixed[1]') as second_mixed FROM variant_test_complex WHERE id = 4 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.mixed[1]') as second_mixed FROM variant_test_complex WHERE id = 4 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +text +PREHOOK: query: SELECT variant_get(data, '$.mixed[2]') as third_mixed FROM variant_test_complex WHERE id = 4 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.mixed[2]') as third_mixed FROM variant_test_complex WHERE id = 4 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +true +PREHOOK: query: SELECT variant_get(data, '$.mixed[3]') as fourth_mixed FROM variant_test_complex WHERE id = 4 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.mixed[3]') as fourth_mixed FROM variant_test_complex WHERE id = 4 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +NULL +PREHOOK: query: SELECT variant_get(data, '$.mixed[4].key') as nested_key FROM variant_test_complex WHERE id = 4 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.mixed[4].key') as nested_key FROM variant_test_complex WHERE id = 4 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +value +PREHOOK: query: SELECT variant_get(data, '$.empty_obj') as empty_obj FROM variant_test_complex WHERE id = 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.empty_obj') as empty_obj FROM variant_test_complex WHERE id = 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +{} +PREHOOK: query: SELECT variant_get(data, '$.empty_array') as empty_array FROM variant_test_complex WHERE id = 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.empty_array') as empty_array FROM variant_test_complex WHERE id = 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +[] +PREHOOK: query: SELECT variant_get(data, '$.null_val') as null_val FROM variant_test_complex WHERE id = 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.null_val') as null_val FROM variant_test_complex WHERE id = 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +NULL +PREHOOK: query: SELECT variant_get(data, '$.very_long_string') as long_string FROM variant_test_edge_cases WHERE id = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_edge_cases +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.very_long_string') as 
long_string FROM variant_test_edge_cases WHERE id = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_edge_cases +POSTHOOK: Output: hdfs://### HDFS PATH ### +This is a very long string that should test the string encoding limits and ensure proper handling of large text content in variant types +PREHOOK: query: SELECT variant_get(data, '$.large_number') as large_num FROM variant_test_edge_cases WHERE id = 2 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_edge_cases +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.large_number') as large_num FROM variant_test_edge_cases WHERE id = 2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_edge_cases +POSTHOOK: Output: hdfs://### HDFS PATH ### +123456789012345 +PREHOOK: query: SELECT variant_get(data, '$.decimal_value') as decimal_val FROM variant_test_edge_cases WHERE id = 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_edge_cases +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.decimal_value') as decimal_val FROM variant_test_edge_cases WHERE id = 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_edge_cases +POSTHOOK: Output: hdfs://### HDFS PATH ### +123.456789 +PREHOOK: query: SELECT variant_get(data, '$.special_chars') as special_chars FROM variant_test_edge_cases WHERE id = 4 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_edge_cases +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.special_chars') as special_chars FROM variant_test_edge_cases WHERE id = 4 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_edge_cases +POSTHOOK: Output: hdfs://### HDFS PATH ### +Hello World +New Line! "Quoted" +PREHOOK: query: SELECT variant_get(data, '$.unicode') as unicode_str FROM variant_test_edge_cases WHERE id = 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_edge_cases +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.unicode') as unicode_str FROM variant_test_edge_cases WHERE id = 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_edge_cases +POSTHOOK: Output: hdfs://### HDFS PATH ### +Hello δΈ–η•Œ 🌍 +PREHOOK: query: SELECT variant_get(data, '$.deep_nesting.level1.level2.level3.level4') as deep_value FROM variant_test_edge_cases WHERE id = 6 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_edge_cases +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT variant_get(data, '$.deep_nesting.level1.level2.level3.level4') as deep_value FROM variant_test_edge_cases WHERE id = 6 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_edge_cases +POSTHOOK: Output: hdfs://### HDFS PATH ### +deep +PREHOOK: query: SELECT + variant_get(data, '$', 'string') as as_string, + variant_get(data, '$', 'int') as as_int, + variant_get(data, '$', 'double') as as_double, + variant_get(data, '$', 'boolean') as as_boolean +FROM variant_test_basic WHERE id = 4 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_basic +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT + variant_get(data, '$', 'string') as as_string, + variant_get(data, '$', 'int') as as_int, + variant_get(data, '$', 'double') as as_double, + variant_get(data, '$', 'boolean') as as_boolean +FROM variant_test_basic WHERE id = 4 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_basic +POSTHOOK: Output: hdfs://### HDFS PATH ### +42 42 42.0 true +PREHOOK: query: SELECT + 
variant_get(data, '$', 'string') as as_string, + variant_get(data, '$', 'int') as as_int, -- Should be null + variant_get(data, '$', 'double') as as_double -- Should be null +FROM variant_test_basic WHERE id = 6 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_basic +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT + variant_get(data, '$', 'string') as as_string, + variant_get(data, '$', 'int') as as_int, -- Should be null + variant_get(data, '$', 'double') as as_double -- Should be null +FROM variant_test_basic WHERE id = 6 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_basic +POSTHOOK: Output: hdfs://### HDFS PATH ### +hello world NULL NULL +PREHOOK: query: SELECT + variant_get(data, '$.age', 'string') as age_string, + variant_get(data, '$.age', 'int') as age_int, + variant_get(data, '$.age', 'double') as age_double +FROM variant_test_complex WHERE id = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT + variant_get(data, '$.age', 'string') as age_string, + variant_get(data, '$.age', 'int') as age_int, + variant_get(data, '$.age', 'double') as age_double +FROM variant_test_complex WHERE id = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +30 30 30.0 +PREHOOK: query: SELECT + id, + variant_get(data, '$.name') as name, + variant_get(data, '$.age') as age, + variant_get(data, '$.active') as active +FROM variant_test_complex +WHERE id = 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT + id, + variant_get(data, '$.name') as name, + variant_get(data, '$.age') as age, + variant_get(data, '$.active') as active +FROM variant_test_complex +WHERE id = 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +1 John 30 true +PREHOOK: query: SELECT + id, + variant_get(data, '$[0]') as elem0, + variant_get(data, '$[1]') as elem1, + variant_get(data, '$[2]') as elem2 +FROM variant_test_complex +WHERE id = 3 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_complex +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT + id, + variant_get(data, '$[0]') as elem0, + variant_get(data, '$[1]') as elem1, + variant_get(data, '$[2]') as elem2 +FROM variant_test_complex +WHERE id = 3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_complex +POSTHOOK: Output: hdfs://### HDFS PATH ### +3 apple banana cherry +PREHOOK: query: CREATE EXTERNAL TABLE variant_test_basic_avro ( + id INT, + data VARIANT +) STORED BY ICEBERG stored as avro tblproperties('format-version'='3') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@variant_test_basic_avro +POSTHOOK: query: CREATE EXTERNAL TABLE variant_test_basic_avro ( + id INT, + data VARIANT +) STORED BY ICEBERG stored as avro tblproperties('format-version'='3') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@variant_test_basic_avro +PREHOOK: query: INSERT INTO variant_test_basic_avro VALUES +(7, parse_json('"2023-01-01T12:00:00.123456Z"')), +(8, parse_json('"2023-01-01T12:00:00.123456"')), +(9, parse_json('"2023-01-01T12:00:00.123456789Z"')), +(10, parse_json('"2023-01-01T12:00:00.123456789"')), +(11, parse_json('"12:30:45.123456"')), +(12, parse_json('"2023-12-25"')) +PREHOOK: type: QUERY +PREHOOK: Input: 
_dummy_database@_dummy_table +PREHOOK: Output: default@variant_test_basic_avro +POSTHOOK: query: INSERT INTO variant_test_basic_avro VALUES +(7, parse_json('"2023-01-01T12:00:00.123456Z"')), +(8, parse_json('"2023-01-01T12:00:00.123456"')), +(9, parse_json('"2023-01-01T12:00:00.123456789Z"')), +(10, parse_json('"2023-01-01T12:00:00.123456789"')), +(11, parse_json('"12:30:45.123456"')), +(12, parse_json('"2023-12-25"')) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@variant_test_basic_avro +PREHOOK: query: SELECT id, to_json(data) as json_data FROM variant_test_basic_avro +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_basic_avro +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: SELECT id, to_json(data) as json_data FROM variant_test_basic_avro +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_basic_avro +POSTHOOK: Output: hdfs://### HDFS PATH ### +7 "2023-01-01T12:00:00.123456Z" +8 "2023-01-01T12:00:00.123456" +9 "2023-01-01T12:00:00.123456789Z" +10 "2023-01-01T12:00:00.123456789" +11 "12:30:45.123456" +12 "2023-12-25" +PREHOOK: query: ALTER TABLE variant_test_basic ADD COLUMNS (extra_info VARIANT) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@variant_test_basic +PREHOOK: Output: default@variant_test_basic +POSTHOOK: query: ALTER TABLE variant_test_basic ADD COLUMNS (extra_info VARIANT) +POSTHOOK: type: ALTERTABLE_ADDCOLS +POSTHOOK: Input: default@variant_test_basic +POSTHOOK: Output: default@variant_test_basic +PREHOOK: query: INSERT INTO variant_test_basic VALUES +(7, parse_json('{"key": "value"}'), parse_json('{"additional": "info"}')) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@variant_test_basic +POSTHOOK: query: INSERT INTO variant_test_basic VALUES +(7, parse_json('{"key": "value"}'), parse_json('{"additional": "info"}')) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@variant_test_basic +PREHOOK: query: select id, to_json(data), to_json(extra_info) from variant_test_basic where id = 7 +PREHOOK: type: QUERY +PREHOOK: Input: default@variant_test_basic +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: select id, to_json(data), to_json(extra_info) from variant_test_basic where id = 7 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@variant_test_basic +POSTHOOK: Output: hdfs://### HDFS PATH ### +7 {"key":"value"} {"additional":"info"} diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveLexerParent.g b/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveLexerParent.g index bf13e8c4ea98..de06106b7b30 100644 --- a/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveLexerParent.g +++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveLexerParent.g @@ -126,6 +126,7 @@ KW_ARRAY: 'ARRAY'; KW_STRUCT: 'STRUCT'; KW_MAP: 'MAP'; KW_UNIONTYPE: 'UNIONTYPE'; +KW_VARIANT: 'VARIANT'; KW_REDUCE: 'REDUCE'; KW_PARTITIONED: 'PARTITIONED'; KW_CLUSTERED: 'CLUSTERED'; diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g b/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g index 713c5c7248ec..ddce6aa85af6 100644 --- a/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g +++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/HiveParser.g @@ -156,6 +156,7 @@ TOK_LIST; TOK_STRUCT; TOK_MAP; TOK_UNIONTYPE; +TOK_VARIANT; TOK_COLTYPELIST; TOK_CREATECATALOG; TOK_CREATEDATABASE; @@ -2389,7 +2390,8 @@ type | listType | structType | mapType - | unionType; + | 
unionType + | variantType; primitiveType @init { pushMsg("primitive type specification", state); } @@ -2442,6 +2444,12 @@ unionType : KW_UNIONTYPE LESSTHAN colTypeList GREATERTHAN -> ^(TOK_UNIONTYPE colTypeList) ; +variantType +@init { pushMsg("variant type", state); } +@after { popMsg(state); } + : KW_VARIANT -> TOK_VARIANT + ; + setOperator @init { pushMsg("set operator", state); } @after { popMsg(state); } diff --git a/parser/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g b/parser/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g index 7f08cb8828ea..54ec367a677d 100644 --- a/parser/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g +++ b/parser/src/java/org/apache/hadoop/hive/ql/parse/IdentifiersParser.g @@ -1045,5 +1045,5 @@ nonReserved //The following SQL2011 reserved keywords are used as function name only, but not as identifiers. sql11ReservedKeywordsUsedAsFunctionName : - KW_IF | KW_ARRAY | KW_MAP | KW_BIGINT | KW_BINARY | KW_BOOLEAN | KW_CURRENT_DATE | KW_CURRENT_TIMESTAMP | KW_DATE | KW_DOUBLE | KW_FLOAT | KW_REAL | KW_GROUPING | KW_INT | KW_SMALLINT | KW_TIMESTAMP + KW_IF | KW_ARRAY | KW_MAP | KW_BIGINT | KW_BINARY | KW_BOOLEAN | KW_CURRENT_DATE | KW_CURRENT_TIMESTAMP | KW_DATE | KW_DOUBLE | KW_FLOAT | KW_REAL | KW_GROUPING | KW_INT | KW_SMALLINT | KW_TIMESTAMP | KW_VARIANT ; diff --git a/parser/src/test/org/apache/hadoop/hive/ql/parse/TestReservedWords.java b/parser/src/test/org/apache/hadoop/hive/ql/parse/TestReservedWords.java index 99d6f195d14e..50491173384e 100644 --- a/parser/src/test/org/apache/hadoop/hive/ql/parse/TestReservedWords.java +++ b/parser/src/test/org/apache/hadoop/hive/ql/parse/TestReservedWords.java @@ -173,7 +173,8 @@ public static Collection data() { "WHEN", "WHERE", "WINDOW", - "WITH" + "WITH", + "VARIANT" ); } diff --git a/pom.xml b/pom.xml index 58aa3532d086..a3ff777005d1 100644 --- a/pom.xml +++ b/pom.xml @@ -126,7 +126,7 @@ 1.10 1.1 2.16.1 - 3.12.0 + 3.14.0 3.6.1 2.12.0 1.10.0 diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index 33c8153e8602..7991abc8433a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -785,6 +785,11 @@ public final class FunctionRegistry { system.registerFunction("ST_Y", ST_Y.class); system.registerFunction("ST_Z", ST_Z.class); + // Iceberg UDFs + system.registerGenericUDF("parse_json", GenericUDFParseJson.class); + system.registerGenericUDF("to_json", GenericUDFToJson.class); + system.registerGenericUDF("variant_get", GenericUDFVariantGet.class); + system.registerGenericUDF("try_variant_get", GenericUDFTryVariantGet.class); try { system.registerGenericUDF("iceberg_bucket", diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java index 884aa2016279..736e6e8c9f1a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Partition.java @@ -42,6 +42,7 @@ import org.apache.hadoop.hive.metastore.api.FieldSchema; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.api.Order; +import org.apache.hadoop.hive.ql.ddl.DDLUtils; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; import org.apache.hadoop.hive.ql.io.HiveOutputFormat; @@ -600,7 +601,7 @@ public Map, 
String> getSkewedColValueLocationMaps() { public void checkValidity() throws HiveException { if (!tPartition.getSd().equals(table.getSd())) { - Table.validateColumns(getCols(), table.getPartCols()); + Table.validateColumns(getCols(), table.getPartCols(), DDLUtils.isIcebergTable(table)); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java index 7dc1e16236f2..8a37073509ef 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/Table.java @@ -60,6 +60,7 @@ import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils; import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; +import org.apache.hadoop.hive.ql.ddl.DDLUtils; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.io.AcidUtils; import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils; @@ -87,6 +88,7 @@ import com.google.common.base.Preconditions; import static org.apache.commons.lang3.StringUtils.isBlank; +import static org.apache.hadoop.hive.serde.serdeConstants.VARIANT_TYPE_NAME; /** * A Hive Table: is a fundamental unit of data in Hive that shares a common schema/DDL. @@ -263,10 +265,12 @@ public void setTTable(org.apache.hadoop.hive.metastore.api.Table tTable) { public void checkValidity(Configuration conf) throws HiveException { // check for validity validateName(conf); + if (getCols().isEmpty()) { - throw new HiveException( - "at least one column must be specified for the table"); + throw new HiveException("at least one column must be specified for the table"); } + validateColumns(getCols(), getPartCols(), DDLUtils.isIcebergTable(this)); + if (!isView()) { if (null == getDeserializer(false)) { throw new HiveException("must specify a non-null serDe"); @@ -286,8 +290,6 @@ public void checkValidity(Configuration conf) throws HiveException { assert(getViewOriginalText() == null); assert(getViewExpandedText() == null); } - - validateColumns(getCols(), getPartCols()); } public void validateName(Configuration conf) throws HiveException { @@ -1149,7 +1151,7 @@ public static boolean shouldStoreFieldsInMetastore( return deserializer.shouldStoreFieldsInMetastore(tableParams); } - public static void validateColumns(List columns, List partCols) + public static void validateColumns(List columns, List partCols, boolean icebergTable) throws HiveException { Set colNames = new HashSet<>(); for (FieldSchema col: columns) { @@ -1158,6 +1160,10 @@ public static void validateColumns(List columns, List throw new HiveException("Duplicate column name " + colName + " in the table definition."); } + if (!icebergTable && VARIANT_TYPE_NAME.equalsIgnoreCase(col.getType())) { + throw new HiveException( + "Column name " + colName + " cannot be of type 'variant' as it is not supported in non-Iceberg tables."); + } colNames.add(colName); } if (partCols != null) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java index b2da8195c4ec..a23502b34bd9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/BaseSemanticAnalyzer.java @@ -1047,6 +1047,7 @@ public static String getTypeStringFromAST(ASTNode typeNode) TOKEN_TO_TYPE.put(HiveParser.TOK_INTERVAL_YEAR_MONTH, serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME); 
TOKEN_TO_TYPE.put(HiveParser.TOK_INTERVAL_DAY_TIME, serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME); TOKEN_TO_TYPE.put(HiveParser.TOK_DECIMAL, serdeConstants.DECIMAL_TYPE_NAME); + TOKEN_TO_TYPE.put(HiveParser.TOK_VARIANT, serdeConstants.VARIANT_TYPE_NAME); } private static String getTypeName(ASTNode node) throws SemanticException { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFParseJson.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFParseJson.java new file mode 100644 index 000000000000..e03bbf68f3b9 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFParseJson.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.udf.generic; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.variant.Variant; +import org.apache.hadoop.hive.serde2.variant.VariantBuilder; + +import java.io.IOException; +import java.util.List; + +@Description(name = "parse_json", value = "_FUNC_(json_string) - Parses a JSON string into a VARIANT type", extended = """ + Example: + > SELECT _FUNC_('{"a":5}'); + {"a":5}""") +public class GenericUDFParseJson extends GenericUDF { + private PrimitiveObjectInspector inputOI; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 1) { + throw new UDFArgumentLengthException("parse_json requires one argument"); + } + if (arguments[0].getCategory() != ObjectInspector.Category.PRIMITIVE + || ((PrimitiveObjectInspector) arguments[0]).getPrimitiveCategory() + != PrimitiveObjectInspector.PrimitiveCategory.STRING) { + throw new UDFArgumentTypeException(0, "Only string input is accepted"); + } + inputOI = (PrimitiveObjectInspector) arguments[0]; + + // Return a Variant OI + return ObjectInspectorFactory.getVariantObjectInspector(); + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + Object input = arguments[0].get(); + if (input == null) { + return null; + } + String json = inputOI.getPrimitiveJavaObject(input).toString(); + try { + Variant variant = VariantBuilder.parseJson(json, true); + return List.of(variant.getMetadata(), variant.getValue()); + + } catch (IOException e) { + throw 
new HiveException("Failed to parse JSON: " + json, e); + } + } + + @Override + public String getDisplayString(String[] children) { + return "parse_json(" + children[0] + ")"; + } +} \ No newline at end of file diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFToJson.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFToJson.java new file mode 100644 index 000000000000..3827f68a1ac3 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFToJson.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.udf.generic; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.variant.Variant; + +import java.time.ZoneOffset; + +public class GenericUDFToJson extends GenericUDF { + private StructObjectInspector inputOI; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length != 1) { + throw new UDFArgumentException("to_json takes exactly 1 argument"); + } + if (!(arguments[0] instanceof StructObjectInspector)) { + throw new UDFArgumentTypeException(0, "Argument must be VARIANT (struct)"); + } + inputOI = (StructObjectInspector) arguments[0]; + return PrimitiveObjectInspectorFactory.javaStringObjectInspector; + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + Object variantObj = arguments[0].get(); + if (variantObj == null) { + return null; + } + Variant variant = Variant.from(inputOI.getStructFieldsDataAsList(variantObj)); + // convert to JSON + return variant.toJson(ZoneOffset.UTC); + } + + @Override + public String getDisplayString(String[] children) { + return "to_json(" + children[0] + ")"; + } +} \ No newline at end of file diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFTryVariantGet.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFTryVariantGet.java new file mode 100644 index 000000000000..cfe3238bf4ff --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFTryVariantGet.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.udf.generic; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.metadata.HiveException; + +@Description( + name = "try_variant_get", + value = "_FUNC_(variant, path[, type]) - Extracts a sub-variant from variant according to path, and casts it to type. Returns null on error.", + extended = """ + Example: + > SELECT _FUNC_(parse_json('{"a": 1}'), '$.a', 'int'); + 1 + > SELECT _FUNC_(parse_json('[1, "hello"]'), '$[1]', 'int'); + NULL""" +) +public class GenericUDFTryVariantGet extends GenericUDFVariantGet { + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + try { + return super.evaluate(arguments); + } catch (Exception e) { + return null; // try_variant_get returns null on errors instead of throwing + } + } + + @Override + public String getDisplayString(String[] children) { + return "try_variant_get(" + String.join(", ", children) + ")"; + } +} \ No newline at end of file diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFVariantGet.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFVariantGet.java new file mode 100644 index 000000000000..f3fb0c12897f --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFVariantGet.java @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.udf.generic; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.variant.Variant; +import org.apache.hadoop.hive.serde2.variant.VariantUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.time.Instant; +import java.time.LocalDate; +import java.time.ZoneOffset; +import java.time.temporal.ChronoUnit; + +import java.util.ArrayList; +import java.util.Base64; +import java.util.Collections; +import java.util.List; + +@Description(name = "variant_get", value = "_FUNC_(variant, path[, type]) - Extracts a sub-variant from variant according to path, and casts it to type", extended = """ + Example: + > SELECT _FUNC_(parse_json('{"a": 1}'), '$.a', 'int'); + 1 + > SELECT _FUNC_(parse_json('{"a": 1}'), '$.b', 'int'); + NULL + > SELECT _FUNC_(parse_json('[1, "2"]'), '$[1]', 'string'); + 2 + > SELECT _FUNC_(parse_json('[1, "hello"]'), '$[1]'); + "hello\"""") +public class GenericUDFVariantGet extends GenericUDF { + private static final Logger LOG = LoggerFactory.getLogger(GenericUDFVariantGet.class); + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private StructObjectInspector variantOI; + private PrimitiveObjectInspector pathOI; + + private PrimitiveObjectInspector typeOI; + private boolean hasTypeArgument; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + if (arguments.length < 2 || arguments.length > 3) { + throw new UDFArgumentException("variant_get requires 2 or 3 arguments"); + } + + if (!(arguments[0] instanceof StructObjectInspector)) { + throw new UDFArgumentException("First argument must be VARIANT"); + } + variantOI = (StructObjectInspector) arguments[0]; + + if (!(arguments[1] instanceof PrimitiveObjectInspector)) { + throw new UDFArgumentException("Second argument must be string path"); + } + pathOI = (PrimitiveObjectInspector) arguments[1]; + + hasTypeArgument = arguments.length == 3; + if (hasTypeArgument) { + if (!(arguments[2] instanceof PrimitiveObjectInspector)) { + throw new UDFArgumentException("Third argument must be string type name"); + } + typeOI = (PrimitiveObjectInspector) arguments[2]; + } + + return PrimitiveObjectInspectorFactory.javaStringObjectInspector; + } + + @Override + public Object evaluate(DeferredObject[] arguments) throws HiveException { + try { + Object variantObj = arguments[0].get(); + if (variantObj == null) { + return null; + } + Variant variant = Variant.from(variantOI.getStructFieldsDataAsList(variantObj)); + + Object pathObj = arguments[1].get(); + if (pathObj == null) { + return null; + } + String path = pathOI.getPrimitiveJavaObject(pathObj).toString(); + + String targetType = null; + if (hasTypeArgument) { + Object typeObj = arguments[2].get(); + if (typeObj != null) { + targetType = typeOI.getPrimitiveJavaObject(typeObj).toString(); + } + } + + Variant result = extractValueByPath(variant, path); + // cast to target type + 
return castValue(result, targetType); + + } catch (Exception e) { + throw new HiveException("Failed to extract variant: " + e.getMessage(), e); + } + } + + @Override + public String getDisplayString(String[] children) { + return "variant_get(" + String.join(", ", children) + ")"; + } + + /** + * Extract a variant value by following a JSONPath-like path. + * Supports complex nested patterns like: $.field, $[0], $.field[1], $.mixed[4].key, etc. + */ + private static Variant extractValueByPath(Variant variant, String path) { + if (variant == null || path == null) { + return null; + } + try { + List tokens = VariantPathParser.parse(path); + Variant current = variant; + + for (VariantToken token : tokens) { + if (current == null) { + // The path goes deeper than the object structure. + return null; + } + current = token.get(current); + } + return current; + + } catch (IllegalArgumentException e) { + LOG.warn("Invalid path syntax provided: {}", e.getMessage()); + return null; + } + } + + private static Object castValue(Variant value, String targetType) { + if (value == null || value.getType() == VariantUtil.Type.NULL) { + return null; + } + if (targetType == null) { + return unescapeJson(value.toJson(ZoneOffset.UTC)); + } + try { + return switch (targetType.toLowerCase()) { + case "boolean", "bool" -> toBoolean(value); + case "int", "integer" -> toInteger(value); + case "long" -> toLong(value); + case "double" -> toDouble(value); + case "string" -> toString(value); + default -> throw new IllegalArgumentException("Unsupported target type: " + targetType); + }; + } catch (NumberFormatException e) { + LOG.warn("Invalid target type syntax provided: {}", e.getMessage()); + return null; + } + } + + private static Integer toInteger(Variant value) { + return switch (value.getType()) { + case LONG -> (int) value.getLong(); + case DOUBLE -> (int) value.getDouble(); + case FLOAT -> (int) value.getFloat(); + case DECIMAL -> value.getDecimal().intValue(); + case STRING -> Integer.parseInt(value.getString()); + case BOOLEAN -> value.getBoolean() ? 1 : 0; + default -> null; + }; + } + + private static Long toLong(Variant value) { + return switch (value.getType()) { + case LONG -> value.getLong(); + case DOUBLE -> (long) value.getDouble(); + case FLOAT -> (long) value.getFloat(); + case DECIMAL -> value.getDecimal().longValue(); + case STRING -> Long.parseLong(value.getString()); + case BOOLEAN -> value.getBoolean() ? 1L : 0L; + case DATE -> value.getLong(); // Return days since epoch + case TIMESTAMP, TIMESTAMP_NTZ -> value.getLong(); // Return microseconds since epoch + default -> null; + }; + } + + private static Double toDouble(Variant value) { + return switch (value.getType()) { + case LONG -> (double) value.getLong(); + case DOUBLE -> value.getDouble(); + case FLOAT -> (double) value.getFloat(); + case DECIMAL -> value.getDecimal().doubleValue(); + case STRING -> Double.parseDouble(value.getString()); + case BOOLEAN -> value.getBoolean() ? 
1.0 : 0.0; + default -> null; + }; + } + + private static Boolean toBoolean(Variant value) { + return switch (value.getType()) { + case BOOLEAN -> value.getBoolean(); + case LONG -> value.getLong() != 0; + case DOUBLE -> value.getDouble() != 0.0; + case FLOAT -> value.getFloat() != 0.0f; + case STRING -> Boolean.parseBoolean(value.getString()); + default -> null; + }; + } + + private static String toString(Variant value) { + return switch (value.getType()) { + case BOOLEAN -> String.valueOf(value.getBoolean()); + case LONG -> String.valueOf(value.getLong()); + case DOUBLE -> String.valueOf(value.getDouble()); + case FLOAT -> String.valueOf(value.getFloat()); + case DECIMAL -> value.getDecimal().toPlainString(); + case STRING -> value.getString(); + case BINARY -> Base64.getEncoder().encodeToString(value.getBinary()); + case DATE -> LocalDate.ofEpochDay(value.getLong()).toString(); + case TIMESTAMP, TIMESTAMP_NTZ -> { + Instant instant = Instant.EPOCH.plus(value.getLong(), ChronoUnit.MICROS); + yield instant.toString(); + } + case UUID -> value.getUuid().toString(); + case OBJECT, ARRAY -> value.toJson(ZoneOffset.UTC); + default -> null; + }; + } + + /** + * Represents a single segment in a parsed path. + */ + private interface VariantToken { + Variant get(Variant target); + } + + /** + * A {@link VariantToken} representing an object field access (e.g., ".name"). + */ + private record FieldToken(String key) implements VariantToken { + @Override + public Variant get(Variant target) { + if (target != null && target.getType() == VariantUtil.Type.OBJECT) { + return target.getFieldByKey(key); + } + return null; + } + } + + /** + * A {@link VariantToken} representing an array element access (e.g., "[123]"). + */ + private record IndexToken(int index) implements VariantToken { + @Override + public Variant get(Variant target) { + if (target != null && target.getType() == VariantUtil.Type.ARRAY) { + return target.getElementAtIndex(index); + } + return null; + } + } + + /** + * A simple parser for a simplified JSONPath-like syntax. + */ + private static final class VariantPathParser { + /** + * Parses a path string into a sequence of {@link VariantToken} tokens. + */ + public static List parse(String path) { + if (path == null || !path.startsWith("$")) { + throw new IllegalArgumentException("Invalid path: must start with '$'."); + } + if (path.length() == 1) { + return Collections.emptyList(); // root path itself + } + List tokens = new ArrayList<>(); + int i = 1; // Current position, start after the '$' + + while (i < path.length()) { + char c = path.charAt(i); + if (c == '.') { + i++; // Move past the dot + int start = i; + // Find the end of the field name (next dot or bracket) + while (i < path.length() && path.charAt(i) != '.' 
&& path.charAt(i) != '[') { + i++; + } + String key = path.substring(start, i); + if (key.isEmpty()) { + throw new IllegalArgumentException( + "Invalid path: empty field name at position " + start); + } + tokens.add(new FieldToken(key)); + + } else if (c == '[') { + i++; // Move past the opening bracket + int start = i; + int end = path.indexOf(']', start); + if (end == -1) { + throw new IllegalArgumentException( + "Invalid path: unclosed array index at position " + start); + } + String indexStr = path.substring(start, end).trim(); + try { + int index = Integer.parseInt(indexStr); + tokens.add(new IndexToken(index)); + + } catch (NumberFormatException e) { + throw new IllegalArgumentException( + "Invalid path: non-integer array index '" + indexStr + "'"); + } + i = end + 1; // Move past the closing bracket + } else { + throw new IllegalArgumentException( + "Invalid path: unexpected character '" + c + "' at position " + i); + } + } + return tokens; + } + } + + private static String unescapeJson(String str) { + if (str == null) { + return null; + } + // For arrays and objects, return as-is (e.g., "[]", "{}") + if (str.startsWith("[") || str.startsWith("{")) { + return str; + } + try { + return MAPPER.readValue(str, String.class); + } catch (JsonProcessingException e) { + return null; + } + } +} \ No newline at end of file diff --git a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java index 07699e0fc601..f975da045ad7 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/stats/TestStatsUtils.java @@ -85,6 +85,7 @@ public void testPrimitiveSizeEstimations() throws Exception { exclusions.add(serdeConstants.MAP_TYPE_NAME); exclusions.add(serdeConstants.STRUCT_TYPE_NAME); exclusions.add(serdeConstants.UNION_TYPE_NAME); + exclusions.add(serdeConstants.VARIANT_TYPE_NAME); Field[] serdeFields = serdeConstants.class.getFields(); for (Field field : serdeFields) { if (!Modifier.isStatic(field.getModifiers())) { diff --git a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFInitializeOnCompareUDF.java b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFInitializeOnCompareUDF.java index ff8d66ed4b2e..90623ba2a715 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFInitializeOnCompareUDF.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFInitializeOnCompareUDF.java @@ -97,7 +97,7 @@ private static Stream generateArgsWithSameCategoryNoBothPrimitive( return generateArguments().stream().filter(args -> { ObjectInspector.Category left = args.left.getCategory(); ObjectInspector.Category right = args.right.getCategory(); - return left.equals(right) && !(PRIMITIVE.equals(left) && PRIMITIVE.equals(right)); + return left.equals(right) && !PRIMITIVE.equals(left); }); } diff --git a/ql/src/test/queries/clientnegative/variant_type_add_column.q b/ql/src/test/queries/clientnegative/variant_type_add_column.q new file mode 100644 index 000000000000..d6fb332f9434 --- /dev/null +++ b/ql/src/test/queries/clientnegative/variant_type_add_column.q @@ -0,0 +1,2 @@ +create table emp(id int); +alter table emp add columns (v VARIANT); \ No newline at end of file diff --git a/ql/src/test/queries/clientnegative/variant_type_non_iceberg_table.q b/ql/src/test/queries/clientnegative/variant_type_non_iceberg_table.q new file mode 100644 index 000000000000..76fc7da394e9 --- /dev/null +++ 
b/ql/src/test/queries/clientnegative/variant_type_non_iceberg_table.q @@ -0,0 +1 @@ +CREATE EXTERNAL TABLE variant_test_basic (id INT, data VARIANT); \ No newline at end of file diff --git a/ql/src/test/results/clientnegative/variant_type_add_column.q.out b/ql/src/test/results/clientnegative/variant_type_add_column.q.out new file mode 100644 index 000000000000..f4c4fc55327d --- /dev/null +++ b/ql/src/test/results/clientnegative/variant_type_add_column.q.out @@ -0,0 +1,13 @@ +PREHOOK: query: create table emp(id int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@emp +POSTHOOK: query: create table emp(id int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@emp +PREHOOK: query: alter table emp add columns (v VARIANT) +PREHOOK: type: ALTERTABLE_ADDCOLS +PREHOOK: Input: default@emp +PREHOOK: Output: default@emp +FAILED: Execution Error, return code 40000 from org.apache.hadoop.hive.ql.ddl.DDLTask. Column name v cannot be of type 'variant' as it is not supported in non-Iceberg tables. diff --git a/ql/src/test/results/clientnegative/variant_type_non_iceberg_table.q.out b/ql/src/test/results/clientnegative/variant_type_non_iceberg_table.q.out new file mode 100644 index 000000000000..49dfecd07ccf --- /dev/null +++ b/ql/src/test/results/clientnegative/variant_type_non_iceberg_table.q.out @@ -0,0 +1,5 @@ +PREHOOK: query: CREATE EXTERNAL TABLE variant_test_basic (id INT, data VARIANT) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@variant_test_basic +FAILED: Execution Error, return code 40000 from org.apache.hadoop.hive.ql.ddl.DDLTask. org.apache.hadoop.hive.ql.metadata.HiveException: Column name data cannot be of type 'variant' as it is not supported in non-Iceberg tables. 
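The negative tests above cover rejection of variant columns outside Iceberg; the positive code paths compose as parse_json -> variant_get -> to_json. A minimal round-trip sketch of that flow, not part of the patch: it only uses the Variant and VariantBuilder methods that appear in the serde2 diff below (parseJson, getFieldByKey, getElementAtIndex, getLong, getString, toJson), while the wrapper class and main method are hypothetical scaffolding.

// Minimal sketch (assumed harness, not part of this change) exercising the new
// org.apache.hadoop.hive.serde2.variant API introduced by the patch.
import java.io.IOException;
import java.time.ZoneOffset;
import org.apache.hadoop.hive.serde2.variant.Variant;
import org.apache.hadoop.hive.serde2.variant.VariantBuilder;

public class VariantRoundTripSketch {
  public static void main(String[] args) throws IOException {
    // parse_json path: JSON text is encoded into binary variant metadata + value
    Variant v = VariantBuilder.parseJson("{\"a\": 1, \"b\": [\"x\", \"y\"]}", true);

    // variant_get '$.a' path: object field lookup followed by a scalar read
    System.out.println(v.getFieldByKey("a").getLong());                         // 1

    // variant_get '$.b[1]' path: array element access by index
    System.out.println(v.getFieldByKey("b").getElementAtIndex(1).getString());  // y

    // to_json path: render the whole variant back to JSON text
    System.out.println(v.toJson(ZoneOffset.UTC));                               // {"a":1,"b":["x","y"]}
  }
}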
diff --git a/ql/src/test/results/clientpositive/llap/show_functions.q.out b/ql/src/test/results/clientpositive/llap/show_functions.q.out index 9150f13015b0..cb8955e3808c 100644 --- a/ql/src/test/results/clientpositive/llap/show_functions.q.out +++ b/ql/src/test/results/clientpositive/llap/show_functions.q.out @@ -296,6 +296,7 @@ nullif nvl octet_length or +parse_json parse_url parse_url_tuple percent_rank @@ -459,6 +460,7 @@ tan tanh to_date to_epoch_milli +to_json to_unix_timestamp to_utc_timestamp toarray @@ -467,6 +469,7 @@ tostruct translate trim trunc +try_variant_get tumbling_window typeof ucase @@ -486,6 +489,7 @@ validate_acid_sort_order var_pop var_samp variance +variant_get version weekofyear when @@ -931,6 +935,7 @@ nullif nvl octet_length or +parse_json parse_url parse_url_tuple percent_rank @@ -1094,6 +1099,7 @@ tan tanh to_date to_epoch_milli +to_json to_unix_timestamp to_utc_timestamp toarray @@ -1102,6 +1108,7 @@ tostruct translate trim trunc +try_variant_get tumbling_window typeof ucase @@ -1121,6 +1128,7 @@ validate_acid_sort_order var_pop var_samp variance +variant_get version weekofyear when diff --git a/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java b/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java index bde6aace7ef2..20e607a67bdb 100644 --- a/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java +++ b/serde/src/gen/thrift/gen-javabean/org/apache/hadoop/hive/serde/serdeConstants.java @@ -93,6 +93,8 @@ public class serdeConstants { public static final java.lang.String UNION_TYPE_NAME = "uniontype"; + public static final java.lang.String VARIANT_TYPE_NAME = "variant"; + public static final java.lang.String LIST_COLUMNS = "columns"; public static final java.lang.String LIST_COLUMN_TYPES = "columns.types"; diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspector.java b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspector.java index 99b565dd3fcb..8e4183b615f4 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspector.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspector.java @@ -46,8 +46,8 @@ public interface ObjectInspector extends Cloneable { * Category. 
* */ - public static enum Category { - PRIMITIVE, LIST, MAP, STRUCT, UNION + enum Category { + PRIMITIVE, LIST, MAP, STRUCT, UNION, VARIANT }; /** diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorFactory.java b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorFactory.java index 1c3cad2d33b7..8e251d19bb3b 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorFactory.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/ObjectInspectorFactory.java @@ -363,6 +363,10 @@ public static UnionStructObjectInspector getUnionStructObjectInspector( return result; } + public static ObjectInspector getVariantObjectInspector() { + return VariantObjectInspector.get(); + } + public static ColumnarStructObjectInspector getColumnarStructObjectInspector( List structFieldNames, List structFieldObjectInspectors) { diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/VariantObjectInspector.java b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/VariantObjectInspector.java new file mode 100644 index 000000000000..c2def602bb23 --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/objectinspector/VariantObjectInspector.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hadoop.hive.serde2.objectinspector; + +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import java.util.List; + +public class VariantObjectInspector extends StandardStructObjectInspector { + + private static final ObjectInspector INSTANCE = new VariantObjectInspector(); + + protected VariantObjectInspector() { + super(List.of("metadata", "value"), createObjectInspectors()); + } + + public static ObjectInspector get() { + return INSTANCE; + } + + private static List createObjectInspectors() { + return List.of( + PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector, + PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector + ); + } +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoFactory.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoFactory.java index baafb7509778..977bbd0277f9 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoFactory.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoFactory.java @@ -241,6 +241,10 @@ public static TypeInfo getUnionTypeInfo(List typeInfos) { return result; } + public static TypeInfo getVariantTypeInfo() { + return VariantTypeInfo.get(); + } + static ConcurrentHashMap cachedListTypeInfo = new ConcurrentHashMap(); public static TypeInfo getListTypeInfo(TypeInfo elementTypeInfo) { diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java index 4517dbb200c1..581437194db4 100644 --- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java +++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/TypeInfoUtils.java @@ -398,6 +398,7 @@ private Token expect(String item, String alternative) { && !serdeConstants.MAP_TYPE_NAME.equals(t.text) && !serdeConstants.STRUCT_TYPE_NAME.equals(t.text) && !serdeConstants.UNION_TYPE_NAME.equals(t.text) + && !serdeConstants.VARIANT_TYPE_NAME.equals(t.text) && null == PrimitiveObjectInspectorUtils .getTypeEntryFromTypeName(t.text) && !t.text.equals(alternative)) { @@ -568,6 +569,11 @@ private TypeInfo parseType() { return TypeInfoFactory.getUnionTypeInfo(objectTypeInfos); } + // Is this a variant type? + if (serdeConstants.VARIANT_TYPE_NAME.equals(t.text)) { + return TypeInfoFactory.getVariantTypeInfo(); + } + throw new RuntimeException("Internal error parsing position " + t.position + " of '" + typeInfoString + "'"); } diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/VariantTypeInfo.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/VariantTypeInfo.java new file mode 100644 index 000000000000..e00b8f9b69db --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/VariantTypeInfo.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.typeinfo; + +import org.apache.hadoop.hive.common.classification.InterfaceAudience; +import org.apache.hadoop.hive.common.classification.InterfaceStability; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; + +import java.io.Serial; +import java.io.Serializable; +import java.util.Objects; + +/** + * VariantTypeInfo represents the TypeInfo of a Variant type in Hive. + *

+ * A Variant type is a flexible data type that can store values of different types + * in a single column. It is particularly useful for semi-structured data like JSON + * where the actual type of a value may not be known at schema definition time. + *

+ * The Variant type is primarily used in conjunction with Apache Iceberg tables + * and provides a way to store polymorphic data efficiently. When used with Iceberg, + * variant values are internally represented as a struct containing: + *

+ * <ul>
+ *   <li>metadata: Binary metadata describing the actual type of the value</li>
+ *   <li>value: Binary representation of the actual value</li>
+ * </ul>
+ */ +@InterfaceAudience.Public +@InterfaceStability.Stable +public final class VariantTypeInfo extends TypeInfo implements Serializable { + + @Serial + private static final long serialVersionUID = 1L; + + private static final VariantTypeInfo INSTANCE = new VariantTypeInfo(); + + public static VariantTypeInfo get() { + return INSTANCE; + } + + @Override + public Category getCategory() { + return Category.VARIANT; + } + + @Override + public String getTypeName() { + return "variant"; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + return o instanceof VariantTypeInfo; + } + + @Override + public int hashCode() { + return Objects.hash(VariantTypeInfo.class, getTypeName()); + } +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/variant/Variant.java b/serde/src/java/org/apache/hadoop/hive/serde2/variant/Variant.java new file mode 100644 index 000000000000..2f07ccb2ba24 --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/variant/Variant.java @@ -0,0 +1,373 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.variant; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; + +import java.io.CharArrayWriter; +import java.io.IOException; +import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; +import java.time.temporal.ChronoUnit; +import java.util.Arrays; +import java.util.Base64; +import java.util.List; +import java.util.Locale; +import java.util.UUID; + +import static org.apache.hadoop.hive.serde2.variant.VariantUtil.*; + +public final class Variant { + final byte[] value; + final byte[] metadata; + // The variant value doesn't use the whole `value` binary, but starts from its `pos` index and + // spans a size of `valueSize(value, pos)`. This design avoids frequent copies of the value binary + // when reading a sub-variant in the array/object element. + final int pos; + + public Variant(byte[] value, byte[] metadata) { + this(value, metadata, 0); + } + + public static Variant from(List data) { + if (data == null || data.size() != 2) { + throw malformedVariant(); + } + byte[] metadata = convertToByteArray(data.get(0)); + byte[] value = convertToByteArray(data.get(1)); + + return new Variant(value, metadata); + } + + Variant(byte[] value, byte[] metadata, int pos) { + this.value = value; + this.metadata = metadata; + this.pos = pos; + // There is currently only one allowed version. 
+ if (metadata.length < 1 || (metadata[0] & VERSION_MASK) != VERSION) { + throw malformedVariant(); + } + // Don't attempt to use a Variant larger than 16 MiB. We'll never produce one, and it risks + // memory instability. + if (metadata.length > SIZE_LIMIT || value.length > SIZE_LIMIT) { + throw variantConstructorSizeLimit(); + } + } + + private static byte[] convertToByteArray(Object obj) { + if (obj == null) { + throw malformedVariant(); + } + return switch (obj) { + case byte[] bytes -> bytes; + case org.apache.hadoop.io.BytesWritable bytesWritable -> bytesWritable.getBytes(); + case org.apache.hadoop.io.Text text -> text.getBytes(); + default -> + throw new IllegalArgumentException("Unsupported type for Variant field: " + obj.getClass()); + }; + } + + public byte[] getValue() { + if (pos == 0) return value; + int size = valueSize(value, pos); + checkIndex(pos + size - 1, value.length); + return Arrays.copyOfRange(value, pos, pos + size); + } + + public byte[] getMetadata() { + return metadata; + } + + // Get a boolean value from the variant. + public boolean getBoolean() { + return VariantUtil.getBoolean(value, pos); + } + + // Get a long value from the variant. + public long getLong() { + return VariantUtil.getLong(value, pos); + } + + // Get a double value from the variant. + public double getDouble() { + return VariantUtil.getDouble(value, pos); + } + + // Get a decimal value from the variant. + public BigDecimal getDecimal() { + return VariantUtil.getDecimal(value, pos); + } + + // Get a float value from the variant. + public float getFloat() { + return VariantUtil.getFloat(value, pos); + } + + // Get a binary value from the variant. + public byte[] getBinary() { + return VariantUtil.getBinary(value, pos); + } + + // Get a string value from the variant. + public String getString() { + return VariantUtil.getString(value, pos); + } + + // Get the type info bits from a variant value. + public int getTypeInfo() { + return VariantUtil.getTypeInfo(value, pos); + } + + // Get the value type of the variant. + public Type getType() { + return VariantUtil.getType(value, pos); + } + + // Get a UUID value from the variant. + public UUID getUuid() { + return VariantUtil.getUuid(value, pos); + } + + // Get the number of object fields in the variant. + // It is only legal to call it when `getType()` is `Type.OBJECT`. + public int objectSize() { + return handleObject(value, pos, + (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> size); + } + + // Find the field value whose key is equal to `key`. Return null if the key is not found. + // It is only legal to call it when `getType()` is `Type.OBJECT`. + public Variant getFieldByKey(String key) { + return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + // Use linear search for a short list. Switch to binary search when the length reaches + // `BINARY_SEARCH_THRESHOLD`. + final int BINARY_SEARCH_THRESHOLD = 32; + if (size < BINARY_SEARCH_THRESHOLD) { + for (int i = 0; i < size; ++i) { + int id = readUnsigned(value, idStart + idSize * i, idSize); + if (key.equals(getMetadataKey(metadata, id))) { + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + return new Variant(value, metadata, dataStart + offset); + } + } + } else { + int low = 0; + int high = size - 1; + while (low <= high) { + // Use unsigned right shift to compute the middle of `low` and `high`. 
This is not only a + // performance optimization, because it can properly handle the case where `low + high` + // overflows int. + int mid = (low + high) >>> 1; + int id = readUnsigned(value, idStart + idSize * mid, idSize); + int cmp = getMetadataKey(metadata, id).compareTo(key); + if (cmp < 0) { + low = mid + 1; + } else if (cmp > 0) { + high = mid - 1; + } else { + int offset = readUnsigned(value, offsetStart + offsetSize * mid, offsetSize); + return new Variant(value, metadata, dataStart + offset); + } + } + } + return null; + }); + } + + public record ObjectField(String key, Variant value) { + } + + // Get the object field at the `index` slot. Return null if `index` is out of the bound of + // `[0, objectSize())`. + // It is only legal to call it when `getType()` is `Type.OBJECT`. + public ObjectField getFieldAtIndex(int index) { + return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + if (index < 0 || index >= size) return null; + int id = readUnsigned(value, idStart + idSize * index, idSize); + int offset = readUnsigned(value, offsetStart + offsetSize * index, offsetSize); + String key = getMetadataKey(metadata, id); + Variant v = new Variant(value, metadata, dataStart + offset); + return new ObjectField(key, v); + }); + } + + // Get the dictionary ID for the object field at the `index` slot. Throws malformedVariant if + // `index` is out of the bound of `[0, objectSize())`. + // It is only legal to call it when `getType()` is `Type.OBJECT`. + public int getDictionaryIdAtIndex(int index) { + return handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + if (index < 0 || index >= size) { + throw malformedVariant(); + } + return readUnsigned(value, idStart + idSize * index, idSize); + }); + } + + // Get the number of array elements in the variant. + // It is only legal to call it when `getType()` is `Type.ARRAY`. + public int arraySize() { + return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> size); + } + + // Get the array element at the `index` slot. Return null if `index` is out of the bound of + // `[0, arraySize())`. + // It is only legal to call it when `getType()` is `Type.ARRAY`. + public Variant getElementAtIndex(int index) { + return handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { + if (index < 0 || index >= size) return null; + int offset = readUnsigned(value, offsetStart + offsetSize * index, offsetSize); + return new Variant(value, metadata, dataStart + offset); + }); + } + + // Stringify the variant in JSON format. + // Throw `MALFORMED_VARIANT` if the variant is malformed. + public String toJson(ZoneId zoneId) { + StringBuilder sb = new StringBuilder(); + toJsonImpl(value, metadata, pos, sb, zoneId); + return sb.toString(); + } + + // Escape a string so that it can be pasted into JSON structure. + // For example, if `str` only contains a new-line character, then the result content is "\n" + // (4 characters). + static String escapeJson(String str) { + try (CharArrayWriter writer = new CharArrayWriter(); + JsonGenerator gen = new JsonFactory().createGenerator(writer)) { + gen.writeString(str); + gen.flush(); + return writer.toString(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + // A simplified and more performant version of `sb.append(escapeJson(str))`. It is used when we + // know `str` doesn't contain any special character that needs escaping. 
+ static void appendQuoted(StringBuilder sb, String str) { + sb.append('"'); + sb.append(str); + sb.append('"'); + } + + private static final DateTimeFormatter TIMESTAMP_NTZ_FORMATTER = new DateTimeFormatterBuilder() + .append(DateTimeFormatter.ISO_LOCAL_DATE) + .appendLiteral(' ') + .append(DateTimeFormatter.ISO_LOCAL_TIME) + .toFormatter(Locale.US); + + private static final DateTimeFormatter TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder() + .append(TIMESTAMP_NTZ_FORMATTER) + .appendOffset("+HH:MM", "+00:00") + .toFormatter(Locale.US); + + private static Instant microsToInstant(long timestamp) { + return Instant.EPOCH.plus(timestamp, ChronoUnit.MICROS); + } + + static void toJsonImpl(byte[] value, byte[] metadata, int pos, StringBuilder sb, ZoneId zoneId) { + switch (VariantUtil.getType(value, pos)) { + case OBJECT: + handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> { + sb.append('{'); + for (int i = 0; i < size; ++i) { + int id = readUnsigned(value, idStart + idSize * i, idSize); + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + if (i != 0) sb.append(','); + sb.append(escapeJson(getMetadataKey(metadata, id))); + sb.append(':'); + toJsonImpl(value, metadata, elementPos, sb, zoneId); + } + sb.append('}'); + return null; + }); + break; + case ARRAY: + handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> { + sb.append('['); + for (int i = 0; i < size; ++i) { + int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize); + int elementPos = dataStart + offset; + if (i != 0) sb.append(','); + toJsonImpl(value, metadata, elementPos, sb, zoneId); + } + sb.append(']'); + return null; + }); + break; + case NULL: + sb.append("null"); + break; + case BOOLEAN: + sb.append(VariantUtil.getBoolean(value, pos)); + break; + case LONG: + sb.append(VariantUtil.getLong(value, pos)); + break; + case STRING: + sb.append(escapeJson(VariantUtil.getString(value, pos))); + break; + case DOUBLE: { + double d = VariantUtil.getDouble(value, pos); + if (Double.isFinite(d)) { + sb.append(d); + } else { + appendQuoted(sb, Double.toString(d)); + } + break; + } + case DECIMAL: + sb.append(VariantUtil.getDecimal(value, pos).toPlainString()); + break; + case DATE: + appendQuoted(sb, LocalDate.ofEpochDay((int) VariantUtil.getLong(value, pos)).toString()); + break; + case TIMESTAMP: + appendQuoted(sb, TIMESTAMP_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(zoneId))); + break; + case TIMESTAMP_NTZ: + appendQuoted(sb, TIMESTAMP_NTZ_FORMATTER.format( + microsToInstant(VariantUtil.getLong(value, pos)).atZone(ZoneOffset.UTC))); + break; + case FLOAT: { + float f = VariantUtil.getFloat(value, pos); + if (Float.isFinite(f)) { + sb.append(f); + } else { + appendQuoted(sb, Float.toString(f)); + } + break; + } + case BINARY: + appendQuoted(sb, Base64.getEncoder().encodeToString(VariantUtil.getBinary(value, pos))); + break; + case UUID: + appendQuoted(sb, VariantUtil.getUuid(value, pos).toString()); + break; + } + } +} diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/variant/VariantBuilder.java b/serde/src/java/org/apache/hadoop/hive/serde2/variant/VariantBuilder.java new file mode 100644 index 000000000000..88e9240140dc --- /dev/null +++ b/serde/src/java/org/apache/hadoop/hive/serde2/variant/VariantBuilder.java @@ -0,0 +1,589 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.serde2.variant; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import java.util.Arrays; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import java.util.Collections; +import java.util.Comparator; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.core.exc.InputCoercionException; +import com.google.common.collect.ImmutableMap; + +import static org.apache.hadoop.hive.serde2.variant.VariantUtil.*; + +/** + * Build variant value and metadata by parsing JSON values. + */ +public class VariantBuilder { + + // The write buffer in building the variant value. Its first `writePos` bytes has been written. + private byte[] writeBuffer = new byte[128]; + private int writePos = 0; + // Map keys to a monotonically increasing id. + private final Map dictionary = new HashMap<>(); + // Store all keys in `dictionary` in the order of id. + private final List dictionaryKeys = new ArrayList<>(); + private final boolean allowDuplicateKeys; + + public VariantBuilder(boolean allowDuplicateKeys) { + this.allowDuplicateKeys = allowDuplicateKeys; + } + + /** + * Parse a JSON string as a Variant value. + * @throws VariantSizeLimitException if the resulting variant value or metadata would exceed + * the SIZE_LIMIT (for example, this could be a maximum of 16 MiB). + * @throws IOException if any JSON parsing error happens. + */ + public static Variant parseJson(String json, boolean allowDuplicateKeys) throws IOException { + try (JsonParser parser = new JsonFactory().createParser(json)) { + parser.nextToken(); + return parseJson(parser, allowDuplicateKeys); + } + } + + /** + * Similar {@link #parseJson(String, boolean)}, but takes a JSON parser instead of string input. + */ + public static Variant parseJson(JsonParser parser, boolean allowDuplicateKeys) + throws IOException { + VariantBuilder builder = new VariantBuilder(allowDuplicateKeys); + builder.buildJson(parser); + return builder.result(); + } + + // Build the variant metadata from `dictionaryKeys` and return the variant result. + public Variant result() { + int numKeys = dictionaryKeys.size(); + // Use long to avoid overflow in accumulating lengths. + long dictionaryStringSize = 0; + for (byte[] key : dictionaryKeys) { + dictionaryStringSize += key.length; + } + // Determine the number of bytes required per offset entry. + // The largest offset is the one-past-the-end value, which is total string size. 
It's very + // unlikely that the number of keys could be larger, but incorporate that into the calcualtion + // in case of pathological data. + long maxSize = Math.max(dictionaryStringSize, numKeys); + if (maxSize > SIZE_LIMIT) { + throw new VariantSizeLimitException(); + } + int offsetSize = getIntegerSize((int)maxSize); + + int offsetStart = 1 + offsetSize; + int stringStart = offsetStart + (numKeys + 1) * offsetSize; + long metadataSize = stringStart + dictionaryStringSize; + + if (metadataSize > SIZE_LIMIT) { + throw new VariantSizeLimitException(); + } + byte[] metadata = new byte[(int) metadataSize]; + int headerByte = VERSION | ((offsetSize - 1) << 6); + writeLong(metadata, 0, headerByte, 1); + writeLong(metadata, 1, numKeys, offsetSize); + int currentOffset = 0; + for (int i = 0; i < numKeys; ++i) { + writeLong(metadata, offsetStart + i * offsetSize, currentOffset, offsetSize); + byte[] key = dictionaryKeys.get(i); + System.arraycopy(key, 0, metadata, stringStart + currentOffset, key.length); + currentOffset += key.length; + } + writeLong(metadata, offsetStart + numKeys * offsetSize, currentOffset, offsetSize); + return new Variant(Arrays.copyOfRange(writeBuffer, 0, writePos), metadata); + } + + // Return the variant value only, without metadata. + // Used in shredding to produce a final value, where all shredded values refer to a common + // metadata. It is expected to be called instead of `result()`, although it is valid to call both + // methods, in any order. + public byte[] valueWithoutMetadata() { + return Arrays.copyOfRange(writeBuffer, 0, writePos); + } + + public void appendString(String str) { + byte[] text = str.getBytes(StandardCharsets.UTF_8); + boolean longStr = text.length > MAX_SHORT_STR_SIZE; + checkCapacity((longStr ? 1 + U32_SIZE : 1) + text.length); + if (longStr) { + writeBuffer[writePos++] = primitiveHeader(LONG_STR); + writeLong(writeBuffer, writePos, text.length, U32_SIZE); + writePos += U32_SIZE; + } else { + writeBuffer[writePos++] = shortStrHeader(text.length); + } + System.arraycopy(text, 0, writeBuffer, writePos, text.length); + writePos += text.length; + } + + public void appendNull() { + checkCapacity(1); + writeBuffer[writePos++] = primitiveHeader(NULL); + } + + public void appendBoolean(boolean b) { + checkCapacity(1); + writeBuffer[writePos++] = primitiveHeader(b ? TRUE : FALSE); + } + + // Append a long value to the variant builder. The actual used integer type depends on the value + // range of the long value. + public void appendLong(long l) { + checkCapacity(1 + 8); + if (l == (byte) l) { + writeBuffer[writePos++] = primitiveHeader(INT1); + writeLong(writeBuffer, writePos, l, 1); + writePos += 1; + } else if (l == (short) l) { + writeBuffer[writePos++] = primitiveHeader(INT2); + writeLong(writeBuffer, writePos, l, 2); + writePos += 2; + } else if (l == (int) l) { + writeBuffer[writePos++] = primitiveHeader(INT4); + writeLong(writeBuffer, writePos, l, 4); + writePos += 4; + } else { + writeBuffer[writePos++] = primitiveHeader(INT8); + writeLong(writeBuffer, writePos, l, 8); + writePos += 8; + } + } + + public void appendDouble(double d) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = primitiveHeader(DOUBLE); + writeLong(writeBuffer, writePos, Double.doubleToLongBits(d), 8); + writePos += 8; + } + + // Append a decimal value to the variant builder. The caller should guarantee that its precision + // and scale fit into `MAX_DECIMAL16_PRECISION`. 
+ public void appendDecimal(BigDecimal d) { + checkCapacity(2 + 16); + BigInteger unscaled = d.unscaledValue(); + if (d.scale() <= MAX_DECIMAL4_PRECISION && d.precision() <= MAX_DECIMAL4_PRECISION) { + writeBuffer[writePos++] = primitiveHeader(DECIMAL4); + writeBuffer[writePos++] = (byte) d.scale(); + writeLong(writeBuffer, writePos, unscaled.intValueExact(), 4); + writePos += 4; + } else if (d.scale() <= MAX_DECIMAL8_PRECISION && d.precision() <= MAX_DECIMAL8_PRECISION) { + writeBuffer[writePos++] = primitiveHeader(DECIMAL8); + writeBuffer[writePos++] = (byte) d.scale(); + writeLong(writeBuffer, writePos, unscaled.longValueExact(), 8); + writePos += 8; + } else { + assert d.scale() <= MAX_DECIMAL16_PRECISION && d.precision() <= MAX_DECIMAL16_PRECISION; + writeBuffer[writePos++] = primitiveHeader(DECIMAL16); + writeBuffer[writePos++] = (byte) d.scale(); + // `toByteArray` returns a big-endian representation. We need to copy it reversely and sign + // extend it to 16 bytes. + byte[] bytes = unscaled.toByteArray(); + for (int i = 0; i < bytes.length; ++i) { + writeBuffer[writePos + i] = bytes[bytes.length - 1 - i]; + } + byte sign = (byte) (bytes[0] < 0 ? -1 : 0); + for (int i = bytes.length; i < 16; ++i) { + writeBuffer[writePos + i] = sign; + } + writePos += 16; + } + } + + public void appendDate(int daysSinceEpoch) { + checkCapacity(1 + 4); + writeBuffer[writePos++] = primitiveHeader(DATE); + writeLong(writeBuffer, writePos, daysSinceEpoch, 4); + writePos += 4; + } + + public void appendTimestamp(long microsSinceEpoch) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = primitiveHeader(TIMESTAMP); + writeLong(writeBuffer, writePos, microsSinceEpoch, 8); + writePos += 8; + } + + public void appendTimestampNtz(long microsSinceEpoch) { + checkCapacity(1 + 8); + writeBuffer[writePos++] = primitiveHeader(TIMESTAMP_NTZ); + writeLong(writeBuffer, writePos, microsSinceEpoch, 8); + writePos += 8; + } + + public void appendFloat(float f) { + checkCapacity(1 + 4); + writeBuffer[writePos++] = primitiveHeader(FLOAT); + writeLong(writeBuffer, writePos, Float.floatToIntBits(f), 4); + writePos += 4; + } + + public void appendBinary(byte[] binary) { + checkCapacity(1 + U32_SIZE + binary.length); + writeBuffer[writePos++] = primitiveHeader(BINARY); + writeLong(writeBuffer, writePos, binary.length, U32_SIZE); + writePos += U32_SIZE; + System.arraycopy(binary, 0, writeBuffer, writePos, binary.length); + writePos += binary.length; + } + + public void appendUuid(UUID uuid) { + checkCapacity(1 + 16); + writeBuffer[writePos++] = primitiveHeader(UUID); + + // UUID is stored big-endian, so don't use writeLong. + ByteBuffer buffer = ByteBuffer.wrap(writeBuffer, writePos, 16); + buffer.order(ByteOrder.BIG_ENDIAN); + buffer.putLong(writePos, uuid.getMostSignificantBits()); + buffer.putLong(writePos + 8, uuid.getLeastSignificantBits()); + writePos += 16; + } + + // Add a key to the variant dictionary. If the key already exists, the dictionary is not modified. + // In either case, return the id of the key. + public int addKey(String key) { + int id; + if (dictionary.containsKey(key)) { + id = dictionary.get(key); + } else { + id = dictionaryKeys.size(); + dictionary.put(key, id); + dictionaryKeys.add(key.getBytes(StandardCharsets.UTF_8)); + } + return id; + } + + // Return the current write position of the variant builder. It is used together with + // `finishWritingObject` or `finishWritingArray`. 
+ public int getWritePos() { + return writePos; + } + + // Finish writing a variant object after all of its fields have already been written. The process + // is as follows: + // 1. The caller calls `getWritePos` before writing any fields to obtain the `start` parameter. + // 2. The caller appends all the object fields to the builder. In the meantime, it should maintain + // the `fields` parameter. Before appending each field, it should append an entry to `fields` to + // record the offset of the field. The offset is computed as `getWritePos() - start`. + // 3. The caller calls `finishWritingObject` to finish writing a variant object. + // + // This function is responsible to sort the fields by key. If there are duplicate field keys: + // - when `allowDuplicateKeys` is true, the field with the greatest offset value (the last + // appended one) is kept. + // - otherwise, throw an exception. + public void finishWritingObject(int start, ArrayList fields) { + int size = fields.size(); + Collections.sort(fields); + int maxId = size == 0 ? 0 : fields.getFirst().id; + if (allowDuplicateKeys) { + int distinctPos = 0; + // Maintain a list of distinct keys in-place. + for (int i = 1; i < size; ++i) { + maxId = Math.max(maxId, fields.get(i).id); + if (fields.get(i).id == fields.get(i - 1).id) { + // Found a duplicate key. Keep the field with a greater offset. + if (fields.get(distinctPos).offset < fields.get(i).offset) { + fields.set(distinctPos, fields.get(distinctPos).withNewOffset(fields.get(i).offset)); + } + } else { + // Found a distinct key. Add the field to the list. + ++distinctPos; + fields.set(distinctPos, fields.get(i)); + } + } + if (distinctPos + 1 < fields.size()) { + size = distinctPos + 1; + // Resize `fields` to `size`. + fields.subList(size, fields.size()).clear(); + // Sort the fields by offsets so that we can move the value data of each field to the new + // offset without overwriting the fields after it. + fields.sort(Comparator.comparingInt(f -> f.offset)); + int currentOffset = 0; + for (int i = 0; i < size; ++i) { + int oldOffset = fields.get(i).offset; + int fieldSize = VariantUtil.valueSize(writeBuffer, start + oldOffset); + System.arraycopy(writeBuffer, start + oldOffset, + writeBuffer, start + currentOffset, fieldSize); + fields.set(i, fields.get(i).withNewOffset(currentOffset)); + currentOffset += fieldSize; + } + writePos = start + currentOffset; + // Change back to the sort order by field keys to meet the variant spec. + Collections.sort(fields); + } + } else { + for (int i = 1; i < size; ++i) { + maxId = Math.max(maxId, fields.get(i).id); + String key = fields.get(i).key; + if (key.equals(fields.get(i - 1).key)) { + throw new RuntimeException("VARIANT_DUPLICATE_KEY" + ImmutableMap.of(key, key)); + } + } + } + int dataSize = writePos - start; + boolean largeSize = size > U8_MAX; + int sizeBytes = largeSize ? U32_SIZE : 1; + int idSize = getIntegerSize(maxId); + int offsetSize = getIntegerSize(dataSize); + // The space for header byte, object size, id list, and offset list. + int headerSize = 1 + sizeBytes + size * idSize + (size + 1) * offsetSize; + checkCapacity(headerSize); + // Shift the just-written field data to make room for the object header section. 
+    System.arraycopy(writeBuffer, start, writeBuffer, start + headerSize, dataSize);
+    writePos += headerSize;
+    writeBuffer[start] = objectHeader(largeSize, idSize, offsetSize);
+    writeLong(writeBuffer, start + 1, size, sizeBytes);
+    int idStart = start + 1 + sizeBytes;
+    int offsetStart = idStart + size * idSize;
+    for (int i = 0; i < size; ++i) {
+      writeLong(writeBuffer, idStart + i * idSize, fields.get(i).id, idSize);
+      writeLong(writeBuffer, offsetStart + i * offsetSize, fields.get(i).offset, offsetSize);
+    }
+    writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize);
+  }
+
+  // Finish writing a variant array after all of its elements have already been written. The process
+  // is similar to that of `finishWritingObject`.
+  public void finishWritingArray(int start, List<Integer> offsets) {
+    int dataSize = writePos - start;
+    int size = offsets.size();
+    boolean largeSize = size > U8_MAX;
+    int sizeBytes = largeSize ? U32_SIZE : 1;
+    int offsetSize = getIntegerSize(dataSize);
+    // The space for header byte, array size, and offset list.
+    int headerSize = 1 + sizeBytes + (size + 1) * offsetSize;
+    checkCapacity(headerSize);
+    // Shift the just-written element data to make room for the header section.
+    System.arraycopy(writeBuffer, start, writeBuffer, start + headerSize, dataSize);
+    writePos += headerSize;
+    writeBuffer[start] = arrayHeader(largeSize, offsetSize);
+    writeLong(writeBuffer, start + 1, size, sizeBytes);
+    int offsetStart = start + 1 + sizeBytes;
+    for (int i = 0; i < size; ++i) {
+      writeLong(writeBuffer, offsetStart + i * offsetSize, offsets.get(i), offsetSize);
+    }
+    writeLong(writeBuffer, offsetStart + size * offsetSize, dataSize, offsetSize);
+  }
+
+  // Append a variant value to the variant builder. We need to insert the keys in the input variant
+  // into the current variant dictionary and rebuild it with new field ids. For scalar values in the
+  // input variant, we can directly copy the binary slice.
+  public void appendVariant(Variant v) {
+    appendVariantImpl(v.value, v.metadata, v.pos);
+  }
+
+  private void appendVariantImpl(byte[] value, byte[] metadata, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    switch (basicType) {
+      case OBJECT:
+        handleObject(value, pos, (size, idSize, offsetSize, idStart, offsetStart, dataStart) -> {
+          ArrayList<FieldEntry> fields = new ArrayList<>(size);
+          int start = writePos;
+          for (int i = 0; i < size; ++i) {
+            int id = readUnsigned(value, idStart + idSize * i, idSize);
+            int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize);
+            int elementPos = dataStart + offset;
+            String key = getMetadataKey(metadata, id);
+            int newId = addKey(key);
+            fields.add(new FieldEntry(key, newId, writePos - start));
+            appendVariantImpl(value, metadata, elementPos);
+          }
+          finishWritingObject(start, fields);
+          return null;
+        });
+        break;
+      case ARRAY:
+        handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) -> {
+          ArrayList<Integer> offsets = new ArrayList<>(size);
+          int start = writePos;
+          for (int i = 0; i < size; ++i) {
+            int offset = readUnsigned(value, offsetStart + offsetSize * i, offsetSize);
+            int elementPos = dataStart + offset;
+            offsets.add(writePos - start);
+            appendVariantImpl(value, metadata, elementPos);
+          }
+          finishWritingArray(start, offsets);
+          return null;
+        });
+        break;
+      default:
+        shallowAppendVariantImpl(value, pos);
+        break;
+    }
+  }
+
+  // Append the variant value without rewriting or creating any metadata. This is used when
+  // building an object during shredding, where there is a fixed pre-existing metadata that
+  // all shredded values will refer to.
+  public void shallowAppendVariant(Variant v) {
+    shallowAppendVariantImpl(v.value, v.pos);
+  }
+
+  private void shallowAppendVariantImpl(byte[] value, int pos) {
+    int size = valueSize(value, pos);
+    checkIndex(pos + size - 1, value.length);
+    checkCapacity(size);
+    System.arraycopy(value, pos, writeBuffer, writePos, size);
+    writePos += size;
+  }
+
+  private void checkCapacity(int additional) {
+    int required = writePos + additional;
+    if (required > writeBuffer.length) {
+      // Allocate a new buffer with a capacity of the next power of 2 of `required`.
+      int newCapacity = Integer.highestOneBit(required);
+      newCapacity = newCapacity < required ? newCapacity * 2 : newCapacity;
+      if (newCapacity > SIZE_LIMIT) {
+        throw new VariantSizeLimitException();
+      }
+      byte[] newValue = new byte[newCapacity];
+      System.arraycopy(writeBuffer, 0, newValue, 0, writePos);
+      writeBuffer = newValue;
+    }
+  }
+
+  // Temporarily store the information of a field. We need to collect all fields in a JSON object,
+  // sort them by their keys, and build the variant object in sorted order.
+  public static final class FieldEntry implements Comparable<FieldEntry> {
+    final String key;
+    final int id;
+    final int offset;
+
+    public FieldEntry(String key, int id, int offset) {
+      this.key = key;
+      this.id = id;
+      this.offset = offset;
+    }
+
+    FieldEntry withNewOffset(int newOffset) {
+      return new FieldEntry(key, id, newOffset);
+    }
+
+    @Override
+    public int compareTo(FieldEntry other) {
+      return key.compareTo(other.key);
+    }
+  }
+
+  private void buildJson(JsonParser parser) throws IOException {
+    JsonToken token = parser.currentToken();
+    if (token == null) {
+      throw new JsonParseException(parser, "Unexpected null token");
+    }
+    switch (token) {
+      case START_OBJECT: {
+        ArrayList<FieldEntry> fields = new ArrayList<>();
+        int start = writePos;
+        while (parser.nextToken() != JsonToken.END_OBJECT) {
+          String key = parser.currentName();
+          parser.nextToken();
+          int id = addKey(key);
+          fields.add(new FieldEntry(key, id, writePos - start));
+          buildJson(parser);
+        }
+        finishWritingObject(start, fields);
+        break;
+      }
+      case START_ARRAY: {
+        ArrayList<Integer> offsets = new ArrayList<>();
+        int start = writePos;
+        while (parser.nextToken() != JsonToken.END_ARRAY) {
+          offsets.add(writePos - start);
+          buildJson(parser);
+        }
+        finishWritingArray(start, offsets);
+        break;
+      }
+      case VALUE_STRING:
+        appendString(parser.getText());
+        break;
+      case VALUE_NUMBER_INT:
+        try {
+          appendLong(parser.getLongValue());
+        } catch (InputCoercionException ignored) {
+          // If the value doesn't fit any integer type, parse it as decimal or floating instead.
+          parseFloatingPoint(parser);
+        }
+        break;
+      case VALUE_NUMBER_FLOAT:
+        parseFloatingPoint(parser);
+        break;
+      case VALUE_TRUE:
+        appendBoolean(true);
+        break;
+      case VALUE_FALSE:
+        appendBoolean(false);
+        break;
+      case VALUE_NULL:
+        appendNull();
+        break;
+      default:
+        throw new JsonParseException(parser, "Unexpected token " + token);
+    }
+  }
+
+  // Choose the smallest unsigned integer type that can store `value`. It must be within
+  // `[0, SIZE_LIMIT]`.
+  private int getIntegerSize(int value) {
+    assert value >= 0 && value <= SIZE_LIMIT;
+    if (value <= U8_MAX) return 1;
+    if (value <= U16_MAX) return 2;
+    if (value <= U24_MAX) return 3;
+    return 4;
+  }
+
+  private void parseFloatingPoint(JsonParser parser) throws IOException {
+    if (!tryParseDecimal(parser.getText())) {
+      appendDouble(parser.getDoubleValue());
+    }
+  }
+
+  // Try to parse a JSON number as a decimal. Return whether the parsing succeeds. The input must
+  // only use the decimal format (an integer value with an optional '.' in it) and must not use
+  // scientific notation. It also must fit into the precision limitation of decimal types.
+  private boolean tryParseDecimal(String input) {
+    for (int i = 0; i < input.length(); ++i) {
+      char ch = input.charAt(i);
+      if (ch != '-' && ch != '.' && !(ch >= '0' && ch <= '9')) {
+        return false;
+      }
+    }
+    BigDecimal d = new BigDecimal(input);
+    if (d.scale() <= MAX_DECIMAL16_PRECISION && d.precision() <= MAX_DECIMAL16_PRECISION) {
+      appendDecimal(d);
+      return true;
+    }
+    return false;
+  }
+}
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/variant/VariantSizeLimitException.java b/serde/src/java/org/apache/hadoop/hive/serde2/variant/VariantSizeLimitException.java
new file mode 100644
index 000000000000..ba0031d208cf
--- /dev/null
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/variant/VariantSizeLimitException.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.variant;
+
+/**
+ * An exception indicating that we are attempting to build a variant with its value or metadata
+ * exceeding the size limit.
+ */
+public class VariantSizeLimitException extends RuntimeException {
+}
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/variant/VariantUtil.java b/serde/src/java/org/apache/hadoop/hive/serde2/variant/VariantUtil.java
new file mode 100644
index 000000000000..0e4d54a25702
--- /dev/null
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/variant/VariantUtil.java
@@ -0,0 +1,589 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.serde2.variant;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.UUID;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import com.google.common.collect.ImmutableMap;
+
+/**
+ * This class defines constants related to the variant format and provides functions for
+ * manipulating variant binaries.
+
+ * A variant is made up of 2 binaries: value and metadata. A variant value consists of a one-byte
+ * header and a number of content bytes (can be zero). The header byte is divided into upper 6 bits
+ * (called "type info") and lower 2 bits (called "basic type"). The content format is explained in
+ * the below constants for all possible basic type and type info values.
+
+ * The variant metadata includes a version id and a dictionary of distinct strings (case-sensitive).
+ * Its binary format is:
+ * - Version: 1-byte unsigned integer. The only acceptable value is 1 currently.
+ * - Dictionary size: 4-byte little-endian unsigned integer. The number of keys in the
+ *   dictionary.
+ * - Offsets: (size + 1) * 4-byte little-endian unsigned integers. `offsets[i]` represents the
+ *   starting position of string i, counting starting from the address of `offsets[0]`. Strings
+ *   must be stored contiguously, so we don't need to store the string size; instead, we compute it
+ *   with `offset[i + 1] - offset[i]`.
+ * - UTF-8 string data.
+ */
+public class VariantUtil {
+
+  public static final int BASIC_TYPE_BITS = 2;
+  public static final int BASIC_TYPE_MASK = 0x3;
+  public static final int TYPE_INFO_MASK = 0x3F;
+  // The inclusive maximum value of the type info value. It is the size limit of `SHORT_STR`.
+  public static final int MAX_SHORT_STR_SIZE = 0x3F;
+
+  // Below are all the possible basic type values.
+  // Primitive value. The type info value must be one of the values in the below section.
+  public static final int PRIMITIVE = 0;
+  // Short string value. The type info value is the string size, which must be in `[0,
+  // MAX_SHORT_STR_SIZE]`.
+  // The string content bytes directly follow the header byte.
+  public static final int SHORT_STR = 1;
+  // Object value. The content contains a size, a list of field ids, a list of field offsets, and
+  // the actual field data. The length of the id list is `size`, while the length of the offset
+  // list is `size + 1`, where the last offset represents the total size of the field data. The
+  // fields in an object must be sorted by the field name in alphabetical order. Duplicate field
+  // names in one object are not allowed.
+  // We use 5 bits in the type info to specify the integer type of the object header: it should
+  // be 0_b4_b3b2_b1b0 (MSB is 0), where:
+  // - b4 specifies the type of size. When it is 0/1, `size` is a little-endian 1/4-byte
+  //   unsigned integer.
+  // - b3b2/b1b0 specifies the integer type of id and offset. When the 2 bits are 0/1/2/3, the
+  //   list contains 1/2/3/4-byte little-endian unsigned integers.
+  public static final int OBJECT = 2;
+  // Array value. The content contains a size, a list of field offsets, and the actual element
+  // data. It is similar to an object without the id list. The length of the offset list
+  // is `size + 1`, where the last offset represents the total size of the element data.
+  // Its type info should be: 000_b2_b1b0:
+  // - b2 specifies the type of size.
+  // - b1b0 specifies the integer type of offset.
+  public static final int ARRAY = 3;
+
+  // Below are all the possible type info values for `PRIMITIVE`.
+  // JSON Null value. Empty content.
+  public static final int NULL = 0;
+  // True value. Empty content.
+  public static final int TRUE = 1;
+  // False value. Empty content.
+  public static final int FALSE = 2;
+  // 1-byte little-endian signed integer.
+  public static final int INT1 = 3;
+  // 2-byte little-endian signed integer.
+  public static final int INT2 = 4;
+  // 4-byte little-endian signed integer.
+  public static final int INT4 = 5;
+  // 8-byte little-endian signed integer.
+  public static final int INT8 = 6;
+  // 8-byte IEEE double.
+  public static final int DOUBLE = 7;
+  // 4-byte decimal. Content is 1-byte scale + 4-byte little-endian signed integer.
+  public static final int DECIMAL4 = 8;
+  // 8-byte decimal. Content is 1-byte scale + 8-byte little-endian signed integer.
+  public static final int DECIMAL8 = 9;
+  // 16-byte decimal. Content is 1-byte scale + 16-byte little-endian signed integer.
+  public static final int DECIMAL16 = 10;
+  // Date value. Content is 4-byte little-endian signed integer that represents the number of days
+  // from the Unix epoch.
+  public static final int DATE = 11;
+  // Timestamp value. Content is 8-byte little-endian signed integer that represents the number of
+  // microseconds elapsed since the Unix epoch, 1970-01-01 00:00:00 UTC. It is displayed to users in
+  // their local time zones and may be displayed differently depending on the execution environment.
+  public static final int TIMESTAMP = 12;
+  // Timestamp_ntz value. It has the same content as `TIMESTAMP` but should always be interpreted
+  // as if the local time zone is UTC.
+  public static final int TIMESTAMP_NTZ = 13;
+  // 4-byte IEEE float.
+  public static final int FLOAT = 14;
+  // Binary value. The content is (4-byte little-endian unsigned integer representing the binary
+  // size) + (size bytes of binary content).
+  public static final int BINARY = 15;
+  // Long string value. The content is (4-byte little-endian unsigned integer representing the
+  // string size) + (size bytes of string content).
+  public static final int LONG_STR = 16;
+
+  // UUID, 16-byte big-endian.
+  public static final int UUID = 20;
+
+  public static final byte VERSION = 1;
+  // The lower 4 bits of the first metadata byte contain the version.
+  public static final byte VERSION_MASK = 0x0F;
+
+  public static final int U8_MAX = 0xFF;
+  public static final int U16_MAX = 0xFFFF;
+  public static final int U24_MAX = 0xFFFFFF;
+  public static final int U24_SIZE = 3;
+  public static final int U32_SIZE = 4;
+
+  // Both variant value and variant metadata need to be no longer than 128MiB.
+  // Note: to make tests more reliable, we set the max size to 16MiB to avoid OOM in tests.
+  public static final int SIZE_LIMIT =
+      isTesting() ? U24_MAX + 1 : 128 * 1024 * 1024;
+
+  public static final int MAX_DECIMAL4_PRECISION = 9;
+  public static final int MAX_DECIMAL8_PRECISION = 18;
+  public static final int MAX_DECIMAL16_PRECISION = 38;
+
+  // Write the least significant `numBytes` bytes in `value` into `bytes[pos, pos + numBytes)` in
+  // little endian.
+  public static void writeLong(byte[] bytes, int pos, long value, int numBytes) {
+    for (int i = 0; i < numBytes; ++i) {
+      bytes[pos + i] = (byte) ((value >>> (8 * i)) & 0xFF);
+    }
+  }
+
+  public static byte primitiveHeader(int type) {
+    return (byte) (type << 2 | PRIMITIVE);
+  }
+
+  public static byte shortStrHeader(int size) {
+    return (byte) (size << 2 | SHORT_STR);
+  }
+
+  public static byte objectHeader(boolean largeSize, int idSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 4)) |
+        ((idSize - 1) << (BASIC_TYPE_BITS + 2)) |
+        ((offsetSize - 1) << BASIC_TYPE_BITS) | OBJECT);
+  }
+
+  public static byte arrayHeader(boolean largeSize, int offsetSize) {
+    return (byte) (((largeSize ? 1 : 0) << (BASIC_TYPE_BITS + 2)) |
+        ((offsetSize - 1) << BASIC_TYPE_BITS) | ARRAY);
+  }
+
+  // An exception indicating that the variant value or metadata doesn't conform to the variant spec.
+  static RuntimeException malformedVariant() {
+    return new RuntimeException("MALFORMED_VARIANT");
+  }
+
+  static RuntimeException unknownPrimitiveTypeInVariant(int id) {
+    return new RuntimeException("UNKNOWN_PRIMITIVE_TYPE_IN_VARIANT" +
+        ImmutableMap.of("id", Integer.toString(id)));
+  }
+
+  // An exception indicating that an external caller tried to call the Variant constructor with
+  // value or metadata exceeding the size limit. We will never construct a Variant this large, so
+  // it should only be possible to encounter this exception when reading a Variant produced by
+  // another tool.
+  static RuntimeException variantConstructorSizeLimit() {
+    return new RuntimeException("VARIANT_CONSTRUCTOR_SIZE_LIMIT");
+  }
+
+  // Check the validity of an array index `pos`. Throw `MALFORMED_VARIANT` if it is out of bounds,
+  // meaning that the variant is malformed.
+  static void checkIndex(int pos, int length) {
+    if (pos < 0 || pos >= length) throw malformedVariant();
+  }
+
+  // Read a little-endian signed long value from `bytes[pos, pos + numBytes)`.
+  static long readLong(byte[] bytes, int pos, int numBytes) {
+    checkIndex(pos, bytes.length);
+    checkIndex(pos + numBytes - 1, bytes.length);
+    long result = 0;
+    // All bytes except the most significant byte should be unsign-extended and shifted (so we need
+    // `& 0xFF`). The most significant byte should be sign-extended and is handled after the loop.
+    for (int i = 0; i < numBytes - 1; ++i) {
+      long unsignedByteValue = bytes[pos + i] & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    long signedByteValue = bytes[pos + numBytes - 1];
+    result |= signedByteValue << (8 * (numBytes - 1));
+    return result;
+  }
+
+  // Read a little-endian unsigned int value from `bytes[pos, pos + numBytes)`. The value must fit
+  // into a non-negative int (`[0, Integer.MAX_VALUE]`).
+  public static int readUnsigned(byte[] bytes, int pos, int numBytes) {
+    checkIndex(pos, bytes.length);
+    checkIndex(pos + numBytes - 1, bytes.length);
+    int result = 0;
+    // Similar to the `readLong` loop, but all bytes should be unsign-extended.
+    for (int i = 0; i < numBytes; ++i) {
+      int unsignedByteValue = bytes[pos + i] & 0xFF;
+      result |= unsignedByteValue << (8 * i);
+    }
+    if (result < 0) throw malformedVariant();
+    return result;
+  }
+
+  // The value type of variant value. It is determined by the header byte but not a 1:1 mapping
+  // (for example, INT1/2/4/8 all map to `Type.LONG`).
+  public enum Type {
+    OBJECT,
+    ARRAY,
+    NULL,
+    BOOLEAN,
+    LONG,
+    STRING,
+    DOUBLE,
+    DECIMAL,
+    DATE,
+    TIMESTAMP,
+    TIMESTAMP_NTZ,
+    FLOAT,
+    BINARY,
+    UUID,
+  }
+
+  public static int getTypeInfo(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    return (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+  }
+
+  // Get the value type of variant value `value[pos...]`. It is only legal to call `get*` if
+  // `getType` returns this type (for example, it is only legal to call `getLong` if `getType`
+  // returns `Type.LONG`).
+  // Throw `MALFORMED_VARIANT` if the variant is malformed.
+  public static Type getType(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    return switch (basicType) {
+      case SHORT_STR -> Type.STRING;
+      case OBJECT -> Type.OBJECT;
+      case ARRAY -> Type.ARRAY;
+      default -> switch (typeInfo) {
+        case NULL -> Type.NULL;
+        case TRUE, FALSE -> Type.BOOLEAN;
+        case INT1, INT2, INT4, INT8 -> Type.LONG;
+        case DOUBLE -> Type.DOUBLE;
+        case DECIMAL4, DECIMAL8, DECIMAL16 -> Type.DECIMAL;
+        case DATE -> Type.DATE;
+        case TIMESTAMP -> Type.TIMESTAMP;
+        case TIMESTAMP_NTZ -> Type.TIMESTAMP_NTZ;
+        case FLOAT -> Type.FLOAT;
+        case BINARY -> Type.BINARY;
+        case LONG_STR -> Type.STRING;
+        case UUID -> Type.UUID;
+        default -> throw unknownPrimitiveTypeInVariant(typeInfo);
+      };
+    };
+  }
+
+  // Compute the size in bytes of the variant value `value[pos...]`. `value.length - pos` is an
+  // upper bound of the size, but the actual size can be smaller.
+  // Throw `MALFORMED_VARIANT` if the variant is malformed.
+  public static int valueSize(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    return switch (basicType) {
+      case SHORT_STR -> 1 + typeInfo;
+      case OBJECT -> handleObject(value, pos,
+          (size, idSize, offsetSize, idStart, offsetStart, dataStart) ->
+              dataStart - pos + readUnsigned(value, offsetStart + size * offsetSize, offsetSize));
+      case ARRAY -> handleArray(value, pos, (size, offsetSize, offsetStart, dataStart) ->
+          dataStart - pos + readUnsigned(value, offsetStart + size * offsetSize, offsetSize));
+      default -> switch (typeInfo) {
+        case NULL, TRUE, FALSE -> 1;
+        case INT1 -> 2;
+        case INT2 -> 3;
+        case INT4, DATE, FLOAT -> 5;
+        case INT8, DOUBLE, TIMESTAMP, TIMESTAMP_NTZ -> 9;
+        case DECIMAL4 -> 6;
+        case DECIMAL8 -> 10;
+        case DECIMAL16 -> 18;
+        case BINARY, LONG_STR -> 1 + U32_SIZE + readUnsigned(value, pos + 1, U32_SIZE);
+        case UUID -> 17;
+        default -> throw unknownPrimitiveTypeInVariant(typeInfo);
+      };
+    };
+  }
+
+  static IllegalStateException unexpectedType(Type type) {
+    return new IllegalStateException("Expect type to be " + type);
+  }
+
+  // Get a boolean value from variant value `value[pos...]`.
+  // Throw `MALFORMED_VARIANT` if the variant is malformed.
+  public static boolean getBoolean(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType != PRIMITIVE || (typeInfo != TRUE && typeInfo != FALSE)) {
+      throw unexpectedType(Type.BOOLEAN);
+    }
+    return typeInfo == TRUE;
+  }
+
+  // Get a long value from variant value `value[pos...]`.
+  // It is only legal to call it if `getType` returns one of `Type.LONG/DATE/TIMESTAMP/
+  // TIMESTAMP_NTZ`. If the type is `DATE`, the return value is guaranteed to fit into an int and
+  // represents the number of days from the Unix epoch.
+  // If the type is `TIMESTAMP/TIMESTAMP_NTZ`, the return value represents the number of
+  // microseconds from the Unix epoch.
+  // Throw `MALFORMED_VARIANT` if the variant is malformed.
+  public static long getLong(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    String exceptionMessage = "Expect type to be LONG/DATE/TIMESTAMP/TIMESTAMP_NTZ";
+    if (basicType != PRIMITIVE) throw new IllegalStateException(exceptionMessage);
+    return switch (typeInfo) {
+      case INT1 -> readLong(value, pos + 1, 1);
+      case INT2 -> readLong(value, pos + 1, 2);
+      case INT4, DATE -> readLong(value, pos + 1, 4);
+      case INT8, TIMESTAMP, TIMESTAMP_NTZ -> readLong(value, pos + 1, 8);
+      default -> throw new IllegalStateException(exceptionMessage);
+    };
+  }
+
+  // Get a double value from variant value `value[pos...]`.
+  // Throw `MALFORMED_VARIANT` if the variant is malformed.
+  public static double getDouble(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType != PRIMITIVE || typeInfo != DOUBLE) throw unexpectedType(Type.DOUBLE);
+    return Double.longBitsToDouble(readLong(value, pos + 1, 8));
+  }
+
+  // Check whether the precision and scale of the decimal are within the limit.
+  private static void checkDecimal(BigDecimal d, int maxPrecision) {
+    if (d.precision() > maxPrecision || d.scale() > maxPrecision) {
+      throw malformedVariant();
+    }
+  }
+
+  // Get a decimal value from variant value `value[pos...]`.
+  // Throw `MALFORMED_VARIANT` if the variant is malformed.
+  public static BigDecimal getDecimalWithOriginalScale(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType != PRIMITIVE) throw unexpectedType(Type.DECIMAL);
+    // Interpret the scale byte as unsigned. If it is a negative byte, the unsigned value must be
+    // greater than `MAX_DECIMAL16_PRECISION` and will trigger an error in `checkDecimal`.
+    int scale = value[pos + 1] & 0xFF;
+    BigDecimal result;
+    switch (typeInfo) {
+      case DECIMAL4:
+        result = BigDecimal.valueOf(readLong(value, pos + 2, 4), scale);
+        checkDecimal(result, MAX_DECIMAL4_PRECISION);
+        break;
+      case DECIMAL8:
+        result = BigDecimal.valueOf(readLong(value, pos + 2, 8), scale);
+        checkDecimal(result, MAX_DECIMAL8_PRECISION);
+        break;
+      case DECIMAL16:
+        checkIndex(pos + 17, value.length);
+        byte[] bytes = new byte[16];
+        // Copy the bytes reversely because the `BigInteger` constructor expects a big-endian
+        // representation.
+        for (int i = 0; i < 16; ++i) {
+          bytes[i] = value[pos + 17 - i];
+        }
+        result = new BigDecimal(new BigInteger(bytes), scale);
+        checkDecimal(result, MAX_DECIMAL16_PRECISION);
+        break;
+      default:
+        throw unexpectedType(Type.DECIMAL);
+    }
+    return result;
+  }
+
+  public static BigDecimal getDecimal(byte[] value, int pos) {
+    return getDecimalWithOriginalScale(value, pos).stripTrailingZeros();
+  }
+
+  // Get a float value from variant value `value[pos...]`.
+  // Throw `MALFORMED_VARIANT` if the variant is malformed.
+  public static float getFloat(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType != PRIMITIVE || typeInfo != FLOAT) throw unexpectedType(Type.FLOAT);
+    return Float.intBitsToFloat((int) readLong(value, pos + 1, 4));
+  }
+
+  // Get a binary value from variant value `value[pos...]`.
+  // Throw `MALFORMED_VARIANT` if the variant is malformed.
+  public static byte[] getBinary(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType != PRIMITIVE || typeInfo != BINARY) throw unexpectedType(Type.BINARY);
+    int start = pos + 1 + U32_SIZE;
+    int length = readUnsigned(value, pos + 1, U32_SIZE);
+    checkIndex(start + length - 1, value.length);
+    return Arrays.copyOfRange(value, start, start + length);
+  }
+
+  // Get a string value from variant value `value[pos...]`.
+  // Throw `MALFORMED_VARIANT` if the variant is malformed.
+  public static String getString(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType == SHORT_STR || (basicType == PRIMITIVE && typeInfo == LONG_STR)) {
+      int start;
+      int length;
+      if (basicType == SHORT_STR) {
+        start = pos + 1;
+        length = typeInfo;
+      } else {
+        start = pos + 1 + U32_SIZE;
+        length = readUnsigned(value, pos + 1, U32_SIZE);
+      }
+      checkIndex(start + length - 1, value.length);
+      return new String(value, start, length);
+    }
+    throw unexpectedType(Type.STRING);
+  }
+
+  // Get a UUID value from variant value `value[pos...]`.
+  // Throw `MALFORMED_VARIANT` if the variant is malformed.
+  public static UUID getUuid(byte[] value, int pos) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType != PRIMITIVE || typeInfo != UUID) throw unexpectedType(Type.UUID);
+    int start = pos + 1;
+    checkIndex(start + 15, value.length);
+    // UUID values are big-endian, so we can't use VariantUtil.readLong().
+    ByteBuffer bb = ByteBuffer.wrap(value, start, 16).order(ByteOrder.BIG_ENDIAN);
+    return new UUID(bb.getLong(), bb.getLong());
+  }
+
+  public interface ObjectHandler<T> {
+    /**
+     * @param size Number of object fields.
+     * @param idSize The integer size of the field id list.
+     * @param offsetSize The integer size of the offset list.
+     * @param idStart The starting index of the field id list in the variant value array.
+     * @param offsetStart The starting index of the offset list in the variant value array.
+     * @param dataStart The starting index of field data in the variant value array.
+     */
+    T apply(int size, int idSize, int offsetSize, int idStart, int offsetStart, int dataStart);
+  }
+
+  // A helper function to access a variant object. It provides `handler` with its required
+  // parameters and returns what it returns.
+  public static <T> T handleObject(byte[] value, int pos, ObjectHandler<T> handler) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType != OBJECT) {
+      throw unexpectedType(Type.OBJECT);
+    }
+    // Refer to the comment of the `OBJECT` constant for the details of the object header encoding.
+    // Suppose `typeInfo` has a bit representation of 0_b4_b3b2_b1b0, the following line extracts
+    // b4 to determine whether the object uses a 1/4-byte size.
+    boolean largeSize = ((typeInfo >> 4) & 0x1) != 0;
+    int sizeBytes = (largeSize ? U32_SIZE : 1);
+    int size = readUnsigned(value, pos + 1, sizeBytes);
+    // Extracts b3b2 to determine the integer size of the field id list.
+    int idSize = ((typeInfo >> 2) & 0x3) + 1;
+    // Extracts b1b0 to determine the integer size of the offset list.
+    int offsetSize = (typeInfo & 0x3) + 1;
+    int idStart = pos + 1 + sizeBytes;
+    int offsetStart = idStart + size * idSize;
+    int dataStart = offsetStart + (size + 1) * offsetSize;
+    return handler.apply(size, idSize, offsetSize, idStart, offsetStart, dataStart);
+  }
+
+  public interface ArrayHandler<T> {
+    /**
+     * @param size Number of array elements.
+     * @param offsetSize The integer size of the offset list.
+     * @param offsetStart The starting index of the offset list in the variant value array.
+     * @param dataStart The starting index of element data in the variant value array.
+     */
+    T apply(int size, int offsetSize, int offsetStart, int dataStart);
+  }
+
+  // A helper function to access a variant array.
+  public static <T> T handleArray(byte[] value, int pos, ArrayHandler<T> handler) {
+    checkIndex(pos, value.length);
+    int basicType = value[pos] & BASIC_TYPE_MASK;
+    int typeInfo = (value[pos] >> BASIC_TYPE_BITS) & TYPE_INFO_MASK;
+    if (basicType != ARRAY) throw unexpectedType(Type.ARRAY);
+    // Refer to the comment of the `ARRAY` constant for the details of the array header encoding.
+    // Suppose `typeInfo` has a bit representation of 000_b2_b1b0, the following line extracts
+    // b2 to determine whether the array uses a 1/4-byte size.
+    boolean largeSize = ((typeInfo >> 2) & 0x1) != 0;
+    int sizeBytes = (largeSize ? U32_SIZE : 1);
+    int size = readUnsigned(value, pos + 1, sizeBytes);
+    // Extracts b1b0 to determine the integer size of the offset list.
+    int offsetSize = (typeInfo & 0x3) + 1;
+    int offsetStart = pos + 1 + sizeBytes;
+    int dataStart = offsetStart + (size + 1) * offsetSize;
+    return handler.apply(size, offsetSize, offsetStart, dataStart);
+  }
+
+  // Get a key at `id` in the variant metadata.
+  // Throw `MALFORMED_VARIANT` if the variant is malformed. An out-of-bounds `id` is also considered
+  // a malformed variant because it is read from the corresponding variant value.
+  public static String getMetadataKey(byte[] metadata, int id) {
+    checkIndex(0, metadata.length);
+    // Extracts the highest 2 bits in the metadata header to determine the integer size of the
+    // offset list.
+    int offsetSize = ((metadata[0] >> 6) & 0x3) + 1;
+    int dictSize = readUnsigned(metadata, 1, offsetSize);
+    if (id >= dictSize) throw malformedVariant();
+    // There are a header byte, a `dictSize` with `offsetSize` bytes, and `(dictSize + 1)` offsets
+    // before the string data.
+    int stringStart = 1 + (dictSize + 2) * offsetSize;
+    int offset = readUnsigned(metadata, 1 + (id + 1) * offsetSize, offsetSize);
+    int nextOffset = readUnsigned(metadata, 1 + (id + 2) * offsetSize, offsetSize);
+    if (offset > nextOffset) throw malformedVariant();
+    checkIndex(stringStart + nextOffset - 1, metadata.length);
+    return new String(metadata, stringStart + offset, nextOffset - offset);
+  }
+
+  /**
+   * Parse metadata to extract dictionary strings.
+   * Returns a list of dictionary strings from the variant metadata.
+   */
+  public static List<String> parseMetadata(byte[] metadata) {
+    List<String> dictionary = new ArrayList<>();
+
+    if (metadata == null || metadata.length < 1) {
+      return dictionary;
+    }
+
+    try {
+      // Extracts the highest 2 bits in the metadata header to determine the integer size of the
+      // offset list.
+      int offsetSize = ((metadata[0] >> 6) & 0x3) + 1;
+      int dictSize = readUnsigned(metadata, 1, offsetSize);
+
+      for (int i = 0; i < dictSize; i++) {
+        dictionary.add(getMetadataKey(metadata, i));
+      }
+    } catch (Exception e) {
+      // Return empty dictionary on parse error
+      return new ArrayList<>();
+    }
+
+    return dictionary;
+  }
+
+  private static boolean isTesting() {
+    return System.getenv("HIVE_IN_TEST") != null || System.getProperty("hive.in.test") != null;
+  }
+}
\ No newline at end of file
diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/ColumnType.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/ColumnType.java
index 01de9945087a..9db912182425 100644
--- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/ColumnType.java
+++ b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/ColumnType.java
@@ -82,6 +82,8 @@ public class ColumnType {
 
   public static final String UNION_TYPE_NAME = "uniontype";
 
+  public static final String VARIANT_TYPE_NAME = "variant";
+
   public static final String LIST_COLUMNS = "columns";
 
   public static final String LIST_COLUMN_TYPES = "columns.types";
@@ -197,6 +199,7 @@ public class ColumnType {
       MAP_TYPE_NAME,
       STRUCT_TYPE_NAME,
       UNION_TYPE_NAME,
+      VARIANT_TYPE_NAME,
       LIST_COLUMNS,
       LIST_COLUMN_TYPES,
       COLUMN_NAME_DELIMITER
diff --git a/standalone-metastore/pom.xml b/standalone-metastore/pom.xml
index 3753bde0a7a8..b877b1fb26d8 100644
--- a/standalone-metastore/pom.xml
+++ b/standalone-metastore/pom.xml
@@ -65,7 +65,7 @@
     4.9.3
     2.0.0-M24
-    3.12.0
+    3.14.0
     1.1.3
     2.12.0
     2.0.0
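
For reference, below is a minimal sketch (not part of the patch) of how the VariantUtil helpers introduced above fit together. It hand-encodes a primitive INT4 value plus a one-entry metadata dictionary and reads them back; the class name VariantUtilExample and the literal values are illustrative only, and real callers would normally go through VariantBuilder instead of building the bytes by hand.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.hive.serde2.variant.VariantUtil;

public class VariantUtilExample {
  public static void main(String[] args) {
    // Value: one header byte for INT4 followed by a 4-byte little-endian integer.
    byte[] value = new byte[5];
    value[0] = VariantUtil.primitiveHeader(VariantUtil.INT4);
    VariantUtil.writeLong(value, 1, 42, 4);
    System.out.println(VariantUtil.getType(value, 0));   // LONG
    System.out.println(VariantUtil.getLong(value, 0));   // 42
    System.out.println(VariantUtil.valueSize(value, 0)); // 5

    // Metadata with a single dictionary key "price":
    // header (version 1, 1-byte offsets), 1-byte dictionary size, two 1-byte offsets, UTF-8 data.
    byte[] key = "price".getBytes(StandardCharsets.UTF_8);
    byte[] metadata = new byte[1 + 1 + 2 + key.length];
    metadata[0] = VariantUtil.VERSION;   // version in the low 4 bits, offset size bits = 0
    metadata[1] = 1;                     // dictionary size
    metadata[2] = 0;                     // start offset of key 0
    metadata[3] = (byte) key.length;     // end offset of key 0
    System.arraycopy(key, 0, metadata, 4, key.length);
    System.out.println(VariantUtil.getMetadataKey(metadata, 0)); // price
  }
}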