From a3dfed781c58bc0e2808fb0f4d5408bc33d39573 Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Tue, 14 Oct 2025 07:47:06 +0000 Subject: [PATCH 01/17] support proctime window --- .../StreamExecGlobalWindowAggregate.java | 10 +- .../StreamExecLocalWindowAggregate.java | 5 +- .../stream/StreamExecWindowAggregate.java | 41 +++++--- .../apache/gluten/rexnode/WindowUtils.java | 36 ++++++- .../SourceTransformationTranslator.java | 95 +++++++++++++++++++ .../runtime/config/VeloxQueryConfig.java | 3 + 6 files changed, 172 insertions(+), 18 deletions(-) diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGlobalWindowAggregate.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGlobalWindowAggregate.java index ee8c91a3757..5825447086f 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGlobalWindowAggregate.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGlobalWindowAggregate.java @@ -37,6 +37,7 @@ import org.apache.flink.FlinkVersion; import org.apache.flink.api.dag.Transformation; +import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.api.java.tuple.Tuple5; import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.streaming.api.operators.OneInputStreamOperator; @@ -205,11 +206,15 @@ protected Transformation translateToPlanInternal( // TODO: support more window types. 
Tuple5 windowSpecParams = WindowUtils.extractWindowParameters(windowing); + Tuple2 windowStartAndEndIndexes = + WindowUtils.getWindowStartAndEndIndexes(namedWindowProperties, (RowType) getOutputType()); long size = windowSpecParams.f0; long slide = windowSpecParams.f1; long offset = windowSpecParams.f2; int rowtimeIndex = windowSpecParams.f3; int windowType = windowSpecParams.f4; + int windowStartIndex = windowStartAndEndIndexes.f0; + int windowEndIndex = windowStartAndEndIndexes.f1; PartitionFunctionSpec sliceAssignerSpec = new StreamWindowPartitionFunctionSpec( inputType, rowtimeIndex, size, slide, offset, windowType); @@ -252,7 +257,10 @@ protected Transformation translateToPlanInternal( offset, windowType, outputType, - rowtimeIndex); + false, + rowtimeIndex, + windowStartIndex, + windowEndIndex); final OneInputStreamOperator windowOperator = new GlutenVectorOneInputOperator( new StatefulPlanNode(windowAgg.getId(), windowAgg), diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java index 4ffcf7998c2..d1b01e04632 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java @@ -220,7 +220,10 @@ protected Transformation translateToPlanInternal( offset, windowType, outputType, - rowtimeIndex); + false, + rowtimeIndex, + -1, + -1); final OneInputStreamOperator localAggOperator = new GlutenVectorOneInputOperator( new StatefulPlanNode(windowAgg.getId(), windowAgg), diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java 
b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java index afe5d014839..667ed1876fd 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java @@ -37,6 +37,7 @@ import org.apache.flink.FlinkVersion; import org.apache.flink.api.dag.Transformation; +import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.api.java.tuple.Tuple5; import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.streaming.api.operators.OneInputStreamOperator; @@ -65,6 +66,8 @@ import org.apache.calcite.rel.core.AggregateCall; import org.apache.commons.math3.util.ArithmeticUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nullable; @@ -97,6 +100,7 @@ minStateVersion = FlinkVersion.v1_15) public class StreamExecWindowAggregate extends StreamExecWindowAggregateBase { + private static final Logger LOG = LoggerFactory.getLogger(StreamExecWindowAggregate.class); public static final String WINDOW_AGGREGATE_TRANSFORMATION = "window-aggregate"; private static final long WINDOW_AGG_MEMORY_RATIO = 100; @@ -197,11 +201,16 @@ protected Transformation translateToPlanInternal( // TODO: support more window types. 
Tuple5 windowSpecParams = WindowUtils.extractWindowParameters(windowing); + Tuple2 windowStartAndEndIndexes = + WindowUtils.getWindowStartAndEndIndexes(namedWindowProperties, (RowType) getOutputType()); long size = windowSpecParams.f0; long slide = windowSpecParams.f1; long offset = windowSpecParams.f2; int rowtimeIndex = windowSpecParams.f3; int windowType = windowSpecParams.f4; + boolean isRowTime = windowing.isRowtime(); + int windowStartIndex = windowStartAndEndIndexes.f0; + int windowEndIndex = windowStartAndEndIndexes.f1; PartitionFunctionSpec sliceAssignerSpec = new StreamWindowPartitionFunctionSpec( inputType, rowtimeIndex, size, slide, offset, windowType); @@ -210,7 +219,7 @@ protected Transformation translateToPlanInternal( PlanNodeIdGenerator.newId(), AggregateStep.SINGLE, groupingKeys, - groupingKeys, + isRowTime ? groupingKeys : List.of(), aggNames, aggregates, false, @@ -218,17 +227,19 @@ protected Transformation translateToPlanInternal( null, List.of()); PlanNode localAgg = - new AggregationNode( - PlanNodeIdGenerator.newId(), - AggregateStep.SINGLE, - groupingKeys, - groupingKeys, - aggNames, - aggregates, - false, - List.of(new EmptyNode(inputType)), - null, - List.of()); + isRowTime + ? 
new AggregationNode( + PlanNodeIdGenerator.newId(), + AggregateStep.SINGLE, + groupingKeys, + groupingKeys, + aggNames, + aggregates, + false, + List.of(new EmptyNode(inputType)), + null, + List.of()) + : null; PlanNode windowAgg = new StreamWindowAggregationNode( PlanNodeIdGenerator.newId(), @@ -244,7 +255,11 @@ protected Transformation translateToPlanInternal( offset, windowType, outputType, - rowtimeIndex); + windowing.isRowtime(), + rowtimeIndex, + windowStartIndex, + windowEndIndex); + LOG.info("windowStartIndex:{}, windowEndIndex:{}", windowStartIndex, windowEndIndex); final OneInputStreamOperator windowOperator = new GlutenVectorOneInputOperator( new StatefulPlanNode(windowAgg.getId(), windowAgg), diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/WindowUtils.java b/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/WindowUtils.java index 6f7a64f6f4e..3f8f8ad5aaf 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/WindowUtils.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/WindowUtils.java @@ -16,20 +16,32 @@ */ package org.apache.gluten.rexnode; +import org.apache.flink.api.java.tuple.Tuple2; import org.apache.flink.api.java.tuple.Tuple5; +import org.apache.flink.table.planner.plan.logical.CumulativeWindowSpec; import org.apache.flink.table.planner.plan.logical.HoppingWindowSpec; +import org.apache.flink.table.planner.plan.logical.SessionWindowSpec; import org.apache.flink.table.planner.plan.logical.SliceAttachedWindowingStrategy; import org.apache.flink.table.planner.plan.logical.TimeAttributeWindowingStrategy; import org.apache.flink.table.planner.plan.logical.TumblingWindowSpec; import org.apache.flink.table.planner.plan.logical.WindowAttachedWindowingStrategy; import org.apache.flink.table.planner.plan.logical.WindowSpec; import org.apache.flink.table.planner.plan.logical.WindowingStrategy; +import org.apache.flink.table.runtime.groupwindow.NamedWindowProperty; +import 
org.apache.flink.table.runtime.groupwindow.WindowEnd; +import org.apache.flink.table.runtime.groupwindow.WindowStart; +import org.apache.flink.table.types.logical.RowType; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.time.Duration; +import java.util.List; /** Utility to store some useful functions. */ public class WindowUtils { + private static final Logger LOG = LoggerFactory.getLogger(WindowUtils.class); // Get names for project node. public static Tuple5 extractWindowParameters( WindowingStrategy windowing) { @@ -53,24 +65,28 @@ public static Tuple5 extractWindowParameters if (windowOffset != null) { offset = windowOffset.toMillis(); } + windowType = 0; } else if (windowSpec instanceof TumblingWindowSpec) { size = ((TumblingWindowSpec) windowSpec).getSize().toMillis(); Duration windowOffset = ((TumblingWindowSpec) windowSpec).getOffset(); if (windowOffset != null) { offset = windowOffset.toMillis(); } + windowType = 1; + } else if (windowSpec instanceof CumulativeWindowSpec) { + windowType = 2; + } else if (windowSpec instanceof SessionWindowSpec) { + windowType = 3; } else { throw new RuntimeException("Not support window spec " + windowSpec); } - + LOG.info("window strategy:{}", windowing.getClass().getName()); if (windowing instanceof TimeAttributeWindowingStrategy) { if (windowing.isRowtime()) { rowtimeIndex = ((TimeAttributeWindowingStrategy) windowing).getTimeAttributeIndex(); } - windowType = 0; } else if (windowing instanceof WindowAttachedWindowingStrategy) { rowtimeIndex = ((WindowAttachedWindowingStrategy) windowing).getWindowEnd(); - windowType = 1; } else if (windowing instanceof SliceAttachedWindowingStrategy) { rowtimeIndex = ((SliceAttachedWindowingStrategy) windowing).getSliceEnd(); } else { @@ -79,4 +95,18 @@ public static Tuple5 extractWindowParameters return new Tuple5( size, slide, offset, rowtimeIndex, windowType); } + + public static Tuple2 getWindowStartAndEndIndexes( + NamedWindowProperty[] props, RowType 
outputType) { + int windowStartIndex = -1, windowEndIndex = -1; + List outputNames = outputType.getFieldNames(); + for (NamedWindowProperty prop : props) { + if (prop.getProperty() instanceof WindowStart) { + windowStartIndex = outputNames.indexOf(prop.getName()); + } else if (prop.getProperty() instanceof WindowEnd) { + windowEndIndex = outputNames.indexOf(prop.getName()); + } + } + return new Tuple2(windowStartIndex, windowEndIndex); + } } diff --git a/gluten-flink/runtime/src/main/java/org/apache/flink/streaming/runtime/translators/SourceTransformationTranslator.java b/gluten-flink/runtime/src/main/java/org/apache/flink/streaming/runtime/translators/SourceTransformationTranslator.java index 5e7dd62f0df..31dcd08a363 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/flink/streaming/runtime/translators/SourceTransformationTranslator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/flink/streaming/runtime/translators/SourceTransformationTranslator.java @@ -22,6 +22,8 @@ import org.apache.gluten.util.PlanNodeIdGenerator; import org.apache.gluten.util.ReflectUtils; +import io.github.zhztheplayer.velox4j.connector.KafkaConnectorSplit; +import io.github.zhztheplayer.velox4j.connector.KafkaTableHandle; import io.github.zhztheplayer.velox4j.connector.NexmarkConnectorSplit; import io.github.zhztheplayer.velox4j.connector.NexmarkTableHandle; import io.github.zhztheplayer.velox4j.plan.PlanNode; @@ -31,6 +33,7 @@ import org.apache.flink.annotation.Internal; import org.apache.flink.api.common.ExecutionConfig; +import org.apache.flink.api.connector.source.Source; import org.apache.flink.api.connector.source.SourceSplit; import org.apache.flink.streaming.api.graph.SimpleTransformationTranslator; import org.apache.flink.streaming.api.graph.StreamGraph; @@ -43,8 +46,11 @@ import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Properties; +import java.util.UUID; import 
static org.apache.flink.util.Preconditions.checkNotNull; @@ -130,6 +136,95 @@ private Collection translateInternal( null, transformation.getOutputType(), "Source: " + transformation.getName()); + } else if (sourceClazz.getSimpleName().equals("KafkaSource")) { + RowType outputType = + (RowType) + LogicalTypeConverter.toVLType( + ((InternalTypeInfo) transformation.getOutputType()).toLogicalType()); + String connectorId = "connector-kafka"; + Source kafkaSource = transformation.getSource(); + Properties properties = + (Properties) ReflectUtils.getObjectField(sourceClazz, kafkaSource, "props"); + Object kafkaSubscriber = ReflectUtils.getObjectField(sourceClazz, kafkaSource, "subscriber"); + Object topics = + ReflectUtils.getObjectField(kafkaSubscriber.getClass(), kafkaSubscriber, "topics"); + Object deserializer = + ReflectUtils.getObjectField(sourceClazz, kafkaSource, "deserializationSchema"); + if (deserializer.getClass().getSimpleName().equals("KafkaDeserializationSchemaWrapper")) { + deserializer = + ReflectUtils.getObjectField( + deserializer.getClass(), deserializer, "kafkaDeserializationSchema"); + if (deserializer.getClass().getSimpleName().equals("DynamicKafkaDeserializationSchema")) { + deserializer = + ReflectUtils.getObjectField( + deserializer.getClass(), deserializer, "valueDeserialization"); + } + } + Object offsetStartInitializer = + ReflectUtils.getObjectField(sourceClazz, kafkaSource, "startingOffsetsInitializer"); + String startupMode = "group-offsets"; + String offsetStartInitializerClazzName = offsetStartInitializer.getClass().getSimpleName(); + if (offsetStartInitializerClazzName.equals("LatestOffsetsInitializer")) { + startupMode = "latest-offsets"; + } else if (offsetStartInitializerClazzName.equals("ReaderHandledOffsetsInitializer")) { + Long offset = + (Long) + ReflectUtils.getObjectField( + offsetStartInitializer.getClass(), offsetStartInitializer, "startingOffset"); + startupMode = + offset == -1 ? "latest-offsets" : offset == -2 ? 
"earliest-offsets" : "group-offsets"; + } + String planId = PlanNodeIdGenerator.newId(); + String topic = ((List) topics).get(0); + String format = + deserializer.getClass().getSimpleName().equals("JsonParserRowDataDeserializationSchema") + ? "json" + : "raw"; + Map kafkaTableParameters = new HashMap(); + for (String key : properties.stringPropertyNames()) { + kafkaTableParameters.put(key, properties.getProperty(key)); + } + kafkaTableParameters.put("topic", topic); + kafkaTableParameters.put("format", format); + kafkaTableParameters.put("scan.startup.mode", startupMode); + kafkaTableParameters.put( + "enable.auto.commit", + context.getStreamGraph().getCheckpointConfig().isCheckpointingEnabled() + ? "false" + : "true"); + kafkaTableParameters.put( + "client.id", + properties.getProperty("client.id.prefix", "connector-kafka") + "-" + UUID.randomUUID()); + KafkaTableHandle kafkaTableHandle = + new KafkaTableHandle(connectorId, topic, outputType, kafkaTableParameters); + KafkaConnectorSplit connectorSplit = + new KafkaConnectorSplit( + connectorId, + 0, + false, + kafkaTableParameters.get("bootstrap.servers"), + kafkaTableParameters.get("group.id"), + format, + Boolean.valueOf(kafkaTableParameters.getOrDefault("enable.auto.commit", "false")), + "latest", + List.of()); + TableScanNode kafkaScan = new TableScanNode(planId, outputType, kafkaTableHandle, List.of()); + StreamOperatorFactory operatorFactory = + SimpleOperatorFactory.of( + new GlutenStreamSource( + new GlutenVectorSourceFunction( + new StatefulPlanNode(kafkaScan.getId(), kafkaScan), + Map.of(kafkaScan.getId(), outputType), + kafkaScan.getId(), + connectorSplit))); + streamGraph.addLegacySource( + transformationId, + slotSharingGroup, + transformation.getCoLocationGroupKey(), + operatorFactory, + null, + transformation.getOutputType(), + "Source: " + transformation.getName()); } else { SourceOperatorFactory operatorFactory = new SourceOperatorFactory<>( diff --git 
a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxQueryConfig.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxQueryConfig.java index 788ecb4a68b..a1026cfe49a 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxQueryConfig.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxQueryConfig.java @@ -31,6 +31,8 @@ public class VeloxQueryConfig { private static final String keyVeloxAdjustTimestampToSessionTimeZone = "adjust_timestamp_to_session_timezone"; private static final String keyVeloxSessionTimezone = "session_timezone"; + private static final String kStreamingAggregationMinOutputBatchRows = + "streaming_aggregation_min_output_batch_rows"; public static Config getConfig(RuntimeContext context) { if (!(context instanceof StreamingRuntimeContext)) { @@ -47,6 +49,7 @@ public static Config getConfig(RuntimeContext context) { } else { configMap.put(keyVeloxSessionTimezone, localTimeZone); } + configMap.put(kStreamingAggregationMinOutputBatchRows, String.valueOf(1)); return Config.create(configMap); } } From 9dc019844b74481a29349a035df202e8ec9283ac Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Thu, 16 Oct 2025 01:46:29 +0000 Subject: [PATCH 02/17] remove useless changes --- .../stream/StreamExecWindowAggregate.java | 6 +- .../apache/gluten/rexnode/WindowUtils.java | 4 - .../SourceTransformationTranslator.java | 95 ------------------- 3 files changed, 2 insertions(+), 103 deletions(-) diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java index 667ed1876fd..8efb8be4e3a 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java +++ 
b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java @@ -66,8 +66,6 @@ import org.apache.calcite.rel.core.AggregateCall; import org.apache.commons.math3.util.ArithmeticUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import javax.annotation.Nullable; @@ -100,7 +98,6 @@ minStateVersion = FlinkVersion.v1_15) public class StreamExecWindowAggregate extends StreamExecWindowAggregateBase { - private static final Logger LOG = LoggerFactory.getLogger(StreamExecWindowAggregate.class); public static final String WINDOW_AGGREGATE_TRANSFORMATION = "window-aggregate"; private static final long WINDOW_AGG_MEMORY_RATIO = 100; @@ -226,6 +223,8 @@ protected Transformation translateToPlanInternal( List.of(new EmptyNode(inputType)), null, List.of()); + // processing time window can not apply to local-global aggregate optimization, so here we need + // to set local aggregtate as null when it is not event time window. PlanNode localAgg = isRowTime ? 
new AggregationNode( @@ -259,7 +258,6 @@ protected Transformation translateToPlanInternal( rowtimeIndex, windowStartIndex, windowEndIndex); - LOG.info("windowStartIndex:{}, windowEndIndex:{}", windowStartIndex, windowEndIndex); final OneInputStreamOperator windowOperator = new GlutenVectorOneInputOperator( new StatefulPlanNode(windowAgg.getId(), windowAgg), diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/WindowUtils.java b/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/WindowUtils.java index 3f8f8ad5aaf..5a419509e93 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/WindowUtils.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/WindowUtils.java @@ -32,9 +32,6 @@ import org.apache.flink.table.runtime.groupwindow.WindowStart; import org.apache.flink.table.types.logical.RowType; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.time.Duration; import java.util.List; @@ -80,7 +77,6 @@ public static Tuple5 extractWindowParameters } else { throw new RuntimeException("Not support window spec " + windowSpec); } - LOG.info("window strategy:{}", windowing.getClass().getName()); if (windowing instanceof TimeAttributeWindowingStrategy) { if (windowing.isRowtime()) { rowtimeIndex = ((TimeAttributeWindowingStrategy) windowing).getTimeAttributeIndex(); diff --git a/gluten-flink/runtime/src/main/java/org/apache/flink/streaming/runtime/translators/SourceTransformationTranslator.java b/gluten-flink/runtime/src/main/java/org/apache/flink/streaming/runtime/translators/SourceTransformationTranslator.java index 31dcd08a363..5e7dd62f0df 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/flink/streaming/runtime/translators/SourceTransformationTranslator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/flink/streaming/runtime/translators/SourceTransformationTranslator.java @@ -22,8 +22,6 @@ import org.apache.gluten.util.PlanNodeIdGenerator; import 
org.apache.gluten.util.ReflectUtils; -import io.github.zhztheplayer.velox4j.connector.KafkaConnectorSplit; -import io.github.zhztheplayer.velox4j.connector.KafkaTableHandle; import io.github.zhztheplayer.velox4j.connector.NexmarkConnectorSplit; import io.github.zhztheplayer.velox4j.connector.NexmarkTableHandle; import io.github.zhztheplayer.velox4j.plan.PlanNode; @@ -33,7 +31,6 @@ import org.apache.flink.annotation.Internal; import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.connector.source.Source; import org.apache.flink.api.connector.source.SourceSplit; import org.apache.flink.streaming.api.graph.SimpleTransformationTranslator; import org.apache.flink.streaming.api.graph.StreamGraph; @@ -46,11 +43,8 @@ import java.util.Collection; import java.util.Collections; -import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Properties; -import java.util.UUID; import static org.apache.flink.util.Preconditions.checkNotNull; @@ -136,95 +130,6 @@ private Collection translateInternal( null, transformation.getOutputType(), "Source: " + transformation.getName()); - } else if (sourceClazz.getSimpleName().equals("KafkaSource")) { - RowType outputType = - (RowType) - LogicalTypeConverter.toVLType( - ((InternalTypeInfo) transformation.getOutputType()).toLogicalType()); - String connectorId = "connector-kafka"; - Source kafkaSource = transformation.getSource(); - Properties properties = - (Properties) ReflectUtils.getObjectField(sourceClazz, kafkaSource, "props"); - Object kafkaSubscriber = ReflectUtils.getObjectField(sourceClazz, kafkaSource, "subscriber"); - Object topics = - ReflectUtils.getObjectField(kafkaSubscriber.getClass(), kafkaSubscriber, "topics"); - Object deserializer = - ReflectUtils.getObjectField(sourceClazz, kafkaSource, "deserializationSchema"); - if (deserializer.getClass().getSimpleName().equals("KafkaDeserializationSchemaWrapper")) { - deserializer = - ReflectUtils.getObjectField( - 
deserializer.getClass(), deserializer, "kafkaDeserializationSchema"); - if (deserializer.getClass().getSimpleName().equals("DynamicKafkaDeserializationSchema")) { - deserializer = - ReflectUtils.getObjectField( - deserializer.getClass(), deserializer, "valueDeserialization"); - } - } - Object offsetStartInitializer = - ReflectUtils.getObjectField(sourceClazz, kafkaSource, "startingOffsetsInitializer"); - String startupMode = "group-offsets"; - String offsetStartInitializerClazzName = offsetStartInitializer.getClass().getSimpleName(); - if (offsetStartInitializerClazzName.equals("LatestOffsetsInitializer")) { - startupMode = "latest-offsets"; - } else if (offsetStartInitializerClazzName.equals("ReaderHandledOffsetsInitializer")) { - Long offset = - (Long) - ReflectUtils.getObjectField( - offsetStartInitializer.getClass(), offsetStartInitializer, "startingOffset"); - startupMode = - offset == -1 ? "latest-offsets" : offset == -2 ? "earliest-offsets" : "group-offsets"; - } - String planId = PlanNodeIdGenerator.newId(); - String topic = ((List) topics).get(0); - String format = - deserializer.getClass().getSimpleName().equals("JsonParserRowDataDeserializationSchema") - ? "json" - : "raw"; - Map kafkaTableParameters = new HashMap(); - for (String key : properties.stringPropertyNames()) { - kafkaTableParameters.put(key, properties.getProperty(key)); - } - kafkaTableParameters.put("topic", topic); - kafkaTableParameters.put("format", format); - kafkaTableParameters.put("scan.startup.mode", startupMode); - kafkaTableParameters.put( - "enable.auto.commit", - context.getStreamGraph().getCheckpointConfig().isCheckpointingEnabled() - ? 
"false" - : "true"); - kafkaTableParameters.put( - "client.id", - properties.getProperty("client.id.prefix", "connector-kafka") + "-" + UUID.randomUUID()); - KafkaTableHandle kafkaTableHandle = - new KafkaTableHandle(connectorId, topic, outputType, kafkaTableParameters); - KafkaConnectorSplit connectorSplit = - new KafkaConnectorSplit( - connectorId, - 0, - false, - kafkaTableParameters.get("bootstrap.servers"), - kafkaTableParameters.get("group.id"), - format, - Boolean.valueOf(kafkaTableParameters.getOrDefault("enable.auto.commit", "false")), - "latest", - List.of()); - TableScanNode kafkaScan = new TableScanNode(planId, outputType, kafkaTableHandle, List.of()); - StreamOperatorFactory operatorFactory = - SimpleOperatorFactory.of( - new GlutenStreamSource( - new GlutenVectorSourceFunction( - new StatefulPlanNode(kafkaScan.getId(), kafkaScan), - Map.of(kafkaScan.getId(), outputType), - kafkaScan.getId(), - connectorSplit))); - streamGraph.addLegacySource( - transformationId, - slotSharingGroup, - transformation.getCoLocationGroupKey(), - operatorFactory, - null, - transformation.getOutputType(), - "Source: " + transformation.getName()); } else { SourceOperatorFactory operatorFactory = new SourceOperatorFactory<>( From 2909b250b89e7425a78dd10d2c3cd7909308829b Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Thu, 16 Oct 2025 01:48:21 +0000 Subject: [PATCH 03/17] remove useless changes --- .../src/main/java/org/apache/gluten/rexnode/WindowUtils.java | 1 - 1 file changed, 1 deletion(-) diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/WindowUtils.java b/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/WindowUtils.java index 5a419509e93..021969f2a26 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/WindowUtils.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/WindowUtils.java @@ -38,7 +38,6 @@ /** Utility to store some useful functions. 
*/ public class WindowUtils { - private static final Logger LOG = LoggerFactory.getLogger(WindowUtils.class); // Get names for project node. public static Tuple5 extractWindowParameters( WindowingStrategy windowing) { From 332fb11ba7d615037a92124ecf6e131fdabba3d7 Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Wed, 11 Jun 2025 10:49:59 +0000 Subject: [PATCH 04/17] support kafka data source --- .github/workflows/flink.yml | 2 +- gluten-flink/docs/Flink.md | 2 +- .../nodes/exec/common/CommonExecSink.java | 5 +- .../stream/StreamExecTableSourceScan.java | 8 +- ...er.java => FromElementsSourceFactory.java} | 31 +++- .../gluten/velox/KafkaSourceSinkFactory.java | 141 +++++++++++++++ .../gluten/velox/NexmarkSourceFactory.java | 107 +++++++++++ ...SinkBuilder.java => PrintSinkFactory.java} | 25 ++- ...apache.gluten.velox.VeloxSourceSinkFactory | 4 + .../SourceTransformationTranslator.java | 168 ------------------ .../gluten/velox/VeloxSourceSinkFactory.java | 58 ++++++ gluten-flink/ut/pom.xml | 30 ++++ .../runtime/stream/custom/NexmarkTest.java | 102 +++++++++-- .../src/test/resources/nexmark/ddl_kafka.sql | 46 +++++ .../src/test/resources/nexmark/ddl_views.sql | 6 +- 15 files changed, 533 insertions(+), 202 deletions(-) rename gluten-flink/planner/src/main/java/org/apache/gluten/velox/{VeloxSourceBuilder.java => FromElementsSourceFactory.java} (81%) create mode 100644 gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java create mode 100644 gluten-flink/planner/src/main/java/org/apache/gluten/velox/NexmarkSourceFactory.java rename gluten-flink/planner/src/main/java/org/apache/gluten/velox/{VeloxSinkBuilder.java => PrintSinkFactory.java} (86%) create mode 100644 gluten-flink/planner/src/main/resources/META-INF/services/org.apache.gluten.velox.VeloxSourceSinkFactory delete mode 100644 gluten-flink/runtime/src/main/java/org/apache/flink/streaming/runtime/translators/SourceTransformationTranslator.java create mode 100644 
gluten-flink/runtime/src/main/java/org/apache/gluten/velox/VeloxSourceSinkFactory.java create mode 100644 gluten-flink/ut/src/test/resources/nexmark/ddl_kafka.sql diff --git a/.github/workflows/flink.yml b/.github/workflows/flink.yml index 4c0f1fdd4e8..099ee734389 100644 --- a/.github/workflows/flink.yml +++ b/.github/workflows/flink.yml @@ -61,7 +61,7 @@ jobs: sudo yum install https://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/Packages/tzdata-2025a-1.el9.noarch.rpm -y sudo .github/workflows/util/install-flink-resources.sh git clone -b gluten-0530 https://github.com/bigo-sg/velox4j.git - cd velox4j && git reset --hard 14eea127c5088f972cdf1ca0987fd95429485a0e + cd velox4j && git reset --hard 1753fa68f71d8a1a0df2d4a0ff346ae00e973e9c git apply $GITHUB_WORKSPACE/gluten-flink/patches/fix-velox4j.patch mvn clean install -DskipTests -Dgpg.skip -Dspotless.skip=true cd .. diff --git a/gluten-flink/docs/Flink.md b/gluten-flink/docs/Flink.md index bc3e2ed0924..a73b56bb32f 100644 --- a/gluten-flink/docs/Flink.md +++ b/gluten-flink/docs/Flink.md @@ -48,7 +48,7 @@ As some features have not been committed to upstream, you have to use the follow ## fetch velox4j code git clone -b gluten-0530 https://github.com/bigo-sg/velox4j.git cd velox4j -git reset --hard 14eea127c5088f972cdf1ca0987fd95429485a0e +git reset --hard 1753fa68f71d8a1a0df2d4a0ff346ae00e973e9c mvn clean install -DskipTests -Dgpg.skip -Dspotless.skip=true ``` **Get gluten** diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java index f11d3f79587..efd23619688 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java @@ -19,7 +19,7 @@ import 
org.apache.gluten.table.runtime.operators.GlutenOneInputOperator; import org.apache.gluten.util.LogicalTypeConverter; import org.apache.gluten.util.PlanNodeIdGenerator; -import org.apache.gluten.velox.VeloxSinkBuilder; +import org.apache.gluten.velox.VeloxSourceSinkFactory; import org.apache.flink.api.common.io.OutputFormat; import org.apache.flink.api.dag.Transformation; @@ -470,7 +470,8 @@ public Optional generateUid(String name) { Transformation sinkTransformation = createSinkFunctionTransformation( sinkFunction, env, inputTransform, rowtimeFieldIndex, sinkMeta, sinkParallelism); - return VeloxSinkBuilder.build(env.getConfiguration(), sinkTransformation); + return VeloxSourceSinkFactory.getFactory(sinkTransformation) + .buildSink(env.getConfiguration(), sinkTransformation); // --- End Gluten-specific code changes --- } else if (runtimeProvider instanceof OutputFormatProvider) { OutputFormat outputFormat = diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java index 07189d5f5ee..0d7f7533f2e 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java @@ -16,7 +16,7 @@ */ package org.apache.flink.table.planner.plan.nodes.exec.stream; -import org.apache.gluten.velox.VeloxSourceBuilder; +import org.apache.gluten.velox.VeloxSourceSinkFactory; import org.apache.flink.FlinkVersion; import org.apache.flink.api.common.io.InputFormat; @@ -106,7 +106,11 @@ protected Transformation translateToPlanInternal( .getScanTableSource( planner.getFlinkContext(), ShortcutUtils.unwrapTypeFactory(planner)); Transformation sourceTransformation = super.translateToPlanInternal(planner, config); - 
return VeloxSourceBuilder.build(sourceTransformation, tableSource); + VeloxSourceSinkFactory factory = VeloxSourceSinkFactory.getFactory(sourceTransformation); + return factory.buildSource( + sourceTransformation, + tableSource, + planner.getExecEnv().getCheckpointConfig().isCheckpointingEnabled()); // --- End Gluten-specific code changes --- } } diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/VeloxSourceBuilder.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/FromElementsSourceFactory.java similarity index 81% rename from gluten-flink/planner/src/main/java/org/apache/gluten/velox/VeloxSourceBuilder.java rename to gluten-flink/planner/src/main/java/org/apache/gluten/velox/FromElementsSourceFactory.java index 7c56bef7299..5a5ad79aeb9 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/VeloxSourceBuilder.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/FromElementsSourceFactory.java @@ -28,6 +28,8 @@ import io.github.zhztheplayer.velox4j.plan.TableScanNode; import org.apache.flink.api.dag.Transformation; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.operators.StreamSource; import org.apache.flink.streaming.api.transformations.LegacySourceTransformation; import org.apache.flink.table.connector.source.ScanTableSource; import org.apache.flink.table.data.RowData; @@ -40,22 +42,27 @@ import java.util.List; import java.util.Map; -public class VeloxSourceBuilder { +public class FromElementsSourceFactory implements VeloxSourceSinkFactory { - public static Transformation build( - Transformation transformation, ScanTableSource scanTableSource) { + @SuppressWarnings("rawtypes") + @Override + public boolean match(Transformation transformation) { if (transformation instanceof LegacySourceTransformation) { - if (scanTableSource.getClass().getSimpleName().equals("TestValuesScanLookupTableSource")) { - return buildFromElementsSource(transformation, 
scanTableSource); + StreamSource source = ((LegacySourceTransformation) transformation).getOperator(); + String sourceFunctionName = source.getUserFunction().getClass().getSimpleName(); + if (sourceFunctionName.equals("FromElementsFunction")) { + return true; } } - return transformation; + return false; } - /** `FromElementsSource` is designed for ut tests, and we map it to velox source. */ @SuppressWarnings({"rawtypes", "unchecked"}) - private static Transformation buildFromElementsSource( - Transformation transformation, ScanTableSource tableSource) { + @Override + public Transformation buildSource( + Transformation transformation, + ScanTableSource tableSource, + boolean checkpointEnabled) { LegacySourceTransformation sourceTransformation = (LegacySourceTransformation) transformation; try { @@ -106,4 +113,10 @@ private static Transformation buildFromElementsSource( throw new FlinkRuntimeException(e); } } + + @Override + public Transformation buildSink( + ReadableConfig config, Transformation transformation) { + throw new FlinkRuntimeException("Unimplemented method 'buildSink'"); + } } diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java new file mode 100644 index 00000000000..c9b20d4c33e --- /dev/null +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.velox; + +import org.apache.gluten.streaming.api.operators.GlutenStreamSource; +import org.apache.gluten.table.runtime.operators.GlutenVectorSourceFunction; +import org.apache.gluten.util.LogicalTypeConverter; +import org.apache.gluten.util.PlanNodeIdGenerator; +import org.apache.gluten.util.ReflectUtils; + +import io.github.zhztheplayer.velox4j.connector.KafkaConnectorSplit; +import io.github.zhztheplayer.velox4j.connector.KafkaTableHandle; +import io.github.zhztheplayer.velox4j.plan.StatefulPlanNode; +import io.github.zhztheplayer.velox4j.plan.TableScanNode; +import io.github.zhztheplayer.velox4j.type.RowType; + +import org.apache.flink.api.connector.source.Source; +import org.apache.flink.api.dag.Transformation; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.transformations.LegacySourceTransformation; +import org.apache.flink.streaming.api.transformations.SourceTransformation; +import org.apache.flink.table.connector.format.DecodingFormat; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.util.FlinkRuntimeException; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; +import java.util.UUID; + +public class KafkaSourceSinkFactory implements VeloxSourceSinkFactory { + + @SuppressWarnings("rawtypes") + @Override + public boolean match(Transformation transformation) { + if 
(transformation instanceof SourceTransformation) { + Source source = ((SourceTransformation) transformation).getSource(); + return source.getClass().getSimpleName().equals("KafkaSource"); + } + return false; + } + + @SuppressWarnings({"unchecked", "rawtypes"}) + @Override + public Transformation buildSource( + Transformation transformation, + ScanTableSource tableSource, + boolean checkpointEnabled) { + RowType outputType = + (RowType) + LogicalTypeConverter.toVLType( + ((InternalTypeInfo) transformation.getOutputType()).toLogicalType()); + try { + Class tableSourceClazz = + Class.forName("org.apache.flink.streaming.connectors.kafka.table.KafkaDynamicSource"); + Properties properties = + (Properties) ReflectUtils.getObjectField(tableSourceClazz, tableSource, "properties"); + List topics = + (List) ReflectUtils.getObjectField(tableSourceClazz, tableSource, "topics"); + DecodingFormat decodingFormat = + (DecodingFormat) + ReflectUtils.getObjectField(tableSourceClazz, tableSource, "valueDecodingFormat"); + String startupMode = + String.valueOf(ReflectUtils.getObjectField(tableSourceClazz, tableSource, "startupMode")); + String connectorId = "connector-kafka"; + String planId = PlanNodeIdGenerator.newId(); + String topic = topics.get(0); + String format = + decodingFormat.getClass().getName().contains("JsonFormatFactory") ? "json" : "raw"; + Map kafkaTableParameters = new HashMap(); + for (String key : properties.stringPropertyNames()) { + kafkaTableParameters.put(key, properties.getProperty(key)); + } + kafkaTableParameters.put("topic", topic); + kafkaTableParameters.put("format", format); + kafkaTableParameters.put( + "scan.startup.mode", + startupMode.equals("LATEST") + ? "latest-offsets" + : startupMode.equals("EARLIEST") ? "earliest-offsets" : "group-offsets"); + kafkaTableParameters.put("enable.auto.commit", checkpointEnabled ? 
"false" : "true"); + kafkaTableParameters.put( + "client.id", + properties.getProperty("client.id.prefix", connectorId) + "-" + UUID.randomUUID()); + KafkaTableHandle kafkaTableHandle = + new KafkaTableHandle(connectorId, topic, outputType, kafkaTableParameters); + KafkaConnectorSplit connectorSplit = + new KafkaConnectorSplit( + connectorId, + 0, + false, + kafkaTableParameters.get("bootstrap.servers"), + kafkaTableParameters.get("group.id"), + format, + Boolean.valueOf(kafkaTableParameters.getOrDefault("enable.auto.commit", "false")), + "latest", + List.of()); + TableScanNode kafkaScan = new TableScanNode(planId, outputType, kafkaTableHandle, List.of()); + GlutenStreamSource sourceOp = + new GlutenStreamSource( + new GlutenVectorSourceFunction( + new StatefulPlanNode(kafkaScan.getId(), kafkaScan), + Map.of(kafkaScan.getId(), outputType), + kafkaScan.getId(), + connectorSplit)); + SourceTransformation sourceTransformation = (SourceTransformation) transformation; + return new LegacySourceTransformation( + sourceTransformation.getName(), + sourceOp, + transformation.getOutputType(), + sourceTransformation.getParallelism(), + sourceTransformation.getBoundedness(), + false); + } catch (Exception e) { + throw new FlinkRuntimeException(e); + } + } + + @Override + public Transformation buildSink( + ReadableConfig config, Transformation transformation) { + throw new FlinkRuntimeException("Unimplemented method 'buildSink'"); + } +} diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/NexmarkSourceFactory.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/NexmarkSourceFactory.java new file mode 100644 index 00000000000..07ce6089e6f --- /dev/null +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/NexmarkSourceFactory.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.velox; + +import org.apache.gluten.streaming.api.operators.GlutenStreamSource; +import org.apache.gluten.table.runtime.operators.GlutenVectorSourceFunction; +import org.apache.gluten.util.LogicalTypeConverter; +import org.apache.gluten.util.PlanNodeIdGenerator; +import org.apache.gluten.util.ReflectUtils; + +import io.github.zhztheplayer.velox4j.connector.NexmarkConnectorSplit; +import io.github.zhztheplayer.velox4j.connector.NexmarkTableHandle; +import io.github.zhztheplayer.velox4j.plan.PlanNode; +import io.github.zhztheplayer.velox4j.plan.StatefulPlanNode; +import io.github.zhztheplayer.velox4j.plan.TableScanNode; +import io.github.zhztheplayer.velox4j.type.RowType; + +import org.apache.flink.api.dag.Transformation; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.transformations.LegacySourceTransformation; +import org.apache.flink.streaming.api.transformations.SourceTransformation; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; + +import java.util.List; +import java.util.Map; + +public class NexmarkSourceFactory implements VeloxSourceSinkFactory { + + 
@SuppressWarnings("rawtypes") + @Override + public boolean match(Transformation transformation) { + if (transformation instanceof SourceTransformation) { + Class sourceClazz = ((SourceTransformation) transformation).getSource().getClass(); + return sourceClazz.getSimpleName().equals("NexmarkSource"); + } + return false; + } + + @SuppressWarnings({"rawtypes", "unchecked"}) + @Override + public Transformation buildSource( + Transformation transformation, + ScanTableSource tableSource, + boolean checkpointEnabled) { + RowType outputType = + (RowType) + LogicalTypeConverter.toVLType( + ((InternalTypeInfo) transformation.getOutputType()).toLogicalType()); + Object nexmarkSource = ((SourceTransformation) transformation).getSource(); + String id = PlanNodeIdGenerator.newId(); + List nexmarkSourceSplits = + (List) + ReflectUtils.invokeObjectMethod( + nexmarkSource.getClass(), + nexmarkSource, + "getSplits", + new Class[] {int.class}, + new Object[] {transformation.getParallelism()}); + Object nexmarkSourceSplit = nexmarkSourceSplits.get(0); + Object generatorConfig = + ReflectUtils.getObjectField( + nexmarkSourceSplit.getClass(), nexmarkSourceSplit, "generatorConfig"); + Long maxEvents = + (Long) + ReflectUtils.getObjectField(generatorConfig.getClass(), generatorConfig, "maxEvents"); + PlanNode tableScan = + new TableScanNode(id, outputType, new NexmarkTableHandle("connector-nexmark"), List.of()); + GlutenStreamSource sourceOp = + new GlutenStreamSource( + new GlutenVectorSourceFunction( + new StatefulPlanNode(tableScan.getId(), tableScan), + Map.of(id, outputType), + id, + new NexmarkConnectorSplit( + "connector-nexmark", + maxEvents > Integer.MAX_VALUE ? 
Integer.MAX_VALUE : maxEvents.intValue()))); + return new LegacySourceTransformation( + transformation.getName(), + sourceOp, + transformation.getOutputType(), + transformation.getParallelism(), + ((SourceTransformation) transformation).getBoundedness(), + false); + } + + @Override + public Transformation buildSink( + ReadableConfig config, Transformation transformation) { + throw new UnsupportedOperationException("Unimplemented method 'buildSink'"); + } +} diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/VeloxSinkBuilder.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/PrintSinkFactory.java similarity index 86% rename from gluten-flink/planner/src/main/java/org/apache/gluten/velox/VeloxSinkBuilder.java rename to gluten-flink/planner/src/main/java/org/apache/gluten/velox/PrintSinkFactory.java index f27dec0a44a..2b6554f63d9 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/VeloxSinkBuilder.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/PrintSinkFactory.java @@ -36,6 +36,8 @@ import org.apache.flink.streaming.api.operators.OneInputStreamOperator; import org.apache.flink.streaming.api.operators.SimpleOperatorFactory; import org.apache.flink.streaming.api.transformations.LegacySinkTransformation; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.data.RowData; import org.apache.flink.table.runtime.operators.sink.SinkOperator; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; import org.apache.flink.util.FlinkRuntimeException; @@ -43,9 +45,11 @@ import java.util.List; import java.util.Map; -public class VeloxSinkBuilder { +public class PrintSinkFactory implements VeloxSourceSinkFactory { - public static Transformation build(ReadableConfig config, Transformation transformation) { + @SuppressWarnings("rawtypes") + @Override + public boolean match(Transformation transformation) { if (transformation instanceof 
LegacySinkTransformation) { SimpleOperatorFactory operatorFactory = (SimpleOperatorFactory) ((LegacySinkTransformation) transformation).getOperatorFactory(); @@ -56,14 +60,23 @@ public static Transformation build(ReadableConfig config, Transformation transfo .getClass() .getSimpleName() .equals("RowDataPrintFunction")) { - return buildPrintSink(config, (LegacySinkTransformation) transformation); + return true; } } - return transformation; + return false; } - private static LegacySinkTransformation buildPrintSink( - ReadableConfig config, LegacySinkTransformation transformation) { + @Override + public Transformation buildSource( + Transformation transformation, + ScanTableSource tableSource, + boolean checkpointEnabled) { + throw new FlinkRuntimeException("Unimplemented method 'buildSource'"); + } + + @SuppressWarnings({"rawtypes", "unchecked"}) + @Override + public Transformation buildSink(ReadableConfig config, Transformation transformation) { Transformation inputTrans = (Transformation) transformation.getInputs().get(0); InternalTypeInfo inputTypeInfo = (InternalTypeInfo) inputTrans.getOutputType(); String logDir = config.get(CoreOptions.FLINK_LOG_DIR); diff --git a/gluten-flink/planner/src/main/resources/META-INF/services/org.apache.gluten.velox.VeloxSourceSinkFactory b/gluten-flink/planner/src/main/resources/META-INF/services/org.apache.gluten.velox.VeloxSourceSinkFactory new file mode 100644 index 00000000000..9d7623b7ec7 --- /dev/null +++ b/gluten-flink/planner/src/main/resources/META-INF/services/org.apache.gluten.velox.VeloxSourceSinkFactory @@ -0,0 +1,4 @@ +org.apache.gluten.velox.FromElementsSourceFactory +org.apache.gluten.velox.KafkaSourceSinkFactory +org.apache.gluten.velox.PrintSinkFactory +org.apache.gluten.velox.NexmarkSourceFactory diff --git a/gluten-flink/runtime/src/main/java/org/apache/flink/streaming/runtime/translators/SourceTransformationTranslator.java 
b/gluten-flink/runtime/src/main/java/org/apache/flink/streaming/runtime/translators/SourceTransformationTranslator.java deleted file mode 100644 index 5e7dd62f0df..00000000000 --- a/gluten-flink/runtime/src/main/java/org/apache/flink/streaming/runtime/translators/SourceTransformationTranslator.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.flink.streaming.runtime.translators; - -import org.apache.gluten.streaming.api.operators.GlutenStreamSource; -import org.apache.gluten.table.runtime.operators.GlutenVectorSourceFunction; -import org.apache.gluten.util.LogicalTypeConverter; -import org.apache.gluten.util.PlanNodeIdGenerator; -import org.apache.gluten.util.ReflectUtils; - -import io.github.zhztheplayer.velox4j.connector.NexmarkConnectorSplit; -import io.github.zhztheplayer.velox4j.connector.NexmarkTableHandle; -import io.github.zhztheplayer.velox4j.plan.PlanNode; -import io.github.zhztheplayer.velox4j.plan.StatefulPlanNode; -import io.github.zhztheplayer.velox4j.plan.TableScanNode; -import io.github.zhztheplayer.velox4j.type.RowType; - -import org.apache.flink.annotation.Internal; -import org.apache.flink.api.common.ExecutionConfig; -import org.apache.flink.api.connector.source.SourceSplit; -import org.apache.flink.streaming.api.graph.SimpleTransformationTranslator; -import org.apache.flink.streaming.api.graph.StreamGraph; -import org.apache.flink.streaming.api.graph.TransformationTranslator; -import org.apache.flink.streaming.api.operators.SimpleOperatorFactory; -import org.apache.flink.streaming.api.operators.SourceOperatorFactory; -import org.apache.flink.streaming.api.operators.StreamOperatorFactory; -import org.apache.flink.streaming.api.transformations.SourceTransformation; -import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; - -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -import static org.apache.flink.util.Preconditions.checkNotNull; - -/** - * A {@link TransformationTranslator} for the {@link SourceTransformation}. - * - * @param The type of the elements that this source produces. 
- */ -@Internal -public class SourceTransformationTranslator - extends SimpleTransformationTranslator> { - - @Override - protected Collection translateForBatchInternal( - final SourceTransformation transformation, final Context context) { - - return translateInternal( - transformation, context, false /* don't emit progressive watermarks */); - } - - @Override - protected Collection translateForStreamingInternal( - final SourceTransformation transformation, final Context context) { - - return translateInternal(transformation, context, true /* emit progressive watermarks */); - } - - private Collection translateInternal( - final SourceTransformation transformation, - final Context context, - boolean emitProgressiveWatermarks) { - checkNotNull(transformation); - checkNotNull(context); - - final StreamGraph streamGraph = context.getStreamGraph(); - final String slotSharingGroup = context.getSlotSharingGroup(); - final int transformationId = transformation.getId(); - final ExecutionConfig executionConfig = streamGraph.getExecutionConfig(); - - // --- Begin Gluten-specific code changes --- - Class sourceClazz = transformation.getSource().getClass(); - if (sourceClazz.getSimpleName().equals("NexmarkSource")) { - RowType outputType = - (RowType) - LogicalTypeConverter.toVLType( - ((InternalTypeInfo) transformation.getOutputType()).toLogicalType()); - String id = PlanNodeIdGenerator.newId(); - Object nexmarkSource = transformation.getSource(); - List nexmarkSourceSplits = - (List) - ReflectUtils.invokeObjectMethod( - sourceClazz, - nexmarkSource, - "getSplits", - new Class[] {int.class}, - new Object[] {transformation.getParallelism()}); - Object nexmarkSourceSplit = nexmarkSourceSplits.get(0); - Object generatorConfig = - ReflectUtils.getObjectField( - nexmarkSourceSplit.getClass(), nexmarkSourceSplit, "generatorConfig"); - Long maxEvents = - (Long) - ReflectUtils.getObjectField(generatorConfig.getClass(), generatorConfig, "maxEvents"); - PlanNode tableScan = - new 
TableScanNode(id, outputType, new NexmarkTableHandle("connector-nexmark"), List.of()); - StreamOperatorFactory operatorFactory = - SimpleOperatorFactory.of( - new GlutenStreamSource( - new GlutenVectorSourceFunction( - new StatefulPlanNode(tableScan.getId(), tableScan), - Map.of(id, outputType), - id, - new NexmarkConnectorSplit( - "connector-nexmark", - maxEvents > Integer.MAX_VALUE - ? Integer.MAX_VALUE - : maxEvents.intValue())))); - streamGraph.addLegacySource( - transformationId, - slotSharingGroup, - transformation.getCoLocationGroupKey(), - operatorFactory, - null, - transformation.getOutputType(), - "Source: " + transformation.getName()); - } else { - SourceOperatorFactory operatorFactory = - new SourceOperatorFactory<>( - transformation.getSource(), - transformation.getWatermarkStrategy(), - emitProgressiveWatermarks); - - operatorFactory.setChainingStrategy(transformation.getChainingStrategy()); - operatorFactory.setCoordinatorListeningID(transformation.getCoordinatorListeningID()); - - streamGraph.addSource( - transformationId, - slotSharingGroup, - transformation.getCoLocationGroupKey(), - operatorFactory, - null, - transformation.getOutputType(), - "Source: " + transformation.getName()); - } - // --- End Gluten-specific code changes --- - - final int parallelism = - transformation.getParallelism() != ExecutionConfig.PARALLELISM_DEFAULT - ? 
transformation.getParallelism() - : executionConfig.getParallelism(); - - streamGraph.setParallelism( - transformationId, parallelism, transformation.isParallelismConfigured()); - streamGraph.setMaxParallelism(transformationId, transformation.getMaxParallelism()); - - streamGraph.setSupportsConcurrentExecutionAttempts( - transformationId, transformation.isSupportsConcurrentExecutionAttempts()); - - return Collections.singleton(transformationId); - } -} diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/velox/VeloxSourceSinkFactory.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/velox/VeloxSourceSinkFactory.java new file mode 100644 index 00000000000..1946d1b90bd --- /dev/null +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/velox/VeloxSourceSinkFactory.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.velox; + +import org.apache.flink.api.dag.Transformation; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.table.connector.source.ScanTableSource; +import org.apache.flink.table.data.RowData; +import org.apache.flink.util.FlinkRuntimeException; + +import java.util.HashSet; +import java.util.ServiceLoader; +import java.util.Set; + +public interface VeloxSourceSinkFactory { + + /** Match the conditions to determine whether the operator can be offloaded to velox. */ + boolean match(Transformation transformation); + + /** Build source transformation that offload the operator to velox. */ + Transformation buildSource( + Transformation transformation, + ScanTableSource tableSource, + boolean checkpointEnabled); + + /** Build sink transformation that offload the operator to velox. */ + Transformation buildSink(ReadableConfig config, Transformation transformation); + + /** Choose the matched source/sink factory by given transformation. 
*/ + static VeloxSourceSinkFactory getFactory(Transformation transformation) { + ServiceLoader factories = + ServiceLoader.load(VeloxSourceSinkFactory.class); + Set factoryNames = new HashSet<>(); + for (VeloxSourceSinkFactory factory : factories) { + factoryNames.add(factory.getClass().getName()); + if (factory.match(transformation)) { + return factory; + } + } + throw new FlinkRuntimeException( + "Not find implemented factory to build velox transformation, available factories:" + + factoryNames); + } +} diff --git a/gluten-flink/ut/pom.xml b/gluten-flink/ut/pom.xml index 3583f7823fc..74311055a60 100644 --- a/gluten-flink/ut/pom.xml +++ b/gluten-flink/ut/pom.xml @@ -152,6 +152,36 @@ tests test + + com.salesforce.kafka.test + kafka-junit5 + 3.2.5 + test + + + org.apache.kafka + kafka_2.12 + 3.4.0 + test + + + org.apache.flink + flink-connector-kafka + 3.3.0-1.19 + test + + + org.apache.flink + flink-connector-base + ${flink.version} + test + + + org.apache.flink + flink-json + ${flink.version} + test + diff --git a/gluten-flink/ut/src/test/java/org/apache/gluten/table/runtime/stream/custom/NexmarkTest.java b/gluten-flink/ut/src/test/java/org/apache/gluten/table/runtime/stream/custom/NexmarkTest.java index 5454b293964..c527b5f28ad 100644 --- a/gluten-flink/ut/src/test/java/org/apache/gluten/table/runtime/stream/custom/NexmarkTest.java +++ b/gluten-flink/ut/src/test/java/org/apache/gluten/table/runtime/stream/custom/NexmarkTest.java @@ -18,13 +18,18 @@ import org.apache.gluten.table.runtime.stream.common.Velox4jEnvironment; +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.core.execution.JobClient; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.table.api.EnvironmentSettings; import org.apache.flink.table.api.TableResult; import org.apache.flink.table.api.bridge.java.StreamTableEnvironment; +import com.salesforce.kafka.test.junit5.SharedKafkaTestResource; +import 
com.salesforce.kafka.test.listeners.PlainListener; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.RegisterExtension; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -61,9 +66,30 @@ public class NexmarkTest { put("PERSON_PROPORTION", "1"); put("AUCTION_PROPORTION", "3"); put("BID_PROPORTION", "46"); + put("NEXMARK_TABLE", "datagen"); } }; + private static final int KAFKA_PORT = 9092; + private static String topicName = "nexmark"; + + @RegisterExtension + public static final SharedKafkaTestResource sharedKafkaTestResource = + new SharedKafkaTestResource() + .withBrokers(1) + .registerListener(new PlainListener().onPorts(KAFKA_PORT)); + + private static final Map KAFKA_VARIABLES = + new HashMap<>() { + { + put("BOOTSTRAP_SERVERS", "localhost:9092"); + put("NEXMARK_TABLE", "kafka"); + } + }; + + private static final List VIEWS = List.of("person", "auction", "bid", "B"); + private static final List FUNCTIONS = List.of("count_char"); + private static StreamTableEnvironment tEnv; @BeforeAll @@ -76,31 +102,49 @@ static void setup() { EnvironmentSettings settings = EnvironmentSettings.newInstance().inStreamingMode().build(); tEnv = StreamTableEnvironment.create(env, settings); - - setupNexmarkEnvironment(tEnv); } @Test - void testAllNexmarkQueries() throws ExecutionException, InterruptedException, TimeoutException { + void testAllNexmarkSourceQueries() + throws ExecutionException, InterruptedException, TimeoutException { + setupNexmarkEnvironment(tEnv, "ddl_gen.sql", NEXMARK_VARIABLES); List queryFiles = getQueries(); assertThat(queryFiles).isNotEmpty(); + LOG.warn("Found {} Nexmark query files: {}", queryFiles.size(), queryFiles); + for (String queryFile : queryFiles) { + LOG.warn("Executing nextmark query from file: {}", queryFile); + executeQuery(tEnv, queryFile, false); + } + clearEnvironment(tEnv); + } + + @Test + void testAllKafkaSourceQueries() + throws ExecutionException, 
InterruptedException, TimeoutException { + sharedKafkaTestResource.getKafkaTestUtils().createTopic(topicName, 1, (short) 1); + setupNexmarkEnvironment(tEnv, "ddl_kafka.sql", KAFKA_VARIABLES); + List queryFiles = getQueries(); + assertThat(queryFiles).isNotEmpty(); LOG.warn("Found {} Nexmark query files: {}", queryFiles.size(), queryFiles); for (String queryFile : queryFiles) { - LOG.warn("Executing query from file: {}", queryFile); - executeQuery(tEnv, queryFile); + LOG.warn("Executing kafka query from file:{}", queryFile); + executeQuery(tEnv, queryFile, true); } + clearEnvironment(tEnv); } - private static void setupNexmarkEnvironment(StreamTableEnvironment tEnv) { - String createNexmarkSource = readSqlFromFile(NEXMARK_RESOURCE_DIR + "/ddl_gen.sql"); - createNexmarkSource = replaceVariables(createNexmarkSource, NEXMARK_VARIABLES); + private static void setupNexmarkEnvironment( + StreamTableEnvironment tEnv, String sourceFileName, Map variables) { + String createNexmarkSource = readSqlFromFile(NEXMARK_RESOURCE_DIR + "/" + sourceFileName); + createNexmarkSource = replaceVariables(createNexmarkSource, variables); tEnv.executeSql(createNexmarkSource); String createTableView = readSqlFromFile(NEXMARK_RESOURCE_DIR + "/ddl_views.sql"); String[] sqlTableView = createTableView.split(";"); for (String sql : sqlTableView) { + sql = replaceVariables(sql, variables); String trimmedSql = sql.trim(); if (!trimmedSql.isEmpty()) { tEnv.executeSql(trimmedSql); @@ -116,7 +160,23 @@ private static String replaceVariables(String sql, Map variables return result; } - private void executeQuery(StreamTableEnvironment tEnv, String queryFileName) + private static void clearEnvironment(StreamTableEnvironment tEnv) { + for (int i = 0; i <= 22; ++i) { + String tableName = "nexmark_q" + i; + String sql = String.format("drop table if exists %s", tableName); + tEnv.executeSql(sql); + } + for (String view : VIEWS) { + String sql = String.format("drop view if exists %s", view); + 
tEnv.executeSql(sql); + } + for (String func : FUNCTIONS) { + String sql = String.format("drop function if exists %s", func); + tEnv.executeSql(sql); + } + } + + private void executeQuery(StreamTableEnvironment tEnv, String queryFileName, boolean kafkaSource) throws ExecutionException, InterruptedException, TimeoutException { String queryContent = readSqlFromFile(NEXMARK_RESOURCE_DIR + "/" + queryFileName); @@ -136,7 +196,11 @@ private void executeQuery(StreamTableEnvironment tEnv, String queryFileName) String insertQuery = sqlStatements[sqlStatements.length - 2].trim(); if (!insertQuery.isEmpty()) { TableResult insertResult = tEnv.executeSql(insertQuery); - waitForJobCompletion(insertResult, 30000); + if (kafkaSource) { + assertThat(checkJobRunningStatus(insertResult, 30000) == true); + } else { + waitForJobCompletion(insertResult, 30000); + } } assertTrue(sqlStatements[sqlStatements.length - 1].trim().isEmpty()); } @@ -147,6 +211,24 @@ private void waitForJobCompletion(TableResult result, long timeoutMs) result.getJobClient().get().getJobExecutionResult().get(timeoutMs, TimeUnit.MILLISECONDS); } + private boolean checkJobRunningStatus(TableResult result, long timeoutMs) + throws InterruptedException { + long startTime = System.currentTimeMillis(); + assertTrue(result.getJobClient().isPresent()); + JobClient jobClient = result.getJobClient().get(); + while (System.currentTimeMillis() < startTime + timeoutMs) { + if (jobClient.getJobStatus().complete(JobStatus.RUNNING)) { + jobClient.cancel(); + return true; + } else { + Thread.sleep(1000); + } + } + LOG.warn("Job not running in " + timeoutMs + " millseconds."); + jobClient.cancel(); + return false; + } + private List getQueries() { URL resourceUrl = getClass().getClassLoader().getResource(NEXMARK_RESOURCE_DIR); diff --git a/gluten-flink/ut/src/test/resources/nexmark/ddl_kafka.sql b/gluten-flink/ut/src/test/resources/nexmark/ddl_kafka.sql new file mode 100644 index 00000000000..28affcdd348 --- /dev/null +++ 
b/gluten-flink/ut/src/test/resources/nexmark/ddl_kafka.sql @@ -0,0 +1,46 @@ +CREATE TABLE kafka ( + event_type int, + person ROW< + id BIGINT, + name VARCHAR, + emailAddress VARCHAR, + creditCard VARCHAR, + city VARCHAR, + state VARCHAR, + `dateTime` TIMESTAMP(3), + extra VARCHAR>, + auction ROW< + id BIGINT, + itemName VARCHAR, + description VARCHAR, + initialBid BIGINT, + reserve BIGINT, + `dateTime` TIMESTAMP(3), + expires TIMESTAMP(3), + seller BIGINT, + category BIGINT, + extra VARCHAR>, + bid ROW< + auction BIGINT, + bidder BIGINT, + price BIGINT, + channel VARCHAR, + url VARCHAR, + `dateTime` TIMESTAMP(3), + extra VARCHAR>, + `dateTime` AS + CASE + WHEN event_type = 0 THEN person.`dateTime` + WHEN event_type = 1 THEN auction.`dateTime` + ELSE bid.`dateTime` + END, + WATERMARK FOR `dateTime` AS `dateTime` - INTERVAL '4' SECOND +) WITH ( + 'connector' = 'kafka', + 'topic' = 'nexmark', + 'properties.bootstrap.servers' = '${BOOTSTRAP_SERVERS}', + 'properties.group.id' = 'nexmark', + 'scan.startup.mode' = 'earliest-offset', + 'sink.partitioner' = 'round-robin', + 'format' = 'json' +); \ No newline at end of file diff --git a/gluten-flink/ut/src/test/resources/nexmark/ddl_views.sql b/gluten-flink/ut/src/test/resources/nexmark/ddl_views.sql index 54902b4b44a..36f368dd92d 100644 --- a/gluten-flink/ut/src/test/resources/nexmark/ddl_views.sql +++ b/gluten-flink/ut/src/test/resources/nexmark/ddl_views.sql @@ -8,7 +8,7 @@ SELECT person.state, `dateTime`, person.extra -FROM datagen WHERE event_type = 0; +FROM ${NEXMARK_TABLE} WHERE event_type = 0; CREATE VIEW auction AS SELECT @@ -22,7 +22,7 @@ SELECT auction.seller, auction.category, auction.extra -FROM datagen WHERE event_type = 1; +FROM ${NEXMARK_TABLE} WHERE event_type = 1; CREATE VIEW bid AS SELECT @@ -33,4 +33,4 @@ SELECT bid.url, `dateTime`, bid.extra -FROM datagen WHERE event_type = 2; +FROM ${NEXMARK_TABLE} WHERE event_type = 2; From 992023a90f9f997ab387f1e24971d7e9fbda42a6 Mon Sep 17 00:00:00 2001 From: 
zouyunhe Date: Tue, 9 Dec 2025 03:54:23 +0000 Subject: [PATCH 05/17] fix reviews --- .../nodes/exec/common/CommonExecSink.java | 3 +- .../stream/StreamExecTableSourceScan.java | 3 +- .../velox/FromElementsSourceFactory.java | 10 ++-- .../gluten/velox/KafkaSourceSinkFactory.java | 4 +- .../gluten/velox/NexmarkSourceFactory.java | 4 +- .../apache/gluten/velox/PrintSinkFactory.java | 4 +- .../gluten/velox/VeloxSourceSinkFactory.java | 57 ++++++++++++++----- .../runtime/stream/custom/NexmarkTest.java | 4 +- .../src/test/resources/nexmark/ddl_kafka.sql | 2 +- 9 files changed, 59 insertions(+), 32 deletions(-) diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java index efd23619688..a774da12492 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java @@ -470,8 +470,7 @@ public Optional generateUid(String name) { Transformation sinkTransformation = createSinkFunctionTransformation( sinkFunction, env, inputTransform, rowtimeFieldIndex, sinkMeta, sinkParallelism); - return VeloxSourceSinkFactory.getFactory(sinkTransformation) - .buildSink(env.getConfiguration(), sinkTransformation); + return VeloxSourceSinkFactory.buildSink(env.getConfiguration(), sinkTransformation); // --- End Gluten-specific code changes --- } else if (runtimeProvider instanceof OutputFormatProvider) { OutputFormat outputFormat = diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java index 0d7f7533f2e..23143d60ace 100644 --- 
a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java @@ -106,8 +106,7 @@ protected Transformation translateToPlanInternal( .getScanTableSource( planner.getFlinkContext(), ShortcutUtils.unwrapTypeFactory(planner)); Transformation sourceTransformation = super.translateToPlanInternal(planner, config); - VeloxSourceSinkFactory factory = VeloxSourceSinkFactory.getFactory(sourceTransformation); - return factory.buildSource( + return VeloxSourceSinkFactory.buildSource( sourceTransformation, tableSource, planner.getExecEnv().getCheckpointConfig().isCheckpointingEnabled()); diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/FromElementsSourceFactory.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/FromElementsSourceFactory.java index 5a5ad79aeb9..70d6dcd87e7 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/FromElementsSourceFactory.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/FromElementsSourceFactory.java @@ -29,6 +29,7 @@ import org.apache.flink.api.dag.Transformation; import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.functions.source.FromElementsFunction; import org.apache.flink.streaming.api.operators.StreamSource; import org.apache.flink.streaming.api.transformations.LegacySourceTransformation; import org.apache.flink.table.connector.source.ScanTableSource; @@ -49,17 +50,14 @@ public class FromElementsSourceFactory implements VeloxSourceSinkFactory { public boolean match(Transformation transformation) { if (transformation instanceof LegacySourceTransformation) { StreamSource source = ((LegacySourceTransformation) transformation).getOperator(); - String sourceFunctionName = source.getUserFunction().getClass().getSimpleName(); - if 
(sourceFunctionName.equals("FromElementsFunction")) { - return true; - } + return source.getUserFunction() instanceof FromElementsFunction; } return false; } @SuppressWarnings({"rawtypes", "unchecked"}) @Override - public Transformation buildSource( + public Transformation buildVeloxSource( Transformation transformation, ScanTableSource tableSource, boolean checkpointEnabled) { @@ -115,7 +113,7 @@ public Transformation buildSource( } @Override - public Transformation buildSink( + public Transformation buildVeloxSink( ReadableConfig config, Transformation transformation) { throw new FlinkRuntimeException("Unimplemented method 'buildSink'"); } diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java index c9b20d4c33e..f840cdce805 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java @@ -59,7 +59,7 @@ public boolean match(Transformation transformation) { @SuppressWarnings({"unchecked", "rawtypes"}) @Override - public Transformation buildSource( + public Transformation buildVeloxSource( Transformation transformation, ScanTableSource tableSource, boolean checkpointEnabled) { @@ -134,7 +134,7 @@ public Transformation buildSource( } @Override - public Transformation buildSink( + public Transformation buildVeloxSink( ReadableConfig config, Transformation transformation) { throw new FlinkRuntimeException("Unimplemented method 'buildSink'"); } diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/NexmarkSourceFactory.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/NexmarkSourceFactory.java index 07ce6089e6f..c7598c81f0a 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/NexmarkSourceFactory.java +++ 
b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/NexmarkSourceFactory.java @@ -54,7 +54,7 @@ public boolean match(Transformation transformation) { @SuppressWarnings({"rawtypes", "unchecked"}) @Override - public Transformation buildSource( + public Transformation buildVeloxSource( Transformation transformation, ScanTableSource tableSource, boolean checkpointEnabled) { @@ -100,7 +100,7 @@ public Transformation buildSource( } @Override - public Transformation buildSink( + public Transformation buildVeloxSink( ReadableConfig config, Transformation transformation) { throw new UnsupportedOperationException("Unimplemented method 'buildSink'"); } diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/PrintSinkFactory.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/PrintSinkFactory.java index 2b6554f63d9..b6415c84397 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/PrintSinkFactory.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/PrintSinkFactory.java @@ -67,7 +67,7 @@ public boolean match(Transformation transformation) { } @Override - public Transformation buildSource( + public Transformation buildVeloxSource( Transformation transformation, ScanTableSource tableSource, boolean checkpointEnabled) { @@ -76,7 +76,7 @@ public Transformation buildSource( @SuppressWarnings({"rawtypes", "unchecked"}) @Override - public Transformation buildSink(ReadableConfig config, Transformation transformation) { + public Transformation buildVeloxSink(ReadableConfig config, Transformation transformation) { Transformation inputTrans = (Transformation) transformation.getInputs().get(0); InternalTypeInfo inputTypeInfo = (InternalTypeInfo) inputTrans.getOutputType(); String logDir = config.get(CoreOptions.FLINK_LOG_DIR); diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/velox/VeloxSourceSinkFactory.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/velox/VeloxSourceSinkFactory.java 
index 1946d1b90bd..07e8593e9db 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/velox/VeloxSourceSinkFactory.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/velox/VeloxSourceSinkFactory.java @@ -20,39 +20,70 @@ import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.table.connector.source.ScanTableSource; import org.apache.flink.table.data.RowData; -import org.apache.flink.util.FlinkRuntimeException; -import java.util.HashSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Optional; import java.util.ServiceLoader; -import java.util.Set; public interface VeloxSourceSinkFactory { - /** Match the conditions to determine whether the operator can be offloaded to velox. */ + static final Logger LOG = LoggerFactory.getLogger(VeloxSourceSinkFactory.class); + + /** Match the conditions to determine if the operator can be offloaded to velox. */ boolean match(Transformation transformation); /** Build source transformation that offload the operator to velox. */ - Transformation buildSource( + Transformation buildVeloxSource( Transformation transformation, ScanTableSource tableSource, boolean checkpointEnabled); /** Build sink transformation that offload the operator to velox. */ - Transformation buildSink(ReadableConfig config, Transformation transformation); + Transformation buildVeloxSink( + ReadableConfig config, Transformation transformation); /** Choose the matched source/sink factory by given transformation. 
*/ - static VeloxSourceSinkFactory getFactory(Transformation transformation) { + private static Optional getFactory( + Transformation transformation) { ServiceLoader factories = ServiceLoader.load(VeloxSourceSinkFactory.class); - Set factoryNames = new HashSet<>(); for (VeloxSourceSinkFactory factory : factories) { - factoryNames.add(factory.getClass().getName()); if (factory.match(transformation)) { - return factory; + return Optional.of(factory); } } - throw new FlinkRuntimeException( - "Not find implemented factory to build velox transformation, available factories:" - + factoryNames); + return Optional.empty(); + } + + /** Build Velox source, or fallback to flink orignal source . */ + static Transformation buildSource( + Transformation transformation, + ScanTableSource tableSource, + boolean checkpointEnabled) { + Optional factory = getFactory(transformation); + if (factory.isEmpty()) { + LOG.warn( + "Not find matched factory to build velox source transformation, and we will use flink original transformation {} instead.", + transformation.getClass().getName()); + return transformation; + } else { + return factory.get().buildVeloxSource(transformation, tableSource, checkpointEnabled); + } + } + + /** Build Velox sink, or fallback to flink original sink. 
*/ + static Transformation buildSink( + ReadableConfig config, Transformation transformation) { + Optional factory = getFactory(transformation); + if (factory.isEmpty()) { + LOG.warn( + "Not find matched factory to build velox sink transformation, and we will use flink original transformation {} instead.", + transformation.getClass().getName()); + return transformation; + } else { + return factory.get().buildVeloxSink(config, transformation); + } } } diff --git a/gluten-flink/ut/src/test/java/org/apache/gluten/table/runtime/stream/custom/NexmarkTest.java b/gluten-flink/ut/src/test/java/org/apache/gluten/table/runtime/stream/custom/NexmarkTest.java index c527b5f28ad..476c4cba4d0 100644 --- a/gluten-flink/ut/src/test/java/org/apache/gluten/table/runtime/stream/custom/NexmarkTest.java +++ b/gluten-flink/ut/src/test/java/org/apache/gluten/table/runtime/stream/custom/NexmarkTest.java @@ -74,7 +74,7 @@ public class NexmarkTest { private static String topicName = "nexmark"; @RegisterExtension - public static final SharedKafkaTestResource sharedKafkaTestResource = + public static final SharedKafkaTestResource kafkaInstance = new SharedKafkaTestResource() .withBrokers(1) .registerListener(new PlainListener().onPorts(KAFKA_PORT)); @@ -122,7 +122,7 @@ void testAllNexmarkSourceQueries() @Test void testAllKafkaSourceQueries() throws ExecutionException, InterruptedException, TimeoutException { - sharedKafkaTestResource.getKafkaTestUtils().createTopic(topicName, 1, (short) 1); + kafkaInstance.getKafkaTestUtils().createTopic(topicName, 1, (short) 1); setupNexmarkEnvironment(tEnv, "ddl_kafka.sql", KAFKA_VARIABLES); List queryFiles = getQueries(); assertThat(queryFiles).isNotEmpty(); diff --git a/gluten-flink/ut/src/test/resources/nexmark/ddl_kafka.sql b/gluten-flink/ut/src/test/resources/nexmark/ddl_kafka.sql index 28affcdd348..27757eaeafc 100644 --- a/gluten-flink/ut/src/test/resources/nexmark/ddl_kafka.sql +++ b/gluten-flink/ut/src/test/resources/nexmark/ddl_kafka.sql @@ -43,4 
+43,4 @@ CREATE TABLE kafka ( 'scan.startup.mode' = 'earliest-offset', 'sink.partitioner' = 'round-robin', 'format' = 'json' -); \ No newline at end of file +); From 9b5739abc4a654a2ebb93d9e9acc33add5dfc33b Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Fri, 12 Dec 2025 02:18:51 +0000 Subject: [PATCH 06/17] optimize code --- .../nodes/exec/common/CommonExecSink.java | 4 +++- .../stream/StreamExecTableSourceScan.java | 8 ++++++-- .../velox/FromElementsSourceFactory.java | 9 ++++----- .../gluten/velox/KafkaSourceSinkFactory.java | 10 +++++----- .../gluten/velox/NexmarkSourceFactory.java | 8 ++------ .../apache/gluten/velox/PrintSinkFactory.java | 11 +++++------ .../gluten/velox/VeloxSourceSinkFactory.java | 19 +++++++------------ 7 files changed, 32 insertions(+), 37 deletions(-) diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java index a774da12492..a1b0e0e7b79 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java @@ -24,6 +24,7 @@ import org.apache.flink.api.common.io.OutputFormat; import org.apache.flink.api.dag.Transformation; import org.apache.flink.api.java.typeutils.InputTypeConfigurable; +import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.runtime.state.KeyGroupRangeAssignment; import org.apache.flink.streaming.api.datastream.CustomSinkOperatorUidHashes; @@ -470,7 +471,8 @@ public Optional generateUid(String name) { Transformation sinkTransformation = createSinkFunctionTransformation( sinkFunction, env, inputTransform, rowtimeFieldIndex, sinkMeta, sinkParallelism); - return 
VeloxSourceSinkFactory.buildSink(env.getConfiguration(), sinkTransformation); + return VeloxSourceSinkFactory.buildSink( + sinkTransformation, Map.of(Configuration.class.getName(), env.getConfiguration())); // --- End Gluten-specific code changes --- } else if (runtimeProvider instanceof OutputFormatProvider) { OutputFormat outputFormat = diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java index 23143d60ace..90b3981f0f8 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java @@ -40,6 +40,7 @@ import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonProperty; import java.util.Collections; +import java.util.Map; /** * Stream {@link ExecNode} to read data from an external source defined by a {@link @@ -108,8 +109,11 @@ protected Transformation translateToPlanInternal( Transformation sourceTransformation = super.translateToPlanInternal(planner, config); return VeloxSourceSinkFactory.buildSource( sourceTransformation, - tableSource, - planner.getExecEnv().getCheckpointConfig().isCheckpointingEnabled()); + Map.of( + ScanTableSource.class.getName(), + tableSource, + "checkpoint.enabled", + planner.getExecEnv().getCheckpointConfig().isCheckpointingEnabled())); // --- End Gluten-specific code changes --- } } diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/FromElementsSourceFactory.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/FromElementsSourceFactory.java index 70d6dcd87e7..da31edeccd1 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/FromElementsSourceFactory.java +++ 
b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/FromElementsSourceFactory.java @@ -28,7 +28,6 @@ import io.github.zhztheplayer.velox4j.plan.TableScanNode; import org.apache.flink.api.dag.Transformation; -import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.streaming.api.functions.source.FromElementsFunction; import org.apache.flink.streaming.api.operators.StreamSource; import org.apache.flink.streaming.api.transformations.LegacySourceTransformation; @@ -58,12 +57,12 @@ public boolean match(Transformation transformation) { @SuppressWarnings({"rawtypes", "unchecked"}) @Override public Transformation buildVeloxSource( - Transformation transformation, - ScanTableSource tableSource, - boolean checkpointEnabled) { + Transformation transformation, Map parameters) { LegacySourceTransformation sourceTransformation = (LegacySourceTransformation) transformation; try { + ScanTableSource tableSource = + (ScanTableSource) parameters.get(ScanTableSource.class.getName()); Class tableSourceClazz = Class.forName( "org.apache.flink.table.planner.factories.TestValuesTableFactory$TestValuesScanTableSourceWithoutProjectionPushDown"); @@ -114,7 +113,7 @@ public Transformation buildVeloxSource( @Override public Transformation buildVeloxSink( - ReadableConfig config, Transformation transformation) { + Transformation transformation, Map parameters) { throw new FlinkRuntimeException("Unimplemented method 'buildSink'"); } } diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java index f840cdce805..54e2b8b4361 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java @@ -30,7 +30,6 @@ import org.apache.flink.api.connector.source.Source; import org.apache.flink.api.dag.Transformation; -import 
org.apache.flink.configuration.ReadableConfig; import org.apache.flink.streaming.api.transformations.LegacySourceTransformation; import org.apache.flink.streaming.api.transformations.SourceTransformation; import org.apache.flink.table.connector.format.DecodingFormat; @@ -60,14 +59,15 @@ public boolean match(Transformation transformation) { @SuppressWarnings({"unchecked", "rawtypes"}) @Override public Transformation buildVeloxSource( - Transformation transformation, - ScanTableSource tableSource, - boolean checkpointEnabled) { + Transformation transformation, Map parameters) { RowType outputType = (RowType) LogicalTypeConverter.toVLType( ((InternalTypeInfo) transformation.getOutputType()).toLogicalType()); try { + ScanTableSource tableSource = + (ScanTableSource) parameters.get(ScanTableSource.class.getName()); + boolean checkpointEnabled = (Boolean) parameters.get("checkpoint.enabled"); Class tableSourceClazz = Class.forName("org.apache.flink.streaming.connectors.kafka.table.KafkaDynamicSource"); Properties properties = @@ -135,7 +135,7 @@ public Transformation buildVeloxSource( @Override public Transformation buildVeloxSink( - ReadableConfig config, Transformation transformation) { + Transformation transformation, Map parameters) { throw new FlinkRuntimeException("Unimplemented method 'buildSink'"); } } diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/NexmarkSourceFactory.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/NexmarkSourceFactory.java index c7598c81f0a..736f3cc3c72 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/NexmarkSourceFactory.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/NexmarkSourceFactory.java @@ -30,10 +30,8 @@ import io.github.zhztheplayer.velox4j.type.RowType; import org.apache.flink.api.dag.Transformation; -import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.streaming.api.transformations.LegacySourceTransformation; import 
org.apache.flink.streaming.api.transformations.SourceTransformation; -import org.apache.flink.table.connector.source.ScanTableSource; import org.apache.flink.table.data.RowData; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; @@ -55,9 +53,7 @@ public boolean match(Transformation transformation) { @SuppressWarnings({"rawtypes", "unchecked"}) @Override public Transformation buildVeloxSource( - Transformation transformation, - ScanTableSource tableSource, - boolean checkpointEnabled) { + Transformation transformation, Map parameters) { RowType outputType = (RowType) LogicalTypeConverter.toVLType( @@ -101,7 +97,7 @@ public Transformation buildVeloxSource( @Override public Transformation buildVeloxSink( - ReadableConfig config, Transformation transformation) { + Transformation transformation, Map parameters) { throw new UnsupportedOperationException("Unimplemented method 'buildSink'"); } } diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/PrintSinkFactory.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/PrintSinkFactory.java index b6415c84397..b00a76a21f1 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/PrintSinkFactory.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/PrintSinkFactory.java @@ -31,12 +31,11 @@ import org.apache.flink.api.dag.Transformation; import org.apache.flink.configuration.ConfigConstants; +import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.CoreOptions; -import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.streaming.api.operators.OneInputStreamOperator; import org.apache.flink.streaming.api.operators.SimpleOperatorFactory; import org.apache.flink.streaming.api.transformations.LegacySinkTransformation; -import org.apache.flink.table.connector.source.ScanTableSource; import org.apache.flink.table.data.RowData; import org.apache.flink.table.runtime.operators.sink.SinkOperator; import 
org.apache.flink.table.runtime.typeutils.InternalTypeInfo; @@ -68,17 +67,17 @@ public boolean match(Transformation transformation) { @Override public Transformation buildVeloxSource( - Transformation transformation, - ScanTableSource tableSource, - boolean checkpointEnabled) { + Transformation transformation, Map parameters) { throw new FlinkRuntimeException("Unimplemented method 'buildSource'"); } @SuppressWarnings({"rawtypes", "unchecked"}) @Override - public Transformation buildVeloxSink(ReadableConfig config, Transformation transformation) { + public Transformation buildVeloxSink( + Transformation transformation, Map parameters) { Transformation inputTrans = (Transformation) transformation.getInputs().get(0); InternalTypeInfo inputTypeInfo = (InternalTypeInfo) inputTrans.getOutputType(); + Configuration config = (Configuration) parameters.get(Configuration.class.getName()); String logDir = config.get(CoreOptions.FLINK_LOG_DIR); String printPath; if (logDir != null) { diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/velox/VeloxSourceSinkFactory.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/velox/VeloxSourceSinkFactory.java index 07e8593e9db..aff1bb77955 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/velox/VeloxSourceSinkFactory.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/velox/VeloxSourceSinkFactory.java @@ -17,13 +17,12 @@ package org.apache.gluten.velox; import org.apache.flink.api.dag.Transformation; -import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.table.connector.source.ScanTableSource; import org.apache.flink.table.data.RowData; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.Map; import java.util.Optional; import java.util.ServiceLoader; @@ -36,13 +35,11 @@ public interface VeloxSourceSinkFactory { /** Build source transformation that offload the operator to velox. 
*/ Transformation buildVeloxSource( - Transformation transformation, - ScanTableSource tableSource, - boolean checkpointEnabled); + Transformation transformation, Map parameters); /** Build sink transformation that offload the operator to velox. */ Transformation buildVeloxSink( - ReadableConfig config, Transformation transformation); + Transformation transformation, Map parameters); /** Choose the matched source/sink factory by given transformation. */ private static Optional getFactory( @@ -59,9 +56,7 @@ private static Optional getFactory( /** Build Velox source, or fallback to flink orignal source . */ static Transformation buildSource( - Transformation transformation, - ScanTableSource tableSource, - boolean checkpointEnabled) { + Transformation transformation, Map parameters) { Optional factory = getFactory(transformation); if (factory.isEmpty()) { LOG.warn( @@ -69,13 +64,13 @@ static Transformation buildSource( transformation.getClass().getName()); return transformation; } else { - return factory.get().buildVeloxSource(transformation, tableSource, checkpointEnabled); + return factory.get().buildVeloxSource(transformation, parameters); } } /** Build Velox sink, or fallback to flink original sink. 
*/ static Transformation buildSink( - ReadableConfig config, Transformation transformation) { + Transformation transformation, Map parameters) { Optional factory = getFactory(transformation); if (factory.isEmpty()) { LOG.warn( @@ -83,7 +78,7 @@ static Transformation buildSink( transformation.getClass().getName()); return transformation; } else { - return factory.get().buildVeloxSink(config, transformation); + return factory.get().buildVeloxSink(transformation, parameters); } } } From ac8e16fbec2641fc2da41a1e4368c727a0f80d6b Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Wed, 24 Dec 2025 11:34:49 +0000 Subject: [PATCH 07/17] fix memory leak --- .../flink/client/StreamGraphTranslator.java | 12 ++--- .../api/operators/GlutenStreamSource.java | 27 +++++++++++ .../runtime/tasks/GlutenOutputCollector.java | 3 -- .../runtime/config/VeloxSessionConfig.java | 45 +++++++++++++++++++ .../GlutenVectorOneInputOperator.java | 2 + .../operators/GlutenVectorSourceFunction.java | 11 ++--- .../typeutils/GlutenRowVectorSerializer.java | 45 ++++++++----------- .../ut/src/test/resources/nexmark/q5.sql | 38 ---------------- .../ut/src/test/resources/nexmark/q7.sql | 21 --------- .../ut/src/test/resources/nexmark/q8.sql | 27 ----------- 10 files changed, 103 insertions(+), 128 deletions(-) create mode 100644 gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxSessionConfig.java delete mode 100644 gluten-flink/ut/src/test/resources/nexmark/q5.sql delete mode 100644 gluten-flink/ut/src/test/resources/nexmark/q7.sql delete mode 100644 gluten-flink/ut/src/test/resources/nexmark/q8.sql diff --git a/gluten-flink/runtime/src/main/java/org/apache/flink/client/StreamGraphTranslator.java b/gluten-flink/runtime/src/main/java/org/apache/flink/client/StreamGraphTranslator.java index db702d56053..3fdbd4ad4aa 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/flink/client/StreamGraphTranslator.java +++ 
b/gluten-flink/runtime/src/main/java/org/apache/flink/client/StreamGraphTranslator.java @@ -132,7 +132,7 @@ private void buildGlutenChains(StreamConfig taskConfig, Map nodeToNonChainedOuts = new HashMap<>(outEdges.size()); taskConfig @@ -216,14 +216,16 @@ private void buildGlutenChains(StreamConfig taskConfig, Map implements WatermarkGaugeExposingOutput> { protected final Map>> outputs; - private final Random random = new XORShiftRandom(); private final WatermarkGauge watermarkGauge = new WatermarkGauge(); protected final Counter numRecordsOutForTask; diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxSessionConfig.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxSessionConfig.java new file mode 100644 index 00000000000..ee55a258654 --- /dev/null +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxSessionConfig.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.table.runtime.config; + +import io.github.zhztheplayer.velox4j.session.Session; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.Map; + +public class VeloxSessionConfig implements Serializable { + private final Map sessions; + + private static VeloxSessionConfig config = new VeloxSessionConfig(); + + private VeloxSessionConfig() { + sessions = new HashMap<>(); + } + + public static VeloxSessionConfig getSessionConfig() { + return config; + } + + public void addSession(String id, Session session) { + sessions.put(id, session); + } + + public Session getSession(String id) { + return sessions.getOrDefault(id, null); + } +} diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorOneInputOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorOneInputOperator.java index e859e15ca37..3b21d60e74e 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorOneInputOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorOneInputOperator.java @@ -18,6 +18,7 @@ import org.apache.gluten.streaming.api.operators.GlutenOperator; import org.apache.gluten.table.runtime.config.VeloxQueryConfig; +import org.apache.gluten.table.runtime.config.VeloxSessionConfig; import io.github.zhztheplayer.velox4j.Velox4j; import io.github.zhztheplayer.velox4j.config.ConnectorConfig; @@ -82,6 +83,7 @@ public GlutenVectorOneInputOperator( void initGlutenTask() { memoryManager = MemoryManager.create(AllocationListener.NOOP); session = Velox4j.newSession(memoryManager); + VeloxSessionConfig.getSessionConfig().addSession(id, session); // add a mock input as velox not allow the source is empty. 
StatefulPlanNode mockInput = new StatefulPlanNode( diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorSourceFunction.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorSourceFunction.java index 472bd0bfed0..9cb09a14f7e 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorSourceFunction.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorSourceFunction.java @@ -33,7 +33,6 @@ import io.github.zhztheplayer.velox4j.stateful.StatefulElement; import io.github.zhztheplayer.velox4j.type.RowType; -import org.apache.flink.api.common.state.CheckpointListener; import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.FunctionInitializationContext; import org.apache.flink.runtime.state.FunctionSnapshotContext; @@ -53,7 +52,7 @@ * instead of RowData to avoid data convert. */ public class GlutenVectorSourceFunction extends RichParallelSourceFunction - implements CheckpointedFunction, CheckpointListener { + implements CheckpointedFunction { private static final Logger LOG = LoggerFactory.getLogger(GlutenVectorSourceFunction.class); private final StatefulPlanNode planNode; @@ -105,7 +104,6 @@ public void open(Configuration parameters) throws Exception { new Query( planNode, VeloxQueryConfig.getConfig(getRuntimeContext()), ConnectorConfig.empty()); allocator = new RootAllocator(Long.MAX_VALUE); - task = session.queryOps().execute(query); task.addSplit(id, split); task.noMoreSplits(id); @@ -133,7 +131,6 @@ public void run(SourceContext sourceContext) throws Exception { } taskMetrics.updateMetrics(task, id); } - task.close(); session.close(); memoryManager.close(); @@ -170,13 +167,11 @@ public void initializeState(FunctionInitializationContext context) throws Except this.task.initializeState(0); } - @Override - public void notifyCheckpointComplete(long 
checkpointId) throws Exception { + public String[] notifyCheckpointComplete(long checkpointId) throws Exception { // TODO: notify velox - this.task.notifyCheckpointComplete(checkpointId); + return this.task.notifyCheckpointComplete(checkpointId); } - @Override public void notifyCheckpointAborted(long checkpointId) throws Exception { // TODO: notify velox this.task.notifyCheckpointAborted(checkpointId); diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenRowVectorSerializer.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenRowVectorSerializer.java index db17a47a2d5..6f340ad1e50 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenRowVectorSerializer.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenRowVectorSerializer.java @@ -16,11 +16,10 @@ */ package org.apache.gluten.table.runtime.typeutils; -import io.github.zhztheplayer.velox4j.Velox4j; +import org.apache.gluten.streaming.api.operators.GlutenOperator; +import org.apache.gluten.table.runtime.config.VeloxSessionConfig; + import io.github.zhztheplayer.velox4j.data.RowVector; -import io.github.zhztheplayer.velox4j.memory.AllocationListener; -import io.github.zhztheplayer.velox4j.memory.MemoryManager; -import io.github.zhztheplayer.velox4j.session.Session; import io.github.zhztheplayer.velox4j.stateful.StatefulRecord; import io.github.zhztheplayer.velox4j.type.RowType; @@ -31,24 +30,23 @@ import org.apache.flink.core.memory.DataInputView; import org.apache.flink.core.memory.DataOutputView; -import java.io.Closeable; import java.io.IOException; /** Serializer for {@link RowVector}. 
*/ @Internal -public class GlutenRowVectorSerializer extends TypeSerializer implements Closeable { +public class GlutenRowVectorSerializer extends TypeSerializer { private static final long serialVersionUID = 1L; private final RowType rowType; - private transient MemoryManager memoryManager; - private transient Session session; + private final GlutenOperator operator; - public GlutenRowVectorSerializer(RowType rowType) { + public GlutenRowVectorSerializer(RowType rowType, GlutenOperator operator) { this.rowType = rowType; + this.operator = operator; } @Override public TypeSerializer duplicate() { - return new GlutenRowVectorSerializer(rowType); + return new GlutenRowVectorSerializer(rowType, operator); } @Override @@ -65,14 +63,15 @@ public void serialize(StatefulRecord record, DataOutputView target) throws IOExc @Override public StatefulRecord deserialize(DataInputView source) throws IOException { - if (memoryManager == null) { - memoryManager = MemoryManager.create(AllocationListener.NOOP); - session = Velox4j.newSession(memoryManager); - } int len = source.readInt(); byte[] str = new byte[len]; source.readFully(str); - RowVector rowVector = session.baseVectorOps().deserializeOne(new String(str)).asRowVector(); + RowVector rowVector = + VeloxSessionConfig.getSessionConfig() + .getSession(operator.getId()) + .baseVectorOps() + .deserializeOne(new String(str)) + .asRowVector(); StatefulRecord record = new StatefulRecord(null, 0, 0, false, -1); record.setRowVector(rowVector); return record; @@ -131,15 +130,7 @@ public int getLength() { @Override public TypeSerializerSnapshot snapshotConfiguration() { - return new RowVectorSerializerSnapshot(rowType); - } - - @Override - public void close() { - if (memoryManager != null) { - memoryManager.close(); - session.close(); - } + return new RowVectorSerializerSnapshot(rowType, operator); } /** {@link TypeSerializerSnapshot} for Gluten RowVector.. 
*/ @@ -148,14 +139,16 @@ public static final class RowVectorSerializerSnapshot private static final int CURRENT_VERSION = 1; private RowType rowType; + private GlutenOperator operator; @SuppressWarnings("unused") public RowVectorSerializerSnapshot() { // this constructor is used when restoring from a checkpoint/savepoint. } - RowVectorSerializerSnapshot(RowType rowType) { + RowVectorSerializerSnapshot(RowType rowType, GlutenOperator operator) { this.rowType = rowType; + this.operator = operator; } @Override @@ -172,7 +165,7 @@ public void readSnapshot(int readVersion, DataInputView in, ClassLoader userCode @Override public GlutenRowVectorSerializer restoreSerializer() { - return new GlutenRowVectorSerializer(rowType); + return new GlutenRowVectorSerializer(rowType, operator); } @Override diff --git a/gluten-flink/ut/src/test/resources/nexmark/q5.sql b/gluten-flink/ut/src/test/resources/nexmark/q5.sql deleted file mode 100644 index 98954bacf20..00000000000 --- a/gluten-flink/ut/src/test/resources/nexmark/q5.sql +++ /dev/null @@ -1,38 +0,0 @@ -CREATE TABLE nexmark_q5 ( - auction BIGINT, - num BIGINT -) WITH ( - 'connector' = 'blackhole' -); - -INSERT INTO nexmark_q5 -SELECT AuctionBids.auction, AuctionBids.num - FROM ( - SELECT - auction, - count(*) AS num, - window_start AS starttime, - window_end AS endtime - FROM TABLE( - HOP(TABLE bid, DESCRIPTOR(`dateTime`), INTERVAL '2' SECOND, INTERVAL '10' SECOND)) - GROUP BY auction, window_start, window_end - ) AS AuctionBids - JOIN ( - SELECT - max(CountBids.num) AS maxn, - CountBids.starttime, - CountBids.endtime - FROM ( - SELECT - count(*) AS num, - window_start AS starttime, - window_end AS endtime - FROM TABLE( - HOP(TABLE bid, DESCRIPTOR(`dateTime`), INTERVAL '2' SECOND, INTERVAL '10' SECOND)) - GROUP BY auction, window_start, window_end - ) AS CountBids - GROUP BY CountBids.starttime, CountBids.endtime - ) AS MaxBids - ON AuctionBids.starttime = MaxBids.starttime AND - AuctionBids.endtime = MaxBids.endtime AND - 
AuctionBids.num >= MaxBids.maxn; diff --git a/gluten-flink/ut/src/test/resources/nexmark/q7.sql b/gluten-flink/ut/src/test/resources/nexmark/q7.sql deleted file mode 100644 index 1b0ec308e94..00000000000 --- a/gluten-flink/ut/src/test/resources/nexmark/q7.sql +++ /dev/null @@ -1,21 +0,0 @@ -CREATE TABLE nexmark_q7 ( - auction BIGINT, - bidder BIGINT, - price BIGINT, - `dateTime` TIMESTAMP(3), - extra VARCHAR -) WITH ( - 'connector' = 'blackhole' -); - -INSERT INTO nexmark_q7 -SELECT B.auction, B.price, B.bidder, B.`dateTime`, B.extra -from bid B -JOIN ( - SELECT MAX(price) AS maxprice, window_end as `dateTime` - FROM TABLE( - TUMBLE(TABLE bid, DESCRIPTOR(`dateTime`), INTERVAL '10' SECOND)) - GROUP BY window_start, window_end -) B1 -ON B.price = B1.maxprice -WHERE B.`dateTime` BETWEEN B1.`dateTime` - INTERVAL '10' SECOND AND B1.`dateTime`; diff --git a/gluten-flink/ut/src/test/resources/nexmark/q8.sql b/gluten-flink/ut/src/test/resources/nexmark/q8.sql deleted file mode 100644 index 1b112f9fc75..00000000000 --- a/gluten-flink/ut/src/test/resources/nexmark/q8.sql +++ /dev/null @@ -1,27 +0,0 @@ -CREATE TABLE nexmark_q8 ( - id BIGINT, - name VARCHAR, - stime TIMESTAMP(3) -) WITH ( - 'connector' = 'blackhole' -); - -INSERT INTO nexmark_q8 -SELECT P.id, P.name, P.starttime -FROM ( - SELECT id, name, - window_start AS starttime, - window_end AS endtime - FROM TABLE( - TUMBLE(TABLE person, DESCRIPTOR(`dateTime`), INTERVAL '10' SECOND)) - GROUP BY id, name, window_start, window_end -) P -JOIN ( - SELECT seller, - window_start AS starttime, - window_end AS endtime - FROM TABLE( - TUMBLE(TABLE auction, DESCRIPTOR(`dateTime`), INTERVAL '10' SECOND)) - GROUP BY seller, window_start, window_end -) A -ON P.id = A.seller AND P.starttime = A.starttime AND P.endtime = A.endtime; From ed984f3f05c087179ac20da2821ba2efadaf6ac0 Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Wed, 11 Mar 2026 07:22:57 +0000 Subject: [PATCH 08/17] merge master --- 
.../gluten/client/OffloadedJobGraphGenerator.java | 8 ++++---- .../operators/GlutenVectorOneInputOperator.java | 1 - .../operators/GlutenVectorSourceFunction.java | 2 -- .../typeutils/GlutenStatefulRecordSerializer.java | 14 +++++--------- 4 files changed, 9 insertions(+), 16 deletions(-) diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/client/OffloadedJobGraphGenerator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/client/OffloadedJobGraphGenerator.java index 42784fce289..d10ddeb1036 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/client/OffloadedJobGraphGenerator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/client/OffloadedJobGraphGenerator.java @@ -292,19 +292,19 @@ private void createOffloadedTwoInputOperator( private void setOffloadedOutputSerializer(StreamConfig opConfig, GlutenOperator operator) { RowType rowType = operator.getOutputTypes().entrySet().iterator().next().getValue(); - opConfig.setTypeSerializerOut(new GlutenStatefulRecordSerializer(rowType, operator.getId())); + opConfig.setTypeSerializerOut(new GlutenStatefulRecordSerializer(rowType, operator)); } private void setOffloadedInputSerializer(StreamConfig opConfig, GlutenOperator operator) { opConfig.setupNetworkInputs( - new GlutenStatefulRecordSerializer(operator.getInputType(), operator.getId())); + new GlutenStatefulRecordSerializer(operator.getInputType(), operator)); } private void setOffloadedInputSerializersForTwoInputOperator( StreamConfig opConfig, GlutenTwoInputOperator operator) { opConfig.setupNetworkInputs( - new GlutenStatefulRecordSerializer(operator.getLeftInputType(), operator.getId()), - new GlutenStatefulRecordSerializer(operator.getRightInputType(), operator.getId())); + new GlutenStatefulRecordSerializer(operator.getLeftInputType(), operator), + new GlutenStatefulRecordSerializer(operator.getRightInputType(), operator)); } private void setOffloadedStatePartitioner( diff --git 
a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorOneInputOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorOneInputOperator.java index 3b21d60e74e..32cea6148f0 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorOneInputOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorOneInputOperator.java @@ -183,7 +183,6 @@ public void initializeState(StateInitializationContext context) throws Exception initGlutenTask(); } // TODO: implement it - task.initializeState(0); super.initializeState(context); } diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorSourceFunction.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorSourceFunction.java index 9cb09a14f7e..ae505cea506 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorSourceFunction.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorSourceFunction.java @@ -163,8 +163,6 @@ public void initializeState(FunctionInitializationContext context) throws Except task.addSplit(id, split); task.noMoreSplits(id); } - // TODO: implement it - this.task.initializeState(0); } public String[] notifyCheckpointComplete(long checkpointId) throws Exception { diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenStatefulRecordSerializer.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenStatefulRecordSerializer.java index d9a789fbfa4..a0fbbc0a7c8 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenStatefulRecordSerializer.java +++ 
b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenStatefulRecordSerializer.java @@ -30,27 +30,23 @@ import org.apache.flink.core.memory.DataInputView; import org.apache.flink.core.memory.DataOutputView; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.Closeable; import java.io.IOException; /** Serializer for {@link RowVector}. */ @Internal -public class GlutenRowVectorSerializer extends TypeSerializer { +public class GlutenStatefulRecordSerializer extends TypeSerializer { private static final long serialVersionUID = 1L; private final RowType rowType; private final GlutenOperator operator; - public GlutenRowVectorSerializer(RowType rowType, GlutenOperator operator) { + public GlutenStatefulRecordSerializer(RowType rowType, GlutenOperator operator) { this.rowType = rowType; this.operator = operator; } @Override public TypeSerializer duplicate() { - return new GlutenRowVectorSerializer(rowType, operator); + return new GlutenStatefulRecordSerializer(rowType, operator); } @Override @@ -168,8 +164,8 @@ public void readSnapshot(int readVersion, DataInputView in, ClassLoader userCode throws IOException {} @Override - public GlutenRowVectorSerializer restoreSerializer() { - return new GlutenRowVectorSerializer(rowType, operator); + public GlutenStatefulRecordSerializer restoreSerializer() { + return new GlutenStatefulRecordSerializer(rowType, operator); } @Override From ed16d9c8c028de7bc0bc4d6c14d527d1b75d25c2 Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Wed, 8 Apr 2026 09:20:33 +0000 Subject: [PATCH 09/17] xxx --- .../StreamExecGlobalWindowAggregate.java | 39 +++- .../stream/StreamExecWindowAggregate.java | 8 +- ...kIntoTableSourceScanAcrossCalcRulexxx.java | 53 +++++ ...shWatermarkIntoTableSourceScanRulexxx.java | 53 +++++ .../client/OffloadedJobGraphGenerator.java | 2 +- .../runtime/config/VeloxSessionConfig.java | 45 ---- .../operators/GlutenOneInputOperator.java | 14 +- ...ource.java => 
GlutenSessionResources.java} | 41 ++++ .../operators/GlutenSourceFunction.java | 6 +- .../operators/GlutenTwoInputOperator.java | 2 +- .../GlutenVectorOneInputOperator.java | 202 ------------------ .../operators/GlutenVectorSourceFunction.java | 177 --------------- .../GlutenStatefulRecordSerializer.java | 4 +- 13 files changed, 193 insertions(+), 453 deletions(-) create mode 100644 gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexxx.java create mode 100644 gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexxx.java delete mode 100644 gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxSessionConfig.java rename gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/{GlutenSessionResource.java => GlutenSessionResources.java} (65%) delete mode 100644 gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorOneInputOperator.java delete mode 100644 gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorSourceFunction.java diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGlobalWindowAggregate.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGlobalWindowAggregate.java index 9c0908c9aef..62f9251d860 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGlobalWindowAggregate.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGlobalWindowAggregate.java @@ -19,7 +19,6 @@ import org.apache.gluten.rexnode.AggregateCallConverter; import org.apache.gluten.rexnode.Utils; import org.apache.gluten.rexnode.WindowUtils; -import 
org.apache.gluten.table.runtime.operators.GlutenOneInputOperator; import org.apache.gluten.util.LogicalTypeConverter; import org.apache.gluten.util.PlanNodeIdGenerator; @@ -56,6 +55,7 @@ import org.apache.flink.table.planner.plan.nodes.exec.InputProperty; import org.apache.flink.table.planner.plan.nodes.exec.utils.ExecNodeUtil; import org.apache.flink.table.planner.plan.utils.AggregateInfoList; +import org.apache.flink.table.planner.plan.utils.AggregateUtil; import org.apache.flink.table.planner.plan.utils.KeySelectorUtil; import org.apache.flink.table.planner.utils.JavaScalaConversionUtil; import org.apache.flink.table.planner.utils.TableConfigUtils; @@ -67,6 +67,7 @@ import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; import org.apache.flink.table.runtime.util.TimeWindowUtil; import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonCreator; @@ -177,6 +178,8 @@ public StreamExecGlobalWindowAggregate( @Override protected Transformation translateToPlanInternal( PlannerBase planner, ExecNodeConfig config) { + org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(StreamExecGlobalWindowAggregate.class); + LOG.info("global window aggregate plan node"); final ExecEdge inputEdge = getInputEdges().get(0); final Transformation inputTransform = (Transformation) inputEdge.translateToPlan(planner); @@ -185,6 +188,14 @@ protected Transformation translateToPlanInternal( final ZoneId shiftTimeZone = TimeWindowUtil.getShiftTimeZone( windowing.getTimeAttributeType(), TableConfigUtils.getLocalTimeZone(config)); + final AggregateInfoList globalAggInfoList = + AggregateUtil.deriveStreamWindowAggregateInfoList( + planner.getTypeFactory(), + localAggInputRowType, // should use original input here + JavaScalaConversionUtil.toScala(Arrays.asList(aggCalls)), + needRetraction, + 
windowing.getWindow(), + true); // --- Begin Gluten-specific code changes --- // TODO: velox window not equal to flink window. @@ -257,27 +268,33 @@ protected Transformation translateToPlanInternal( offset, windowType, outputType, - false, + true, rowtimeIndex, windowStartIndex, windowEndIndex); - final OneInputStreamOperator windowOperator = - new GlutenOneInputOperator( + final LogicalType[] accTypes = convertToLogicalTypes(globalAggInfoList.getAccTypes()); + final RowDataKeySelector selector = + KeySelectorUtil.getRowDataSelector( + planner.getFlinkContext().getClassLoader(), + grouping, + InternalTypeInfo.of(inputRowType)); + final org.apache.flink.api.common.typeutils.TypeSerializer windowSerializer = + org.apache.flink.api.common.typeutils.base.LongSerializer.INSTANCE; + final OneInputStreamOperator windowOperator = + new org.apache.gluten.table.runtime.operators.WindowAggOperator( new StatefulPlanNode(windowAgg.getId(), windowAgg), PlanNodeIdGenerator.newId(), inputType, Map.of(windowAgg.getId(), outputType), RowData.class, RowData.class, - "StreamExecGlobalWindowAggregate"); + "StreamExecWindowAggregate", + selector.getProducedType(), + globalAggInfoList.getAggNames(), + accTypes, + windowSerializer); // --- End Gluten-specific code changes --- - final RowDataKeySelector selector = - KeySelectorUtil.getRowDataSelector( - planner.getFlinkContext().getClassLoader(), - grouping, - InternalTypeInfo.of(inputRowType)); - final OneInputTransformation transform = ExecNodeUtil.createOneInputTransformation( inputTransform, diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java index 69fd9eb4424..c1f35885dcc 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java +++ 
b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java @@ -279,6 +279,11 @@ protected Transformation translateToPlanInternal( .map(x -> x.getLogicalType()) .collect(Collectors.toList()) .toArray(new LogicalType[] {}); + // For TVF windows (Tumbling, Hopping, Cumulative, Session), the window namespace + // is identified by the window end timestamp (Long). If count-based windows are + // supported in the future, a different serializer may be needed. + final org.apache.flink.api.common.typeutils.TypeSerializer windowSerializer = + org.apache.flink.api.common.typeutils.base.LongSerializer.INSTANCE; final OneInputStreamOperator windowOperator = new org.apache.gluten.table.runtime.operators.WindowAggOperator( new StatefulPlanNode(windowAgg.getId(), windowAgg), @@ -290,7 +295,8 @@ protected Transformation translateToPlanInternal( "StreamExecWindowAggregate", selector.getProducedType(), aggInfoList.getAggNames(), - accTypes); + accTypes, + windowSerializer); // --- End Gluten-specific code changes --- final OneInputTransformation transform = diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexxx.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexxx.java new file mode 100644 index 00000000000..4ada321a3bb --- /dev/null +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexxx.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.flink.table.planner.plan.rules.logical; + +import org.apache.flink.api.common.eventtime.WatermarkStrategy; +import org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalCalc; +import org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalTableSourceScan; +import org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalWatermarkAssigner; + +import org.apache.calcite.plan.RelOptRuleCall; + +/** + * Rule to push the {@link FlinkLogicalWatermarkAssigner} across the {@link FlinkLogicalCalc} to the + * {@link FlinkLogicalTableSourceScan}. The rule will first look for the computed column in the + * {@link FlinkLogicalCalc} and then translate the watermark expression and the computed column into + * a {@link WatermarkStrategy}. With the new scan the rule will build a new {@link + * FlinkLogicalCalc}. 
+ */ +public class PushWatermarkIntoTableSourceScanAcrossCalcRulexxx + extends PushWatermarkIntoTableSourceScanRuleBase { + public static final PushWatermarkIntoTableSourceScanAcrossCalcRulexxx INSTANCE = + new PushWatermarkIntoTableSourceScanAcrossCalcRulexxx(); + + public PushWatermarkIntoTableSourceScanAcrossCalcRulexxx() { + super( + operand( + FlinkLogicalWatermarkAssigner.class, + operand(FlinkLogicalCalc.class, operand(FlinkLogicalTableSourceScan.class, none()))), + "PushWatermarkIntoFlinkTableSourceScanAcrossCalcRule"); + } + + @Override + public boolean matches(RelOptRuleCall call) { + return false; + } + + @Override + public void onMatch(RelOptRuleCall call) {} +} diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexxx.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexxx.java new file mode 100644 index 00000000000..4abc784a66b --- /dev/null +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexxx.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.flink.table.planner.plan.rules.logical; + +import org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalTableSourceScan; +import org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalWatermarkAssigner; + +import org.apache.calcite.plan.RelOptRuleCall; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Rule to push the {@link FlinkLogicalWatermarkAssigner} into the {@link + * FlinkLogicalTableSourceScan}. + */ +public class PushWatermarkIntoTableSourceScanRulexxx + extends PushWatermarkIntoTableSourceScanRuleBase { + private static final Logger LOG = + LoggerFactory.getLogger(PushWatermarkIntoTableSourceScanRulexxx.class); + public static final PushWatermarkIntoTableSourceScanRulexxx INSTANCE = + new PushWatermarkIntoTableSourceScanRulexxx(); + + public PushWatermarkIntoTableSourceScanRulexxx() { + super( + operand( + FlinkLogicalWatermarkAssigner.class, + operand(FlinkLogicalTableSourceScan.class, none())), + "PushWatermarkIntoTableSourceScanRule"); + } + + @Override + public boolean matches(RelOptRuleCall call) { + LOG.info("PushWatermarkIntoTableSourceScanRule does not match xxxxx"); + return false; + } + + @Override + public void onMatch(RelOptRuleCall call) {} +} diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/client/OffloadedJobGraphGenerator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/client/OffloadedJobGraphGenerator.java index d10ddeb1036..27dc0b1933c 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/client/OffloadedJobGraphGenerator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/client/OffloadedJobGraphGenerator.java @@ -240,7 +240,7 @@ private void createOffloadedOneInputOperator( Class inClass = supportsVectorInput ? StatefulRecord.class : RowData.class; Class outClass = supportsVectorOutput ? 
StatefulRecord.class : RowData.class; GlutenOneInputOperator newOneInputOp = - sourceOperator.cloneWithInputOutputClasses(inClass, outClass); + sourceOperator.cloneWithInputOutputClasses(planNode, inClass, outClass); offloadedOpConfig.setStreamOperator(newOneInputOp); if (supportsVectorOutput) { setOffloadedOutputSerializer(offloadedOpConfig, sourceOperator); diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxSessionConfig.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxSessionConfig.java deleted file mode 100644 index ee55a258654..00000000000 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxSessionConfig.java +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.gluten.table.runtime.config; - -import io.github.zhztheplayer.velox4j.session.Session; - -import java.io.Serializable; -import java.util.HashMap; -import java.util.Map; - -public class VeloxSessionConfig implements Serializable { - private final Map sessions; - - private static VeloxSessionConfig config = new VeloxSessionConfig(); - - private VeloxSessionConfig() { - sessions = new HashMap<>(); - } - - public static VeloxSessionConfig getSessionConfig() { - return config; - } - - public void addSession(String id, Session session) { - sessions.put(id, session); - } - - public Session getSession(String id) { - return sessions.getOrDefault(id, null); - } -} diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java index a2733c2438f..d8c9e4b51e5 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java @@ -121,6 +121,7 @@ void initSession() { outputBridge = VectorOutputBridge.Factory.create(outClass); } sessionResource = new GlutenSessionResource(); + GlutenSessionResources.getInstance().addSessionResource(id, sessionResource); inputQueue = sessionResource.getSession().externalStreamOps().newBlockingQueue(); // add a mock input as velox not allow the source is empty. if (inputType == null) { @@ -164,7 +165,6 @@ public void processElement(StreamRecord element) { inputBridge.convertToStatefulRecord( element, sessionResource.getAllocator(), sessionResource.getSession(), inputType); inputQueue.put(statefulRecord.getRowVector()); - // Only the rowvectors generated by this operator should be closed here. 
if (getId().equals(statefulRecord.getNodeId())) { statefulRecord.close(); @@ -172,7 +172,7 @@ public void processElement(StreamRecord element) { processElementInternal(); } - private void processElementInternal() { + protected void processElementInternal() { while (true) { UpIterator.State state = task.advance(); if (state == UpIterator.State.AVAILABLE) { @@ -192,15 +192,9 @@ private void processElementInternal() { } public GlutenOneInputOperator cloneWithInputOutputClasses( - Class newInClass, Class newOutClass) { + StatefulPlanNode plan, Class newInClass, Class newOutClass) { return new GlutenOneInputOperator<>( - this.glutenPlan, - this.id, - this.inputType, - this.outputTypes, - newInClass, - newOutClass, - this.description); + plan, this.id, this.inputType, this.outputTypes, newInClass, newOutClass, this.description); } @Override diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResource.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResources.java similarity index 65% rename from gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResource.java rename to gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResources.java index ea38229e950..2f709817819 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResource.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResources.java @@ -16,6 +16,8 @@ */ package org.apache.gluten.table.runtime.operators; +import org.apache.gluten.streaming.api.operators.GlutenOperator; + import io.github.zhztheplayer.velox4j.Velox4j; import io.github.zhztheplayer.velox4j.memory.AllocationListener; import io.github.zhztheplayer.velox4j.memory.MemoryManager; @@ -25,6 +27,11 @@ import org.apache.arrow.memory.BufferAllocator; import 
org.apache.arrow.memory.RootAllocator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.Map; // Manage the session and resource for Velox. class GlutenSessionResource { @@ -74,3 +81,37 @@ public void setKeyedStateBackend(KeyedStateBackend keyedStateBackend) { this.keyedStateBackend = keyedStateBackend; } } + +public class GlutenSessionResources { + private static final Logger LOG = LoggerFactory.getLogger(GlutenSessionResources.class); + private static final GlutenSessionResources instance = new GlutenSessionResources(); + private Map sessionResources = new HashMap<>(); + private Map operators = new HashMap<>(); + + private GlutenSessionResources() {} + + public static GlutenSessionResources getInstance() { + return instance; + } + + public GlutenSessionResource getSessionResource(String id) { + return sessionResources.get(id); + } + + public void addSessionResource(String id, GlutenSessionResource sessionResource) { + sessionResources.put(id, sessionResource); + } + + public Session getSession(String id) { + return sessionResources.get(id).getSession(); + } + + public void addOperator(String id, GlutenOperator operator) { + operators.put(id, operator); + } + + public GlutenOperator getOperator(String id) { + LOG.info("getOperator: {}, {}", id, operators.keySet()); + return operators.get(id); + } +} diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java index ea0ddcbc7c8..3d7f520b14a 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java @@ -32,12 +32,13 @@ import io.github.zhztheplayer.velox4j.stateful.StatefulWatermark; import io.github.zhztheplayer.velox4j.type.RowType; 
+import org.apache.flink.api.common.eventtime.WatermarkGenerator; +import org.apache.flink.api.common.eventtime.WatermarkOutput; import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.FunctionInitializationContext; import org.apache.flink.runtime.state.FunctionSnapshotContext; import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; -import org.apache.flink.streaming.api.watermark.Watermark; import org.apache.flink.table.data.RowData; import org.slf4j.Logger; @@ -109,10 +110,8 @@ public void run(SourceContext sourceContext) throws Exception { processAvailableElement(sourceContext); break; case BLOCKED: - LOG.debug("Get empty row"); break; default: - LOG.info("Velox task finished"); return; } taskMetrics.updateMetrics(task, id); @@ -233,6 +232,7 @@ private void initSession() { return; } sessionResource = new GlutenSessionResource(); + GlutenSessionResources.getInstance().addSessionResource(id, sessionResource); Session session = sessionResource.getSession(); query = new Query( diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java index 2352d749436..94db3072e4b 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java @@ -263,7 +263,7 @@ private void initSession() { } sessionResource = new GlutenSessionResource(); - + GlutenSessionResources.getInstance().addSessionResource(getId(), sessionResource); leftInputQueue = sessionResource.getSession().externalStreamOps().newBlockingQueue(); rightInputQueue = sessionResource.getSession().externalStreamOps().newBlockingQueue(); diff --git 
a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorOneInputOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorOneInputOperator.java deleted file mode 100644 index 32cea6148f0..00000000000 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorOneInputOperator.java +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.gluten.table.runtime.operators; - -import org.apache.gluten.streaming.api.operators.GlutenOperator; -import org.apache.gluten.table.runtime.config.VeloxQueryConfig; -import org.apache.gluten.table.runtime.config.VeloxSessionConfig; - -import io.github.zhztheplayer.velox4j.Velox4j; -import io.github.zhztheplayer.velox4j.config.ConnectorConfig; -import io.github.zhztheplayer.velox4j.connector.ExternalStreamConnectorSplit; -import io.github.zhztheplayer.velox4j.connector.ExternalStreamTableHandle; -import io.github.zhztheplayer.velox4j.connector.ExternalStreams; -import io.github.zhztheplayer.velox4j.data.RowVector; -import io.github.zhztheplayer.velox4j.iterator.UpIterator; -import io.github.zhztheplayer.velox4j.memory.AllocationListener; -import io.github.zhztheplayer.velox4j.memory.MemoryManager; -import io.github.zhztheplayer.velox4j.plan.StatefulPlanNode; -import io.github.zhztheplayer.velox4j.plan.TableScanNode; -import io.github.zhztheplayer.velox4j.query.Query; -import io.github.zhztheplayer.velox4j.query.SerialTask; -import io.github.zhztheplayer.velox4j.serde.Serde; -import io.github.zhztheplayer.velox4j.session.Session; -import io.github.zhztheplayer.velox4j.stateful.StatefulElement; -import io.github.zhztheplayer.velox4j.stateful.StatefulRecord; -import io.github.zhztheplayer.velox4j.stateful.StatefulWatermark; -import io.github.zhztheplayer.velox4j.type.RowType; - -import org.apache.flink.runtime.state.StateInitializationContext; -import org.apache.flink.runtime.state.StateSnapshotContext; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; -import org.apache.flink.streaming.api.watermark.Watermark; -import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; -import org.apache.flink.table.runtime.operators.TableStreamOperator; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.List; -import java.util.Map; - -/** Calculate operator in gluten, which will call Velox to run. 
*/ -public class GlutenVectorOneInputOperator extends TableStreamOperator - implements OneInputStreamOperator, GlutenOperator { - - private static final Logger LOG = LoggerFactory.getLogger(GlutenVectorOneInputOperator.class); - - private final StatefulPlanNode glutenPlan; - private final String id; - private final RowType inputType; - private final Map outputTypes; - - private StreamRecord outElement = null; - - private MemoryManager memoryManager; - private Session session; - private Query query; - private ExternalStreams.BlockingQueue inputQueue; - private SerialTask task; - - public GlutenVectorOneInputOperator( - StatefulPlanNode plan, String id, RowType inputType, Map outputTypes) { - this.glutenPlan = plan; - this.id = id; - this.inputType = inputType; - this.outputTypes = outputTypes; - } - - void initGlutenTask() { - memoryManager = MemoryManager.create(AllocationListener.NOOP); - session = Velox4j.newSession(memoryManager); - VeloxSessionConfig.getSessionConfig().addSession(id, session); - // add a mock input as velox not allow the source is empty. 
- StatefulPlanNode mockInput = - new StatefulPlanNode( - id, - new TableScanNode( - id, - inputType, - new ExternalStreamTableHandle("connector-external-stream"), - List.of())); - mockInput.addTarget(glutenPlan); - LOG.debug("Gluten Plan: {}", Serde.toJson(mockInput)); - LOG.debug("OutTypes: {}", outputTypes.keySet()); - query = - new Query( - mockInput, VeloxQueryConfig.getConfig(getRuntimeContext()), ConnectorConfig.empty()); - task = session.queryOps().execute(query); - } - - @Override - public void open() throws Exception { - super.open(); - outElement = new StreamRecord(null); - inputQueue = session.externalStreamOps().newBlockingQueue(); - ExternalStreamConnectorSplit split = - new ExternalStreamConnectorSplit("connector-external-stream", inputQueue.id()); - task.addSplit(id, split); - task.noMoreSplits(id); - } - - @Override - public void processElement(StreamRecord element) { - RowVector inRv = element.getValue().getRowVector(); - inputQueue.put(inRv); - while (true) { - UpIterator.State state = task.advance(); - if (state == UpIterator.State.AVAILABLE) { - final StatefulElement statefulElement = task.statefulGet(); - if (statefulElement.isWatermark()) { - StatefulWatermark watermark = statefulElement.asWatermark(); - output.emitWatermark(new Watermark(watermark.getTimestamp())); - } else { - final StatefulRecord statefulRecord = statefulElement.asRecord(); - output.collect(outElement.replace(statefulRecord)); - statefulRecord.close(); - } - } else { - break; - } - } - inRv.close(); - } - - @Override - public void close() throws Exception { - inputQueue.close(); - task.close(); - session.close(); - memoryManager.close(); - } - - @Override - public StatefulPlanNode getPlanNode() { - return glutenPlan; - } - - @Override - public RowType getInputType() { - return inputType; - } - - @Override - public Map getOutputTypes() { - return outputTypes; - } - - @Override - public String getId() { - return id; - } - - @Override - public void 
prepareSnapshotPreBarrier(long checkpointId) throws Exception { - // TODO: notify velox - super.prepareSnapshotPreBarrier(checkpointId); - } - - @Override - public void snapshotState(StateSnapshotContext context) throws Exception { - // TODO: implement it - task.snapshotState(0); - super.snapshotState(context); - } - - @Override - public void initializeState(StateInitializationContext context) throws Exception { - if (task == null) { - initGlutenTask(); - } - // TODO: implement it - super.initializeState(context); - } - - @Override - public void notifyCheckpointComplete(long checkpointId) throws Exception { - // TODO: notify velox - task.notifyCheckpointComplete(checkpointId); - super.notifyCheckpointComplete(checkpointId); - } - - @Override - public void notifyCheckpointAborted(long checkpointId) throws Exception { - // TODO: notify velox - task.notifyCheckpointAborted(checkpointId); - super.notifyCheckpointAborted(checkpointId); - } -} diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorSourceFunction.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorSourceFunction.java deleted file mode 100644 index ae505cea506..00000000000 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenVectorSourceFunction.java +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.table.runtime.operators; - -import org.apache.gluten.table.runtime.config.VeloxQueryConfig; -import org.apache.gluten.table.runtime.metrics.SourceTaskMetrics; - -import io.github.zhztheplayer.velox4j.Velox4j; -import io.github.zhztheplayer.velox4j.config.ConnectorConfig; -import io.github.zhztheplayer.velox4j.connector.ConnectorSplit; -import io.github.zhztheplayer.velox4j.iterator.UpIterator; -import io.github.zhztheplayer.velox4j.memory.AllocationListener; -import io.github.zhztheplayer.velox4j.memory.MemoryManager; -import io.github.zhztheplayer.velox4j.plan.StatefulPlanNode; -import io.github.zhztheplayer.velox4j.query.Query; -import io.github.zhztheplayer.velox4j.query.SerialTask; -import io.github.zhztheplayer.velox4j.serde.Serde; -import io.github.zhztheplayer.velox4j.session.Session; -import io.github.zhztheplayer.velox4j.stateful.StatefulElement; -import io.github.zhztheplayer.velox4j.type.RowType; - -import org.apache.flink.configuration.Configuration; -import org.apache.flink.runtime.state.FunctionInitializationContext; -import org.apache.flink.runtime.state.FunctionSnapshotContext; -import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; -import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; -import org.apache.flink.streaming.api.watermark.Watermark; - -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.RootAllocator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Map; - -/** - * Gluten legacy source 
function, call velox plan to execute. It sends RowVector to downstream - * instead of RowData to avoid data convert. - */ -public class GlutenVectorSourceFunction extends RichParallelSourceFunction - implements CheckpointedFunction { - private static final Logger LOG = LoggerFactory.getLogger(GlutenVectorSourceFunction.class); - - private final StatefulPlanNode planNode; - private final Map outputTypes; - private final String id; - private final ConnectorSplit split; - private volatile boolean isRunning = true; - - private Session session; - private Query query; - private BufferAllocator allocator; - private MemoryManager memoryManager; - private SerialTask task; - private SourceTaskMetrics taskMetrics; - - public GlutenVectorSourceFunction( - StatefulPlanNode planNode, - Map outputTypes, - String id, - ConnectorSplit split) { - this.planNode = planNode; - this.outputTypes = outputTypes; - this.id = id; - this.split = split; - } - - public StatefulPlanNode getPlanNode() { - return planNode; - } - - public Map getOutputTypes() { - return outputTypes; - } - - public String getId() { - return id; - } - - public ConnectorSplit getConnectorSplit() { - return split; - } - - @Override - public void open(Configuration parameters) throws Exception { - if (memoryManager == null) { - memoryManager = MemoryManager.create(AllocationListener.NOOP); - session = Velox4j.newSession(memoryManager); - query = - new Query( - planNode, VeloxQueryConfig.getConfig(getRuntimeContext()), ConnectorConfig.empty()); - allocator = new RootAllocator(Long.MAX_VALUE); - task = session.queryOps().execute(query); - task.addSplit(id, split); - task.noMoreSplits(id); - } - taskMetrics = new SourceTaskMetrics(getRuntimeContext().getMetricGroup()); - } - - @Override - public void run(SourceContext sourceContext) throws Exception { - while (isRunning) { - UpIterator.State state = task.advance(); - if (state == UpIterator.State.AVAILABLE) { - final StatefulElement element = task.statefulGet(); - if 
(element.isWatermark()) { - sourceContext.emitWatermark(new Watermark(element.asWatermark().getTimestamp())); - } else { - sourceContext.collect(element); - } - element.close(); - } else if (state == UpIterator.State.BLOCKED) { - LOG.debug("Get empty row"); - } else { - LOG.info("Velox task finished"); - break; - } - taskMetrics.updateMetrics(task, id); - } - task.close(); - session.close(); - memoryManager.close(); - allocator.close(); - } - - @Override - public void cancel() { - isRunning = false; - } - - @Override - public void snapshotState(FunctionSnapshotContext context) throws Exception { - // TODO: implement it - this.task.snapshotState(0); - } - - @Override - public void initializeState(FunctionInitializationContext context) throws Exception { - if (memoryManager == null) { - LOG.debug("Running GlutenSourceFunction: " + Serde.toJson(planNode)); - memoryManager = MemoryManager.create(AllocationListener.NOOP); - session = Velox4j.newSession(memoryManager); - query = - new Query( - planNode, VeloxQueryConfig.getConfig(getRuntimeContext()), ConnectorConfig.empty()); - allocator = new RootAllocator(Long.MAX_VALUE); - - task = session.queryOps().execute(query); - task.addSplit(id, split); - task.noMoreSplits(id); - } - } - - public String[] notifyCheckpointComplete(long checkpointId) throws Exception { - // TODO: notify velox - return this.task.notifyCheckpointComplete(checkpointId); - } - - public void notifyCheckpointAborted(long checkpointId) throws Exception { - // TODO: notify velox - this.task.notifyCheckpointAborted(checkpointId); - } -} diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenStatefulRecordSerializer.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenStatefulRecordSerializer.java index a0fbbc0a7c8..57c273037d9 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenStatefulRecordSerializer.java +++ 
b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenStatefulRecordSerializer.java @@ -17,7 +17,7 @@ package org.apache.gluten.table.runtime.typeutils; import org.apache.gluten.streaming.api.operators.GlutenOperator; -import org.apache.gluten.table.runtime.config.VeloxSessionConfig; +import org.apache.gluten.table.runtime.operators.GlutenSessionResources; import io.github.zhztheplayer.velox4j.data.RowVector; import io.github.zhztheplayer.velox4j.stateful.StatefulRecord; @@ -67,7 +67,7 @@ public StatefulRecord deserialize(DataInputView source) throws IOException { byte[] str = new byte[len]; source.readFully(str); RowVector rowVector = - VeloxSessionConfig.getSessionConfig() + GlutenSessionResources.getInstance() .getSession(operator.getId()) .baseVectorOps() .deserializeOne(new String(str)) From aaf47bd3bf9033a0ebeb00143895bf7c67ec528b Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Tue, 28 Apr 2026 02:00:33 +0000 Subject: [PATCH 10/17] support event time window --- .../StreamExecGlobalWindowAggregate.java | 103 ++++++++---------- .../StreamExecLocalWindowAggregate.java | 44 +------- .../stream/StreamExecWindowAggregate.java | 19 +--- ...kIntoTableSourceScanAcrossCalcRulexx.java} | 8 +- ...shWatermarkIntoTableSourceScanRulexx.java} | 10 +- .../rexnode/AggregateCallConverter.java | 27 +++++ .../functions/SubstractRexCallConverter.java | 5 +- .../api/operators/GlutenOperator.java | 13 +++ .../operators/GlutenOneInputOperator.java | 48 ++++---- .../operators/GlutenSessionResources.java | 9 +- .../operators/GlutenSourceFunction.java | 3 +- .../operators/GlutenTwoInputOperator.java | 3 +- .../runtime/operators/WindowAggOperator.java | 15 ++- .../GlutenStatefulRecordSerializer.java | 2 +- 14 files changed, 152 insertions(+), 157 deletions(-) rename gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/{PushWatermarkIntoTableSourceScanAcrossCalcRulexxx.java => 
PushWatermarkIntoTableSourceScanAcrossCalcRulexx.java} (91%) rename gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/{PushWatermarkIntoTableSourceScanRulexxx.java => PushWatermarkIntoTableSourceScanRulexx.java} (88%) diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGlobalWindowAggregate.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGlobalWindowAggregate.java index 62f9251d860..a0f3c416942 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGlobalWindowAggregate.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGlobalWindowAggregate.java @@ -43,9 +43,8 @@ import org.apache.flink.streaming.api.operators.SimpleOperatorFactory; import org.apache.flink.streaming.api.transformations.OneInputTransformation; import org.apache.flink.table.data.RowData; -import org.apache.flink.table.planner.codegen.CodeGeneratorContext; -import org.apache.flink.table.planner.codegen.agg.AggsHandlerCodeGenerator; import org.apache.flink.table.planner.delegation.PlannerBase; +import org.apache.flink.table.planner.plan.logical.SliceAttachedWindowingStrategy; import org.apache.flink.table.planner.plan.logical.WindowingStrategy; import org.apache.flink.table.planner.plan.nodes.exec.ExecEdge; import org.apache.flink.table.planner.plan.nodes.exec.ExecNode; @@ -59,32 +58,33 @@ import org.apache.flink.table.planner.plan.utils.KeySelectorUtil; import org.apache.flink.table.planner.utils.JavaScalaConversionUtil; import org.apache.flink.table.planner.utils.TableConfigUtils; -import org.apache.flink.table.runtime.generated.GeneratedNamespaceAggsHandleFunction; import org.apache.flink.table.runtime.groupwindow.NamedWindowProperty; -import org.apache.flink.table.runtime.groupwindow.WindowProperty; import 
org.apache.flink.table.runtime.keyselector.RowDataKeySelector; -import org.apache.flink.table.runtime.operators.window.tvf.slicing.SliceAssigner; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; import org.apache.flink.table.runtime.util.TimeWindowUtil; -import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.RowType.RowField; import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonCreator; import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonProperty; import org.apache.calcite.rel.core.AggregateCall; -import org.apache.calcite.tools.RelBuilder; import org.apache.commons.math3.util.ArithmeticUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.time.ZoneId; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.TimeZone; import java.util.stream.Collectors; @@ -102,6 +102,7 @@ minStateVersion = FlinkVersion.v1_15) public class StreamExecGlobalWindowAggregate extends StreamExecWindowAggregateBase { + private static final Logger LOG = LoggerFactory.getLogger(StreamExecGlobalWindowAggregate.class); public static final String GLOBAL_WINDOW_AGGREGATE_TRANSFORMATION = "global-window-aggregate"; public static final String FIELD_NAME_LOCAL_AGG_INPUT_ROW_TYPE = "localAggInputRowType"; @@ -174,12 +175,17 @@ public StreamExecGlobalWindowAggregate( this.needRetraction = Optional.ofNullable(needRetraction).orElse(false); } + private int getSliceEndIndex() { + if (windowing instanceof SliceAttachedWindowingStrategy) { + return ((SliceAttachedWindowingStrategy) windowing).getSliceEnd(); + } + return -1; + } + @SuppressWarnings("unchecked") @Override 
protected Transformation translateToPlanInternal( PlannerBase planner, ExecNodeConfig config) { - org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(StreamExecGlobalWindowAggregate.class); - LOG.info("global window aggregate plan node"); final ExecEdge inputEdge = getInputEdges().get(0); final Transformation inputTransform = (Transformation) inputEdge.translateToPlan(planner); @@ -188,6 +194,18 @@ protected Transformation translateToPlanInternal( final ZoneId shiftTimeZone = TimeWindowUtil.getShiftTimeZone( windowing.getTimeAttributeType(), TableConfigUtils.getLocalTimeZone(config)); + Set nonAggFieldIndexes = new HashSet<>(); + Arrays.stream(grouping).forEach(nonAggFieldIndexes::add); + nonAggFieldIndexes.add(getSliceEndIndex()); + List intermediateAggInputRowFields = new ArrayList<>(); + for (int i = 0; i < inputRowType.getFieldNames().size(); i++) { + RowField rowField = + new RowField(inputRowType.getFieldNames().get(i), inputRowType.getChildren().get(i)); + if (!nonAggFieldIndexes.contains(i)) { + intermediateAggInputRowFields.add(rowField); + } + } + final RowType intermediateInputType = new RowType(intermediateAggInputRowFields); final AggregateInfoList globalAggInfoList = AggregateUtil.deriveStreamWindowAggregateInfoList( planner.getTypeFactory(), @@ -196,16 +214,21 @@ protected Transformation translateToPlanInternal( needRetraction, windowing.getWindow(), true); - // --- Begin Gluten-specific code changes --- // TODO: velox window not equal to flink window. 
+ io.github.zhztheplayer.velox4j.type.RowType intermediateInputRowType = + (io.github.zhztheplayer.velox4j.type.RowType) + LogicalTypeConverter.toVLType(intermediateInputType); io.github.zhztheplayer.velox4j.type.RowType inputType = (io.github.zhztheplayer.velox4j.type.RowType) LogicalTypeConverter.toVLType(inputRowType); io.github.zhztheplayer.velox4j.type.RowType outputType = (io.github.zhztheplayer.velox4j.type.RowType) LogicalTypeConverter.toVLType(getOutputType()); List groupingKeys = Utils.generateFieldAccesses(inputType, grouping); - List aggregates = AggregateCallConverter.toAggregates(aggCalls, inputType); + List intermediateAggregates = + AggregateCallConverter.toIntermediateAggregates(aggCalls, intermediateInputRowType); + List finalAggregates = + AggregateCallConverter.toIntermediateAggregates(aggCalls, intermediateInputRowType); checkArgument(outputType.getNames().size() >= grouping.length + aggCalls.length); List aggNames = outputType.getNames().stream() @@ -229,26 +252,26 @@ protected Transformation translateToPlanInternal( PartitionFunctionSpec sliceAssignerSpec = new StreamWindowPartitionFunctionSpec( inputType, rowtimeIndex, size, slide, offset, windowType); - PlanNode aggregation = + PlanNode finalAgg = new AggregationNode( PlanNodeIdGenerator.newId(), - AggregateStep.SINGLE, + AggregateStep.FINAL, groupingKeys, groupingKeys, aggNames, - aggregates, + finalAggregates, false, List.of(new EmptyNode(inputType)), null, List.of()); - PlanNode localAgg = + PlanNode intermediateAgg = new AggregationNode( PlanNodeIdGenerator.newId(), - AggregateStep.SINGLE, + AggregateStep.INTERMEDIATE, groupingKeys, groupingKeys, aggNames, - aggregates, + intermediateAggregates, false, List.of(new EmptyNode(inputType)), null, @@ -256,8 +279,8 @@ protected Transformation translateToPlanInternal( PlanNode windowAgg = new StreamWindowAggregationNode( PlanNodeIdGenerator.newId(), - aggregation, - localAgg, + finalAgg, + intermediateAgg, keySelectorSpec, sliceAssignerSpec, 
ArithmeticUtils.gcd(size, slide), @@ -278,8 +301,8 @@ protected Transformation translateToPlanInternal( planner.getFlinkContext().getClassLoader(), grouping, InternalTypeInfo.of(inputRowType)); - final org.apache.flink.api.common.typeutils.TypeSerializer windowSerializer = - org.apache.flink.api.common.typeutils.base.LongSerializer.INSTANCE; + // final org.apache.flink.api.common.typeutils.TypeSerializer windowSerializer = + // org.apache.flink.api.common.typeutils.base.LongSerializer.INSTANCE; final OneInputStreamOperator windowOperator = new org.apache.gluten.table.runtime.operators.WindowAggOperator( new StatefulPlanNode(windowAgg.getId(), windowAgg), @@ -292,7 +315,7 @@ protected Transformation translateToPlanInternal( selector.getProducedType(), globalAggInfoList.getAggNames(), accTypes, - windowSerializer); + windowing.isRowtime()); // --- End Gluten-specific code changes --- final OneInputTransformation transform = @@ -310,40 +333,4 @@ protected Transformation translateToPlanInternal( transform.setStateKeyType(selector.getProducedType()); return transform; } - - private GeneratedNamespaceAggsHandleFunction createAggsHandler( - String name, - SliceAssigner sliceAssigner, - AggregateInfoList aggInfoList, - int mergedAccOffset, - boolean mergedAccIsOnHeap, - DataType[] mergedAccExternalTypes, - ExecNodeConfig config, - ClassLoader classLoader, - RelBuilder relBuilder, - ZoneId shifTimeZone) { - final AggsHandlerCodeGenerator generator = - new AggsHandlerCodeGenerator( - new CodeGeneratorContext(config, classLoader), - relBuilder, - JavaScalaConversionUtil.toScala(localAggInputRowType.getChildren()), - true) // copyInputField - .needAccumulate() - .needMerge(mergedAccOffset, mergedAccIsOnHeap, mergedAccExternalTypes); - - final List windowProperties = - Arrays.asList( - Arrays.stream(namedWindowProperties) - .map(NamedWindowProperty::getProperty) - .toArray(WindowProperty[]::new)); - - return generator.generateNamespaceAggsHandler( - name, - aggInfoList, - 
JavaScalaConversionUtil.toScala(windowProperties), - sliceAssigner, - // we use window end timestamp to indicate a slicing window, see SliceAssigner - Long.class, - shifTimeZone); - } } diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java index d67d61709e3..7a7bc4520eb 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java @@ -42,8 +42,6 @@ import org.apache.flink.streaming.api.operators.OneInputStreamOperator; import org.apache.flink.streaming.api.operators.SimpleOperatorFactory; import org.apache.flink.table.data.RowData; -import org.apache.flink.table.planner.codegen.CodeGeneratorContext; -import org.apache.flink.table.planner.codegen.agg.AggsHandlerCodeGenerator; import org.apache.flink.table.planner.delegation.PlannerBase; import org.apache.flink.table.planner.plan.logical.WindowingStrategy; import org.apache.flink.table.planner.plan.nodes.exec.ExecEdge; @@ -53,22 +51,18 @@ import org.apache.flink.table.planner.plan.nodes.exec.ExecNodeMetadata; import org.apache.flink.table.planner.plan.nodes.exec.InputProperty; import org.apache.flink.table.planner.plan.nodes.exec.utils.ExecNodeUtil; -import org.apache.flink.table.planner.plan.utils.AggregateInfoList; -import org.apache.flink.table.planner.utils.JavaScalaConversionUtil; import org.apache.flink.table.planner.utils.TableConfigUtils; -import org.apache.flink.table.runtime.generated.GeneratedNamespaceAggsHandleFunction; -import org.apache.flink.table.runtime.operators.window.tvf.slicing.SliceAssigner; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; import 
org.apache.flink.table.runtime.util.TimeWindowUtil; -import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonCreator; import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonProperty; import org.apache.calcite.rel.core.AggregateCall; -import org.apache.calcite.tools.RelBuilder; import org.apache.commons.math3.util.ArithmeticUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nullable; @@ -94,6 +88,7 @@ minStateVersion = FlinkVersion.v1_15) public class StreamExecLocalWindowAggregate extends StreamExecWindowAggregateBase { + private static final Logger LOG = LoggerFactory.getLogger(StreamExecLocalWindowAggregate.class); public static final String LOCAL_WINDOW_AGGREGATE_TRANSFORMATION = "local-window-aggregate"; private static final long WINDOW_AGG_MEMORY_RATIO = 100; @@ -196,7 +191,7 @@ protected Transformation translateToPlanInternal( PlanNode aggregation = new AggregationNode( PlanNodeIdGenerator.newId(), - AggregateStep.SINGLE, + AggregateStep.PARTIAL, groupingKeys, groupingKeys, aggNames, @@ -245,35 +240,4 @@ protected Transformation translateToPlanInternal( WINDOW_AGG_MEMORY_RATIO / 2, false); } - - private GeneratedNamespaceAggsHandleFunction createAggsHandler( - SliceAssigner sliceAssigner, - AggregateInfoList aggInfoList, - ExecNodeConfig config, - ClassLoader classLoader, - RelBuilder relBuilder, - List fieldTypes, - ZoneId shiftTimeZone) { - final AggsHandlerCodeGenerator generator = - new AggsHandlerCodeGenerator( - new CodeGeneratorContext(config, classLoader), - relBuilder, - JavaScalaConversionUtil.toScala(fieldTypes), - true) // copyInputField - .needAccumulate() - .needMerge(0, true, null); - - if (needRetraction) { - generator.needRetract(); - } - - return generator.generateNamespaceAggsHandler( - "LocalWindowAggsHandler", - aggInfoList, - 
JavaScalaConversionUtil.toScala(Collections.emptyList()), - sliceAssigner, - // we use window end timestamp to indicate a slicing window, see SliceAssigner - Long.class, - shiftTimeZone); - } } diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java index c1f35885dcc..14f227e4885 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWindowAggregate.java @@ -236,20 +236,7 @@ protected Transformation translateToPlanInternal( List.of()); // processing time window can not apply to local-global aggregate optimization, so here we need // to set local aggregtate as null when it is not event time window. - PlanNode localAgg = - isRowTime - ? new AggregationNode( - PlanNodeIdGenerator.newId(), - AggregateStep.SINGLE, - groupingKeys, - groupingKeys, - aggNames, - aggregates, - false, - List.of(new EmptyNode(inputType)), - null, - List.of()) - : null; + PlanNode localAgg = null; PlanNode windowAgg = new StreamWindowAggregationNode( PlanNodeIdGenerator.newId(), @@ -282,8 +269,6 @@ protected Transformation translateToPlanInternal( // For TVF windows (Tumbling, Hopping, Cumulative, Session), the window namespace // is identified by the window end timestamp (Long). If count-based windows are // supported in the future, a different serializer may be needed. 
- final org.apache.flink.api.common.typeutils.TypeSerializer windowSerializer = - org.apache.flink.api.common.typeutils.base.LongSerializer.INSTANCE; final OneInputStreamOperator windowOperator = new org.apache.gluten.table.runtime.operators.WindowAggOperator( new StatefulPlanNode(windowAgg.getId(), windowAgg), @@ -296,7 +281,7 @@ protected Transformation translateToPlanInternal( selector.getProducedType(), aggInfoList.getAggNames(), accTypes, - windowSerializer); + windowing.isRowtime()); // --- End Gluten-specific code changes --- final OneInputTransformation transform = diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexxx.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexx.java similarity index 91% rename from gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexxx.java rename to gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexx.java index 4ada321a3bb..0a3ac3b2fdf 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexxx.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexx.java @@ -30,12 +30,12 @@ * a {@link WatermarkStrategy}. With the new scan the rule will build a new {@link * FlinkLogicalCalc}. 
*/ -public class PushWatermarkIntoTableSourceScanAcrossCalcRulexxx +public class PushWatermarkIntoTableSourceScanAcrossCalcRulexx extends PushWatermarkIntoTableSourceScanRuleBase { - public static final PushWatermarkIntoTableSourceScanAcrossCalcRulexxx INSTANCE = - new PushWatermarkIntoTableSourceScanAcrossCalcRulexxx(); + public static final PushWatermarkIntoTableSourceScanAcrossCalcRulexx INSTANCE = + new PushWatermarkIntoTableSourceScanAcrossCalcRulexx(); - public PushWatermarkIntoTableSourceScanAcrossCalcRulexxx() { + public PushWatermarkIntoTableSourceScanAcrossCalcRulexx() { super( operand( FlinkLogicalWatermarkAssigner.class, diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexxx.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexx.java similarity index 88% rename from gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexxx.java rename to gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexx.java index 4abc784a66b..6ea30f1d3c1 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexxx.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexx.java @@ -27,14 +27,14 @@ * Rule to push the {@link FlinkLogicalWatermarkAssigner} into the {@link * FlinkLogicalTableSourceScan}. 
*/ -public class PushWatermarkIntoTableSourceScanRulexxx +public class PushWatermarkIntoTableSourceScanRulexx extends PushWatermarkIntoTableSourceScanRuleBase { private static final Logger LOG = - LoggerFactory.getLogger(PushWatermarkIntoTableSourceScanRulexxx.class); - public static final PushWatermarkIntoTableSourceScanRulexxx INSTANCE = - new PushWatermarkIntoTableSourceScanRulexxx(); + LoggerFactory.getLogger(PushWatermarkIntoTableSourceScanRulexx.class); + public static final PushWatermarkIntoTableSourceScanRulexx INSTANCE = + new PushWatermarkIntoTableSourceScanRulexx(); - public PushWatermarkIntoTableSourceScanRulexxx() { + public PushWatermarkIntoTableSourceScanRulexx() { super( operand( FlinkLogicalWatermarkAssigner.class, diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/AggregateCallConverter.java b/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/AggregateCallConverter.java index ca79fb01c01..bc28074ef77 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/AggregateCallConverter.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/AggregateCallConverter.java @@ -56,6 +56,33 @@ public static List toAggregates( return aggregates; } + public static List toIntermediateAggregates( + AggregateCall[] aggregateCalls, io.github.zhztheplayer.velox4j.type.RowType inputType) { + List aggregates = new ArrayList<>(); + List typeExprs = new ArrayList<>(); + for (int i = 0; i < inputType.getNames().size(); i++) { + typeExprs.add( + FieldAccessTypedExpr.create(inputType.getChildren().get(i), inputType.getNames().get(i))); + } + for (int i = 0; i < aggregateCalls.length; i++) { + AggregateCall aggregateCall = aggregateCalls[i]; + CallTypedExpr call = + convertAggregation( + aggregateCall.getAggregation().getName(), + typeExprs, + RexNodeConverter.toType(aggregateCall.getType())); + aggregates.add( + new Aggregate( + call, + inputType.getChildren(), + null, + List.of(), + List.of(), + 
aggregateCall.isDistinct())); + } + return aggregates; + } + public static WindowFunction toFunction( AggregateCall aggregateCall, io.github.zhztheplayer.velox4j.type.RowType inputType) { CallTypedExpr call = toCall(aggregateCall, inputType); diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/functions/SubstractRexCallConverter.java b/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/functions/SubstractRexCallConverter.java index 1c7e861595c..b1425c0d3a9 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/functions/SubstractRexCallConverter.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/functions/SubstractRexCallConverter.java @@ -63,12 +63,10 @@ public ValidationResult isSuitable(RexCall callNode, RexConversionContext contex @Override public TypedExpr toTypedExpr(RexCall callNode, RexConversionContext context) { List params = getParams(callNode, context); - if (params.get(0).getReturnType() instanceof TimestampType && params.get(1).getReturnType() instanceof BigIntType) { - Type bigIntType = new BigIntType(); - TypedExpr castExpr = new CallTypedExpr(bigIntType, List.of(params.get(0)), "cast"); + TypedExpr castExpr = new CallTypedExpr(bigIntType, List.of(params.get(0)), "unix_millis"); List newParams = List.of(castExpr, params.get(1)); return new CallTypedExpr(bigIntType, newParams, functionName); @@ -76,6 +74,7 @@ public TypedExpr toTypedExpr(RexCall callNode, RexConversionContext context) { List alignedParams = TypeUtils.promoteTypeForArithmeticExpressions(params.get(0), params.get(1)); + Type resultType = getResultType(callNode); return new CallTypedExpr(resultType, alignedParams, functionName); } diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOperator.java index 97205519679..77b879b1a5d 100644 --- 
a/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOperator.java @@ -16,6 +16,8 @@ */ package org.apache.gluten.streaming.api.operators; +import org.apache.gluten.table.runtime.operators.GlutenSessionResources; + import io.github.zhztheplayer.velox4j.plan.StatefulPlanNode; import io.github.zhztheplayer.velox4j.type.RowType; @@ -34,4 +36,15 @@ public interface GlutenOperator { public default String getDescription() { return ""; } + + public default void processElementInternal() {} + + public static void processElementByJni(String operatorId) { + GlutenOperator operator = + GlutenSessionResources.getInstance().getOperator(operatorId).orElse(null); + if (operator == null) { + throw new IllegalArgumentException("Operator not found: " + operatorId); + } + operator.processElementInternal(); + } } diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java index d8c9e4b51e5..e5a9294d32d 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java @@ -43,9 +43,6 @@ import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; import org.apache.flink.table.runtime.operators.TableStreamOperator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.util.List; import java.util.Map; @@ -53,8 +50,6 @@ public class GlutenOneInputOperator extends TableStreamOperator implements OneInputStreamOperator, GlutenOperator { - private static final Logger LOG = LoggerFactory.getLogger(GlutenOneInputOperator.class); - private final StatefulPlanNode glutenPlan; private final String id; private final 
RowType inputType; @@ -122,6 +117,7 @@ void initSession() { } sessionResource = new GlutenSessionResource(); GlutenSessionResources.getInstance().addSessionResource(id, sessionResource); + GlutenSessionResources.getInstance().addOperator(this.getClass().getSimpleName(), this); inputQueue = sessionResource.getSession().externalStreamOps().newBlockingQueue(); // add a mock input as velox not allow the source is empty. if (inputType == null) { @@ -164,33 +160,45 @@ public void processElement(StreamRecord element) { StatefulRecord statefulRecord = inputBridge.convertToStatefulRecord( element, sessionResource.getAllocator(), sessionResource.getSession(), inputType); - inputQueue.put(statefulRecord.getRowVector()); - // Only the rowvectors generated by this operator should be closed here. - if (getId().equals(statefulRecord.getNodeId())) { - statefulRecord.close(); + try { + inputQueue.put(statefulRecord.getRowVector()); + } finally { + // Only the rowvectors generated by this operator should be closed here. 
+ if (getId().equals(statefulRecord.getNodeId())) { + statefulRecord.close(); + } } + processElementInternal(); } - protected void processElementInternal() { + @Override + public void processElementInternal() { while (true) { UpIterator.State state = task.advance(); if (state == UpIterator.State.AVAILABLE) { final StatefulElement statefulElement = task.statefulGet(); - if (statefulElement.isWatermark()) { - StatefulWatermark watermark = statefulElement.asWatermark(); - output.emitWatermark(new Watermark(watermark.getTimestamp())); - } else { - outputBridge.collect( - output, statefulElement.asRecord(), sessionResource.getAllocator(), outputType); + try { + if (statefulElement.isWatermark()) { + StatefulWatermark watermark = statefulElement.asWatermark(); + output.emitWatermark(new Watermark(watermark.getTimestamp())); + } else { + outputBridge.collect( + output, statefulElement.asRecord(), sessionResource.getAllocator(), outputType); + } + } finally { + statefulElement.close(); } - statefulElement.close(); } else { break; } } } + public boolean operateOnProcessTime() { + return false; + } + public GlutenOneInputOperator cloneWithInputOutputClasses( StatefulPlanNode plan, Class newInClass, Class newOutClass) { return new GlutenOneInputOperator<>( @@ -216,13 +224,13 @@ public void processWatermark2(Watermark mark) throws Exception { @Override public void close() throws Exception { + if (task != null) { + task.close(); + } if (inputQueue != null) { inputQueue.noMoreInput(); inputQueue.close(); } - if (task != null) { - task.close(); - } if (sessionResource != null) { sessionResource.close(); } diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResources.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResources.java index 2f709817819..005830b942e 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResources.java +++ 
b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResources.java @@ -32,6 +32,7 @@ import java.util.HashMap; import java.util.Map; +import java.util.Optional; // Manage the session and resource for Velox. class GlutenSessionResource { @@ -110,8 +111,10 @@ public void addOperator(String id, GlutenOperator operator) { operators.put(id, operator); } - public GlutenOperator getOperator(String id) { - LOG.info("getOperator: {}, {}", id, operators.keySet()); - return operators.get(id); + public Optional getOperator(String id) { + if (operators.containsKey(id)) { + return Optional.of(operators.get(id)); + } + return Optional.empty(); } } diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java index 3d7f520b14a..53f36fcf67c 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java @@ -32,13 +32,12 @@ import io.github.zhztheplayer.velox4j.stateful.StatefulWatermark; import io.github.zhztheplayer.velox4j.type.RowType; -import org.apache.flink.api.common.eventtime.WatermarkGenerator; -import org.apache.flink.api.common.eventtime.WatermarkOutput; import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.FunctionInitializationContext; import org.apache.flink.runtime.state.FunctionSnapshotContext; import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction; +import org.apache.flink.streaming.api.watermark.Watermark; import org.apache.flink.table.data.RowData; import org.slf4j.Logger; diff --git 
a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java index 94db3072e4b..f20670fed66 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java @@ -152,7 +152,8 @@ public void processElement2(StreamRecord element) { processElementInternal(); } - private void processElementInternal() { + @Override + public void processElementInternal() { while (true) { UpIterator.State state = task.advance(); if (state == UpIterator.State.AVAILABLE) { diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/WindowAggOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/WindowAggOperator.java index 50d91f2c73a..ac81e0fafe5 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/WindowAggOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/WindowAggOperator.java @@ -51,6 +51,7 @@ public class WindowAggOperator extends GlutenOneInputOperator keyType; private String[] accNames; private LogicalType[] accTypes; + private boolean isRowTime = false; public WindowAggOperator( StatefulPlanNode plan, @@ -62,11 +63,13 @@ public WindowAggOperator( String description, InternalTypeInfo keyType, String[] accNames, - LogicalType[] accTypes) { + LogicalType[] accTypes, + boolean isRowTime) { super(plan, id, inputType, outputTypes, inClass, outClass, description); this.keyType = keyType; this.accNames = accNames; this.accTypes = accTypes; + this.isRowTime = isRowTime; } public InternalTypeInfo getKeyTye() { @@ -132,9 +135,14 @@ public void initializeState(StateInitializationContext context) throws Exception } } + @Override + 
public boolean operateOnProcessTime() { + return !isRowTime; + } + @Override public WindowAggOperator cloneWithInputOutputClasses( - Class newInClass, Class newOutClass) { + StatefulPlanNode plan, Class newInClass, Class newOutClass) { return new WindowAggOperator<>( getPlanNode(), getId(), @@ -145,7 +153,8 @@ public WindowAggOperator cloneWithInputOutputClasses( getDescription(), keyType, accNames, - accTypes); + accTypes, + isRowTime); } @Override diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenStatefulRecordSerializer.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenStatefulRecordSerializer.java index 57c273037d9..66fdd7bc515 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenStatefulRecordSerializer.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/typeutils/GlutenStatefulRecordSerializer.java @@ -72,7 +72,7 @@ public StatefulRecord deserialize(DataInputView source) throws IOException { .baseVectorOps() .deserializeOne(new String(str)) .asRowVector(); - StatefulRecord record = new StatefulRecord(null, 0, 0, false, -1); + StatefulRecord record = new StatefulRecord(operator.getId(), rowVector.id(), 0, false, -1); record.setRowVector(rowVector); return record; } From 016db0a2defed5d40b1315d7a76fc0b89b8db88b Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Mon, 18 May 2026 02:54:36 +0000 Subject: [PATCH 11/17] Fix event-time window --- .../nodes/exec/common/CommonExecSink.java | 24 ++--- .../nodes/exec/stream/StreamExecExchange.java | 93 ++++++++----------- .../StreamExecLocalWindowAggregate.java | 2 +- .../stream/StreamExecTableSourceScan.java | 53 ++++++++++- .../stream/StreamExecWatermarkAssigner.java | 15 +++ ...rkIntoTableSourceScanAcrossCalcRulexx.java | 53 ----------- ...ushWatermarkIntoTableSourceScanRulexx.java | 53 ----------- .../gluten/velox/KafkaSourceSinkFactory.java | 12 ++- 
.../GlutenKeyGroupStreamPartitioner.java | 11 ++- .../runtime/config/VeloxQueryConfig.java | 8 ++ 10 files changed, 142 insertions(+), 182 deletions(-) delete mode 100644 gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexx.java delete mode 100644 gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexx.java diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java index fab420c4818..20943277f8d 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java @@ -70,7 +70,6 @@ import org.apache.flink.table.runtime.operators.sink.ConstraintEnforcer; import org.apache.flink.table.runtime.operators.sink.RowKindSetter; import org.apache.flink.table.runtime.operators.sink.SinkOperator; -import org.apache.flink.table.runtime.operators.sink.StreamRecordTimestampInserter; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; import org.apache.flink.table.types.logical.BinaryType; import org.apache.flink.table.types.logical.CharType; @@ -556,17 +555,18 @@ private Transformation applyRowtimeTransformation( if (rowtimeFieldIndex == -1) { return inputTransform; } - return ExecNodeUtil.createOneInputTransformation( - inputTransform, - createTransformationMeta( - TIMESTAMP_INSERTER_TRANSFORMATION, - String.format("StreamRecordTimestampInserter(rowtime field: %s)", rowtimeFieldIndex), - "StreamRecordTimestampInserter", - config), - new StreamRecordTimestampInserter(rowtimeFieldIndex), - inputTransform.getOutputType(), - sinkParallelism, - sinkParallelismConfigured); + // return 
ExecNodeUtil.createOneInputTransformation( + // inputTransform, + // createTransformationMeta( + // TIMESTAMP_INSERTER_TRANSFORMATION, + // String.format("StreamRecordTimestampInserter(rowtime field: %s)", rowtimeFieldIndex), + // "StreamRecordTimestampInserter", + // config), + // new StreamRecordTimestampInserter(rowtimeFieldIndex), + // inputTransform.getOutputType(), + // sinkParallelism, + // sinkParallelismConfigured); + return inputTransform; } private InternalTypeInfo getInputTypeInfo() { diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java index 17e3966d52a..a6d23739801 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java @@ -19,24 +19,12 @@ import org.apache.gluten.streaming.api.operators.GlutenOperator; import org.apache.gluten.streaming.runtime.partitioner.GlutenKeyGroupStreamPartitioner; import org.apache.gluten.table.runtime.keyselector.GlutenKeySelector; -import org.apache.gluten.table.runtime.operators.GlutenOneInputOperator; -import org.apache.gluten.util.LogicalTypeConverter; -import org.apache.gluten.util.PlanNodeIdGenerator; - -import io.github.zhztheplayer.velox4j.plan.EmptyNode; -import io.github.zhztheplayer.velox4j.plan.HashPartitionFunctionSpec; -import io.github.zhztheplayer.velox4j.plan.LocalPartitionNode; -import io.github.zhztheplayer.velox4j.plan.PartitionFunctionSpec; -import io.github.zhztheplayer.velox4j.plan.PlanNode; -import io.github.zhztheplayer.velox4j.plan.StatefulPlanNode; -import io.github.zhztheplayer.velox4j.plan.StreamPartitionNode; import org.apache.flink.FlinkVersion; import org.apache.flink.api.common.ExecutionConfig; import 
org.apache.flink.api.dag.Transformation; import org.apache.flink.api.java.functions.KeySelector; import org.apache.flink.configuration.ReadableConfig; -import org.apache.flink.streaming.api.operators.OneInputStreamOperator; import org.apache.flink.streaming.api.transformations.OneInputTransformation; import org.apache.flink.streaming.api.transformations.PartitionTransformation; import org.apache.flink.streaming.runtime.partitioner.GlobalPartitioner; @@ -45,7 +33,6 @@ import org.apache.flink.table.api.TableException; import org.apache.flink.table.data.RowData; import org.apache.flink.table.planner.delegation.PlannerBase; -import org.apache.flink.table.planner.plan.nodes.exec.ExecEdge; import org.apache.flink.table.planner.plan.nodes.exec.ExecNode; import org.apache.flink.table.planner.plan.nodes.exec.ExecNodeConfig; import org.apache.flink.table.planner.plan.nodes.exec.ExecNodeContext; @@ -53,8 +40,6 @@ import org.apache.flink.table.planner.plan.nodes.exec.InputProperty; import org.apache.flink.table.planner.plan.nodes.exec.InputProperty.HashDistribution; import org.apache.flink.table.planner.plan.nodes.exec.common.CommonExecExchange; -import org.apache.flink.table.planner.plan.nodes.exec.utils.ExecNodeUtil; -import org.apache.flink.table.planner.plan.nodes.exec.utils.TransformationMetadata; import org.apache.flink.table.planner.plan.utils.KeySelectorUtil; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; import org.apache.flink.table.types.logical.RowType; @@ -62,11 +47,8 @@ import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonCreator; import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonProperty; -import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; import static org.apache.flink.runtime.state.KeyGroupRangeAssignment.DEFAULT_LOWER_BOUND_MAX_PARALLELISM; import static org.apache.flink.util.Preconditions.checkArgument; 
@@ -144,44 +126,45 @@ protected Transformation translateToPlanInternal( // should set it when operator init. parallelism = inputTransform.getParallelism(); keySelector = new GlutenKeySelector(); - final ExecEdge inputEdge = getInputEdges().get(0); - io.github.zhztheplayer.velox4j.type.RowType glutenInputType = - (io.github.zhztheplayer.velox4j.type.RowType) - LogicalTypeConverter.toVLType(inputEdge.getOutputType()); - io.github.zhztheplayer.velox4j.type.RowType outputType = - (io.github.zhztheplayer.velox4j.type.RowType) - LogicalTypeConverter.toVLType(getOutputType()); - String id = PlanNodeIdGenerator.newId(); - List keyIndexes = Arrays.stream(keys).boxed().collect(Collectors.toList()); - PartitionFunctionSpec partitionFunctionSpec = - new HashPartitionFunctionSpec(glutenInputType, keyIndexes); - PlanNode localPartition = - new LocalPartitionNode( - id, - List.of(new EmptyNode(outputType)), - "REPARTITION", - false, - partitionFunctionSpec); - PlanNode exchange = new StreamPartitionNode(id, localPartition, parallelism); - final OneInputStreamOperator exchangeKeyGenerator = - new GlutenOneInputOperator( - new StatefulPlanNode(id, exchange), - id, - glutenInputType, - Map.of(id, outputType), - RowData.class, - RowData.class, - "StreamExecExchange"); - inputTransform = - ExecNodeUtil.createOneInputTransformation( - inputTransform, - new TransformationMetadata("exchange-hash", "Gluten exchange hash"), - exchangeKeyGenerator, - inputTransform.getOutputType(), - parallelism, - false); + // final ExecEdge inputEdge = getInputEdges().get(0); + // io.github.zhztheplayer.velox4j.type.RowType glutenInputType = + // (io.github.zhztheplayer.velox4j.type.RowType) + // LogicalTypeConverter.toVLType(inputEdge.getOutputType()); + // io.github.zhztheplayer.velox4j.type.RowType outputType = + // (io.github.zhztheplayer.velox4j.type.RowType) + // LogicalTypeConverter.toVLType(getOutputType()); + // String id = PlanNodeIdGenerator.newId(); + // List keyIndexes = 
Arrays.stream(keys).boxed().collect(Collectors.toList()); + // PartitionFunctionSpec partitionFunctionSpec = + // new HashPartitionFunctionSpec(glutenInputType, keyIndexes); + // PlanNode localPartition = + // new LocalPartitionNode( + // id, + // List.of(new EmptyNode(outputType)), + // "REPARTITION", + // false, + // partitionFunctionSpec); + // PlanNode exchange = new StreamPartitionNode(id, localPartition, parallelism); + // final OneInputStreamOperator exchangeKeyGenerator = + // new GlutenOneInputOperator( + // new StatefulPlanNode(id, exchange), + // id, + // glutenInputType, + // Map.of(id, outputType), + // RowData.class, + // RowData.class, + // "StreamExecExchange"); + // inputTransform = + // ExecNodeUtil.createOneInputTransformation( + // inputTransform, + // new TransformationMetadata("exchange-hash", "Gluten exchange hash"), + // exchangeKeyGenerator, + // inputTransform.getOutputType(), + // parallelism, + // false); partitioner = - new GlutenKeyGroupStreamPartitioner(keySelector, DEFAULT_LOWER_BOUND_MAX_PARALLELISM); + new GlutenKeyGroupStreamPartitioner( + keySelector, DEFAULT_LOWER_BOUND_MAX_PARALLELISM, parallelism); } else { parallelism = ExecutionConfig.PARALLELISM_DEFAULT; partitioner = diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java index 7a7bc4520eb..5ffddd367c2 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java @@ -176,7 +176,7 @@ protected Transformation translateToPlanInternal( .limit(aggCalls.length) .collect(Collectors.toList()); List keyIndexes = Arrays.stream(grouping).boxed().collect(Collectors.toList()); 
- PartitionFunctionSpec keySelectorSpec = new HashPartitionFunctionSpec(inputType, keyIndexes); + PartitionFunctionSpec keySelectorSpec = new HashPartitionFunctionSpec(outputType, keyIndexes); // TODO: support more window types. Tuple5 windowSpecParams = WindowUtils.extractWindowParameters(windowing); diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java index 90b3981f0f8..c4627e30c45 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java @@ -18,14 +18,21 @@ import org.apache.gluten.velox.VeloxSourceSinkFactory; +import io.github.zhztheplayer.velox4j.plan.ProjectNode; + import org.apache.flink.FlinkVersion; import org.apache.flink.api.common.io.InputFormat; import org.apache.flink.api.dag.Transformation; +import org.apache.flink.configuration.PipelineOptions; import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.streaming.api.transformations.SourceTransformation; +import org.apache.flink.table.api.config.ExecutionConfigOptions; import org.apache.flink.table.connector.source.ScanTableSource; import org.apache.flink.table.data.RowData; import org.apache.flink.table.planner.delegation.PlannerBase; +import org.apache.flink.table.planner.plan.abilities.source.SourceAbilitySpec; +import org.apache.flink.table.planner.plan.abilities.source.WatermarkPushDownSpec; import org.apache.flink.table.planner.plan.nodes.exec.ExecNode; import org.apache.flink.table.planner.plan.nodes.exec.ExecNodeConfig; import org.apache.flink.table.planner.plan.nodes.exec.ExecNodeContext; @@ -35,11 
+42,17 @@ import org.apache.flink.table.planner.utils.ShortcutUtils; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.RowType.RowField; +import org.apache.flink.table.types.logical.TimestampType; import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonCreator; import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonProperty; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.util.Collections; +import java.util.List; import java.util.Map; /** @@ -54,6 +67,7 @@ minStateVersion = FlinkVersion.v1_15) public class StreamExecTableSourceScan extends CommonExecTableSourceScan implements StreamExecNode { + private static final Logger LOG = LoggerFactory.getLogger(StreamExecTableSourceScan.class); public StreamExecTableSourceScan( ReadableConfig tableConfig, @@ -98,6 +112,35 @@ public Transformation createInputFormatTransformation( return env.createInput(inputFormat, outputTypeInfo).name(operatorName).getTransformation(); } + private io.github.zhztheplayer.velox4j.plan.WatermarkPushDownSpec getWatermarkPushDownSpec( + Transformation transformation, ExecNodeConfig config) { + io.github.zhztheplayer.velox4j.plan.WatermarkPushDownSpec watermarkPushDownSpecNode = null; + if (transformation instanceof SourceTransformation) { + List sourceAbilities = getTableSourceSpec().getSourceAbilities(); + if (sourceAbilities != null) { + for (SourceAbilitySpec sourceAbility : sourceAbilities) { + if (sourceAbility instanceof WatermarkPushDownSpec) { + final long idleTimeout = + config.get(ExecutionConfigOptions.TABLE_EXEC_SOURCE_IDLE_TIMEOUT).toMillis(); + final long watermarkInterval = + config.get(PipelineOptions.AUTO_WATERMARK_INTERVAL).toMillis(); + WatermarkPushDownSpec watermarkPushDownSpec = (WatermarkPushDownSpec) sourceAbility; + RowField watermarkField = new RowField("watermark", new TimestampType(3)); + 
ProjectNode project = + StreamExecWatermarkAssigner.translateWatermarkExpr( + getOutputType(), + new RowType(List.of(watermarkField)), + watermarkPushDownSpec.getWatermarkExpr()); + watermarkPushDownSpecNode = + new io.github.zhztheplayer.velox4j.plan.WatermarkPushDownSpec( + project, idleTimeout, watermarkInterval, -1); + } + } + } + } + return watermarkPushDownSpecNode; + } + @Override protected Transformation translateToPlanInternal( PlannerBase planner, ExecNodeConfig config) { @@ -106,14 +149,18 @@ protected Transformation translateToPlanInternal( getTableSourceSpec() .getScanTableSource( planner.getFlinkContext(), ShortcutUtils.unwrapTypeFactory(planner)); - Transformation sourceTransformation = super.translateToPlanInternal(planner, config); + Transformation transformation = super.translateToPlanInternal(planner, config); + io.github.zhztheplayer.velox4j.plan.WatermarkPushDownSpec watermarkPushDownSpec = + getWatermarkPushDownSpec(transformation, config); return VeloxSourceSinkFactory.buildSource( - sourceTransformation, + transformation, Map.of( ScanTableSource.class.getName(), tableSource, "checkpoint.enabled", - planner.getExecEnv().getCheckpointConfig().isCheckpointingEnabled())); + planner.getExecEnv().getCheckpointConfig().isCheckpointingEnabled(), + "watermarkPushDownSpec", + watermarkPushDownSpec)); // --- End Gluten-specific code changes --- } } diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWatermarkAssigner.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWatermarkAssigner.java index 1d07e93f280..928daf48b89 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWatermarkAssigner.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWatermarkAssigner.java @@ -48,6 +48,7 @@ import 
org.apache.flink.table.planner.plan.nodes.exec.SingleTransformationTranslator; import org.apache.flink.table.planner.plan.nodes.exec.utils.ExecNodeUtil; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonCreator; @@ -117,6 +118,20 @@ public StreamExecWatermarkAssigner( this.rowtimeFieldIndex = rowtimeFieldIndex; } + static ProjectNode translateWatermarkExpr( + LogicalType inputType, LogicalType outputType, RexNode watermarkExpr) { + List inNames = Utils.getNamesFromRowType(inputType); + RexConversionContext conversionContext = new RexConversionContext(inNames); + TypedExpr watermarkExprs = RexNodeConverter.toTypedExpr(watermarkExpr, conversionContext); + io.github.zhztheplayer.velox4j.type.RowType outputRowType = + (io.github.zhztheplayer.velox4j.type.RowType) LogicalTypeConverter.toVLType(outputType); + return new ProjectNode( + PlanNodeIdGenerator.newId(), + List.of(new EmptyNode(outputRowType)), + List.of("TIMESTAMP"), + List.of(watermarkExprs)); + } + @SuppressWarnings("unchecked") @Override protected Transformation translateToPlanInternal( diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexx.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexx.java deleted file mode 100644 index 0a3ac3b2fdf..00000000000 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanAcrossCalcRulexx.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.flink.table.planner.plan.rules.logical; - -import org.apache.flink.api.common.eventtime.WatermarkStrategy; -import org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalCalc; -import org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalTableSourceScan; -import org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalWatermarkAssigner; - -import org.apache.calcite.plan.RelOptRuleCall; - -/** - * Rule to push the {@link FlinkLogicalWatermarkAssigner} across the {@link FlinkLogicalCalc} to the - * {@link FlinkLogicalTableSourceScan}. The rule will first look for the computed column in the - * {@link FlinkLogicalCalc} and then translate the watermark expression and the computed column into - * a {@link WatermarkStrategy}. With the new scan the rule will build a new {@link - * FlinkLogicalCalc}. 
- */ -public class PushWatermarkIntoTableSourceScanAcrossCalcRulexx - extends PushWatermarkIntoTableSourceScanRuleBase { - public static final PushWatermarkIntoTableSourceScanAcrossCalcRulexx INSTANCE = - new PushWatermarkIntoTableSourceScanAcrossCalcRulexx(); - - public PushWatermarkIntoTableSourceScanAcrossCalcRulexx() { - super( - operand( - FlinkLogicalWatermarkAssigner.class, - operand(FlinkLogicalCalc.class, operand(FlinkLogicalTableSourceScan.class, none()))), - "PushWatermarkIntoFlinkTableSourceScanAcrossCalcRule"); - } - - @Override - public boolean matches(RelOptRuleCall call) { - return false; - } - - @Override - public void onMatch(RelOptRuleCall call) {} -} diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexx.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexx.java deleted file mode 100644 index 6ea30f1d3c1..00000000000 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/rules/logical/PushWatermarkIntoTableSourceScanRulexx.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.flink.table.planner.plan.rules.logical; - -import org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalTableSourceScan; -import org.apache.flink.table.planner.plan.nodes.logical.FlinkLogicalWatermarkAssigner; - -import org.apache.calcite.plan.RelOptRuleCall; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Rule to push the {@link FlinkLogicalWatermarkAssigner} into the {@link - * FlinkLogicalTableSourceScan}. - */ -public class PushWatermarkIntoTableSourceScanRulexx - extends PushWatermarkIntoTableSourceScanRuleBase { - private static final Logger LOG = - LoggerFactory.getLogger(PushWatermarkIntoTableSourceScanRulexx.class); - public static final PushWatermarkIntoTableSourceScanRulexx INSTANCE = - new PushWatermarkIntoTableSourceScanRulexx(); - - public PushWatermarkIntoTableSourceScanRulexx() { - super( - operand( - FlinkLogicalWatermarkAssigner.class, - operand(FlinkLogicalTableSourceScan.class, none())), - "PushWatermarkIntoTableSourceScanRule"); - } - - @Override - public boolean matches(RelOptRuleCall call) { - LOG.info("PushWatermarkIntoTableSourceScanRule does not match xxxxx"); - return false; - } - - @Override - public void onMatch(RelOptRuleCall call) {} -} diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java index 8644dac3dac..8d4c22778c1 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java @@ -24,8 +24,11 @@ import io.github.zhztheplayer.velox4j.connector.KafkaConnectorSplit; import io.github.zhztheplayer.velox4j.connector.KafkaTableHandle; +import io.github.zhztheplayer.velox4j.plan.PlanNode; import io.github.zhztheplayer.velox4j.plan.StatefulPlanNode; import io.github.zhztheplayer.velox4j.plan.TableScanNode; +import 
io.github.zhztheplayer.velox4j.plan.TableScanWithWatermarkNode; +import io.github.zhztheplayer.velox4j.plan.WatermarkPushDownSpec; import io.github.zhztheplayer.velox4j.type.RowType; import org.apache.flink.api.connector.source.Source; @@ -68,6 +71,8 @@ public Transformation buildVeloxSource( ScanTableSource tableSource = (ScanTableSource) parameters.get(ScanTableSource.class.getName()); boolean checkpointEnabled = (Boolean) parameters.get("checkpoint.enabled"); + WatermarkPushDownSpec watermarkPushDownSpec = + (WatermarkPushDownSpec) parameters.get("watermarkPushDownSpec"); Class tableSourceClazz = Class.forName("org.apache.flink.streaming.connectors.kafka.table.KafkaDynamicSource"); Properties properties = @@ -112,7 +117,12 @@ public Transformation buildVeloxSource( Boolean.valueOf(kafkaTableParameters.getOrDefault("enable.auto.commit", "false")), "latest", List.of()); - TableScanNode kafkaScan = new TableScanNode(planId, outputType, kafkaTableHandle, List.of()); + + PlanNode kafkaScan = + watermarkPushDownSpec != null + ? 
new TableScanWithWatermarkNode( + planId, outputType, kafkaTableHandle, List.of(), watermarkPushDownSpec) + : new TableScanNode(planId, outputType, kafkaTableHandle, List.of()); GlutenStreamSource sourceOp = new GlutenStreamSource( new GlutenSourceFunction( diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/runtime/partitioner/GlutenKeyGroupStreamPartitioner.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/runtime/partitioner/GlutenKeyGroupStreamPartitioner.java index 1a30c525f82..90f34f514a3 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/runtime/partitioner/GlutenKeyGroupStreamPartitioner.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/runtime/partitioner/GlutenKeyGroupStreamPartitioner.java @@ -39,16 +39,17 @@ public class GlutenKeyGroupStreamPartitioner extends StreamPartitioner implements ConfigurableStreamPartitioner { private static final long serialVersionUID = 1L; - private final KeySelector keySelector; private int maxParallelism; + private int parallelism; public GlutenKeyGroupStreamPartitioner( - KeySelector keySelector, int maxParallelism) { + KeySelector keySelector, int maxParallelism, int parallelism) { Preconditions.checkArgument(maxParallelism > 0, "Number of key-groups must be > 0!"); this.keySelector = Preconditions.checkNotNull(keySelector); this.maxParallelism = maxParallelism; + this.parallelism = parallelism; } public int getMaxParallelism() { @@ -58,8 +59,10 @@ public int getMaxParallelism() { @Override public int selectChannel(SerializationDelegate> record) { try { - int channel = keySelector.getKey(record.getInstance().getValue()); - return channel; + int key = keySelector.getKey(record.getInstance().getValue()); + int keyGroup = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism); + return KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup( + maxParallelism, parallelism, keyGroup); } catch (Exception e) { throw new 
RuntimeException( "Could not extract key from " + record.getInstance().getValue(), e); diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxQueryConfig.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxQueryConfig.java index a1026cfe49a..96631eeab7b 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxQueryConfig.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/config/VeloxQueryConfig.java @@ -33,6 +33,9 @@ public class VeloxQueryConfig { private static final String keyVeloxSessionTimezone = "session_timezone"; private static final String kStreamingAggregationMinOutputBatchRows = "streaming_aggregation_min_output_batch_rows"; + private static final String kMaxOutputBatchRows = "max_output_batch_rows"; + private static final String kPreferredOutputBatchRows = "preferred_output_batch_rows"; + private static final String kStatefulTaskParallelism = "stateful_task_parallelism"; public static Config getConfig(RuntimeContext context) { if (!(context instanceof StreamingRuntimeContext)) { @@ -50,6 +53,11 @@ public static Config getConfig(RuntimeContext context) { configMap.put(keyVeloxSessionTimezone, localTimeZone); } configMap.put(kStreamingAggregationMinOutputBatchRows, String.valueOf(1)); + configMap.put(kMaxOutputBatchRows, String.valueOf(Integer.MAX_VALUE)); + configMap.put(kPreferredOutputBatchRows, String.valueOf(Integer.MAX_VALUE)); + configMap.put( + kStatefulTaskParallelism, + String.valueOf(context.getTaskInfo().getNumberOfParallelSubtasks())); return Config.create(configMap); } } From df0f4e556dbac1415f143eb7831bf9ab29515d56 Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Tue, 2 Jun 2026 10:16:54 +0000 Subject: [PATCH 12/17] support proctime window --- .../StreamExecGroupWindowAggregate.java | 11 -- .../GlutenOneInputOperatorFactory.java | 6 + .../api/operators/GlutenOperator.java | 49 ++++++-- 
.../operators/GlutenMailboxHolder.java | 39 +++++++ .../GlutenMailboxOperatorHelper.java | 108 ++++++++++++++++++ .../operators/GlutenOneInputOperator.java | 20 +++- .../operators/GlutenSourceFunction.java | 2 +- .../operators/GlutenTwoInputOperator.java | 38 ++++-- .../runtime/operators/WindowAggOperator.java | 2 + 9 files changed, 246 insertions(+), 29 deletions(-) create mode 100644 gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenMailboxHolder.java create mode 100644 gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenMailboxOperatorHelper.java diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGroupWindowAggregate.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGroupWindowAggregate.java index ccd11653160..8843982008f 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGroupWindowAggregate.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecGroupWindowAggregate.java @@ -16,14 +16,10 @@ */ package org.apache.flink.table.planner.plan.nodes.exec.stream; -import org.apache.gluten.rexnode.AggregateCallConverter; -import org.apache.gluten.rexnode.Utils; import org.apache.gluten.table.runtime.operators.GlutenOneInputOperator; import org.apache.gluten.util.LogicalTypeConverter; import org.apache.gluten.util.PlanNodeIdGenerator; -import io.github.zhztheplayer.velox4j.aggregate.Aggregate; -import io.github.zhztheplayer.velox4j.expression.FieldAccessTypedExpr; import io.github.zhztheplayer.velox4j.plan.GroupWindowAggregationNode; import io.github.zhztheplayer.velox4j.plan.GroupWindowAggsHandlerNode; import io.github.zhztheplayer.velox4j.plan.HashPartitionFunctionSpec; @@ -213,14 +209,7 @@ protected Transformation translateToPlanInternal( 
io.github.zhztheplayer.velox4j.type.RowType outputType = (io.github.zhztheplayer.velox4j.type.RowType) LogicalTypeConverter.toVLType(getOutputType()); - List groupingKeys = Utils.generateFieldAccesses(inputType, grouping); - List aggregates = AggregateCallConverter.toAggregates(aggCalls, inputType); checkArgument(outputType.getNames().size() >= grouping.length + aggCalls.length); - List aggNames = - outputType.getNames().stream() - .skip(grouping.length) - .limit(aggCalls.length) - .collect(Collectors.toList()); List keyIndexes = Arrays.stream(grouping).boxed().collect(Collectors.toList()); PartitionFunctionSpec keySelectorSpec = new HashPartitionFunctionSpec(inputType, keyIndexes); // TODO: support more window types. diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOneInputOperatorFactory.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOneInputOperatorFactory.java index 831dfde66f2..41522731a30 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOneInputOperatorFactory.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOneInputOperatorFactory.java @@ -16,6 +16,8 @@ */ package org.apache.gluten.streaming.api.operators; +import org.apache.gluten.table.runtime.operators.GlutenMailboxOperatorHelper; + import io.github.zhztheplayer.velox4j.serde.Serde; import org.apache.flink.streaming.api.operators.AbstractStreamOperator; @@ -59,6 +61,10 @@ public > T createStreamOperator( .setup( parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput()); } + if (operator instanceof GlutenOperator) { + GlutenMailboxOperatorHelper.bindAtTaskStartup( + ((GlutenOperator) operator).mailboxHolder(), parameters); + } return (T) operator; } diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOperator.java 
b/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOperator.java index 77b879b1a5d..0983140ed93 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOperator.java @@ -16,35 +16,68 @@ */ package org.apache.gluten.streaming.api.operators; +import org.apache.gluten.table.runtime.operators.GlutenMailboxHolder; import org.apache.gluten.table.runtime.operators.GlutenSessionResources; import io.github.zhztheplayer.velox4j.plan.StatefulPlanNode; import io.github.zhztheplayer.velox4j.type.RowType; +import org.apache.flink.api.common.operators.MailboxExecutor; +import org.apache.flink.streaming.runtime.tasks.StreamTask; + import java.util.Map; /** Interface for all gluten operators. */ public interface GlutenOperator { - public StatefulPlanNode getPlanNode(); - public RowType getInputType(); + StatefulPlanNode getPlanNode(); + + RowType getInputType(); - public Map getOutputTypes(); + Map getOutputTypes(); - public String getId(); + String getId(); + + /** Mailbox drain helper holder; must be a non-transient field on the concrete operator. 
*/ + default GlutenMailboxHolder mailboxHolder() { + return new GlutenMailboxHolder(); + } - public default String getDescription() { + default String getDescription() { return ""; } - public default void processElementInternal() {} + default void processElementInternal() {} - public static void processElementByJni(String operatorId) { + default void bindMailboxExecutor(MailboxExecutor mailboxExecutor) { + mailboxHolder().get().bindMailboxExecutor(mailboxExecutor); + } + + default void ensureMailboxInitialized(StreamTask containingTask) { + mailboxHolder().get().ensureMailboxInitialized(containingTask); + } + + default void drainOutput(Runnable drainAction) { + mailboxHolder().get().runDrain(drainAction); + } + + default void scheduleDrainOnMailbox(Runnable drainAction) { + mailboxHolder().get().scheduleDrain(drainAction); + } + + /** + * Called from native Velox code to drain operator output. Drain is always scheduled on the Flink + * task mailbox thread. + */ + static void processElementByJni(String operatorId) { GlutenOperator operator = GlutenSessionResources.getInstance().getOperator(operatorId).orElse(null); if (operator == null) { throw new IllegalArgumentException("Operator not found: " + operatorId); } - operator.processElementInternal(); + operator.scheduleProcessElementOnMailbox(); } + + /** Schedules native output drain on the mailbox thread. Implemented by concrete operators. */ + default void scheduleProcessElementOnMailbox() {} } diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenMailboxHolder.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenMailboxHolder.java new file mode 100644 index 00000000000..9415fcfe363 --- /dev/null +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenMailboxHolder.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.table.runtime.operators; + +import java.io.Serializable; + +/** + * Serializable holder for {@link GlutenMailboxOperatorHelper}. + * + *

<p>The holder survives Flink operator deserialization; the helper is lazily recreated because it + * is {@code transient}. + */ +public final class GlutenMailboxHolder implements Serializable { + + private static final long serialVersionUID = 1L; + + private transient GlutenMailboxOperatorHelper helper; + + public GlutenMailboxOperatorHelper get() { + if (helper == null) { + helper = new GlutenMailboxOperatorHelper(); + } + return helper; + } +} diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenMailboxOperatorHelper.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenMailboxOperatorHelper.java new file mode 100644 index 00000000000..ecec25ede66 --- /dev/null +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenMailboxOperatorHelper.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.gluten.table.runtime.operators; + +import org.apache.flink.api.common.operators.MailboxExecutor; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; +import org.apache.flink.streaming.runtime.tasks.StreamTask; + +/** + * Schedules Velox output drain onto the Flink task mailbox thread. + * + *

JNI callbacks from Velox must not call {@code SerialTask.advance()} or emit records directly. + * When a callback arrives while a drain is already in progress on the mailbox thread, a follow-up + * drain is requested via {@code pendingMailboxDrain} instead of nesting another drain. + */ +public final class GlutenMailboxOperatorHelper { + + private transient MailboxExecutor mailboxExecutor; + private transient boolean draining; + private transient boolean pendingMailboxDrain; + private transient boolean mailboxDrainScheduled; + + public void bindMailboxExecutor(MailboxExecutor mailboxExecutor) { + this.mailboxExecutor = mailboxExecutor; + } + + public void ensureMailboxInitialized(StreamTask containingTask) { + if (mailboxExecutor == null) { + mailboxExecutor = containingTask.getMailboxExecutorFactory().createExecutor(0); + } + } + + public boolean isMailboxBound() { + return mailboxExecutor != null; + } + + public void runDrain(Runnable drainAction) { + draining = true; + try { + boolean repeat; + do { + pendingMailboxDrain = false; + drainAction.run(); + repeat = pendingMailboxDrain; + } while (repeat); + } finally { + draining = false; + } + } + + public void scheduleDrain(Runnable drainAction) { + if (mailboxExecutor == null) { + runDrain(drainAction); + return; + } + if (draining) { + pendingMailboxDrain = true; + return; + } + synchronized (this) { + if (mailboxDrainScheduled) { + return; + } + mailboxDrainScheduled = true; + } + try { + mailboxExecutor.submit( + () -> { + try { + runDrain(drainAction); + } finally { + synchronized (GlutenMailboxOperatorHelper.this) { + mailboxDrainScheduled = false; + } + } + }, + "gluten-drain-output"); + } catch (RuntimeException e) { + synchronized (this) { + mailboxDrainScheduled = false; + } + throw e; + } + } + + /** Binds mailbox from {@link StreamOperatorParameters} during operator factory startup on TM. 
*/ + public static void bindAtTaskStartup( + GlutenMailboxHolder mailboxHolder, StreamOperatorParameters parameters) { + mailboxHolder + .get() + .bindMailboxExecutor( + parameters.getContainingTask().getMailboxExecutorFactory().createExecutor(0)); + } +} diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java index e5a9294d32d..aae4490c54c 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java @@ -65,6 +65,7 @@ public class GlutenOneInputOperator extends TableStreamOperator private final Class outClass; private transient VectorInputBridge inputBridge; private transient VectorOutputBridge outputBridge; + private final GlutenMailboxHolder mailboxHolder = new GlutenMailboxHolder(); public GlutenOneInputOperator( StatefulPlanNode plan, @@ -146,9 +147,22 @@ void initSession() { task.noMoreSplits(id); } + @Override + public GlutenMailboxHolder mailboxHolder() { + return mailboxHolder; + } + + @Override + public void scheduleProcessElementOnMailbox() { + scheduleDrainOnMailbox(this::drainTaskOutput); + } + @Override public void open() throws Exception { super.open(); + if (!mailboxHolder().get().isMailboxBound()) { + ensureMailboxInitialized(getContainingTask()); + } initSession(); } @@ -174,6 +188,10 @@ public void processElement(StreamRecord element) { @Override public void processElementInternal() { + drainOutput(this::drainTaskOutput); + } + + private void drainTaskOutput() { while (true) { UpIterator.State state = task.advance(); if (state == UpIterator.State.AVAILABLE) { @@ -274,7 +292,7 @@ public void initializeState(StateInitializationContext context) throws Exception if (task == null) { initSession(); } - 
task.initializeState(0, null); + // task.initializeState(0, null); super.initializeState(context); } diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java index 53f36fcf67c..0cb386550e4 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java @@ -213,7 +213,7 @@ public void snapshotState(FunctionSnapshotContext context) throws Exception { public void initializeState(FunctionInitializationContext context) throws Exception { initSession(); // TODO: implement it - this.task.initializeState(0, null); + // this.task.initializeState(0, null); } public String[] notifyCheckpointComplete(long checkpointId) throws Exception { diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java index f20670fed66..9f4933ad273 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java @@ -72,6 +72,7 @@ public class GlutenTwoInputOperator extends AbstractStreamOperator private VectorInputBridge inputBridge; private VectorOutputBridge outputBridge; private String description; + private final GlutenMailboxHolder mailboxHolder = new GlutenMailboxHolder(); public GlutenTwoInputOperator( StatefulPlanNode plan, @@ -114,9 +115,22 @@ public String getDescription() { return description; } + @Override + public GlutenMailboxHolder mailboxHolder() { + return mailboxHolder; + } + + @Override + public void scheduleProcessElementOnMailbox() { + 
scheduleDrainOnMailbox(this::drainTaskOutput); + } + @Override public void open() throws Exception { super.open(); + if (!mailboxHolder().get().isMailboxBound()) { + ensureMailboxInitialized(getContainingTask()); + } initSession(); } @@ -154,18 +168,25 @@ public void processElement2(StreamRecord element) { @Override public void processElementInternal() { + drainOutput(this::drainTaskOutput); + } + + private void drainTaskOutput() { while (true) { UpIterator.State state = task.advance(); if (state == UpIterator.State.AVAILABLE) { final StatefulElement element = task.statefulGet(); - if (element.isWatermark()) { - StatefulWatermark watermark = element.asWatermark(); - output.emitWatermark(new Watermark(watermark.getTimestamp())); - } else { - outputBridge.collect( - output, element.asRecord(), sessionResource.getAllocator(), outputType); + try { + if (element.isWatermark()) { + StatefulWatermark watermark = element.asWatermark(); + output.emitWatermark(new Watermark(watermark.getTimestamp())); + } else { + outputBridge.collect( + output, element.asRecord(), sessionResource.getAllocator(), outputType); + } + } finally { + element.close(); } - element.close(); } else { break; } @@ -254,7 +275,7 @@ public void snapshotState(StateSnapshotContext context) throws Exception { public void initializeState(StateInitializationContext context) throws Exception { initSession(); // TODO: implement it - task.initializeState(0, null); + // task.initializeState(0, null); super.initializeState(context); } @@ -265,6 +286,7 @@ private void initSession() { sessionResource = new GlutenSessionResource(); GlutenSessionResources.getInstance().addSessionResource(getId(), sessionResource); + GlutenSessionResources.getInstance().addOperator(getId(), this); leftInputQueue = sessionResource.getSession().externalStreamOps().newBlockingQueue(); rightInputQueue = sessionResource.getSession().externalStreamOps().newBlockingQueue(); diff --git 
a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/WindowAggOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/WindowAggOperator.java index ac81e0fafe5..8c2d5bbc056 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/WindowAggOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/WindowAggOperator.java @@ -132,6 +132,8 @@ public void initializeState(StateInitializationContext context) throws Exception new org.apache.flink.table.types.logical.RowType(accFields))), Map.of(windowStateName, new BigIntType())); task.initializeState(0, parameters); + } else { + task.initializeState(0, null); } } From e4d443fa1157752a70e1e280015fc62598fec461 Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Thu, 18 Jun 2026 04:26:14 +0000 Subject: [PATCH 13/17] remove useless changes --- .../nodes/exec/common/CommonExecSink.java | 24 ++++----- .../stream/StreamExecTableSourceScan.java | 53 ++----------------- .../stream/StreamExecWatermarkAssigner.java | 15 ------ .../functions/SubstractRexCallConverter.java | 5 +- .../gluten/velox/KafkaSourceSinkFactory.java | 12 +---- .../api/operators/GlutenOperator.java | 15 +++--- .../operators/GlutenOneInputOperator.java | 30 +++++------ .../operators/GlutenSessionResources.java | 1 + .../operators/GlutenTwoInputOperator.java | 12 +---- .../ut/src/test/resources/nexmark/q5.sql | 38 +++++++++++++ .../ut/src/test/resources/nexmark/q7.sql | 21 ++++++++ .../ut/src/test/resources/nexmark/q8.sql | 27 ++++++++++ 12 files changed, 128 insertions(+), 125 deletions(-) create mode 100644 gluten-flink/ut/src/test/resources/nexmark/q5.sql create mode 100644 gluten-flink/ut/src/test/resources/nexmark/q7.sql create mode 100644 gluten-flink/ut/src/test/resources/nexmark/q8.sql diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java 
b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java index 20943277f8d..fab420c4818 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/common/CommonExecSink.java @@ -70,6 +70,7 @@ import org.apache.flink.table.runtime.operators.sink.ConstraintEnforcer; import org.apache.flink.table.runtime.operators.sink.RowKindSetter; import org.apache.flink.table.runtime.operators.sink.SinkOperator; +import org.apache.flink.table.runtime.operators.sink.StreamRecordTimestampInserter; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; import org.apache.flink.table.types.logical.BinaryType; import org.apache.flink.table.types.logical.CharType; @@ -555,18 +556,17 @@ private Transformation applyRowtimeTransformation( if (rowtimeFieldIndex == -1) { return inputTransform; } - // return ExecNodeUtil.createOneInputTransformation( - // inputTransform, - // createTransformationMeta( - // TIMESTAMP_INSERTER_TRANSFORMATION, - // String.format("StreamRecordTimestampInserter(rowtime field: %s)", rowtimeFieldIndex), - // "StreamRecordTimestampInserter", - // config), - // new StreamRecordTimestampInserter(rowtimeFieldIndex), - // inputTransform.getOutputType(), - // sinkParallelism, - // sinkParallelismConfigured); - return inputTransform; + return ExecNodeUtil.createOneInputTransformation( + inputTransform, + createTransformationMeta( + TIMESTAMP_INSERTER_TRANSFORMATION, + String.format("StreamRecordTimestampInserter(rowtime field: %s)", rowtimeFieldIndex), + "StreamRecordTimestampInserter", + config), + new StreamRecordTimestampInserter(rowtimeFieldIndex), + inputTransform.getOutputType(), + sinkParallelism, + sinkParallelismConfigured); } private InternalTypeInfo getInputTypeInfo() { diff --git 
a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java index c4627e30c45..90b3981f0f8 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecTableSourceScan.java @@ -18,21 +18,14 @@ import org.apache.gluten.velox.VeloxSourceSinkFactory; -import io.github.zhztheplayer.velox4j.plan.ProjectNode; - import org.apache.flink.FlinkVersion; import org.apache.flink.api.common.io.InputFormat; import org.apache.flink.api.dag.Transformation; -import org.apache.flink.configuration.PipelineOptions; import org.apache.flink.configuration.ReadableConfig; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; -import org.apache.flink.streaming.api.transformations.SourceTransformation; -import org.apache.flink.table.api.config.ExecutionConfigOptions; import org.apache.flink.table.connector.source.ScanTableSource; import org.apache.flink.table.data.RowData; import org.apache.flink.table.planner.delegation.PlannerBase; -import org.apache.flink.table.planner.plan.abilities.source.SourceAbilitySpec; -import org.apache.flink.table.planner.plan.abilities.source.WatermarkPushDownSpec; import org.apache.flink.table.planner.plan.nodes.exec.ExecNode; import org.apache.flink.table.planner.plan.nodes.exec.ExecNodeConfig; import org.apache.flink.table.planner.plan.nodes.exec.ExecNodeContext; @@ -42,17 +35,11 @@ import org.apache.flink.table.planner.utils.ShortcutUtils; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; import org.apache.flink.table.types.logical.RowType; -import org.apache.flink.table.types.logical.RowType.RowField; -import org.apache.flink.table.types.logical.TimestampType; 
import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonCreator; import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonProperty; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.util.Collections; -import java.util.List; import java.util.Map; /** @@ -67,7 +54,6 @@ minStateVersion = FlinkVersion.v1_15) public class StreamExecTableSourceScan extends CommonExecTableSourceScan implements StreamExecNode { - private static final Logger LOG = LoggerFactory.getLogger(StreamExecTableSourceScan.class); public StreamExecTableSourceScan( ReadableConfig tableConfig, @@ -112,35 +98,6 @@ public Transformation createInputFormatTransformation( return env.createInput(inputFormat, outputTypeInfo).name(operatorName).getTransformation(); } - private io.github.zhztheplayer.velox4j.plan.WatermarkPushDownSpec getWatermarkPushDownSpec( - Transformation transformation, ExecNodeConfig config) { - io.github.zhztheplayer.velox4j.plan.WatermarkPushDownSpec watermarkPushDownSpecNode = null; - if (transformation instanceof SourceTransformation) { - List sourceAbilities = getTableSourceSpec().getSourceAbilities(); - if (sourceAbilities != null) { - for (SourceAbilitySpec sourceAbility : sourceAbilities) { - if (sourceAbility instanceof WatermarkPushDownSpec) { - final long idleTimeout = - config.get(ExecutionConfigOptions.TABLE_EXEC_SOURCE_IDLE_TIMEOUT).toMillis(); - final long watermarkInterval = - config.get(PipelineOptions.AUTO_WATERMARK_INTERVAL).toMillis(); - WatermarkPushDownSpec watermarkPushDownSpec = (WatermarkPushDownSpec) sourceAbility; - RowField watermarkField = new RowField("watermark", new TimestampType(3)); - ProjectNode project = - StreamExecWatermarkAssigner.translateWatermarkExpr( - getOutputType(), - new RowType(List.of(watermarkField)), - watermarkPushDownSpec.getWatermarkExpr()); - watermarkPushDownSpecNode = - new io.github.zhztheplayer.velox4j.plan.WatermarkPushDownSpec( - project, idleTimeout, 
watermarkInterval, -1); - } - } - } - } - return watermarkPushDownSpecNode; - } - @Override protected Transformation translateToPlanInternal( PlannerBase planner, ExecNodeConfig config) { @@ -149,18 +106,14 @@ protected Transformation translateToPlanInternal( getTableSourceSpec() .getScanTableSource( planner.getFlinkContext(), ShortcutUtils.unwrapTypeFactory(planner)); - Transformation transformation = super.translateToPlanInternal(planner, config); - io.github.zhztheplayer.velox4j.plan.WatermarkPushDownSpec watermarkPushDownSpec = - getWatermarkPushDownSpec(transformation, config); + Transformation sourceTransformation = super.translateToPlanInternal(planner, config); return VeloxSourceSinkFactory.buildSource( - transformation, + sourceTransformation, Map.of( ScanTableSource.class.getName(), tableSource, "checkpoint.enabled", - planner.getExecEnv().getCheckpointConfig().isCheckpointingEnabled(), - "watermarkPushDownSpec", - watermarkPushDownSpec)); + planner.getExecEnv().getCheckpointConfig().isCheckpointingEnabled())); // --- End Gluten-specific code changes --- } } diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWatermarkAssigner.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWatermarkAssigner.java index 928daf48b89..1d07e93f280 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWatermarkAssigner.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecWatermarkAssigner.java @@ -48,7 +48,6 @@ import org.apache.flink.table.planner.plan.nodes.exec.SingleTransformationTranslator; import org.apache.flink.table.planner.plan.nodes.exec.utils.ExecNodeUtil; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; -import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import 
org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonCreator; @@ -118,20 +117,6 @@ public StreamExecWatermarkAssigner( this.rowtimeFieldIndex = rowtimeFieldIndex; } - static ProjectNode translateWatermarkExpr( - LogicalType inputType, LogicalType outputType, RexNode watermarkExpr) { - List inNames = Utils.getNamesFromRowType(inputType); - RexConversionContext conversionContext = new RexConversionContext(inNames); - TypedExpr watermarkExprs = RexNodeConverter.toTypedExpr(watermarkExpr, conversionContext); - io.github.zhztheplayer.velox4j.type.RowType outputRowType = - (io.github.zhztheplayer.velox4j.type.RowType) LogicalTypeConverter.toVLType(outputType); - return new ProjectNode( - PlanNodeIdGenerator.newId(), - List.of(new EmptyNode(outputRowType)), - List.of("TIMESTAMP"), - List.of(watermarkExprs)); - } - @SuppressWarnings("unchecked") @Override protected Transformation translateToPlanInternal( diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/functions/SubstractRexCallConverter.java b/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/functions/SubstractRexCallConverter.java index b1425c0d3a9..1c7e861595c 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/functions/SubstractRexCallConverter.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/rexnode/functions/SubstractRexCallConverter.java @@ -63,10 +63,12 @@ public ValidationResult isSuitable(RexCall callNode, RexConversionContext contex @Override public TypedExpr toTypedExpr(RexCall callNode, RexConversionContext context) { List params = getParams(callNode, context); + if (params.get(0).getReturnType() instanceof TimestampType && params.get(1).getReturnType() instanceof BigIntType) { + Type bigIntType = new BigIntType(); - TypedExpr castExpr = new CallTypedExpr(bigIntType, List.of(params.get(0)), "unix_millis"); + TypedExpr castExpr = new CallTypedExpr(bigIntType, List.of(params.get(0)), "cast"); List newParams = 
List.of(castExpr, params.get(1)); return new CallTypedExpr(bigIntType, newParams, functionName); @@ -74,7 +76,6 @@ public TypedExpr toTypedExpr(RexCall callNode, RexConversionContext context) { List alignedParams = TypeUtils.promoteTypeForArithmeticExpressions(params.get(0), params.get(1)); - Type resultType = getResultType(callNode); return new CallTypedExpr(resultType, alignedParams, functionName); } diff --git a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java index 8d4c22778c1..8644dac3dac 100644 --- a/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java +++ b/gluten-flink/planner/src/main/java/org/apache/gluten/velox/KafkaSourceSinkFactory.java @@ -24,11 +24,8 @@ import io.github.zhztheplayer.velox4j.connector.KafkaConnectorSplit; import io.github.zhztheplayer.velox4j.connector.KafkaTableHandle; -import io.github.zhztheplayer.velox4j.plan.PlanNode; import io.github.zhztheplayer.velox4j.plan.StatefulPlanNode; import io.github.zhztheplayer.velox4j.plan.TableScanNode; -import io.github.zhztheplayer.velox4j.plan.TableScanWithWatermarkNode; -import io.github.zhztheplayer.velox4j.plan.WatermarkPushDownSpec; import io.github.zhztheplayer.velox4j.type.RowType; import org.apache.flink.api.connector.source.Source; @@ -71,8 +68,6 @@ public Transformation buildVeloxSource( ScanTableSource tableSource = (ScanTableSource) parameters.get(ScanTableSource.class.getName()); boolean checkpointEnabled = (Boolean) parameters.get("checkpoint.enabled"); - WatermarkPushDownSpec watermarkPushDownSpec = - (WatermarkPushDownSpec) parameters.get("watermarkPushDownSpec"); Class tableSourceClazz = Class.forName("org.apache.flink.streaming.connectors.kafka.table.KafkaDynamicSource"); Properties properties = @@ -117,12 +112,7 @@ public Transformation buildVeloxSource( 
Boolean.valueOf(kafkaTableParameters.getOrDefault("enable.auto.commit", "false")), "latest", List.of()); - - PlanNode kafkaScan = - watermarkPushDownSpec != null - ? new TableScanWithWatermarkNode( - planId, outputType, kafkaTableHandle, List.of(), watermarkPushDownSpec) - : new TableScanNode(planId, outputType, kafkaTableHandle, List.of()); + TableScanNode kafkaScan = new TableScanNode(planId, outputType, kafkaTableHandle, List.of()); GlutenStreamSource sourceOp = new GlutenStreamSource( new GlutenSourceFunction( diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOperator.java index 0983140ed93..bb6bc3bf543 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/streaming/api/operators/GlutenOperator.java @@ -29,24 +29,23 @@ /** Interface for all gluten operators. */ public interface GlutenOperator { + public StatefulPlanNode getPlanNode(); - StatefulPlanNode getPlanNode(); + public RowType getInputType(); - RowType getInputType(); + public Map getOutputTypes(); - Map getOutputTypes(); + public String getId(); - String getId(); + public default String getDescription() { + return ""; + } /** Mailbox drain helper holder; must be a non-transient field on the concrete operator. 
*/ default GlutenMailboxHolder mailboxHolder() { return new GlutenMailboxHolder(); } - default String getDescription() { - return ""; - } - default void processElementInternal() {} default void bindMailboxExecutor(MailboxExecutor mailboxExecutor) { diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java index aae4490c54c..09b059f3bc2 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenOneInputOperator.java @@ -36,6 +36,7 @@ import io.github.zhztheplayer.velox4j.stateful.StatefulWatermark; import io.github.zhztheplayer.velox4j.type.RowType; +import org.apache.flink.contrib.streaming.state.RocksDBKeyedStateBackend; import org.apache.flink.runtime.state.StateInitializationContext; import org.apache.flink.runtime.state.StateSnapshotContext; import org.apache.flink.streaming.api.operators.OneInputStreamOperator; @@ -147,16 +148,6 @@ void initSession() { task.noMoreSplits(id); } - @Override - public GlutenMailboxHolder mailboxHolder() { - return mailboxHolder; - } - - @Override - public void scheduleProcessElementOnMailbox() { - scheduleDrainOnMailbox(this::drainTaskOutput); - } - @Override public void open() throws Exception { super.open(); @@ -182,10 +173,19 @@ public void processElement(StreamRecord element) { statefulRecord.close(); } } - processElementInternal(); } + @Override + public GlutenMailboxHolder mailboxHolder() { + return mailboxHolder; + } + + @Override + public void scheduleProcessElementOnMailbox() { + scheduleDrainOnMailbox(this::drainTaskOutput); + } + @Override public void processElementInternal() { drainOutput(this::drainTaskOutput); @@ -213,10 +213,6 @@ private void drainTaskOutput() { } } - public boolean operateOnProcessTime() { 
- return false; - } - public GlutenOneInputOperator cloneWithInputOutputClasses( StatefulPlanNode plan, Class newInClass, Class newOutClass) { return new GlutenOneInputOperator<>( @@ -292,7 +288,9 @@ public void initializeState(StateInitializationContext context) throws Exception if (task == null) { initSession(); } - // task.initializeState(0, null); + if (!(getKeyedStateBackend() instanceof RocksDBKeyedStateBackend)) { + task.initializeState(0, null); + } super.initializeState(context); } diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResources.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResources.java index 8d737da83cc..fac0784a248 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResources.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSessionResources.java @@ -27,6 +27,7 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; + import java.util.HashMap; import java.util.Map; import java.util.Optional; diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java index a2ab5088de2..3f73ad4bbd8 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenTwoInputOperator.java @@ -115,16 +115,6 @@ public String getDescription() { return description; } - @Override - public GlutenMailboxHolder mailboxHolder() { - return mailboxHolder; - } - - @Override - public void scheduleProcessElementOnMailbox() { - scheduleDrainOnMailbox(this::drainTaskOutput); - } - @Override public void open() throws Exception { super.open(); 
@@ -285,7 +275,7 @@ public void snapshotState(StateSnapshotContext context) throws Exception { public void initializeState(StateInitializationContext context) throws Exception { initSession(); // TODO: implement it - // task.initializeState(0, null); + task.initializeState(0, null); super.initializeState(context); } diff --git a/gluten-flink/ut/src/test/resources/nexmark/q5.sql b/gluten-flink/ut/src/test/resources/nexmark/q5.sql new file mode 100644 index 00000000000..98954bacf20 --- /dev/null +++ b/gluten-flink/ut/src/test/resources/nexmark/q5.sql @@ -0,0 +1,38 @@ +CREATE TABLE nexmark_q5 ( + auction BIGINT, + num BIGINT +) WITH ( + 'connector' = 'blackhole' +); + +INSERT INTO nexmark_q5 +SELECT AuctionBids.auction, AuctionBids.num + FROM ( + SELECT + auction, + count(*) AS num, + window_start AS starttime, + window_end AS endtime + FROM TABLE( + HOP(TABLE bid, DESCRIPTOR(`dateTime`), INTERVAL '2' SECOND, INTERVAL '10' SECOND)) + GROUP BY auction, window_start, window_end + ) AS AuctionBids + JOIN ( + SELECT + max(CountBids.num) AS maxn, + CountBids.starttime, + CountBids.endtime + FROM ( + SELECT + count(*) AS num, + window_start AS starttime, + window_end AS endtime + FROM TABLE( + HOP(TABLE bid, DESCRIPTOR(`dateTime`), INTERVAL '2' SECOND, INTERVAL '10' SECOND)) + GROUP BY auction, window_start, window_end + ) AS CountBids + GROUP BY CountBids.starttime, CountBids.endtime + ) AS MaxBids + ON AuctionBids.starttime = MaxBids.starttime AND + AuctionBids.endtime = MaxBids.endtime AND + AuctionBids.num >= MaxBids.maxn; diff --git a/gluten-flink/ut/src/test/resources/nexmark/q7.sql b/gluten-flink/ut/src/test/resources/nexmark/q7.sql new file mode 100644 index 00000000000..1b0ec308e94 --- /dev/null +++ b/gluten-flink/ut/src/test/resources/nexmark/q7.sql @@ -0,0 +1,21 @@ +CREATE TABLE nexmark_q7 ( + auction BIGINT, + bidder BIGINT, + price BIGINT, + `dateTime` TIMESTAMP(3), + extra VARCHAR +) WITH ( + 'connector' = 'blackhole' +); + +INSERT INTO nexmark_q7 +SELECT 
B.auction, B.price, B.bidder, B.`dateTime`, B.extra +from bid B +JOIN ( + SELECT MAX(price) AS maxprice, window_end as `dateTime` + FROM TABLE( + TUMBLE(TABLE bid, DESCRIPTOR(`dateTime`), INTERVAL '10' SECOND)) + GROUP BY window_start, window_end +) B1 +ON B.price = B1.maxprice +WHERE B.`dateTime` BETWEEN B1.`dateTime` - INTERVAL '10' SECOND AND B1.`dateTime`; diff --git a/gluten-flink/ut/src/test/resources/nexmark/q8.sql b/gluten-flink/ut/src/test/resources/nexmark/q8.sql new file mode 100644 index 00000000000..1b112f9fc75 --- /dev/null +++ b/gluten-flink/ut/src/test/resources/nexmark/q8.sql @@ -0,0 +1,27 @@ +CREATE TABLE nexmark_q8 ( + id BIGINT, + name VARCHAR, + stime TIMESTAMP(3) +) WITH ( + 'connector' = 'blackhole' +); + +INSERT INTO nexmark_q8 +SELECT P.id, P.name, P.starttime +FROM ( + SELECT id, name, + window_start AS starttime, + window_end AS endtime + FROM TABLE( + TUMBLE(TABLE person, DESCRIPTOR(`dateTime`), INTERVAL '10' SECOND)) + GROUP BY id, name, window_start, window_end +) P +JOIN ( + SELECT seller, + window_start AS starttime, + window_end AS endtime + FROM TABLE( + TUMBLE(TABLE auction, DESCRIPTOR(`dateTime`), INTERVAL '10' SECOND)) + GROUP BY seller, window_start, window_end +) A +ON P.id = A.seller AND P.starttime = A.starttime AND P.endtime = A.endtime; From cf426af136f17337e95680fdccfd32d901ee117c Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Thu, 18 Jun 2026 04:28:57 +0000 Subject: [PATCH 14/17] remove useless changes --- .../nodes/exec/stream/StreamExecExchange.java | 93 +++++++++++-------- 1 file changed, 55 insertions(+), 38 deletions(-) diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java index a6d23739801..17e3966d52a 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java +++ 
b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java @@ -19,12 +19,24 @@ import org.apache.gluten.streaming.api.operators.GlutenOperator; import org.apache.gluten.streaming.runtime.partitioner.GlutenKeyGroupStreamPartitioner; import org.apache.gluten.table.runtime.keyselector.GlutenKeySelector; +import org.apache.gluten.table.runtime.operators.GlutenOneInputOperator; +import org.apache.gluten.util.LogicalTypeConverter; +import org.apache.gluten.util.PlanNodeIdGenerator; + +import io.github.zhztheplayer.velox4j.plan.EmptyNode; +import io.github.zhztheplayer.velox4j.plan.HashPartitionFunctionSpec; +import io.github.zhztheplayer.velox4j.plan.LocalPartitionNode; +import io.github.zhztheplayer.velox4j.plan.PartitionFunctionSpec; +import io.github.zhztheplayer.velox4j.plan.PlanNode; +import io.github.zhztheplayer.velox4j.plan.StatefulPlanNode; +import io.github.zhztheplayer.velox4j.plan.StreamPartitionNode; import org.apache.flink.FlinkVersion; import org.apache.flink.api.common.ExecutionConfig; import org.apache.flink.api.dag.Transformation; import org.apache.flink.api.java.functions.KeySelector; import org.apache.flink.configuration.ReadableConfig; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; import org.apache.flink.streaming.api.transformations.OneInputTransformation; import org.apache.flink.streaming.api.transformations.PartitionTransformation; import org.apache.flink.streaming.runtime.partitioner.GlobalPartitioner; @@ -33,6 +45,7 @@ import org.apache.flink.table.api.TableException; import org.apache.flink.table.data.RowData; import org.apache.flink.table.planner.delegation.PlannerBase; +import org.apache.flink.table.planner.plan.nodes.exec.ExecEdge; import org.apache.flink.table.planner.plan.nodes.exec.ExecNode; import org.apache.flink.table.planner.plan.nodes.exec.ExecNodeConfig; import org.apache.flink.table.planner.plan.nodes.exec.ExecNodeContext; @@ -40,6 +53,8 @@ import 
org.apache.flink.table.planner.plan.nodes.exec.InputProperty; import org.apache.flink.table.planner.plan.nodes.exec.InputProperty.HashDistribution; import org.apache.flink.table.planner.plan.nodes.exec.common.CommonExecExchange; +import org.apache.flink.table.planner.plan.nodes.exec.utils.ExecNodeUtil; +import org.apache.flink.table.planner.plan.nodes.exec.utils.TransformationMetadata; import org.apache.flink.table.planner.plan.utils.KeySelectorUtil; import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; import org.apache.flink.table.types.logical.RowType; @@ -47,8 +62,11 @@ import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonCreator; import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.annotation.JsonProperty; +import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; import static org.apache.flink.runtime.state.KeyGroupRangeAssignment.DEFAULT_LOWER_BOUND_MAX_PARALLELISM; import static org.apache.flink.util.Preconditions.checkArgument; @@ -126,45 +144,44 @@ protected Transformation translateToPlanInternal( // should set it when operator init. 
parallelism = inputTransform.getParallelism(); keySelector = new GlutenKeySelector(); - // final ExecEdge inputEdge = getInputEdges().get(0); - // io.github.zhztheplayer.velox4j.type.RowType glutenInputType = - // (io.github.zhztheplayer.velox4j.type.RowType) - // LogicalTypeConverter.toVLType(inputEdge.getOutputType()); - // io.github.zhztheplayer.velox4j.type.RowType outputType = - // (io.github.zhztheplayer.velox4j.type.RowType) - // LogicalTypeConverter.toVLType(getOutputType()); - // String id = PlanNodeIdGenerator.newId(); - // List keyIndexes = Arrays.stream(keys).boxed().collect(Collectors.toList()); - // PartitionFunctionSpec partitionFunctionSpec = - // new HashPartitionFunctionSpec(glutenInputType, keyIndexes); - // PlanNode localPartition = - // new LocalPartitionNode( - // id, - // List.of(new EmptyNode(outputType)), - // "REPARTITION", - // false, - // partitionFunctionSpec); - // PlanNode exchange = new StreamPartitionNode(id, localPartition, parallelism); - // final OneInputStreamOperator exchangeKeyGenerator = - // new GlutenOneInputOperator( - // new StatefulPlanNode(id, exchange), - // id, - // glutenInputType, - // Map.of(id, outputType), - // RowData.class, - // RowData.class, - // "StreamExecExchange"); - // inputTransform = - // ExecNodeUtil.createOneInputTransformation( - // inputTransform, - // new TransformationMetadata("exchange-hash", "Gluten exchange hash"), - // exchangeKeyGenerator, - // inputTransform.getOutputType(), - // parallelism, - // false); + final ExecEdge inputEdge = getInputEdges().get(0); + io.github.zhztheplayer.velox4j.type.RowType glutenInputType = + (io.github.zhztheplayer.velox4j.type.RowType) + LogicalTypeConverter.toVLType(inputEdge.getOutputType()); + io.github.zhztheplayer.velox4j.type.RowType outputType = + (io.github.zhztheplayer.velox4j.type.RowType) + LogicalTypeConverter.toVLType(getOutputType()); + String id = PlanNodeIdGenerator.newId(); + List keyIndexes = 
Arrays.stream(keys).boxed().collect(Collectors.toList()); + PartitionFunctionSpec partitionFunctionSpec = + new HashPartitionFunctionSpec(glutenInputType, keyIndexes); + PlanNode localPartition = + new LocalPartitionNode( + id, + List.of(new EmptyNode(outputType)), + "REPARTITION", + false, + partitionFunctionSpec); + PlanNode exchange = new StreamPartitionNode(id, localPartition, parallelism); + final OneInputStreamOperator exchangeKeyGenerator = + new GlutenOneInputOperator( + new StatefulPlanNode(id, exchange), + id, + glutenInputType, + Map.of(id, outputType), + RowData.class, + RowData.class, + "StreamExecExchange"); + inputTransform = + ExecNodeUtil.createOneInputTransformation( + inputTransform, + new TransformationMetadata("exchange-hash", "Gluten exchange hash"), + exchangeKeyGenerator, + inputTransform.getOutputType(), + parallelism, + false); partitioner = - new GlutenKeyGroupStreamPartitioner( - keySelector, DEFAULT_LOWER_BOUND_MAX_PARALLELISM, parallelism); + new GlutenKeyGroupStreamPartitioner(keySelector, DEFAULT_LOWER_BOUND_MAX_PARALLELISM); } else { parallelism = ExecutionConfig.PARALLELISM_DEFAULT; partitioner = From 085367e975bc01ef29fa75d78b022b0fdfa1240e Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Thu, 18 Jun 2026 04:30:59 +0000 Subject: [PATCH 15/17] remove useless changes --- .../gluten/table/runtime/operators/GlutenSourceFunction.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java index 0cb386550e4..53f36fcf67c 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/GlutenSourceFunction.java @@ -213,7 +213,7 @@ public void snapshotState(FunctionSnapshotContext context) throws 
Exception { public void initializeState(FunctionInitializationContext context) throws Exception { initSession(); // TODO: implement it - // this.task.initializeState(0, null); + this.task.initializeState(0, null); } public String[] notifyCheckpointComplete(long checkpointId) throws Exception { From 59b32779633cea39c65662dd0dd205515897fe6b Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Thu, 18 Jun 2026 06:17:17 +0000 Subject: [PATCH 16/17] fix compile error --- .../nodes/exec/stream/StreamExecExchange.java | 19 +++++++++++++++++-- .../runtime/operators/WindowAggOperator.java | 5 ----- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java index 17e3966d52a..7989c7000a8 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java @@ -20,6 +20,7 @@ import org.apache.gluten.streaming.runtime.partitioner.GlutenKeyGroupStreamPartitioner; import org.apache.gluten.table.runtime.keyselector.GlutenKeySelector; import org.apache.gluten.table.runtime.operators.GlutenOneInputOperator; +import org.apache.gluten.table.runtime.operators.WindowAggOperator; import org.apache.gluten.util.LogicalTypeConverter; import org.apache.gluten.util.PlanNodeIdGenerator; @@ -112,6 +113,18 @@ public StreamExecExchange( checkArgument(inputProperties.size() == 1); } + private static boolean isWindowPropertyField(String fieldName) { + return "window_start".equals(fieldName) + || "window_end".equals(fieldName) + || "window_time".equals(fieldName); + } + + private boolean isWindowAggregateExchange(OneInputTransformation inputTransform) { + return inputTransform.getOperator() instanceof 
WindowAggOperator + || ((RowType) getOutputType()).getFieldNames().stream() + .anyMatch(StreamExecExchange::isWindowPropertyField); + } + @SuppressWarnings("unchecked") @Override protected Transformation translateToPlanInternal( @@ -139,7 +152,8 @@ protected Transformation translateToPlanInternal( planner.getFlinkContext().getClassLoader(), keys, inputType); // --- Begin Gluten-specific code changes --- OneInputTransformation oneInputTransform = (OneInputTransformation) inputTransform; - if (oneInputTransform.getOperator() instanceof GlutenOperator) { + if (oneInputTransform.getOperator() instanceof GlutenOperator + && !isWindowAggregateExchange(oneInputTransform)) { // TODO: velox's parallelism need to be set here, as some nodes need it. // should set it when operator init. parallelism = inputTransform.getParallelism(); @@ -181,7 +195,8 @@ protected Transformation translateToPlanInternal( parallelism, false); partitioner = - new GlutenKeyGroupStreamPartitioner(keySelector, DEFAULT_LOWER_BOUND_MAX_PARALLELISM); + new GlutenKeyGroupStreamPartitioner( + keySelector, DEFAULT_LOWER_BOUND_MAX_PARALLELISM, parallelism); } else { parallelism = ExecutionConfig.PARALLELISM_DEFAULT; partitioner = diff --git a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/WindowAggOperator.java b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/WindowAggOperator.java index 8c2d5bbc056..7eb71ee551c 100644 --- a/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/WindowAggOperator.java +++ b/gluten-flink/runtime/src/main/java/org/apache/gluten/table/runtime/operators/WindowAggOperator.java @@ -137,11 +137,6 @@ public void initializeState(StateInitializationContext context) throws Exception } } - @Override - public boolean operateOnProcessTime() { - return !isRowTime; - } - @Override public WindowAggOperator cloneWithInputOutputClasses( StatefulPlanNode plan, Class newInClass, Class newOutClass) { From 
b3fd8c16f09f8902ffb9cf5dc6d6f7c83c7a347a Mon Sep 17 00:00:00 2001 From: zouyunhe Date: Thu, 18 Jun 2026 06:20:16 +0000 Subject: [PATCH 17/17] remove useless changes --- .../planner/plan/nodes/exec/stream/StreamExecExchange.java | 4 ++-- .../nodes/exec/stream/StreamExecLocalWindowAggregate.java | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java index 7989c7000a8..8bedbd89321 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecExchange.java @@ -121,8 +121,8 @@ private static boolean isWindowPropertyField(String fieldName) { private boolean isWindowAggregateExchange(OneInputTransformation inputTransform) { return inputTransform.getOperator() instanceof WindowAggOperator - || ((RowType) getOutputType()).getFieldNames().stream() - .anyMatch(StreamExecExchange::isWindowPropertyField); + || ((RowType) getOutputType()) + .getFieldNames().stream().anyMatch(StreamExecExchange::isWindowPropertyField); } @SuppressWarnings("unchecked") diff --git a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java index 5ffddd367c2..574362cf5ad 100644 --- a/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java +++ b/gluten-flink/planner/src/main/java/org/apache/flink/table/planner/plan/nodes/exec/stream/StreamExecLocalWindowAggregate.java @@ -61,8 +61,6 @@ import org.apache.calcite.rel.core.AggregateCall; 
import org.apache.commons.math3.util.ArithmeticUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import javax.annotation.Nullable; @@ -88,7 +86,6 @@ minStateVersion = FlinkVersion.v1_15) public class StreamExecLocalWindowAggregate extends StreamExecWindowAggregateBase { - private static final Logger LOG = LoggerFactory.getLogger(StreamExecLocalWindowAggregate.class); public static final String LOCAL_WINDOW_AGGREGATE_TRANSFORMATION = "local-window-aggregate"; private static final long WINDOW_AGG_MEMORY_RATIO = 100;