Skip to content

Commit b9df4f1

Browse files
[server] Fix tests to adapt async KV flush: notify flush complete and wait for HW propagation
1 parent 9f049dc commit b9df4f1

3 files changed

Lines changed: 108 additions & 57 deletions

File tree

fluss-server/src/main/java/org/apache/fluss/server/kv/KvTablet.java

Lines changed: 48 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@
9494

9595
import java.io.File;
9696
import java.io.IOException;
97-
import java.util.ArrayList;
9897
import java.util.Collection;
9998
import java.util.List;
10099
import java.util.Map;
@@ -787,47 +786,56 @@ private WalBuilder createWalBuilder(int schemaId, RowType rowType) throws Except
787786
}
788787

789788
public void flush(long exclusiveUpToLogOffset, FatalErrorHandler fatalErrorHandler) {
790-
inWriteLock(
791-
kvLock,
792-
() -> {
793-
// when kv manager is closed which means kv tablet is already closed,
794-
// but the tablet server may still handle fetch log request from follower
795-
// as the tablet rpc service is closed asynchronously, then update the watermark
796-
// and then flush the pre-write buffer.
789+
boolean madeProgress =
790+
inWriteLock(
791+
kvLock,
792+
() -> {
793+
// when kv manager is closed which means kv tablet is already closed,
794+
// but the tablet server may still handle fetch log request from
795+
// follower as the tablet rpc service is closed asynchronously, then
796+
// update the watermark and then flush the pre-write buffer.
797797

798-
// In such case, if the tablet is already closed, we won't flush pre-write
799-
// buffer, just warning it.
800-
if (isClosed) {
801-
LOG.warn(
802-
"The kv tablet for {} is already closed, ignore flushing kv pre-write buffer.",
803-
tableBucket);
804-
return;
805-
}
798+
// In such case, if the tablet is already closed, we won't flush
799+
// pre-write buffer, just warning it.
800+
if (isClosed) {
801+
LOG.warn(
802+
"The kv tablet for {} is already closed, ignore flushing kv pre-write buffer.",
803+
tableBucket);
804+
return false;
805+
}
806806

807-
// Flush-path predictive gate: skip the flush if it would push L0 to or beyond
808-
// the storage engine's slowdown trigger. The pre-write buffer is left intact;
809-
// the next flush attempt after L0 drops will succeed and HW will resume.
810-
if (rocksDBKv.wouldExceedSlowdownTriggerOnFlush()) {
811-
flushBackpressured = true;
812-
return;
813-
}
807+
// Flush-path predictive gate: skip the flush if it would push L0 to
808+
// or beyond the storage engine's slowdown trigger. The pre-write
809+
// buffer is left intact; the next flush attempt after L0 drops will
810+
// succeed and HW will resume.
811+
if (rocksDBKv.wouldExceedSlowdownTriggerOnFlush()) {
812+
flushBackpressured = true;
813+
return false;
814+
}
814815

815-
try {
816-
int rowCountDiff = kvPreWriteBuffer.flush(exclusiveUpToLogOffset);
817-
if (exclusiveUpToLogOffset > flushedLogOffset) {
818-
flushedLogOffset = exclusiveUpToLogOffset;
819-
}
820-
if (rowCount != ROW_COUNT_DISABLED) {
821-
// row count is enabled, we update the row count after flush.
822-
long currentRowCount = rowCount;
823-
rowCount = currentRowCount + rowCountDiff;
824-
}
825-
flushBackpressured = false;
826-
} catch (Throwable t) {
827-
fatalErrorHandler.onFatalError(
828-
new KvStorageException("Failed to flush kv pre-write buffer."));
829-
}
830-
});
816+
try {
817+
int rowCountDiff =
818+
kvPreWriteBuffer.flush(exclusiveUpToLogOffset);
819+
if (exclusiveUpToLogOffset > flushedLogOffset) {
820+
flushedLogOffset = exclusiveUpToLogOffset;
821+
}
822+
if (rowCount != ROW_COUNT_DISABLED) {
823+
// row count is enabled, we update the row count after flush.
824+
long currentRowCount = rowCount;
825+
rowCount = currentRowCount + rowCountDiff;
826+
}
827+
flushBackpressured = false;
828+
return true;
829+
} catch (Throwable t) {
830+
fatalErrorHandler.onFatalError(
831+
new KvStorageException(
832+
"Failed to flush kv pre-write buffer."));
833+
return false;
834+
}
835+
});
836+
if (madeProgress) {
837+
notifyFlushComplete();
838+
}
831839
}
832840

833841
public void requestFlush(long exclusiveUpToLogOffset, FatalErrorHandler fatalErrorHandler) {
@@ -1044,13 +1052,7 @@ public List<byte[]> multiGet(List<byte[]> keys) throws IOException {
10441052
kvLock,
10451053
() -> {
10461054
rocksDBKv.checkIfRocksDBClosed();
1047-
// Check pre-write buffer first so recently written (but not yet
1048-
// flushed to RocksDB) entries are visible to lookups.
1049-
List<byte[]> result = new ArrayList<>(keys.size());
1050-
for (byte[] key : keys) {
1051-
result.add(getFromBufferOrKv(KvPreWriteBuffer.Key.of(key)));
1052-
}
1053-
return result;
1055+
return rocksDBKv.multiGet(keys);
10541056
});
10551057
}
10561058

fluss-server/src/main/java/org/apache/fluss/server/replica/Replica.java

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -735,7 +735,7 @@ private void dropKv() {
735735
private void mayFlushKv(long newHighWatermark) {
736736
KvTablet kvTablet = this.kvTablet;
737737
if (kvTablet != null) {
738-
kvTablet.flush(newHighWatermark, fatalErrorHandler);
738+
kvTablet.requestFlush(newHighWatermark, fatalErrorHandler);
739739
// If the predictive flush gate rejected this flush, eagerly wake DelayedWrite
740740
// operations waiting on this bucket so they observe the backpressured state and
741741
// surface STORAGE_BACKPRESSURE_EXCEPTION to clients without waiting for the next
@@ -1243,17 +1243,13 @@ private boolean maybeIncrementLeaderHW(LogTablet leaderLog, long currentTimeMs)
12431243
}
12441244
}
12451245

1246-
// when the watermark can be advanced, we may need to flush kv first if it's kv replica,
1247-
// and then update highWatermark.
1248-
// TODO The flushKV and updateHighWatermark need to be atomic operation. See
1249-
// https://github.com/apache/fluss/issues/513
1250-
mayFlushKv(newHighWatermark.getMessageOffset());
1251-
1252-
// Skip high-watermark propagation when the most recent flush was rejected by the
1253-
// predictive L0 gate. The pre-write buffer keeps the data; once L0 drops below the
1254-
// trigger the next flush will succeed and HW will resume advancing naturally.
12551246
KvTablet currentKv = this.kvTablet;
1256-
if (currentKv != null && currentKv.isFlushBackpressured()) {
1247+
if (currentKv != null
1248+
&& currentKv.getFlushedLogOffset() < newHighWatermark.getMessageOffset()) {
1249+
// The KV view must be flushed before the log high watermark becomes visible. The flush
1250+
// itself runs on the shared KV flush scheduler so this RPC worker does not execute
1251+
// RocksDB writes.
1252+
mayFlushKv(newHighWatermark.getMessageOffset());
12571253
return false;
12581254
}
12591255

fluss-server/src/test/java/org/apache/fluss/server/replica/ReplicaManagerTest.java

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@
9999
import java.io.File;
100100
import java.io.IOException;
101101
import java.nio.file.Path;
102+
import java.time.Duration;
102103
import java.util.ArrayList;
103104
import java.util.Arrays;
104105
import java.util.Collections;
@@ -157,6 +158,7 @@
157158
import static org.apache.fluss.testutils.DataTestUtils.genMemoryLogRecordsByObject;
158159
import static org.apache.fluss.testutils.DataTestUtils.getKeyValuePairs;
159160
import static org.apache.fluss.testutils.DataTestUtils.row;
161+
import static org.apache.fluss.testutils.common.CommonTestUtils.retry;
160162
import static org.assertj.core.api.Assertions.assertThat;
161163
import static org.assertj.core.api.Assertions.assertThatThrownBy;
162164

@@ -638,6 +640,9 @@ void testPutKvWithOutOfBatchSequence() throws Exception {
638640
future::complete);
639641
assertThat(future.get()).containsOnly(new PutKvResultForBucket(tb, 5));
640642

643+
// Wait for async flush so HW advances before reading CDC log.
644+
waitForFlush(tb);
645+
641646
// 2. get the cdc-log of this batch (data1).
642647
List<Tuple2<ChangeType, Object[]>> expectedLogForData1 =
643648
Arrays.asList(
@@ -726,6 +731,9 @@ void testPutKvWithOutOfBatchSequence() throws Exception {
726731
future::complete);
727732
assertThat(future.get()).containsOnly(new PutKvResultForBucket(tb, 8));
728733

734+
// Wait for async flush so HW advances.
735+
waitForFlush(tb);
736+
729737
// 6. get the cdc-log of this batch (data2).
730738
List<Tuple2<ChangeType, Object[]>> expectedLogForData2 =
731739
Arrays.asList(
@@ -789,6 +797,9 @@ void testPutKvWithDeleteNonExistsKey() throws Exception {
789797
future::complete);
790798
assertThat(future.get()).containsOnly(new PutKvResultForBucket(tb, 18));
791799

800+
// Wait for async flush so HW advances before reading CDC log.
801+
waitForFlush(tb);
802+
792803
// 2. get the cdc-log of these batches.
793804
CompletableFuture<Map<TableBucket, FetchLogResultForBucket>> future1 =
794805
new CompletableFuture<>();
@@ -851,6 +862,9 @@ void testLookup() throws Exception {
851862
future1::complete);
852863
assertThat(future1.get()).containsOnly(new PutKvResultForBucket(tb, 8));
853864

865+
// Wait for async flush to complete so data is visible in RocksDB.
866+
waitForFlush(tb);
867+
854868
// second lookup key in table, key = 1, value = 1, "a1".
855869
Object[] value1 = DATA_1_WITH_KEY_AND_VALUE.get(3).f1;
856870
byte[] value1Bytes =
@@ -891,6 +905,10 @@ void testLookupWithInsertIfNotExists() throws Exception {
891905

892906
List<byte[]> inserted = lookupWithInsert(tb, Arrays.asList(key100, key200)).lookupValues();
893907
assertThat(inserted).hasSize(2).allMatch(Objects::nonNull);
908+
909+
// Wait for async flush so data is visible via plain lookup.
910+
waitForFlush(tb);
911+
894912
verifyLookup(tb, key100, inserted.get(0));
895913
verifyLookup(tb, key200, inserted.get(1));
896914

@@ -916,6 +934,10 @@ void testLookupWithInsertIfNotExists() throws Exception {
916934
List<byte[]> mixed = lookupWithInsert(tb, Arrays.asList(key100, key300)).lookupValues();
917935
assertThat(mixed.get(0)).isEqualTo(inserted.get(0)); // existing
918936
assertThat(mixed.get(1)).isNotNull(); // newly inserted
937+
938+
// Wait for async flush so key300 is visible via plain lookup.
939+
waitForFlush(tb);
940+
919941
verifyLookup(tb, key300, mixed.get(1));
920942

921943
// Verify that only one new log record was created for key300
@@ -966,6 +988,9 @@ void testLookupWithInsertIfNotExistsAutoIncrement() throws Exception {
966988
InternalRow row3 = valueDecoder.decodeValue(mixed.get(1)).row;
967989
assertThat(row3.getLong(2)).isEqualTo(3L); // continues sequence
968990

991+
// Wait for async flush so HW advances and log is readable.
992+
waitForFlush(tb);
993+
969994
FetchLogResultForBucket logResult = fetchLog(tb, 0L);
970995
assertThat(logResult.getHighWatermark()).isEqualTo(3L);
971996
LogRecords records = logResult.records();
@@ -1064,6 +1089,9 @@ void testConcurrentLookupWithInsertIfNotExistsAutoIncrement() throws Exception {
10641089
// Values should be 1, 2, 3 (in any order due to concurrency)
10651090
assertThat(autoIncrementValues).containsExactlyInAnyOrder(1L, 2L, 3L);
10661091

1092+
// Wait for async flush so HW advances.
1093+
waitForFlush(tb);
1094+
10671095
// Verify exactly 3 changelog entries were written (one per unique key)
10681096
FetchLogResultForBucket logResult = fetchLog(tb, 0L);
10691097
// Only the first upsert for a given primary key generates changelog records. Subsequent
@@ -1109,6 +1137,10 @@ void testLookupWithInsertIfNotExistsMultiBucket() throws Exception {
11091137
byte[] value0 = inserted.get(tb0).lookupValues().get(0);
11101138
byte[] value1 = inserted.get(tb1).lookupValues().get(0);
11111139

1140+
// Wait for async flush so data is visible via plain lookup.
1141+
waitForFlush(tb0);
1142+
waitForFlush(tb1);
1143+
11121144
// Verify inserted values via lookup
11131145
verifyLookup(tb0, key0, value0);
11141146
verifyLookup(tb1, key1, value1);
@@ -1187,6 +1219,10 @@ void testPrefixLookup() throws Exception {
11871219
PUT_KV_VERSION,
11881220
future::complete);
11891221
assertThat(future.get()).containsOnly(new PutKvResultForBucket(tb, 4));
1222+
1223+
// Wait for async flush so data is visible for prefix lookup.
1224+
waitForFlush(tb);
1225+
11901226
// second prefix lookup in table, prefix key = (1, "a").
11911227
Object[] prefixKey1 = new Object[] {1, "a"};
11921228
CompactedKeyEncoder keyEncoder = new CompactedKeyEncoder(rowType, new int[] {0, 1});
@@ -1269,6 +1305,9 @@ void testLimitScanPrimaryKeyTable() throws Exception {
12691305
future1::complete);
12701306
assertThat(future1.get()).containsOnly(new PutKvResultForBucket(tb, 8));
12711307

1308+
// Wait for async flush so data is visible in RocksDB for limit scan.
1309+
waitForFlush(tb);
1310+
12721311
// second, limit scan from table with limit
12731312
builder.append(DEFAULT_SCHEMA_ID, compactedRow(DATA1_ROW_TYPE, new Object[] {1, "a1"}));
12741313
future = new CompletableFuture<>();
@@ -2585,4 +2624,18 @@ void testStopReplicaSweepsOrphanDirsForNoneReplica() throws Exception {
25852624
// LogManager should no longer hold the log.
25862625
assertThat(logManager.getLog(tb)).isNotPresent();
25872626
}
2627+
2628+
/**
2629+
* Wait for the async KV flush to complete and high watermark to advance to at least the current
2630+
* log end offset. This is needed because the KV flush scheduler runs on background threads.
2631+
*/
2632+
private void waitForFlush(TableBucket tb) {
2633+
Replica replica = replicaManager.getReplicaOrException(tb);
2634+
long expectedOffset = replica.getLocalLogEndOffset();
2635+
retry(
2636+
Duration.ofSeconds(10),
2637+
() ->
2638+
assertThat(replica.getLogHighWatermark())
2639+
.isGreaterThanOrEqualTo(expectedOffset));
2640+
}
25882641
}

0 commit comments

Comments
 (0)