Skip to content

Commit d13e52e

Browse files
authored
global sort: add boundaries to split keys when generating plan (#58323) (#58356)
close #58267
1 parent 678b713 commit d13e52e

File tree

7 files changed

+157
-69
lines changed

7 files changed

+157
-69
lines changed

pkg/ddl/backfilling_dist_scheduler.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -447,7 +447,7 @@ func splitSubtaskMetaForOneKVMetaGroup(
447447
startKey := kvMeta.StartKey
448448
var endKey kv.Key
449449
for {
450-
endKeyOfGroup, dataFiles, statFiles, interiorRangeJobKeys, regionSplitKeys, err := splitter.SplitOneRangesGroup()
450+
endKeyOfGroup, dataFiles, statFiles, interiorRangeJobKeys, interiorRegionSplitKeys, err := splitter.SplitOneRangesGroup()
451451
if err != nil {
452452
return nil, err
453453
}
@@ -468,6 +468,10 @@ func splitSubtaskMetaForOneKVMetaGroup(
468468
rangeJobKeys = append(rangeJobKeys, startKey)
469469
rangeJobKeys = append(rangeJobKeys, interiorRangeJobKeys...)
470470
rangeJobKeys = append(rangeJobKeys, endKey)
471+
regionSplitKeys := make([][]byte, 0, len(interiorRegionSplitKeys)+2)
472+
regionSplitKeys = append(regionSplitKeys, startKey)
473+
regionSplitKeys = append(regionSplitKeys, interiorRegionSplitKeys...)
474+
regionSplitKeys = append(regionSplitKeys, endKey)
471475
m := &BackfillSubTaskMeta{
472476
MetaGroups: []*external.SortedKVMeta{{
473477
StartKey: startKey,

pkg/disttask/importinto/BUILD.bazel

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,9 @@ go_test(
8989
embed = [":importinto"],
9090
flaky = True,
9191
race = "on",
92-
shard_count = 16,
92+
shard_count = 17,
9393
deps = [
94+
"//br/pkg/storage",
9495
"//pkg/ddl",
9596
"//pkg/disttask/framework/planner",
9697
"//pkg/disttask/framework/proto",
@@ -124,6 +125,7 @@ go_test(
124125
"@com_github_stretchr_testify//require",
125126
"@com_github_stretchr_testify//suite",
126127
"@com_github_tikv_client_go_v2//util",
128+
"@com_github_tikv_pd_client//:client",
127129
"@org_uber_go_mock//gomock",
128130
"@org_uber_go_zap//:zap",
129131
],

pkg/disttask/importinto/planner.go

Lines changed: 83 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -372,71 +372,88 @@ func generateWriteIngestSpecs(planCtx planner.PlanCtx, p *LogicalPlan) ([]planne
372372

373373
specs := make([]planner.PipelineSpec, 0, 16)
374374
for kvGroup, kvMeta := range kvMetas {
375-
splitter, err1 := getRangeSplitter(ctx, controller.GlobalSortStore, kvMeta)
376-
if err1 != nil {
377-
return nil, err1
375+
specsForOneSubtask, err3 := splitForOneSubtask(ctx, controller.GlobalSortStore, kvGroup, kvMeta, ts)
376+
if err3 != nil {
377+
return nil, err3
378378
}
379+
specs = append(specs, specsForOneSubtask...)
380+
}
381+
return specs, nil
382+
}
379383

380-
err1 = func() error {
381-
defer func() {
382-
err2 := splitter.Close()
383-
if err2 != nil {
384-
logutil.Logger(ctx).Warn("close range splitter failed", zap.Error(err2))
385-
}
386-
}()
387-
startKey := tidbkv.Key(kvMeta.StartKey)
388-
var endKey tidbkv.Key
389-
for {
390-
endKeyOfGroup, dataFiles, statFiles, interiorRangeJobKeys, regionSplitKeys, err2 := splitter.SplitOneRangesGroup()
391-
if err2 != nil {
392-
return err2
393-
}
394-
if len(endKeyOfGroup) == 0 {
395-
endKey = kvMeta.EndKey
396-
} else {
397-
endKey = tidbkv.Key(endKeyOfGroup).Clone()
398-
}
399-
logutil.Logger(ctx).Info("kv range as subtask",
400-
zap.String("startKey", hex.EncodeToString(startKey)),
401-
zap.String("endKey", hex.EncodeToString(endKey)),
402-
zap.Int("dataFiles", len(dataFiles)))
403-
if startKey.Cmp(endKey) >= 0 {
404-
return errors.Errorf("invalid kv range, startKey: %s, endKey: %s",
405-
hex.EncodeToString(startKey), hex.EncodeToString(endKey))
406-
}
407-
rangeJobKeys := make([][]byte, 0, len(interiorRangeJobKeys)+2)
408-
rangeJobKeys = append(rangeJobKeys, startKey)
409-
rangeJobKeys = append(rangeJobKeys, interiorRangeJobKeys...)
410-
rangeJobKeys = append(rangeJobKeys, endKey)
411-
// each subtask will write and ingest one range group
412-
m := &WriteIngestStepMeta{
413-
KVGroup: kvGroup,
414-
SortedKVMeta: external.SortedKVMeta{
415-
StartKey: startKey,
416-
EndKey: endKey,
417-
// this is actually an estimate, we don't know the exact size of the data
418-
TotalKVSize: uint64(config.DefaultBatchSize),
419-
},
420-
DataFiles: dataFiles,
421-
StatFiles: statFiles,
422-
RangeJobKeys: rangeJobKeys,
423-
RangeSplitKeys: regionSplitKeys,
424-
TS: ts,
425-
}
426-
specs = append(specs, &WriteIngestSpec{m})
427-
428-
startKey = endKey
429-
if len(endKeyOfGroup) == 0 {
430-
break
431-
}
432-
}
433-
return nil
434-
}()
435-
if err1 != nil {
436-
return nil, err1
384+
func splitForOneSubtask(
385+
ctx context.Context,
386+
extStorage storage.ExternalStorage,
387+
kvGroup string,
388+
kvMeta *external.SortedKVMeta,
389+
ts uint64,
390+
) ([]planner.PipelineSpec, error) {
391+
splitter, err := getRangeSplitter(ctx, extStorage, kvMeta)
392+
if err != nil {
393+
return nil, err
394+
}
395+
defer func() {
396+
err3 := splitter.Close()
397+
if err3 != nil {
398+
logutil.Logger(ctx).Warn("close range splitter failed", zap.Error(err3))
399+
}
400+
}()
401+
402+
ret := make([]planner.PipelineSpec, 0, 16)
403+
404+
startKey := tidbkv.Key(kvMeta.StartKey)
405+
var endKey tidbkv.Key
406+
for {
407+
endKeyOfGroup, dataFiles, statFiles, interiorRangeJobKeys, interiorRegionSplitKeys, err2 := splitter.SplitOneRangesGroup()
408+
if err2 != nil {
409+
return nil, err2
410+
}
411+
if len(endKeyOfGroup) == 0 {
412+
endKey = kvMeta.EndKey
413+
} else {
414+
endKey = tidbkv.Key(endKeyOfGroup).Clone()
415+
}
416+
logutil.Logger(ctx).Info("kv range as subtask",
417+
zap.String("startKey", hex.EncodeToString(startKey)),
418+
zap.String("endKey", hex.EncodeToString(endKey)),
419+
zap.Int("dataFiles", len(dataFiles)))
420+
if startKey.Cmp(endKey) >= 0 {
421+
return nil, errors.Errorf("invalid kv range, startKey: %s, endKey: %s",
422+
hex.EncodeToString(startKey), hex.EncodeToString(endKey))
423+
}
424+
rangeJobKeys := make([][]byte, 0, len(interiorRangeJobKeys)+2)
425+
rangeJobKeys = append(rangeJobKeys, startKey)
426+
rangeJobKeys = append(rangeJobKeys, interiorRangeJobKeys...)
427+
rangeJobKeys = append(rangeJobKeys, endKey)
428+
429+
regionSplitKeys := make([][]byte, 0, len(interiorRegionSplitKeys)+2)
430+
regionSplitKeys = append(regionSplitKeys, startKey)
431+
regionSplitKeys = append(regionSplitKeys, interiorRegionSplitKeys...)
432+
regionSplitKeys = append(regionSplitKeys, endKey)
433+
// each subtask will write and ingest one range group
434+
m := &WriteIngestStepMeta{
435+
KVGroup: kvGroup,
436+
SortedKVMeta: external.SortedKVMeta{
437+
StartKey: startKey,
438+
EndKey: endKey,
439+
// this is actually an estimate, we don't know the exact size of the data
440+
TotalKVSize: uint64(config.DefaultBatchSize),
441+
},
442+
DataFiles: dataFiles,
443+
StatFiles: statFiles,
444+
RangeJobKeys: rangeJobKeys,
445+
RangeSplitKeys: regionSplitKeys,
446+
TS: ts,
447+
}
448+
ret = append(ret, &WriteIngestSpec{m})
449+
450+
startKey = endKey
451+
if len(endKeyOfGroup) == 0 {
452+
break
437453
}
438454
}
439-
return specs, nil
455+
456+
return ret, nil
440457
}
441458

442459
func getSortedKVMetasOfEncodeStep(subTaskMetas [][]byte) (map[string]*external.SortedKVMeta, error) {
@@ -508,8 +525,11 @@ func getSortedKVMetasForIngest(planCtx planner.PlanCtx, p *LogicalPlan) (map[str
508525
return kvMetasOfMergeSort, nil
509526
}
510527

511-
func getRangeSplitter(ctx context.Context, store storage.ExternalStorage, kvMeta *external.SortedKVMeta) (
512-
*external.RangeSplitter, error) {
528+
func getRangeSplitter(
529+
ctx context.Context,
530+
store storage.ExternalStorage,
531+
kvMeta *external.SortedKVMeta,
532+
) (*external.RangeSplitter, error) {
513533
regionSplitSize, regionSplitKeys, err := importer.GetRegionSplitSizeKeys(ctx)
514534
if err != nil {
515535
logutil.Logger(ctx).Warn("fail to get region split size and keys", zap.Error(err))

pkg/disttask/importinto/planner_test.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,20 @@ import (
2020
"fmt"
2121
"testing"
2222

23+
"github.com/pingcap/errors"
2324
"github.com/pingcap/failpoint"
25+
"github.com/pingcap/tidb/br/pkg/storage"
2426
"github.com/pingcap/tidb/pkg/disttask/framework/planner"
2527
"github.com/pingcap/tidb/pkg/disttask/framework/proto"
2628
"github.com/pingcap/tidb/pkg/domain/infosync"
2729
"github.com/pingcap/tidb/pkg/executor/importer"
30+
"github.com/pingcap/tidb/pkg/kv"
2831
"github.com/pingcap/tidb/pkg/lightning/backend/external"
2932
"github.com/pingcap/tidb/pkg/meta/autoid"
3033
"github.com/pingcap/tidb/pkg/meta/model"
3134
pmodel "github.com/pingcap/tidb/pkg/parser/model"
3235
"github.com/stretchr/testify/require"
36+
pd "github.com/tikv/pd/client"
3337
)
3438

3539
func TestLogicalPlan(t *testing.T) {
@@ -283,3 +287,57 @@ func TestGetSortedKVMetas(t *testing.T) {
283287
require.Equal(t, []byte("i1_0_a"), allKVMetas["1"].StartKey)
284288
require.Equal(t, []byte("i1_2_c"), allKVMetas["1"].EndKey)
285289
}
290+
291+
func TestSplitForOneSubtask(t *testing.T) {
292+
ctx := context.Background()
293+
workDir := t.TempDir()
294+
store, err := storage.NewLocalStorage(workDir)
295+
require.NoError(t, err)
296+
297+
// about 140MB data
298+
largeValue := make([]byte, 1024*1024)
299+
keys := make([][]byte, 140)
300+
values := make([][]byte, 140)
301+
for i := 0; i < 140; i++ {
302+
keys[i] = []byte(fmt.Sprintf("%05d", i))
303+
values[i] = largeValue
304+
}
305+
306+
var multiFileStat []external.MultipleFilesStat
307+
writer := external.NewWriterBuilder().
308+
SetMemorySizeLimit(40*1024*1024).
309+
SetBlockSize(20*1024*1024).
310+
SetPropSizeDistance(5*1024*1024).
311+
SetPropKeysDistance(5).
312+
SetOnCloseFunc(func(s *external.WriterSummary) {
313+
multiFileStat = s.MultipleFilesStats
314+
}).
315+
Build(store, "/mock-test", "0")
316+
_, _, err = external.MockExternalEngineWithWriter(
317+
store, writer, "/mock-test", keys, values,
318+
)
319+
require.NoError(t, err)
320+
kvMeta := &external.SortedKVMeta{
321+
StartKey: keys[0],
322+
EndKey: kv.Key(keys[len(keys)-1]).Next(),
323+
MultipleFilesStats: multiFileStat,
324+
}
325+
326+
bak := importer.NewClientWithContext
327+
t.Cleanup(func() {
328+
importer.NewClientWithContext = bak
329+
})
330+
importer.NewClientWithContext = func(_ context.Context, _ []string, _ pd.SecurityOption, _ ...pd.ClientOption) (pd.Client, error) {
331+
return nil, errors.New("mock error")
332+
}
333+
334+
spec, err := splitForOneSubtask(ctx, store, "test-group", kvMeta, 123)
335+
require.NoError(t, err)
336+
337+
require.Len(t, spec, 1)
338+
writeSpec := spec[0].(*WriteIngestSpec)
339+
require.Equal(t, "test-group", writeSpec.KVGroup)
340+
require.Equal(t, [][]byte{
341+
[]byte("00000"), []byte("00096"), []byte("00139\x00"),
342+
}, writeSpec.RangeSplitKeys)
343+
}

pkg/lightning/backend/external/split.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,14 +156,15 @@ func (r *RangeSplitter) Close() error {
156156
// but it will be nil when the group is the last one. `dataFiles` and `statFiles`
157157
// are all the files that have overlapping key ranges in this group.
158158
// `interiorRangeJobKeys` are the interior boundary keys of the range jobs, the
159-
// range can be constructed with start/end key at caller. `regionSplitKeys` are
160-
// the split keys that will be used later to split regions.
159+
// range can be constructed with start/end key at caller.
160+
// `interiorRegionSplitKeys` are the split keys that will be used later to split
161+
// regions.
161162
func (r *RangeSplitter) SplitOneRangesGroup() (
162163
endKeyOfGroup []byte,
163164
dataFiles []string,
164165
statFiles []string,
165166
interiorRangeJobKeys [][]byte,
166-
regionSplitKeys [][]byte,
167+
interiorRegionSplitKeys [][]byte,
167168
err error,
168169
) {
169170
var (

pkg/lightning/backend/local/local.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1347,7 +1347,7 @@ func (local *Backend) ImportEngine(
13471347

13481348
log.FromContext(ctx).Info("start import engine",
13491349
zap.Stringer("uuid", engineUUID),
1350-
zap.Int("region ranges", len(splitKeys)),
1350+
zap.Int("region ranges", len(splitKeys)-1),
13511351
zap.Int64("count", lfLength),
13521352
zap.Int64("size", lfTotalSize))
13531353

pkg/lightning/common/engine.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ type Engine interface {
4444
// keys that can be used as region split keys. If the duplicate detection is
4545
// enabled, the keys stored in engine are encoded by duplicate detection but the
4646
// returned keys should not be encoded.
47+
//
48+
// Currently, the start/end key of this import should also be included in the
49+
// returned split keys.
4750
GetRegionSplitKeys() ([][]byte, error)
4851
Close() error
4952
}

0 commit comments

Comments
 (0)