diff --git a/api/fine-tuning/cmd/e2e-serving-test/main.go b/api/fine-tuning/cmd/e2e-serving-test/main.go new file mode 100644 index 00000000..7cb67ad5 --- /dev/null +++ b/api/fine-tuning/cmd/e2e-serving-test/main.go @@ -0,0 +1,508 @@ +package main + +import ( + "bytes" + "context" + "crypto/ecdsa" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "strings" + "time" + + commonConfig "github.com/0glabs/0g-serving-broker/common/config" + commonLog "github.com/0glabs/0g-serving-broker/common/log" + "github.com/0glabs/0g-serving-broker/fine-tuning/config" + "github.com/0glabs/0g-serving-broker/fine-tuning/internal/db" + "github.com/0glabs/0g-serving-broker/fine-tuning/internal/serving" + "github.com/ethereum/go-ethereum/accounts" + "github.com/ethereum/go-ethereum/crypto" + "github.com/gin-gonic/gin" + "github.com/google/uuid" +) + +func main() { + logCfg := &commonConfig.LoggerConfig{ + Format: "text", + Level: "debug", + } + logger, err := commonLog.GetLogger(logCfg) + if err != nil { + panic(err) + } + + fmt.Println("========================================") + fmt.Println(" 0G Broker Serving Module E2E Test") + fmt.Println("========================================") + + // --- Step 1: Connect to MySQL --- + fmt.Println("\n[1/7] Connecting to MySQL...") + cfg := &config.Config{} + cfg.Database.FineTune = "root:123456@tcp(127.0.0.1:3306)/fineTune?parseTime=true" + database, err := db.NewDB(cfg, logger) + if err != nil { + fmt.Printf("FAIL: MySQL connection: %v\n", err) + os.Exit(1) + } + if err := database.Migrate(); err != nil { + fmt.Printf("FAIL: migration: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS: MySQL connected, schema migrated") + + // --- Step 2: Generate test user keypair --- + fmt.Println("\n[2/7] Creating test user keypair...") + testPrivKey, err := crypto.GenerateKey() + if err != nil { + fmt.Printf("FAIL: key generation: %v\n", err) + os.Exit(1) + } + testAddr := crypto.PubkeyToAddress(testPrivKey.PublicKey) + 
fmt.Printf("PASS: Test user address: %s\n", testAddr.Hex()) + + authSig := generateAuthSignature(testPrivKey) + fmt.Printf("PASS: Auth signature generated (len=%d)\n", len(authSig)) + + // --- Step 3: Insert test tasks --- + fmt.Println("\n[3/7] Inserting simulated finished tasks...") + taskIDs := make([]uuid.UUID, 3) + loraAdapters := []string{ + "/root/lora-modules/ft-lora-adapter-0", + "/root/lora-modules/ft-lora-adapter-1", + "/root/lora-modules/ft-lora-adapter-2", + } + for i := 0; i < 3; i++ { + id := uuid.New() + taskIDs[i] = id + if err := database.InsertTestTask(id, strings.ToLower(testAddr.Hex()), "Qwen2.5-0.5B"); err != nil { + fmt.Printf("FAIL: insert task %d: %v\n", i, err) + os.Exit(1) + } + fmt.Printf(" Task %d: %s\n", i, id.String()) + } + fmt.Println("PASS: 3 finished tasks inserted") + + // --- Step 4: Create serving Manager --- + fmt.Println("\n[4/7] Starting serving Manager + vLLM...") + servingCfg := serving.ServingConfig{ + Enable: true, + BaseModelPath: "/root/models/Qwen2.5-0.5B-Instruct", + InferenceGPUIDs: "0", + VLLMPort: 8000, + MaxLoraRank: 64, + MaxLoraModules: 16, + MaxCpuLoras: 32, + LoraModulesDir: "/root/e2e-lora-modules", + OffloadAfterMinutes: 60, + EnableColdStorage: false, + ModelLoadTimeoutSeconds: 300, + GpuMemoryUtilization: 0.6, + } + + mgr := serving.NewManager(database, servingCfg, logger, nil) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + if err := mgr.Start(ctx); err != nil { + fmt.Printf("FAIL: Manager start: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS: Manager started, vLLM launching in background") + + // --- Step 5: Register LoRA adapters --- + fmt.Println("\n[5/7] Registering LoRA adapters...") + var modelNames []string + for i, taskID := range taskIDs { + name, err := mgr.RegisterModel(taskID, strings.ToLower(testAddr.Hex()), "Qwen2.5-0.5B", loraAdapters[i], "") + if err != nil { + fmt.Printf("FAIL: register adapter %d: %v\n", i, err) + os.Exit(1) + } + modelNames = 
append(modelNames, name) + fmt.Printf(" Registered: %s -> %s\n", name, loraAdapters[i]) + } + fmt.Println("PASS: 3 LoRA adapters registered") + + // --- Step 6: Start HTTP proxy --- + fmt.Println("\n[6/7] Starting HTTP proxy on :3080...") + proxy := serving.NewProxy(mgr, logger) + gin.SetMode(gin.ReleaseMode) + engine := gin.New() + engine.Use(gin.Recovery()) + v1 := engine.Group("/v1") + proxy.RegisterRoutes(v1) + + go func() { + if err := engine.Run(":3080"); err != nil { + fmt.Printf("HTTP server error: %v\n", err) + } + }() + time.Sleep(500 * time.Millisecond) + fmt.Println("PASS: HTTP proxy started") + + // --- Step 7: Wait for vLLM --- + fmt.Println("\n[7/7] Waiting for vLLM to become ready (may take 1-3 minutes)...") + if !waitForVLLM(ctx, 300*time.Second) { + fmt.Println("FAIL: vLLM did not become ready within timeout") + cleanup(database, taskIDs) + os.Exit(1) + } + fmt.Println("PASS: vLLM is ready!") + + // --- Run test cases --- + fmt.Println("\n========================================") + fmt.Println(" Running E2E Test Cases") + fmt.Println("========================================") + + passed, failed := 0, 0 + run := func(name string, fn func() error) { + fmt.Printf("\n--- Test: %s ---\n", name) + if err := fn(); err != nil { + fmt.Printf("FAIL: %v\n", err) + failed++ + } else { + fmt.Printf("PASS\n") + passed++ + } + } + + authHeader := "Bearer " + authSig + + run("Health endpoint", func() error { + resp, body, err := httpGet("http://localhost:3080/v1/serving/health", nil) + if err != nil { + return err + } + if resp.StatusCode != 200 { + return fmt.Errorf("expected 200, got %d: %s", resp.StatusCode, body) + } + fmt.Printf(" Response: %s\n", body) + return nil + }) + + run("List all served models", func() error { + resp, body, err := httpGet("http://localhost:3080/v1/serving/models", nil) + if err != nil { + return err + } + if resp.StatusCode != 200 { + return fmt.Errorf("expected 200, got %d: %s", resp.StatusCode, body) + } + var models 
[]map[string]interface{} + json.Unmarshal([]byte(body), &models) + if len(models) != 3 { + return fmt.Errorf("expected 3 models, got %d", len(models)) + } + for _, m := range models { + fmt.Printf(" Model: %s (state: %s)\n", m["modelName"], m["state"]) + } + return nil + }) + + run("List models for user (authenticated)", func() error { + resp, body, err := httpGet("http://localhost:3080/v1/serving/v1/models", map[string]string{ + "Authorization": authHeader, + }) + if err != nil { + return err + } + if resp.StatusCode != 200 { + return fmt.Errorf("expected 200, got %d: %s", resp.StatusCode, body) + } + fmt.Printf(" Response: %s\n", truncate(body, 200)) + return nil + }) + + run("Unauthorized request rejected", func() error { + resp, _, err := httpGet("http://localhost:3080/v1/serving/v1/models", nil) + if err != nil { + return err + } + if resp.StatusCode != 401 { + return fmt.Errorf("expected 401, got %d", resp.StatusCode) + } + return nil + }) + + run("Chat completion with LoRA adapter 0", func() error { + return testChatCompletion(modelNames[0], authHeader) + }) + + run("Chat completion with LoRA adapter 1", func() error { + return testChatCompletion(modelNames[1], authHeader) + }) + + run("Chat completion with LoRA adapter 2", func() error { + return testChatCompletion(modelNames[2], authHeader) + }) + + run("Non-existent model returns 404", func() error { + reqBody := map[string]interface{}{ + "model": "non-existent-model", + "messages": []map[string]string{{"role": "user", "content": "Hello"}}, + } + bodyBytes, _ := json.Marshal(reqBody) + resp, _, err := httpPost("http://localhost:3080/v1/serving/v1/chat/completions", bodyBytes, map[string]string{ + "Authorization": authHeader, + "Content-Type": "application/json", + }) + if err != nil { + return err + } + if resp.StatusCode != 404 { + return fmt.Errorf("expected 404, got %d", resp.StatusCode) + } + return nil + }) + + run("Wrong user cannot access model", func() error { + otherKey, _ := crypto.GenerateKey() 
+ otherSig := generateAuthSignature(otherKey) + reqBody := map[string]interface{}{ + "model": modelNames[0], + "messages": []map[string]string{{"role": "user", "content": "Hello"}}, + } + bodyBytes, _ := json.Marshal(reqBody) + resp, _, err := httpPost("http://localhost:3080/v1/serving/v1/chat/completions", bodyBytes, map[string]string{ + "Authorization": "Bearer " + otherSig, + "Content-Type": "application/json", + }) + if err != nil { + return err + } + if resp.StatusCode != 403 { + return fmt.Errorf("expected 403, got %d", resp.StatusCode) + } + return nil + }) + + run("Streaming chat completion", func() error { + reqBody := map[string]interface{}{ + "model": modelNames[0], + "messages": []map[string]string{{"role": "user", "content": "Count 1 to 5"}}, + "stream": true, + "max_tokens": 50, + } + bodyBytes, _ := json.Marshal(reqBody) + resp, err := httpPostRaw("http://localhost:3080/v1/serving/v1/chat/completions", bodyBytes, map[string]string{ + "Authorization": authHeader, + "Content-Type": "application/json", + }) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode != 200 { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("expected 200, got %d: %s", resp.StatusCode, string(body)) + } + chunks := 0 + buf := make([]byte, 4096) + for { + n, readErr := resp.Body.Read(buf) + if n > 0 { + chunks++ + if chunks <= 3 { + fmt.Printf(" Chunk %d: %s\n", chunks, truncate(string(buf[:n]), 100)) + } + } + if readErr != nil { + break + } + } + fmt.Printf(" Total chunks: %d\n", chunks) + if chunks < 2 { + return fmt.Errorf("expected multiple chunks, got %d", chunks) + } + return nil + }) + + run("Chat completion with wait_for_model on active model", func() error { + reqBody := map[string]interface{}{ + "model": modelNames[0], + "messages": []map[string]string{{"role": "user", "content": "What is 2+2?"}}, + "max_tokens": 30, + "wait_for_model": true, + } + bodyBytes, _ := json.Marshal(reqBody) + resp, body, err := 
httpPost("http://localhost:3080/v1/serving/v1/chat/completions", bodyBytes, map[string]string{ + "Authorization": authHeader, + "Content-Type": "application/json", + }) + if err != nil { + return err + } + if resp.StatusCode != 200 { + return fmt.Errorf("expected 200 for active model with wait_for_model, got %d: %s", resp.StatusCode, body) + } + fmt.Printf(" Response (wait_for_model=true, active): %s\n", truncate(body, 120)) + return nil + }) + + run("Health endpoint shows model_load_timeout_sec", func() error { + resp, body, err := httpGet("http://localhost:3080/v1/serving/health", nil) + if err != nil { + return err + } + if resp.StatusCode != 200 { + return fmt.Errorf("expected 200, got %d", resp.StatusCode) + } + var health map[string]interface{} + json.Unmarshal([]byte(body), &health) + if _, ok := health["model_load_timeout_sec"]; !ok { + return fmt.Errorf("model_load_timeout_sec missing from health response: %s", body) + } + fmt.Printf(" model_load_timeout_sec: %v\n", health["model_load_timeout_sec"]) + return nil + }) + + run("Concurrent requests to different adapters", func() error { + type result struct { + idx int + err error + } + ch := make(chan result, 3) + for i := 0; i < 3; i++ { + go func(idx int) { + ch <- result{idx, testChatCompletion(modelNames[idx], authHeader)} + }(i) + } + for i := 0; i < 3; i++ { + r := <-ch + if r.err != nil { + return fmt.Errorf("concurrent request %d failed: %v", r.idx, r.err) + } + } + fmt.Printf(" All 3 concurrent requests succeeded\n") + return nil + }) + + // --- Summary --- + fmt.Println("\n========================================") + fmt.Printf(" Results: %d passed, %d failed\n", passed, failed) + fmt.Println("========================================") + + cleanup(database, taskIDs) + cancel() + mgr.Stop() + + if failed > 0 { + os.Exit(1) + } +} + +func cleanup(database *db.DB, taskIDs []uuid.UUID) { + for _, id := range taskIDs { + database.DeleteTestTask(id) + } + fmt.Println("Cleaned up test tasks from DB") +} + 
+func generateAuthSignature(key *ecdsa.PrivateKey) string { + message := "0g-serving-inference-auth" + hash := accounts.TextHash([]byte(message)) + sig, err := crypto.Sign(hash, key) + if err != nil { + panic(err) + } + if sig[64] < 27 { + sig[64] += 27 + } + return "0x" + hex.EncodeToString(sig) +} + +func testChatCompletion(modelName, authHeader string) error { + reqBody := map[string]interface{}{ + "model": modelName, + "messages": []map[string]string{{"role": "user", "content": "What is 2+2?"}}, + "max_tokens": 30, + } + bodyBytes, _ := json.Marshal(reqBody) + resp, body, err := httpPost("http://localhost:3080/v1/serving/v1/chat/completions", bodyBytes, map[string]string{ + "Authorization": authHeader, + "Content-Type": "application/json", + }) + if err != nil { + return err + } + if resp.StatusCode != 200 { + return fmt.Errorf("expected 200, got %d: %s", resp.StatusCode, body) + } + var result map[string]interface{} + if err := json.Unmarshal([]byte(body), &result); err != nil { + return fmt.Errorf("invalid JSON: %v", err) + } + choices, ok := result["choices"].([]interface{}) + if !ok || len(choices) == 0 { + return fmt.Errorf("no choices: %s", truncate(body, 200)) + } + fmt.Printf(" Model: %s -> %s\n", modelName, truncate(body, 120)) + return nil +} + +func waitForVLLM(ctx context.Context, timeout time.Duration) bool { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + select { + case <-ctx.Done(): + return false + default: + } + resp, err := http.Get("http://localhost:8000/health") + if err == nil { + resp.Body.Close() + if resp.StatusCode == 200 { + return true + } + } + time.Sleep(5 * time.Second) + } + return false +} + +func httpGet(url string, headers map[string]string) (*http.Response, string, error) { + req, _ := http.NewRequest("GET", url, nil) + for k, v := range headers { + req.Header.Set(k, v) + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, "", err + } + defer resp.Body.Close() + body, _ := 
// e2eHTTPClient is shared by the HTTP helpers below so that connections are
// reused and every request, GET included, is bounded by the same timeout.
var e2eHTTPClient = &http.Client{Timeout: 120 * time.Second}

// newE2ERequest builds a request for method and url with the given headers.
// A nil data slice produces a request without a body.
func newE2ERequest(method, url string, data []byte, headers map[string]string) (*http.Request, error) {
	var body io.Reader
	if data != nil {
		body = bytes.NewReader(data)
	}
	req, err := http.NewRequest(method, url, body)
	if err != nil {
		return nil, err
	}
	for k, v := range headers {
		req.Header.Set(k, v)
	}
	return req, nil
}

// doAndRead executes req, reads the whole response body and closes it.
func doAndRead(req *http.Request) (*http.Response, string, error) {
	resp, err := e2eHTTPClient.Do(req)
	if err != nil {
		return nil, "", err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, "", fmt.Errorf("reading response body: %w", err)
	}
	return resp, string(body), nil
}

// httpGet performs a GET on url with the given headers and returns the
// response together with its fully read body.
//
// The returned response's Body is already closed; callers may inspect the
// status and headers only. A malformed URL, a transport failure or a failed
// body read is returned as an error with a nil response.
func httpGet(url string, headers map[string]string) (*http.Response, string, error) {
	req, err := newE2ERequest(http.MethodGet, url, nil, headers)
	if err != nil {
		return nil, "", err
	}
	return doAndRead(req)
}

// httpPost sends data as the body of a POST to url with the given headers
// and returns the response together with its fully read body.
//
// The returned response's Body is already closed. Errors are reported as for
// httpGet.
func httpPost(url string, data []byte, headers map[string]string) (*http.Response, string, error) {
	req, err := newE2ERequest(http.MethodPost, url, data, headers)
	if err != nil {
		return nil, "", err
	}
	return doAndRead(req)
}

// httpPostRaw sends data as the body of a POST to url and returns the
// response with its body unread, for streaming consumers.
//
// The caller must close resp.Body. The client timeout also covers reading
// the body, so a stream is cut off after 120 seconds.
func httpPostRaw(url string, data []byte, headers map[string]string) (*http.Response, error) {
	req, err := newE2ERequest(http.MethodPost, url, data, headers)
	if err != nil {
		return nil, err
	}
	return e2eHTTPClient.Do(req)
}

// truncate shortens s to at most maxLen bytes and appends "..." when
// anything was cut off; strings that already fit are returned unchanged.
//
// The cut is moved back to the nearest rune boundary so that a multi-byte
// UTF-8 character is never split. A negative maxLen is treated as 0.
func truncate(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	if len(s) <= maxLen {
		return s
	}
	// Ranging over a string yields the byte offset of each rune start; keep
	// the last one that does not exceed maxLen.
	cut := 0
	for i := range s {
		if i > maxLen {
			break
		}
		cut = i
	}
	return s[:cut] + "..."
}
storage - // Uses configured dataDir or falls back to os.TempDir() utils.SetDataDir(cfg.Service.DataDir) logger.Infof("Data directory set to: %s", utils.GetDataDir()) @@ -55,13 +58,13 @@ func Main() { defer cancel() imageChan := buildImageIfNeeded(ctx, cfg, logger) - services, err := initializeServices(ctx, cfg, logger) + appServices, err := initializeServices(ctx, cfg, logger) if err != nil { panic(err) } - defer services.contract.Close() + defer appServices.contract.Close() - if err := runApplication(ctx, cfg, services, logger, imageChan); err != nil { + if err := runApplication(ctx, cfg, appServices, logger, imageChan); err != nil { panic(err) } } @@ -121,14 +124,11 @@ func buildImageIfNeeded(ctx context.Context, config *config.Config, logger log.L if buildImage { logger.Debugf("build image %s", imageName) - // Check if transformer files exist in the embedded location embeddedPath := "/fine-tuning/execution/transformer" - // Prepare bridge directory for Docker daemon access if _, err := os.Stat(embeddedPath); err == nil { logger.Infof("Found embedded transformer files at %s", embeddedPath) - // Clean bridge directory contents but don't remove the directory itself (it may be mounted) bridgeDir := constant.FineTuningDockerfilePath if entries, err := os.ReadDir(bridgeDir); err == nil { for _, entry := range entries { @@ -139,13 +139,11 @@ func buildImageIfNeeded(ctx context.Context, config *config.Config, logger log.L } } - // Ensure bridge directory exists if err := os.MkdirAll(bridgeDir, 0755); err != nil { logger.Errorf("failed to create bridge directory: %v", err) return } - // Copy transformer files to bridge directory logger.Infof("Copying transformer files to bridge directory: %s", bridgeDir) if err := copyDirectory(embeddedPath, bridgeDir); err != nil { logger.Errorf("failed to copy transformer files: %v", err) @@ -156,7 +154,6 @@ func buildImageIfNeeded(ctx context.Context, config *config.Config, logger log.L logger.Warnf("Embedded transformer files not 
found at %s, checking bridge directory", embeddedPath) } - // Build image using the bridge directory (constant.FineTuningDockerfilePath now points to /tmp/transformer-bridge) logger.Infof("Building image from: %s", constant.FineTuningDockerfilePath) err := image.ImageBuild(ctx, cli, constant.FineTuningDockerfilePath, imageName, logger) if err != nil { @@ -173,11 +170,11 @@ func buildImageIfNeeded(ctx context.Context, config *config.Config, logger log.L } func initializeServices(ctx context.Context, cfg *config.Config, logger log.Logger) (*ApplicationServices, error) { - db, err := db.NewDB(cfg, logger) + database, err := db.NewDB(cfg, logger) if err != nil { return nil, err } - if err := db.Migrate(); err != nil { + if err := database.Migrate(); err != nil { return nil, err } @@ -199,7 +196,6 @@ func initializeServices(ctx context.Context, cfg *config.Config, logger log.Logg return nil, err } - // Sync TEE quote to initialize Address before creating contract logger.Info("syncing TEE quote during service initialization") if err := teeService.SyncQuote(ctx, os.Getenv("NETWORK") != "hardhat"); err != nil { return nil, err @@ -210,34 +206,34 @@ func initializeServices(ctx context.Context, cfg *config.Config, logger log.Logg return nil, err } - ctrl := ctrl.New(db, cfg, contract, teeService, logger) + ctrlInst := ctrl.New(database, cfg, contract, teeService, logger) - setup, err := services.NewSetup(db, cfg, contract, logger, storageClient, teeService) + setup, err := services.NewSetup(database, cfg, contract, logger, storageClient, teeService) if err != nil { return nil, err } - executor, err := services.NewExecutor(db, cfg, contract, logger) + executor, err := services.NewExecutor(database, cfg, contract, logger) if err != nil { return nil, err } - finalizer, err := services.NewFinalizer(db, cfg, contract, logger, storageClient, teeService) + finalizer, err := services.NewFinalizer(database, cfg, contract, logger, storageClient, teeService) if err != nil { return nil, 
err } - settlement, err := services.NewSettlement(db, contract, cfg, teeService, logger) + settlement, err := services.NewSettlement(database, contract, cfg, teeService, logger) if err != nil { return nil, err } return &ApplicationServices{ - db: db, + db: database, storageClient: storageClient, contract: contract, teeService: teeService, - ctrl: ctrl, + ctrl: ctrlInst, setup: setup, executor: executor, finalizer: finalizer, @@ -245,43 +241,91 @@ func initializeServices(ctx context.Context, cfg *config.Config, logger log.Logg }, nil } -func runApplication(ctx context.Context, cfg *config.Config, services *ApplicationServices, logger log.Logger, imageChan <-chan bool) error { - if err := services.db.MarkInProgressTasksAsFailed(); err != nil { +func runApplication(ctx context.Context, cfg *config.Config, svc *ApplicationServices, logger log.Logger, imageChan <-chan bool) error { + if err := svc.db.MarkInProgressTasksAsFailed(); err != nil { return err } - if err := services.ctrl.SyncServices(ctx); err != nil { + if err := svc.ctrl.SyncServices(ctx); err != nil { return err } - if err := services.finalizer.Start(ctx); err != nil { + if err := svc.finalizer.Start(ctx); err != nil { return err } - if err := services.executor.Start(ctx); err != nil { + if err := svc.executor.Start(ctx); err != nil { return err } - if err := services.setup.Start(ctx); err != nil { + if err := svc.setup.Start(ctx); err != nil { return err } engine := gin.New() - h := handler.New(services.ctrl, logger, cfg.RateLimitRPS, cfg.RateLimitBurst) + + var wg sync.WaitGroup + + if cfg.Monitor.Enable { + monitor.Init(cfg.Service.ServingUrl, ctx) + engine.GET("/metrics", gin.WrapH(promhttp.Handler())) + engine.Use(monitor.TrackMetrics()) + wg.Add(1) + go func() { + defer wg.Done() + startTaskStatePoller(ctx, svc.db, logger) + }() + } + + var servingProxy *serving.Proxy + if cfg.Serving.Enable { + servingMgr := serving.NewManager(svc.db, serving.ServingConfig{ + Enable: cfg.Serving.Enable, + 
BaseModelPath: cfg.Serving.BaseModelPath, + InferenceGPUIDs: cfg.Serving.InferenceGPUIDs, + VLLMPort: cfg.Serving.VLLMPort, + MaxLoraRank: cfg.Serving.MaxLoraRank, + MaxLoraModules: cfg.Serving.MaxLoraModules, + MaxCpuLoras: cfg.Serving.MaxCpuLoras, + LoraModulesDir: cfg.Serving.LoraModulesDir, + OffloadAfterMinutes: cfg.Serving.OffloadAfterMinutes, + EnableColdStorage: cfg.Serving.EnableColdStorage, + ModelLoadTimeoutSeconds: cfg.Serving.ModelLoadTimeoutSeconds, + GpuMemoryUtilization: cfg.Serving.GpuMemoryUtilization, + }, logger, svc.storageClient) + if err := servingMgr.Start(ctx); err != nil { + return err + } + defer func() { + if err := servingMgr.Stop(); err != nil { + logger.Warnf("failed to stop vLLM: %v", err) + } + }() + + registry := serving.NewRegistry(svc.contract, servingMgr, serving.RegistryConfig{ + InputPrice: cfg.Serving.InputPrice, + OutputPrice: cfg.Serving.OutputPrice, + }, logger) + registry.Start(ctx) + + servingProxy = serving.NewProxy(servingMgr, logger) + logger.Info("LoRA serving module initialized") + } + + h := handler.New(svc.ctrl, logger, cfg.RateLimitRPS, cfg.RateLimitBurst, servingProxy) h.Register(engine) if _, ok := <-imageChan; !ok { return errors.New("image build failed") } - if err := services.settlement.Start(ctx); err != nil { + if err := svc.settlement.Start(ctx); err != nil { return err } stop := make(chan os.Signal, 1) signal.Notify(stop, os.Interrupt, syscall.SIGTERM) - // Listen and Serve, config port with PORT=X go func() { logger.Info("starting http server...") if err := engine.Run(); err != nil { @@ -292,40 +336,53 @@ func runApplication(ctx context.Context, cfg *config.Config, services *Applicati <-stop logger.Info("shutting down server...") + wg.Wait() return nil } -// copyDirectory recursively copies a directory from src to dst +func startTaskStatePoller(ctx context.Context, database *db.DB, logger log.Logger) { + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + + for { + select { + case 
<-ctx.Done(): + return + case <-ticker.C: + counts, err := database.CountTasksByState() + if err != nil { + logger.Warnf("failed to count tasks by state for metrics: %v", err) + continue + } + monitor.UpdateTaskStateGauge(counts) + } + } +} + func copyDirectory(src, dst string) error { - // Get file info of source srcInfo, err := os.Stat(src) if err != nil { return err } - // Create destination directory if err := os.MkdirAll(dst, srcInfo.Mode()); err != nil { return err } - // Read source directory entries, err := os.ReadDir(src) if err != nil { return err } - // Copy each entry for _, entry := range entries { srcPath := filepath.Join(src, entry.Name()) dstPath := filepath.Join(dst, entry.Name()) if entry.IsDir() { - // Recursively copy subdirectory if err := copyDirectory(srcPath, dstPath); err != nil { return err } } else { - // Copy file if err := copyFile(srcPath, dstPath); err != nil { return err } @@ -335,33 +392,24 @@ func copyDirectory(src, dst string) error { return nil } -// copyFile copies a single file from src to dst func copyFile(src, dst string) error { - // Open source file srcFile, err := os.Open(src) if err != nil { return err } defer srcFile.Close() - // Get source file info srcInfo, err := srcFile.Stat() if err != nil { return err } - // Create destination file dstFile, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, srcInfo.Mode()) if err != nil { return err } defer dstFile.Close() - // Copy contents _, err = io.Copy(dstFile, srcFile) - if err != nil { - return err - } - - return nil + return err } diff --git a/api/fine-tuning/config/config.go b/api/fine-tuning/config/config.go index a70b3376..3ce7703b 100644 --- a/api/fine-tuning/config/config.go +++ b/api/fine-tuning/config/config.go @@ -21,40 +21,21 @@ type Service struct { ServingUrl string `yaml:"servingUrl"` Quota struct { CpuCount int64 `yaml:"cpuCount"` - Memory int64 `yaml:"memory"` // Memory limit in GB - Storage int64 `yaml:"storage"` // Storage limit in GB + Memory 
int64 `yaml:"memory"` + Storage int64 `yaml:"storage"` GpuType string `yaml:"gpuType"` GpuCount int64 `yaml:"gpuCount"` } `yaml:"quota"` - PricePerToken int64 `yaml:"pricePerToken"` - ProviderStake string `yaml:"providerStake"` // Stake amount for first-time service registration (default: 100000000000000000000 = 100 0G) - CustomizedModels []CustomizedModel `yaml:"customizedModels"` - // SupportedPredefinedModels is a whitelist of predefined model hashes that this provider supports - // If empty, all models in SCRIPT_MAP are allowed (backward compatible) - // If specified, only models in this list will be accepted for fine-tuning tasks - SupportedPredefinedModels []string `yaml:"supportedPredefinedModels"` - // ModelLocalPaths maps model hash to local file path for any model (including predefined models) - // When set, the broker will use the local model instead of downloading from 0G Storage - ModelLocalPaths map[string]string `yaml:"modelLocalPaths"` - // ModelHuggingFaceFallback maps model hash to HuggingFace repo name - // Used as fallback when local model path doesn't exist + PricePerToken int64 `yaml:"pricePerToken"` + ProviderStake string `yaml:"providerStake"` + CustomizedModels []CustomizedModel `yaml:"customizedModels"` + SupportedPredefinedModels []string `yaml:"supportedPredefinedModels"` + ModelLocalPaths map[string]string `yaml:"modelLocalPaths"` ModelHuggingFaceFallback map[string]string `yaml:"modelHuggingFaceFallback"` - // DatasetLocalPaths maps dataset hash to local file path - // When set, the broker will use the local dataset instead of downloading from 0G Storage - // Useful for testing or pre-cached datasets - DatasetLocalPaths map[string]string `yaml:"datasetLocalPaths"` - // SkipStorageUpload when true, skips uploading trained model to 0G Storage - // Users can still download LoRA directly from TEE via /v1/user/:address/task/:id/lora - // Useful for testing or when 0G Storage is not available - SkipStorageUpload bool 
`yaml:"skipStorageUpload"` - // FileRetentionHours specifies how long to keep task files (dataset, output, encrypted LoRA) - // After this period, files will be automatically cleaned up - // Default: 72 hours (3 days) - FileRetentionHours int `yaml:"fileRetentionHours"` - // DataDir specifies the root directory for storing task data (datasets, models, outputs) - // Default: /tmp (uses os.TempDir()) - // Recommended: /dstack/persistent for large models to avoid memory pressure - DataDir string `yaml:"dataDir"` + DatasetLocalPaths map[string]string `yaml:"datasetLocalPaths"` + SkipStorageUpload bool `yaml:"skipStorageUpload"` + FileRetentionHours int `yaml:"fileRetentionHours"` + DataDir string `yaml:"dataDir"` } func (s *Service) GetCustomizedModels() map[ethcommon.Hash]CustomizedModel { @@ -63,7 +44,6 @@ func (s *Service) GetCustomizedModels() map[ethcommon.Hash]CustomizedModel { hash := ethcommon.HexToHash(model.Hash) customizedModels[hash] = model } - return customizedModels } @@ -116,7 +96,7 @@ type CustomizedModel struct { Description string `yaml:"description" json:"description"` Tokenizer string `yaml:"tokenizer" json:"tokenizer"` UsageFile string `yaml:"usageFile" json:"usageFile"` - LocalPath string `yaml:"localPath" json:"localPath"` // Local path to pre-downloaded model, skip 0G Storage download if set + LocalPath string `yaml:"localPath" json:"localPath"` } type Images struct { @@ -126,6 +106,28 @@ type Images struct { OverrideImage bool `yaml:"overrideImage"` } +type MonitorConfig struct { + Enable bool `yaml:"enable"` + EventAddress string `yaml:"eventAddress"` +} + +type ServingConfig struct { + Enable bool `yaml:"enable"` + BaseModelPath string `yaml:"baseModelPath"` + InferenceGPUIDs string `yaml:"inferenceGpuIds"` + VLLMPort int `yaml:"vllmPort"` + MaxLoraRank int `yaml:"maxLoraRank"` + MaxLoraModules int `yaml:"maxLoraModules"` + MaxCpuLoras int `yaml:"maxCpuLoras"` + LoraModulesDir string `yaml:"loraModulesDir"` + InputPrice string 
`yaml:"inputPrice"` + OutputPrice string `yaml:"outputPrice"` + OffloadAfterMinutes int `yaml:"offloadAfterMinutes"` + EnableColdStorage bool `yaml:"enableColdStorage"` + ModelLoadTimeoutSeconds int `yaml:"modelLoadTimeoutSeconds"` + GpuMemoryUtilization float64 `yaml:"gpuMemoryUtilization"` +} + type Config struct { ContractAddress string `yaml:"contractAddress"` Database struct { @@ -138,6 +140,8 @@ type Config struct { Service Service `yaml:"service"` ProviderOption providers.Option `mapstructure:"providerOption" yaml:"providerOption"` Logger config.LoggerConfig `yaml:"logger"` + Monitor MonitorConfig `yaml:"monitor"` + Serving ServingConfig `yaml:"serving"` SettlementCheckIntervalSecs int64 `yaml:"settlementCheckInterval"` BalanceThresholdInEther int64 `yaml:"balanceThresholdInEther"` GasPrice string `yaml:"gasPrice"` @@ -153,8 +157,8 @@ type Config struct { DeliveredTaskAckTimeoutSecs uint `yaml:"deliveredTaskAckTimeoutSecs"` DataRetentionDays uint `yaml:"dataRetentionDays"` MaxTaskQueueSize uint `yaml:"maxTaskQueueSize"` - RateLimitRPS float64 `yaml:"rateLimitRPS"` // Rate limit requests per second - RateLimitBurst int `yaml:"rateLimitBurst"` // Rate limit burst size + RateLimitRPS float64 `yaml:"rateLimitRPS"` + RateLimitBurst int `yaml:"rateLimitBurst"` } type StorageClientConfig struct { @@ -166,12 +170,10 @@ type StorageClientConfig struct { type UploadArgs struct { Tags string `yaml:"tags"` ExpectedReplica uint `yaml:"expectedReplica"` - SkipTx bool `yaml:"skipTx"` FinalityRequired bool `yaml:"finalityRequired"` TaskSize uint `yaml:"taskSize"` Routines int `yaml:"routines"` - FragmentSize int64 `yaml:"fragmentSize"` FullTrusted bool `yaml:"fullTrusted"` FastMode bool `yaml:"fastMode"` @@ -207,7 +209,7 @@ func GetConfig() *Config { Database: struct { FineTune string `yaml:"fineTune"` }{ - FineTune: "root:123456@tcp(mysql:3306)/fineTune?parseTime=true", + FineTune: "root:123456@tcp(0g-fine-tune-broker-db:3306)/fineTune?parseTime=true", }, GasPrice: "", 
Images: Images{ @@ -222,6 +224,23 @@ func GetConfig() *Config { Path: "", RotationCount: 50, }, + Monitor: MonitorConfig{ + Enable: false, + EventAddress: ":3081", + }, + Serving: ServingConfig{ + Enable: false, + VLLMPort: 8000, + MaxLoraRank: 64, + MaxLoraModules: 16, + MaxCpuLoras: 32, + LoraModulesDir: "/tmp/lora-modules", + InputPrice: "10000000", + OutputPrice: "10000000", + OffloadAfterMinutes: 60, + EnableColdStorage: false, + ModelLoadTimeoutSeconds: 300, + }, SettlementCheckIntervalSecs: 60, BalanceThresholdInEther: 1, MaxGasPrice: "1000000000000", @@ -233,11 +252,11 @@ func GetConfig() *Config { MaxFinalizerRetriesPerTask: 10, MaxSettlementRetriesPerTask: 10, SettlementBatchSize: 1, - DeliveredTaskAckTimeoutSecs: 60 * 60 * 48, + DeliveredTaskAckTimeoutSecs: 60 * 60 * 6, DataRetentionDays: 3, MaxTaskQueueSize: 5, - RateLimitRPS: 0.1, // Default: 0.1 requests per second (1 request per 10 seconds) - suitable for file upload/download operations - RateLimitBurst: 2, // Default: burst of 2 requests - allows retry on failure + RateLimitRPS: 0.1, + RateLimitBurst: 2, } if err := loadConfig(instance); err != nil { diff --git a/api/fine-tuning/docs/LORA_SERVING.md b/api/fine-tuning/docs/LORA_SERVING.md new file mode 100644 index 00000000..3a7ac223 --- /dev/null +++ b/api/fine-tuning/docs/LORA_SERVING.md @@ -0,0 +1,364 @@ +# LoRA Inference Serving & Multi-Tier Caching + +This document describes the LoRA inference serving subsystem added to the fine-tuning broker. It enables providers to automatically serve fine-tuned LoRA adapters to end-users through an OpenAI-compatible API, with a multi-tier caching strategy that manages GPU memory, CPU memory, local disk, and 0G decentralized storage. 
+ +## Table of Contents + +- [Overview](#overview) +- [Architecture](#architecture) +- [Multi-Tier Caching](#multi-tier-caching) +- [Components](#components) +- [API Reference](#api-reference) +- [Configuration](#configuration) +- [Authentication](#authentication) +- [Deployment](#deployment) +- [Testing](#testing) + +## Overview + +When a fine-tuning task completes, its LoRA adapter output is automatically discovered and registered for inference serving. The system: + +1. Runs a single **vLLM** process with one base model and multiple LoRA adapters attached +2. Routes user requests to the correct LoRA adapter based on the model name +3. Manages adapter lifecycle across four storage tiers to optimize GPU utilization +4. Enforces per-model access control via EIP-191 signature authentication + +### Key Design Decisions + +- **vLLM as the inference engine**: Chosen for its native multi-LoRA support, high throughput, and OpenAI-compatible API +- **Filesystem resolver for dynamic loading**: LoRA adapters are discovered from a directory at runtime — no server restart required +- **Proxy-based access control**: The broker proxy sits in front of vLLM, verifying model ownership before forwarding requests + +## Architecture + +``` + ┌─────────────────────────────┐ + │ vLLM Process │ + │ │ + User Request │ Base Model (e.g. Qwen2.5) │ + (model: ft-xxx-yyy) │ ┌───────────┐ │ + │ │ ┌────┤ LoRA #1 │ │ + ▼ │ │ └───────────┘ │ + ┌───────────┐ ┌───────────┐ │ │ ┌───────────┐ │ + │ Auth + │───▶│ Serving │───▶│────┼────┤ LoRA #2 │ │ + │ Routing │ │ Proxy │ │ │ └───────────┘ │ + └───────────┘ └───────────┘ │ │ ┌───────────┐ │ + │ └────┤ LoRA #N │ │ + │ └───────────┘ │ + └─────────────────────────────┘ + ▲ + │ symlinks + ┌─────────────────────────────┐ + │ /tmp/lora-modules/ │ + │ ├── ft-qwen-abc123/ │ + │ ├── ft-qwen-def456/ │ + │ └── ft-qwen-ghi789/ │ + └─────────────────────────────┘ +``` + +### Request Flow + +1. 
User sends a chat completion request with `model: "ft-qwen-abc123"` and an `Authorization: Bearer <signature>` header +2. The **auth middleware** recovers the Ethereum address from the EIP-191 signature +3. The **proxy** verifies the user owns the requested model +4. The proxy checks the model's **cache state**: + - `active` → forward to vLLM + - `archived` → trigger async restore from 0G Storage, return HTTP 202 + - `loading` → return HTTP 202 with status message +5. vLLM serves the request using the corresponding LoRA adapter +6. Response is streamed back to the user + +## Multi-Tier Caching + +The system implements a four-tier storage hierarchy for LoRA adapters: + +``` +┌──────────────────────────────────────────────────────────┐ +│ Storage Tiers │ +│ │ +│ ┌────────┐ ┌────────┐ ┌────────┐ ┌──────────┐ │ +│ │ GPU │◄──▶│ CPU │◄──▶│ Disk │◄──▶│0G Storage│ │ +│ │ (Hot) │ │ (Warm) │ │ (Cool) │ │ (Cold) │ │ +│ └────────┘ └────────┘ └────────┘ └──────────┘ │ +│ │ +│ ◄── vLLM native LRU ──▶ ◄── Broker managed ──────▶ │ +└──────────────────────────────────────────────────────────┘ +``` + +| Tier | Managed By | Capacity Control | Latency | +|------|-----------|-----------------|---------| +| GPU (hot) | vLLM LRU cache | `--max-loras` | ~0ms (already loaded) | +| CPU (warm) | vLLM LRU cache | `--max-cpu-loras` | ~10ms (CPU→GPU transfer) | +| Disk (cool) | Filesystem resolver | Disk space | ~50-75ms (disk read + load) | +| 0G Storage (cold) | Broker offload loop | Unlimited | Seconds-minutes (network download) | + +### Tier Transitions + +**GPU ↔ CPU ↔ Disk** (handled by vLLM natively): +- vLLM maintains an LRU cache of LoRA adapters on GPU +- When GPU slots are full, least-recently-used adapters are moved to CPU memory +- When CPU slots are full, adapters are evicted entirely and must be reloaded from disk +- The filesystem resolver plugin automatically loads adapters from the `lora-modules` directory + +**Disk → 0G Storage** (handled by the broker's offload loop): +- A background
goroutine checks every minute for adapters that have not been accessed within `offloadAfterMinutes` +- Inactive adapters are archived: the local symlink and LoRA files are deleted +- The adapter's metadata (model name, task ID, storage hash) is retained in memory +- Only adapters with a valid `OutputRootHash` (uploaded to 0G Storage during fine-tuning finalization) can be offloaded + +**0G Storage → Disk** (handled by the broker's restore logic): +- When a user requests an archived adapter, `RestoreModel` is triggered +- The adapter is downloaded from 0G Storage using the stored root hash +- A new symlink is created in the `lora-modules` directory +- The model state transitions: `archived` → `loading` → `active` +- During the download, subsequent requests receive HTTP 202 with a `"status": "loading"` response + +### Benchmark Results + +Tested on NVIDIA H20 (98GB) with Qwen2.5-0.5B-Instruct base model: + +| Scenario | TTFT | Total Time | +|----------|------|------------| +| Hot (GPU cached) | ~15ms | ~50ms | +| Warm (CPU→GPU reload) | ~25ms | ~60ms | +| Cold (disk→GPU load) | ~65ms | ~100ms | +| Concurrent 4 LoRAs | ~35ms avg | ~80ms avg | + +These results confirm that GPU↔CPU↔Disk transitions are effectively instantaneous from the user's perspective. The only significant latency comes from the 0G Storage cold tier, which involves network downloads. 
+ +## Components + +### Manager (`internal/serving/manager.go`) + +The central controller that manages: + +- **vLLM process lifecycle**: Starts vLLM with multi-LoRA arguments and environment variables, monitors health +- **Auto-discovery**: Polls the database every 30 seconds for completed fine-tuning tasks and registers their LoRA adapters +- **Model registration**: Creates symlinks from the LoRA output directory to the `lora-modules` directory +- **Pruning**: When the number of served models exceeds `MaxLoraModules`, the oldest registered model is removed + +### Model Cache (`internal/serving/model_cache.go`) + +Handles the Disk ↔ 0G Storage tier: + +- **`ModelState`** enum: `active` (on disk), `archived` (cold storage only), `loading` (downloading) +- **`offloadLoop`**: Background goroutine that periodically checks `LastAccessedAt` timestamps +- **`RestoreModel`**: Triggers async download from 0G Storage via the `StorageDownloader` interface +- **`RecordAccess`**: Updates the last-accessed timestamp on every inference request + +### Proxy (`internal/serving/proxy.go`) + +OpenAI-compatible HTTP endpoints: + +- Signature-based authentication +- Model ownership enforcement +- Cache state checks with appropriate HTTP status codes +- Request proxying to vLLM with streaming support + +### Registry (`internal/serving/registry.go`) + +Placeholder for future on-chain inference service registration. Currently tracks serving state locally. + +## API Reference + +All serving endpoints are mounted under `/v1/serving/`. + +### POST `/v1/serving/v1/chat/completions` + +OpenAI-compatible chat completion endpoint. 
+ +**Headers:** +- `Authorization: Bearer <signature>` (required) +- `Content-Type: application/json` + +**Request Body:** +```json +{ + "model": "ft-qwen2-5-0-5B-I-a1b2c3d4e5f6", + "messages": [ + {"role": "user", "content": "Hello, how are you?"} + ], + "stream": true, + "max_tokens": 100 +} +``` + +**Responses:** +- `200`: Inference result (same format as OpenAI API) +- `202`: Model is loading from cold storage + ```json + { + "error": "Model is being loaded from cold storage. Please retry in a few moments.", + "status": "loading", + "model": "ft-qwen2-5-0-5B-I-a1b2c3d4e5f6" + } + ``` +- `401`: Invalid or missing signature +- `403`: User does not own the requested model +- `404`: Model not found +- `503`: vLLM not ready + +### GET `/v1/serving/v1/models` + +List models owned by the authenticated user. + +**Headers:** +- `Authorization: Bearer <signature>` (required) + +**Response:** +```json +{ + "object": "list", + "data": [ + { + "id": "ft-qwen2-5-0-5B-I-a1b2c3d4e5f6", + "object": "model", + "owned_by": "0x1234...abcd", + "task_id": "a1b2c3d4-...", + "state": "active" + } + ] +} +``` + +### GET `/v1/serving/models` + +List all served models (no authentication required, for monitoring). + +### GET `/v1/serving/health` + +Health check with cache statistics. 
+ +**Response:** +```json +{ + "vllm_ready": true, + "total_models": 10, + "active_on_disk": 7, + "archived_cold": 2, + "loading": 1, + "cold_storage": true, + "offload_minutes": 60 +} +``` + +## Configuration + +Add the following to your `config.yaml`: + +```yaml +serving: + enable: true + baseModelPath: "/models/Qwen2.5-0.5B-Instruct" # Path to base model + inferenceGpuIds: "0" # GPU device IDs for vLLM + vllmPort: 8000 # vLLM server port + maxLoraRank: 64 # Maximum LoRA rank supported + maxLoraModules: 16 # Max LoRA adapters on GPU simultaneously + maxCpuLoras: 32 # Max LoRA adapters cached in CPU memory + loraModulesDir: "/tmp/lora-modules" # Directory for LoRA symlinks + inputPrice: "10000000" # Price per input token (for contract registration) + outputPrice: "10000000" # Price per output token + offloadAfterMinutes: 60 # Minutes of inactivity before cold-storage offload + enableColdStorage: false # Enable 0G Storage offload/restore +``` + +### Configuration Notes + +- **`maxLoraModules`** controls `--max-loras` in vLLM: how many LoRA adapters can be loaded on GPU simultaneously. Higher values use more GPU memory. +- **`maxCpuLoras`** controls `--max-cpu-loras`: how many adapters are cached in CPU memory as a warm tier. Set this higher than `maxLoraModules` for a larger warm cache. +- **`enableColdStorage`** should only be enabled when 0G Storage is properly configured and the fine-tuning service uploads model outputs (i.e., `OutputRootHash` is populated). +- **`offloadAfterMinutes`** sets the inactivity threshold. Models without access within this period are offloaded to save disk space. Set to `0` to disable time-based offloading. + +## Authentication + +The serving proxy uses **EIP-191 personal message signatures** for authentication: + +1. The user signs the message `"0g-serving-inference-auth"` with their Ethereum private key +2. The signature is sent in the `Authorization: Bearer <signature>` header +3. 
The proxy recovers the signer's address and matches it against the model's `UserAddress` +4. Only the user who created the fine-tuning task can access their LoRA model + +This ensures that fine-tuned models remain private to their creators without requiring API keys or session tokens. + +## Deployment + +### Prerequisites + +- NVIDIA GPU with sufficient VRAM for the base model + LoRA adapters +- vLLM installed (`pip install vllm`) +- `lora_filesystem_resolver` plugin (included with vLLM by default since v0.6.x) + +### Environment Variables Set by the Manager + +The following are automatically set when starting the vLLM subprocess: + +```bash +VLLM_ALLOW_RUNTIME_LORA_UPDATING=True # Enable dynamic LoRA loading +VLLM_PLUGINS=lora_filesystem_resolver # Use filesystem-based LoRA discovery +VLLM_LORA_RESOLVER_CACHE_DIR=/tmp/lora-modules # Directory to scan for adapters +CUDA_VISIBLE_DEVICES=0 # GPU isolation (if configured) +``` + +### GPU Memory Estimation + +As a rough guide for the base model + LoRA overhead: + +| Base Model | Base VRAM | Per-LoRA (r=8) | Per-LoRA (r=64) | +|-----------|----------|----------------|-----------------| +| 0.5B params | ~1 GB | ~5 MB | ~40 MB | +| 7B params | ~14 GB | ~50 MB | ~400 MB | +| 13B params | ~26 GB | ~100 MB | ~800 MB | + +With a 98GB H20 GPU and a 7B base model, you can comfortably serve 16+ LoRA adapters on GPU simultaneously. 
+ +## Testing + +### Unit Tests + +The serving module includes 17 unit tests. Run them with: + +```bash +cd api/fine-tuning && go test ./internal/serving/ -v +``` + +Test coverage includes: +- Model registration, state tracking, and access recording +- Offload logic (stale models, skip models without storage hash, skip recently accessed) +- Restore logic (async download, idempotent for active/loading states) +- Unregister, prune, ownership checks, deterministic model naming +- Full offload → restore end-to-end cycle + +### End-to-End Validation + +Verified on NVIDIA H20 (98GB) with Qwen2.5-0.5B-Instruct: + +1. **Base model inference** — vLLM serves the base model correctly +2. **Dynamic LoRA loading** — 3 LoRA adapters loaded via filesystem resolver without restart +3. **Multi-LoRA routing** — Concurrent requests to different adapters each route correctly +4. **GPU/CPU caching** — Adapter files deleted from disk, vLLM still serves from memory cache +5. **Disk restore** — Adapter files restored to disk, vLLM resumes serving on next request + +### Manual Testing + +To test locally without the full broker: + +```bash +# 1. Start vLLM with multi-LoRA +export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True +export VLLM_PLUGINS=lora_filesystem_resolver +export VLLM_LORA_RESOLVER_CACHE_DIR=/path/to/lora-modules + +vllm serve /path/to/base-model \ + --enable-lora \ + --max-lora-rank 64 \ + --max-loras 16 \ + --max-cpu-loras 32 + +# 2. Place LoRA adapters in the modules directory +ln -s /path/to/task-output/output_model /path/to/lora-modules/my-adapter + +# 3. 
Send inference request +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "my-adapter", "messages": [{"role": "user", "content": "Hello"}]}' +``` diff --git a/api/fine-tuning/e2e-serving-test b/api/fine-tuning/e2e-serving-test new file mode 100755 index 00000000..d84260c7 Binary files /dev/null and b/api/fine-tuning/e2e-serving-test differ diff --git a/api/fine-tuning/integration/prod/docker-compose-2d03306.yml b/api/fine-tuning/integration/prod/docker-compose-2d03306.yml new file mode 100644 index 00000000..3b99795d --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-2d03306.yml @@ -0,0 +1,69 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo "bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCnNlcnZpY2U6CiAgc2VydmluZ1VybDogIiNEU1RBQ0tfQVBQX1VSTCMiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCiAgbW9kZWxMb2NhbFBhdGhzOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IC9kc3RhY2svcGVyc2lzdGVudC9tb2RlbHMvUXdlbjMtMzJCCiAgbW9kZWxIdWdnaW5nRmFjZUZhbGxiYWNrOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IFF3ZW4vUXdlbjMtMzJCCiAgc2tpcFN0b3JhZ2VVcGxvYWQ6IHRydWUKICBmaWxlUmV0ZW50aW9uSG91cnM6IDcyCmRhdGFiYXNlOgogIGZpbmVUdW5lOiByb290OjEyMzQ1NkB0Y3AoMGctZmluZS10dW5lLWJyb2tlci1kYjozMzA2KS9maW5lVHVuZT9wYXJzZVRpbWU9dHJ1ZQpsb2dnZXI6CiAgbGV2ZWw6IGRlYnVnCiAgcGF0aDogL3RtcC9maW5lLXR1bmluZy5sb2cKZmluYWxpemVyV29ya2VyQ291bnQ6IDEKdHJhaW5pbmdXb3JrZXJDb3VudDogMQpzZXR1cFdvcmtlckNvdW50OiAxCm1heFNldHVwUmV0cmllc1BlclRhc2s6IDMKbWF4RXhlY3V0b3JSZXRyaWVzUGVyVG
FzazogMwptYXhGaW5hbGl6ZXJSZXRyaWVzUGVyVGFzazogMTAKbWF4U2V0dGxlbWVudFJldHJpZXNQZXJUYXNrOiAxMApzZXR0bGVtZW50QmF0Y2hTaXplOiAxCmRlbGl2ZXJlZFRhc2tBY2tUaW1lb3V0U2VjczogMjE2MDAKZGF0YVJldGVudGlvbkRheXM6IDMKbWF4VGFza1F1ZXVlU2l6ZTogNQo=" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:2d03306-amd64 + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + - DSTACK_SIMULATOR_ENDPOINT=/var/run/dstack.sock + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-fix-cmd.yml b/api/fine-tuning/integration/prod/docker-compose-fix-cmd.yml new file mode 100644 index 00000000..c02e37e4 --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-fix-cmd.yml @@ -0,0 +1,69 @@ +services: + config-init: + image: alpine:latest + 
volumes: + - config-vol:/config + command: sh -c 'echo "bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCnNlcnZpY2U6CiAgc2VydmluZ1VybDogIiNEU1RBQ0tfQVBQX1VSTCMiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCiAgbW9kZWxMb2NhbFBhdGhzOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IC9kc3RhY2svcGVyc2lzdGVudC9tb2RlbHMvUXdlbjMtMzJCCiAgbW9kZWxIdWdnaW5nRmFjZUZhbGxiYWNrOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IFF3ZW4vUXdlbjMtMzJCCiAgc2tpcFN0b3JhZ2VVcGxvYWQ6IHRydWUKICBmaWxlUmV0ZW50aW9uSG91cnM6IDcyCnN0b3JhZ2VDbGllbnQ6CiAgaW5kZXhlclN0YW5kYXJkOiAiaHR0cHM6Ly9pbmRleGVyLXN0b3JhZ2UtdGVzdG5ldC1zdGFuZGFyZC4wZy5haSIKICBpbmRleGVyVHVyYm86ICJodHRwczovL2luZGV4ZXItc3RvcmFnZS10ZXN0bmV0LXR1cmJvLjBnLmFpIgogIHVwbG9hZEFyZ3M6CiAgICBmaW5hbGl0eVJlcXVpcmVkOiBmYWxzZQogICAgdGFnczogIjB4IgogICAgZXhwZWN0ZWRSZXBsaWNhOiAxCiAgICBza2lwVHg6IGZhbHNlCiAgICBmcmFnbWVudFNpemU6IDQyOTQ5NjcyOTYKZGF0YWJhc2U6CiAgZmluZVR1bmU6IHJvb3Q6MTIzNDU2QHRjcCgwZy1maW5lLXR1bmUtYnJva2VyLWRiOjMzMDYpL2ZpbmVUdW5lP3BhcnNlVGltZT10cnVlCmxvZ2dlcjoKICBsZXZlbDogZGVidWcKICBwYXRoOiAvdG1wL2ZpbmUtdHVuaW5nLmxvZwpmaW5hbGl6ZXJXb3JrZXJDb3VudDogMQp0cmFpbmluZ1dvcmtlckNvdW50OiAxCnNldHVwV29ya2VyQ291bnQ6IDEKbWF4U2V0dXBSZXRyaWVzUGVyVGFzazogMwptYXhFeGVjdXRvclJldHJpZXNQZXJUYXNrOiAzCm1heEZpbmFsaXplclJldHJpZXNQZXJUYXNrOiAxMAptYXhTZXR0bGVtZW50UmV0cmllc1BlclRhc2s6IDEwCnNldHRsZW1lbnRCYXRjaFNpemU6IDEKZGVsaXZlcmVkVGFza0Fja1RpbWVvdXRTZWNzOiAyMTYwMApkYXRhUmV0ZW50aW9uRGF5czogMwptYXhUYXNrUXVldWVTaXplOiA1Cg==" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + 
environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:efa7350-full-amd64 + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + - NETWORK=hardhat + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-fix-getservice.yml b/api/fine-tuning/integration/prod/docker-compose-fix-getservice.yml new file mode 100644 index 00000000..e2f19a80 --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-fix-getservice.yml @@ -0,0 +1,69 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCiAgICBnYXNFc3RpbWF0aW9uQnVmZmVyOiAxMDAwMApzZXJ2aWNlOgogIHNlcnZpbmdVcmw6ICIjRFNUQUNLX0FQUF9VUkwjIgogIHByaWNlUGVyVG9rZW46IDEKICBxdW90YToKICAgIGNwdUNvdW50OiA4CiAgICBtZW1vcnk6IDE4NwogICAgc3RvcmFnZTogOTAwCiAgICBncHVUeXBlOiBIMjAwCiAgICBncHVDb3VudDogMQpzdG9yYWdlQ2xpZW50OgogIGluZGV4ZXJTdGFuZGFyZDogImh0dHBzOi8vaW5kZXhlci1zdG9yYWdlLXRlc3RuZXQtc3RhbmRhcmQuMGcuYWkiCiAgaW5kZXhlclR1cmJvOiAiaHR0cHM6Ly9pbmRleGVyLXN0b3JhZ2UtdGVzdG5ldC10dXJiby4wZy5haSIKICB1cGxvYWRBcmdzOgogICAgZmluYWxpdHlSZXF1aXJlZDogZmFsc2UKICAgIHRhZ3M6ICIweCIKICAgIGV4cGVjdGVkUmVwbGljYTogMQogICAgc2tpcFR4OiBmYWxzZQogICAgZnJhZ21lbnRTaXplOiA0Mjk0OTY3Mjk2CmRhdGFiYXNlOgogIGZpbmVUdW5lOiByb290OjEyMzQ1NkB0Y3AoMGctZmluZS10dW5lLWJyb2tlci1kYjozMzA2KS9maW5lVHVuZT9wYXJzZVRpbWU9dHJ1ZQpsb2dnZXI6CiAgbGV2ZWw6IGRlYnVnCiAgcGF0aDogL3RtcC9maW5lLXR1bmluZy5sb2cKdHJhaW5pbmdXb3JrZXJDb3VudDogMQpzZXR1cFdvcmtlckNvdW50OiAxCmZpbmFsaXplcldvcmtlckNvdW50OiAxCm1heFNldHVwUmV0cmllc1BlclRhc2s6IDMKbWF4RXhlY3V0b3JSZXRyaWVzUGVyVGFzazogMwptYXhGaW5hbGl6ZXJSZXRyaWVzUGVyVGFzazogMTAKbWF4U2V0dGxlbWVudFJldHJpZXNQZXJUYXNrOiAxMApzZXR0bGVtZW50QmF0Y2hTaXplOiAxCmRlbGl2ZXJlZFRhc2tBY2tUaW1lb3V0U2VjczogMjE2MDAKZGF0YVJldGVudGlvbkRheXM6IDMKbWF4VGFza1F1ZXVlU2l6ZTogNQo=" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: 
ghcr.io/0gfoundation/0g-serving-broker:efa7350-fix-getservice + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + - NETWORK=hardhat + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-fix2.yml b/api/fine-tuning/integration/prod/docker-compose-fix2.yml new file mode 100644 index 00000000..ccc20b26 --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-fix2.yml @@ -0,0 +1,68 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCnNlcnZpY2U6CiAgc2VydmluZ1VybDogIiNEU1RBQ0tfQVBQX1VSTCMiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCiAgbW9kZWxMb2NhbFBhdGhzOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IC9kc3RhY2svcGVyc2lzdGVudC9tb2RlbHMvUXdlbjMtMzJCCiAgbW9kZWxIdWdnaW5nRmFjZUZhbGxiYWNrOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IFF3ZW4vUXdlbjMtMzJCCiAgc2tpcFN0b3JhZ2VVcGxvYWQ6IHRydWUKICBmaWxlUmV0ZW50aW9uSG91cnM6IDcyCmRhdGFiYXNlOgogIGZpbmVUdW5lOiByb290OjEyMzQ1NkB0Y3AoMGctZmluZS10dW5lLWJyb2tlci1kYjozMzA2KS9maW5lVHVuZT9wYXJzZVRpbWU9dHJ1ZQpsb2dnZXI6CiAgbGV2ZWw6IGRlYnVnCiAgcGF0aDogL3RtcC9maW5lLXR1bmluZy5sb2cKZmluYWxpemVyV29ya2VyQ291bnQ6IDEKdHJhaW5pbmdXb3JrZXJDb3VudDogMQpzZXR1cFdvcmtlckNvdW50OiAxCm1heFNldHVwUmV0cmllc1BlclRhc2s6IDMKbWF4RXhlY3V0b3JSZXRyaWVzUGVyVGFzazogMwptYXhGaW5hbGl6ZXJSZXRyaWVzUGVyVGFzazogMTAKbWF4U2V0dGxlbWVudFJldHJpZXNQZXJUYXNrOiAxMApzZXR0bGVtZW50QmF0Y2hTaXplOiAxCmRlbGl2ZXJlZFRhc2tBY2tUaW1lb3V0U2VjczogMjE2MDAKZGF0YVJldGVudGlvbkRheXM6IDMKbWF4VGFza1F1ZXVlU2l6ZTogNQo=" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: 
ghcr.io/0gfoundation/0g-serving-broker:2d03306-fix2-amd64 + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-fix3.yml b/api/fine-tuning/integration/prod/docker-compose-fix3.yml new file mode 100644 index 00000000..79615aee --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-fix3.yml @@ -0,0 +1,68 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCnNlcnZpY2U6CiAgc2VydmluZ1VybDogIiNEU1RBQ0tfQVBQX1VSTCMiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCiAgbW9kZWxMb2NhbFBhdGhzOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IC9kc3RhY2svcGVyc2lzdGVudC9tb2RlbHMvUXdlbjMtMzJCCiAgbW9kZWxIdWdnaW5nRmFjZUZhbGxiYWNrOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IFF3ZW4vUXdlbjMtMzJCCiAgc2tpcFN0b3JhZ2VVcGxvYWQ6IHRydWUKICBmaWxlUmV0ZW50aW9uSG91cnM6IDcyCmRhdGFiYXNlOgogIGZpbmVUdW5lOiByb290OjEyMzQ1NkB0Y3AoMGctZmluZS10dW5lLWJyb2tlci1kYjozMzA2KS9maW5lVHVuZT9wYXJzZVRpbWU9dHJ1ZQpsb2dnZXI6CiAgbGV2ZWw6IGRlYnVnCiAgcGF0aDogL3RtcC9maW5lLXR1bmluZy5sb2cKZmluYWxpemVyV29ya2VyQ291bnQ6IDEKdHJhaW5pbmdXb3JrZXJDb3VudDogMQpzZXR1cFdvcmtlckNvdW50OiAxCm1heFNldHVwUmV0cmllc1BlclRhc2s6IDMKbWF4RXhlY3V0b3JSZXRyaWVzUGVyVGFzazogMwptYXhGaW5hbGl6ZXJSZXRyaWVzUGVyVGFzazogMTAKbWF4U2V0dGxlbWVudFJldHJpZXNQZXJUYXNrOiAxMApzZXR0bGVtZW50QmF0Y2hTaXplOiAxCmRlbGl2ZXJlZFRhc2tBY2tUaW1lb3V0U2VjczogMjE2MDAKZGF0YVJldGVudGlvbkRheXM6IDMKbWF4VGFza1F1ZXVlU2l6ZTogNQo=" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: 
ghcr.io/0gfoundation/0g-serving-broker:2d03306-fix3-amd64 + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-fixed-image.yml b/api/fine-tuning/integration/prod/docker-compose-fixed-image.yml new file mode 100644 index 00000000..4dac762e --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-fixed-image.yml @@ -0,0 +1,68 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCnNlcnZpY2U6CiAgc2VydmluZ1VybDogIiNEU1RBQ0tfQVBQX1VSTCMiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCiAgbW9kZWxMb2NhbFBhdGhzOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IC9kc3RhY2svcGVyc2lzdGVudC9tb2RlbHMvUXdlbjMtMzJCCiAgbW9kZWxIdWdnaW5nRmFjZUZhbGxiYWNrOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IFF3ZW4vUXdlbjMtMzJCCiAgc2tpcFN0b3JhZ2VVcGxvYWQ6IHRydWUKICBmaWxlUmV0ZW50aW9uSG91cnM6IDcyCmRhdGFiYXNlOgogIGZpbmVUdW5lOiByb290OjEyMzQ1NkB0Y3AoMGctZmluZS10dW5lLWJyb2tlci1kYjozMzA2KS9maW5lVHVuZT9wYXJzZVRpbWU9dHJ1ZQpsb2dnZXI6CiAgbGV2ZWw6IGRlYnVnCiAgcGF0aDogL3RtcC9maW5lLXR1bmluZy5sb2cKZmluYWxpemVyV29ya2VyQ291bnQ6IDEKdHJhaW5pbmdXb3JrZXJDb3VudDogMQpzZXR1cFdvcmtlckNvdW50OiAxCm1heFNldHVwUmV0cmllc1BlclRhc2s6IDMKbWF4RXhlY3V0b3JSZXRyaWVzUGVyVGFzazogMwptYXhGaW5hbGl6ZXJSZXRyaWVzUGVyVGFzazogMTAKbWF4U2V0dGxlbWVudFJldHJpZXNQZXJUYXNrOiAxMApzZXR0bGVtZW50QmF0Y2hTaXplOiAxCmRlbGl2ZXJlZFRhc2tBY2tUaW1lb3V0U2VjczogMjE2MDAKZGF0YVJldGVudGlvbkRheXM6IDMKbWF4VGFza1F1ZXVlU2l6ZTogNQo=" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:2d03306-fix-amd64 
+ privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-fixed.yml b/api/fine-tuning/integration/prod/docker-compose-fixed.yml new file mode 100644 index 00000000..fc1fdc53 --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-fixed.yml @@ -0,0 +1,70 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCnNlcnZpY2U6CiAgc2VydmluZ1VybDogIiNEU1RBQ0tfQVBQX1VSTCMiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCiAgY3VzdG9taXplZE1vZGVsczoKICAgIC0gbmFtZTogUXdlbjMtMzJCCiAgICAgIGhhc2g6ICIweDJlNmY5NjIwYzM1YmRjYjJiNzUzY2M3YWEzNGU3ODA3N2E4ZWQxMzNlMzZmYTM2MDA4ZmQ2YmRmZDI5YWYzYTUiCiAgICAgIGltYWdlOiBnaGNyLmlvLzBnZm91bmRhdGlvbi9maW5lLXR1bmluZy10ZXN0OnYxCiAgICAgIGRhdGFUeXBlOiB0ZXh0CiAgICAgIHRyYWluaW5nU2NyaXB0OiAvYXBwL3RyYWluX2xvcmEucHkKICAgICAgZGVzY3JpcHRpb246IFF3ZW4zLTMyQiBMb1JBIGZpbmUtdHVuaW5nCiAgICAgIHRva2VuaXplcjogUXdlbi9Rd2VuMy0zMkIKICAgICAgbG9jYWxQYXRoOiAvZHN0YWNrL3BlcnNpc3RlbnQvbW9kZWxzL1F3ZW4zLTMyQgogIG1vZGVsSHVnZ2luZ0ZhY2VGYWxsYmFjazoKICAgICIweDJlNmY5NjIwYzM1YmRjYjJiNzUzY2M3YWEzNGU3ODA3N2E4ZWQxMzNlMzZmYTM2MDA4ZmQ2YmRmZDI5YWYzYTUiOiBRd2VuL1F3ZW4zLTMyQgogIHNraXBTdG9yYWdlVXBsb2FkOiB0cnVlCiAgZmlsZVJldGVudGlvbkhvdXJzOiA3MgpkYXRhYmFzZToKICBmaW5lVHVuZTogcm9vdDoxMjM0NTZAdGNwKDBnLWZpbmUtdHVuZS1icm9rZXItZGI6MzMwNikvZmluZVR1bmU/cGFyc2VUaW1lPXRydWUKbG9nZ2VyOgogIGxldmVsOiBkZWJ1ZwogIHBhdGg6IC90bXAvZmluZS10dW5pbmcubG9nCmZpbmFsaXplcldvcmtlckNvdW50OiAxCmV4ZWN1dG9yV29ya2VyQ291bnQ6IDEKc2V0dXBXb3JrZXJDb3VudDogMQptYXhTZXR1cFJldHJpZXNQZXJUYXNrOiAzCm1heEV4ZWN1dG9yUmV0cmllc1BlclRhc2s6IDMKbWF4RmluYWxpemVyUmV0cmllc1BlclRhc2s6IDEwCm1heFNldHRsZW1lbnRSZXRyaWVzUGVyVGFzazogMTAKc2V0dGxlbWVudEJhdGNoU2l6ZTogMQpkZWxpdmVyZWRUYXNrQWNrVGltZW91dFNlY3M6IDIxNjAwCmRhdGFSZXRlbnRpb25EYXlzOiAzCm1heFRhc2tRdWV1ZVNpemU6IDUK" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - target: 3306 + published: 33060 + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + 
volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:latest + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + ports: + - target: 3080 + published: "#PORT#" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-full-config.yml b/api/fine-tuning/integration/prod/docker-compose-full-config.yml new file mode 100644 index 00000000..1ec85a3f --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-full-config.yml @@ -0,0 +1,68 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCnNlcnZpY2U6CiAgc2VydmluZ1VybDogIiNEU1RBQ0tfQVBQX1VSTCMiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCiAgbW9kZWxMb2NhbFBhdGhzOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IC9kc3RhY2svcGVyc2lzdGVudC9tb2RlbHMvUXdlbjMtMzJCCiAgbW9kZWxIdWdnaW5nRmFjZUZhbGxiYWNrOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IFF3ZW4vUXdlbjMtMzJCCiAgc2tpcFN0b3JhZ2VVcGxvYWQ6IHRydWUKICBmaWxlUmV0ZW50aW9uSG91cnM6IDcyCnN0b3JhZ2VDbGllbnQ6CiAgaW5kZXhlclN0YW5kYXJkOiAiaHR0cHM6Ly9pbmRleGVyLXN0b3JhZ2UtdGVzdG5ldC1zdGFuZGFyZC4wZy5haSIKICBpbmRleGVyVHVyYm86ICJodHRwczovL2luZGV4ZXItc3RvcmFnZS10ZXN0bmV0LXR1cmJvLjBnLmFpIgogIHVwbG9hZEFyZ3M6CiAgICBmaW5hbGl0eVJlcXVpcmVkOiBmYWxzZQogICAgdGFnczogIjB4IgogICAgZXhwZWN0ZWRSZXBsaWNhOiAxCiAgICBza2lwVHg6IGZhbHNlCiAgICBmcmFnbWVudFNpemU6IDQyOTQ5NjcyOTYKZGF0YWJhc2U6CiAgZmluZVR1bmU6IHJvb3Q6MTIzNDU2QHRjcCgwZy1maW5lLXR1bmUtYnJva2VyLWRiOjMzMDYpL2ZpbmVUdW5lP3BhcnNlVGltZT10cnVlCmxvZ2dlcjoKICBsZXZlbDogZGVidWcKICBwYXRoOiAvdG1wL2ZpbmUtdHVuaW5nLmxvZwpmaW5hbGl6ZXJXb3JrZXJDb3VudDogMQp0cmFpbmluZ1dvcmtlckNvdW50OiAxCnNldHVwV29ya2VyQ291bnQ6IDEKbWF4U2V0dXBSZXRyaWVzUGVyVGFzazogMwptYXhFeGVjdXRvclJldHJpZXNQZXJUYXNrOiAzCm1heEZpbmFsaXplclJldHJpZXNQZXJUYXNrOiAxMAptYXhTZXR0bGVtZW50UmV0cmllc1BlclRhc2s6IDEwCnNldHRsZW1lbnRCYXRjaFNpemU6IDEKZGVsaXZlcmVkVGFza0Fja1RpbWVvdXRTZWNzOiAyMTYwMApkYXRhUmV0ZW50aW9uRGF5czogMwptYXhUYXNrUXVldWVTaXplOiA1Cg==" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - 
MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:2d03306-fix3-amd64 + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-full.yml b/api/fine-tuning/integration/prod/docker-compose-full.yml new file mode 100644 index 00000000..567cedb6 --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-full.yml @@ -0,0 +1,69 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCnNlcnZpY2U6CiAgc2VydmluZ1VybDogIiNEU1RBQ0tfQVBQX1VSTCMiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCiAgbW9kZWxMb2NhbFBhdGhzOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IC9kc3RhY2svcGVyc2lzdGVudC9tb2RlbHMvUXdlbjMtMzJCCiAgbW9kZWxIdWdnaW5nRmFjZUZhbGxiYWNrOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IFF3ZW4vUXdlbjMtMzJCCiAgc2tpcFN0b3JhZ2VVcGxvYWQ6IHRydWUKICBmaWxlUmV0ZW50aW9uSG91cnM6IDcyCnN0b3JhZ2VDbGllbnQ6CiAgaW5kZXhlclN0YW5kYXJkOiAiaHR0cHM6Ly9pbmRleGVyLXN0b3JhZ2UtdGVzdG5ldC1zdGFuZGFyZC4wZy5haSIKICBpbmRleGVyVHVyYm86ICJodHRwczovL2luZGV4ZXItc3RvcmFnZS10ZXN0bmV0LXR1cmJvLjBnLmFpIgogIHVwbG9hZEFyZ3M6CiAgICBmaW5hbGl0eVJlcXVpcmVkOiBmYWxzZQogICAgdGFnczogIjB4IgogICAgZXhwZWN0ZWRSZXBsaWNhOiAxCiAgICBza2lwVHg6IGZhbHNlCiAgICBmcmFnbWVudFNpemU6IDQyOTQ5NjcyOTYKZGF0YWJhc2U6CiAgZmluZVR1bmU6IHJvb3Q6MTIzNDU2QHRjcCgwZy1maW5lLXR1bmUtYnJva2VyLWRiOjMzMDYpL2ZpbmVUdW5lP3BhcnNlVGltZT10cnVlCmxvZ2dlcjoKICBsZXZlbDogZGVidWcKICBwYXRoOiAvdG1wL2ZpbmUtdHVuaW5nLmxvZwpmaW5hbGl6ZXJXb3JrZXJDb3VudDogMQp0cmFpbmluZ1dvcmtlckNvdW50OiAxCnNldHVwV29ya2VyQ291bnQ6IDEKbWF4U2V0dXBSZXRyaWVzUGVyVGFzazogMwptYXhFeGVjdXRvclJldHJpZXNQZXJUYXNrOiAzCm1heEZpbmFsaXplclJldHJpZXNQZXJUYXNrOiAxMAptYXhTZXR0bGVtZW50UmV0cmllc1BlclRhc2s6IDEwCnNldHRsZW1lbnRCYXRjaFNpemU6IDEKZGVsaXZlcmVkVGFza0Fja1RpbWVvdXRTZWNzOiAyMTYwMApkYXRhUmV0ZW50aW9uRGF5czogMwptYXhUYXNrUXVldWVTaXplOiA1Cg==" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - 
MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:efa7350-full-amd64 + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + - NETWORK=hardhat + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["/usr/bin/broker", "0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-latest.yml b/api/fine-tuning/integration/prod/docker-compose-latest.yml new file mode 100644 index 00000000..3b11553e --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-latest.yml @@ -0,0 +1,69 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCnNlcnZpY2U6CiAgc2VydmluZ1VybDogIiNEU1RBQ0tfQVBQX1VSTCMiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCiAgbW9kZWxMb2NhbFBhdGhzOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IC9kc3RhY2svcGVyc2lzdGVudC9tb2RlbHMvUXdlbjMtMzJCCiAgbW9kZWxIdWdnaW5nRmFjZUZhbGxiYWNrOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IFF3ZW4vUXdlbjMtMzJCCiAgc2tpcFN0b3JhZ2VVcGxvYWQ6IHRydWUKICBmaWxlUmV0ZW50aW9uSG91cnM6IDcyCnN0b3JhZ2VDbGllbnQ6CiAgaW5kZXhlclN0YW5kYXJkOiAiaHR0cHM6Ly9pbmRleGVyLXN0b3JhZ2UtdGVzdG5ldC1zdGFuZGFyZC4wZy5haSIKICBpbmRleGVyVHVyYm86ICJodHRwczovL2luZGV4ZXItc3RvcmFnZS10ZXN0bmV0LXR1cmJvLjBnLmFpIgogIHVwbG9hZEFyZ3M6CiAgICBmaW5hbGl0eVJlcXVpcmVkOiBmYWxzZQogICAgdGFnczogIjB4IgogICAgZXhwZWN0ZWRSZXBsaWNhOiAxCiAgICBza2lwVHg6IGZhbHNlCiAgICBmcmFnbWVudFNpemU6IDQyOTQ5NjcyOTYKZGF0YWJhc2U6CiAgZmluZVR1bmU6IHJvb3Q6MTIzNDU2QHRjcCgwZy1maW5lLXR1bmUtYnJva2VyLWRiOjMzMDYpL2ZpbmVUdW5lP3BhcnNlVGltZT10cnVlCmxvZ2dlcjoKICBsZXZlbDogZGVidWcKICBwYXRoOiAvdG1wL2ZpbmUtdHVuaW5nLmxvZwpmaW5hbGl6ZXJXb3JrZXJDb3VudDogMQp0cmFpbmluZ1dvcmtlckNvdW50OiAxCnNldHVwV29ya2VyQ291bnQ6IDEKbWF4U2V0dXBSZXRyaWVzUGVyVGFzazogMwptYXhFeGVjdXRvclJldHJpZXNQZXJUYXNrOiAzCm1heEZpbmFsaXplclJldHJpZXNQZXJUYXNrOiAxMAptYXhTZXR0bGVtZW50UmV0cmllc1BlclRhc2s6IDEwCnNldHRsZW1lbnRCYXRjaFNpemU6IDEKZGVsaXZlcmVkVGFza0Fja1RpbWVvdXRTZWNzOiAyMTYwMApkYXRhUmV0ZW50aW9uRGF5czogMwptYXhUYXNrUXVldWVTaXplOiA1Cg==" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - 
MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:efa7350-amd64 + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + - NETWORK=hardhat + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-no-gpu.yml b/api/fine-tuning/integration/prod/docker-compose-no-gpu.yml new file mode 100644 index 00000000..a0fc7911 --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-no-gpu.yml @@ -0,0 +1,69 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCnNlcnZpY2U6CiAgc2VydmluZ1VybDogIiNEU1RBQ0tfQVBQX1VSTCMiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCiAgbW9kZWxMb2NhbFBhdGhzOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IC9kc3RhY2svcGVyc2lzdGVudC9tb2RlbHMvUXdlbjMtMzJCCiAgbW9kZWxIdWdnaW5nRmFjZUZhbGxiYWNrOgogICAgIjB4MmU2Zjk2MjBjMzViZGNiMmI3NTNjYzdhYTM0ZTc4MDc3YThlZDEzM2UzNmZhMzYwMDhmZDZiZGZkMjlhZjNhNSI6IFF3ZW4vUXdlbjMtMzJCCiAgc2tpcFN0b3JhZ2VVcGxvYWQ6IHRydWUKICBmaWxlUmV0ZW50aW9uSG91cnM6IDcyCnN0b3JhZ2VDbGllbnQ6CiAgaW5kZXhlclN0YW5kYXJkOiAiaHR0cHM6Ly9pbmRleGVyLXN0b3JhZ2UtdGVzdG5ldC1zdGFuZGFyZC4wZy5haSIKICBpbmRleGVyVHVyYm86ICJodHRwczovL2luZGV4ZXItc3RvcmFnZS10ZXN0bmV0LXR1cmJvLjBnLmFpIgogIHVwbG9hZEFyZ3M6CiAgICBmaW5hbGl0eVJlcXVpcmVkOiBmYWxzZQogICAgdGFnczogIjB4IgogICAgZXhwZWN0ZWRSZXBsaWNhOiAxCiAgICBza2lwVHg6IGZhbHNlCiAgICBmcmFnbWVudFNpemU6IDQyOTQ5NjcyOTYKZGF0YWJhc2U6CiAgZmluZVR1bmU6IHJvb3Q6MTIzNDU2QHRjcCgwZy1maW5lLXR1bmUtYnJva2VyLWRiOjMzMDYpL2ZpbmVUdW5lP3BhcnNlVGltZT10cnVlCmxvZ2dlcjoKICBsZXZlbDogZGVidWcKICBwYXRoOiAvdG1wL2ZpbmUtdHVuaW5nLmxvZwpmaW5hbGl6ZXJXb3JrZXJDb3VudDogMQp0cmFpbmluZ1dvcmtlckNvdW50OiAxCnNldHVwV29ya2VyQ291bnQ6IDEKbWF4U2V0dXBSZXRyaWVzUGVyVGFzazogMwptYXhFeGVjdXRvclJldHJpZXNQZXJUYXNrOiAzCm1heEZpbmFsaXplclJldHJpZXNQZXJUYXNrOiAxMAptYXhTZXR0bGVtZW50UmV0cmllc1BlclRhc2s6IDEwCnNldHRsZW1lbnRCYXRjaFNpemU6IDEKZGVsaXZlcmVkVGFza0Fja1RpbWVvdXRTZWNzOiAyMTYwMApkYXRhUmV0ZW50aW9uRGF5czogMwptYXhUYXNrUXVldWVTaXplOiA1Cg==" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - 
MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:2d03306-fix3-amd64 + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + - NETWORK=hardhat + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-noport.yml b/api/fine-tuning/integration/prod/docker-compose-noport.yml new file mode 100644 index 00000000..2b7754f2 --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-noport.yml @@ -0,0 +1,68 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCnNlcnZpY2U6CiAgc2VydmluZ1VybDogIiNEU1RBQ0tfQVBQX1VSTCMiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCiAgY3VzdG9taXplZE1vZGVsczoKICAgIC0gbmFtZTogUXdlbjMtMzJCCiAgICAgIGhhc2g6ICIweDJlNmY5NjIwYzM1YmRjYjJiNzUzY2M3YWEzNGU3ODA3N2E4ZWQxMzNlMzZmYTM2MDA4ZmQ2YmRmZDI5YWYzYTUiCiAgICAgIGltYWdlOiBnaGNyLmlvLzBnZm91bmRhdGlvbi9maW5lLXR1bmluZy10ZXN0OnYxCiAgICAgIGRhdGFUeXBlOiB0ZXh0CiAgICAgIHRyYWluaW5nU2NyaXB0OiAvYXBwL3RyYWluX2xvcmEucHkKICAgICAgZGVzY3JpcHRpb246IFF3ZW4zLTMyQiBMb1JBIGZpbmUtdHVuaW5nCiAgICAgIHRva2VuaXplcjogUXdlbi9Rd2VuMy0zMkIKICAgICAgbG9jYWxQYXRoOiAvZHN0YWNrL3BlcnNpc3RlbnQvbW9kZWxzL1F3ZW4zLTMyQgogIG1vZGVsSHVnZ2luZ0ZhY2VGYWxsYmFjazoKICAgICIweDJlNmY5NjIwYzM1YmRjYjJiNzUzY2M3YWEzNGU3ODA3N2E4ZWQxMzNlMzZmYTM2MDA4ZmQ2YmRmZDI5YWYzYTUiOiBRd2VuL1F3ZW4zLTMyQgogIHNraXBTdG9yYWdlVXBsb2FkOiB0cnVlCiAgZmlsZVJldGVudGlvbkhvdXJzOiA3MgpkYXRhYmFzZToKICBmaW5lVHVuZTogcm9vdDoxMjM0NTZAdGNwKDBnLWZpbmUtdHVuZS1icm9rZXItZGI6MzMwNikvZmluZVR1bmU/cGFyc2VUaW1lPXRydWUKbG9nZ2VyOgogIGxldmVsOiBkZWJ1ZwogIHBhdGg6IC90bXAvZmluZS10dW5pbmcubG9nCmZpbmFsaXplcldvcmtlckNvdW50OiAxCmV4ZWN1dG9yV29ya2VyQ291bnQ6IDEKc2V0dXBXb3JrZXJDb3VudDogMQptYXhTZXR1cFJldHJpZXNQZXJUYXNrOiAzCm1heEV4ZWN1dG9yUmV0cmllc1BlclRhc2s6IDMKbWF4RmluYWxpemVyUmV0cmllc1BlclRhc2s6IDEwCm1heFNldHRsZW1lbnRSZXRyaWVzUGVyVGFzazogMTAKc2V0dGxlbWVudEJhdGNoU2l6ZTogMQpkZWxpdmVyZWRUYXNrQWNrVGltZW91dFNlY3M6IDIxNjAwCmRhdGFSZXRlbnRpb25EYXlzOiAzCm1heFRhc2tRdWV1ZVNpemU6IDUK" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - 
mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:latest + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-real-url.yml b/api/fine-tuning/integration/prod/docker-compose-real-url.yml new file mode 100644 index 00000000..97a2f8e2 --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-real-url.yml @@ -0,0 +1,69 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCiAgICBnYXNFc3RpbWF0aW9uQnVmZmVyOiAxMDAwMApzZXJ2aWNlOgogIHNlcnZpbmdVcmw6ICJodHRwczovLzY0NDQ0ZjY4M2Y3OTdkNWZlNDRiNjhkYzkyNDlhMjEwYTU2YzU2YTMtODAuZHN0YWNrLXBoYS11c2UyLnBoYWxhLm5ldHdvcmsiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCnN0b3JhZ2VDbGllbnQ6CiAgaW5kZXhlclN0YW5kYXJkOiAiaHR0cHM6Ly9pbmRleGVyLXN0b3JhZ2UtdGVzdG5ldC1zdGFuZGFyZC4wZy5haSIKICBpbmRleGVyVHVyYm86ICJodHRwczovL2luZGV4ZXItc3RvcmFnZS10ZXN0bmV0LXR1cmJvLjBnLmFpIgogIHVwbG9hZEFyZ3M6CiAgICBmaW5hbGl0eVJlcXVpcmVkOiBmYWxzZQogICAgdGFnczogIjB4IgogICAgZXhwZWN0ZWRSZXBsaWNhOiAxCiAgICBza2lwVHg6IGZhbHNlCiAgICBmcmFnbWVudFNpemU6IDQyOTQ5NjcyOTYKZGF0YWJhc2U6CiAgZmluZVR1bmU6IHJvb3Q6MTIzNDU2QHRjcCgwZy1maW5lLXR1bmUtYnJva2VyLWRiOjMzMDYpL2ZpbmVUdW5lP3BhcnNlVGltZT10cnVlCmxvZ2dlcjoKICBsZXZlbDogZGVidWcKICBwYXRoOiAvdG1wL2ZpbmUtdHVuaW5nLmxvZwp0cmFpbmluZ1dvcmtlckNvdW50OiAxCnNldHVwV29ya2VyQ291bnQ6IDEKZmluYWxpemVyV29ya2VyQ291bnQ6IDEKbWF4U2V0dXBSZXRyaWVzUGVyVGFzazogMwptYXhFeGVjdXRvclJldHJpZXNQZXJUYXNrOiAzCm1heEZpbmFsaXplclJldHJpZXNQZXJUYXNrOiAxMAptYXhTZXR0bGVtZW50UmV0cmllc1BlclRhc2s6IDEwCnNldHRsZW1lbnRCYXRjaFNpemU6IDEKZGVsaXZlcmVkVGFza0Fja1RpbWVvdXRTZWNzOiAyMTYwMApkYXRhUmV0ZW50aW9uRGF5czogMwptYXhUYXNrUXVldWVTaXplOiA1Cg==" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: 
service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:efa7350-fix-getservice + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + - NETWORK=hardhat + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-simple.yml b/api/fine-tuning/integration/prod/docker-compose-simple.yml new file mode 100644 index 00000000..ea9e93f2 --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-simple.yml @@ -0,0 +1,68 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCnNlcnZpY2U6CiAgc2VydmluZ1VybDogIiNEU1RBQ0tfQVBQX1VSTCMiCiAgcHJpY2VQZXJUb2tlbjogMQogIHF1b3RhOgogICAgY3B1Q291bnQ6IDgKICAgIG1lbW9yeTogMTg3CiAgICBzdG9yYWdlOiA5MDAKICAgIGdwdVR5cGU6IEgyMDAKICAgIGdwdUNvdW50OiAxCiAgY3VzdG9taXplZE1vZGVsczoKICAgIC0gbmFtZTogUXdlbjMtMzJCCiAgICAgIGhhc2g6ICIweDJlNmY5NjIwYzM1YmRjYjJiNzUzY2M3YWEzNGU3ODA3N2E4ZWQxMzNlMzZmYTM2MDA4ZmQ2YmRmZDI5YWYzYTUiCiAgICAgIGltYWdlOiBnaGNyLmlvLzBnZm91bmRhdGlvbi9maW5lLXR1bmluZy10ZXN0OnYxCiAgICAgIGRhdGFUeXBlOiB0ZXh0CiAgICAgIHRyYWluaW5nU2NyaXB0OiAvYXBwL3RyYWluX2xvcmEucHkKICAgICAgZGVzY3JpcHRpb246IFF3ZW4zLTMyQiBMb1JBIGZpbmUtdHVuaW5nCiAgICAgIHRva2VuaXplcjogUXdlbi9Rd2VuMy0zMkIKICAgICAgbG9jYWxQYXRoOiAvZHN0YWNrL3BlcnNpc3RlbnQvbW9kZWxzL1F3ZW4zLTMyQgogIG1vZGVsSHVnZ2luZ0ZhY2VGYWxsYmFjazoKICAgICIweDJlNmY5NjIwYzM1YmRjYjJiNzUzY2M3YWEzNGU3ODA3N2E4ZWQxMzNlMzZmYTM2MDA4ZmQ2YmRmZDI5YWYzYTUiOiBRd2VuL1F3ZW4zLTMyQgogIHNraXBTdG9yYWdlVXBsb2FkOiB0cnVlCiAgZmlsZVJldGVudGlvbkhvdXJzOiA3MgpkYXRhYmFzZToKICBmaW5lVHVuZTogcm9vdDoxMjM0NTZAdGNwKDBnLWZpbmUtdHVuZS1icm9rZXItZGI6MzMwNikvZmluZVR1bmU/cGFyc2VUaW1lPXRydWUKbG9nZ2VyOgogIGxldmVsOiBkZWJ1ZwogIHBhdGg6IC90bXAvZmluZS10dW5pbmcubG9nCmZpbmFsaXplcldvcmtlckNvdW50OiAxCmV4ZWN1dG9yV29ya2VyQ291bnQ6IDEKc2V0dXBXb3JrZXJDb3VudDogMQptYXhTZXR1cFJldHJpZXNQZXJUYXNrOiAzCm1heEV4ZWN1dG9yUmV0cmllc1BlclRhc2s6IDMKbWF4RmluYWxpemVyUmV0cmllc1BlclRhc2s6IDEwCm1heFNldHRsZW1lbnRSZXRyaWVzUGVyVGFzazogMTAKc2V0dGxlbWVudEJhdGNoU2l6ZTogMQpkZWxpdmVyZWRUYXNrQWNrVGltZW91dFNlY3M6IDIxNjAwCmRhdGFSZXRlbnRpb25EYXlzOiAzCm1heFRhc2tRdWV1ZVNpemU6IDUK" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - 
mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:latest + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + ports: + - "#PORT#:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-simplified.yml b/api/fine-tuning/integration/prod/docker-compose-simplified.yml new file mode 100644 index 00000000..eeb601d7 --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-simplified.yml @@ -0,0 +1,69 @@ +services: + config-init: + image: alpine:latest + volumes: + - config-vol:/config + command: sh -c 'echo 
"bmV0d29ya3M6CiAgZXRoZXJldW0wZzoKICAgIHVybDogImh0dHBzOi8vZXZtcnBjLXRlc3RuZXQuMGcuYWkiCiAgICBjaGFpbklEOiAxNjYwMgogICAgcHJpdmF0ZUtleXM6CiAgICAgIC0gIjJDRTQ2QTFDM0I1RTVGNzNFREI3OUMzNDNCQjg2OUVGOTQ3MDlFNkJFQkIzQjI5QjcwOTM3RDdCNUM2RDU3NTEiCiAgICB0cmFuc2FjdGlvbkxpbWl0OiAxMDAwMDAwCiAgICBnYXNFc3RpbWF0aW9uQnVmZmVyOiAxMDAwMApzZXJ2aWNlOgogIHNlcnZpbmdVcmw6ICIjRFNUQUNLX0FQUF9VUkwjIgogIHByaWNlUGVyVG9rZW46IDEKICBxdW90YToKICAgIGNwdUNvdW50OiA4CiAgICBtZW1vcnk6IDE4NwogICAgc3RvcmFnZTogOTAwCiAgICBncHVUeXBlOiBIMjAwCiAgICBncHVDb3VudDogMQpzdG9yYWdlQ2xpZW50OgogIGluZGV4ZXJTdGFuZGFyZDogImh0dHBzOi8vaW5kZXhlci1zdG9yYWdlLXRlc3RuZXQtc3RhbmRhcmQuMGcuYWkiCiAgaW5kZXhlclR1cmJvOiAiaHR0cHM6Ly9pbmRleGVyLXN0b3JhZ2UtdGVzdG5ldC10dXJiby4wZy5haSIKICB1cGxvYWRBcmdzOgogICAgZmluYWxpdHlSZXF1aXJlZDogZmFsc2UKICAgIHRhZ3M6ICIweCIKICAgIGV4cGVjdGVkUmVwbGljYTogMQogICAgc2tpcFR4OiBmYWxzZQogICAgZnJhZ21lbnRTaXplOiA0Mjk0OTY3Mjk2CmRhdGFiYXNlOgogIGZpbmVUdW5lOiByb290OjEyMzQ1NkB0Y3AoMGctZmluZS10dW5lLWJyb2tlci1kYjozMzA2KS9maW5lVHVuZT9wYXJzZVRpbWU9dHJ1ZQpsb2dnZXI6CiAgbGV2ZWw6IGRlYnVnCiAgcGF0aDogL3RtcC9maW5lLXR1bmluZy5sb2cKdHJhaW5pbmdXb3JrZXJDb3VudDogMQpzZXR1cFdvcmtlckNvdW50OiAxCmZpbmFsaXplcldvcmtlckNvdW50OiAxCm1heFNldHVwUmV0cmllc1BlclRhc2s6IDMKbWF4RXhlY3V0b3JSZXRyaWVzUGVyVGFzazogMwptYXhGaW5hbGl6ZXJSZXRyaWVzUGVyVGFzazogMTAKbWF4U2V0dGxlbWVudFJldHJpZXNQZXJUYXNrOiAxMApzZXR0bGVtZW50QmF0Y2hTaXplOiAxCmRlbGl2ZXJlZFRhc2tBY2tUaW1lb3V0U2VjczogMjE2MDAKZGF0YVJldGVudGlvbkRheXM6IDMKbWF4VGFza1F1ZXVlU2l6ZTogNQo=" | base64 -d > /config/config.yaml' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: 
ghcr.io/0gfoundation/0g-serving-broker:efa7350-full-amd64 + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + - NETWORK=hardhat + ports: + - "80:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-vol:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-vol: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-skip-storage.yml b/api/fine-tuning/integration/prod/docker-compose-skip-storage.yml new file mode 100644 index 00000000..f87f9f07 --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-skip-storage.yml @@ -0,0 +1,135 @@ +services: + config-init: + image: alpine:latest + container_name: config-init + volumes: + - /dstack/user_config:/config + command: > + sh -c 'cat > /config/config.yaml << CFGEOF + networks: + ethereum0g: + url: "https://evmrpc-testnet.0g.ai" + chainID: 16602 + privateKeys: + - "2CE46A1C3B5E5F73EDB79C343BB869EF94709E6BEBB3B29B70937D7B5C6D5751" + transactionLimit: 1000000 + service: + servingUrl: "#DSTACK_APP_URL#" + pricePerToken: 1 + quota: + cpuCount: 8 + memory: 187 + storage: 900 + gpuType: "H200" + gpuCount: 1 + customizedModels: + - name: "Qwen3-32B" + hash: "0x2e6f9620c35bdcb2b753cc7aa34e78077a8ed133e36fa36008fd6bdfd29af3a5" + image: "ghcr.io/0gfoundation/fine-tuning-test:v1" + dataType: "text" + trainingScript: "/app/train_lora.py" + description: "Qwen3-32B LoRA 
fine-tuning" + tokenizer: "Qwen/Qwen3-32B" + localPath: "/dstack/persistent/models/Qwen3-32B" + modelHuggingFaceFallback: + "0x2e6f9620c35bdcb2b753cc7aa34e78077a8ed133e36fa36008fd6bdfd29af3a5": "Qwen/Qwen3-32B" + skipStorageUpload: true + fileRetentionHours: 72 + database: + fineTune: "root:123456@tcp(0g-fine-tune-broker-db:3306)/fineTune?parseTime=true" + logger: + level: "debug" + path: "/tmp/fine-tuning.log" + finalizerWorkerCount: 1 + executorWorkerCount: 1 + setupWorkerCount: 1 + maxSetupRetriesPerTask: 3 + maxExecutorRetriesPerTask: 3 + maxFinalizerRetriesPerTask: 10 + maxSettlementRetriesPerTask: 10 + settlementBatchSize: 1 + deliveredTaskAckTimeoutSecs: 21600 + dataRetentionDays: 3 + maxTaskQueueSize: 5 + CFGEOF + echo "Config created with skipStorageUpload: true"' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + model-downloader: + image: python:3.11-slim + container_name: model-downloader + volumes: + - /dstack/persistent:/dstack/persistent + command: > + bash -c 'if [ -d "/dstack/persistent/models/Qwen3-32B" ]; then + echo "Model already exists"; echo "Model ready!"; + else + pip install -q huggingface_hub && + python3 -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id=\"Qwen/Qwen3-32B\", local_dir=\"/dstack/persistent/models/Qwen3-32B\", local_dir_use_symlinks=False, resume_download=True)" && + echo "Model ready!"; + fi' + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:latest + privileged: true + restart: always + container_name: 0g-fine-tune-broker + 
environment: + - PORT=3080 + - CONFIG_FILE=/etc/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + ports: + - "#PORT#:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - /dstack/user_config/config.yaml:/etc/config.yaml + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: 0g-fine-tuning-server + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + model-downloader: + condition: service_completed_successfully + +volumes: + mysql-data: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-updated.yml b/api/fine-tuning/integration/prod/docker-compose-updated.yml new file mode 100644 index 00000000..844939d1 --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-updated.yml @@ -0,0 +1,167 @@ +services: + config-init: + image: alpine:latest + container_name: config-init + volumes: + - /dstack/user_config:/config + entrypoint: ["/bin/sh", "-c"] + command: + - | + cat > /config/config.yaml << 'EOF' + networks: + ethereum0g: + url: "https://evmrpc-testnet.0g.ai" + chainID: 16602 + privateKeys: + - "2CE46A1C3B5E5F73EDB79C343BB869EF94709E6BEBB3B29B70937D7B5C6D5751" + transactionLimit: 1000000 + + service: + servingUrl: "#DSTACK_APP_URL#" + pricePerToken: 1 + quota: + cpuCount: 8 + memory: 187 + storage: 900 + gpuType: "H200" + gpuCount: 1 + + customizedModels: + - name: "Qwen2.5-0.5B-Instruct" + hash: "0xb4f76a886b8655c92bb021922d60b5e4d9271a5c9da98b6cb10937a06c2c75a7" + image: "ghcr.io/0gfoundation/fine-tuning-test:v1" + dataType: "text" + trainingScript: "/app/train_lora.py" + description: "Qwen2.5-0.5B LoRA fine-tuning" + tokenizer: "Qwen/Qwen2.5-0.5B-Instruct" + - name: "Qwen3-32B" + hash: 
"0x2e6f9620c35bdcb2b753cc7aa34e78077a8ed133e36fa36008fd6bdfd29af3a5" + image: "ghcr.io/0gfoundation/fine-tuning-test:v1" + dataType: "text" + trainingScript: "/app/train_lora.py" + description: "Qwen3-32B LoRA fine-tuning" + tokenizer: "Qwen/Qwen3-32B" + localPath: "/dstack/persistent/models/Qwen3-32B" + + modelHuggingFaceFallback: + "0xb4f76a886b8655c92bb021922d60b5e4d9271a5c9da98b6cb10937a06c2c75a7": "Qwen/Qwen2.5-0.5B-Instruct" + "0x2e6f9620c35bdcb2b753cc7aa34e78077a8ed133e36fa36008fd6bdfd29af3a5": "Qwen/Qwen3-32B" + + # IMPORTANT: Skip 0G Storage upload - users download directly from TEE + skipStorageUpload: true + fileRetentionHours: 72 + + database: + fineTune: "root:123456@tcp(0g-fine-tune-broker-db:3306)/fineTune?parseTime=true" + + logger: + level: "debug" + path: "/tmp/fine-tuning.log" + + finalizerWorkerCount: 1 + executorWorkerCount: 1 + setupWorkerCount: 1 + maxSetupRetriesPerTask: 3 + maxExecutorRetriesPerTask: 3 + maxFinalizerRetriesPerTask: 10 + maxSettlementRetriesPerTask: 10 + settlementBatchSize: 1 + deliveredTaskAckTimeoutSecs: 21600 + dataRetentionDays: 3 + maxTaskQueueSize: 5 + EOF + echo "Config created with skipStorageUpload: true" + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + model-downloader: + image: python:3.11-slim + container_name: model-downloader + volumes: + - /dstack/persistent:/dstack/persistent + entrypoint: ["/bin/bash", "-c"] + command: + - | + if [ -d "/dstack/persistent/models/Qwen3-32B" ]; then + echo "Model already exists at /dstack/persistent/models/Qwen3-32B" + echo "Model ready!" + else + echo "Installing huggingface_hub..." 
+ pip install -q huggingface_hub + echo "Downloading Qwen/Qwen3-32B to /dstack/persistent/models/Qwen3-32B..." + python3 -c " +from huggingface_hub import snapshot_download +snapshot_download( + repo_id='Qwen/Qwen3-32B', + local_dir='/dstack/persistent/models/Qwen3-32B', + local_dir_use_symlinks=False, + resume_download=True +) +print('Downloaded to: /dstack/persistent/models/Qwen3-32B') +" + echo "Model ready!" + fi + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:latest + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + ports: + - "#PORT#:3080" + volumes: + - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - /dstack/user_config/config.yaml:/etc/config.yaml + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: 0g-fine-tuning-server + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + model-downloader: + condition: service_completed_successfully + +volumes: + mysql-data: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/integration/prod/docker-compose-with-skip.yml b/api/fine-tuning/integration/prod/docker-compose-with-skip.yml new file mode 100644 index 00000000..98a9abda --- /dev/null +++ b/api/fine-tuning/integration/prod/docker-compose-with-skip.yml @@ -0,0 +1,117 @@ +services: + config-init: + image: alpine:latest + container_name: config-init + volumes: + - config-data:/config + command: | + sh -c 'cat > /config/config.yaml << ENDCONFIG + networks: + ethereum0g: + url: "https://evmrpc-testnet.0g.ai" + chainID: 16602 + privateKeys: + 
- "2CE46A1C3B5E5F73EDB79C343BB869EF94709E6BEBB3B29B70937D7B5C6D5751" + transactionLimit: 1000000 + service: + servingUrl: "#DSTACK_APP_URL#" + pricePerToken: 1 + quota: + cpuCount: 8 + memory: 187 + storage: 900 + gpuType: H200 + gpuCount: 1 + customizedModels: + - name: Qwen3-32B + hash: "0x2e6f9620c35bdcb2b753cc7aa34e78077a8ed133e36fa36008fd6bdfd29af3a5" + image: ghcr.io/0gfoundation/fine-tuning-test:v1 + dataType: text + trainingScript: /app/train_lora.py + description: Qwen3-32B LoRA fine-tuning + tokenizer: Qwen/Qwen3-32B + localPath: /dstack/persistent/models/Qwen3-32B + modelHuggingFaceFallback: + "0x2e6f9620c35bdcb2b753cc7aa34e78077a8ed133e36fa36008fd6bdfd29af3a5": Qwen/Qwen3-32B + skipStorageUpload: true + fileRetentionHours: 72 + database: + fineTune: root:123456@tcp(0g-fine-tune-broker-db:3306)/fineTune?parseTime=true + logger: + level: debug + path: /tmp/fine-tuning.log + finalizerWorkerCount: 1 + executorWorkerCount: 1 + setupWorkerCount: 1 + maxSetupRetriesPerTask: 3 + maxExecutorRetriesPerTask: 3 + maxFinalizerRetriesPerTask: 10 + maxSettlementRetriesPerTask: 10 + settlementBatchSize: 1 + deliveredTaskAckTimeoutSecs: 21600 + dataRetentionDays: 3 + maxTaskQueueSize: 5 + ENDCONFIG + echo Config created with skipStorageUpload: true' + + mysql: + image: mysql:8.0 + container_name: 0g-fine-tune-broker-db + ports: + - "33060:3306" + environment: + - MYSQL_ROOT_PASSWORD=123456 + - MYSQL_DATABASE=fineTune + volumes: + - mysql-data:/var/lib/mysql + restart: always + healthcheck: + test: ["CMD-SHELL", "mysqladmin ping -h localhost"] + interval: 10s + retries: 5 + networks: + - localhost + depends_on: + config-init: + condition: service_completed_successfully + + 0g-fine-tune-broker: + image: ghcr.io/0gfoundation/0g-serving-broker:latest + privileged: true + restart: always + container_name: 0g-fine-tune-broker + environment: + - PORT=3080 + - CONFIG_FILE=/etc/config.yaml + - NVIDIA_VISIBLE_DEVICES=all + - TOKEN=PHALA@2025 + ports: + - "#PORT#:3080" + volumes: 
+ - /var/run/dstack.sock:/var/run/dstack.sock + - /var/run/docker.sock:/var/run/docker.sock + - config-data:/etc/config:ro + - /tmp:/tmp + - /tmp/0g-transformer-bridge:/opt/transformer-bridge + - ./models:/fine-tuning/execution/models + - /dstack/persistent:/dstack/persistent + command: ["0g-fine-tuning-server", "--config", "/etc/config/config.yaml"] + networks: + - localhost + logging: + driver: "json-file" + options: + max-size: "100m" + max-file: "5" + depends_on: + mysql: + condition: service_healthy + +volumes: + mysql-data: + config-data: + +networks: + localhost: + name: localhost + external: false diff --git a/api/fine-tuning/internal/db/service.go b/api/fine-tuning/internal/db/service.go index 2b216b54..bc2fe84b 100644 --- a/api/fine-tuning/internal/db/service.go +++ b/api/fine-tuning/internal/db/service.go @@ -257,3 +257,54 @@ func (d *DB) UpdateUserPublicKey(task *Task, key string) error { UserPublicKey: key, }) } + +func (d *DB) GetFinishedTasksForServing() ([]Task, error) { + var tasks []Task + servableStates := []string{ + ProgressStateFinished.String(), + ProgressStateUserAcknowledged.String(), + } + ret := d.db.Where("progress IN ?", servableStates).Order("created_at DESC").Limit(1000).Find(&tasks) + if ret.Error != nil { + return nil, ret.Error + } + return tasks, nil +} + +func (d *DB) CountTasksByState() (map[string]float64, error) { + type stateCount struct { + Progress string + Count float64 + } + + var results []stateCount + ret := d.db.Model(&Task{}). + Select("progress, COUNT(*) as count"). + Group("progress"). 
+ Find(&results) + if ret.Error != nil { + return nil, ret.Error + } + + counts := make(map[string]float64) + allStates := []string{ + ProgressStateInit.String(), + ProgressStateSettingUp.String(), + ProgressStateSetUp.String(), + ProgressStateTraining.String(), + ProgressStateTrained.String(), + ProgressStateDelivering.String(), + ProgressStateDelivered.String(), + ProgressStateUserAcknowledged.String(), + ProgressStateFinished.String(), + ProgressStateFailed.String(), + } + for _, s := range allStates { + counts[s] = 0 + } + for _, r := range results { + counts[r.Progress] = r.Count + } + + return counts, nil +} diff --git a/api/fine-tuning/internal/db/test_helpers.go b/api/fine-tuning/internal/db/test_helpers.go new file mode 100644 index 00000000..c420afa3 --- /dev/null +++ b/api/fine-tuning/internal/db/test_helpers.go @@ -0,0 +1,31 @@ +package db + +import ( + "encoding/hex" + "time" + + "github.com/google/uuid" +) + +func (d *DB) InsertTestTask(id uuid.UUID, userAddress, modelHash string) error { + now := time.Now() + task := &Task{ + ID: &id, + CreatedAt: &now, + UpdatedAt: &now, + UserAddress: userAddress, + PreTrainedModelHash: modelHash, + DatasetHash: "0x" + hex.EncodeToString(make([]byte, 32)), + TrainingParams: `{"epochs":1}`, + Fee: "100", + Nonce: "1", + Signature: "0x" + hex.EncodeToString(make([]byte, 65)), + Progress: ProgressStateFinished.String(), + OutputRootHash: "", + } + return d.db.Create(task).Error +} + +func (d *DB) DeleteTestTask(id uuid.UUID) error { + return d.db.Unscoped().Where("id = ?", id.String()).Delete(&Task{}).Error +} diff --git a/api/fine-tuning/internal/handler/handler.go b/api/fine-tuning/internal/handler/handler.go index fa16c362..61411629 100644 --- a/api/fine-tuning/internal/handler/handler.go +++ b/api/fine-tuning/internal/handler/handler.go @@ -8,19 +8,22 @@ import ( "github.com/0glabs/0g-serving-broker/common/log" "github.com/0glabs/0g-serving-broker/common/middleware" 
"github.com/0glabs/0g-serving-broker/fine-tuning/internal/ctrl" + "github.com/0glabs/0g-serving-broker/fine-tuning/internal/serving" ) type Handler struct { - ctrl *ctrl.Ctrl - logger log.Logger - rateLimiter *middleware.RateLimiter + ctrl *ctrl.Ctrl + logger log.Logger + rateLimiter *middleware.RateLimiter + servingProxy *serving.Proxy } -func New(ctrl *ctrl.Ctrl, logger log.Logger, rateLimitRPS float64, rateLimitBurst int) *Handler { +func New(ctrl *ctrl.Ctrl, logger log.Logger, rateLimitRPS float64, rateLimitBurst int, servingProxy *serving.Proxy) *Handler { h := &Handler{ - ctrl: ctrl, - logger: logger, - rateLimiter: middleware.NewRateLimiter(rate.Limit(rateLimitRPS), rateLimitBurst), + ctrl: ctrl, + logger: logger, + rateLimiter: middleware.NewRateLimiter(rate.Limit(rateLimitRPS), rateLimitBurst), + servingProxy: servingProxy, } return h } @@ -34,8 +37,8 @@ func (h *Handler) Register(r *gin.Engine) { group.GET("/user/:userAddress/task/:taskID", h.GetTask) group.GET("/user/:userAddress/task/:taskID/log", h.GetTaskProgress) - group.POST("/user/:userAddress/task/:taskID/lora", middleware.RateLimitMiddleware(h.rateLimiter), h.DownloadLoRA) // Download LoRA with rate limiting - group.POST("/user/:userAddress/dataset", middleware.RateLimitMiddleware(h.rateLimiter), h.UploadDataset) // Upload dataset to TEE with rate limiting + group.POST("/user/:userAddress/task/:taskID/lora", middleware.RateLimitMiddleware(h.rateLimiter), h.DownloadLoRA) + group.POST("/user/:userAddress/dataset", middleware.RateLimitMiddleware(h.rateLimiter), h.UploadDataset) group.GET("/task/pending", h.GetPendingTrainingTaskCount) group.GET("/quote", middleware.RateLimitMiddleware(h.rateLimiter), h.GetQuote) @@ -43,6 +46,10 @@ func (h *Handler) Register(r *gin.Engine) { group.GET("/model", h.ListModel) group.GET("/model/:name", h.GetModel) group.GET("/model/desc/:name", h.GetModelDesc) + + if h.servingProxy != nil { + h.servingProxy.RegisterRoutes(group) + } } func handleBrokerError(ctx 
*gin.Context, err error, context string) { diff --git a/api/fine-tuning/internal/handler/task.go b/api/fine-tuning/internal/handler/task.go index b5128132..ec0b0dee 100644 --- a/api/fine-tuning/internal/handler/task.go +++ b/api/fine-tuning/internal/handler/task.go @@ -11,6 +11,7 @@ import ( "github.com/gin-gonic/gin" "github.com/google/uuid" + "github.com/0glabs/0g-serving-broker/fine-tuning/monitor" "github.com/0glabs/0g-serving-broker/fine-tuning/schema" ) @@ -38,6 +39,8 @@ func (h *Handler) CreateTask(ctx *gin.Context) { return } + monitor.RecordTaskCreated() + monitor.RecordUniqueUser(task.UserAddress) ctx.JSON(http.StatusCreated, gin.H{"id": id}) } diff --git a/api/fine-tuning/internal/services/service.go b/api/fine-tuning/internal/services/service.go index ac634b25..af1e9e54 100644 --- a/api/fine-tuning/internal/services/service.go +++ b/api/fine-tuning/internal/services/service.go @@ -11,6 +11,7 @@ import ( "github.com/0glabs/0g-serving-broker/fine-tuning/config" "github.com/0glabs/0g-serving-broker/fine-tuning/internal/db" "github.com/0glabs/0g-serving-broker/fine-tuning/internal/utils" + "github.com/0glabs/0g-serving-broker/fine-tuning/monitor" "github.com/gammazero/workerpool" ) @@ -211,6 +212,7 @@ func (s *Service) handleTaskFailure(err error, dbTask *db.Task) error { } if errors.Is(err, errSignature) { + monitor.RecordTaskFailed() return s.db.MarkTaskFailed(dbTask) } @@ -219,6 +221,8 @@ func (s *Service) handleTaskFailure(err error, dbTask *db.Task) error { if err := utils.WriteToLogFile(dbTask.ID, fmt.Sprintf("Retrying task %v\n", dbTask.ID)); err != nil { s.logger.Errorf("Write into task log failed: %v", err) } + } else { + monitor.RecordTaskFailed() } return err diff --git a/api/fine-tuning/internal/services/settlement.go b/api/fine-tuning/internal/services/settlement.go index defd9298..ae2a8e85 100644 --- a/api/fine-tuning/internal/services/settlement.go +++ b/api/fine-tuning/internal/services/settlement.go @@ -17,6 +17,7 @@ import ( 
providercontract "github.com/0glabs/0g-serving-broker/fine-tuning/internal/contract" "github.com/0glabs/0g-serving-broker/fine-tuning/internal/db" "github.com/0glabs/0g-serving-broker/fine-tuning/internal/utils" + "github.com/0glabs/0g-serving-broker/fine-tuning/monitor" "github.com/ethereum/go-ethereum/common" "github.com/ethereum/go-ethereum/common/hexutil" "github.com/ethereum/go-ethereum/crypto" @@ -114,6 +115,7 @@ func (s *Settlement) processFinishedTasks(ctx context.Context) error { func (s *Settlement) trySettle(ctx context.Context, task db.Task, userAcked bool) error { s.logger.Infof("settle for task %v, ack %v", task.ID.String(), userAcked) if err := s.doSettlement(ctx, &task, userAcked); err != nil { + monitor.RecordSettlement(err) err = errors.Wrapf(err, "error during do settlement for tasks failed once") s.logger.Errorf("%v", err) if err := utils.WriteToLogFile(task.ID, fmt.Sprintf("Settle task %v failed: %v\n", task.ID, err)); err != nil { @@ -127,10 +129,12 @@ func (s *Settlement) trySettle(ctx context.Context, task db.Task, userAcked bool } return err - } else { - if err := utils.WriteToLogFile(task.ID, fmt.Sprintf("Settle task %s successfully\n", task.ID)); err != nil { - s.logger.Errorf("Write into task log failed: %v", err) - } + } + + monitor.RecordSettlement(nil) + monitor.RecordTaskCompleted() + if err := utils.WriteToLogFile(task.ID, fmt.Sprintf("Settle task %s successfully\n", task.ID)); err != nil { + s.logger.Errorf("Write into task log failed: %v", err) } return nil diff --git a/api/fine-tuning/internal/serving/manager.go b/api/fine-tuning/internal/serving/manager.go new file mode 100644 index 00000000..70ea1ed7 --- /dev/null +++ b/api/fine-tuning/internal/serving/manager.go @@ -0,0 +1,512 @@ +package serving + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "sync" + "syscall" + "time" + + "github.com/0glabs/0g-serving-broker/common/errors" + 
"github.com/0glabs/0g-serving-broker/common/log" + "github.com/0glabs/0g-serving-broker/fine-tuning/internal/db" + "github.com/0glabs/0g-serving-broker/fine-tuning/internal/utils" + "github.com/google/uuid" +) + +// ServedModel represents a LoRA adapter tracked by the serving system. +// State indicates its current storage tier (active on disk, archived in 0G Storage, or loading). +type ServedModel struct { + TaskID uuid.UUID + UserAddress string + BaseModel string + LoRAPath string + ModelName string + RegisteredAt time.Time + LastAccessedAt time.Time + State ModelState + OutputRootHash string +} + +// Manager controls the lifecycle of the vLLM process, LoRA adapter loading/unloading, +// and multi-tier caching (GPU → CPU → Disk → 0G Storage). +type Manager struct { + mu sync.RWMutex + servedModels map[string]*ServedModel // key: modelName + modelReadyChs map[string]chan struct{} // closed when model becomes active or restore fails + db *db.DB + logger log.Logger + config ServingConfig + vllmProcess *exec.Cmd + vllmReady bool + loraModulesDir string + httpClient *http.Client + storageClient StorageDownloader +} + +// ServingConfig holds configuration for the LoRA inference serving subsystem. +type ServingConfig struct { + Enable bool `yaml:"enable"` + BaseModelPath string `yaml:"baseModelPath"` + InferenceGPUIDs string `yaml:"inferenceGpuIds"` + VLLMPort int `yaml:"vllmPort"` + MaxLoraRank int `yaml:"maxLoraRank"` + MaxLoraModules int `yaml:"maxLoraModules"` + MaxCpuLoras int `yaml:"maxCpuLoras"` + LoraModulesDir string `yaml:"loraModulesDir"` + OffloadAfterMinutes int `yaml:"offloadAfterMinutes"` + EnableColdStorage bool `yaml:"enableColdStorage"` + ModelLoadTimeoutSeconds int `yaml:"modelLoadTimeoutSeconds"` + GpuMemoryUtilization float64 `yaml:"gpuMemoryUtilization"` +} + +// NewManager creates a new serving Manager with the given database, config, logger, +// and optional storage client for cold-storage offload/restore. 
+func NewManager(db *db.DB, config ServingConfig, logger log.Logger, storageClient StorageDownloader) *Manager { + loraDir := config.LoraModulesDir + if loraDir == "" { + loraDir = "/tmp/lora-modules" + } + + return &Manager{ + servedModels: make(map[string]*ServedModel), + modelReadyChs: make(map[string]chan struct{}), + db: db, + logger: logger, + config: config, + loraModulesDir: loraDir, + httpClient: &http.Client{ + Timeout: 10 * time.Second, + }, + storageClient: storageClient, + } +} + +// Start launches the vLLM subprocess and begins polling for finished fine-tuning tasks. +func (m *Manager) Start(ctx context.Context) error { + if !m.config.Enable { + m.logger.Info("LoRA serving is disabled") + return nil + } + + if m.config.BaseModelPath == "" { + return errors.New("serving.baseModelPath is required when serving is enabled") + } + + if err := os.MkdirAll(m.loraModulesDir, 0755); err != nil { + return errors.Wrap(err, "create lora modules directory") + } + + go m.startVLLM(ctx) + go m.pollFinishedTasks(ctx) + go m.offloadLoop(ctx) + return nil +} + +// Stop gracefully terminates the vLLM process and all its child processes. +// vLLM runs a multi-process architecture (APIServer + EngineCore), so we kill +// the entire process group to avoid orphaned GPU-holding processes. +func (m *Manager) Stop() error { + m.mu.Lock() + defer m.mu.Unlock() + + if m.vllmProcess == nil || m.vllmProcess.Process == nil { + return nil + } + + pid := m.vllmProcess.Process.Pid + m.logger.Infof("stopping vLLM process group (pid %d)", pid) + + // Kill the entire process group (negative PID) to ensure EngineCore + // and other child processes are terminated and GPU memory is released. 
+ if err := syscall.Kill(-pid, syscall.SIGTERM); err != nil { + m.logger.Warnf("SIGTERM to process group failed: %v, escalating to SIGKILL", err) + if err := syscall.Kill(-pid, syscall.SIGKILL); err != nil { + m.logger.Errorf("SIGKILL to process group also failed: %v", err) + return err + } + } + + // Wait briefly for graceful shutdown, then force kill if still alive. + done := make(chan struct{}) + go func() { + m.vllmProcess.Wait() + close(done) + }() + + select { + case <-done: + m.logger.Info("vLLM process group terminated gracefully") + case <-time.After(10 * time.Second): + m.logger.Warn("vLLM did not exit within 10s, sending SIGKILL") + syscall.Kill(-pid, syscall.SIGKILL) + } + + return nil +} + +func (m *Manager) startVLLM(ctx context.Context) { + port := m.config.VLLMPort + if port == 0 { + port = 8000 + } + + maxLoraRank := m.config.MaxLoraRank + if maxLoraRank == 0 { + maxLoraRank = 64 + } + + args := []string{ + "serve", m.config.BaseModelPath, + "--port", fmt.Sprintf("%d", port), + "--enable-lora", + "--max-lora-rank", fmt.Sprintf("%d", maxLoraRank), + } + + if m.config.MaxLoraModules > 0 { + args = append(args, "--max-loras", fmt.Sprintf("%d", m.config.MaxLoraModules)) + } + if m.config.MaxCpuLoras > 0 { + args = append(args, "--max-cpu-loras", fmt.Sprintf("%d", m.config.MaxCpuLoras)) + } + if m.config.GpuMemoryUtilization > 0 && m.config.GpuMemoryUtilization < 1.0 { + args = append(args, "--gpu-memory-utilization", fmt.Sprintf("%.2f", m.config.GpuMemoryUtilization)) + } + + m.logger.Infof("starting vLLM with args: %v", args) + + cmd := exec.CommandContext(ctx, "vllm", args...) 
+ cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + cmd.Env = os.Environ() + if m.config.InferenceGPUIDs != "" { + cmd.Env = append(cmd.Env, "CUDA_VISIBLE_DEVICES="+m.config.InferenceGPUIDs) + } + cmd.Env = append(cmd.Env, + "VLLM_ALLOW_RUNTIME_LORA_UPDATING=True", + "VLLM_PLUGINS=lora_filesystem_resolver", + "VLLM_LORA_RESOLVER_CACHE_DIR="+m.loraModulesDir, + ) + + m.mu.Lock() + m.vllmProcess = cmd + m.mu.Unlock() + + if err := cmd.Start(); err != nil { + m.logger.Errorf("failed to start vLLM: %v", err) + return + } + + go m.waitForVLLMReady(ctx) + + if err := cmd.Wait(); err != nil { + if ctx.Err() != nil { + m.logger.Info("vLLM stopped due to context cancellation") + return + } + m.logger.Errorf("vLLM exited with error: %v", err) + } +} + +func (m *Manager) waitForVLLMReady(ctx context.Context) { + endpoint := m.GetVLLMEndpoint() + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint+"/health", nil) + if err != nil { + continue + } + resp, err := m.httpClient.Do(req) + if err == nil { + resp.Body.Close() + if resp.StatusCode == http.StatusOK { + m.mu.Lock() + m.vllmReady = true + m.mu.Unlock() + m.logger.Info("vLLM is ready (health check passed)") + return + } + } + } + } +} + +func (m *Manager) pollFinishedTasks(ctx context.Context) { + // Wait for vLLM to be ready before polling + for { + select { + case <-ctx.Done(): + return + case <-time.After(10 * time.Second): + if m.IsReady() { + goto poll + } + m.logger.Debug("waiting for vLLM to be ready before polling tasks...") + } + } + +poll: + m.logger.Info("starting finished task polling for LoRA auto-registration") + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + + m.discoverAndRegisterModels() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + 
m.discoverAndRegisterModels() + } + } +} + +func (m *Manager) discoverAndRegisterModels() { + tasks, err := m.db.GetFinishedTasksForServing() + if err != nil { + m.logger.Warnf("failed to query finished tasks for serving: %v", err) + return + } + + for _, task := range tasks { + taskDir := filepath.Join(utils.GetDataDir(), task.ID.String()) + paths := utils.NewTaskPaths(taskDir) + loraPath := paths.Output + + if _, err := os.Stat(loraPath); os.IsNotExist(err) { + continue + } + + m.mu.RLock() + modelName := m.makeModelName(task.PreTrainedModelHash, *task.ID) + _, exists := m.servedModels[modelName] + m.mu.RUnlock() + + if exists { + continue + } + + registeredName, err := m.RegisterModel(*task.ID, task.UserAddress, task.PreTrainedModelHash, loraPath, task.OutputRootHash) + if err != nil { + m.logger.Warnf("failed to auto-register model for task %s: %v", task.ID, err) + continue + } + + m.logger.Infof("auto-registered LoRA model: %s (task: %s, user: %s)", registeredName, task.ID, task.UserAddress) + } + + m.pruneStaleModels() +} + +func (m *Manager) pruneStaleModels() { + if m.config.MaxLoraModules <= 0 { + return + } + + m.mu.Lock() + defer m.mu.Unlock() + + if len(m.servedModels) <= m.config.MaxLoraModules { + return + } + + var oldest *ServedModel + for _, model := range m.servedModels { + if oldest == nil || model.RegisteredAt.Before(oldest.RegisteredAt) { + oldest = model + } + } + + if oldest != nil { + destDir := filepath.Join(m.loraModulesDir, oldest.ModelName) + if err := os.RemoveAll(destDir); err != nil { + m.logger.Warnf("failed to remove pruned model directory %s: %v", destDir, err) + } + delete(m.servedModels, oldest.ModelName) + m.logger.Infof("pruned oldest served model: %s (task: %s)", oldest.ModelName, oldest.TaskID) + } +} + +// RegisterModel creates a symlink for a LoRA adapter and adds it to the served model set. +// outputRootHash is the 0G Storage root hash used for cold-storage restore if the adapter +// is later offloaded. 
+func (m *Manager) RegisterModel(taskID uuid.UUID, userAddress, baseModel, loraPath, outputRootHash string) (string, error) { + m.mu.Lock() + defer m.mu.Unlock() + + if !m.config.Enable { + return "", errors.New("LoRA serving is not enabled") + } + + modelName := m.makeModelName(baseModel, taskID) + + destDir := filepath.Join(m.loraModulesDir, modelName) + if err := os.MkdirAll(filepath.Dir(destDir), 0755); err != nil { + return "", errors.Wrap(err, "create lora destination directory") + } + + if err := os.Symlink(loraPath, destDir); err != nil && !os.IsExist(err) { + return "", errors.Wrap(err, "symlink lora adapter") + } + + now := time.Now() + served := &ServedModel{ + TaskID: taskID, + UserAddress: userAddress, + BaseModel: baseModel, + LoRAPath: loraPath, + ModelName: modelName, + RegisteredAt: now, + LastAccessedAt: now, + State: ModelStateActive, + OutputRootHash: outputRootHash, + } + + m.servedModels[modelName] = served + m.logger.Infof("registered LoRA model for serving: %s (task: %s, user: %s, hash: %s)", + modelName, taskID, userAddress, outputRootHash) + + return modelName, nil +} + +// UnregisterModel removes a LoRA adapter from serving and cleans up its symlink. +func (m *Manager) UnregisterModel(modelName string) error { + m.mu.Lock() + defer m.mu.Unlock() + + served, exists := m.servedModels[modelName] + if !exists { + return fmt.Errorf("model not found: %s", modelName) + } + + destDir := filepath.Join(m.loraModulesDir, modelName) + if err := os.RemoveAll(destDir); err != nil { + m.logger.Warnf("failed to remove model directory %s: %v", destDir, err) + } + + delete(m.servedModels, modelName) + m.logger.Infof("unregistered LoRA model: %s (task: %s)", modelName, served.TaskID) + + return nil +} + +// ListServedModels returns all currently served LoRA models. 
+func (m *Manager) ListServedModels() []*ServedModel { + m.mu.RLock() + defer m.mu.RUnlock() + + models := make([]*ServedModel, 0, len(m.servedModels)) + for _, model := range m.servedModels { + models = append(models, model) + } + return models +} + +// ListServedModelsForUser returns served models owned by the given user address. +func (m *Manager) ListServedModelsForUser(userAddress string) []*ServedModel { + m.mu.RLock() + defer m.mu.RUnlock() + + var models []*ServedModel + for _, model := range m.servedModels { + if strings.EqualFold(model.UserAddress, userAddress) { + models = append(models, model) + } + } + return models +} + +// GetServedModel retrieves a served model by name. +func (m *Manager) GetServedModel(modelName string) (*ServedModel, bool) { + m.mu.RLock() + defer m.mu.RUnlock() + model, exists := m.servedModels[modelName] + return model, exists +} + +// IsModelOwner checks whether the given user address owns the specified model. +func (m *Manager) IsModelOwner(modelName, userAddress string) bool { + m.mu.RLock() + defer m.mu.RUnlock() + model, exists := m.servedModels[modelName] + if !exists { + return false + } + return strings.EqualFold(model.UserAddress, userAddress) +} + +// IsReady reports whether the vLLM backend has passed its health check. +func (m *Manager) IsReady() bool { + m.mu.RLock() + defer m.mu.RUnlock() + return m.vllmReady +} + +// GetVLLMEndpoint returns the base URL of the local vLLM server. +func (m *Manager) GetVLLMEndpoint() string { + port := m.config.VLLMPort + if port == 0 { + port = 8000 + } + return fmt.Sprintf("http://localhost:%d", port) +} + +// makeModelName builds a deterministic, vLLM-safe model identifier from the +// base model name and task UUID. Non-alphanumeric characters (except - and _) +// are replaced with hyphens so the name is valid for vLLM's model registry. 
+func (m *Manager) makeModelName(baseModel string, taskID uuid.UUID) string { + shortBase := baseModel + if len(shortBase) > 16 { + shortBase = shortBase[:16] + } + shortBase = strings.Map(func(r rune) rune { + if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || r == '-' || r == '_' { + return r + } + return '-' + }, shortBase) + return fmt.Sprintf("ft-%s-%s", shortBase, taskID.String()[:12]) +} + +// GetVLLMModels queries the vLLM server for its currently loaded model names. +func (m *Manager) GetVLLMModels(ctx context.Context) ([]string, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, m.GetVLLMEndpoint()+"/v1/models", nil) + if err != nil { + return nil, err + } + resp, err := m.httpClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + var result struct { + Data []struct { + ID string `json:"id"` + } `json:"data"` + } + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, err + } + + var models []string + for _, m := range result.Data { + models = append(models, m.ID) + } + return models, nil +} diff --git a/api/fine-tuning/internal/serving/model_cache.go b/api/fine-tuning/internal/serving/model_cache.go new file mode 100644 index 00000000..516eb074 --- /dev/null +++ b/api/fine-tuning/internal/serving/model_cache.go @@ -0,0 +1,241 @@ +package serving + +import ( + "context" + "fmt" + "os" + "path/filepath" + "time" +) + +// ModelState represents the storage tier of a LoRA adapter. +type ModelState int + +const ( + // ModelStateActive means the adapter is on local disk and available for vLLM to load. + // vLLM manages the GPU↔CPU transitions internally via its own LRU cache. + ModelStateActive ModelState = iota + // ModelStateArchived means the adapter has been removed from disk and only + // exists in 0G Storage. A download is required before vLLM can serve it. 
+ ModelStateArchived + // ModelStateLoading means the adapter is being downloaded from 0G Storage. + ModelStateLoading +) + +func (s ModelState) String() string { + return [...]string{"active", "archived", "loading"}[s] +} + +// StorageDownloader abstracts 0G Storage download operations so the serving +// package does not depend on the concrete storage.Client type. +type StorageDownloader interface { + DownloadFromStorage(ctx context.Context, hash, filePath string, isTurbo bool) (string, error) +} + +// RecordAccess updates the last-accessed timestamp for a model. +// Called on every inference request to prevent premature offloading. +func (m *Manager) RecordAccess(modelName string) { + m.mu.Lock() + defer m.mu.Unlock() + if model, exists := m.servedModels[modelName]; exists { + model.LastAccessedAt = time.Now() + } +} + +// GetModelState returns the current storage tier state of a served model. +func (m *Manager) GetModelState(modelName string) (ModelState, bool) { + m.mu.RLock() + defer m.mu.RUnlock() + model, exists := m.servedModels[modelName] + if !exists { + return 0, false + } + return model.State, true +} + +// offloadLoop periodically checks for inactive models and moves them to cold +// storage by deleting local files. Models can be restored on demand via RestoreModel. 
+func (m *Manager) offloadLoop(ctx context.Context) { + if m.config.OffloadAfterMinutes <= 0 || !m.config.EnableColdStorage { + m.logger.Info("cold storage offloading disabled") + return + } + + m.logger.Infof("cold storage offload loop started (threshold: %d minutes)", m.config.OffloadAfterMinutes) + ticker := time.NewTicker(1 * time.Minute) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + m.offloadStaleModels() + } + } +} + +func (m *Manager) offloadStaleModels() { + threshold := time.Now().Add(-time.Duration(m.config.OffloadAfterMinutes) * time.Minute) + + m.mu.Lock() + defer m.mu.Unlock() + + for name, model := range m.servedModels { + if model.State != ModelStateActive { + continue + } + if model.OutputRootHash == "" { + continue + } + if model.LastAccessedAt.Before(threshold) { + destDir := filepath.Join(m.loraModulesDir, name) + if err := os.RemoveAll(destDir); err != nil { + m.logger.Warnf("failed to remove symlink %s during offload: %v", destDir, err) + } + if err := os.RemoveAll(model.LoRAPath); err != nil { + m.logger.Warnf("failed to remove LoRA files %s during offload: %v", model.LoRAPath, err) + } + model.State = ModelStateArchived + m.logger.Infof("offloaded model %s to cold storage (last accessed: %s)", + name, model.LastAccessedAt.Format(time.RFC3339)) + } + } +} + +// getOrCreateReadyCh returns a channel that will be closed when the model +// transitions out of the loading state. Must be called with m.mu held. +func (m *Manager) getOrCreateReadyCh(modelName string) chan struct{} { + ch, exists := m.modelReadyChs[modelName] + if !exists { + ch = make(chan struct{}) + m.modelReadyChs[modelName] = ch + } + return ch +} + +// notifyModelReady closes the ready channel for a model, waking all waiters. +// Must be called with m.mu held. 
+func (m *Manager) notifyModelReady(modelName string) { + if ch, exists := m.modelReadyChs[modelName]; exists { + close(ch) + delete(m.modelReadyChs, modelName) + } +} + +// WaitForModel blocks until the model reaches the active state, the context +// is cancelled, or the timeout expires. Returns the final model state. +func (m *Manager) WaitForModel(ctx context.Context, modelName string, timeout time.Duration) (ModelState, error) { + m.mu.RLock() + model, exists := m.servedModels[modelName] + if !exists { + m.mu.RUnlock() + return 0, fmt.Errorf("model not found: %s", modelName) + } + if model.State == ModelStateActive { + m.mu.RUnlock() + return ModelStateActive, nil + } + m.mu.RUnlock() + + // Get or create the notification channel (needs write lock). + m.mu.Lock() + ch := m.getOrCreateReadyCh(modelName) + m.mu.Unlock() + + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + select { + case <-ch: + // Channel closed — model restored (or failed). Check final state. + state, ok := m.GetModelState(modelName) + if !ok { + return 0, fmt.Errorf("model disappeared: %s", modelName) + } + return state, nil + case <-ctx.Done(): + return ModelStateLoading, ctx.Err() + } +} + +// RestoreModel triggers an async download of an archived model from 0G Storage. +// Returns nil immediately if the model is already active or already loading. 
+func (m *Manager) RestoreModel(ctx context.Context, modelName string) error { + m.mu.Lock() + model, exists := m.servedModels[modelName] + if !exists { + m.mu.Unlock() + return fmt.Errorf("model not found: %s", modelName) + } + + switch model.State { + case ModelStateActive: + m.mu.Unlock() + return nil + case ModelStateLoading: + m.mu.Unlock() + return nil + case ModelStateArchived: + model.State = ModelStateLoading + m.getOrCreateReadyCh(modelName) + m.mu.Unlock() + } + + go func() { + if err := m.downloadAndActivate(context.Background(), modelName); err != nil { + m.logger.Errorf("failed to restore model %s from cold storage: %v", modelName, err) + m.mu.Lock() + if mdl, ok := m.servedModels[modelName]; ok { + mdl.State = ModelStateArchived + } + m.notifyModelReady(modelName) + m.mu.Unlock() + } + }() + + return nil +} + +func (m *Manager) downloadAndActivate(ctx context.Context, modelName string) error { + m.mu.RLock() + model, exists := m.servedModels[modelName] + if !exists { + m.mu.RUnlock() + return fmt.Errorf("model not found: %s", modelName) + } + hash := model.OutputRootHash + loraPath := model.LoRAPath + m.mu.RUnlock() + + if m.storageClient == nil { + return fmt.Errorf("storage client not configured") + } + + m.logger.Infof("downloading model %s from 0G Storage (hash: %s)", modelName, hash) + + if err := os.MkdirAll(filepath.Dir(loraPath), 0755); err != nil { + return fmt.Errorf("create lora directory: %w", err) + } + + if _, err := m.storageClient.DownloadFromStorage(ctx, hash, loraPath, false); err != nil { + return fmt.Errorf("download from storage: %w", err) + } + + destDir := filepath.Join(m.loraModulesDir, modelName) + _ = os.Remove(destDir) + if err := os.Symlink(loraPath, destDir); err != nil && !os.IsExist(err) { + return fmt.Errorf("symlink lora adapter: %w", err) + } + + m.mu.Lock() + if mdl, ok := m.servedModels[modelName]; ok { + mdl.State = ModelStateActive + mdl.LastAccessedAt = time.Now() + } + m.notifyModelReady(modelName) + 
m.mu.Unlock() + + m.logger.Infof("model %s restored from cold storage and activated", modelName) + return nil +} diff --git a/api/fine-tuning/internal/serving/proxy.go b/api/fine-tuning/internal/serving/proxy.go new file mode 100644 index 00000000..22cc094e --- /dev/null +++ b/api/fine-tuning/internal/serving/proxy.go @@ -0,0 +1,410 @@ +package serving + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "strings" + "time" + + "github.com/0glabs/0g-serving-broker/common/errors" + "github.com/0glabs/0g-serving-broker/common/log" + "github.com/ethereum/go-ethereum/accounts" + "github.com/ethereum/go-ethereum/common" + "github.com/ethereum/go-ethereum/crypto" + "github.com/gin-gonic/gin" + "github.com/google/uuid" +) + +// Proxy exposes OpenAI-compatible HTTP endpoints for serving fine-tuned LoRA models +// and implements authentication and model ownership enforcement. +type Proxy struct { + manager *Manager + logger log.Logger + client *http.Client +} + +// NewProxy creates a Proxy with the given Manager and logger. +func NewProxy(manager *Manager, logger log.Logger) *Proxy { + return &Proxy{ + manager: manager, + logger: logger, + client: &http.Client{ + Timeout: 5 * time.Minute, + }, + } +} + +// RegisterRoutes adds the LoRA serving endpoints to the given Gin router group. 
+func (p *Proxy) RegisterRoutes(group *gin.RouterGroup) { + serving := group.Group("/serving") + serving.POST("/v1/chat/completions", p.authMiddleware(), p.handleChatCompletions) + serving.GET("/v1/models", p.authMiddleware(), p.handleListModelsForUser) + serving.GET("/models", p.handleListServedModels) + serving.POST("/models/:taskID", p.handleRegisterModel) + serving.DELETE("/models/:modelName", p.authMiddleware(), p.handleUnregisterModel) + serving.GET("/health", p.handleHealth) +} + +func (p *Proxy) authMiddleware() gin.HandlerFunc { + return func(c *gin.Context) { + authHeader := c.GetHeader("Authorization") + if authHeader == "" { + c.JSON(http.StatusUnauthorized, gin.H{"error": "Authorization header required"}) + c.Abort() + return + } + + parts := strings.SplitN(authHeader, " ", 2) + if len(parts) != 2 || strings.ToLower(parts[0]) != "bearer" { + c.JSON(http.StatusUnauthorized, gin.H{"error": "Invalid authorization format, use: Bearer "}) + c.Abort() + return + } + + sig := parts[1] + if !strings.HasPrefix(sig, "0x") { + sig = "0x" + sig + } + + sigBytes := common.FromHex(sig) + if len(sigBytes) != 65 { + c.JSON(http.StatusUnauthorized, gin.H{"error": "Invalid signature length"}) + c.Abort() + return + } + + message := "0g-serving-inference-auth" + hash := accounts.TextHash([]byte(message)) + + if sigBytes[64] >= 27 { + sigBytes[64] -= 27 + } + pubKey, err := crypto.SigToPub(hash, sigBytes) + if err != nil { + c.JSON(http.StatusUnauthorized, gin.H{"error": "Invalid signature"}) + c.Abort() + return + } + + address := crypto.PubkeyToAddress(*pubKey) + c.Set("userAddress", strings.ToLower(address.Hex())) + c.Next() + } +} + +func (p *Proxy) handleChatCompletions(c *gin.Context) { + if !p.manager.IsReady() { + c.JSON(http.StatusServiceUnavailable, gin.H{"error": "Inference service is not ready"}) + return + } + + body, err := io.ReadAll(c.Request.Body) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "Failed to read request body"}) + return + } 
+ + var reqMap map[string]interface{} + if err := json.Unmarshal(body, &reqMap); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid JSON"}) + return + } + + modelName, _ := reqMap["model"].(string) + if modelName == "" { + c.JSON(http.StatusBadRequest, gin.H{"error": "model field is required"}) + return + } + + served, exists := p.manager.GetServedModel(modelName) + if !exists { + c.JSON(http.StatusNotFound, gin.H{"error": fmt.Sprintf("model %s not found", modelName)}) + return + } + + userAddress, _ := c.Get("userAddress") + userAddrStr, _ := userAddress.(string) + if !strings.EqualFold(served.UserAddress, userAddrStr) { + c.JSON(http.StatusForbidden, gin.H{"error": "You are not the owner of this model"}) + return + } + + waitForModel, _ := reqMap["wait_for_model"].(bool) + + switch served.State { + case ModelStateArchived: + if err := p.manager.RestoreModel(c.Request.Context(), modelName); err != nil { + p.logger.Errorf("failed to trigger restore for model %s: %v", modelName, err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to initiate model loading"}) + return + } + if !waitForModel { + c.JSON(http.StatusAccepted, gin.H{ + "error": "Model is being loaded from cold storage. Please retry in a few moments, or set wait_for_model=true to wait.", + "status": "loading", + "model": modelName, + }) + return + } + timeout := p.modelLoadTimeout() + p.logger.Infof("client waiting for model %s to load from cold storage (timeout: %s)", modelName, timeout) + state, err := p.manager.WaitForModel(c.Request.Context(), modelName, timeout) + if err != nil || state != ModelStateActive { + c.JSON(http.StatusGatewayTimeout, gin.H{ + "error": "Model loading timed out or failed. Please retry.", + "status": "timeout", + "model": modelName, + }) + return + } + case ModelStateLoading: + if !waitForModel { + c.JSON(http.StatusAccepted, gin.H{ + "error": "Model is currently being loaded. 
Please retry shortly, or set wait_for_model=true to wait.", + "status": "loading", + "model": modelName, + }) + return + } + timeout := p.modelLoadTimeout() + p.logger.Infof("client waiting for model %s (already loading, timeout: %s)", modelName, timeout) + state, err := p.manager.WaitForModel(c.Request.Context(), modelName, timeout) + if err != nil || state != ModelStateActive { + c.JSON(http.StatusGatewayTimeout, gin.H{ + "error": "Model loading timed out or failed. Please retry.", + "status": "timeout", + "model": modelName, + }) + return + } + } + + p.manager.RecordAccess(modelName) + + endpoint := p.manager.GetVLLMEndpoint() + targetURL := endpoint + "/v1/chat/completions" + + proxyReq, err := http.NewRequestWithContext(c.Request.Context(), "POST", targetURL, bytes.NewBuffer(body)) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create proxy request"}) + return + } + proxyReq.Header.Set("Content-Type", "application/json") + + resp, err := p.client.Do(proxyReq) + if err != nil { + c.JSON(http.StatusBadGateway, gin.H{"error": fmt.Sprintf("Backend error: %v", err)}) + return + } + defer resp.Body.Close() + + for k, v := range resp.Header { + if k == "Content-Length" { + continue + } + c.Writer.Header()[k] = v + } + c.Writer.Header().Set("X-Accel-Buffering", "no") + c.Status(resp.StatusCode) + + if isStreamRequest(body) { + c.Writer.Header().Set("Cache-Control", "no-cache") + c.Writer.Header().Set("Connection", "keep-alive") + c.Writer.Flush() + + buf := make([]byte, 4096) + for { + n, readErr := resp.Body.Read(buf) + if n > 0 { + if _, writeErr := c.Writer.Write(buf[:n]); writeErr != nil { + p.logger.Warnf("stream write error: %v", writeErr) + return + } + c.Writer.Flush() + } + if readErr != nil { + if readErr != io.EOF { + p.logger.Warnf("stream read error: %v", readErr) + } + return + } + } + } else { + respBody, err := io.ReadAll(resp.Body) + if err != nil { + p.logger.Errorf("failed to read response: %v", err) + 
c.JSON(http.StatusBadGateway, gin.H{"error": "Failed to read backend response"}) + return + } + if _, err := c.Writer.Write(respBody); err != nil { + p.logger.Warnf("failed to write response: %v", err) + } + } +} + +func (p *Proxy) handleListModelsForUser(c *gin.Context) { + userAddress, _ := c.Get("userAddress") + userAddrStr, _ := userAddress.(string) + + models := p.manager.ListServedModelsForUser(userAddrStr) + + type modelData struct { + ID string `json:"id"` + Object string `json:"object"` + OwnedBy string `json:"owned_by"` + TaskID string `json:"task_id"` + State string `json:"state"` + } + + data := make([]modelData, 0, len(models)) + for _, m := range models { + data = append(data, modelData{ + ID: m.ModelName, + Object: "model", + OwnedBy: m.UserAddress, + TaskID: m.TaskID.String(), + State: m.State.String(), + }) + } + + c.JSON(http.StatusOK, gin.H{ + "object": "list", + "data": data, + }) +} + +func (p *Proxy) handleListServedModels(c *gin.Context) { + models := p.manager.ListServedModels() + + type modelInfo struct { + ModelName string `json:"modelName"` + TaskID string `json:"taskId"` + UserAddress string `json:"userAddress"` + BaseModel string `json:"baseModel"` + RegisteredAt string `json:"registeredAt"` + LastAccessedAt string `json:"lastAccessedAt"` + State string `json:"state"` + } + + result := make([]modelInfo, 0, len(models)) + for _, m := range models { + result = append(result, modelInfo{ + ModelName: m.ModelName, + TaskID: m.TaskID.String(), + UserAddress: m.UserAddress, + BaseModel: m.BaseModel, + RegisteredAt: m.RegisteredAt.Format("2006-01-02T15:04:05Z"), + LastAccessedAt: m.LastAccessedAt.Format("2006-01-02T15:04:05Z"), + State: m.State.String(), + }) + } + + c.JSON(http.StatusOK, result) +} + +func (p *Proxy) handleRegisterModel(c *gin.Context) { + taskIDStr := c.Param("taskID") + + var req struct { + UserAddress string `json:"userAddress" binding:"required"` + BaseModel string `json:"baseModel" binding:"required"` + LoRAPath string 
`json:"loraPath" binding:"required"` + OutputRootHash string `json:"outputRootHash"` + } + if err := c.BindJSON(&req); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()}) + return + } + + taskID, err := parseUUID(taskIDStr) + if err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid task ID"}) + return + } + + modelName, err := p.manager.RegisterModel(taskID, req.UserAddress, req.BaseModel, req.LoRAPath, req.OutputRootHash) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusCreated, gin.H{ + "modelName": modelName, + "message": "Model registered for serving. Use this model name in chat/completions requests.", + }) +} + +func (p *Proxy) handleUnregisterModel(c *gin.Context) { + modelName := c.Param("modelName") + + userAddress, _ := c.Get("userAddress") + userAddrStr, _ := userAddress.(string) + + if !p.manager.IsModelOwner(modelName, userAddrStr) { + c.JSON(http.StatusForbidden, gin.H{"error": "You are not the owner of this model"}) + return + } + + if err := p.manager.UnregisterModel(modelName); err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) + return + } + + c.JSON(http.StatusOK, gin.H{"message": fmt.Sprintf("Model %s unregistered", modelName)}) +} + +func (p *Proxy) handleHealth(c *gin.Context) { + ready := p.manager.IsReady() + models := p.manager.ListServedModels() + + active, archived, loading := 0, 0, 0 + for _, m := range models { + switch m.State { + case ModelStateActive: + active++ + case ModelStateArchived: + archived++ + case ModelStateLoading: + loading++ + } + } + + c.JSON(http.StatusOK, gin.H{ + "vllm_ready": ready, + "total_models": len(models), + "active_on_disk": active, + "archived_cold": archived, + "loading": loading, + "cold_storage": p.manager.config.EnableColdStorage, + "offload_minutes": p.manager.config.OffloadAfterMinutes, + "model_load_timeout_sec": p.manager.config.ModelLoadTimeoutSeconds, + }) 
+} + +func isStreamRequest(body []byte) bool { + var m map[string]interface{} + if err := json.Unmarshal(body, &m); err != nil { + return false + } + stream, ok := m["stream"].(bool) + return ok && stream +} + +func (p *Proxy) modelLoadTimeout() time.Duration { + secs := p.manager.config.ModelLoadTimeoutSeconds + if secs <= 0 { + secs = 300 + } + return time.Duration(secs) * time.Second +} + +func parseUUID(s string) (uuid.UUID, error) { + id, err := uuid.Parse(s) + if err != nil { + return uuid.UUID{}, errors.Wrap(err, "parse UUID") + } + return id, nil +} diff --git a/api/fine-tuning/internal/serving/registry.go b/api/fine-tuning/internal/serving/registry.go new file mode 100644 index 00000000..6837939b --- /dev/null +++ b/api/fine-tuning/internal/serving/registry.go @@ -0,0 +1,118 @@ +package serving + +import ( + "context" + "sync" + "time" + + "github.com/0glabs/0g-serving-broker/common/log" + providercontract "github.com/0glabs/0g-serving-broker/fine-tuning/internal/contract" +) + +// RegistryConfig holds pricing configuration for registering inference services. +type RegistryConfig struct { + InputPrice string `yaml:"inputPrice"` + OutputPrice string `yaml:"outputPrice"` +} + +// Registry periodically synchronises the set of served LoRA models with the +// inference contract. Currently contract registration is a no-op (see registerOnContract); +// this component tracks serving state locally until the fine-tuning and inference +// brokers share a unified contract interface. +type Registry struct { + mu sync.Mutex + contract *providercontract.ProviderContract + manager *Manager + logger log.Logger + config RegistryConfig + registeredModels map[string]bool +} + +// NewRegistry creates a Registry that will track served models and (eventually) +// register them on the inference contract. 
+func NewRegistry(contract *providercontract.ProviderContract, manager *Manager, config RegistryConfig, logger log.Logger) *Registry { + return &Registry{ + contract: contract, + manager: manager, + logger: logger, + config: config, + registeredModels: make(map[string]bool), + } +} + +// Start begins the background sync loop that tracks model registrations. +// NOTE: On-chain contract registration is not yet implemented; the loop +// only maintains local serving state. See registerOnContract for details. +func (r *Registry) Start(ctx context.Context) { + r.logger.Info("registry started (local tracking only — contract registration not yet implemented)") + go r.syncLoop(ctx) +} + +func (r *Registry) syncLoop(ctx context.Context) { + ticker := time.NewTicker(60 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + r.syncRegistrations(ctx) + } + } +} + +func (r *Registry) syncRegistrations(ctx context.Context) { + r.mu.Lock() + defer r.mu.Unlock() + + models := r.manager.ListServedModels() + + for _, model := range models { + if r.registeredModels[model.ModelName] { + continue + } + + if err := r.registerOnContract(ctx, model); err != nil { + r.logger.Errorf("failed to register model %s on contract: %v", model.ModelName, err) + continue + } + + r.registeredModels[model.ModelName] = true + r.logger.Infof("registered model %s on inference contract", model.ModelName) + } + + currentModels := make(map[string]bool) + for _, m := range models { + currentModels[m.ModelName] = true + } + for name := range r.registeredModels { + if !currentModels[name] { + delete(r.registeredModels, name) + r.logger.Infof("removed contract registration tracking for model: %s", name) + } + } +} + +func (r *Registry) registerOnContract(ctx context.Context, model *ServedModel) error { + r.logger.Infof("registering fine-tuned model on contract: name=%s, task=%s, owner=%s, inputPrice=%s, outputPrice=%s", + model.ModelName, model.TaskID, 
model.UserAddress, r.config.InputPrice, r.config.OutputPrice) + + // TODO: When fine-tuning and inference brokers share a contract interface, this method + // should call contract.AddOrUpdateService() to register the LoRA model as an inference + // service endpoint. For now the fine-tuning contract only tracks deliverables per task + // (recorded during the finalizer phase), so we track serving state locally. + // The inference broker is responsible for on-chain inference service registration. + + r.logger.Infof("model %s marked as registered for inference serving (task: %s, owner: %s)", + model.ModelName, model.TaskID, model.UserAddress) + + return nil +} + +// IsRegistered reports whether the given model name has been tracked as registered. +func (r *Registry) IsRegistered(modelName string) bool { + r.mu.Lock() + defer r.mu.Unlock() + return r.registeredModels[modelName] +} diff --git a/api/fine-tuning/internal/serving/serving_test.go b/api/fine-tuning/internal/serving/serving_test.go new file mode 100644 index 00000000..d448c82c --- /dev/null +++ b/api/fine-tuning/internal/serving/serving_test.go @@ -0,0 +1,745 @@ +package serving + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/0glabs/0g-serving-broker/common/config" + "github.com/0glabs/0g-serving-broker/common/log" + "github.com/google/uuid" +) + +type mockStorageClient struct { + downloadFn func(ctx context.Context, hash, filePath string, isTurbo bool) (string, error) + calls int +} + +func (m *mockStorageClient) DownloadFromStorage(ctx context.Context, hash, filePath string, isTurbo bool) (string, error) { + m.calls++ + if m.downloadFn != nil { + return m.downloadFn(ctx, hash, filePath, isTurbo) + } + if err := os.MkdirAll(filePath, 0755); err != nil { + return "", err + } + return filePath, nil +} + +func newTestLogger() log.Logger { + l, _ := log.GetLogger(&config.LoggerConfig{ + Format: "text", + Level: "debug", + }) + return l +} + +func 
newTestManager(t *testing.T) (*Manager, string) { + t.Helper() + tmpDir := t.TempDir() + loraDir := filepath.Join(tmpDir, "lora-modules") + if err := os.MkdirAll(loraDir, 0755); err != nil { + t.Fatal(err) + } + + m := &Manager{ + servedModels: make(map[string]*ServedModel), + modelReadyChs: make(map[string]chan struct{}), + logger: newTestLogger(), + config: ServingConfig{ + Enable: true, + MaxLoraModules: 16, + MaxCpuLoras: 32, + LoraModulesDir: loraDir, + OffloadAfterMinutes: 5, + EnableColdStorage: true, + ModelLoadTimeoutSeconds: 10, + }, + loraModulesDir: loraDir, + storageClient: &mockStorageClient{}, + } + return m, tmpDir +} + +func createFakeLoRA(t *testing.T, baseDir string) string { + t.Helper() + loraPath := filepath.Join(baseDir, "output_model") + if err := os.MkdirAll(loraPath, 0755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(loraPath, "adapter_config.json"), []byte(`{"r": 8}`), 0644); err != nil { + t.Fatal(err) + } + return loraPath +} + +func TestRegisterModel(t *testing.T) { + m, tmpDir := newTestManager(t) + loraPath := createFakeLoRA(t, tmpDir) + taskID := uuid.New() + + name, err := m.RegisterModel(taskID, "0xUser1", "base-model", loraPath, "0xabc123") + if err != nil { + t.Fatalf("RegisterModel failed: %v", err) + } + + if name == "" { + t.Fatal("expected non-empty model name") + } + + symlink := filepath.Join(m.loraModulesDir, name) + if _, err := os.Lstat(symlink); err != nil { + t.Fatalf("symlink not created: %v", err) + } + + target, err := os.Readlink(symlink) + if err != nil { + t.Fatalf("readlink failed: %v", err) + } + if target != loraPath { + t.Fatalf("symlink target = %s, want %s", target, loraPath) + } +} + +func TestRegisterModelSetsInitialState(t *testing.T) { + m, tmpDir := newTestManager(t) + loraPath := createFakeLoRA(t, tmpDir) + taskID := uuid.New() + + name, _ := m.RegisterModel(taskID, "0xUser1", "base-model", loraPath, "0xhash") + + served, exists := m.GetServedModel(name) + if !exists { + 
t.Fatal("model not found after registration") + } + if served.State != ModelStateActive { + t.Fatalf("state = %v, want Active", served.State) + } + if served.OutputRootHash != "0xhash" { + t.Fatalf("OutputRootHash = %s, want 0xhash", served.OutputRootHash) + } + if served.LastAccessedAt.IsZero() { + t.Fatal("LastAccessedAt should be set") + } +} + +func TestRecordAccess(t *testing.T) { + m, tmpDir := newTestManager(t) + loraPath := createFakeLoRA(t, tmpDir) + taskID := uuid.New() + + name, _ := m.RegisterModel(taskID, "0xUser1", "base-model", loraPath, "") + + served, _ := m.GetServedModel(name) + firstAccess := served.LastAccessedAt + + time.Sleep(10 * time.Millisecond) + m.RecordAccess(name) + + served, _ = m.GetServedModel(name) + if !served.LastAccessedAt.After(firstAccess) { + t.Fatal("LastAccessedAt should be updated after RecordAccess") + } +} + +func TestGetModelState(t *testing.T) { + m, tmpDir := newTestManager(t) + loraPath := createFakeLoRA(t, tmpDir) + taskID := uuid.New() + + name, _ := m.RegisterModel(taskID, "0xUser1", "base", loraPath, "") + + state, exists := m.GetModelState(name) + if !exists { + t.Fatal("model should exist") + } + if state != ModelStateActive { + t.Fatalf("state = %v, want Active", state) + } + + _, exists = m.GetModelState("nonexistent") + if exists { + t.Fatal("nonexistent model should not exist") + } +} + +func TestOffloadStaleModels(t *testing.T) { + m, tmpDir := newTestManager(t) + m.config.OffloadAfterMinutes = 0 // immediate offload for testing + + loraPath := createFakeLoRA(t, tmpDir) + taskID := uuid.New() + + name, _ := m.RegisterModel(taskID, "0xUser1", "base", loraPath, "0xStorageHash") + + // Set last access to the past + m.mu.Lock() + m.servedModels[name].LastAccessedAt = time.Now().Add(-10 * time.Minute) + m.mu.Unlock() + + m.offloadStaleModels() + + state, _ := m.GetModelState(name) + if state != ModelStateArchived { + t.Fatalf("state = %v, want Archived after offload", state) + } + + symlink := 
filepath.Join(m.loraModulesDir, name) + if _, err := os.Lstat(symlink); !os.IsNotExist(err) { + t.Fatal("symlink should be removed after offload") + } + + if _, err := os.Stat(loraPath); !os.IsNotExist(err) { + t.Fatal("LoRA files should be removed after offload") + } +} + +func TestOffloadSkipsModelsWithoutStorageHash(t *testing.T) { + m, tmpDir := newTestManager(t) + m.config.OffloadAfterMinutes = 0 + + loraPath := createFakeLoRA(t, tmpDir) + taskID := uuid.New() + + name, _ := m.RegisterModel(taskID, "0xUser1", "base", loraPath, "") + + m.mu.Lock() + m.servedModels[name].LastAccessedAt = time.Now().Add(-10 * time.Minute) + m.mu.Unlock() + + m.offloadStaleModels() + + state, _ := m.GetModelState(name) + if state != ModelStateActive { + t.Fatalf("model without hash should NOT be offloaded, got state = %v", state) + } +} + +func TestOffloadSkipsRecentlyAccessedModels(t *testing.T) { + m, tmpDir := newTestManager(t) + m.config.OffloadAfterMinutes = 60 + + loraPath := createFakeLoRA(t, tmpDir) + taskID := uuid.New() + + name, _ := m.RegisterModel(taskID, "0xUser1", "base", loraPath, "0xHash") + + m.offloadStaleModels() + + state, _ := m.GetModelState(name) + if state != ModelStateActive { + t.Fatal("recently accessed model should NOT be offloaded") + } +} + +func TestRestoreModel(t *testing.T) { + m, tmpDir := newTestManager(t) + storage := &mockStorageClient{ + downloadFn: func(ctx context.Context, hash, filePath string, isTurbo bool) (string, error) { + if err := os.MkdirAll(filePath, 0755); err != nil { + return "", err + } + return filePath, nil + }, + } + m.storageClient = storage + + loraPath := createFakeLoRA(t, tmpDir) + taskID := uuid.New() + name, _ := m.RegisterModel(taskID, "0xUser1", "base", loraPath, "0xStorageHash") + + // Manually set to archived + m.mu.Lock() + m.servedModels[name].State = ModelStateArchived + m.mu.Unlock() + os.RemoveAll(filepath.Join(m.loraModulesDir, name)) + os.RemoveAll(loraPath) + + ctx := context.Background() + err := 
m.RestoreModel(ctx, name)
	if err != nil {
		t.Fatalf("RestoreModel failed: %v", err)
	}

	// RestoreModel is async, wait for it
	deadline := time.Now().Add(5 * time.Second)
	for time.Now().Before(deadline) {
		state, _ := m.GetModelState(name)
		if state == ModelStateActive {
			break
		}
		time.Sleep(50 * time.Millisecond)
	}

	state, _ := m.GetModelState(name)
	if state != ModelStateActive {
		t.Fatalf("state = %v, want Active after restore", state)
	}

	if storage.calls == 0 {
		t.Fatal("storage download should have been called")
	}
}

// TestRestoreModelAlreadyActive checks that RestoreModel on a model that was
// only registered (never archived by this test) returns no error and does not
// call the storage client.
func TestRestoreModelAlreadyActive(t *testing.T) {
	m, tmpDir := newTestManager(t)
	storage := &mockStorageClient{}
	m.storageClient = storage

	loraPath := createFakeLoRA(t, tmpDir)
	taskID := uuid.New()
	name, _ := m.RegisterModel(taskID, "0xUser1", "base", loraPath, "0xHash")

	ctx := context.Background()
	err := m.RestoreModel(ctx, name)
	if err != nil {
		t.Fatalf("RestoreModel on active model should not error: %v", err)
	}

	// Give any (unexpected) background restore a moment to reach the mock.
	time.Sleep(100 * time.Millisecond)
	if storage.calls > 0 {
		t.Fatal("should NOT download for an already active model")
	}
}

// TestRestoreModelAlreadyLoading forces a registered model into the Archived
// state, starts a restore whose mocked download blocks for two seconds, and
// checks that the model reports Loading and that a second RestoreModel call
// does not produce a second download.
//
// NOTE(review): storage.calls is read while the first restore is presumably
// still in flight; confirm the mock synchronizes that counter.
func TestRestoreModelAlreadyLoading(t *testing.T) {
	m, tmpDir := newTestManager(t)
	storage := &mockStorageClient{
		downloadFn: func(ctx context.Context, hash, filePath string, isTurbo bool) (string, error) {
			// Keep the restore in flight long enough to observe Loading.
			time.Sleep(2 * time.Second)
			if err := os.MkdirAll(filePath, 0755); err != nil {
				return "", err
			}
			return filePath, nil
		},
	}
	m.storageClient = storage

	loraPath := createFakeLoRA(t, tmpDir)
	taskID := uuid.New()
	name, _ := m.RegisterModel(taskID, "0xUser1", "base", loraPath, "0xHash")

	// Flip the state directly under the manager lock to simulate an offloaded model.
	m.mu.Lock()
	m.servedModels[name].State = ModelStateArchived
	m.mu.Unlock()

	ctx := context.Background()
	m.RestoreModel(ctx, name)
	time.Sleep(50 * time.Millisecond)

	state, _ := m.GetModelState(name)
	if state != ModelStateLoading {
		t.Fatalf("state should be Loading, got %v", state)
	}

	// Second call should be a no-op
	m.RestoreModel(ctx, name)
	time.Sleep(50 * time.Millisecond)

	if storage.calls > 1 {
		t.Fatalf("should not trigger duplicate download, got %d calls", storage.calls)
	}
}

// TestUnregisterModel checks that UnregisterModel removes a registered model
// (GetServedModel no longer finds it) and returns an error for an unknown name.
func TestUnregisterModel(t *testing.T) {
	m, tmpDir := newTestManager(t)
	loraPath := createFakeLoRA(t, tmpDir)
	taskID := uuid.New()

	name, _ := m.RegisterModel(taskID, "0xUser1", "base", loraPath, "")

	err := m.UnregisterModel(name)
	if err != nil {
		t.Fatalf("UnregisterModel failed: %v", err)
	}

	_, exists := m.GetServedModel(name)
	if exists {
		t.Fatal("model should not exist after unregister")
	}

	err = m.UnregisterModel("nonexistent")
	if err == nil {
		t.Fatal("UnregisterModel on nonexistent should error")
	}
}

// TestListServedModelsForUser registers two models for one user and one for
// another, then checks the per-user list lengths (2, 1, and 0 for a user with
// no models).
func TestListServedModelsForUser(t *testing.T) {
	m, tmpDir := newTestManager(t)

	lora1 := createFakeLoRA(t, filepath.Join(tmpDir, "task1"))
	lora2 := createFakeLoRA(t, filepath.Join(tmpDir, "task2"))
	lora3 := createFakeLoRA(t, filepath.Join(tmpDir, "task3"))

	m.RegisterModel(uuid.New(), "0xUserA", "base", lora1, "")
	m.RegisterModel(uuid.New(), "0xUserA", "base", lora2, "")
	m.RegisterModel(uuid.New(), "0xUserB", "base", lora3, "")

	modelsA := m.ListServedModelsForUser("0xUserA")
	if len(modelsA) != 2 {
		t.Fatalf("expected 2 models for UserA, got %d", len(modelsA))
	}

	modelsB := m.ListServedModelsForUser("0xUserB")
	if len(modelsB) != 1 {
		t.Fatalf("expected 1 model for UserB, got %d", len(modelsB))
	}

	modelsC := m.ListServedModelsForUser("0xUserC")
	if len(modelsC) != 0 {
		t.Fatalf("expected 0 models for UserC, got %d", len(modelsC))
	}
}

// TestIsModelOwner checks that IsModelOwner matches the registering address
// regardless of letter case, rejects a different address, and returns false
// for an unknown model name.
func TestIsModelOwner(t *testing.T) {
	m, tmpDir := newTestManager(t)
	loraPath := createFakeLoRA(t, tmpDir)
	taskID := uuid.New()

	name, _ := m.RegisterModel(taskID, "0xAbCdEf", "base", loraPath, "")

	if !m.IsModelOwner(name, "0xabcdef") {
		t.Fatal("case-insensitive match should pass")
	}
	if !m.IsModelOwner(name, "0xABCDEF") {
		t.Fatal("uppercase match should pass")
	}
	if m.IsModelOwner(name, "0xother") {
		t.Fatal("different user should not be owner")
	}
	if m.IsModelOwner("nonexistent", "0xAbCdEf") {
		t.Fatal("nonexistent model should return false")
	}
}

// TestMakeModelName checks the shape of generated model names: an "ft-"
// prefix, the leading part of the task ID, no dots, and the same output for
// the same inputs.
func TestMakeModelName(t *testing.T) {
	m, _ := newTestManager(t)
	taskID := uuid.MustParse("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee")

	name := m.makeModelName("Qwen2.5-0.5B-Instruct", taskID)

	if !strings.HasPrefix(name, "ft-") {
		t.Fatalf("model name should start with 'ft-', got: %s", name)
	}
	if !strings.Contains(name, "aaaaaaaa-bbb") {
		t.Fatalf("model name should contain task ID prefix, got: %s", name)
	}
	// Special chars should be sanitized
	if strings.Contains(name, ".") {
		t.Fatalf("dots should be replaced, got: %s", name)
	}

	name2 := m.makeModelName("Qwen2.5-0.5B-Instruct", taskID)
	if name != name2 {
		t.Fatal("makeModelName should be deterministic")
	}
}

// TestPruneStaleModels registers three models with MaxLoraModules set to 2 and
// checks that pruneStaleModels removes the first-registered one, leaving two.
func TestPruneStaleModels(t *testing.T) {
	m, tmpDir := newTestManager(t)
	m.config.MaxLoraModules = 2

	lora1 := createFakeLoRA(t, filepath.Join(tmpDir, "task1"))
	lora2 := createFakeLoRA(t, filepath.Join(tmpDir, "task2"))
	lora3 := createFakeLoRA(t, filepath.Join(tmpDir, "task3"))

	// The short sleeps space the registrations apart in time; presumably the
	// pruning order depends on a per-model timestamp — confirm in pruneStaleModels.
	name1, _ := m.RegisterModel(uuid.New(), "0xU", "b", lora1, "")
	time.Sleep(10 * time.Millisecond)
	m.RegisterModel(uuid.New(), "0xU", "b", lora2, "")
	time.Sleep(10 * time.Millisecond)
	m.RegisterModel(uuid.New(), "0xU", "b", lora3, "")

	m.pruneStaleModels()

	_, exists := m.GetServedModel(name1)
	if exists {
		t.Fatal("oldest model should have been pruned")
	}

	all := m.ListServedModels()
	if len(all) != 2 {
		t.Fatalf("expected 2 models after prune, got %d", len(all))
	}
}

// TestModelStateString is a table test for ModelState.String covering the
// Active, Archived and Loading states.
func TestModelStateString(t *testing.T) {
	tests := []struct {
		state ModelState
		want  string
	}{
		{ModelStateActive, "active"},
		{ModelStateArchived, "archived"},
		{ModelStateLoading, "loading"},
	}
	for _, tt := range tests {
		if got := tt.state.String(); got != tt.want {
			t.Errorf("ModelState(%d).String() = %s, want %s", tt.state, got, tt.want)
		}
	}
}

// TestOffloadLoopDisabledWhenColdStorageOff runs offloadLoop in a goroutine
// with EnableColdStorage set to false and fails if it has not returned within
// one second.
//
// NOTE(review): the context passed to offloadLoop expires after 200ms, which
// is shorter than the 1s guard; if offloadLoop returns on ctx.Done this test
// would also pass without an immediate return — confirm.
func TestOffloadLoopDisabledWhenColdStorageOff(t *testing.T) {
	m, _ := newTestManager(t)
	m.config.EnableColdStorage = false

	ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
	defer cancel()

	// done is closed once offloadLoop returns.
	done := make(chan struct{})
	go func() {
		m.offloadLoop(ctx)
		close(done)
	}()

	select {
	case <-done:
		// offloadLoop returned immediately because cold storage is disabled
	case <-time.After(1 * time.Second):
		t.Fatal("offloadLoop should return immediately when cold storage is disabled")
	}
}

// TestFullOffloadRestoreCycle walks one model through the whole lifecycle:
// registered (Active), offloaded by offloadStaleModels (Archived), restored
// through RestoreModel (Active again), and finally checks that an entry for
// the model exists under loraModulesDir.
func TestFullOffloadRestoreCycle(t *testing.T) {
	m, tmpDir := newTestManager(t)
	m.config.OffloadAfterMinutes = 0

	storage := &mockStorageClient{
		// The mocked download only creates the target directory.
		downloadFn: func(ctx context.Context, hash, filePath string, isTurbo bool) (string, error) {
			if err := os.MkdirAll(filePath, 0755); err != nil {
				return "", err
			}
			return filePath, nil
		},
	}
	m.storageClient = storage

	loraPath := createFakeLoRA(t, tmpDir)
	taskID := uuid.New()
	name, _ := m.RegisterModel(taskID, "0xUser", "base", loraPath, "0xStorageHash")

	// 1. Verify active
	state, _ := m.GetModelState(name)
	if state != ModelStateActive {
		t.Fatalf("initial state = %v, want Active", state)
	}

	// 2. Offload (set access time to past)
	m.mu.Lock()
	m.servedModels[name].LastAccessedAt = time.Now().Add(-1 * time.Hour)
	m.mu.Unlock()
	m.offloadStaleModels()

	state, _ = m.GetModelState(name)
	if state != ModelStateArchived {
		t.Fatalf("after offload state = %v, want Archived", state)
	}

	// 3. Restore
	ctx := context.Background()
	m.RestoreModel(ctx, name)

	// Poll for up to five seconds until the model reports Active.
	deadline := time.Now().Add(5 * time.Second)
	for time.Now().Before(deadline) {
		state, _ = m.GetModelState(name)
		if state == ModelStateActive {
			break
		}
		time.Sleep(50 * time.Millisecond)
	}

	state, _ = m.GetModelState(name)
	if state != ModelStateActive {
		t.Fatalf("after restore state = %v, want Active", state)
	}

	// 4. Verify symlink restored
	// Lstat succeeds for any kind of directory entry, so this only proves the
	// path exists, not that it is a symlink.
	symlink := filepath.Join(m.loraModulesDir, name)
	if _, err := os.Lstat(symlink); err != nil {
		t.Fatalf("symlink should be restored: %v", err)
	}
}

// TestWaitForModelAlreadyActive checks that WaitForModel on a model that was
// only registered returns ModelStateActive and no error.
func TestWaitForModelAlreadyActive(t *testing.T) {
	m, tmpDir := newTestManager(t)
	loraPath := createFakeLoRA(t, tmpDir)
	name, _ := m.RegisterModel(uuid.New(), "0xUser1", "base", loraPath, "")

	ctx := context.Background()
	state, err := m.WaitForModel(ctx, name, 1*time.Second)
	if err != nil {
		t.Fatalf("WaitForModel on active model should not error: %v", err)
	}
	if state != ModelStateActive {
		t.Fatalf("expected Active, got %v", state)
	}
}

// TestWaitForModelRestoreCompletes archives a model, removes its on-disk
// entries, starts a restore with a 200ms mocked download, and checks that
// WaitForModel returns ModelStateActive without error within five seconds.
func TestWaitForModelRestoreCompletes(t *testing.T) {
	m, tmpDir := newTestManager(t)
	storage := &mockStorageClient{
		downloadFn: func(ctx context.Context, hash, filePath string, isTurbo bool) (string, error) {
			time.Sleep(200 * time.Millisecond)
			if err := os.MkdirAll(filePath, 0755); err != nil {
				return "", err
			}
			return filePath, nil
		},
	}
	m.storageClient = storage

	loraPath := createFakeLoRA(t, tmpDir)
	name, _ := m.RegisterModel(uuid.New(), "0xUser1", "base", loraPath, "0xHash")

	// Simulate an offloaded model: Archived state and no local files.
	m.mu.Lock()
	m.servedModels[name].State = ModelStateArchived
	m.mu.Unlock()
	os.RemoveAll(filepath.Join(m.loraModulesDir, name))
	os.RemoveAll(loraPath)

	ctx := context.Background()
	if err := m.RestoreModel(ctx, name); err != nil {
		t.Fatalf("RestoreModel failed: %v", err)
	}

	state, err := m.WaitForModel(ctx, name, 5*time.Second)
	if err != nil {
		t.Fatalf("WaitForModel should succeed: %v", err)
	}
	if state != ModelStateActive {
		t.Fatalf("expected Active after wait, got %v", state)
	}
}

// TestWaitForModelTimeout starts a restore whose mocked download takes five
// seconds and checks that WaitForModel with a 100ms timeout returns
// context.DeadlineExceeded.
//
// NOTE(review): the error is compared with != rather than errors.Is, so a
// wrapped DeadlineExceeded would fail this check.
func TestWaitForModelTimeout(t *testing.T) {
	m, tmpDir := newTestManager(t)
	storage := &mockStorageClient{
		downloadFn: func(ctx context.Context, hash, filePath string, isTurbo bool) (string, error) {
			time.Sleep(5 * time.Second)
			return filePath, nil
		},
	}
	m.storageClient = storage

	loraPath := createFakeLoRA(t, tmpDir)
	name, _ := m.RegisterModel(uuid.New(), "0xUser1", "base", loraPath, "0xHash")

	m.mu.Lock()
	m.servedModels[name].State = ModelStateArchived
	m.mu.Unlock()

	ctx := context.Background()
	m.RestoreModel(ctx, name)

	_, err := m.WaitForModel(ctx, name, 100*time.Millisecond)
	if err == nil {
		t.Fatal("WaitForModel should timeout")
	}
	if err != context.DeadlineExceeded {
		t.Fatalf("expected DeadlineExceeded, got %v", err)
	}
}

// TestWaitForModelContextCancelled starts a slow restore, cancels the context
// after 100ms from a helper goroutine, and checks that WaitForModel (called
// with a 10s timeout) returns an error.
func TestWaitForModelContextCancelled(t *testing.T) {
	m, tmpDir := newTestManager(t)
	storage := &mockStorageClient{
		downloadFn: func(ctx context.Context, hash, filePath string, isTurbo bool) (string, error) {
			time.Sleep(5 * time.Second)
			return filePath, nil
		},
	}
	m.storageClient = storage

	loraPath := createFakeLoRA(t, tmpDir)
	name, _ := m.RegisterModel(uuid.New(), "0xUser1", "base", loraPath, "0xHash")

	m.mu.Lock()
	m.servedModels[name].State = ModelStateArchived
	m.mu.Unlock()

	ctx, cancel := context.WithCancel(context.Background())
	m.RestoreModel(ctx, name)

	// Cancel while WaitForModel is blocked below.
	go func() {
		time.Sleep(100 * time.Millisecond)
		cancel()
	}()

	_, err := m.WaitForModel(ctx, name, 10*time.Second)
	if err == nil {
		t.Fatal("WaitForModel should fail on context cancellation")
	}
}

// TestWaitForModelMultipleWaiters starts one restore and has three goroutines
// call WaitForModel concurrently; every waiter must observe ModelStateActive.
func TestWaitForModelMultipleWaiters(t *testing.T) {
	m, tmpDir := newTestManager(t)
	storage := &mockStorageClient{
		downloadFn: func(ctx context.Context, hash, filePath string, isTurbo bool) (string, error) {
			time.Sleep(200 * time.Millisecond)
			if err := os.MkdirAll(filePath, 0755); err != nil {
+ return "", err + } + return filePath, nil + }, + } + m.storageClient = storage + + loraPath := createFakeLoRA(t, tmpDir) + name, _ := m.RegisterModel(uuid.New(), "0xUser1", "base", loraPath, "0xHash") + + m.mu.Lock() + m.servedModels[name].State = ModelStateArchived + m.mu.Unlock() + os.RemoveAll(filepath.Join(m.loraModulesDir, name)) + os.RemoveAll(loraPath) + + ctx := context.Background() + m.RestoreModel(ctx, name) + + results := make(chan ModelState, 3) + for i := 0; i < 3; i++ { + go func() { + state, _ := m.WaitForModel(ctx, name, 5*time.Second) + results <- state + }() + } + + for i := 0; i < 3; i++ { + state := <-results + if state != ModelStateActive { + t.Fatalf("waiter %d got state %v, want Active", i, state) + } + } +} + +func TestWaitForModelRestoreFails(t *testing.T) { + m, tmpDir := newTestManager(t) + storage := &mockStorageClient{ + downloadFn: func(ctx context.Context, hash, filePath string, isTurbo bool) (string, error) { + time.Sleep(100 * time.Millisecond) + return "", fmt.Errorf("storage unavailable") + }, + } + m.storageClient = storage + + loraPath := createFakeLoRA(t, tmpDir) + name, _ := m.RegisterModel(uuid.New(), "0xUser1", "base", loraPath, "0xHash") + + m.mu.Lock() + m.servedModels[name].State = ModelStateArchived + m.mu.Unlock() + + ctx := context.Background() + m.RestoreModel(ctx, name) + + state, err := m.WaitForModel(ctx, name, 5*time.Second) + if err != nil { + t.Fatalf("WaitForModel should not return error on restore failure: %v", err) + } + if state != ModelStateArchived { + t.Fatalf("expected Archived (restore failed), got %v", state) + } +} diff --git a/api/fine-tuning/monitor/monitor.go b/api/fine-tuning/monitor/monitor.go new file mode 100644 index 00000000..eb233fba --- /dev/null +++ b/api/fine-tuning/monitor/monitor.go @@ -0,0 +1,298 @@ +package monitor + +import ( + "context" + "net/http" + "time" + + "github.com/gin-gonic/gin" + "github.com/prometheus/client_golang/prometheus" +) + +var ( + TasksCreatedTotal 
prometheus.Counter + TasksCompletedTotal prometheus.Counter + TasksFailedTotal prometheus.Counter + + TasksByState *prometheus.GaugeVec + + TaskPhaseDuration *prometheus.HistogramVec + + StorageUploadTotal prometheus.Counter + StorageUploadErrors prometheus.Counter + StorageDownloadTotal prometheus.Counter + StorageDownloadErrors prometheus.Counter + StorageUploadDuration prometheus.Histogram + StorageDownloadDuration prometheus.Histogram + + SettlementTotal prometheus.Counter + SettlementErrors prometheus.Counter + + RequestCount *prometheus.CounterVec + ErrorCount *prometheus.CounterVec + RequestDuration *prometheus.HistogramVec + + UniqueUsersTotal prometheus.Gauge + + uniqueUsersChan chan string +) + +// Init initializes all Prometheus metrics for the fine-tuning service. +// The context is used for graceful shutdown of background goroutines. +func Init(serverName string, ctx context.Context) { + if serverName == "" { + serverName = "fine-tuning" + } + + labels := prometheus.Labels{"server": serverName} + + TasksCreatedTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ft_tasks_created_total", + Help: "Total number of fine-tuning tasks created.", + ConstLabels: labels, + }) + + TasksCompletedTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ft_tasks_completed_total", + Help: "Total number of fine-tuning tasks completed successfully.", + ConstLabels: labels, + }) + + TasksFailedTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ft_tasks_failed_total", + Help: "Total number of fine-tuning tasks that failed.", + ConstLabels: labels, + }) + + TasksByState = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "ft_tasks_by_state", + Help: "Current number of fine-tuning tasks in each state.", + ConstLabels: labels, + }, []string{"state"}) + + TaskPhaseDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "ft_task_phase_duration_seconds", + Help: "Duration of each fine-tuning task phase.", + Buckets: []float64{10, 
30, 60, 120, 300, 600, 1800, 3600, 7200}, + ConstLabels: labels, + }, []string{"phase"}) + + StorageUploadTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ft_storage_upload_total", + Help: "Total number of 0G Storage upload attempts.", + ConstLabels: labels, + }) + + StorageUploadErrors = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ft_storage_upload_errors_total", + Help: "Total number of 0G Storage upload errors.", + ConstLabels: labels, + }) + + StorageDownloadTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ft_storage_download_total", + Help: "Total number of 0G Storage download attempts.", + ConstLabels: labels, + }) + + StorageDownloadErrors = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ft_storage_download_errors_total", + Help: "Total number of 0G Storage download errors.", + ConstLabels: labels, + }) + + StorageUploadDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "ft_storage_upload_duration_seconds", + Help: "Duration of 0G Storage uploads.", + Buckets: []float64{5, 15, 30, 60, 120, 300, 600, 1800}, + ConstLabels: labels, + }) + + StorageDownloadDuration = prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "ft_storage_download_duration_seconds", + Help: "Duration of 0G Storage downloads.", + Buckets: []float64{5, 15, 30, 60, 120, 300, 600, 1800}, + ConstLabels: labels, + }) + + SettlementTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ft_settlement_total", + Help: "Total number of fine-tuning settlements processed.", + ConstLabels: labels, + }) + + SettlementErrors = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "ft_settlement_errors_total", + Help: "Total number of fine-tuning settlement errors.", + ConstLabels: labels, + }) + + RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "ft_requests_total", + Help: "Total number of HTTP requests processed.", + ConstLabels: labels, + }, []string{"path", "status"}) + + ErrorCount = 
prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "ft_requests_errors_total", + Help: "Total number of HTTP request errors.", + ConstLabels: labels, + }, []string{"path", "status"}) + + RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "ft_request_duration_seconds", + Help: "Histogram of HTTP request latencies.", + Buckets: prometheus.DefBuckets, + ConstLabels: labels, + }, []string{"path"}) + + UniqueUsersTotal = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "ft_unique_users_total", + Help: "Number of unique fine-tuning users for the current day (resets at UTC midnight).", + ConstLabels: labels, + }) + + prometheus.MustRegister( + TasksCreatedTotal, TasksCompletedTotal, TasksFailedTotal, + TasksByState, TaskPhaseDuration, + StorageUploadTotal, StorageUploadErrors, StorageDownloadTotal, StorageDownloadErrors, + StorageUploadDuration, StorageDownloadDuration, + SettlementTotal, SettlementErrors, + RequestCount, ErrorCount, RequestDuration, + UniqueUsersTotal, + ) + + // 10 000 provides ~100 s of burst capacity at 100 tasks/s before drops. + uniqueUsersChan = make(chan string, 10000) + go processUniqueUsers(ctx) +} + +func processUniqueUsers(ctx context.Context) { + uniqueUsers := make(map[string]struct{}) + lastResetDay := time.Now().UTC().YearDay() + + for { + select { + case <-ctx.Done(): + return + case userAddress := <-uniqueUsersChan: + currentDay := time.Now().UTC().YearDay() + if currentDay != lastResetDay { + uniqueUsers = make(map[string]struct{}) + lastResetDay = currentDay + UniqueUsersTotal.Set(0) + } + + if _, exists := uniqueUsers[userAddress]; !exists { + uniqueUsers[userAddress] = struct{}{} + UniqueUsersTotal.Set(float64(len(uniqueUsers))) + } + } + } +} + +// TrackMetrics returns a Gin middleware that records HTTP request count, errors, and duration. 
+func TrackMetrics() gin.HandlerFunc { + return func(c *gin.Context) { + startTime := time.Now() + + path := c.Request.URL.Path + c.Next() + + duration := time.Since(startTime).Seconds() + RequestDuration.WithLabelValues(path).Observe(duration) + + status := c.Writer.Status() + RequestCount.WithLabelValues(path, http.StatusText(status)).Inc() + if status >= 400 { + ErrorCount.WithLabelValues(path, http.StatusText(status)).Inc() + } + } +} + +// RecordUniqueUser tracks a unique user address for the current day. +func RecordUniqueUser(userAddress string) { + if userAddress == "" || uniqueUsersChan == nil { + return + } + + select { + case uniqueUsersChan <- userAddress: + default: + } +} + +// RecordTaskCreated increments the counter for total fine-tuning tasks created. +func RecordTaskCreated() { + if TasksCreatedTotal != nil { + TasksCreatedTotal.Inc() + } +} + +// RecordTaskCompleted increments the counter for successfully completed tasks. +func RecordTaskCompleted() { + if TasksCompletedTotal != nil { + TasksCompletedTotal.Inc() + } +} + +// RecordTaskFailed increments the counter for tasks that ended in failure. +func RecordTaskFailed() { + if TasksFailedTotal != nil { + TasksFailedTotal.Inc() + } +} + +// RecordPhaseDuration records how long a task spent in a given phase. +func RecordPhaseDuration(phase string, duration time.Duration) { + if TaskPhaseDuration != nil { + TaskPhaseDuration.WithLabelValues(phase).Observe(duration.Seconds()) + } +} + +// UpdateTaskStateGauge sets the current task count for each state. +func UpdateTaskStateGauge(stateCounts map[string]float64) { + if TasksByState == nil { + return + } + for state, count := range stateCounts { + TasksByState.WithLabelValues(state).Set(count) + } +} + +// RecordStorageUpload records an upload attempt, its error status, and duration. 
+func RecordStorageUpload(err error, duration time.Duration) { + if StorageUploadTotal != nil { + StorageUploadTotal.Inc() + } + if err != nil && StorageUploadErrors != nil { + StorageUploadErrors.Inc() + } + if StorageUploadDuration != nil { + StorageUploadDuration.Observe(duration.Seconds()) + } +} + +// RecordStorageDownload records a download attempt, its error status, and duration. +func RecordStorageDownload(err error, duration time.Duration) { + if StorageDownloadTotal != nil { + StorageDownloadTotal.Inc() + } + if err != nil && StorageDownloadErrors != nil { + StorageDownloadErrors.Inc() + } + if StorageDownloadDuration != nil { + StorageDownloadDuration.Observe(duration.Seconds()) + } +} + +// RecordSettlement records a settlement attempt and its error status. +func RecordSettlement(err error) { + if SettlementTotal != nil { + SettlementTotal.Inc() + } + if err != nil && SettlementErrors != nil { + SettlementErrors.Inc() + } +} +