"""
vLLM provider example.
Demonstrates using vLLM for high-performance local inference.
Prerequisites:
1. Install vLLM: pip install vllm
2. Start vLLM server:
python -m vllm.entrypoints.openai.api_server \
--model meta-llama/Llama-3-8B-Instruct \
--host 0.0.0.0 \
--port 8000
"""
import asyncio

from cascadeflow.providers.vllm import VLLMProvider


async def main():
    """Test vLLM provider."""
    print("vLLM Provider Test\n")

    # Initialize provider
    provider = VLLMProvider(base_url="http://localhost:8000/v1")
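    # Note (assumption): if the vLLM server was launched with a different
    # --host or --port, point base_url at that address instead; this example
    # assumes the default http://localhost:8000/v1.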

    try:
        # List available models
        print("Checking available models...")
        models = await provider.list_models()
        print(f"Available models: {models}\n")

        if not models:
            print("No models found. Make sure vLLM server is running.")
            return

        # Use first available model
        model = models[0]
        print(f"Using model: {model}\n")

        # Test completion
        print("Testing completion...")
        result = await provider.complete(
            prompt="Explain AI in one sentence", model=model, max_tokens=100
        )

        print(f"Response: {result.content}")
        print(f"Tokens: {result.tokens_used}")
        print(f"Latency: {result.latency_ms:.0f}ms")
        print(f"Cost: ${result.cost:.4f} (self-hosted)")

    except Exception as e:
        print(f"Error: {e}")
        print("\nMake sure vLLM server is running:")
        print("  python -m vllm.entrypoints.openai.api_server \\")
        print("      --model meta-llama/Llama-3-8B-Instruct \\")
        print("      --host 0.0.0.0 --port 8000")

    finally:
        await provider.client.aclose()


if __name__ == "__main__":
    asyncio.run(main())