test_code_analysis.py
#!/usr/bin/env python3
"""Benchmark 30B model with full code analysis - Dynamic Context Version"""
import requests
import time
import sys
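# Assumptions: an OpenAI-compatible chat endpoint is listening on
# http://localhost:8091 (see the request below) and Pipes.py is in the
# current working directory.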
# Read the full Pipes.py code
with open("Pipes.py", "r") as f:
    code_content = f.read()
print(
    f"Code size: {len(code_content):,} chars, ~{len(code_content)//4:,} tokens estimate"
)
# Use 30B model for code analysis
prompt = f"""Here is the complete Pipes.py code from ezlocalai:
```python
{code_content}
```
Analyze this code and explain:
1. How does the GPU layer auto-calibration work?
2. What is the fallback strategy if native estimation fails?
3. How does the multi-model hot-swap work?
Be concise but thorough."""
print(f"Prompt size: {len(prompt):,} chars")
print(f"\nTesting 30B model with full code analysis...")
print("(Dynamic context - will auto-size to fit prompt)")
print("(30B should have ~38 GPU layers at 32k context now)\n")
sys.stdout.flush()
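# Wall-clock timing covers the full round trip: network, prompt processing, and generation.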
start = time.time()
resp = requests.post(
    "http://localhost:8091/v1/chat/completions",
    json={
        "model": "unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1000,
        "temperature": 0.3,
    },
    timeout=1800,  # 30 min timeout
)
elapsed = time.time() - start
if resp.status_code == 200:
    data = resp.json()
    usage = data.get("usage", {})
    # Server-side timings, if the backend reports them (llama.cpp-style "timings" object)
    timings = data.get("timings", {})
    content = data["choices"][0]["message"]["content"]
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)
    print("=== 30B Model with Dynamic Context ===")
    print(f"Prompt tokens: {prompt_tokens:,}")
    print(f"Completion tokens: {completion_tokens:,}")
    print(f"Total time: {elapsed:.1f}s")
    print(f"Generation speed: {completion_tokens/elapsed:.1f} tok/s overall")
    if timings:
        print("\nLLM Timings:")
        print(f"  Prompt processing: {timings.get('prompt_per_second', 0):.1f} tok/s")
        print(f"  Generation: {timings.get('predicted_per_second', 0):.1f} tok/s")
    print(f"\n{'=' * 60}")
    print("RESPONSE:")
    print(f"{'=' * 60}\n")
    print(content)
else:
    print(f"Error: {resp.status_code}")
    print(resp.text[:1000])
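# Usage sketch (assumes the server above is already running):
#   python test_code_analysis.py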