agent-systems-eval/test_system.py at main · m-levytskyi/agent-systems-eval · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
"""
Test script to verify the structure and functionality of the agent system.

This script performs dry-run tests without making actual API calls.
"""

import os
import sys
import json
from pathlib import Path
from dotenv import load_dotenv

load_dotenv()


def test_imports():
    """Test that all modules can be imported."""
    print("Testing imports...")
    try:
        import monolithic
        import ensemble
        import evaluate
        print("✓ All modules imported successfully")
        return True
    except Exception as e:
        print(f"✗ Import error: {e}")
        return False


def test_critical_dependencies():
    """Test that all critical dependencies are available."""
    print("\nTesting critical dependencies...")
    missing = []

    # Core dependencies
    dependencies = [
        ("mlflow", "MLflow for experiment tracking"),
        ("openai", "OpenAI SDK for LLM compatibility"),
        ("crewai", "CrewAI for agent orchestration"),
        ("litellm", "LiteLLM for CrewAI LLM integration"),
        ("dotenv", "python-dotenv for configuration"),
        ("PyPDF2", "PyPDF2 for document loading"),
        ("bert_score", "BERTScore for evaluation"),
        ("rouge_score", "ROUGE for evaluation"),
    ]

    for module_name, description in dependencies:
        try:
            __import__(module_name)
            print(f"✓ {description}: {module_name}")
        except ImportError as e:
            print(f"✗ Missing: {module_name} - {description}")
            missing.append(module_name)

    if missing:
        print(f"\n⚠️  Missing dependencies: {', '.join(missing)}")
        print("Install with: pip install -r requirements.txt")
        return False

    return True


def test_crewai_llm_compatibility():
    """Test that CrewAI LLM can be initialized without errors."""
    print("\nTesting CrewAI LLM compatibility...")
    try:
        from crewai import LLM

        # Test LLM initialization with a simple model string
        # This should not raise ImportError about LiteLLM
        test_model = "openai/gpt-3.5-turbo"
        llm = LLM(model=test_model)
        print(f"✓ CrewAI LLM initialized successfully with model: {test_model}")
        return True
    except ImportError as e:
        if "LiteLLM" in str(e):
            print(f"✗ LiteLLM dependency missing: {e}")
            print("Install with: pip install litellm")
        else:
            print(f"✗ CrewAI import error: {e}")
        return False
    except Exception as e:
        # Other errors are acceptable (e.g., API key issues) as long as LLM class loads
        print(f"✓ CrewAI LLM class loads (runtime error expected without API key: {e})")
        return True


def test_data_files():
    """Test that all required data files exist."""
    print("\nTesting data files...")

    required_files = [
        "data/source_documents/doc1_ai_history.pdf",
        "data/source_documents/doc2_ml_fundamentals.pdf",
        "data/source_documents/doc3_ai_ethics.pdf",
        "data/tasks/synthesis_tasks.json"
    ]

    all_exist = True
    for filepath in required_files:
        if os.path.exists(filepath):
            print(f"✓ Found: {filepath}")
        else:
            print(f"✗ Missing: {filepath}")
            all_exist = False

    return all_exist


def test_task_structure():
    """Test that tasks file has correct structure."""
    print("\nTesting task file structure...")
    try:
        with open("data/tasks/synthesis_tasks.json", "r", encoding="utf-8") as f:
            tasks = json.load(f)

        if not isinstance(tasks, list):
            print("✗ Tasks must be a list")
            return False

        if len(tasks) == 0:
            print("✗ No tasks found")
            return False

        print(f"✓ Found {len(tasks)} tasks")

        for i, task in enumerate(tasks):
            required_keys = ["task_id", "task_description"]
            for key in required_keys:
                if key not in task:
                    print(f"✗ Task {i} missing key: {key}")
                    return False
            print(f"  ✓ Task {task['task_id']}: {task['task_description'][:50]}...")

        return True
    except Exception as e:
        print(f"✗ Error reading tasks: {e}")
        return False


def test_agent_initialization():
    """Test that agents can be initialized (without API key)."""
    print("\nTesting agent initialization...")
    try:
        os.environ["LLM_PROVIDER"] = "ollama"
        os.environ.setdefault("OLLAMA_MODEL", "qwen2.5:7b")
        os.environ.setdefault("CREWAI_MODEL", "openai/qwen2.5:7b")

        from monolithic import MonolithicAgent
        from ensemble import EnsembleAgent

        mono = MonolithicAgent()
        print(f"✓ MonolithicAgent initialized (model: {mono.model})")

        # Note: EnsembleAgent now uses CrewAI Flows, so we just check instantiation
        ens = EnsembleAgent()
        print(f"✓ EnsembleAgent initialized (CrewAI model: {ens.model})")

        return True
    except Exception as e:
        print(f"✗ Initialization error: {e}")
        return False


def test_document_loading():
    """Test that documents can be loaded."""
    print("\nTesting document loading...")
    try:
        from evaluate import load_source_documents, load_tasks

        docs = load_source_documents("data/source_documents")
        print(f"✓ Loaded {len(docs)} documents")

        for i, doc in enumerate(docs):
            print(f"  ✓ Document {i+1}: {len(doc)} characters")

        tasks = load_tasks("data/tasks/synthesis_tasks.json")
        print(f"✓ Loaded {len(tasks)} tasks")

        return True
    except Exception as e:
        print(f"✗ Loading error: {e}")
        return False


def test_project_structure():
    """Test overall project structure."""
    print("\nTesting project structure...")

    required_files = [
        "README.md",
        "requirements.txt",
        ".gitignore",
        ".env.example",
        "monolithic.py",
        "ensemble.py",
        "evaluate.py"
    ]

    all_exist = True
    for filepath in required_files:
        if os.path.exists(filepath):
            print(f"✓ Found: {filepath}")
        else:
            print(f"✗ Missing: {filepath}")
            all_exist = False

    return all_exist


def main():
    """Run all tests."""
    print("="*60)
    print("Agent Systems Evaluation - Test Suite")
    print("="*60)

    tests = [
        ("Project Structure", test_project_structure),
        ("Critical Dependencies", test_critical_dependencies),
        ("Module Imports", test_imports),
        ("CrewAI LLM Compatibility", test_crewai_llm_compatibility),
        ("Data Files", test_data_files),
        ("Task Structure", test_task_structure),
        ("Agent Initialization", test_agent_initialization),
        ("Document Loading", test_document_loading)
    ]

    results = []
    for test_name, test_func in tests:
        try:
            result = test_func()
            results.append((test_name, result))
        except Exception as e:
            print(f"\n✗ Test '{test_name}' failed with exception: {e}")
            results.append((test_name, False))

    print("\n" + "="*60)
    print("Test Summary")
    print("="*60)

    for test_name, result in results:
        status = "PASS" if result else "FAIL"
        symbol = "✓" if result else "✗"
        print(f"{symbol} {test_name}: {status}")

    total = len(results)
    passed = sum(1 for _, result in results if result)

    print(f"\nTotal: {passed}/{total} tests passed")

    if passed == total:
        print("\n🎉 All tests passed!")
        return 0
    else:
        print(f"\n⚠️  {total - passed} test(s) failed")
        return 1


if __name__ == "__main__":
    sys.exit(main())