37 changes: 31 additions & 6 deletions .env.example
@@ -46,9 +46,14 @@ MAX_WORKERS=30
# API Keys and External Services
# =============================================================================

# Serper API for web search and Google Scholar
# Get your key from: https://serper.dev/
SERPER_KEY_ID=your_key
# Exa.ai API for semantic web search
# Get your key from: https://exa.ai/
# Exa provides AI-native neural search with:
# - Semantic understanding (not just keyword matching)
# - Built-in query optimization
# - Direct content retrieval
# - Better results for complex research queries
EXA_API_KEY=your_key

# Jina API for web page reading
# Get your key from: https://jina.ai/
@@ -57,8 +62,8 @@ JINA_API_KEYS=your_key
# Summary model API (OpenAI-compatible) for page summarization
# Get your key from: https://platform.openai.com/
API_KEY=your_key
API_BASE=your_api_base
SUMMARY_MODEL_NAME=your_summary_model_name
API_BASE=https://api.openai.com/v1
SUMMARY_MODEL_NAME=gpt-4o-mini

# Dashscope API for file parsing (PDF, Office, etc.)
# Get your key from: https://dashscope.aliyun.com/
@@ -95,4 +100,24 @@ IDP_KEY_SECRET=your_idp_key_secret

# These are typically set by distributed training frameworks
# WORLD_SIZE=1
# RANK=0
# RANK=0

# =============================================================================
# MLX Configuration (Apple Silicon Only)
# =============================================================================
# For running on Apple Silicon Macs (M1/M2/M3/M4) using MLX framework
# instead of CUDA/vLLM. Uses mlx-lm for efficient local inference.
#
# Requirements:
# pip install mlx-lm
#
# Recommended models:
# - abalogh/Tongyi-DeepResearch-30B-A3B-4bit (17GB, fits 32GB RAM)
# - Original BF16 model requires 62GB+
#
# Usage:
# bash inference/run_mlx_infer.sh
#
# MLX_MODEL=abalogh/Tongyi-DeepResearch-30B-A3B-4bit
# MLX_HOST=127.0.0.1
# MLX_PORT=8080
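
# As a quick sanity check for this configuration, here is a minimal sketch
# (standard library only) that sends one chat request to the local server. It
# assumes the server was started with
# `python -m mlx_lm.server --model "$MLX_MODEL" --host "$MLX_HOST" --port "$MLX_PORT"`
# and that your mlx-lm version exposes its usual OpenAI-compatible
# /v1/chat/completions endpoint; adjust if yours differs.
#
# import json
# import os
# import urllib.request
#
# host = os.environ.get("MLX_HOST", "127.0.0.1")
# port = os.environ.get("MLX_PORT", "8080")
# payload = {
#     "model": os.environ.get("MLX_MODEL", "abalogh/Tongyi-DeepResearch-30B-A3B-4bit"),
#     "messages": [{"role": "user", "content": "Say hello in one word."}],
#     "max_tokens": 16,
# }
# req = urllib.request.Request(
#     f"http://{host}:{port}/v1/chat/completions",
#     data=json.dumps(payload).encode(),
#     headers={"Content-Type": "application/json"},
# )
# with urllib.request.urlopen(req) as resp:
#     print(json.load(resp)["choices"][0]["message"]["content"])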
218 changes: 218 additions & 0 deletions inference/interactive.py
@@ -0,0 +1,218 @@
#!/usr/bin/env python3
"""
Interactive CLI for DeepResearch on Apple Silicon (MLX)

Usage:
python interactive.py [--model MODEL_PATH]

Example:
python interactive.py
python interactive.py --model abalogh/Tongyi-DeepResearch-30B-A3B-4bit
"""

import argparse
import json
import os
import sys
import time

# Load environment variables first
from dotenv import load_dotenv
load_dotenv(os.path.join(os.path.dirname(__file__), "..", ".env"))

# Disable tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Optional: rich for better formatting
try:
    from rich.console import Console
    from rich.markdown import Markdown
    from rich.panel import Panel
    from rich.progress import Progress, SpinnerColumn, TextColumn
    RICH_AVAILABLE = True
    console = Console()
except ImportError:
    RICH_AVAILABLE = False
    console = None


def print_header():
    """Print welcome header."""
    header = """
╔══════════════════════════════════════════════════════════════╗
║            DeepResearch - Interactive Mode (MLX)             ║
║                   Apple Silicon Optimized                    ║
╚══════════════════════════════════════════════════════════════╝
"""
    if RICH_AVAILABLE:
        console.print(header, style="bold blue")
    else:
        print(header)


def print_help():
    """Print help information."""
    help_text = """
Commands:
  /help   - Show this help message
  /quit   - Exit the program (or Ctrl+D)
  /clear  - Clear conversation history (start fresh)
  /status - Show model and memory status

Just type your research question to begin!

Examples:
  > What is the current population of Tokyo?
  > Who won the 2024 Nobel Prize in Physics?
  > Explain the mechanism of CRISPR-Cas9 gene editing
"""
    if RICH_AVAILABLE:
        console.print(Panel(help_text, title="Help", border_style="green"))
    else:
        print(help_text)


def format_answer(answer: str):
    """Format the answer for display."""
    if RICH_AVAILABLE:
        console.print("\n")
        console.print(Panel(Markdown(answer), title="[bold green]Answer[/]", border_style="green"))
    else:
        print("\n" + "=" * 60)
        print("ANSWER:")
        print("=" * 60)
        print(answer)
        print("=" * 60)


def main():
    parser = argparse.ArgumentParser(description="Interactive DeepResearch CLI")
    parser.add_argument("--model", type=str,
                        default="abalogh/Tongyi-DeepResearch-30B-A3B-4bit",
                        help="Model path or HuggingFace ID")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling temperature")
    parser.add_argument("--max_tokens", type=int, default=4096,
                        help="Max tokens per generation")
    parser.add_argument("--max_rounds", type=int, default=15,
                        help="Max research rounds per question")
    args = parser.parse_args()

    print_header()

    # Set max rounds via environment
    os.environ['MAX_LLM_CALL_PER_RUN'] = str(args.max_rounds)

    # Import agent after setting environment
    print("Loading model (this may take a minute)...")

    try:
        from run_mlx_react import MLXReactAgent, TOOL_MAP
    except ImportError as e:
        print(f"Error importing agent: {e}")
        print("Make sure you're running from the inference directory.")
        return 1

    if RICH_AVAILABLE:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console
        ) as progress:
            progress.add_task("Loading MLX model...", total=None)
            agent = MLXReactAgent(
                model_path=args.model,
                temperature=args.temperature,
                max_tokens=args.max_tokens
            )
    else:
        agent = MLXReactAgent(
            model_path=args.model,
            temperature=args.temperature,
            max_tokens=args.max_tokens
        )

    print(f"\nTools available: {list(TOOL_MAP.keys())}")
    print(f"Max rounds per question: {args.max_rounds}")
    print_help()

    while True:
        try:
            # Get user input
            if RICH_AVAILABLE:
                query = console.input("\n[bold cyan]Research Query>[/] ").strip()
            else:
                query = input("\nResearch Query> ").strip()

            # Handle commands
            if not query:
                continue

            if query.lower() in ('/quit', '/exit', '/q'):
                print("Goodbye!")
                break

            if query.lower() == '/help':
                print_help()
                continue

            if query.lower() == '/clear':
                print("Ready for a new question.")
                continue

            if query.lower() == '/status':
                try:
                    import mlx.core as mx
                    # Use new API (mlx >= 0.24) or fall back to deprecated
                    if hasattr(mx, 'get_active_memory'):
                        mem_gb = mx.get_active_memory() / (1024**3)
                    else:
                        mem_gb = mx.metal.get_active_memory() / (1024**3)
                    print(f"Model: {args.model}")
                    print(f"GPU Memory: {mem_gb:.1f} GB")
                except Exception:
                    print(f"Model: {args.model}")
                continue

            if query.startswith('/'):
                print(f"Unknown command: {query}. Type /help for available commands.")
                continue

            # Run research
            print("\nResearching...\n")
            start = time.time()

            data = {'item': {'question': query, 'answer': ''}}
            result = agent.run(data)

            elapsed = time.time() - start

            # Display result
            prediction = result.get('prediction', 'No answer found.')
            termination = result.get('termination', 'unknown')
            num_rounds = len([m for m in result.get('messages', []) if m.get('role') == 'assistant'])

            format_answer(prediction)

            if RICH_AVAILABLE:
                console.print(f"[dim]Completed in {elapsed:.1f}s | {num_rounds} rounds | Termination: {termination}[/]")
            else:
                print(f"\nCompleted in {elapsed:.1f}s | {num_rounds} rounds | Termination: {termination}")

        except KeyboardInterrupt:
            print("\n\nInterrupted. Type /quit to exit or continue with a new question.")
            continue
        except EOFError:
            print("\nGoodbye!")
            break
        except Exception as e:
            print(f"\nError: {e}")
            import traceback
            traceback.print_exc()
            continue

    return 0


if __name__ == "__main__":
    sys.exit(main())
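
# For scripted runs, a batch-mode sketch built on the same agent contract used
# by this CLI. It assumes run_mlx_react is importable (i.e. you run from the
# inference directory, as noted above) and that run() accepts
# {'item': {'question': ..., 'answer': ''}} and returns a dict with
# 'prediction', 'termination', and 'messages', exactly as interactive.py expects.
#
# from run_mlx_react import MLXReactAgent
#
# agent = MLXReactAgent(
#     model_path="abalogh/Tongyi-DeepResearch-30B-A3B-4bit",
#     temperature=0.7,
#     max_tokens=4096,
# )
# questions = [
#     "What is the current population of Tokyo?",
#     "Who won the 2024 Nobel Prize in Physics?",
# ]
# for q in questions:
#     result = agent.run({'item': {'question': q, 'answer': ''}})
#     print(q, "->", result.get('prediction', 'No answer found.'))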
41 changes: 21 additions & 20 deletions inference/prompt.py
@@ -1,30 +1,31 @@
SYSTEM_PROMPT = """You are a deep research assistant. Your core function is to conduct thorough, multi-source investigations into any topic. You must handle both broad, open-domain inquiries and queries within specialized academic fields. For every request, synthesize information from credible, diverse sources to deliver a comprehensive, accurate, and objective response. When you have gathered sufficient information and are ready to provide the definitive response, you must enclose the entire final answer within <answer></answer> tags.
SYSTEM_PROMPT = """You are a deep research assistant. Your core function is to conduct thorough, multi-source investigations into any topic. You must handle both broad, open-domain inquiries and queries within specialized academic fields. For every request, synthesize information from credible, diverse sources to deliver a comprehensive, accurate, and objective response.

# CRITICAL: Answer Behavior

**You MUST provide a final answer after gathering sufficient information.** Do not continue researching indefinitely.

Guidelines for when to provide your answer:
1. After 2-3 search queries that return relevant results, you likely have enough information
2. If multiple sources agree on key facts, you have sufficient confirmation
3. If a webpage visit fails, use the search snippets you already have
4. A good answer with available information is better than endless searching
5. When uncertain, provide the best answer you can with appropriate caveats

**When ready to answer, use this format:**
<think>Final reasoning about the gathered information</think>
<answer>Your comprehensive answer here</answer>

# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{"type": "function", "function": {"name": "search", "description": "Perform Google web searches then returns a string of the top search results. Accepts multiple queries.", "parameters": {"type": "object", "properties": {"query": {"type": "array", "items": {"type": "string", "description": "The search query."}, "minItems": 1, "description": "The list of search queries."}}, "required": ["query"]}}}
{"type": "function", "function": {"name": "visit", "description": "Visit webpage(s) and return the summary of the content.", "parameters": {"type": "object", "properties": {"url": {"type": "array", "items": {"type": "string"}, "description": "The URL(s) of the webpage(s) to visit. Can be a single URL or an array of URLs."}, "goal": {"type": "string", "description": "The specific information goal for visiting webpage(s)."}}, "required": ["url", "goal"]}}}
{"type": "function", "function": {"name": "PythonInterpreter", "description": "Executes Python code in a sandboxed environment. To use this tool, you must follow this format:
1. The 'arguments' JSON object must be empty: {}.
2. The Python code to be executed must be placed immediately after the JSON block, enclosed within <code> and </code> tags.

IMPORTANT: Any output you want to see MUST be printed to standard output using the print() function.

Example of a correct call:
<tool_call>
{"name": "PythonInterpreter", "arguments": {}}
<code>
import numpy as np
# Your code here
print(f"The result is: {np.mean([1,2,3])}")
</code>
</tool_call>", "parameters": {"type": "object", "properties": {}, "required": []}}}
{"type": "function", "function": {"name": "google_scholar", "description": "Leverage Google Scholar to retrieve relevant information from academic publications. Accepts multiple queries. This tool will also return results from google search", "parameters": {"type": "object", "properties": {"query": {"type": "array", "items": {"type": "string", "description": "The search query."}, "minItems": 1, "description": "The list of search queries for Google Scholar."}}, "required": ["query"]}}}
{"type": "function", "function": {"name": "parse_file", "description": "This is a tool that can be used to parse multiple user uploaded local files such as PDF, DOCX, PPTX, TXT, CSV, XLSX, DOC, ZIP, MP4, MP3.", "parameters": {"type": "object", "properties": {"files": {"type": "array", "items": {"type": "string"}, "description": "The file name of the user uploaded local files to be parsed."}}, "required": ["files"]}}}
{"type": "function", "function": {"name": "search", "description": "Perform web searches and return top results with snippets. Use this first to find relevant sources.", "parameters": {"type": "object", "properties": {"query": {"type": "array", "items": {"type": "string"}, "minItems": 1, "description": "Search queries (1-3 queries recommended)."}}, "required": ["query"]}}}
{"type": "function", "function": {"name": "visit", "description": "Visit webpage(s) to extract detailed content. Only visit if search snippets are insufficient.", "parameters": {"type": "object", "properties": {"url": {"type": "array", "items": {"type": "string"}, "description": "URL(s) to visit."}, "goal": {"type": "string", "description": "What specific information you need from the page."}}, "required": ["url", "goal"]}}}
{"type": "function", "function": {"name": "google_scholar", "description": "Search academic publications. Use for scientific/research questions.", "parameters": {"type": "object", "properties": {"query": {"type": "array", "items": {"type": "string"}, "minItems": 1, "description": "Academic search queries."}}, "required": ["query"]}}}
{"type": "function", "function": {"name": "PythonInterpreter", "description": "Execute Python code for calculations or data processing.", "parameters": {"type": "object", "properties": {}, "required": []}}}
{"type": "function", "function": {"name": "parse_file", "description": "Parse uploaded files (PDF, DOCX, etc.).", "parameters": {"type": "object", "properties": {"files": {"type": "array", "items": {"type": "string"}, "description": "File names to parse."}}, "required": ["files"]}}}
</tools>

For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
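
# To make the expected model output concrete, here is a hypothetical parser for
# the <tool_call> format described above. It assumes one tool call per assistant
# turn, with the JSON object on the first line of the block and an optional
# <code>...</code> payload after it (the PythonInterpreter convention shown in
# the old prompt); the actual parsing in run_mlx_react may differ.
#
# import json
# import re
#
# def parse_tool_call(text: str):
#     """Extract (name, arguments, code) from a model response, or None."""
#     block = re.search(r"<tool_call>(.*?)</tool_call>", text, re.DOTALL)
#     if not block:
#         return None
#     body = block.group(1).strip()
#     # The JSON object is the first line of the block.
#     json_part, _, rest = body.partition("\n")
#     call = json.loads(json_part)
#     code = re.search(r"<code>(.*?)</code>", rest, re.DOTALL)
#     return call["name"], call.get("arguments", {}), code.group(1) if code else None
#
# example = '''<tool_call>
# {"name": "search", "arguments": {"query": ["population of Tokyo 2024"]}}
# </tool_call>'''
# print(parse_tool_call(example))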