diff --git a/strix/interface/main.py b/strix/interface/main.py index 1da6e54b..4cdd99b6 100644 --- a/strix/interface/main.py +++ b/strix/interface/main.py @@ -62,6 +62,9 @@ def validate_environment() -> None: # noqa: PLR0912, PLR0915 if not has_base_url: missing_optional_vars.append("LLM_API_BASE") + if not os.getenv("FIRECRAWL_API_KEY"): + missing_optional_vars.append("FIRECRAWL_API_KEY") + if not os.getenv("PERPLEXITY_API_KEY"): missing_optional_vars.append("PERPLEXITY_API_KEY") @@ -109,6 +112,13 @@ def validate_environment() -> None: # noqa: PLR0912, PLR0915 " - Custom API base URL if using local models (e.g., Ollama, LMStudio)\n", style="white", ) + elif var == "FIRECRAWL_API_KEY": + error_text.append("• ", style="white") + error_text.append("FIRECRAWL_API_KEY", style="bold cyan") + error_text.append( + " - API key for Firecrawl web search (enables real-time research)\n", + style="white", + ) elif var == "PERPLEXITY_API_KEY": error_text.append("• ", style="white") error_text.append("PERPLEXITY_API_KEY", style="bold cyan") @@ -134,6 +144,10 @@ def validate_environment() -> None: # noqa: PLR0912, PLR0915 "# needed for local models only\n", style="dim white", ) + elif var == "FIRECRAWL_API_KEY": + error_text.append( + "export FIRECRAWL_API_KEY='your-firecrawl-key-here'\n", style="dim white" + ) elif var == "PERPLEXITY_API_KEY": error_text.append( "export PERPLEXITY_API_KEY='your-perplexity-key-here'\n", style="dim white" @@ -527,6 +541,14 @@ def main() -> None: results_path = Path("strix_runs") / args.run_name display_completion_message(args, results_path) + # Cleanup runtime resources (e.g. 
Docker container) + from strix.runtime import get_runtime + try: + runtime = get_runtime() + asyncio.run(runtime.cleanup()) + except Exception: + pass + if args.non_interactive: tracer = get_global_tracer() if tracer and tracer.vulnerability_reports: diff --git a/strix/tools/__init__.py b/strix/tools/__init__.py index 4193d410..6ed22bbc 100644 --- a/strix/tools/__init__.py +++ b/strix/tools/__init__.py @@ -22,6 +22,7 @@ SANDBOX_MODE = os.getenv("STRIX_SANDBOX_MODE", "false").lower() == "true" +HAS_FIRECRAWL_API = bool(os.getenv("FIRECRAWL_API_KEY")) HAS_PERPLEXITY_API = bool(os.getenv("PERPLEXITY_API_KEY")) DISABLE_BROWSER = os.getenv("STRIX_DISABLE_BROWSER", "false").lower() == "true" @@ -41,7 +42,7 @@ from .thinking import * # noqa: F403 from .todo import * # noqa: F403 - if HAS_PERPLEXITY_API: + if HAS_PERPLEXITY_API or HAS_FIRECRAWL_API: from .web_search import * # noqa: F403 else: if not DISABLE_BROWSER: diff --git a/strix/tools/web_search/web_search_actions.py b/strix/tools/web_search/web_search_actions.py index 52f00a97..d54f4eed 100644 --- a/strix/tools/web_search/web_search_actions.py +++ b/strix/tools/web_search/web_search_actions.py @@ -1,12 +1,45 @@ import os +import logging +import tarfile +import io +from pathlib import Path from typing import Any +import litellm import requests +from tenacity import retry, stop_after_attempt, wait_exponential from strix.tools.registry import register_tool +from strix.telemetry.tracer import get_global_tracer +from strix.runtime import get_runtime +logger = logging.getLogger(__name__) -SYSTEM_PROMPT = """You are assisting a cybersecurity agent specialized in vulnerability scanning +SYSTEM_PROMPT_FIRECRAWL = """You are assisting a cybersecurity agent specialized in vulnerability scanning +and security assessment running on Kali Linux. + +You have been provided with search results from the web in Markdown format. +Your task is to synthesize this information to answer the user's query comprehensively. + +1. 
Prioritize cybersecurity-relevant information including: + - Vulnerability details (CVEs, CVSS scores, impact) + - Security tools, techniques, and methodologies + - Exploit information and proof-of-concepts + - Security best practices and mitigations + - Web application security findings + +2. Provide technical depth appropriate for security professionals. +3. Cite sources implicitly by validating facts against the provided search context. +4. If the search results do not contain the answer, state that clearly and suggest what else might be searched. +5. Focus on actionable intelligence for security assessment. +6. Be detailed and specific - always include concrete code examples, command-line instructions, + or practical implementation steps when applicable. + +Structure your response to be comprehensive yet concise, emphasizing the most critical +security implications. +""" + +SYSTEM_PROMPT_PERPLEXITY = """You are assisting a cybersecurity agent specialized in vulnerability scanning and security assessment running on Kali Linux. When responding to search queries: 1. Prioritize cybersecurity-relevant information including: @@ -14,7 +47,6 @@ - Security tools, techniques, and methodologies - Exploit information and proof-of-concepts - Security best practices and mitigations - - Penetration testing approaches - Web application security findings 2. 
Provide technical depth appropriate for security professionals @@ -31,50 +63,209 @@ security implications and details.""" +@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10), reraise=True) +def _firecrawl_search(query: str, api_key: str) -> dict[str, Any]: + """Execute search via Firecrawl API with retries.""" + url = "https://api.firecrawl.dev/v1/search" + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + } + + # We ask for markdown content to feed into the LLM + payload = { + "query": query, + "limit": 5, + "scrapeOptions": { + "formats": ["markdown"] + } + } + + response = requests.post(url, headers=headers, json=payload, timeout=60) + try: + response.raise_for_status() + except requests.exceptions.HTTPError as e: + # Include the response text in the error for debugging + raise RuntimeError(f"Firecrawl API Error ({e.response.status_code}): {e.response.text}") from e + + return response.json() + + +def _save_scraped_data(data: list[dict[str, Any]], query: str, agent_state: Any | None) -> None: + """Save scraped data to host and sandbox.""" + try: + tracer = get_global_tracer() + if not tracer: + return + + run_dir = tracer.get_run_dir() + scraped_dir = run_dir / "scraped_data" + scraped_dir.mkdir(exist_ok=True) + + # Sanitize query for filename + import re + safe_query = re.sub(r'[^a-zA-Z0-9]', '_', query)[:50] + + saved_files = [] + + for i, item in enumerate(data): + markdown = item.get("markdown", "") + title = item.get("title", "untitled") + url = item.get("url", "no_url") + + if not markdown: + continue + + filename = f"{safe_query}_{i}.md" + file_path = scraped_dir / filename + + with file_path.open("w", encoding="utf-8") as f: + f.write(f"--- \nTitle: {title}\nURL: {url}\n---\n\n") + f.write(markdown) + saved_files.append(file_path) + + # If we have an active agent state (sandbox), copy files there + if agent_state and agent_state.sandbox_id: + try: + runtime = get_runtime() + # 
Check if it's DockerRuntime by duck typing or import + if hasattr(runtime, "client"): + container = runtime.client.containers.get(agent_state.sandbox_id) + + # Create tarball in memory + tar_buffer = io.BytesIO() + with tarfile.open(fileobj=tar_buffer, mode="w") as tar: + for file_path in saved_files: + arcname = f"scraped_data/{file_path.name}" + tar.add(file_path, arcname=arcname) + + tar_buffer.seek(0) + + # Create directory in container first + container.exec_run("mkdir -p /workspace/scraped_data") + + # Copy files + container.put_archive("/workspace", tar_buffer.getvalue()) + + # Fix permissions + container.exec_run("chown -R pentester:pentester /workspace/scraped_data") + + logger.info(f"Copied {len(saved_files)} scraped files to sandbox /workspace/scraped_data") + + except Exception as e: + logger.warning(f"Failed to copy scraped data to sandbox: {e}") + + except Exception as e: + logger.warning(f"Failed to save scraped data: {e}") + + +def _perplexity_search(query: str, api_key: str) -> dict[str, Any]: + """Execute search via Perplexity API.""" + url = "https://api.perplexity.ai/chat/completions" + headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"} + + payload = { + "model": "sonar-reasoning", + "messages": [ + {"role": "system", "content": SYSTEM_PROMPT_PERPLEXITY}, + {"role": "user", "content": query}, + ], + } + + response = requests.post(url, headers=headers, json=payload, timeout=300) + response.raise_for_status() + + response_data = response.json() + content = response_data["choices"][0]["message"]["content"] + return { + "success": True, + "query": query, + "content": content, + "message": "Web search completed successfully via Perplexity", + } + + @register_tool(sandbox_execution=False) -def web_search(query: str) -> dict[str, Any]: +def web_search(query: str, agent_state: Any | None = None) -> dict[str, Any]: try: - api_key = os.getenv("PERPLEXITY_API_KEY") - if not api_key: + # Prioritize Perplexity if available + 
perplexity_key = os.getenv("PERPLEXITY_API_KEY")
+        if perplexity_key:
+            return _perplexity_search(query, perplexity_key)
+
+        firecrawl_key = os.getenv("FIRECRAWL_API_KEY")
+        if not firecrawl_key:
+            return {
+                "success": False,
+                "message": "Neither PERPLEXITY_API_KEY nor FIRECRAWL_API_KEY environment variables are set. Please configure one to use web search.",
+                "results": [],
+            }
+
+        # 1. Search and Crawl with Firecrawl
+        try:
+            search_data = _firecrawl_search(query, firecrawl_key)
+        except Exception as e:
+            logger.error(f"Firecrawl search failed: {e}")
             return {
                 "success": False,
-                "message": "PERPLEXITY_API_KEY environment variable not set",
+                "message": f"Search failed: {e}",
+                "results": []
+            }
+
+        if not search_data.get("success") or not search_data.get("data"):
+            return {
+                "success": False,
+                "message": "No results found.",
                 "results": [],
             }
 
-        url = "https://api.perplexity.ai/chat/completions"
-        headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
+        # 1.5 Save Scraped Data
+        _save_scraped_data(search_data["data"], query, agent_state)
 
-        payload = {
-            "model": "sonar-reasoning",
-            "messages": [
-                {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": query},
-            ],
-        }
+        # 2. Prepare Context for LLM
+        context_parts = []
+        for item in search_data["data"]:
+            title = item.get("title", "No Title")
+            url = item.get("url", "No URL")
+            markdown = item.get("markdown", "")
+            # Truncate very long pages to avoid token limits, though modern models handle large context well.
+            # 15k chars is a safe starting heuristic per page for 5 pages.
+ markdown_snippet = markdown[:15000] - response = requests.post(url, headers=headers, json=payload, timeout=300) - response.raise_for_status() + context_parts.append(f"Source: {title} ({url})\n\nContent:\n{markdown_snippet}\n---") - response_data = response.json() - content = response_data["choices"][0]["message"]["content"] + full_context = "\n".join(context_parts) - except requests.exceptions.Timeout: - return {"success": False, "message": "Request timed out", "results": []} - except requests.exceptions.RequestException as e: - return {"success": False, "message": f"API request failed: {e!s}", "results": []} - except KeyError as e: - return { - "success": False, - "message": f"Unexpected API response format: missing {e!s}", - "results": [], + # 3. Synthesize with LLM + llm_model = os.getenv("STRIX_LLM", "openai/gpt-4o") + llm_api_key = os.getenv("LLM_API_KEY") + llm_api_base = os.getenv("LLM_API_BASE") + + messages = [ + {"role": "system", "content": SYSTEM_PROMPT_FIRECRAWL}, + {"role": "user", "content": f"User Query: {query}\n\nSearch Results:\n{full_context}"} + ] + + completion_kwargs = { + "model": llm_model, + "messages": messages, + "timeout": 120, } - except Exception as e: # noqa: BLE001 + if llm_api_key: + completion_kwargs["api_key"] = llm_api_key + if llm_api_base: + completion_kwargs["api_base"] = llm_api_base + + response = litellm.completion(**completion_kwargs) + content = response.choices[0].message.content + + except Exception as e: + logger.exception("Web search synthesis failed") return {"success": False, "message": f"Web search failed: {e!s}", "results": []} else: return { "success": True, "query": query, "content": content, - "message": "Web search completed successfully", - } + "message": "Web search completed successfully via Firecrawl", + } \ No newline at end of file diff --git a/strix/tools/web_search/web_search_actions_schema.xml b/strix/tools/web_search/web_search_actions_schema.xml index 993f4e97..1b33350e 100644 --- 
a/strix/tools/web_search/web_search_actions_schema.xml +++ b/strix/tools/web_search/web_search_actions_schema.xml @@ -1,6 +1,6 @@ - Search the web using Perplexity AI for real-time information and current events. + Search the web using Perplexity AI or Firecrawl for real-time information and current events. This is your PRIMARY research tool - use it extensively and liberally for: - Current vulnerabilities, CVEs, and security advisories @@ -14,7 +14,7 @@ This is your PRIMARY research tool - use it extensively and liberally for: - Security conference talks and research papers The tool provides intelligent, contextual responses with current information that may not be in your training data. Use it early and often during security assessments to gather the most up-to-date factual information. -
This tool leverages Perplexity AI's sonar-reasoning model to search the web and provide intelligent, contextual responses to queries. It's essential for effective cybersecurity work as it provides access to the latest vulnerabilities, attack vectors, security tools, and defensive techniques. The AI understands security context and can synthesize information from multiple sources.
+
This tool leverages either Perplexity AI's sonar-reasoning model or Firecrawl's deep search and scraping capabilities to search the web and provide intelligent, contextual responses to queries. It's essential for effective cybersecurity work as it provides access to the latest vulnerabilities, attack vectors, security tools, and defensive techniques. The AI understands security context and can synthesize information from multiple sources.
The search query or question you want to research. Be specific and include relevant technical terms, version numbers, or context for better results. Make it as detailed as possible, with the context of the current security assessment.