diff --git a/.github/workflows/pr-status-gate.yml b/.github/workflows/pr-status-gate.yml index 4220cb644b..f79eb16936 100644 --- a/.github/workflows/pr-status-gate.yml +++ b/.github/workflows/pr-status-gate.yml @@ -2,7 +2,7 @@ name: PR Status Gate on: workflow_run: - workflows: [CI, Lint, Quality Security, CLA Assistant, Quality Commit Lint] + workflows: [CI, Lint, Quality Security] types: [completed] permissions: @@ -36,7 +36,7 @@ jobs: // To find check names: Go to PR → Checks tab → copy exact name // To update: Edit this list when workflow jobs are added/renamed/removed // - // Last validated: 2025-12-26 + // Last validated: 2025-12-31 // ═══════════════════════════════════════════════════════════════════════ const requiredChecks = [ // CI workflow (ci.yml) - 3 checks @@ -49,11 +49,7 @@ jobs: 'Quality Security / CodeQL (javascript-typescript) (pull_request)', 'Quality Security / CodeQL (python) (pull_request)', 'Quality Security / Python Security (Bandit) (pull_request)', - 'Quality Security / Security Summary (pull_request)', - // CLA Assistant workflow (cla.yml) - 1 check - 'CLA Assistant / CLA Check', - // Quality Commit Lint workflow (quality-commit-lint.yml) - 1 check - 'Quality Commit Lint / Conventional Commits (pull_request)' + 'Quality Security / Security Summary (pull_request)' ]; const statusLabels = { diff --git a/.husky/pre-commit b/.husky/pre-commit index 4dca91b56e..05dd657f3c 100755 --- a/.husky/pre-commit +++ b/.husky/pre-commit @@ -36,14 +36,44 @@ if git diff --cached --name-only | grep -q "^package.json$"; then echo " Updated apps/backend/__init__.py to $VERSION" fi - # Sync to README.md + # Sync to README.md - section-aware updates (stable vs beta) if [ -f "README.md" ]; then # Escape hyphens for shields.io badge format (shields.io uses -- for literal hyphens) ESCAPED_VERSION=$(echo "$VERSION" | sed 's/-/--/g') - # Update version badge - match both stable (X.Y.Z) and prerelease (X.Y.Z-prerelease.N or X.Y.Z--prerelease.N) - sed -i.bak "s/version-[0-9]*\.[0-9]*\.[0-9]*\(-\{1,2\}[a-z]*\.[0-9]*\)*-blue/version-$ESCAPED_VERSION-blue/g" README.md - # Update download links - match both stable and prerelease versions - sed -i.bak "s/Auto-Claude-[0-9]*\.[0-9]*\.[0-9]*\(-[a-z]*\.[0-9]*\)*/Auto-Claude-$VERSION/g" README.md + + # Detect if this is a prerelease (contains - after base version, e.g., 2.7.2-beta.10) + if echo "$VERSION" | grep -q '-'; then + # PRERELEASE: Update only beta sections + echo " Detected PRERELEASE version: $VERSION" + + # Update beta version badge (orange) + sed -i.bak "s/beta-[0-9]*\.[0-9]*\.[0-9]*\(--[a-z]*\.[0-9]*\)*-orange/beta-$ESCAPED_VERSION-orange/g" README.md + + # Update beta version badge link (within BETA_VERSION_BADGE section) + sed -i.bak '//,//s|releases/tag/v[0-9.a-z-]*)|releases/tag/v'"$VERSION"')|g' README.md + + # Update beta download links (within BETA_DOWNLOADS section only) + for SUFFIX in "win32-x64.exe" "darwin-arm64.dmg" "darwin-x64.dmg" "linux-x86_64.AppImage" "linux-amd64.deb" "linux-x86_64.flatpak"; do + sed -i.bak '//,//{s|Auto-Claude-[0-9.a-z-]*-'"$SUFFIX"'](https://github.com/AndyMik90/Auto-Claude/releases/download/v[^/]*/Auto-Claude-[^)]*-'"$SUFFIX"')|Auto-Claude-'"$VERSION"'-'"$SUFFIX"'](https://github.com/AndyMik90/Auto-Claude/releases/download/v'"$VERSION"'/Auto-Claude-'"$VERSION"'-'"$SUFFIX"')|g}' README.md + done + else + # STABLE: Update stable sections and top badge + echo " Detected STABLE version: $VERSION" + + # Update top version badge (blue) - within TOP_VERSION_BADGE section + sed -i.bak 
'//,//s/version-[0-9]*\.[0-9]*\.[0-9]*\(--[a-z]*\.[0-9]*\)*-blue/version-'"$ESCAPED_VERSION"'-blue/g' README.md + sed -i.bak '//,//s|releases/tag/v[0-9.a-z-]*)|releases/tag/v'"$VERSION"')|g' README.md + + # Update stable version badge (blue) - within STABLE_VERSION_BADGE section + sed -i.bak '//,//s/stable-[0-9]*\.[0-9]*\.[0-9]*\(--[a-z]*\.[0-9]*\)*-blue/stable-'"$ESCAPED_VERSION"'-blue/g' README.md + sed -i.bak '//,//s|releases/tag/v[0-9.a-z-]*)|releases/tag/v'"$VERSION"')|g' README.md + + # Update stable download links (within STABLE_DOWNLOADS section only) + for SUFFIX in "win32-x64.exe" "darwin-arm64.dmg" "darwin-x64.dmg" "linux-x86_64.AppImage" "linux-amd64.deb"; do + sed -i.bak '//,//{s|Auto-Claude-[0-9.a-z-]*-'"$SUFFIX"'](https://github.com/AndyMik90/Auto-Claude/releases/download/v[^/]*/Auto-Claude-[^)]*-'"$SUFFIX"')|Auto-Claude-'"$VERSION"'-'"$SUFFIX"'](https://github.com/AndyMik90/Auto-Claude/releases/download/v'"$VERSION"'/Auto-Claude-'"$VERSION"'-'"$SUFFIX"')|g}' README.md + done + fi + rm -f README.md.bak git add README.md echo " Updated README.md to $VERSION" @@ -72,20 +102,25 @@ if git diff --cached --name-only | grep -q "^apps/backend/.*\.py$"; then fi if [ -n "$RUFF" ]; then - # Run ruff linting (auto-fix) - echo "Running ruff lint..." - $RUFF check apps/backend/ --fix - if [ $? -ne 0 ]; then - echo "Ruff lint failed. Please fix Python linting errors before committing." - exit 1 + # Get only staged Python files in apps/backend (process only what's being committed) + STAGED_PY_FILES=$(git diff --cached --name-only --diff-filter=ACM | grep "^apps/backend/.*\.py$" || true) + + if [ -n "$STAGED_PY_FILES" ]; then + # Run ruff linting (auto-fix) only on staged files + echo "Running ruff lint on staged files..." + echo "$STAGED_PY_FILES" | xargs $RUFF check --fix + if [ $? -ne 0 ]; then + echo "Ruff lint failed. Please fix Python linting errors before committing." + exit 1 + fi + + # Run ruff format (auto-fix) only on staged files + echo "Running ruff format on staged files..." + echo "$STAGED_PY_FILES" | xargs $RUFF format + + # Re-stage only the files that were originally staged (in case ruff modified them) + echo "$STAGED_PY_FILES" | xargs git add fi - - # Run ruff format (auto-fix) - echo "Running ruff format..." - $RUFF format apps/backend/ - - # Stage any files that were auto-fixed by ruff (POSIX-compliant) - find apps/backend -name "*.py" -type f -exec git add {} + 2>/dev/null || true else echo "Warning: ruff not found, skipping Python linting. 
Install with: uv pip install ruff" fi diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ac44d62a3a..f67b77c813 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,14 +25,41 @@ repos: # Sync to apps/backend/__init__.py sed -i.bak "s/__version__ = \"[^\"]*\"/__version__ = \"$VERSION\"/" apps/backend/__init__.py && rm -f apps/backend/__init__.py.bak - # Sync to README.md - shields.io version badge (text and URL) + # Sync to README.md - section-aware updates (stable vs beta) ESCAPED_VERSION=$(echo "$VERSION" | sed 's/-/--/g') - sed -i.bak -e "s/version-[0-9]*\.[0-9]*\.[0-9]*\(-\{1,2\}[a-z]*\.[0-9]*\)*-blue/version-$ESCAPED_VERSION-blue/g" -e "s|releases/tag/v[0-9.a-z-]*)|releases/tag/v$VERSION)|g" README.md - # Sync to README.md - download links with correct filenames and URLs - for SUFFIX in "win32-x64.exe" "darwin-arm64.dmg" "darwin-x64.dmg" "linux-x86_64.AppImage" "linux-amd64.deb"; do - sed -i.bak "s|Auto-Claude-[0-9.a-z-]*-${SUFFIX}](https://github.com/AndyMik90/Auto-Claude/releases/download/v[^/]*/Auto-Claude-[^)]*-${SUFFIX})|Auto-Claude-${VERSION}-${SUFFIX}](https://github.com/AndyMik90/Auto-Claude/releases/download/v${VERSION}/Auto-Claude-${VERSION}-${SUFFIX})|g" README.md - done + # Detect if this is a prerelease (contains - after base version) + if echo "$VERSION" | grep -q '-'; then + # PRERELEASE: Update only beta sections + echo " Detected PRERELEASE version: $VERSION" + + # Update beta version badge (orange) + sed -i.bak "s/beta-[0-9]*\.[0-9]*\.[0-9]*\(--[a-z]*\.[0-9]*\)*-orange/beta-$ESCAPED_VERSION-orange/g" README.md + + # Update beta version badge link + sed -i.bak '//,//s|releases/tag/v[0-9.a-z-]*)|releases/tag/v'"$VERSION"')|g' README.md + + # Update beta download links (within BETA_DOWNLOADS section only) + for SUFFIX in "win32-x64.exe" "darwin-arm64.dmg" "darwin-x64.dmg" "linux-x86_64.AppImage" "linux-amd64.deb" "linux-x86_64.flatpak"; do + sed -i.bak '//,//{s|Auto-Claude-[0-9.a-z-]*-'"$SUFFIX"'](https://github.com/AndyMik90/Auto-Claude/releases/download/v[^/]*/Auto-Claude-[^)]*-'"$SUFFIX"')|Auto-Claude-'"$VERSION"'-'"$SUFFIX"'](https://github.com/AndyMik90/Auto-Claude/releases/download/v'"$VERSION"'/Auto-Claude-'"$VERSION"'-'"$SUFFIX"')|g}' README.md + done + else + # STABLE: Update stable sections and top badge + echo " Detected STABLE version: $VERSION" + + # Update top version badge (blue) + sed -i.bak '//,//s/version-[0-9]*\.[0-9]*\.[0-9]*\(--[a-z]*\.[0-9]*\)*-blue/version-'"$ESCAPED_VERSION"'-blue/g' README.md + sed -i.bak '//,//s|releases/tag/v[0-9.a-z-]*)|releases/tag/v'"$VERSION"')|g' README.md + + # Update stable version badge (blue) + sed -i.bak '//,//s/stable-[0-9]*\.[0-9]*\.[0-9]*\(--[a-z]*\.[0-9]*\)*-blue/stable-'"$ESCAPED_VERSION"'-blue/g' README.md + sed -i.bak '//,//s|releases/tag/v[0-9.a-z-]*)|releases/tag/v'"$VERSION"')|g' README.md + + # Update stable download links (within STABLE_DOWNLOADS section only) + for SUFFIX in "win32-x64.exe" "darwin-arm64.dmg" "darwin-x64.dmg" "linux-x86_64.AppImage" "linux-amd64.deb"; do + sed -i.bak '//,//{s|Auto-Claude-[0-9.a-z-]*-'"$SUFFIX"'](https://github.com/AndyMik90/Auto-Claude/releases/download/v[^/]*/Auto-Claude-[^)]*-'"$SUFFIX"')|Auto-Claude-'"$VERSION"'-'"$SUFFIX"'](https://github.com/AndyMik90/Auto-Claude/releases/download/v'"$VERSION"'/Auto-Claude-'"$VERSION"'-'"$SUFFIX"')|g}' README.md + done + fi rm -f README.md.bak # Stage changes diff --git a/apps/backend/__init__.py b/apps/backend/__init__.py index 72e597cd0a..106d44d4a2 100644 --- a/apps/backend/__init__.py +++ 
b/apps/backend/__init__.py @@ -19,5 +19,5 @@ See README.md for full documentation. """ -__version__ = "2.7.2-beta.10" +__version__ = "2.7.2-beta.12" __author__ = "Auto Claude Team" diff --git a/apps/backend/agents/session.py b/apps/backend/agents/session.py index 39ae045893..89a5d5d48c 100644 --- a/apps/backend/agents/session.py +++ b/apps/backend/agents/session.py @@ -21,6 +21,7 @@ is_build_complete, ) from recovery import RecoveryManager +from security.tool_input_validator import get_safe_tool_input from task_logger import ( LogEntryType, LogPhase, @@ -386,41 +387,43 @@ async def run_agent_session( ) elif block_type == "ToolUseBlock" and hasattr(block, "name"): tool_name = block.name - tool_input = None + tool_input_display = None tool_count += 1 + # Safely extract tool input (handles None, non-dict, etc.) + inp = get_safe_tool_input(block) + # Extract meaningful tool input for display - if hasattr(block, "input") and block.input: - inp = block.input - if isinstance(inp, dict): - if "pattern" in inp: - tool_input = f"pattern: {inp['pattern']}" - elif "file_path" in inp: - fp = inp["file_path"] - if len(fp) > 50: - fp = "..." + fp[-47:] - tool_input = fp - elif "command" in inp: - cmd = inp["command"] - if len(cmd) > 50: - cmd = cmd[:47] + "..." - tool_input = cmd - elif "path" in inp: - tool_input = inp["path"] + if inp: + if "pattern" in inp: + tool_input_display = f"pattern: {inp['pattern']}" + elif "file_path" in inp: + fp = inp["file_path"] + if len(fp) > 50: + fp = "..." + fp[-47:] + tool_input_display = fp + elif "command" in inp: + cmd = inp["command"] + if len(cmd) > 50: + cmd = cmd[:47] + "..." + tool_input_display = cmd + elif "path" in inp: + tool_input_display = inp["path"] debug( "session", f"Tool call #{tool_count}: {tool_name}", - tool_input=tool_input, - full_input=str(block.input)[:500] - if hasattr(block, "input") - else None, + tool_input=tool_input_display, + full_input=str(inp)[:500] if inp else None, ) # Log tool start (handles printing too) if task_logger: task_logger.tool_start( - tool_name, tool_input, phase, print_to_console=True + tool_name, + tool_input_display, + phase, + print_to_console=True, ) else: print(f"\n[Tool: {tool_name}]", flush=True) diff --git a/apps/backend/agents/tools_pkg/models.py b/apps/backend/agents/tools_pkg/models.py index c1e5df5714..44d780d990 100644 --- a/apps/backend/agents/tools_pkg/models.py +++ b/apps/backend/agents/tools_pkg/models.py @@ -216,8 +216,9 @@ def is_electron_mcp_enabled() -> bool: # QA PHASES (Read + test + browser + Graphiti memory) # ═══════════════════════════════════════════════════════════════════════ "qa_reviewer": { - # Read-only + Bash (for running tests) - reviewer should NOT edit code - "tools": BASE_READ_TOOLS + ["Bash"] + WEB_TOOLS, + # Read + Write/Edit (for QA reports and plan updates) + Bash (for tests) + # Note: Reviewer writes to spec directory only (qa_report.md, implementation_plan.json) + "tools": BASE_READ_TOOLS + BASE_WRITE_TOOLS + WEB_TOOLS, "mcp_servers": ["context7", "graphiti", "auto-claude", "browser"], "mcp_servers_optional": ["linear"], # For updating issue status "auto_claude_tools": [ diff --git a/apps/backend/prompts/github/pr_finding_validator.md b/apps/backend/prompts/github/pr_finding_validator.md new file mode 100644 index 0000000000..b054344ea9 --- /dev/null +++ b/apps/backend/prompts/github/pr_finding_validator.md @@ -0,0 +1,158 @@ +# Finding Validator Agent + +You are a finding re-investigator. 
For each unresolved finding from a previous PR review, you must actively investigate whether it is a REAL issue or a FALSE POSITIVE. + +Your job is to prevent false positives from persisting indefinitely by actually reading the code and verifying the issue exists. + +## Your Mission + +For each finding you receive: +1. **READ** the actual code at the file/line location using the Read tool +2. **ANALYZE** whether the described issue actually exists in the code +3. **PROVIDE** concrete code evidence for your conclusion +4. **RETURN** validation status with evidence + +## Investigation Process + +### Step 1: Fetch the Code + +Use the Read tool to get the actual code at `finding.file` around `finding.line`. +Get sufficient context (±20 lines minimum). + +``` +Read the file: {finding.file} +Focus on lines around: {finding.line} +``` + +### Step 2: Analyze with Fresh Eyes + +**Do NOT assume the original finding is correct.** Ask yourself: +- Does the code ACTUALLY have this issue? +- Is the described vulnerability/bug/problem present? +- Could the original reviewer have misunderstood the code? +- Is there context that makes this NOT an issue (e.g., sanitization elsewhere)? + +Be skeptical. The original review may have hallucinated this finding. + +### Step 3: Document Evidence + +You MUST provide concrete evidence: +- **Exact code snippet** you examined (copy-paste from the file) +- **Line numbers** where you found (or didn't find) the issue +- **Your analysis** of whether the issue exists +- **Confidence level** (0.0-1.0) in your conclusion + +## Validation Statuses + +### `confirmed_valid` +Use when you verify the issue IS real: +- The problematic code pattern exists exactly as described +- The vulnerability/bug is present and exploitable +- The code quality issue genuinely impacts the codebase + +### `dismissed_false_positive` +Use when you verify the issue does NOT exist: +- The described code pattern is not actually present +- The original finding misunderstood the code +- There is mitigating code that prevents the issue (e.g., input validation elsewhere) +- The finding was based on incorrect assumptions + +### `needs_human_review` +Use when you cannot determine with confidence: +- The issue requires runtime analysis to verify +- The code is too complex to analyze statically +- You have conflicting evidence +- Your confidence is below 0.70 + +## Output Format + +Return one result per finding: + +```json +{ + "finding_id": "SEC-001", + "validation_status": "confirmed_valid", + "code_evidence": "const query = `SELECT * FROM users WHERE id = ${userId}`;", + "line_range": [45, 45], + "explanation": "SQL injection vulnerability confirmed. User input 'userId' is directly interpolated into the SQL query at line 45 without any sanitization. The query is executed via db.execute() on line 46.", + "confidence": 0.95 +} +``` + +```json +{ + "finding_id": "QUAL-002", + "validation_status": "dismissed_false_positive", + "code_evidence": "function processInput(data: string): string {\n const sanitized = DOMPurify.sanitize(data);\n return sanitized;\n}", + "line_range": [23, 26], + "explanation": "The original finding claimed XSS vulnerability, but the code uses DOMPurify.sanitize() before output. 
The input is properly sanitized at line 24 before being returned.", + "confidence": 0.88 +} +``` + +```json +{ + "finding_id": "LOGIC-003", + "validation_status": "needs_human_review", + "code_evidence": "async function handleRequest(req) {\n // Complex async logic...\n}", + "line_range": [100, 150], + "explanation": "The original finding claims a race condition, but verifying this requires understanding the runtime behavior and concurrency model. Cannot determine statically.", + "confidence": 0.45 +} +``` + +## Confidence Guidelines + +Rate your confidence based on how certain you are: + +| Confidence | Meaning | +|------------|---------| +| 0.90-1.00 | Definitive evidence - code clearly shows the issue exists/doesn't exist | +| 0.80-0.89 | Strong evidence - high confidence with minor uncertainty | +| 0.70-0.79 | Moderate evidence - likely correct but some ambiguity | +| 0.50-0.69 | Uncertain - use `needs_human_review` | +| Below 0.50 | Insufficient evidence - must use `needs_human_review` | + +**Minimum thresholds:** +- To confirm as `confirmed_valid`: confidence >= 0.70 +- To dismiss as `dismissed_false_positive`: confidence >= 0.80 (higher bar for dismissal) +- If below thresholds: must use `needs_human_review` + +## Common False Positive Patterns + +Watch for these patterns that often indicate false positives: + +1. **Sanitization elsewhere**: Input is validated/sanitized before reaching the flagged code +2. **Internal-only code**: Code only handles trusted internal data, not user input +3. **Framework protection**: Framework provides automatic protection (e.g., ORM parameterization) +4. **Dead code**: The flagged code is never executed in the current codebase +5. **Test code**: The issue is in test files where it's acceptable +6. **Misread syntax**: Original reviewer misunderstood the language syntax + +## Common Valid Issue Patterns + +These patterns often confirm the issue is real: + +1. **Direct string concatenation** in SQL/commands with user input +2. **Missing null checks** where null values can flow through +3. **Hardcoded credentials** that are actually used (not examples) +4. **Missing error handling** in critical paths +5. **Race conditions** with clear concurrent access + +## Critical Rules + +1. **ALWAYS read the actual code** - Never rely on memory or the original finding description +2. **ALWAYS provide code_evidence** - No empty strings. Quote the actual code. +3. **Be skeptical of original findings** - Many AI reviews produce false positives +4. **Higher bar for dismissal** - Need 0.80 confidence to dismiss (vs 0.70 to confirm) +5. **When uncertain, escalate** - Use `needs_human_review` rather than guessing +6. **Look for mitigations** - Check surrounding code for sanitization/validation +7. 
**Check the full context** - Read ±20 lines, not just the flagged line + +## Anti-Patterns to Avoid + +- **Trusting the original finding blindly** - Always verify +- **Dismissing without reading code** - Must provide code_evidence +- **Low confidence dismissals** - Needs 0.80+ confidence to dismiss +- **Vague explanations** - Be specific about what you found +- **Missing line numbers** - Always include line_range diff --git a/apps/backend/prompts/github/pr_followup_orchestrator.md b/apps/backend/prompts/github/pr_followup_orchestrator.md index 39851d864e..da2ee6b97a 100644 --- a/apps/backend/prompts/github/pr_followup_orchestrator.md +++ b/apps/backend/prompts/github/pr_followup_orchestrator.md @@ -35,6 +35,20 @@ You have access to these specialist agents via the Task tool: - Flags concerns that need addressing - **Invoke when**: There are comments or reviews since last review +### 4. finding-validator (CRITICAL - Prevent False Positives) +**Use for**: Re-investigating unresolved findings to validate they are real issues +- Reads the ACTUAL CODE at the finding location with fresh eyes +- Actively investigates whether the described issue truly exists +- Can DISMISS findings as false positives if original review was incorrect +- Can CONFIRM findings as valid if issue is genuine +- Requires concrete CODE EVIDENCE for any conclusion +- **ALWAYS invoke after resolution-verifier for ALL unresolved findings** +- **Invoke when**: There are findings still marked as unresolved + +**Why this is critical**: Initial reviews may produce false positives (hallucinated issues). +Without validation, these persist indefinitely. This agent prevents that by actually +examining the code and determining if the issue is real. + ## Workflow ### Phase 1: Analyze Scope @@ -50,6 +64,9 @@ Based on your analysis, invoke the appropriate agents: **Always invoke** `resolution-verifier` if there are previous findings. +**ALWAYS invoke** `finding-validator` for ALL unresolved findings from resolution-verifier. +This is CRITICAL to prevent false positives from persisting. + **Invoke** `new-code-reviewer` if: - Diff is substantial (>50 lines) - Changes touch security-sensitive areas @@ -61,17 +78,28 @@ Based on your analysis, invoke the appropriate agents: - There are AI tool reviews to triage - Questions remain unanswered -### Phase 3: Synthesize Results -After agents complete: +### Phase 3: Validate Unresolved Findings +After resolution-verifier returns findings marked as unresolved: +1. Pass ALL unresolved findings to finding-validator +2. finding-validator will read the actual code at each location +3. For each finding, it returns: + - `confirmed_valid`: Issue IS real → keep as unresolved + - `dismissed_false_positive`: Original finding was WRONG → remove from findings + - `needs_human_review`: Cannot determine → flag for human + +### Phase 4: Synthesize Results +After all agents complete: 1. Combine resolution verifications -2. Merge new findings (deduplicate if needed) -3. Incorporate comment analysis -4. Generate final verdict +2. Apply validation results (remove dismissed false positives) +3. Merge new findings (deduplicate if needed) +4. Incorporate comment analysis +5. 
Generate final verdict based on VALIDATED findings only ## Verdict Guidelines ### READY_TO_MERGE -- All previous findings verified as resolved +- All previous findings verified as resolved OR dismissed as false positives +- No CONFIRMED_VALID critical/high issues remaining - No new critical/high issues - No blocking concerns from comments - Contributor questions addressed @@ -82,15 +110,17 @@ After agents complete: - Optional polish items can be addressed post-merge ### NEEDS_REVISION (Strict Quality Gates) -- HIGH or MEDIUM severity findings unresolved +- HIGH or MEDIUM severity findings CONFIRMED_VALID (not dismissed as false positive) - New HIGH or MEDIUM severity issues introduced - Important contributor concerns unaddressed - **Note: Both HIGH and MEDIUM block merge** (AI fixes quickly, so be strict) +- **Note: Only count findings that passed validation** (dismissed_false_positive findings don't block) ### BLOCKED -- CRITICAL findings remain unresolved +- CRITICAL findings remain CONFIRMED_VALID (not dismissed as false positive) - New CRITICAL issues introduced - Fundamental problems with the fix approach +- **Note: Only block for findings that passed validation** ## Cross-Validation @@ -106,10 +136,28 @@ Provide your synthesis as a structured response matching the ParallelFollowupRes ```json { "analysis_summary": "Brief summary of what was analyzed", - "agents_invoked": ["resolution-verifier", "new-code-reviewer"], + "agents_invoked": ["resolution-verifier", "finding-validator", "new-code-reviewer"], "commits_analyzed": 5, "files_changed": 12, "resolution_verifications": [...], + "finding_validations": [ + { + "finding_id": "SEC-001", + "validation_status": "confirmed_valid", + "code_evidence": "const query = `SELECT * FROM users WHERE id = ${userId}`;", + "line_range": [45, 45], + "explanation": "SQL injection is present - user input is concatenated...", + "confidence": 0.92 + }, + { + "finding_id": "QUAL-002", + "validation_status": "dismissed_false_positive", + "code_evidence": "const sanitized = DOMPurify.sanitize(data);", + "line_range": [23, 26], + "explanation": "Original finding claimed XSS but code uses DOMPurify...", + "confidence": 0.88 + } + ], "new_findings": [...], "comment_analyses": [...], "comment_findings": [...], @@ -119,7 +167,7 @@ Provide your synthesis as a structured response matching the ParallelFollowupRes "resolution_notes": null }, "verdict": "READY_TO_MERGE", - "verdict_reasoning": "All 3 previous findings verified as resolved..." + "verdict_reasoning": "2 findings resolved, 1 dismissed as false positive, 1 confirmed valid but LOW severity..." } ``` diff --git a/apps/backend/qa/fixer.py b/apps/backend/qa/fixer.py index 0901c9d285..d17f97d11b 100644 --- a/apps/backend/qa/fixer.py +++ b/apps/backend/qa/fixer.py @@ -9,6 +9,7 @@ from claude_agent_sdk import ClaudeSDKClient from debug import debug, debug_detailed, debug_error, debug_section, debug_success +from security.tool_input_validator import get_safe_tool_input from task_logger import ( LogEntryType, LogPhase, @@ -128,34 +129,35 @@ async def run_qa_fixer_session( ) elif block_type == "ToolUseBlock" and hasattr(block, "name"): tool_name = block.name - tool_input = None + tool_input_display = None tool_count += 1 - if hasattr(block, "input") and block.input: - inp = block.input - if isinstance(inp, dict): - if "file_path" in inp: - fp = inp["file_path"] - if len(fp) > 50: - fp = "..." + fp[-47:] - tool_input = fp - elif "command" in inp: - cmd = inp["command"] - if len(cmd) > 50: - cmd = cmd[:47] + "..." 
- tool_input = cmd + # Safely extract tool input (handles None, non-dict, etc.) + inp = get_safe_tool_input(block) + + if inp: + if "file_path" in inp: + fp = inp["file_path"] + if len(fp) > 50: + fp = "..." + fp[-47:] + tool_input_display = fp + elif "command" in inp: + cmd = inp["command"] + if len(cmd) > 50: + cmd = cmd[:47] + "..." + tool_input_display = cmd debug( "qa_fixer", f"Tool call #{tool_count}: {tool_name}", - tool_input=tool_input, + tool_input=tool_input_display, ) # Log tool start (handles printing) if task_logger: task_logger.tool_start( tool_name, - tool_input, + tool_input_display, LogPhase.VALIDATION, print_to_console=True, ) diff --git a/apps/backend/qa/reviewer.py b/apps/backend/qa/reviewer.py index 49de9577fa..3d06a06c03 100644 --- a/apps/backend/qa/reviewer.py +++ b/apps/backend/qa/reviewer.py @@ -11,6 +11,7 @@ from claude_agent_sdk import ClaudeSDKClient from debug import debug, debug_detailed, debug_error, debug_section, debug_success from prompts_pkg import get_qa_reviewer_prompt +from security.tool_input_validator import get_safe_tool_input from task_logger import ( LogEntryType, LogPhase, @@ -195,32 +196,33 @@ async def run_qa_agent_session( ) elif block_type == "ToolUseBlock" and hasattr(block, "name"): tool_name = block.name - tool_input = None + tool_input_display = None tool_count += 1 + # Safely extract tool input (handles None, non-dict, etc.) + inp = get_safe_tool_input(block) + # Extract tool input for display - if hasattr(block, "input") and block.input: - inp = block.input - if isinstance(inp, dict): - if "file_path" in inp: - fp = inp["file_path"] - if len(fp) > 50: - fp = "..." + fp[-47:] - tool_input = fp - elif "pattern" in inp: - tool_input = f"pattern: {inp['pattern']}" + if inp: + if "file_path" in inp: + fp = inp["file_path"] + if len(fp) > 50: + fp = "..." + fp[-47:] + tool_input_display = fp + elif "pattern" in inp: + tool_input_display = f"pattern: {inp['pattern']}" debug( "qa_reviewer", f"Tool call #{tool_count}: {tool_name}", - tool_input=tool_input, + tool_input=tool_input_display, ) # Log tool start (handles printing) if task_logger: task_logger.tool_start( tool_name, - tool_input, + tool_input_display, LogPhase.VALIDATION, print_to_console=True, ) diff --git a/apps/backend/runners/github/gh_client.py b/apps/backend/runners/github/gh_client.py index d6e7ddbdc1..942aefa2b4 100644 --- a/apps/backend/runners/github/gh_client.py +++ b/apps/backend/runners/github/gh_client.py @@ -810,3 +810,65 @@ async def get_pr_head_sha(self, pr_number: int) -> str | None: # Last commit is the HEAD return commits[-1].get("oid") return None + + async def get_pr_checks(self, pr_number: int) -> dict[str, Any]: + """ + Get CI check runs status for a PR. + + Uses `gh pr checks` to get the status of all check runs. 
+ + Args: + pr_number: PR number + + Returns: + Dict with: + - checks: List of check runs with name, status, conclusion + - passing: Number of passing checks + - failing: Number of failing checks + - pending: Number of pending checks + - failed_checks: List of failed check names + """ + try: + args = ["pr", "checks", str(pr_number), "--json", "name,state,conclusion"] + args = self._add_repo_flag(args) + + result = await self.run(args, timeout=30.0) + checks = json.loads(result.stdout) if result.stdout.strip() else [] + + passing = 0 + failing = 0 + pending = 0 + failed_checks = [] + + for check in checks: + state = check.get("state", "").upper() + conclusion = check.get("conclusion", "").upper() + name = check.get("name", "Unknown") + + if state == "COMPLETED": + if conclusion in ("SUCCESS", "NEUTRAL", "SKIPPED"): + passing += 1 + elif conclusion in ("FAILURE", "TIMED_OUT", "CANCELLED"): + failing += 1 + failed_checks.append(name) + else: + # PENDING, QUEUED, IN_PROGRESS, etc. + pending += 1 + + return { + "checks": checks, + "passing": passing, + "failing": failing, + "pending": pending, + "failed_checks": failed_checks, + } + except (GHCommandError, GHTimeoutError, json.JSONDecodeError) as e: + logger.warning(f"Failed to get PR checks for #{pr_number}: {e}") + return { + "checks": [], + "passing": 0, + "failing": 0, + "pending": 0, + "failed_checks": [], + "error": str(e), + } diff --git a/apps/backend/runners/github/models.py b/apps/backend/runners/github/models.py index ed9f83c3e3..cb7dbe22e9 100644 --- a/apps/backend/runners/github/models.py +++ b/apps/backend/runners/github/models.py @@ -221,6 +221,14 @@ class PRReviewFinding: ) redundant_with: str | None = None # Reference to duplicate code (file:line) + # NEW: Finding validation fields (from finding-validator re-investigation) + validation_status: str | None = ( + None # confirmed_valid, dismissed_false_positive, needs_human_review + ) + validation_evidence: str | None = None # Code snippet examined during validation + validation_confidence: float | None = None # Confidence of validation (0.0-1.0) + validation_explanation: str | None = None # Why finding was validated/dismissed + def to_dict(self) -> dict: return { "id": self.id, @@ -237,6 +245,11 @@ def to_dict(self) -> dict: "confidence": self.confidence, "verification_note": self.verification_note, "redundant_with": self.redundant_with, + # Validation fields + "validation_status": self.validation_status, + "validation_evidence": self.validation_evidence, + "validation_confidence": self.validation_confidence, + "validation_explanation": self.validation_explanation, } @classmethod @@ -256,6 +269,11 @@ def from_dict(cls, data: dict) -> PRReviewFinding: confidence=data.get("confidence", 0.85), verification_note=data.get("verification_note"), redundant_with=data.get("redundant_with"), + # Validation fields + validation_status=data.get("validation_status"), + validation_evidence=data.get("validation_evidence"), + validation_confidence=data.get("validation_confidence"), + validation_explanation=data.get("validation_explanation"), ) diff --git a/apps/backend/runners/github/orchestrator.py b/apps/backend/runners/github/orchestrator.py index 5456f5b0f6..0cfb078efe 100644 --- a/apps/backend/runners/github/orchestrator.py +++ b/apps/backend/runners/github/orchestrator.py @@ -389,9 +389,17 @@ async def review_pr( pr_number=pr_number, ) - # Generate verdict + # Check CI status + ci_status = await self.gh_client.get_pr_checks(pr_number) + print( + f"[DEBUG orchestrator] CI status: 
{ci_status.get('passing', 0)} passing, " + f"{ci_status.get('failing', 0)} failing, {ci_status.get('pending', 0)} pending", + flush=True, + ) + + # Generate verdict (now includes CI status) verdict, verdict_reasoning, blockers = self._generate_verdict( - findings, structural_issues, ai_triages + findings, structural_issues, ai_triages, ci_status ) print( f"[DEBUG orchestrator] Verdict: {verdict.value} - {verdict_reasoning}", @@ -661,6 +669,37 @@ async def followup_review_pr(self, pr_number: int) -> PRReviewResult: ) result = await reviewer.review_followup(followup_context) + # Check CI status and override verdict if failing + ci_status = await self.gh_client.get_pr_checks(pr_number) + failed_checks = ci_status.get("failed_checks", []) + if failed_checks: + print( + f"[Followup] CI checks failing: {failed_checks}", + flush=True, + ) + # Override verdict if CI is failing + if result.verdict in ( + MergeVerdict.READY_TO_MERGE, + MergeVerdict.MERGE_WITH_CHANGES, + ): + result.verdict = MergeVerdict.BLOCKED + result.verdict_reasoning = ( + f"Blocked: {len(failed_checks)} CI check(s) failing. " + "Fix CI before merge." + ) + result.overall_status = "request_changes" + # Add CI failures to blockers + for check_name in failed_checks: + if f"CI Failed: {check_name}" not in result.blockers: + result.blockers.append(f"CI Failed: {check_name}") + # Update summary to reflect CI status + ci_warning = ( + f"\n\n**⚠️ CI Status:** {len(failed_checks)} check(s) failing: " + f"{', '.join(failed_checks)}" + ) + if ci_warning not in result.summary: + result.summary += ci_warning + # Save result await result.save(self.github_dir) @@ -690,13 +729,16 @@ def _generate_verdict( findings: list[PRReviewFinding], structural_issues: list[StructuralIssue], ai_triages: list[AICommentTriage], + ci_status: dict | None = None, ) -> tuple[MergeVerdict, str, list[str]]: """ - Generate merge verdict based on all findings. + Generate merge verdict based on all findings and CI status. - NEW: Strengthened to block on verification failures and redundancy issues. + NEW: Strengthened to block on verification failures, redundancy issues, + and failing CI checks. """ blockers = [] + ci_status = ci_status or {} # Count by severity critical = [f for f in findings if f.severity == ReviewSeverity.CRITICAL] @@ -733,6 +775,11 @@ def _generate_verdict( ai_critical = [t for t in ai_triages if t.verdict == AICommentVerdict.CRITICAL] # Build blockers list with NEW categories first + # CI failures block merging + failed_checks = ci_status.get("failed_checks", []) + for check_name in failed_checks: + blockers.append(f"CI Failed: {check_name}") + # NEW: Verification failures block merging for f in verification_failures: note = f" - {f.verification_note}" if f.verification_note else "" @@ -765,10 +812,17 @@ def _generate_verdict( ) blockers.append(f"{t.tool_name}: {summary}") - # Determine verdict with NEW verification and redundancy checks + # Determine verdict with CI, verification and redundancy checks if blockers: + # CI failures are always blockers + if failed_checks: + verdict = MergeVerdict.BLOCKED + reasoning = ( + f"Blocked: {len(failed_checks)} CI check(s) failing. " + "Fix CI before merge." + ) # NEW: Prioritize verification failures - if verification_failures: + elif verification_failures: verdict = MergeVerdict.BLOCKED reasoning = ( f"Blocked: Cannot verify {len(verification_failures)} claim(s) in PR. 
" diff --git a/apps/backend/runners/github/services/parallel_followup_reviewer.py b/apps/backend/runners/github/services/parallel_followup_reviewer.py index 2009301092..fb7a04365b 100644 --- a/apps/backend/runners/github/services/parallel_followup_reviewer.py +++ b/apps/backend/runners/github/services/parallel_followup_reviewer.py @@ -150,6 +150,7 @@ def _define_specialist_agents(self) -> dict[str, AgentDefinition]: resolution_prompt = self._load_prompt("pr_followup_resolution_agent.md") newcode_prompt = self._load_prompt("pr_followup_newcode_agent.md") comment_prompt = self._load_prompt("pr_followup_comment_agent.md") + validator_prompt = self._load_prompt("pr_finding_validator.md") return { "resolution-verifier": AgentDefinition( @@ -186,6 +187,20 @@ def _define_specialist_agents(self) -> dict[str, AgentDefinition]: tools=["Read", "Grep", "Glob"], model="inherit", ), + "finding-validator": AgentDefinition( + description=( + "Finding re-investigation specialist. Re-investigates unresolved findings " + "to validate they are actually real issues, not false positives. " + "Actively reads the code at the finding location with fresh eyes. " + "Can confirm findings as valid OR dismiss them as false positives. " + "CRITICAL: Invoke for ALL unresolved findings after resolution-verifier runs. " + "Invoke when: There are findings marked as unresolved that need validation." + ), + prompt=validator_prompt + or "You validate whether unresolved findings are real issues.", + tools=["Read", "Grep", "Glob"], + model="inherit", + ), } def _format_previous_findings(self, context: FollowupReviewContext) -> str: @@ -444,6 +459,11 @@ async def review(self, context: FollowupReviewContext) -> PRReviewResult: f"{len(resolved_ids)} resolved, {len(unresolved_ids)} unresolved" ) + # Extract validation counts + dismissed_count = len(result_data.get("dismissed_false_positive_ids", [])) + confirmed_count = result_data.get("confirmed_valid_count", 0) + needs_human_count = result_data.get("needs_human_review_count", 0) + # Generate summary summary = self._generate_summary( verdict=verdict, @@ -452,6 +472,9 @@ async def review(self, context: FollowupReviewContext) -> PRReviewResult: unresolved_count=len(unresolved_ids), new_count=len(new_finding_ids), agents_invoked=final_agents, + dismissed_false_positive_count=dismissed_count, + confirmed_valid_count=confirmed_count, + needs_human_review_count=needs_human_count, ) # Map verdict to overall_status @@ -545,10 +568,34 @@ def _parse_structured_output( new_finding_ids = [] # Process resolution verifications + # First, build a map of finding validations (from finding-validator agent) + validation_map = {} + dismissed_ids = [] + for fv in response.finding_validations: + validation_map[fv.finding_id] = fv + if fv.validation_status == "dismissed_false_positive": + dismissed_ids.append(fv.finding_id) + print( + f"[ParallelFollowup] Finding {fv.finding_id} DISMISSED as false positive: {fv.explanation[:100]}", + flush=True, + ) + for rv in response.resolution_verifications: if rv.status == "resolved": resolved_ids.append(rv.finding_id) elif rv.status in ("unresolved", "partially_resolved", "cant_verify"): + # Check if finding was validated and dismissed as false positive + if rv.finding_id in dismissed_ids: + # Finding-validator determined this was a false positive - skip it + print( + f"[ParallelFollowup] Skipping {rv.finding_id} - dismissed as false positive by finding-validator", + flush=True, + ) + resolved_ids.append( + rv.finding_id + ) # Count as resolved (false positive) + 
continue + # Include "cant_verify" as unresolved - if we can't verify, assume not fixed unresolved_ids.append(rv.finding_id) # Add unresolved as a finding @@ -563,6 +610,19 @@ def _parse_structured_output( None, ) if original: + # Check if we have validation evidence + validation = validation_map.get(rv.finding_id) + validation_status = None + validation_evidence = None + validation_confidence = None + validation_explanation = None + + if validation: + validation_status = validation.validation_status + validation_evidence = validation.code_evidence + validation_confidence = validation.confidence + validation_explanation = validation.explanation + findings.append( PRReviewFinding( id=rv.finding_id, @@ -574,6 +634,10 @@ def _parse_structured_output( line=original.line, suggested_fix=original.suggested_fix, fixable=original.fixable, + validation_status=validation_status, + validation_evidence=validation_evidence, + validation_confidence=validation_confidence, + validation_explanation=validation_explanation, ) ) @@ -626,6 +690,18 @@ def _parse_structured_output( } verdict = verdict_map.get(response.verdict, MergeVerdict.NEEDS_REVISION) + # Count validation results + confirmed_valid_count = sum( + 1 + for fv in response.finding_validations + if fv.validation_status == "confirmed_valid" + ) + needs_human_count = sum( + 1 + for fv in response.finding_validations + if fv.validation_status == "needs_human_review" + ) + # Log findings summary for verification print( f"[ParallelFollowup] Parsed {len(findings)} findings, " @@ -633,11 +709,22 @@ def _parse_structured_output( f"{len(new_finding_ids)} new", flush=True, ) + if dismissed_ids: + print( + f"[ParallelFollowup] Validation: {len(dismissed_ids)} findings dismissed as false positives, " + f"{confirmed_valid_count} confirmed valid, {needs_human_count} need human review", + flush=True, + ) if findings: print("[ParallelFollowup] Findings summary:", flush=True) for i, f in enumerate(findings, 1): + validation_note = "" + if f.validation_status == "confirmed_valid": + validation_note = " [VALIDATED]" + elif f.validation_status == "needs_human_review": + validation_note = " [NEEDS HUMAN REVIEW]" print( - f" [{f.severity.value.upper()}] {i}. {f.title} ({f.file}:{f.line})", + f" [{f.severity.value.upper()}] {i}. 
{f.title} ({f.file}:{f.line}){validation_note}", flush=True, ) @@ -646,6 +733,9 @@ def _parse_structured_output( "resolved_ids": resolved_ids, "unresolved_ids": unresolved_ids, "new_finding_ids": new_finding_ids, + "dismissed_false_positive_ids": dismissed_ids, + "confirmed_valid_count": confirmed_valid_count, + "needs_human_review_count": needs_human_count, "verdict": verdict, "verdict_reasoning": response.verdict_reasoning, "agents_invoked": agents_from_output, @@ -719,6 +809,9 @@ def _generate_summary( unresolved_count: int, new_count: int, agents_invoked: list[str], + dismissed_false_positive_count: int = 0, + confirmed_valid_count: int = 0, + needs_human_review_count: int = 0, ) -> str: """Generate a human-readable summary of the follow-up review.""" status_emoji = { @@ -733,13 +826,27 @@ def _generate_summary( ", ".join(agents_invoked) if agents_invoked else "orchestrator only" ) + # Build validation section if there are validation results + validation_section = "" + if ( + dismissed_false_positive_count > 0 + or confirmed_valid_count > 0 + or needs_human_review_count > 0 + ): + validation_section = f""" +### Finding Validation +- 🔍 **Dismissed as False Positives**: {dismissed_false_positive_count} findings were re-investigated and found to be incorrect +- ✓ **Confirmed Valid**: {confirmed_valid_count} findings verified as genuine issues +- 👤 **Needs Human Review**: {needs_human_review_count} findings require manual verification +""" + summary = f"""## {emoji} Follow-up Review: {verdict.value.replace("_", " ").title()} ### Resolution Status - ✅ **Resolved**: {resolved_count} previous findings addressed - ❌ **Unresolved**: {unresolved_count} previous findings remain - 🆕 **New Issues**: {new_count} new findings in recent changes - +{validation_section} ### Verdict {verdict_reasoning} @@ -747,6 +854,6 @@ def _generate_summary( Agents invoked: {agents_str} --- -*This is an AI-generated follow-up review using parallel specialist analysis.* +*This is an AI-generated follow-up review using parallel specialist analysis with finding validation.* """ return summary diff --git a/apps/backend/runners/github/services/pydantic_models.py b/apps/backend/runners/github/services/pydantic_models.py index 1bfa638343..3c91a219eb 100644 --- a/apps/backend/runners/github/services/pydantic_models.py +++ b/apps/backend/runners/github/services/pydantic_models.py @@ -591,6 +591,15 @@ class ParallelFollowupResponse(BaseModel): description="AI-verified resolution status for each previous finding", ) + # Finding validations (from finding-validator agent) + finding_validations: list[FindingValidationResult] = Field( + default_factory=list, + description=( + "Re-investigation results for unresolved findings. " + "Validates whether findings are real issues or false positives." 
+ ), + ) + # New findings (from new-code-reviewer agent) new_findings: list[ParallelFollowupFinding] = Field( default_factory=list, @@ -618,3 +627,77 @@ class ParallelFollowupResponse(BaseModel): "READY_TO_MERGE", "MERGE_WITH_CHANGES", "NEEDS_REVISION", "BLOCKED" ] = Field(description="Overall merge verdict") verdict_reasoning: str = Field(description="Explanation for the verdict") + + +# ============================================================================= +# Finding Validation Response (Re-investigation of unresolved findings) +# ============================================================================= + + +class FindingValidationResult(BaseModel): + """ + Result of re-investigating an unresolved finding to validate it's actually real. + + The finding-validator agent uses this to report whether a previous finding + is a genuine issue or a false positive that should be dismissed. + """ + + finding_id: str = Field(description="ID of the finding being validated") + validation_status: Literal[ + "confirmed_valid", "dismissed_false_positive", "needs_human_review" + ] = Field( + description=( + "Validation result: " + "confirmed_valid = issue IS real, keep as unresolved; " + "dismissed_false_positive = original finding was incorrect, remove; " + "needs_human_review = cannot determine with confidence" + ) + ) + code_evidence: str = Field( + min_length=1, + description=( + "REQUIRED: Exact code snippet examined from the file. " + "Must be actual code, not a description." + ), + ) + line_range: tuple[int, int] = Field( + description="Start and end line numbers of the examined code" + ) + explanation: str = Field( + min_length=20, + description=( + "Detailed explanation of why the finding is valid/invalid. " + "Must reference specific code and explain the reasoning." + ), + ) + confidence: float = Field( + ge=0.0, + le=1.0, + description=( + "Confidence in the validation result (0.0-1.0). " + "Must be >= 0.80 to dismiss as false positive, >= 0.70 to confirm valid." 
+ ), + ) + + @field_validator("confidence", mode="before") + @classmethod + def normalize_confidence(cls, v: int | float) -> float: + """Normalize confidence to 0.0-1.0 range (accepts 0-100 or 0.0-1.0).""" + if v > 1: + return v / 100.0 + return float(v) + + +class FindingValidationResponse(BaseModel): + """Complete response from the finding-validator agent.""" + + validations: list[FindingValidationResult] = Field( + default_factory=list, + description="Validation results for each finding investigated", + ) + summary: str = Field( + description=( + "Brief summary of validation results: how many confirmed, " + "how many dismissed, how many need human review" + ) + ) diff --git a/apps/backend/security/__init__.py b/apps/backend/security/__init__.py index c2e4624e67..9b389373b6 100644 --- a/apps/backend/security/__init__.py +++ b/apps/backend/security/__init__.py @@ -50,6 +50,12 @@ reset_profile_cache, ) +# Tool input validation +from .tool_input_validator import ( + get_safe_tool_input, + validate_tool_input, +) + # Validators (for advanced usage) from .validator import ( VALIDATORS, @@ -100,4 +106,7 @@ "is_command_allowed", "needs_validation", "BASE_COMMANDS", + # Tool input validation + "validate_tool_input", + "get_safe_tool_input", ] diff --git a/apps/backend/security/hooks.py b/apps/backend/security/hooks.py index 6aaa9f8ec8..35152d4433 100644 --- a/apps/backend/security/hooks.py +++ b/apps/backend/security/hooks.py @@ -26,10 +26,11 @@ async def bash_security_hook( Pre-tool-use hook that validates bash commands using dynamic allowlist. This is the main security enforcement point. It: - 1. Extracts command names from the command string - 2. Checks each command against the project's security profile - 3. Runs additional validation for sensitive commands - 4. Blocks disallowed commands with clear error messages + 1. Validates tool_input structure (must be dict with 'command' key) + 2. Extracts command names from the command string + 3. Checks each command against the project's security profile + 4. Runs additional validation for sensitive commands + 5. Blocks disallowed commands with clear error messages Args: input_data: Dict containing tool_name and tool_input @@ -42,7 +43,25 @@ async def bash_security_hook( if input_data.get("tool_name") != "Bash": return {} - command = input_data.get("tool_input", {}).get("command", "") + # Validate tool_input structure before accessing + tool_input = input_data.get("tool_input") + + # Check if tool_input is None (malformed tool call) + if tool_input is None: + return { + "decision": "block", + "reason": "Bash tool_input is None - malformed tool call from SDK", + } + + # Check if tool_input is a dict + if not isinstance(tool_input, dict): + return { + "decision": "block", + "reason": f"Bash tool_input must be dict, got {type(tool_input).__name__}", + } + + # Now safe to access command + command = tool_input.get("command", "") if not command: return {} diff --git a/apps/backend/security/tool_input_validator.py b/apps/backend/security/tool_input_validator.py new file mode 100644 index 0000000000..7c702388a9 --- /dev/null +++ b/apps/backend/security/tool_input_validator.py @@ -0,0 +1,97 @@ +""" +Tool Input Validator +==================== + +Validates tool_input structure before tool execution. +Catches malformed inputs (None, wrong type, missing required keys) early. 
+""" + +from typing import Any + +# Required keys per tool type +TOOL_REQUIRED_KEYS: dict[str, list[str]] = { + "Bash": ["command"], + "Read": ["file_path"], + "Write": ["file_path", "content"], + "Edit": ["file_path", "old_string", "new_string"], + "Glob": ["pattern"], + "Grep": ["pattern"], + "WebFetch": ["url"], + "WebSearch": ["query"], +} + + +def validate_tool_input( + tool_name: str, + tool_input: Any, +) -> tuple[bool, str | None]: + """ + Validate tool input structure. + + Args: + tool_name: Name of the tool being called + tool_input: The tool_input value from the SDK + + Returns: + (is_valid, error_message) where error_message is None if valid + """ + # Must not be None + if tool_input is None: + return False, f"{tool_name}: tool_input is None (malformed tool call)" + + # Must be a dict + if not isinstance(tool_input, dict): + return ( + False, + f"{tool_name}: tool_input must be dict, got {type(tool_input).__name__}", + ) + + # Check required keys for known tools + required_keys = TOOL_REQUIRED_KEYS.get(tool_name, []) + missing_keys = [key for key in required_keys if key not in tool_input] + + if missing_keys: + return ( + False, + f"{tool_name}: missing required keys: {', '.join(missing_keys)}", + ) + + # Additional validation for specific tools + if tool_name == "Bash": + command = tool_input.get("command") + if not isinstance(command, str): + return ( + False, + f"Bash: 'command' must be string, got {type(command).__name__}", + ) + if not command.strip(): + return False, "Bash: 'command' is empty" + + return True, None + + +def get_safe_tool_input(block: Any, default: dict | None = None) -> dict: + """ + Safely extract tool_input from a ToolUseBlock, defaulting to empty dict. + + Args: + block: A ToolUseBlock from Claude SDK + default: Default value if extraction fails (defaults to empty dict) + + Returns: + The tool input as a dict (never None) + """ + if default is None: + default = {} + + if not hasattr(block, "input"): + return default + + tool_input = block.input + if tool_input is None: + return default + + if not isinstance(tool_input, dict): + return default + + return tool_input diff --git a/apps/backend/spec/pipeline/agent_runner.py b/apps/backend/spec/pipeline/agent_runner.py index 57e08f0632..d1ee2a78d7 100644 --- a/apps/backend/spec/pipeline/agent_runner.py +++ b/apps/backend/spec/pipeline/agent_runner.py @@ -14,6 +14,7 @@ from core.client import create_client from debug import debug, debug_detailed, debug_error, debug_section, debug_success +from security.tool_input_validator import get_safe_tool_input from task_logger import ( LogEntryType, LogPhase, @@ -160,25 +161,24 @@ async def run_agent( block, "name" ): tool_name = block.name - tool_input = None tool_count += 1 - # Extract meaningful tool input for display - if hasattr(block, "input") and block.input: - tool_input = self._extract_tool_input_display( - block.input - ) + # Safely extract tool input (handles None, non-dict, etc.) 
+ inp = get_safe_tool_input(block) + tool_input_display = self._extract_tool_input_display( + inp + ) debug( "agent_runner", f"Tool call #{tool_count}: {tool_name}", - tool_input=tool_input, + tool_input=tool_input_display, ) if self.task_logger: self.task_logger.tool_start( tool_name, - tool_input, + tool_input_display, LogPhase.PLANNING, print_to_console=True, ) diff --git a/apps/frontend/package.json b/apps/frontend/package.json index 53e82ed114..f07759f183 100644 --- a/apps/frontend/package.json +++ b/apps/frontend/package.json @@ -1,6 +1,6 @@ { "name": "auto-claude-ui", - "version": "2.7.2-beta.10", + "version": "2.7.2-beta.12", "type": "module", "description": "Desktop UI for Auto Claude autonomous coding framework", "homepage": "https://github.com/AndyMik90/Auto-Claude", diff --git a/apps/frontend/src/main/ipc-handlers/claude-code-handlers.ts b/apps/frontend/src/main/ipc-handlers/claude-code-handlers.ts index 2f93f83a05..2b6a151002 100644 --- a/apps/frontend/src/main/ipc-handlers/claude-code-handlers.ts +++ b/apps/frontend/src/main/ipc-handlers/claude-code-handlers.ts @@ -105,7 +105,9 @@ export function escapePowerShellCommand(str: string): string { .replace(/\(/g, '`(') // Escape opening parentheses .replace(/\)/g, '`)') // Escape closing parentheses .replace(/;/g, '`;') // Escape semicolons (statement separator) - .replace(/&/g, '`&'); // Escape ampersands (call operator) + .replace(/&/g, '`&') // Escape ampersands (call operator) + .replace(/\r/g, '`r') // Escape carriage returns + .replace(/\n/g, '`n'); // Escape newlines } /** diff --git a/apps/frontend/src/main/ipc-handlers/github/pr-handlers.ts b/apps/frontend/src/main/ipc-handlers/github/pr-handlers.ts index ba175abbce..9c42076b93 100644 --- a/apps/frontend/src/main/ipc-handlers/github/pr-handlers.ts +++ b/apps/frontend/src/main/ipc-handlers/github/pr-handlers.ts @@ -405,17 +405,21 @@ function savePRLogs(project: Project, logs: PRLogs): void { /** * Add a log entry to PR logs + * Returns true if the phase status changed (for triggering immediate save) */ -function addLogEntry(logs: PRLogs, entry: PRLogEntry): void { +function addLogEntry(logs: PRLogs, entry: PRLogEntry): boolean { const phase = logs.phases[entry.phase]; + let statusChanged = false; - // Update phase status if needed + // Start the phase if it was pending if (phase.status === 'pending') { phase.status = 'active'; phase.started_at = entry.timestamp; + statusChanged = true; } phase.entries.push(entry); + return statusChanged; } /** @@ -442,14 +446,20 @@ class PRLogCollector { const phase = getPhaseFromSource(parsed.source); - // Track phase transitions - mark previous phases as complete + // Track phase transitions - mark previous phases as complete (only if they were active) if (phase !== this.currentPhase) { // When moving to a new phase, mark the previous phase as complete + // Only mark complete if the phase was actually active (received log entries) + // This prevents marking phases as "completed" if they were skipped if (this.currentPhase === 'context' && (phase === 'analysis' || phase === 'synthesis')) { - this.markPhaseComplete('context', true); + if (this.logs.phases.context.status === 'active') { + this.markPhaseComplete('context', true); + } } if (this.currentPhase === 'analysis' && phase === 'synthesis') { - this.markPhaseComplete('analysis', true); + if (this.logs.phases.analysis.status === 'active') { + this.markPhaseComplete('analysis', true); + } } this.currentPhase = phase; } @@ -462,11 +472,12 @@ class PRLogCollector { source: parsed.source, 
    };
-    addLogEntry(this.logs, entry);
+    const phaseStatusChanged = addLogEntry(this.logs, entry);
     this.entryCount++;
 
-    // Save periodically for real-time streaming (every N entries)
-    if (this.entryCount % this.saveInterval === 0) {
+    // Save immediately if phase status changed (so frontend sees phase activation)
+    // OR save periodically for real-time streaming (every N entries)
+    if (phaseStatusChanged || this.entryCount % this.saveInterval === 0) {
       this.save();
     }
   }
@@ -484,20 +495,15 @@ class PRLogCollector {
   }
 
   finalize(success: boolean): void {
-    // Mark all phases as completed based on success status
-    // For phases with entries: mark based on success
-    // For phases without entries (pending): mark as completed if previous phases completed
-    let previousCompleted = true;
+    // Mark active phases as completed based on success status
+    // Pending phases with no entries should stay pending (they never ran)
     for (const phase of ['context', 'analysis', 'synthesis'] as PRLogPhase[]) {
       const phaseLog = this.logs.phases[phase];
       if (phaseLog.status === 'active') {
         this.markPhaseComplete(phase, success);
-        previousCompleted = success;
-      } else if (phaseLog.status === 'pending' && previousCompleted && success) {
-        // If review succeeded, mark pending phases as completed (they just had no logs)
-        phaseLog.status = 'completed';
-        phaseLog.completed_at = new Date().toISOString();
       }
+      // Note: Pending phases stay pending - they never received any log entries
+      // This is correct behavior for follow-up reviews where some phases may be skipped
     }
     this.save();
   }
diff --git a/apps/frontend/src/renderer/App.tsx b/apps/frontend/src/renderer/App.tsx
index 8ddeb48aa5..b380347b04 100644
--- a/apps/frontend/src/renderer/App.tsx
+++ b/apps/frontend/src/renderer/App.tsx
@@ -755,13 +755,17 @@ export function App() {
             onNavigateToTask={handleGoToTask}
           />
         )}
-        {activeView === 'github-prs' && (activeProjectId || selectedProjectId) && (
-          <GitHubPRs
-            onOpenSettings={() => {
-              setSettingsInitialProjectSection('github');
-              setIsSettingsDialogOpen(true);
-            }}
-          />
+        {/* GitHubPRs is always mounted but hidden when not active to preserve review state */}
+        {(activeProjectId || selectedProjectId) && (
+          <div className={activeView === 'github-prs' ? '' : 'hidden'}>
+            <GitHubPRs
+              onOpenSettings={() => {
+                setSettingsInitialProjectSection('github');
+                setIsSettingsDialogOpen(true);
+              }}
+              isActive={activeView === 'github-prs'}
+            />
+          </div>
        )}
        {activeView === 'gitlab-merge-requests' && (activeProjectId || selectedProjectId) && (
diff --git a/apps/frontend/src/renderer/components/github-prs/GitHubPRs.tsx b/apps/frontend/src/renderer/components/github-prs/GitHubPRs.tsx
--- a/apps/frontend/src/renderer/components/github-prs/GitHubPRs.tsx
+++ b/apps/frontend/src/renderer/components/github-prs/GitHubPRs.tsx
 interface GitHubPRsProps {
   onOpenSettings?: () => void;
+  isActive?: boolean;
 }
 
 function NotConnectedState({
@@ -50,7 +51,7 @@ function EmptyState({ message }: { message: string }) {
   );
 }
 
-export function GitHubPRs({ onOpenSettings }: GitHubPRsProps) {
+export function GitHubPRs({ onOpenSettings, isActive = false }: GitHubPRsProps) {
   const { t } = useTranslation('common');
   const projects = useProjectStore((state) => state.projects);
   const selectedProjectId = useProjectStore((state) => state.selectedProjectId);
@@ -232,6 +233,7 @@ export function GitHubPRs({ onOpenSettings }: GitHubPRsProps) {
           reviewProgress={reviewProgress}
           isReviewing={isReviewing}
           initialNewCommitsCheck={storedNewCommitsCheck}
+          isActive={isActive}
           onRunReview={handleRunReview}
           onRunFollowupReview={handleRunFollowupReview}
           onCheckNewCommits={handleCheckNewCommits}
diff --git a/apps/frontend/src/renderer/components/github-prs/components/PRDetail.tsx b/apps/frontend/src/renderer/components/github-prs/components/PRDetail.tsx
index 0f2499588b..e5421faf47 100644
--- a/apps/frontend/src/renderer/components/github-prs/components/PRDetail.tsx
+++ b/apps/frontend/src/renderer/components/github-prs/components/PRDetail.tsx
@@ -38,6 +38,7 @@ interface PRDetailProps {
   reviewProgress: PRReviewProgress | null;
   isReviewing: boolean;
   initialNewCommitsCheck?: NewCommitsCheck | null;
+  isActive?: boolean;
   onRunReview: () => void;
   onRunFollowupReview: () => void;
   onCheckNewCommits: () => Promise<NewCommitsCheck>;
@@ -67,6 +68,7 @@ export function PRDetail({
   reviewProgress,
   isReviewing,
   initialNewCommitsCheck,
+  isActive = false,
   onRunReview,
   onRunFollowupReview,
   onCheckNewCommits,
diff --git a/apps/frontend/src/renderer/components/github-prs/components/PRLogs.tsx b/apps/frontend/src/renderer/components/github-prs/components/PRLogs.tsx
index 9c3522ae36..d7aa393652 100644
--- a/apps/frontend/src/renderer/components/github-prs/components/PRLogs.tsx
+++ b/apps/frontend/src/renderer/components/github-prs/components/PRLogs.tsx
@@ -167,6 +167,17 @@ function PhaseLogSection({ phase, phaseLog, isExpanded, onToggle, isStreaming =
     );
   }
+
+  // Defensive check: During streaming, if a phase shows "completed" but has no entries,
+  // treat it as pending (this catches edge cases where phases are marked complete incorrectly)
+  if (isStreaming && status === 'completed' && !hasEntries) {
+    return (
+      
+        Pending
+      
+    );
+  }
+
   switch (status) {
     case 'completed':
       return (
diff --git a/apps/frontend/src/renderer/components/github-prs/hooks/useGitHubPRs.ts b/apps/frontend/src/renderer/components/github-prs/hooks/useGitHubPRs.ts
index 24c7d4682c..db39633662 100644
--- a/apps/frontend/src/renderer/components/github-prs/hooks/useGitHubPRs.ts
+++ b/apps/frontend/src/renderer/components/github-prs/hooks/useGitHubPRs.ts
@@ -48,6 +48,7 @@ export function useGitHubPRs(projectId?: string): UseGitHubPRsResult {
   const prReviews = usePRReviewStore((state) => state.prReviews);
   const getPRReviewState = usePRReviewStore((state) => state.getPRReviewState);
   const getActivePRReviews = usePRReviewStore((state) => state.getActivePRReviews);
+  const setNewCommitsCheckAction = usePRReviewStore((state) => state.setNewCommitsCheck);
 
   // Get review state for the selected PR from the store
   const selectedPRReviewState = useMemo(() => {
@@ -137,7 +138,8 @@ export function useGitHubPRs(projectId?: string): UseGitHubPRsResult {
       prsWithReviews.map(async ({ prNumber }) => {
         try {
           const newCommitsResult = await window.electronAPI.github.checkNewCommits(projectId, prNumber);
-
usePRReviewStore.getState().setNewCommitsCheck(projectId, prNumber, newCommitsResult); + // Use the action from the hook subscription to ensure proper React re-renders + setNewCommitsCheckAction(projectId, prNumber, newCommitsResult); } catch (err) { // Silently fail for individual PR checks - don't block the list console.warn(`Failed to check new commits for PR #${prNumber}:`, err); @@ -158,7 +160,7 @@ export function useGitHubPRs(projectId?: string): UseGitHubPRsResult { } finally { setIsLoading(false); } - }, [projectId, getPRReviewState]); + }, [projectId, getPRReviewState, setNewCommitsCheckAction]); useEffect(() => { fetchPRs(); @@ -212,13 +214,14 @@ export function useGitHubPRs(projectId?: string): UseGitHubPRsResult { try { const result = await window.electronAPI.github.checkNewCommits(projectId, prNumber); // Cache the result in the store so the list view can use it - usePRReviewStore.getState().setNewCommitsCheck(projectId, prNumber, result); + // Use the action from the hook subscription to ensure proper React re-renders + setNewCommitsCheckAction(projectId, prNumber, result); return result; } catch (err) { setError(err instanceof Error ? err.message : 'Failed to check for new commits'); return { hasNewCommits: false, newCommitCount: 0 }; } - }, [projectId]); + }, [projectId, setNewCommitsCheckAction]); const cancelReview = useCallback(async (prNumber: number): Promise => { if (!projectId) return false; diff --git a/package.json b/package.json index 29f6ec966b..5d855f8cda 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "auto-claude", - "version": "2.7.2-beta.10", + "version": "2.7.2-beta.12", "description": "Autonomous multi-agent coding framework powered by Claude AI", "license": "AGPL-3.0", "author": "Auto Claude Team", diff --git a/tests/test_agent_configs.py b/tests/test_agent_configs.py index 00af90638a..d24d30bf42 100644 --- a/tests/test_agent_configs.py +++ b/tests/test_agent_configs.py @@ -233,15 +233,15 @@ def test_coder_includes_write_tools(self): assert "Edit" in tools assert "Bash" in tools - def test_qa_reviewer_is_read_only_plus_bash(self): - """QA reviewer should be read-only plus Bash for running tests - no Write/Edit.""" + def test_qa_reviewer_has_write_for_reports(self): + """QA reviewer needs Write/Edit to create qa_report.md and update implementation_plan.json.""" from agents.tools_pkg.permissions import get_allowed_tools tools = get_allowed_tools("qa_reviewer") assert "Read" in tools assert "Bash" in tools # Can run tests - assert "Write" not in tools # Should NOT be able to edit code - assert "Edit" not in tools # Should NOT be able to edit code + assert "Write" in tools # Needs to write qa_report.md + assert "Edit" in tools # Needs to edit implementation_plan.json def test_pr_reviewer_is_read_only(self): """PR reviewer should only have Read tools.""" diff --git a/tests/test_finding_validation.py b/tests/test_finding_validation.py new file mode 100644 index 0000000000..f01b96011f --- /dev/null +++ b/tests/test_finding_validation.py @@ -0,0 +1,414 @@ +""" +Tests for Finding Validation System +==================================== + +Tests the finding-validator agent integration and FindingValidationResult models. +This system prevents false positives from persisting by re-investigating unresolved findings. 
+""" + +import sys +from pathlib import Path + +import pytest +from pydantic import ValidationError + +# Add the backend directory to path +_backend_dir = Path(__file__).parent.parent / "apps" / "backend" +_github_dir = _backend_dir / "runners" / "github" +_services_dir = _github_dir / "services" + +if str(_services_dir) not in sys.path: + sys.path.insert(0, str(_services_dir)) +if str(_github_dir) not in sys.path: + sys.path.insert(0, str(_github_dir)) +if str(_backend_dir) not in sys.path: + sys.path.insert(0, str(_backend_dir)) + +from pydantic_models import ( + FindingValidationResult, + FindingValidationResponse, + ParallelFollowupResponse, + ResolutionVerification, +) +from models import ( + PRReviewFinding, + PRReviewResult, + ReviewSeverity, + ReviewCategory, + MergeVerdict, +) + + +# ============================================================================ +# FindingValidationResult Model Tests +# ============================================================================ + + +class TestFindingValidationResultModel: + """Tests for the FindingValidationResult Pydantic model.""" + + def test_valid_confirmed_valid(self): + """Test creating a confirmed_valid validation result.""" + result = FindingValidationResult( + finding_id="SEC-001", + validation_status="confirmed_valid", + code_evidence="const query = `SELECT * FROM users WHERE id = ${userId}`;", + line_range=(45, 45), + explanation="SQL injection is present - user input is concatenated directly into the query.", + confidence=0.92, + ) + assert result.finding_id == "SEC-001" + assert result.validation_status == "confirmed_valid" + assert "SELECT" in result.code_evidence + assert result.confidence == 0.92 + + def test_valid_dismissed_false_positive(self): + """Test creating a dismissed_false_positive validation result.""" + result = FindingValidationResult( + finding_id="QUAL-002", + validation_status="dismissed_false_positive", + code_evidence="const sanitized = DOMPurify.sanitize(data);", + line_range=(23, 26), + explanation="Original finding claimed XSS but code uses DOMPurify.sanitize() for protection.", + confidence=0.88, + ) + assert result.validation_status == "dismissed_false_positive" + assert result.confidence == 0.88 + + def test_valid_needs_human_review(self): + """Test creating a needs_human_review validation result.""" + result = FindingValidationResult( + finding_id="LOGIC-003", + validation_status="needs_human_review", + code_evidence="async function handleRequest(req) { ... 
}", + line_range=(100, 150), + explanation="Race condition claim requires runtime analysis to verify.", + confidence=0.45, + ) + assert result.validation_status == "needs_human_review" + assert result.confidence == 0.45 + + def test_code_evidence_required(self): + """Test that code_evidence cannot be empty.""" + with pytest.raises(ValidationError) as exc_info: + FindingValidationResult( + finding_id="SEC-001", + validation_status="confirmed_valid", + code_evidence="", # Empty string should fail + line_range=(45, 45), + explanation="This is a detailed explanation of the issue.", + confidence=0.92, + ) + errors = exc_info.value.errors() + assert any("code_evidence" in str(e) for e in errors) + + def test_explanation_min_length(self): + """Test that explanation must be at least 20 characters.""" + with pytest.raises(ValidationError) as exc_info: + FindingValidationResult( + finding_id="SEC-001", + validation_status="confirmed_valid", + code_evidence="const x = 1;", + line_range=(45, 45), + explanation="Too short", # Less than 20 chars + confidence=0.92, + ) + errors = exc_info.value.errors() + assert any("explanation" in str(e) for e in errors) + + def test_confidence_normalized_from_percentage(self): + """Test that confidence 0-100 is normalized to 0.0-1.0.""" + result = FindingValidationResult( + finding_id="SEC-001", + validation_status="confirmed_valid", + code_evidence="const query = `SELECT * FROM users`;", + line_range=(45, 45), + explanation="SQL injection vulnerability found in the query construction.", + confidence=85, # Percentage value + ) + assert result.confidence == 0.85 + + def test_confidence_range_validation(self): + """Test that confidence must be between 0.0 and 1.0 after normalization.""" + with pytest.raises(ValidationError): + FindingValidationResult( + finding_id="SEC-001", + validation_status="confirmed_valid", + code_evidence="const x = 1;", + line_range=(45, 45), + explanation="This is a detailed explanation of the issue.", + confidence=150, # Will normalize to 1.5, which is out of range + ) + + def test_invalid_validation_status(self): + """Test that invalid validation_status values are rejected.""" + with pytest.raises(ValidationError): + FindingValidationResult( + finding_id="SEC-001", + validation_status="invalid_status", # Not a valid status + code_evidence="const x = 1;", + line_range=(45, 45), + explanation="This is a detailed explanation of the issue.", + confidence=0.92, + ) + + +class TestFindingValidationResponse: + """Tests for the FindingValidationResponse container model.""" + + def test_valid_response_with_multiple_validations(self): + """Test creating a response with multiple validation results.""" + response = FindingValidationResponse( + validations=[ + FindingValidationResult( + finding_id="SEC-001", + validation_status="confirmed_valid", + code_evidence="const query = `SELECT * FROM users`;", + line_range=(45, 45), + explanation="SQL injection confirmed in this query.", + confidence=0.92, + ), + FindingValidationResult( + finding_id="QUAL-002", + validation_status="dismissed_false_positive", + code_evidence="const sanitized = DOMPurify.sanitize(data);", + line_range=(23, 26), + explanation="Code uses DOMPurify so XSS claim is false.", + confidence=0.88, + ), + ], + summary="1 finding confirmed valid, 1 dismissed as false positive", + ) + assert len(response.validations) == 2 + assert "1 finding confirmed" in response.summary + + +class TestParallelFollowupResponseWithValidation: + """Tests for ParallelFollowupResponse including 
finding_validations.""" + + def test_response_includes_finding_validations(self): + """Test that ParallelFollowupResponse accepts finding_validations.""" + response = ParallelFollowupResponse( + analysis_summary="Follow-up review with validation", + agents_invoked=["resolution-verifier", "finding-validator"], + commits_analyzed=3, + files_changed=5, + resolution_verifications=[ + ResolutionVerification( + finding_id="SEC-001", + status="unresolved", + confidence=0.85, + evidence="File was not modified", + ) + ], + finding_validations=[ + FindingValidationResult( + finding_id="SEC-001", + validation_status="confirmed_valid", + code_evidence="const query = `SELECT * FROM users`;", + line_range=(45, 45), + explanation="SQL injection confirmed in this query.", + confidence=0.92, + ) + ], + new_findings=[], + comment_analyses=[], + comment_findings=[], + verdict="NEEDS_REVISION", + verdict_reasoning="1 confirmed valid security issue remains", + ) + assert len(response.finding_validations) == 1 + assert response.finding_validations[0].validation_status == "confirmed_valid" + + def test_response_with_dismissed_findings(self): + """Test response where findings are dismissed as false positives.""" + response = ParallelFollowupResponse( + analysis_summary="All findings dismissed as false positives", + agents_invoked=["resolution-verifier", "finding-validator"], + commits_analyzed=3, + files_changed=5, + resolution_verifications=[ + ResolutionVerification( + finding_id="SEC-001", + status="unresolved", + confidence=0.50, + evidence="Line wasn't changed but need to verify", + ) + ], + finding_validations=[ + FindingValidationResult( + finding_id="SEC-001", + validation_status="dismissed_false_positive", + code_evidence="const query = db.prepare('SELECT * FROM users WHERE id = ?').get(userId);", + line_range=(45, 48), + explanation="Original review misread - using parameterized query.", + confidence=0.95, + ) + ], + new_findings=[], + comment_analyses=[], + comment_findings=[], + verdict="READY_TO_MERGE", + verdict_reasoning="Previous finding was a false positive, now dismissed", + ) + assert len(response.finding_validations) == 1 + assert response.finding_validations[0].validation_status == "dismissed_false_positive" + + +# ============================================================================ +# PRReviewFinding Validation Fields Tests +# ============================================================================ + + +class TestPRReviewFindingValidationFields: + """Tests for validation fields on PRReviewFinding model.""" + + def test_finding_with_validation_fields(self): + """Test creating a finding with validation fields populated.""" + finding = PRReviewFinding( + id="SEC-001", + severity=ReviewSeverity.HIGH, + category=ReviewCategory.SECURITY, + title="SQL Injection", + description="User input not sanitized", + file="src/db.py", + line=42, + validation_status="confirmed_valid", + validation_evidence="const query = `SELECT * FROM users`;", + validation_confidence=0.92, + validation_explanation="SQL injection confirmed in the query.", + ) + assert finding.validation_status == "confirmed_valid" + assert finding.validation_confidence == 0.92 + + def test_finding_without_validation_fields(self): + """Test that validation fields are optional.""" + finding = PRReviewFinding( + id="SEC-001", + severity=ReviewSeverity.HIGH, + category=ReviewCategory.SECURITY, + title="SQL Injection", + description="User input not sanitized", + file="src/db.py", + line=42, + ) + assert finding.validation_status is 
None + assert finding.validation_evidence is None + assert finding.validation_confidence is None + assert finding.validation_explanation is None + + def test_finding_to_dict_includes_validation(self): + """Test that to_dict includes validation fields.""" + finding = PRReviewFinding( + id="SEC-001", + severity=ReviewSeverity.HIGH, + category=ReviewCategory.SECURITY, + title="SQL Injection", + description="User input not sanitized", + file="src/db.py", + line=42, + validation_status="confirmed_valid", + validation_evidence="const query = ...;", + validation_confidence=0.92, + validation_explanation="Issue confirmed.", + ) + data = finding.to_dict() + assert data["validation_status"] == "confirmed_valid" + assert data["validation_evidence"] == "const query = ...;" + assert data["validation_confidence"] == 0.92 + assert data["validation_explanation"] == "Issue confirmed." + + def test_finding_from_dict_with_validation(self): + """Test that from_dict loads validation fields.""" + data = { + "id": "SEC-001", + "severity": "high", + "category": "security", + "title": "SQL Injection", + "description": "User input not sanitized", + "file": "src/db.py", + "line": 42, + "validation_status": "dismissed_false_positive", + "validation_evidence": "parameterized query used", + "validation_confidence": 0.88, + "validation_explanation": "False positive - using prepared statements.", + } + finding = PRReviewFinding.from_dict(data) + assert finding.validation_status == "dismissed_false_positive" + assert finding.validation_confidence == 0.88 + + +# ============================================================================ +# Integration Tests +# ============================================================================ + + +class TestValidationIntegration: + """Integration tests for the validation flow.""" + + def test_validation_summary_format(self): + """Test that validation summary format is correct when validation results exist.""" + # Test the expected summary format when validation results are present + # We can't directly import ParallelFollowupReviewer due to complex imports, + # so we verify the Pydantic models work correctly instead + + response = ParallelFollowupResponse( + analysis_summary="Follow-up with validation", + agents_invoked=["resolution-verifier", "finding-validator"], + commits_analyzed=3, + files_changed=5, + resolution_verifications=[], + finding_validations=[ + FindingValidationResult( + finding_id="SEC-001", + validation_status="confirmed_valid", + code_evidence="const query = `SELECT * FROM users`;", + line_range=(45, 45), + explanation="SQL injection confirmed in this query construction.", + confidence=0.92, + ), + FindingValidationResult( + finding_id="QUAL-002", + validation_status="dismissed_false_positive", + code_evidence="const sanitized = DOMPurify.sanitize(data);", + line_range=(23, 26), + explanation="Original XSS claim was incorrect - uses DOMPurify.", + confidence=0.88, + ), + ], + new_findings=[], + comment_analyses=[], + comment_findings=[], + verdict="READY_TO_MERGE", + verdict_reasoning="1 dismissed as false positive, 1 confirmed valid but low severity", + ) + + # Verify validation counts can be computed from the response + confirmed_count = sum( + 1 for fv in response.finding_validations + if fv.validation_status == "confirmed_valid" + ) + dismissed_count = sum( + 1 for fv in response.finding_validations + if fv.validation_status == "dismissed_false_positive" + ) + + assert confirmed_count == 1 + assert dismissed_count == 1 + assert 
len(response.finding_validations) == 2 + assert "finding-validator" in response.agents_invoked + + def test_validation_status_enum_values(self): + """Test all valid validation status values.""" + valid_statuses = ["confirmed_valid", "dismissed_false_positive", "needs_human_review"] + + for status in valid_statuses: + result = FindingValidationResult( + finding_id="TEST-001", + validation_status=status, + code_evidence="const x = 1;", + line_range=(1, 1), + explanation="This is a valid explanation for the finding status.", + confidence=0.85, + ) + assert result.validation_status == status
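
The two confidence tests above pin down a normalization contract for FindingValidationResult that lives in pydantic_models.py, which this diff does not show. Below is a minimal sketch of a Pydantic v2 field validator that would satisfy those assertions; the model name ConfidenceCarrier and its structure are illustrative assumptions, and only the 0-100 to 0.0-1.0 scaling plus the post-normalization range check are inferred from the tests.

# Hypothetical sketch (not from this PR): the confidence handling the tests above assume.
# Assumes Pydantic v2; only percentage scaling and the range check are inferred from the tests.
from pydantic import BaseModel, Field, field_validator


class ConfidenceCarrier(BaseModel):
    """Illustrative stand-in for the confidence field on FindingValidationResult."""

    confidence: float = Field(..., description="Confidence score, 0.0-1.0 after normalization")

    @field_validator("confidence", mode="before")
    @classmethod
    def _normalize(cls, value):
        # Percentage-style inputs (e.g. 85) are scaled to 0.85; 150 becomes 1.5 and is
        # rejected by the range check below, matching test_confidence_range_validation.
        if isinstance(value, (int, float)) and value > 1.0:
            return value / 100.0
        return value

    @field_validator("confidence")
    @classmethod
    def _check_range(cls, value: float) -> float:
        if not 0.0 <= value <= 1.0:
            raise ValueError("confidence must be within 0.0-1.0 after normalization")
        return value


if __name__ == "__main__":
    # Mirrors test_confidence_normalized_from_percentage
    assert ConfidenceCarrier(confidence=85).confidence == 0.85
    # Already-normalized values pass through unchanged
    assert ConfidenceCarrier(confidence=0.92).confidence == 0.92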