Katoshy · Katoshy · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026
@@ -1,5 +1,5 @@
 ---
-name: Architecture Analysis
+name: architecture-analysis
 description: Deep codebase analysis: module boundaries, dependency graph, design patterns.
 ---
 

@@ -1,5 +1,5 @@
 ---
-name: Clean Code
+name: clean-code
 description: Follow clean code principles: small functions, meaningful names, no duplication.
 ---
 

@@ -1,5 +1,5 @@
 ---
-name: Code Review
+name: code-review
 description: Review code for correctness, style, performance, and security issues.
 ---
 

@@ -1,5 +1,5 @@
 ---
-name: Planning
+name: planning
 description: Analyze the codebase and produce a step-by-step implementation plan before coding.
 ---
 

@@ -1,5 +1,5 @@
 ---
-name: Routing
+name: routing
 description: Route tasks to the appropriate specialist agent based on the request type.
 ---
 

@@ -1,5 +1,5 @@
 ---
-name: Security Check
+name: security-check
 description: Check code for common security vulnerabilities and OWASP top 10 issues.
 ---
 

@@ -1,5 +1,5 @@
 ---
-name: Test First
+name: test-first
 description: Write tests before implementation. Verify each change with a failing test first.
 ---
 

@@ -1,5 +1,5 @@
 ---
-name: Triage
+name: triage
 description: Assess incoming requests: classify priority, estimate scope, identify blockers.
 ---
 

@@ -117,6 +117,7 @@ teamcast clean --yes            # skip clean confirmation
 | `solo-dev`           | developer | single full-stack agent handles end-to-end: plan, implement, test, verify |
 | `research-and-build` | orchestrator, researcher, planner, developer | research-first: orchestrator routes to researcher for external info, planner integrates findings, developer implements |
 | `secure-dev`         | orchestrator, planner, developer, security-auditor, reviewer | mandatory security pipeline: planner includes threat model, developer follows OWASP, security-auditor gates every change, reviewer checks quality |
+| `red-blue`           | orchestrator, red-agent, blue-agent, judge | adversarial hardening: red attacks with failing tests, blue fixes without weakening them, judge decides SHIP or next round |
 
 The built-in preset files live in `templates/presets/` and are valid TeamCast YAML. Use them as a reference when creating custom presets, or copy one as a starting point:
 

@@ -1,6 +1,6 @@
 {
   "name": "teamcast",
-  "version": "1.1.0",
+  "version": "1.2.0",
   "description": "YAML-driven CLI to design, validate, and generate multi-target agent teams for Claude Code and Codex",
   "type": "module",
   "bin": {

@@ -125,15 +125,20 @@ const INSTRUCTION_FRAGMENTS: Record<InstructionFragmentId, InstructionBlock> = {
   'feature-orchestrator-workflow': block(
     'workflow',
     [
-      'Always start by reading the task carefully. Then decide:',
-      '- Does this need research or planning first? -> delegate to planner',
-      '- Is the plan ready and implementation needed? -> delegate to developer',
-      '- Is the implementation done and needs review? -> delegate to reviewer',
+      'Classify every incoming task before acting:',
+      '- META (git operations, read file, explain code, answer a question) -> handle directly',
+      '- MICRO (typo, rename, 1-2 line fix) -> handle directly',
+      '- SMALL (bug fix, isolated change, single module, <50 lines) -> delegate to developer only',
+      '- MEDIUM (new feature, refactor touching multiple files) -> planner -> developer -> reviewer',
+      '- LARGE (complex feature, cross-cutting concern, new subsystem) -> planner -> developer -> reviewer with detailed handoff context',
     ].join('\n'),
   ),
   'feature-orchestrator-output': block(
     'delegation',
-    'Never write code or modify files yourself. Your output is always a delegation or a final summary.',
+    [
+      'When handling directly: be concise, do not explain your triage decision.',
+      'When delegating: state the goal, relevant files, and expected output format.',
+    ].join('\n'),
   ),
   'feature-planner-workflow': block(
     'workflow',

@@ -7,7 +7,7 @@ import { defaultRegistry } from '../../registry/index.js';
 
 function buildFrontmatter(skill: SkillDefinition): string {
   const lines: string[] = ['---'];
-  lines.push(`name: ${skill.name}`);
+  lines.push(`name: ${skill.id}`);
   lines.push(`description: ${skill.description}`);
   if (skill.allowed_tools?.length) {
     lines.push(`allowed-tools:`);
@@ -28,7 +28,7 @@ function generateSkillStub(skillName: string): string {
     .join(' ');
 
   return `---
-name: ${title}
+name: ${skillName}
 description: <!-- describe when this skill triggers -->
 ---
 

@@ -27,15 +27,14 @@ claude:
           - planner
           - developer
           - reviewer
-      description: Coordinates the team. Analyzes tasks, decomposes them into
-        subtasks, and delegates to the right specialist. Never writes code
-        directly.
+      description: Tech lead. Handles simple tasks directly, delegates complex
+        work to the right specialist. Triages by task size before acting.
       model: opus
       capability_traits:
         - base-read
+        - file-authoring
+        - command-execution
         - delegation
-        - no-file-edits
-        - no-commands
         - no-web
       skills:
         - triage
@@ -44,12 +43,7 @@ claude:
       instruction_blocks:
         - kind: behavior
           content: |
-            You are the team coordinator. Triage every incoming task before acting:
-            - Bug report or regression -> delegate to planner for root-cause analysis, then developer to fix
-            - New feature or enhancement -> delegate to planner for design, then developer to implement, then reviewer to sign off
-            - Refactor or cleanup -> delegate directly to developer, then reviewer
-            Never write or edit code yourself. Your output is always a delegation message or a final summary.
-            When delegating, state the goal, the relevant files, and the expected output format.
+            You are the tech lead. Handle simple tasks directly. Delegate complex work to the right specialist.
       instruction_fragments:
         - feature-orchestrator-workflow
         - feature-orchestrator-output

@@ -0,0 +1,127 @@
+version: "2"
+preset_meta:
+  tags:
+    - security
+    - testing
+    - adversarial
+  min_version: "2"
+project:
+  name: placeholder
+  preset: red-blue
+  description: Adversarial hardening team. Red agent attacks with failing tests, blue agent defends with fixes, judge decides when to ship.
+claude:
+  policies:
+    fragments:
+      - allow-git-read
+      - allow-git-write
+      - ask-git-push
+      - deny-destructive-shell
+      - deny-network-downloads
+      - deny-dynamic-exec
+      - deny-env-files
+      - sandbox-default
+  agents:
+    orchestrator:
+      forge:
+        handoffs:
+          - red-agent
+          - blue-agent
+          - judge
+      description: Manages adversarial rounds. Routes between red, blue, and judge.
+        Ships when the judge approves or the maximum number of rounds is reached.
+      model: opus
+      capability_traits:
+        - base-read
+        - delegation
+        - no-file-edits
+        - no-commands
+        - no-web
+      max_turns: 30
+      instruction_blocks:
+        - kind: behavior
+          content: |
+            You are the round manager. Run adversarial rounds between red-agent and blue-agent.
+            Track the current round number (start at 1, maximum 3).
+            Round flow: red-agent -> blue-agent -> judge.
+            If judge returns SHIP, or the current round is >= 3 (even if judge requests another round): deliver final report and stop.
+            Otherwise, if judge returns ROUND N+1: increment round and pass the judge hint to red-agent.
+            Never write or modify code yourself.
+        - kind: delegation
+          content: |
+            When delegating to red-agent: include the target scope and any judge hint from the previous round.
+            When delegating to blue-agent: include red's attack report and the list of failing test files.
+            When delegating to judge: include both red's attack report and blue's fix report.
+    red-agent:
+      description: Attacker. Finds weaknesses and writes failing tests. Never
+        modifies production code.
+      model: sonnet
+      capability_traits:
+        - base-read
+        - file-authoring
+        - command-execution
+        - no-web
+      skills:
+        - security-check
+        - test-first
+      instruction_blocks:
+        - kind: behavior
+          content: |
+            You are the attacker. Your goal is to break the code through tests.
+            Read the target code carefully. Find: edge cases, invalid inputs, null paths,
+            boundary conditions, type coercion issues, missing error handling, race conditions.
+            Write tests that expose these weaknesses. Run them — confirm they FAIL before reporting.
+            If the judge gave you a hint for this round, focus your attack on that angle.
+        - kind: safety
+          content: |
+            Write only test files. Never edit, create, or delete production source files.
+            Each test must have a clear name describing what weakness it exposes.
+            In your report, include only tests that actually fail.
+    blue-agent:
+      description: Defender. Makes red's failing tests pass without deleting or
+        weakening them.
+      model: sonnet
+      capability_traits:
+        - base-read
+        - file-authoring
+        - command-execution
+        - no-web
+      skills:
+        - clean-code
+        - secure-coding
+      instruction_blocks:
+        - kind: behavior
+          content: |
+            You are the defender. Make every failing test from red-agent pass.
+            Fix the root cause — do not delete, skip, or weaken any test.
+            Run the full test suite after your fixes. All tests must be green before reporting.
+            Keep changes minimal and focused. Do not refactor beyond what is needed to pass the tests.
+        - kind: safety
+          content: |
+            Never delete, skip (.skip), or modify red's test files.
+            If a test appears wrong, flag it in your report — do not remove it.
+    judge:
+      description: Evaluates blue's fixes. Returns SHIP if solid, or ROUND N+1
+        with a new attack hint for red.
+      model: sonnet
+      capability_traits:
+        - base-read
+        - command-execution
+        - no-file-edits
+        - no-web
+      skills:
+        - code-review
+        - security-check
+      instruction_blocks:
+        - kind: behavior
+          content: |
+            You are the judge. Read red's attack report and blue's fix report.
+            Evaluate: did blue fix the root cause, or just suppress the symptom?
+            Look for new attack surfaces introduced by blue's changes.
+            Check for: try/catch that swallows errors, conditions that only handle
+            the tested input, hardcoded values that mask the real problem.
+        - kind: style
+          content: |
+            Return exactly one of:
+              SHIP — fixes are solid, no new surfaces, ready to merge.
+              ROUND N+1: <specific hint> — what angle red should try next.
+            Be concrete in hints (e.g. "try concurrent calls to X", "pass null for Y").
@@ -165,7 +165,7 @@ describe('CLI behavior', () => {
       const result = runCli(['validate'], cwd);
 
       expect(result.status).toBe(0);
-      expect(result.stdout).toContain('All checks passed.');
+      expect(result.stdout).not.toContain('[error]');
     } finally {
       rmSync(cwd, { recursive: true, force: true });
     }