diff --git a/.cursor/README.md b/.cursor/README.md
index d9522b4..055abd3 100644
--- a/.cursor/README.md
+++ b/.cursor/README.md
@@ -7,7 +7,7 @@
 Type `/autopilot` to start or continue the full workflow. The orchestrator detects where your project is and picks up from there.
 
 ```
-/autopilot              — start a new project or continue where you left off
+/autopilot (or /auto)   — start a new project or continue where you left off
 ```
 
 If you want to run a specific skill directly (without the orchestrator), use the individual commands:
diff --git a/.cursor/skills/autopilot/SKILL.md b/.cursor/skills/autopilot/SKILL.md
index 57d39a1..8cec5a5 100644
--- a/.cursor/skills/autopilot/SKILL.md
+++ b/.cursor/skills/autopilot/SKILL.md
@@ -60,7 +60,7 @@ Every invocation follows this sequence:
 3. Cross-check state file against _docs/ folder structure (rules in state.md)
 4. Resolve flow (see Flow Resolution above)
 5. Resolve current step (detection rules from the active flow file)
-6. Present Status Summary (format in protocols.md)
+6. Present Status Summary (template in the active flow file)
 7. Execute:
    a. Delegate to current skill (see Skill Delegation below)
    b. If skill returns FAILED → apply Skill Failure Retry Protocol (see protocols.md):
@@ -102,37 +102,6 @@ This skill activates when the user wants to:
 - User wants to document an existing codebase → use `/document` directly
 - User wants the full guided workflow → use `/autopilot`
 
-## Methodology Quick Reference
+## Flow Reference
 
-```
-┌────────────────────────────────────────────────────────────────┐
-│              Autopilot (Auto-Chain Orchestrator)                │
-├────────────────────────────────────────────────────────────────┤
-│ EVERY INVOCATION:                                              │
-│   1. Read state file + module files                            │
-│   2. Resolve flow & current step                               │
-│   3. Status Summary → Execute → Auto-chain (loop)             │
-│                                                                │
-│ GREENFIELD FLOW (flows/greenfield.md):                         │
-│   Step 0 Problem → Step 1 Research → Step 2 Plan              │
-│   → 2a UI Design (if UI) → Step 3 Decompose → [SESSION]     │
-│   → Step 4 Implement → Step 5 Run Tests                      │
-│   → 5b Security (opt) → 5c Perf Test (opt) → Step 6 Deploy  │
-│   → DONE                                                      │
-│                                                                │
-│ EXISTING CODE FLOW (flows/existing-code.md):                   │
-│   Pre-Step Document → 2b Test Spec → 2c Decompose Tests      │
-│   → [SESSION] → 2d Implement Tests → 2e Refactor             │
-│   → 2ea UI Design (if UI) → 2f New Task → [SESSION]         │
-│   → 2g Implement → 2h Run Tests → 2hb Security (opt)        │
-│   → 2hc Perf Test (opt) → 2i Deploy → DONE                  │
-│                                                                │
-│ STATE: _docs/_autopilot_state.md (see state.md)                │
-│ PROTOCOLS: choice format, Jira auth, errors (see protocols.md) │
-│ PAUSE POINTS: sub-skill BLOCKING gates only                    │
-│ SESSION BREAK: after Decompose/New Task (before Implement)     │
-├────────────────────────────────────────────────────────────────┤
-│ Auto-chain · State to file · Rich re-entry · Delegate          │
-│ Pause at decisions only · Minimize interruptions               │
-└────────────────────────────────────────────────────────────────┘
-```
+See `flows/greenfield.md` and `flows/existing-code.md` for step tables, detection rules, auto-chain rules, and status summary templates.
diff --git a/.cursor/skills/autopilot/flows/existing-code.md b/.cursor/skills/autopilot/flows/existing-code.md
index 91e120f..ff31c36 100644
--- a/.cursor/skills/autopilot/flows/existing-code.md
+++ b/.cursor/skills/autopilot/flows/existing-code.md
@@ -1,25 +1,25 @@
 # Existing Code Workflow
 
-Workflow for projects with an existing codebase. Starts with documentation, produces test specs, decomposes and implements tests, refactors with that safety net, then adds new functionality and deploys.
+Workflow for projects with an existing codebase. Starts with documentation, produces test specs, decomposes and implements tests, verifies them, refactors with that safety net, then adds new functionality and deploys.
 
 ## Step Reference Table
 
-| Step | Name                    | Sub-Skill                       | Internal SubSteps                     |
-|------|-------------------------|---------------------------------|---------------------------------------|
-| —    | Document (pre-step)     | document/SKILL.md               | Steps 1–8                             |
-| 2b   | Blackbox Test Spec      | test-spec/SKILL.md              | Phase 1a–1b                           |
-| 2c   | Decompose Tests         | decompose/SKILL.md (tests-only) | Step 1t + Step 3 + Step 4             |
-| 2d   | Implement Tests         | implement/SKILL.md              | (batch-driven, no fixed sub-steps)    |
-| 2e   | Refactor                | refactor/SKILL.md               | Phases 0–5 (6-phase method)           |
-| 2ea  | UI Design               | ui-design/SKILL.md              | Phase 0–8 (conditional — UI projects only) |
-| 2f   | New Task                | new-task/SKILL.md               | Steps 1–8 (loop)                      |
-| 2g   | Implement               | implement/SKILL.md              | (batch-driven, no fixed sub-steps)    |
-| 2h   | Run Tests               | (autopilot-managed)             | Unit tests → Blackbox tests |
-| 2hb  | Security Audit          | security/SKILL.md               | Phase 1–5 (optional)                  |
-| 2hc  | Performance Test        | (autopilot-managed)             | Load/stress tests (optional)          |
-| 2i   | Deploy                  | deploy/SKILL.md                 | Steps 1–7                             |
+| Step | Name | Sub-Skill | Internal SubSteps |
+|------|------|-----------|-------------------|
+| 1 | Document | document/SKILL.md | Steps 1–8 |
+| 2 | Test Spec | test-spec/SKILL.md | Phase 1a–1b |
+| 3 | Decompose Tests | decompose/SKILL.md (tests-only) | Step 1t + Step 3 + Step 4 |
+| 4 | Implement Tests | implement/SKILL.md | (batch-driven, no fixed sub-steps) |
+| 5 | Run Tests | test-run/SKILL.md | Steps 1–4 |
+| 6 | Refactor | refactor/SKILL.md | Phases 0–5 (6-phase method) |
+| 7 | New Task | new-task/SKILL.md | Steps 1–8 (loop) |
+| 8 | Implement | implement/SKILL.md | (batch-driven, no fixed sub-steps) |
+| 9 | Run Tests | test-run/SKILL.md | Steps 1–4 |
+| 10 | Security Audit | security/SKILL.md | Phase 1–5 (optional) |
+| 11 | Performance Test | (autopilot-managed) | Load/stress tests (optional) |
+| 12 | Deploy | deploy/SKILL.md | Steps 1–7 |
 
-After Step 2i, the existing-code workflow is complete.
+After Step 12, the existing-code workflow is complete.
 
 ## Detection Rules
 
@@ -27,30 +27,14 @@ Check rules in order — first match wins.
 
 ---
 
-**Pre-Step — Existing Codebase Detection**
+**Step 1 — Document**
 Condition: `_docs/` does not exist AND the workspace contains source code files (e.g., `*.py`, `*.cs`, `*.rs`, `*.ts`, `src/`, `Cargo.toml`, `*.csproj`, `package.json`)
 
-Action: An existing codebase without documentation was detected. Present using Choose format:
-
-```
-══════════════════════════════════════
- DECISION REQUIRED: Existing codebase detected
-══════════════════════════════════════
- A) Start fresh — define the problem from scratch (greenfield workflow)
- B) Document existing codebase first — run /document to reverse-engineer docs, then continue
-══════════════════════════════════════
- Recommendation: B — the /document skill analyzes your code
- bottom-up and produces _docs/ artifacts automatically,
- then you can continue with test specs, refactor, and new features.
-══════════════════════════════════════
-```
-
-- If user picks A → proceed to Step 0 (Problem Gathering) in the greenfield flow
-- If user picks B → read and execute `.cursor/skills/document/SKILL.md`. After document skill completes, re-detect state (the produced `_docs/` artifacts will place the project at Step 2b or later).
+Action: An existing codebase without documentation was detected. Read and execute `.cursor/skills/document/SKILL.md`. After the document skill completes, re-detect state (the produced `_docs/` artifacts will place the project at Step 2 or later).
 
 ---
 
-**Step 2b — Blackbox Test Spec**
+**Step 2 — Test Spec**
 Condition: `_docs/02_document/FINAL_report.md` exists AND workspace contains source code files (e.g., `*.py`, `*.cs`, `*.rs`, `*.ts`) AND `_docs/02_document/tests/traceability-matrix.md` does not exist AND the autopilot state shows Document was run (check `Completed Steps` for "Document" entry)
 
 Action: Read and execute `.cursor/skills/test-spec/SKILL.md`
@@ -59,7 +43,7 @@ This step applies when the codebase was documented via the `/document` skill. Te
 
 ---
 
-**Step 2c — Decompose Tests**
+**Step 3 — Decompose Tests**
 Condition: `_docs/02_document/tests/traceability-matrix.md` exists AND workspace contains source code files AND the autopilot state shows Document was run AND (`_docs/02_tasks/` does not exist or has no task files)
 
 Action: Read and execute `.cursor/skills/decompose/SKILL.md` in **tests-only mode** (pass `_docs/02_document/tests/` as input). The decompose skill will:
@@ -71,8 +55,8 @@ If `_docs/02_tasks/` has some task files already, the decompose skill's resumabi
 
 ---
 
-**Step 2d — Implement Tests**
-Condition: `_docs/02_tasks/` contains task files AND `_dependencies_table.md` exists AND the autopilot state shows Step 2c (Decompose Tests) is completed AND `_docs/03_implementation/FINAL_implementation_report.md` does not exist
+**Step 4 — Implement Tests**
+Condition: `_docs/02_tasks/` contains task files AND `_dependencies_table.md` exists AND the autopilot state shows Step 3 (Decompose Tests) is completed AND `_docs/03_implementation/FINAL_implementation_report.md` does not exist
 
 Action: Read and execute `.cursor/skills/implement/SKILL.md`
 
@@ -82,8 +66,17 @@ If `_docs/03_implementation/` has batch reports, the implement skill detects com
 
 ---
 
-**Step 2e — Refactor**
-Condition: `_docs/03_implementation/FINAL_implementation_report.md` exists AND the autopilot state shows Step 2d (Implement Tests) is completed AND `_docs/04_refactoring/FINAL_report.md` does not exist
+**Step 5 — Run Tests**
+Condition: `_docs/03_implementation/FINAL_implementation_report.md` exists AND the autopilot state shows Step 4 (Implement Tests) is completed AND the autopilot state does NOT show Step 5 (Run Tests) as completed
+
+Action: Read and execute `.cursor/skills/test-run/SKILL.md`
+
+Verifies that the implemented test suite passes before refactoring begins. These tests form the safety net for all subsequent code changes.
+
+---
+
+**Step 6 — Refactor**
+Condition: the autopilot state shows Step 5 (Run Tests) is completed AND `_docs/04_refactoring/FINAL_report.md` does not exist
 
 Action: Read and execute `.cursor/skills/refactor/SKILL.md`
 
@@ -93,37 +86,8 @@ If `_docs/04_refactoring/` has phase reports, the refactor skill detects complet
 
 ---
 
-**Step 2ea — UI Design (conditional)**
-Condition: the autopilot state shows Step 2e (Refactor) is completed AND the autopilot state does NOT show Step 2ea (UI Design) as completed or skipped
-
-**UI Project Detection** — the project is a UI project if ANY of the following are true:
-- `package.json` exists in the workspace root or any subdirectory
-- `*.html`, `*.jsx`, `*.tsx` files exist in the workspace
-- `_docs/02_document/components/` contains a component whose `description.md` mentions UI, frontend, page, screen, dashboard, form, or view
-- `_docs/02_document/architecture.md` mentions frontend, UI layer, SPA, or client-side rendering
-
-If the project is NOT a UI project → mark Step 2ea as `skipped` in the state file and auto-chain to Step 2f.
-
-If the project IS a UI project → present using Choose format:
-
-```
-══════════════════════════════════════
- DECISION REQUIRED: UI project detected — generate/update mockups?
-══════════════════════════════════════
- A) Generate UI mockups before new task planning (recommended)
- B) Skip — proceed directly to new task
-══════════════════════════════════════
- Recommendation: A — mockups inform better frontend task specs
-══════════════════════════════════════
-```
-
-- If user picks A → Read and execute `.cursor/skills/ui-design/SKILL.md`. After completion, auto-chain to Step 2f (New Task).
-- If user picks B → Mark Step 2ea as `skipped` in the state file, auto-chain to Step 2f (New Task).
-
----
-
-**Step 2f — New Task**
-Condition: (the autopilot state shows Step 2ea (UI Design) is completed or skipped) AND the autopilot state does NOT show Step 2f (New Task) as completed
+**Step 7 — New Task**
+Condition: the autopilot state shows Step 6 (Refactor) is completed AND the autopilot state does NOT show Step 7 (New Task) as completed
 
 Action: Read and execute `.cursor/skills/new-task/SKILL.md`
 
@@ -131,46 +95,26 @@ The new-task skill interactively guides the user through defining new functional
 
 ---
 
-**Step 2g — Implement**
-Condition: the autopilot state shows Step 2f (New Task) is completed AND `_docs/03_implementation/` does not contain a FINAL report covering the new tasks (check state for distinction between test implementation and feature implementation)
+**Step 8 — Implement**
+Condition: the autopilot state shows Step 7 (New Task) is completed AND `_docs/03_implementation/` does not contain a FINAL report covering the new tasks (check state for distinction between test implementation and feature implementation)
 
 Action: Read and execute `.cursor/skills/implement/SKILL.md`
 
-The implement skill reads the new tasks from `_docs/02_tasks/` and implements them. Tasks already implemented in Step 2d are skipped (the implement skill tracks completed tasks in batch reports).
+The implement skill reads the new tasks from `_docs/02_tasks/` and implements them. Tasks already implemented in Step 4 are skipped (the implement skill tracks completed tasks in batch reports).
 
 If `_docs/03_implementation/` has batch reports from this phase, the implement skill detects completed tasks and continues.
 
 ---
 
-**Step 2h — Run Tests**
-Condition: the autopilot state shows Step 2g (Implement) is completed AND the autopilot state does NOT show Step 2h (Run Tests) as completed
+**Step 9 — Run Tests**
+Condition: the autopilot state shows Step 8 (Implement) is completed AND the autopilot state does NOT show Step 9 (Run Tests) as completed
 
-Action: Run the full test suite to verify the implementation before deployment.
-
-1. If `scripts/run-tests.sh` exists (generated by the test-spec skill Phase 4), execute it
-2. Otherwise, detect the project's test runner manually (e.g., `pytest`, `dotnet test`, `cargo test`, `npm test`) and run all unit tests; if `docker-compose.test.yml` or an equivalent test environment exists, spin it up and run the blackbox test suite
-3. **Report results**: present a summary of passed/failed/skipped tests
-
-If all tests pass → auto-chain to Step 2hb (Security Audit).
-
-If tests fail → present using Choose format:
-
-```
-══════════════════════════════════════
- TEST RESULTS: [N passed, M failed, K skipped]
-══════════════════════════════════════
- A) Fix failing tests and re-run
- B) Proceed to deploy anyway (not recommended)
- C) Abort — fix manually
-══════════════════════════════════════
- Recommendation: A — fix failures before deploying
-══════════════════════════════════════
-```
+Action: Read and execute `.cursor/skills/test-run/SKILL.md`
 
 ---
 
-**Step 2hb — Security Audit (optional)**
-Condition: the autopilot state shows Step 2h (Run Tests) is completed AND the autopilot state does NOT show Step 2hb (Security Audit) as completed or skipped AND (`_docs/04_deploy/` does not exist or is incomplete)
+**Step 10 — Security Audit (optional)**
+Condition: the autopilot state shows Step 9 (Run Tests) is completed AND the autopilot state does NOT show Step 10 (Security Audit) as completed or skipped AND (`_docs/04_deploy/` does not exist or is incomplete)
 
 Action: Present using Choose format:
 
@@ -185,13 +129,13 @@ Action: Present using Choose format:
 ══════════════════════════════════════
 ```
 
-- If user picks A → Read and execute `.cursor/skills/security/SKILL.md`. After completion, auto-chain to Step 2i (Deploy).
-- If user picks B → Mark Step 2hb as `skipped` in the state file, auto-chain to Step 2i (Deploy).
+- If user picks A → Read and execute `.cursor/skills/security/SKILL.md`. After completion, auto-chain to Step 11 (Performance Test).
+- If user picks B → Mark Step 10 as `skipped` in the state file, auto-chain to Step 11 (Performance Test).
 
 ---
 
-**Step 2hc — Performance Test (optional)**
-Condition: the autopilot state shows Step 2hb (Security Audit) is completed or skipped AND the autopilot state does NOT show Step 2hc (Performance Test) as completed or skipped AND (`_docs/04_deploy/` does not exist or is incomplete)
+**Step 11 — Performance Test (optional)**
+Condition: the autopilot state shows Step 10 (Security Audit) is completed or skipped AND the autopilot state does NOT show Step 11 (Performance Test) as completed or skipped AND (`_docs/04_deploy/` does not exist or is incomplete)
 
 Action: Present using Choose format:
 
@@ -212,13 +156,13 @@ Action: Present using Choose format:
   2. Otherwise, check if `_docs/02_document/tests/performance-tests.md` exists for test scenarios, detect appropriate load testing tool (k6, locust, artillery, wrk, or built-in benchmarks), and execute performance test scenarios against the running system
   3. Present results vs acceptance criteria thresholds
   4. If thresholds fail → present Choose format: A) Fix and re-run, B) Proceed anyway, C) Abort
-  5. After completion, auto-chain to Step 2i (Deploy)
-- If user picks B → Mark Step 2hc as `skipped` in the state file, auto-chain to Step 2i (Deploy).
+  5. After completion, auto-chain to Step 12 (Deploy)
+- If user picks B → Mark Step 11 as `skipped` in the state file, auto-chain to Step 12 (Deploy).
 
 ---
 
-**Step 2i — Deploy**
-Condition: the autopilot state shows Step 2h (Run Tests) is completed AND (Step 2hb is completed or skipped) AND (Step 2hc is completed or skipped) AND (`_docs/04_deploy/` does not exist or is incomplete)
+**Step 12 — Deploy**
+Condition: the autopilot state shows Step 9 (Run Tests) is completed AND (Step 10 is completed or skipped) AND (Step 11 is completed or skipped) AND (`_docs/04_deploy/` does not exist or is incomplete)
 
 Action: Read and execute `.cursor/skills/deploy/SKILL.md`
 
@@ -227,7 +171,7 @@ After deployment completes, the existing-code workflow is done.
 ---
 
 **Re-Entry After Completion**
-Condition: the autopilot state shows `step: done` OR all steps through 2i (Deploy) are completed
+Condition: the autopilot state shows `step: done` OR all steps through Step 12 (Deploy) are completed
 
 Action: The project completed a full cycle. Present status and loop back to New Task:
 
@@ -243,22 +187,48 @@ Action: The project completed a full cycle. Present status and loop back to New
 ══════════════════════════════════════
 ```
 
-- If user picks A → set `step: 2f`, `status: not_started` in the state file, then auto-chain to Step 2f (New Task). Previous cycle history stays in Completed Steps.
+- If user picks A → set `step: 7`, `status: not_started` in the state file, then auto-chain to Step 7 (New Task). Previous cycle history stays in Completed Steps.
 - If user picks B → report final project status and exit.
 
 ## Auto-Chain Rules
 
 | Completed Step | Next Action |
 |---------------|-------------|
-| Document (existing code) | Auto-chain → Blackbox Test Spec (Step 2b) |
-| Blackbox Test Spec (Step 2b) | Auto-chain → Decompose Tests (Step 2c) |
-| Decompose Tests (Step 2c) | **Session boundary** — suggest new conversation before Implement Tests |
-| Implement Tests (Step 2d) | Auto-chain → Refactor (Step 2e) |
-| Refactor (Step 2e) | Auto-chain → UI Design detection (Step 2ea) |
-| UI Design (Step 2ea, done or skipped) | Auto-chain → New Task (Step 2f) |
-| New Task (Step 2f) | **Session boundary** — suggest new conversation before Implement |
-| Implement (Step 2g) | Auto-chain → Run Tests (Step 2h) |
-| Run Tests (Step 2h, all pass) | Auto-chain → Security Audit choice (Step 2hb) |
-| Security Audit (Step 2hb, done or skipped) | Auto-chain → Performance Test choice (Step 2hc) |
-| Performance Test (Step 2hc, done or skipped) | Auto-chain → Deploy (Step 2i) |
-| Deploy (Step 2i) | **Workflow complete** — existing-code flow done |
+| Document (1) | Auto-chain → Test Spec (2) |
+| Test Spec (2) | Auto-chain → Decompose Tests (3) |
+| Decompose Tests (3) | **Session boundary** — suggest new conversation before Implement Tests |
+| Implement Tests (4) | Auto-chain → Run Tests (5) |
+| Run Tests (5, all pass) | Auto-chain → Refactor (6) |
+| Refactor (6) | Auto-chain → New Task (7) |
+| New Task (7) | **Session boundary** — suggest new conversation before Implement |
+| Implement (8) | Auto-chain → Run Tests (9) |
+| Run Tests (9, all pass) | Auto-chain → Security Audit choice (10) |
+| Security Audit (10, done or skipped) | Auto-chain → Performance Test choice (11) |
+| Performance Test (11, done or skipped) | Auto-chain → Deploy (12) |
+| Deploy (12) | **Workflow complete** — existing-code flow done |
+
+## Status Summary Template
+
+```
+═══════════════════════════════════════════════════
+ AUTOPILOT STATUS (existing-code)
+═══════════════════════════════════════════════════
+ Step 1   Document            [DONE / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 2   Test Spec           [DONE / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 3   Decompose Tests     [DONE (N tasks) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 4   Implement Tests     [DONE / IN PROGRESS (batch M) / NOT STARTED / FAILED (retry N/3)]
+ Step 5   Run Tests           [DONE (N passed, M failed) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 6   Refactor            [DONE / IN PROGRESS (phase N) / NOT STARTED / FAILED (retry N/3)]
+ Step 7   New Task            [DONE (N tasks) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 8   Implement           [DONE / IN PROGRESS (batch M of ~N) / NOT STARTED / FAILED (retry N/3)]
+ Step 9   Run Tests           [DONE (N passed, M failed) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 10  Security Audit      [DONE / SKIPPED / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 11  Performance Test    [DONE / SKIPPED / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 12  Deploy              [DONE / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+═══════════════════════════════════════════════════
+ Current: Step N — Name
+ SubStep: M — [sub-skill internal step name]
+ Retry:   [N/3 if retrying, omit if 0]
+ Action:  [what will happen next]
+═══════════════════════════════════════════════════
+```
diff --git a/.cursor/skills/autopilot/flows/greenfield.md b/.cursor/skills/autopilot/flows/greenfield.md
index 859094d..04bf16f 100644
--- a/.cursor/skills/autopilot/flows/greenfield.md
+++ b/.cursor/skills/autopilot/flows/greenfield.md
@@ -1,21 +1,21 @@
 # Greenfield Workflow
 
-Workflow for new projects built from scratch. Flows linearly: Problem → Research → Plan → UI Design (if applicable) → Decompose → Implement → Run Tests → Security Audit (optional) → Deploy.
+Workflow for new projects built from scratch. Flows linearly: Problem → Research → Plan → UI Design (if applicable) → Decompose → Implement → Run Tests → Security Audit (optional) → Performance Test (optional) → Deploy.
 
 ## Step Reference Table
 
-| Step | Name      | Sub-Skill              | Internal SubSteps                     |
-|------|-----------|------------------------|---------------------------------------|
-| 0    | Problem   | problem/SKILL.md       | Phase 1–4                             |
-| 1    | Research  | research/SKILL.md      | Mode A: Phase 1–4 · Mode B: Step 0–8 |
-| 2    | Plan      | plan/SKILL.md          | Step 1–6 + Final                      |
-| 2a   | UI Design | ui-design/SKILL.md     | Phase 0–8 (conditional — UI projects only) |
-| 3    | Decompose | decompose/SKILL.md     | Step 1–4                              |
-| 4    | Implement | implement/SKILL.md     | (batch-driven, no fixed sub-steps)    |
-| 5    | Run Tests | (autopilot-managed)    | Unit tests → Blackbox tests |
-| 5b   | Security Audit | security/SKILL.md | Phase 1–5 (optional)                  |
-| 5c   | Performance Test | (autopilot-managed) | Load/stress tests (optional)        |
-| 6    | Deploy    | deploy/SKILL.md        | Step 1–7                              |
+| Step | Name | Sub-Skill | Internal SubSteps |
+|------|------|-----------|-------------------|
+| 1 | Problem | problem/SKILL.md | Phase 1–4 |
+| 2 | Research | research/SKILL.md | Mode A: Phase 1–4 · Mode B: Step 0–8 |
+| 3 | Plan | plan/SKILL.md | Step 1–6 + Final |
+| 4 | UI Design | ui-design/SKILL.md | Phase 0–8 (conditional — UI projects only) |
+| 5 | Decompose | decompose/SKILL.md | Step 1–4 |
+| 6 | Implement | implement/SKILL.md | (batch-driven, no fixed sub-steps) |
+| 7 | Run Tests | test-run/SKILL.md | Steps 1–4 |
+| 8 | Security Audit | security/SKILL.md | Phase 1–5 (optional) |
+| 9 | Performance Test | (autopilot-managed) | Load/stress tests (optional) |
+| 10 | Deploy | deploy/SKILL.md | Step 1–7 |
 
 ## Detection Rules
 
@@ -23,7 +23,7 @@ Check rules in order — first match wins.
 
 ---
 
-**Step 0 — Problem Gathering**
+**Step 1 — Problem Gathering**
 Condition: `_docs/00_problem/` does not exist, OR any of these are missing/empty:
 - `problem.md`
 - `restrictions.md`
@@ -34,14 +34,14 @@ Action: Read and execute `.cursor/skills/problem/SKILL.md`
 
 ---
 
-**Step 1 — Research (Initial)**
+**Step 2 — Research (Initial)**
 Condition: `_docs/00_problem/` is complete AND `_docs/01_solution/` has no `solution_draft*.md` files
 
 Action: Read and execute `.cursor/skills/research/SKILL.md` (will auto-detect Mode A)
 
 ---
 
-**Step 1b — Research Decision**
+**Research Decision** (inline gate between Step 2 and Step 3)
 Condition: `_docs/01_solution/` contains `solution_draft*.md` files AND `_docs/01_solution/solution.md` does not exist AND `_docs/02_document/architecture.md` does not exist
 
 Action: Present the current research state to the user:
@@ -63,11 +63,11 @@ Then present using the **Choose format**:
 ```
 
 - If user picks A → Read and execute `.cursor/skills/research/SKILL.md` (will auto-detect Mode B)
-- If user picks B → auto-chain to Step 2 (Plan)
+- If user picks B → auto-chain to Step 3 (Plan)
 
 ---
 
-**Step 2 — Plan**
+**Step 3 — Plan**
 Condition: `_docs/01_solution/` has `solution_draft*.md` files AND `_docs/02_document/architecture.md` does not exist
 
 Action:
@@ -78,8 +78,8 @@ If `_docs/02_document/` exists but is incomplete (has some artifacts but no `FIN
 
 ---
 
-**Step 2a — UI Design (conditional)**
-Condition: `_docs/02_document/architecture.md` exists AND the autopilot state does NOT show Step 2a (UI Design) as completed or skipped AND the project is a UI project
+**Step 4 — UI Design (conditional)**
+Condition: `_docs/02_document/architecture.md` exists AND the autopilot state does NOT show Step 4 (UI Design) as completed or skipped AND the project is a UI project
 
 **UI Project Detection** — the project is a UI project if ANY of the following are true:
 - `package.json` exists in the workspace root or any subdirectory
@@ -88,7 +88,7 @@ Condition: `_docs/02_document/architecture.md` exists AND the autopilot state do
 - `_docs/02_document/architecture.md` mentions frontend, UI layer, SPA, or client-side rendering
 - `_docs/01_solution/solution.md` mentions frontend, web interface, or user-facing UI
 
-If the project is NOT a UI project → mark Step 2a as `skipped` in the state file and auto-chain to Step 3.
+If the project is NOT a UI project → mark Step 4 as `skipped` in the state file and auto-chain to Step 5.
 
 If the project IS a UI project → present using Choose format:
 
@@ -104,12 +104,12 @@ If the project IS a UI project → present using Choose format:
 ══════════════════════════════════════
 ```
 
-- If user picks A → Read and execute `.cursor/skills/ui-design/SKILL.md`. After completion, auto-chain to Step 3 (Decompose).
-- If user picks B → Mark Step 2a as `skipped` in the state file, auto-chain to Step 3 (Decompose).
+- If user picks A → Read and execute `.cursor/skills/ui-design/SKILL.md`. After completion, auto-chain to Step 5 (Decompose).
+- If user picks B → Mark Step 4 as `skipped` in the state file, auto-chain to Step 5 (Decompose).
 
 ---
 
-**Step 3 — Decompose**
+**Step 5 — Decompose**
 Condition: `_docs/02_document/` contains `architecture.md` AND `_docs/02_document/components/` has at least one component AND `_docs/02_tasks/` does not exist or has no task files (excluding `_dependencies_table.md`)
 
 Action: Read and execute `.cursor/skills/decompose/SKILL.md`
@@ -118,7 +118,7 @@ If `_docs/02_tasks/` has some task files already, the decompose skill's resumabi
 
 ---
 
-**Step 4 — Implement**
+**Step 6 — Implement**
 Condition: `_docs/02_tasks/` contains task files AND `_dependencies_table.md` exists AND `_docs/03_implementation/FINAL_implementation_report.md` does not exist
 
 Action: Read and execute `.cursor/skills/implement/SKILL.md`
@@ -127,35 +127,15 @@ If `_docs/03_implementation/` has batch reports, the implement skill detects com
 
 ---
 
-**Step 5 — Run Tests**
-Condition: `_docs/03_implementation/FINAL_implementation_report.md` exists AND the autopilot state does NOT show Step 5 (Run Tests) as completed AND (`_docs/04_deploy/` does not exist or is incomplete)
+**Step 7 — Run Tests**
+Condition: `_docs/03_implementation/FINAL_implementation_report.md` exists AND the autopilot state does NOT show Step 7 (Run Tests) as completed AND (`_docs/04_deploy/` does not exist or is incomplete)
 
-Action: Run the full test suite to verify the implementation before deployment.
-
-1. If `scripts/run-tests.sh` exists (generated by the test-spec skill Phase 4), execute it
-2. Otherwise, detect the project's test runner manually (e.g., `pytest`, `dotnet test`, `cargo test`, `npm test`) and run all unit tests; if `docker-compose.test.yml` or an equivalent test environment exists, spin it up and run the blackbox test suite
-3. **Report results**: present a summary of passed/failed/skipped tests
-
-If all tests pass → auto-chain to Step 5b (Security Audit).
-
-If tests fail → present using Choose format:
-
-```
-══════════════════════════════════════
- TEST RESULTS: [N passed, M failed, K skipped]
-══════════════════════════════════════
- A) Fix failing tests and re-run
- B) Proceed to deploy anyway (not recommended)
- C) Abort — fix manually
-══════════════════════════════════════
- Recommendation: A — fix failures before deploying
-══════════════════════════════════════
-```
+Action: Read and execute `.cursor/skills/test-run/SKILL.md`
 
 ---
 
-**Step 5b — Security Audit (optional)**
-Condition: the autopilot state shows Step 5 (Run Tests) is completed AND the autopilot state does NOT show Step 5b (Security Audit) as completed or skipped AND (`_docs/04_deploy/` does not exist or is incomplete)
+**Step 8 — Security Audit (optional)**
+Condition: the autopilot state shows Step 7 (Run Tests) is completed AND the autopilot state does NOT show Step 8 (Security Audit) as completed or skipped AND (`_docs/04_deploy/` does not exist or is incomplete)
 
 Action: Present using Choose format:
 
@@ -170,13 +150,13 @@ Action: Present using Choose format:
 ══════════════════════════════════════
 ```
 
-- If user picks A → Read and execute `.cursor/skills/security/SKILL.md`. After completion, auto-chain to Step 6 (Deploy).
-- If user picks B → Mark Step 5b as `skipped` in the state file, auto-chain to Step 6 (Deploy).
+- If user picks A → Read and execute `.cursor/skills/security/SKILL.md`. After completion, auto-chain to Step 9 (Performance Test).
+- If user picks B → Mark Step 8 as `skipped` in the state file, auto-chain to Step 9 (Performance Test).
 
 ---
 
-**Step 5c — Performance Test (optional)**
-Condition: the autopilot state shows Step 5b (Security Audit) is completed or skipped AND the autopilot state does NOT show Step 5c (Performance Test) as completed or skipped AND (`_docs/04_deploy/` does not exist or is incomplete)
+**Step 9 — Performance Test (optional)**
+Condition: the autopilot state shows Step 8 (Security Audit) is completed or skipped AND the autopilot state does NOT show Step 9 (Performance Test) as completed or skipped AND (`_docs/04_deploy/` does not exist or is incomplete)
 
 Action: Present using Choose format:
 
@@ -197,13 +177,13 @@ Action: Present using Choose format:
   2. Otherwise, check if `_docs/02_document/tests/performance-tests.md` exists for test scenarios, detect appropriate load testing tool (k6, locust, artillery, wrk, or built-in benchmarks), and execute performance test scenarios against the running system
   3. Present results vs acceptance criteria thresholds
   4. If thresholds fail → present Choose format: A) Fix and re-run, B) Proceed anyway, C) Abort
-  5. After completion, auto-chain to Step 6 (Deploy)
-- If user picks B → Mark Step 5c as `skipped` in the state file, auto-chain to Step 6 (Deploy).
+  5. After completion, auto-chain to Step 10 (Deploy)
+- If user picks B → Mark Step 9 as `skipped` in the state file, auto-chain to Step 10 (Deploy).
 
 ---
 
-**Step 6 — Deploy**
-Condition: the autopilot state shows Step 5 (Run Tests) is completed AND (Step 5b is completed or skipped) AND (Step 5c is completed or skipped) AND (`_docs/04_deploy/` does not exist or is incomplete)
+**Step 10 — Deploy**
+Condition: the autopilot state shows Step 7 (Run Tests) is completed AND (Step 8 is completed or skipped) AND (Step 9 is completed or skipped) AND (`_docs/04_deploy/` does not exist or is incomplete)
 
 Action: Read and execute `.cursor/skills/deploy/SKILL.md`
 
@@ -218,14 +198,38 @@ Action: Report project completion with summary. If the user runs autopilot again
 
 | Completed Step | Next Action |
 |---------------|-------------|
-| Problem Gathering | Auto-chain → Research (Mode A) |
-| Research (any round) | Auto-chain → Research Decision (ask user: another round or proceed?) |
-| Research Decision → proceed | Auto-chain → Plan |
-| Plan | Auto-chain → UI Design detection (Step 2a) |
-| UI Design (done or skipped) | Auto-chain → Decompose |
-| Decompose | **Session boundary** — suggest new conversation before Implement |
-| Implement | Auto-chain → Run Tests (Step 5) |
-| Run Tests (all pass) | Auto-chain → Security Audit choice (Step 5b) |
-| Security Audit (done or skipped) | Auto-chain → Performance Test choice (Step 5c) |
-| Performance Test (done or skipped) | Auto-chain → Deploy (Step 6) |
-| Deploy | Report completion |
+| Problem (1) | Auto-chain → Research (2) |
+| Research (2) | Auto-chain → Research Decision (ask user: another round or proceed?) |
+| Research Decision → proceed | Auto-chain → Plan (3) |
+| Plan (3) | Auto-chain → UI Design detection (4) |
+| UI Design (4, done or skipped) | Auto-chain → Decompose (5) |
+| Decompose (5) | **Session boundary** — suggest new conversation before Implement |
+| Implement (6) | Auto-chain → Run Tests (7) |
+| Run Tests (7, all pass) | Auto-chain → Security Audit choice (8) |
+| Security Audit (8, done or skipped) | Auto-chain → Performance Test choice (9) |
+| Performance Test (9, done or skipped) | Auto-chain → Deploy (10) |
+| Deploy (10) | Report completion |
+
+## Status Summary Template
+
+```
+═══════════════════════════════════════════════════
+ AUTOPILOT STATUS (greenfield)
+═══════════════════════════════════════════════════
+ Step 1   Problem             [DONE / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 2   Research            [DONE (N drafts) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 3   Plan                [DONE / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 4   UI Design           [DONE / SKIPPED / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 5   Decompose           [DONE (N tasks) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 6   Implement           [DONE / IN PROGRESS (batch M of ~N) / NOT STARTED / FAILED (retry N/3)]
+ Step 7   Run Tests           [DONE (N passed, M failed) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 8   Security Audit      [DONE / SKIPPED / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 9   Performance Test    [DONE / SKIPPED / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+ Step 10  Deploy              [DONE / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
+═══════════════════════════════════════════════════
+ Current: Step N — Name
+ SubStep: M — [sub-skill internal step name]
+ Retry:   [N/3 if retrying, omit if 0]
+ Action:  [what will happen next]
+═══════════════════════════════════════════════════
+```
diff --git a/.cursor/skills/autopilot/protocols.md b/.cursor/skills/autopilot/protocols.md
index 18eb731..406bf72 100644
--- a/.cursor/skills/autopilot/protocols.md
+++ b/.cursor/skills/autopilot/protocols.md
@@ -63,16 +63,16 @@ Several workflow steps create work items (epics, tasks, links). The system suppo
 
 ### Steps That Require Work Item Tracker
 
-| Step | Sub-Step | Tracker Action |
-|------|----------|----------------|
-| 2 (Plan) | Step 6 — Epics | Create epics for each component |
-| 2c (Decompose Tests) | Step 1t + Step 3 — All test tasks | Create ticket per task, link to epic |
-| 2f (New Task) | Step 7 — Ticket | Create ticket per task, link to epic |
-| 3 (Decompose) | Step 1–3 — All tasks | Create ticket per task, link to epic |
+| Flow | Step | Sub-Step | Tracker Action |
+|------|------|----------|----------------|
+| greenfield | 3 (Plan) | Step 6 — Epics | Create epics for each component |
+| greenfield | 5 (Decompose) | Step 1–3 — All tasks | Create ticket per task, link to epic |
+| existing-code | 3 (Decompose Tests) | Step 1t + Step 3 — All test tasks | Create ticket per task, link to epic |
+| existing-code | 7 (New Task) | Step 7 — Ticket | Create ticket per task, link to epic |
 
 ### Authentication Gate
 
-Before entering **Step 2 (Plan)**, **Step 2c (Decompose Tests)**, **Step 2f (New Task)**, or **Step 3 (Decompose)** for the first time, the autopilot must:
+Before entering a step that requires work item tracking (see table above) for the first time, the autopilot must:
 
 1. Call `mcp_auth` on the detected tracker's MCP server
 2. If authentication succeeds → proceed normally
@@ -306,57 +306,7 @@ For steps that produce `_docs/` artifacts (problem, research, plan, decompose, d
 
 ## Status Summary
 
-On every invocation, before executing any skill, present a status summary built from the state file (with folder scan fallback). Use the template matching the active flow (see Flow Resolution in SKILL.md).
-
-### Greenfield Flow
-
-```
-═══════════════════════════════════════════════════
- AUTOPILOT STATUS (greenfield)
-═══════════════════════════════════════════════════
- Step 0   Problem             [DONE / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 1   Research            [DONE (N drafts) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 2   Plan                [DONE / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 2a  UI Design           [DONE / SKIPPED / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 3   Decompose           [DONE (N tasks) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 4   Implement           [DONE / IN PROGRESS (batch M of ~N) / NOT STARTED / FAILED (retry N/3)]
- Step 5   Run Tests           [DONE (N passed, M failed) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 5b  Security Audit      [DONE / SKIPPED / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 5c  Performance Test    [DONE / SKIPPED / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 6   Deploy              [DONE / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
-═══════════════════════════════════════════════════
- Current: Step N — Name
- SubStep: M — [sub-skill internal step name]
- Retry:   [N/3 if retrying, omit if 0]
- Action:  [what will happen next]
-═══════════════════════════════════════════════════
-```
-
-### Existing Code Flow
-
-```
-═══════════════════════════════════════════════════
- AUTOPILOT STATUS (existing-code)
-═══════════════════════════════════════════════════
- Pre      Document            [DONE / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 2b  Blackbox Test Spec  [DONE / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 2c  Decompose Tests     [DONE (N tasks) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 2d  Implement Tests     [DONE / IN PROGRESS (batch M) / NOT STARTED / FAILED (retry N/3)]
- Step 2e  Refactor            [DONE / IN PROGRESS (phase N) / NOT STARTED / FAILED (retry N/3)]
- Step 2ea UI Design           [DONE / SKIPPED / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 2f  New Task            [DONE (N tasks) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 2g  Implement           [DONE / IN PROGRESS (batch M of ~N) / NOT STARTED / FAILED (retry N/3)]
- Step 2h  Run Tests           [DONE (N passed, M failed) / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 2hb Security Audit      [DONE / SKIPPED / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 2hc Performance Test    [DONE / SKIPPED / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
- Step 2i  Deploy              [DONE / IN PROGRESS / NOT STARTED / FAILED (retry N/3)]
-═══════════════════════════════════════════════════
- Current: Step N — Name
- SubStep: M — [sub-skill internal step name]
- Retry:   [N/3 if retrying, omit if 0]
- Action:  [what will happen next]
-═══════════════════════════════════════════════════
-```
+On every invocation, before executing any skill, present a status summary built from the state file (with folder scan fallback). Use the Status Summary Template from the active flow file (`flows/greenfield.md` or `flows/existing-code.md`).
 
 For re-entry (state file exists), also include:
 - Key decisions from the state file's `Key Decisions` section
diff --git a/.cursor/skills/autopilot/state.md b/.cursor/skills/autopilot/state.md
index 50650aa..57e6444 100644
--- a/.cursor/skills/autopilot/state.md
+++ b/.cursor/skills/autopilot/state.md
@@ -10,28 +10,29 @@ The autopilot persists its state to `_docs/_autopilot_state.md`. This file is th
 # Autopilot State
 
 ## Current Step
-step: [0-6 or "2a" / "2b" / "2c" / "2d" / "2e" / "2ea" / "2f" / "2g" / "2h" / "2hb" / "2hc" / "2i" or "5b" / "5c" or "done"]
-name: [Problem / Research / Plan / UI Design / Blackbox Test Spec / Decompose Tests / Implement Tests / Refactor / UI Design / New Task / Implement / Run Tests / Security Audit / Performance Test / Deploy / Decompose / Done]
+flow: [greenfield | existing-code]
+step: [1-10 for greenfield, 1-12 for existing-code, or "done"]
+name: [step name from the active flow's Step Reference Table]
 status: [not_started / in_progress / completed / skipped / failed]
 sub_step: [optional — sub-skill internal step number + name if interrupted mid-step]
 retry_count: [0-3 — number of consecutive auto-retry attempts for current step, reset to 0 on success]
 
-## Step ↔ SubStep Reference
-(include the step reference table from the active flow file)
-
 When updating `Current Step`, always write it as:
-  step: N          ← autopilot step (0–6 or 2b/2c/2d/2e/2ea/2f/2g/2h/2hb/2hc/2i or 5b/5c)
-  sub_step: M      ← sub-skill's own internal step/phase number + name
-  retry_count: 0   ← reset on new step or success; increment on each failed retry
+  flow: existing-code   ← active flow
+  step: N               ← autopilot step (sequential integer)
+  sub_step: M           ← sub-skill's own internal step/phase number + name
+  retry_count: 0        ← reset on new step or success; increment on each failed retry
 Example:
-  step: 2
+  flow: greenfield
+  step: 3
   name: Plan
   status: in_progress
   sub_step: 4 — Architecture Review & Risk Assessment
   retry_count: 0
 Example (failed after 3 retries):
-  step: 2b
-  name: Blackbox Test Spec
+  flow: existing-code
+  step: 2
+  name: Test Spec
   status: failed
   sub_step: 1b — Test Case Generation
   retry_count: 3
@@ -40,8 +41,8 @@ Example (failed after 3 retries):
 
 | Step | Name | Completed | Key Outcome |
 |------|------|-----------|-------------|
-| 0 | Problem | [date] | [one-line summary] |
-| 1 | Research | [date] | [N drafts, final approach summary] |
+| 1 | [name] | [date] | [one-line summary] |
+| 2 | [name] | [date] | [one-line summary] |
 | ... | ... | ... | ... |
 
 ## Key Decisions
@@ -69,10 +70,10 @@ notes: [any context for next session]
 
 ### State File Rules
 
-1. **Create** the state file on the very first autopilot invocation (after state detection determines Step 0)
+1. **Create** the state file on the very first autopilot invocation (after state detection determines Step 1)
 2. **Update** the state file after every step completion, every session boundary, every BLOCKING gate confirmation, and every failed retry attempt
 3. **Read** the state file as the first action on every invocation — before folder scanning
-4. **Cross-check**: after reading the state file, verify against actual `_docs/` folder contents. If they disagree (e.g., state file says Step 2 but `_docs/02_document/architecture.md` already exists), trust the folder structure and update the state file to match
+4. **Cross-check**: after reading the state file, verify against actual `_docs/` folder contents. If they disagree (e.g., state file says Step 3 but `_docs/02_document/architecture.md` already exists), trust the folder structure and update the state file to match
 5. **Never delete** the state file. It accumulates history across the entire project lifecycle
 6. **Retry tracking**: increment `retry_count` on each failed auto-retry; reset to `0` when the step succeeds or the user manually resets. If `retry_count` reaches 3, set `status: failed` and add an entry to `Blockers`
 7. **Failed state on re-entry**: if the state file shows `status: failed` with `retry_count: 3`, do NOT auto-retry — present the blocker to the user and wait for their decision before proceeding
@@ -83,7 +84,7 @@ Read `_docs/_autopilot_state.md` first. If it exists and is consistent with the
 
 ### Folder Scan Rules (fallback)
 
-Scan `_docs/` to determine the current workflow position. The detection rules are defined in each flow file (`flows/greenfield.md` and `flows/existing-code.md`). Check the existing-code flow first (Pre-Step detection), then greenfield flow rules. First match wins.
+Scan `_docs/` to determine the current workflow position. The detection rules are defined in each flow file (`flows/greenfield.md` and `flows/existing-code.md`). Check the existing-code flow first (its Step 1 detection rule), then the greenfield flow rules. First match wins.
 
 ## Re-Entry Protocol
 
@@ -97,12 +98,12 @@ When the user invokes `/autopilot` and work already exists:
 
 ## Session Boundaries
 
-After any decompose/planning step completes (Step 2c, Step 2f, or Step 3), **do not auto-chain to implement**. Instead:
+After any decompose/planning step completes, **do not auto-chain to implement**. Instead:
 
 1. Update state file: mark the step as completed, set current step to the next implement step with status `not_started`
-   - After Step 2c (Decompose Tests) → set current step to 2d (Implement Tests)
-   - After Step 2f (New Task) → set current step to 2g (Implement)
-   - After Step 3 (Decompose) → set current step to 4 (Implement)
+   - Existing-code flow: After Step 3 (Decompose Tests) → set current step to 4 (Implement Tests)
+   - Existing-code flow: After Step 7 (New Task) → set current step to 8 (Implement)
+   - Greenfield flow: After Step 5 (Decompose) → set current step to 6 (Implement)
 2. Write `Last Session` section: `reason: session boundary`, `notes: Decompose complete, implementation ready`
 3. Present a summary: number of tasks, estimated batches, total complexity points
 4. Use Choose format:
diff --git a/.cursor/skills/test-run/SKILL.md b/.cursor/skills/test-run/SKILL.md
new file mode 100644
index 0000000..e8a52c9
--- /dev/null
+++ b/.cursor/skills/test-run/SKILL.md
@@ -0,0 +1,75 @@
+---
+name: test-run
+description: |
+  Run the project's test suite, report results, and handle failures.
+  Detects test runners automatically (pytest, dotnet test, cargo test, npm test)
+  or uses scripts/run-tests.sh if available.
+  Trigger phrases:
+  - "run tests", "test suite", "verify tests"
+category: build
+tags: [testing, verification, test-suite]
+disable-model-invocation: true
+---
+
+# Test Run
+
+Run the project's test suite and report results. This skill is invoked by the autopilot at verification checkpoints — after implementing tests, after implementing features, or at any point where the test suite must pass before proceeding.
+
+## Workflow
+
+### 1. Detect Test Runner
+
+Check in order — first match wins:
+
+1. `scripts/run-tests.sh` exists → use it
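+2. `docker-compose.test.yml` or equivalent test environment exists → spin it up first, then continue to runner auto-detection in item 3 (the environment is a prerequisite, not a runner, so it does not end the search)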
+3. Auto-detect from project files:
+   - `pytest.ini`, `pyproject.toml` with `[tool.pytest]`, or `conftest.py` → `pytest`
+   - `*.csproj` or `*.sln` → `dotnet test`
+   - `Cargo.toml` → `cargo test`
+   - `package.json` with test script → `npm test`
+   - `Makefile` with `test` target → `make test`
+
+If no runner is detected → report the failure and ask the user to specify one.
+
+### 2. Run Tests
+
+1. Execute the detected test runner
+2. Capture output: passed, failed, skipped, errors
+3. If a test environment was spun up, tear it down after tests complete
+
+### 3. Report Results
+
+Present a summary:
+
+```
+══════════════════════════════════════
+ TEST RESULTS: [N passed, M failed, K skipped]
+══════════════════════════════════════
+```
+
+### 4. Handle Outcome
+
+**All tests pass** → return success to the autopilot for auto-chain.
+
+**Tests fail** → present using Choose format:
+
+```
+══════════════════════════════════════
+ TEST RESULTS: [N passed, M failed, K skipped]
+══════════════════════════════════════
+ A) Fix failing tests and re-run
+ B) Proceed anyway (not recommended)
+ C) Abort — fix manually
+══════════════════════════════════════
+ Recommendation: A — fix failures before proceeding
+══════════════════════════════════════
+```
+
+- If user picks A → attempt to fix failures, then re-run (loop back to section 2, Run Tests)
+- If user picks B → return success with warning to the autopilot
+- If user picks C → return failure to the autopilot
+
+## Trigger Conditions
+
+This skill is invoked by the autopilot at test verification checkpoints. It is not typically invoked directly by the user.
diff --git a/.cursor/skills/test-spec/SKILL.md b/.cursor/skills/test-spec/SKILL.md
index 54a056d..7dd3e48 100644
--- a/.cursor/skills/test-spec/SKILL.md
+++ b/.cursor/skills/test-spec/SKILL.md
@@ -59,16 +59,16 @@ Every input data item MUST have a corresponding expected result that defines wha
 
 Expected results live inside `_docs/00_problem/input_data/` in one or both of:
 
-1. **Mapping file** (`input_data/expected_results.md`): a table pairing each input with its quantifiable expected output, using the format defined in `.cursor/skills/test-spec/templates/expected-results.md`
+1. **Mapping file** (`input_data/expected_results/results_report.md`): a table pairing each input with its quantifiable expected output, using the format defined in `.cursor/skills/test-spec/templates/expected-results.md`
 
 2. **Reference files folder** (`input_data/expected_results/`): machine-readable files (JSON, CSV, etc.) containing full expected outputs for complex cases, referenced from the mapping file
 
 ```
 input_data/
-├── expected_results.md          ← required: input→expected result mapping
-├── expected_results/            ← optional: complex reference files
-│   ├── image_01_detections.json
-│   └── batch_A_results.json
+├── expected_results/            ← required: expected results folder
+│   ├── results_report.md        ← required: input→expected result mapping
+│   ├── image_01_expected.csv    ← per-file expected detections
+│   └── video_01_expected.csv
 ├── image_01.jpg
 ├── empty_scene.jpg
 └── data_parameters.md
@@ -95,7 +95,7 @@ input_data/
 1. `acceptance_criteria.md` exists and is non-empty — **STOP if missing**
 2. `restrictions.md` exists and is non-empty — **STOP if missing**
 3. `input_data/` exists and contains at least one file — **STOP if missing**
-4. `input_data/expected_results.md` exists and is non-empty — **STOP if missing**. Prompt the user: *"Expected results mapping is required. Please create `_docs/00_problem/input_data/expected_results.md` pairing each input with its quantifiable expected output. Use `.cursor/skills/test-spec/templates/expected-results.md` as the format reference."*
+4. `input_data/expected_results/results_report.md` exists and is non-empty — **STOP if missing**. Prompt the user: *"Expected results mapping is required. Please create `_docs/00_problem/input_data/expected_results/results_report.md` pairing each input with its quantifiable expected output. Use `.cursor/skills/test-spec/templates/expected-results.md` as the format reference."*
 5. `problem.md` exists and is non-empty — **STOP if missing**
 6. `solution.md` exists and is non-empty — **STOP if missing**
 7. Create TESTS_OUTPUT_DIR if it does not exist
@@ -161,12 +161,12 @@ At the start of execution, create a TodoWrite with all three phases. Update stat
 2. Read `acceptance_criteria.md`, `restrictions.md`
 3. Read testing strategy from solution.md (if present)
 4. If `DOCUMENT_DIR/architecture.md` and `DOCUMENT_DIR/system-flows.md` exist, read them for additional context on system interfaces and flows
-5. Read `input_data/expected_results.md` and any referenced files in `input_data/expected_results/`
+5. Read `input_data/expected_results/results_report.md` and any referenced files in `input_data/expected_results/`
 6. Analyze `input_data/` contents against:
    - Coverage of acceptance criteria scenarios
    - Coverage of restriction edge cases
    - Coverage of testing strategy requirements
-7. Analyze `input_data/expected_results.md` completeness:
+7. Analyze `input_data/expected_results/results_report.md` completeness:
    - Every input data item has a corresponding expected result row in the mapping
    - Expected results are quantifiable (contain numeric thresholds, exact values, patterns, or file references — not vague descriptions like "works correctly" or "returns result")
    - Expected results specify a comparison method (exact match, tolerance range, pattern match, threshold) per the template
@@ -178,7 +178,7 @@ At the start of execution, create a TodoWrite with all three phases. Update stat
 | [file/data] | Yes/No | Yes/No | [missing, vague, no tolerance, etc.] |
 
 9. Threshold: at least 70% coverage of scenarios AND every covered scenario has a quantifiable expected result (see `.cursor/rules/cursor-meta.mdc` Quality Thresholds table)
-10. If coverage is low, search the internet for supplementary data, assess quality with user, and if user agrees, add to `input_data/` and update `input_data/expected_results.md`
+10. If coverage is low, search the internet for supplementary data, assess quality with user, and if user agrees, add to `input_data/` and update `input_data/expected_results/results_report.md`
 11. If expected results are missing or not quantifiable, ask user to provide them before proceeding
 
 **BLOCKING**: Do NOT proceed until user confirms both input data coverage AND expected results completeness are sufficient.
@@ -205,7 +205,7 @@ Based on all acquired data, acceptance_criteria, and restrictions, form detailed
 **Self-verification**:
 - [ ] Every acceptance criterion is covered by at least one test scenario
 - [ ] Every restriction is verified by at least one test scenario
-- [ ] Every test scenario has a quantifiable expected result from `input_data/expected_results.md`
+- [ ] Every test scenario has a quantifiable expected result from `input_data/expected_results/results_report.md`
 - [ ] Expected results use comparison methods from `.cursor/skills/test-spec/templates/expected-results.md`
 - [ ] Positive and negative scenarios are balanced
 - [ ] Consumer app has no direct access to system internals
@@ -251,7 +251,7 @@ For each row where **Input Provided?** is **No** OR **Expected Result Provided?*
 
 > **Option A — Provide the missing items**: Supply what is missing:
 > - **Missing input data**: Place test data files in `_docs/00_problem/input_data/` or indicate the location.
-> - **Missing expected result**: Provide the quantifiable expected result for this input. Update `_docs/00_problem/input_data/expected_results.md` with a row mapping the input to its expected output. If the expected result is complex, provide a reference file in `_docs/00_problem/input_data/expected_results/`. Use `.cursor/skills/test-spec/templates/expected-results.md` for format guidance.
+> - **Missing expected result**: Provide the quantifiable expected result for this input. Update `_docs/00_problem/input_data/expected_results/results_report.md` with a row mapping the input to its expected output. If the expected result is complex, provide a reference CSV file in `_docs/00_problem/input_data/expected_results/`. Use `.cursor/skills/test-spec/templates/expected-results.md` for format guidance.
 >
 > Expected results MUST be quantifiable — the test must be able to programmatically compare actual vs expected. Examples:
 > - "3 detections with bounding boxes [(x1,y1,x2,y2), ...] ± 10px"
@@ -273,7 +273,7 @@ For each item where the user chose **Option A**:
 3. Verify **quantity**: enough data samples to cover the scenario (e.g., at least N images for a batch test, multiple edge-case variants)
 
 **Expected result validation**:
-4. Verify the expected result exists in `input_data/expected_results.md` or as a referenced file in `input_data/expected_results/`
+4. Verify the expected result exists in `input_data/expected_results/results_report.md` or as a referenced file in `input_data/expected_results/`
 5. Verify **quantifiability**: the expected result can be evaluated programmatically — it must contain at least one of:
    - Exact values (counts, strings, status codes)
    - Numeric values with tolerance (e.g., `± 10px`, `≥ 0.85`)
@@ -392,7 +392,7 @@ Create `scripts/run-performance-tests.sh` at the project root. The script must:
 | Situation | Action |
 |-----------|--------|
 | Missing acceptance_criteria.md, restrictions.md, or input_data/ | **STOP** — specification cannot proceed |
-| Missing input_data/expected_results.md | **STOP** — ask user to provide expected results mapping using the template |
+| Missing input_data/expected_results/results_report.md | **STOP** — ask user to provide expected results mapping using the template |
 | Ambiguous requirements | ASK user |
 | Input data coverage below 70% (Phase 1) | Search internet for supplementary data, ASK user to validate |
 | Expected results missing or not quantifiable (Phase 1) | ASK user to provide quantifiable expected results before proceeding |
diff --git a/.cursor/skills/test-spec/templates/expected-results.md b/.cursor/skills/test-spec/templates/expected-results.md
index 0700733..315a13a 100644
--- a/.cursor/skills/test-spec/templates/expected-results.md
+++ b/.cursor/skills/test-spec/templates/expected-results.md
@@ -1,7 +1,7 @@
 # Expected Results Template
 
-Save as `_docs/00_problem/input_data/expected_results.md`.
-For complex expected outputs, create `_docs/00_problem/input_data/expected_results/` and place reference files there.
+Save as `_docs/00_problem/input_data/expected_results/results_report.md`.
+For complex expected outputs, place reference CSV files alongside it in `_docs/00_problem/input_data/expected_results/`.
 Referenced by the test-spec skill (`.cursor/skills/test-spec/SKILL.md`).
 
 ---
diff --git a/.gitignore b/.gitignore
index 564adca..7c54af8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,13 +34,14 @@ coverage.xml
 .hypothesis/
 .tox/
 
-# Binary test data
-_docs/00_problem/input_data/*.onnx
-_docs/00_problem/input_data/*.jpg
-_docs/00_problem/input_data/*.JPG
-_docs/00_problem/input_data/*.mp4
-_docs/00_problem/input_data/*.png
-_docs/00_problem/input_data/*.avi
+# Binary / media / model files
+*.onnx
+*.mp4
+*.avi
+*.jpg
+*.JPG
+*.jpeg
+*.png
 
 # Standalone skill output (ephemeral, not part of project)
 _standalone/
diff --git a/AGENTS.md b/AGENTS.md
index 6574f77..b157803 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -18,7 +18,7 @@ See [.cursor/README.md](.cursor/README.md) for the full documentation, including
 
 | Command | What it does |
 |---------|-------------|
-| `/autopilot` | Full guided workflow (problem → deploy) |
+| `/autopilot`, `/auto` | Full guided workflow (problem → deploy) |
 | `/problem` | Interactive problem gathering |
 | `/research` | Deep research methodology |
 | `/plan` | Architecture and component planning |
diff --git a/_docs/00_problem/input_data/azaion.pt b/_docs/00_problem/input_data/azaion.pt
new file mode 100644
index 0000000..5cabf08
Binary files /dev/null and b/_docs/00_problem/input_data/azaion.pt differ
diff --git a/_docs/00_problem/input_data/expected_results/image_dense01_expected.csv b/_docs/00_problem/input_data/expected_results/image_dense01_expected.csv
new file mode 100644
index 0000000..3567276
--- /dev/null
+++ b/_docs/00_problem/input_data/expected_results/image_dense01_expected.csv
@@ -0,0 +1 @@
+center_x,center_y,width,height,label,confidence_min
diff --git a/_docs/00_problem/input_data/expected_results/image_dense02_expected.csv b/_docs/00_problem/input_data/expected_results/image_dense02_expected.csv
new file mode 100644
index 0000000..3567276
--- /dev/null
+++ b/_docs/00_problem/input_data/expected_results/image_dense02_expected.csv
@@ -0,0 +1 @@
+center_x,center_y,width,height,label,confidence_min
diff --git a/_docs/00_problem/input_data/expected_results/image_different_types_expected.csv b/_docs/00_problem/input_data/expected_results/image_different_types_expected.csv
new file mode 100644
index 0000000..3567276
--- /dev/null
+++ b/_docs/00_problem/input_data/expected_results/image_different_types_expected.csv
@@ -0,0 +1 @@
+center_x,center_y,width,height,label,confidence_min
diff --git a/_docs/00_problem/input_data/expected_results/image_empty_scene_expected.csv b/_docs/00_problem/input_data/expected_results/image_empty_scene_expected.csv
new file mode 100644
index 0000000..3567276
--- /dev/null
+++ b/_docs/00_problem/input_data/expected_results/image_empty_scene_expected.csv
@@ -0,0 +1 @@
+center_x,center_y,width,height,label,confidence_min
diff --git a/_docs/00_problem/input_data/expected_results/image_large_expected.csv b/_docs/00_problem/input_data/expected_results/image_large_expected.csv
new file mode 100644
index 0000000..3567276
--- /dev/null
+++ b/_docs/00_problem/input_data/expected_results/image_large_expected.csv
@@ -0,0 +1 @@
+center_x,center_y,width,height,label,confidence_min
diff --git a/_docs/00_problem/input_data/expected_results/image_small_expected.csv b/_docs/00_problem/input_data/expected_results/image_small_expected.csv
new file mode 100644
index 0000000..3567276
--- /dev/null
+++ b/_docs/00_problem/input_data/expected_results/image_small_expected.csv
@@ -0,0 +1 @@
+center_x,center_y,width,height,label,confidence_min
diff --git a/_docs/00_problem/input_data/expected_results/results_report.md b/_docs/00_problem/input_data/expected_results/results_report.md
new file mode 100644
index 0000000..b298506
--- /dev/null
+++ b/_docs/00_problem/input_data/expected_results/results_report.md
@@ -0,0 +1,104 @@
+# Expected Results
+
+This document maps every input data item to its quantifiable expected result.
+Tests use this mapping to compare actual system output against known-correct answers.
+
+## Coordinate System
+
+All bounding box coordinates are **normalized to 0.0–1.0** relative to the full image/frame dimensions, matching the API response format:
+
+| Field | Meaning |
+|-------|---------|
+| `center_x` | Horizontal center of bounding box (0.0 = left edge, 1.0 = right edge) |
+| `center_y` | Vertical center of bounding box (0.0 = top edge, 1.0 = bottom edge) |
+| `width` | Bounding box width as fraction of image width |
+| `height` | Bounding box height as fraction of image height |
+| `label` | Class name from `classes.json` (e.g., `ArmorVehicle`, `Car`, `Person`) |
+| `confidence_min` | Minimum acceptable confidence for this detection (threshold comparison, `≥`) |
+
+Video expected-result CSVs add one field, placed first in each row:
+
+| Field | Meaning |
+|-------|---------|
+| `time_sec` | Timestamp in seconds from video start when this detection is visible |
+
+## Global Tolerances
+
+| Parameter | Tolerance | Comparison Method |
+|-----------|-----------|-------------------|
+| Bounding box coordinates (center_x, center_y, width, height) | ± 0.05 | `numeric_tolerance` |
+| Detection count | ± 2 | `numeric_tolerance` |
+| Confidence | ≥ `confidence_min` value per row | `threshold_min` |
+| Label | exact match | `exact` |
+| Video time_sec | ± 1.0s | `numeric_tolerance` |
+
+## Input → Expected Result Mapping
+
+### Images
+
+| # | Input File | Description | Expected Result File | Expected Detection Count | Notes |
+|---|------------|-------------|---------------------|-------------------------|-------|
+| 1 | `image_small.jpg` | 1280×720 aerial, contains detectable objects | `image_small_expected.csv` | ? | Primary test image for single-frame detection |
+| 2 | `image_large.JPG` | 6252×4168 aerial, triggers GSD-based tiling | `image_large_expected.csv` | ? | Coordinates normalized to full image (not tile) |
+| 3 | `image_dense01.jpg` | 1280×720 dense scene, many clustered objects | `image_dense01_expected.csv` | ? | Used for dedup and max-detection-cap tests |
+| 4 | `image_dense02.jpg` | 1920×1080 dense scene variant | `image_dense02_expected.csv` | ? | Borderline tiling, dedup variant |
+| 5 | `image_different_types.jpg` | 900×1600, varied object classes | `image_different_types_expected.csv` | ? | Must contain multiple distinct class labels |
+| 6 | `image_empty_scene.jpg` | 1920×1080, no detectable objects | `image_empty_scene_expected.csv` | 0 | CSV has headers only — zero detections expected |
+
+### Videos
+
+| # | Input File | Description | Expected Result File | Notes |
+|---|------------|-------------|---------------------|-------|
+| 7 | `video_short01.mp4` | Standard test video | `video_short01_expected.csv` | Primary async/SSE/video test. List key-frame detections. |
+| 8 | `video_short02.mp4` | Video variant | `video_short02_expected.csv` | Used for resilience and concurrent tests |
+| 9 | `video_long03.mp4` | Long video (288 MB), generates >100 SSE events | `video_long03_expected.csv` | SSE overflow test. Only key-frame samples needed. |
+
+## How to Fill
+
+### Images
+
+1. Run the model on each image (or use the detection service)
+2. Record every detection the model returns
+3. Fill one row per detection in the CSV:
+
+```
+center_x,center_y,width,height,label,confidence_min
+0.45,0.32,0.08,0.12,Car,0.25
+0.71,0.55,0.06,0.09,Person,0.25
+```
+
+4. For `image_empty_scene_expected.csv` — leave only the header row (0 detections)
+
+### Videos
+
+1. Run the model on the video (or use the detection service with `frame_period_recognition: 1`)
+2. For key frames where detections appear, record the timestamp and detections
+3. Fill one row per detection per timestamp:
+
+```
+time_sec,center_x,center_y,width,height,label,confidence_min
+2.0,0.45,0.32,0.08,0.12,Car,0.25
+2.0,0.71,0.55,0.06,0.09,Person,0.25
+4.0,0.46,0.33,0.08,0.12,Car,0.25
+```
+
+4. You don't need every single frame — sample at key moments (e.g., every 2–4 seconds) to validate detection presence and approximate positions
+
+## Non-Detection Expected Results
+
+The following test scenarios have expected results that are not per-file detections. These are defined inline in the test specs and do not need CSV files:
+
+| Scenario | Expected Result | Comparison | Defined In |
+|----------|----------------|------------|------------|
+| Empty image (FT-N-01) | HTTP 400, `"Image is empty"` | exact | `blackbox-tests.md` |
+| Corrupt image (FT-N-02) | HTTP 400 or 422 | exact | `blackbox-tests.md` |
+| Engine unavailable (FT-N-03) | HTTP 503 or 422, not 500 | exact | `blackbox-tests.md` |
+| Duplicate media_id (FT-N-04) | HTTP 409 | exact | `blackbox-tests.md` |
+| Missing classes.json (FT-N-05) | Service fails or returns empty detections | exact | `blackbox-tests.md` |
+| Health pre-init (FT-P-01) | `aiAvailability: "None"` | exact | `blackbox-tests.md` |
+| Health post-init (FT-P-02) | `aiAvailability` is neither `"None"` nor `"Downloading"` | exact | `blackbox-tests.md` |
+| Async start (FT-P-08) | `{"status": "started"}`, response < 1s | exact + threshold_max | `blackbox-tests.md` |
+| SSE completion (FT-P-09) | Final event: `mediaStatus: "AIProcessed"`, `percent: 100` | exact | `blackbox-tests.md` |
+| Max detections (NFT-RES-LIM-03) | `len(detections) ≤ 300` | threshold_max | `resource-limit-tests.md` |
+| Single image latency (NFT-PERF-01) | p95 < 5000ms (ONNX CPU) | threshold_max | `performance-tests.md` |
+| Log file naming (NFT-RES-LIM-04) | `log_inference_YYYYMMDD.txt` exists | regex | `resource-limit-tests.md` |
diff --git a/_docs/00_problem/input_data/expected_results/video_long03_expected.csv b/_docs/00_problem/input_data/expected_results/video_long03_expected.csv
new file mode 100644
index 0000000..4aba659
--- /dev/null
+++ b/_docs/00_problem/input_data/expected_results/video_long03_expected.csv
@@ -0,0 +1 @@
+time_sec,center_x,center_y,width,height,label,confidence_min
diff --git a/_docs/00_problem/input_data/expected_results/video_short01_expected.csv b/_docs/00_problem/input_data/expected_results/video_short01_expected.csv
new file mode 100644
index 0000000..4aba659
--- /dev/null
+++ b/_docs/00_problem/input_data/expected_results/video_short01_expected.csv
@@ -0,0 +1 @@
+time_sec,center_x,center_y,width,height,label,confidence_min
diff --git a/_docs/00_problem/input_data/expected_results/video_short02_expected.csv b/_docs/00_problem/input_data/expected_results/video_short02_expected.csv
new file mode 100644
index 0000000..4aba659
--- /dev/null
+++ b/_docs/00_problem/input_data/expected_results/video_short02_expected.csv
@@ -0,0 +1 @@
+time_sec,center_x,center_y,width,height,label,confidence_min
diff --git a/_docs/_autopilot_state.md b/_docs/_autopilot_state.md
index 6409c8e..63028d9 100644
--- a/_docs/_autopilot_state.md
+++ b/_docs/_autopilot_state.md
@@ -1,46 +1,29 @@
 # Autopilot State
 
 ## Current Step
-step: 2f
-name: Refactor
-status: not_started
-sub_step: —
+flow: existing-code
+step: 5
+name: Run Tests
+status: in_progress
+sub_step: 2 — Run Tests
 retry_count: 0
 
-## Step ↔ SubStep Reference
-| Step | Name                   | Sub-Skill                        | Internal SubSteps                        |
-|------|------------------------|----------------------------------|------------------------------------------|
-| 0    | Problem                | problem/SKILL.md                 | Phase 1–4                                |
-| 1    | Research               | research/SKILL.md                | Mode A: Phase 1–4 · Mode B: Step 0–8    |
-| 2    | Plan                   | plan/SKILL.md                    | Step 1–6                                 |
-| 2b   | Blackbox Test Spec     | blackbox-test-spec/SKILL.md      | Phase 1a–1b (existing code path only)    |
-| 2c   | Post-Test-Spec Decision| (autopilot decision gate)        | Refactor vs normal workflow              |
-| 2d   | Decompose Tests        | decompose/SKILL.md (tests-only)  | Step 1t + Step 3 + Step 4                |
-| 2e   | Implement Tests        | implement/SKILL.md               | (batch-driven, no fixed sub-steps)       |
-| 2f   | Refactor               | refactor/SKILL.md                | Phases 0–5 (6-phase method)              |
-| 2g   | New Task               | new-task/SKILL.md                | Steps 1–8 (loop)                         |
-| 2h   | Implement              | implement/SKILL.md               | (batch-driven, no fixed sub-steps)       |
-| 2i   | Run Tests              | (autopilot-managed)              | Unit + integration tests                 |
-| 2j   | Security Audit         | security/SKILL.md                | Phase 1–5 (optional)                     |
-| 2k   | Deploy                 | deploy/SKILL.md                  | Step 1–7                                 |
-
 ## Completed Steps
 
 | Step | Name | Completed | Key Outcome |
 |------|------|-----------|-------------|
-| — | Document (pre-step) | 2026-03-21 | 10 modules, 4 components, full _docs/ generated from existing codebase |
-| 2b | Blackbox Test Spec | 2026-03-21 | 39 test scenarios (16 positive, 8 negative, 11 non-functional), 85% total coverage, 5 artifacts produced |
-| 2c | Post-Test-Spec Decision | 2026-03-22 | User chose refactor path (A) |
-| 2d | Decompose Tests | 2026-03-23 | 11 tasks (AZ-138..AZ-148), 35 complexity points, 3 batches. Phase 3 test data gate PASSED: 39/39 scenarios validated, 12 data files provided. |
-| 2e | Implement Tests | 2026-03-23 | 11 tasks implemented across 4 batches, 38 tests (2 skipped), all code reviews PASS_WITH_WARNINGS. Commits: 5418bd7, a469579, 861d4f0, f0e3737. |
+| 1 | Document | 2026-03-21 | 10 modules, 4 components, full _docs/ generated from existing codebase |
+| 2 | Test Spec | 2026-03-21 | 39 test scenarios (16 positive, 8 negative, 11 non-functional), 85% total coverage, 5 artifacts produced |
+| 3 | Decompose Tests | 2026-03-23 | 11 tasks (AZ-138..AZ-148), 35 complexity points, 3 batches. Phase 3 test data gate PASSED: 39/39 scenarios validated, 12 data files provided. |
+| 4 | Implement Tests | 2026-03-23 | 11 tasks implemented across 4 batches, 38 tests (2 skipped), all code reviews PASS_WITH_WARNINGS. Commits: 5418bd7, a469579, 861d4f0, f0e3737. |
 
 ## Key Decisions
-- User chose B: Document existing codebase before proceeding
+- User chose to document existing codebase before proceeding
 - Component breakdown: 4 components (Domain, Inference Engines, Inference Pipeline, API)
 - Verification: 4 legacy issues found and documented (unused serialize/from_msgpack, orphaned queue declarations)
 - Input data coverage approved at ~90% (Phase 1a)
 - Test coverage approved at 85% (21/22 AC, 13/18 restrictions) with all gaps justified
-- User chose A: Refactor path (decompose tests → implement tests → refactor)
+- User chose refactor path (decompose tests → implement tests → refactor)
 - Integration Tests Epic: AZ-137
 - Test Infrastructure: AZ-138 (5 pts)
 - 10 integration test tasks decomposed: AZ-139 through AZ-148 (30 pts)
@@ -51,10 +34,10 @@ retry_count: 0
 - Jira MCP auth skipped — tickets not transitioned to In Testing
 
 ## Last Session
-date: 2026-03-23
-ended_at: Step 2e Implement Tests — COMPLETE. All 11 tasks, 38 tests, 4 batches.
+date: 2026-03-24
+ended_at: Step 4 Implement Tests — COMPLETE. All 11 tasks, 38 tests, 4 batches.
 reason: step completed, context limit approaching
-notes: All integration tests implemented and committed. Next step: 2f Refactor. The refactor skill runs a 6-phase method using the implemented tests as a safety net. Recommend fresh conversation for better context management.
+notes: All integration tests implemented and committed. Next step: 5 Run Tests — verify tests pass before proceeding to refactor. Recommend fresh conversation for better context management.
 
 ## Blockers
 - none
diff --git a/constants_inf.pxd b/constants_inf.pxd
index f5573eb..59fb7a0 100644
--- a/constants_inf.pxd
+++ b/constants_inf.pxd
@@ -21,7 +21,7 @@ cdef log(str log_message)
 cdef logerror(str error)
 cdef format_time(int ms)
 
-cdef dict[int, AnnotationClass] annotations_dict
+cdef dict annotations_dict
 
 cdef class AnnotationClass:
     cdef public int id
diff --git a/constants_inf.pyx b/constants_inf.pyx
index 4b515bf..a94e6ef 100644
--- a/constants_inf.pyx
+++ b/constants_inf.pyx
@@ -70,6 +70,9 @@ logger.add(
     colorize=True
 )
 
+def get_annotations_dict():
+    return annotations_dict
+
 cdef log(str log_message):
     logger.info(log_message)
 
diff --git a/e2e/docker-compose.test.yml b/e2e/docker-compose.test.yml
index 4f52a05..324105b 100644
--- a/e2e/docker-compose.test.yml
+++ b/e2e/docker-compose.test.yml
@@ -65,7 +65,7 @@ services:
       - ./results:/results
     networks:
       - e2e-net
-    command: ["pytest", "--csv=/results/report.csv", "-v"]
+    command: ["pytest", "--csv=/results/report.csv", "-v", "-x"]
 
 networks:
   e2e-net:
diff --git a/inference.pyx b/inference.pyx
index f6153f6..81acee1 100644
--- a/inference.pyx
+++ b/inference.pyx
@@ -264,7 +264,9 @@ cdef class Inference:
         if frame is None:
             raise ValueError("Invalid image data")
 
-        input_blob = self.preprocess([frame])
+        cdef int bs = self.engine.get_batch_size()
+        frames = [frame] * bs
+        input_blob = self.preprocess(frames)
         outputs = self.engine.run(input_blob)
         list_detections = self.postprocess(outputs, ai_config)
         if list_detections:
diff --git a/main.py b/main.py
index 1e8d3f5..c3f0f26 100644
--- a/main.py
+++ b/main.py
@@ -109,9 +109,10 @@ class AIConfigDto(BaseModel):
 
 def detection_to_dto(det) -> DetectionDto:
     import constants_inf
+    ann = constants_inf.get_annotations_dict()
     label = ""
-    if det.cls in constants_inf.annotations_dict:
-        label = constants_inf.annotations_dict[det.cls].name
+    if det.cls in ann:
+        label = ann[det.cls].name
     return DetectionDto(
         centerX=det.x,
         centerY=det.y,
diff --git a/onnx_engine.pxd b/onnx_engine.pxd
new file mode 100644
index 0000000..55ab2fa
--- /dev/null
+++ b/onnx_engine.pxd
@@ -0,0 +1,13 @@
+from inference_engine cimport InferenceEngine
+
+
+cdef class OnnxEngine(InferenceEngine):
+
+    cdef public object session
+    cdef object model_inputs
+    cdef str input_name
+    cdef object input_shape
+
+    cdef tuple get_input_shape(self)
+    cdef int get_batch_size(self)
+    cdef run(self, input_data)