Jonathan Haas commited on
Commit ·
944b4e2
1
Parent(s): 4f4578a
Complete remaining Jarvis roadmap: proactive, trust, planner, integrations, and deployment tracks
Browse files- .github/workflows/assistant-quality-report.yml +46 -0
- .github/workflows/release-acceptance.yml +41 -0
- .gitignore +3 -0
- Dockerfile +21 -0
- Makefile +17 -1
- README.md +26 -0
- TODO.md +44 -44
- config/release-channels.json +27 -0
- deploy/home-assistant-addon/Dockerfile +15 -0
- deploy/home-assistant-addon/README.md +16 -0
- deploy/home-assistant-addon/config.yaml +25 -0
- docker-compose.yml +15 -0
- docs/evals/assistant-contract.json +18 -0
- scripts/bootstrap.sh +43 -0
- scripts/check_release_channel.py +66 -0
- scripts/generate_quality_report.py +112 -0
- scripts/release_acceptance.sh +37 -0
- scripts/run_eval_dataset.py +75 -0
- src/jarvis/brain.py +9 -0
- src/jarvis/tools/services.py +0 -0
- tests/test_tools_services.py +133 -2
.github/workflows/assistant-quality-report.yml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Assistant Quality Report
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
schedule:
|
| 5 |
+
- cron: "0 10 * * 1"
|
| 6 |
+
workflow_dispatch:
|
| 7 |
+
|
| 8 |
+
concurrency:
|
| 9 |
+
group: assistant-quality-report-${{ github.workflow }}-${{ github.ref }}
|
| 10 |
+
cancel-in-progress: false
|
| 11 |
+
|
| 12 |
+
jobs:
|
| 13 |
+
quality-report:
|
| 14 |
+
runs-on: ubuntu-latest
|
| 15 |
+
timeout-minutes: 30
|
| 16 |
+
steps:
|
| 17 |
+
- name: Checkout
|
| 18 |
+
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
| 19 |
+
|
| 20 |
+
- name: Set up Python
|
| 21 |
+
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
|
| 22 |
+
with:
|
| 23 |
+
python-version: "3.12"
|
| 24 |
+
|
| 25 |
+
- name: Install uv
|
| 26 |
+
uses: astral-sh/setup-uv@8d55fbecc275b1c35dbe060458839f8d30439ccf # v3
|
| 27 |
+
|
| 28 |
+
- name: Sync dependencies
|
| 29 |
+
run: uv sync --extra dev
|
| 30 |
+
|
| 31 |
+
- name: Generate weekly report
|
| 32 |
+
run: |
|
| 33 |
+
mkdir -p .artifacts/quality
|
| 34 |
+
./scripts/generate_quality_report.py --output-dir .artifacts/quality --markdown > .artifacts/quality/summary.json
|
| 35 |
+
|
| 36 |
+
- name: Run deterministic eval dataset
|
| 37 |
+
run: |
|
| 38 |
+
./scripts/run_eval_dataset.py docs/evals/assistant-contract.json --output .artifacts/quality/eval.json --strict || true
|
| 39 |
+
|
| 40 |
+
- name: Upload quality artifacts
|
| 41 |
+
if: always()
|
| 42 |
+
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
|
| 43 |
+
with:
|
| 44 |
+
name: assistant-quality-${{ github.run_id }}
|
| 45 |
+
path: .artifacts/quality
|
| 46 |
+
if-no-files-found: warn
|
.github/workflows/release-acceptance.yml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Release Acceptance
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
workflow_dispatch:
|
| 5 |
+
inputs:
|
| 6 |
+
profile:
|
| 7 |
+
description: "Acceptance profile"
|
| 8 |
+
type: choice
|
| 9 |
+
default: fast
|
| 10 |
+
options:
|
| 11 |
+
- fast
|
| 12 |
+
- full
|
| 13 |
+
|
| 14 |
+
concurrency:
|
| 15 |
+
group: release-acceptance-${{ github.workflow }}-${{ github.ref }}
|
| 16 |
+
cancel-in-progress: true
|
| 17 |
+
|
| 18 |
+
jobs:
|
| 19 |
+
acceptance:
|
| 20 |
+
runs-on: ubuntu-latest
|
| 21 |
+
timeout-minutes: 40
|
| 22 |
+
steps:
|
| 23 |
+
- name: Checkout
|
| 24 |
+
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
| 25 |
+
|
| 26 |
+
- name: Set up Python
|
| 27 |
+
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
|
| 28 |
+
with:
|
| 29 |
+
python-version: "3.12"
|
| 30 |
+
|
| 31 |
+
- name: Install uv
|
| 32 |
+
uses: astral-sh/setup-uv@8d55fbecc275b1c35dbe060458839f8d30439ccf # v3
|
| 33 |
+
|
| 34 |
+
- name: Sync dependencies
|
| 35 |
+
run: uv sync --extra dev
|
| 36 |
+
|
| 37 |
+
- name: Validate release channel (stable)
|
| 38 |
+
run: ./scripts/check_release_channel.py --channel stable
|
| 39 |
+
|
| 40 |
+
- name: Run release acceptance suite
|
| 41 |
+
run: ./scripts/release_acceptance.sh "${{ github.event.inputs.profile }}"
|
.gitignore
CHANGED
|
@@ -11,3 +11,6 @@ __pycache__/
|
|
| 11 |
|
| 12 |
# Third-party clones used for reference only
|
| 13 |
vendor/
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Third-party clones used for reference only
|
| 13 |
vendor/
|
| 14 |
+
|
| 15 |
+
# Local run artifacts
|
| 16 |
+
.artifacts/
|
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 4 |
+
PYTHONUNBUFFERED=1 \
|
| 5 |
+
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
| 6 |
+
UV_LINK_MODE=copy
|
| 7 |
+
|
| 8 |
+
RUN apt-get update \
|
| 9 |
+
&& apt-get install -y --no-install-recommends curl ca-certificates \
|
| 10 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
+
|
| 12 |
+
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 13 |
+
ENV PATH="/root/.local/bin:${PATH}"
|
| 14 |
+
|
| 15 |
+
WORKDIR /app
|
| 16 |
+
COPY pyproject.toml uv.lock ./
|
| 17 |
+
RUN uv sync --frozen --extra dev
|
| 18 |
+
|
| 19 |
+
COPY . .
|
| 20 |
+
|
| 21 |
+
CMD ["uv", "run", "python", "-m", "jarvis", "--sim", "--no-vision"]
|
Makefile
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
.PHONY: check test-fast test-faults test-fault-profiles test-soak security-gate
|
|
|
|
| 2 |
|
| 3 |
check:
|
| 4 |
uv run ruff check src tests
|
|
@@ -18,3 +19,18 @@ test-soak:
|
|
| 18 |
|
| 19 |
security-gate:
|
| 20 |
./scripts/security_gate.sh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.PHONY: check test-fast test-faults test-fault-profiles test-soak security-gate \
|
| 2 |
+
bootstrap quality-report eval-dataset release-channel-check release-acceptance
|
| 3 |
|
| 4 |
check:
|
| 5 |
uv run ruff check src tests
|
|
|
|
| 19 |
|
| 20 |
security-gate:
|
| 21 |
./scripts/security_gate.sh
|
| 22 |
+
|
| 23 |
+
bootstrap:
|
| 24 |
+
./scripts/bootstrap.sh
|
| 25 |
+
|
| 26 |
+
quality-report:
|
| 27 |
+
./scripts/generate_quality_report.py --output-dir .artifacts/quality --markdown
|
| 28 |
+
|
| 29 |
+
eval-dataset:
|
| 30 |
+
./scripts/run_eval_dataset.py docs/evals/assistant-contract.json --output .artifacts/quality/eval.json --strict
|
| 31 |
+
|
| 32 |
+
release-channel-check:
|
| 33 |
+
./scripts/check_release_channel.py --channel stable
|
| 34 |
+
|
| 35 |
+
release-acceptance:
|
| 36 |
+
./scripts/release_acceptance.sh fast
|
README.md
CHANGED
|
@@ -163,6 +163,7 @@ Smart home safety defaults:
|
|
| 163 |
- `system_status.integrations.*.circuit_breaker` (open/remaining/failure state per integration)
|
| 164 |
- `system_status.recovery_journal` (interrupted-action reconciliation summary)
|
| 165 |
- `system_status.dead_letter_queue` (failed outbound delivery queue with replay status)
|
|
|
|
| 166 |
- `jarvis_scorecard` (standalone scorecard payload for dashboards and alerts)
|
| 167 |
- `system_status_contract` (stable required-field contract)
|
| 168 |
- Memory retrieval now includes confidence/provenance details:
|
|
@@ -203,6 +204,13 @@ Smart home safety defaults:
|
|
| 203 |
- Skills developer guide: [`docs/operations/skills-development.md`](docs/operations/skills-development.md).
|
| 204 |
- Provenance verification: [`docs/operations/provenance-verification.md`](docs/operations/provenance-verification.md).
|
| 205 |
- Incident response: [`docs/operations/incident-response.md`](docs/operations/incident-response.md).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
- Todoist integration:
|
| 207 |
- `TODOIST_PERMISSION_PROFILE=readonly|control`
|
| 208 |
- `readonly` allows `todoist_list_tasks` and denies `todoist_add_task`
|
|
@@ -235,6 +243,24 @@ Smart home safety defaults:
|
|
| 235 |
- failed outbound webhook/channel/email/push attempts are queued for operator replay:
|
| 236 |
- `dead_letter_list` to inspect queue state
|
| 237 |
- `dead_letter_replay` to retry specific or filtered entries
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
### First-Time Operator Checklist
|
| 240 |
|
|
|
|
| 163 |
- `system_status.integrations.*.circuit_breaker` (open/remaining/failure state per integration)
|
| 164 |
- `system_status.recovery_journal` (interrupted-action reconciliation summary)
|
| 165 |
- `system_status.dead_letter_queue` (failed outbound delivery queue with replay status)
|
| 166 |
+
- `system_status.expansion` (proactive, trust, orchestration, planner, quality, embodiment, integration roadmap feature snapshot)
|
| 167 |
- `jarvis_scorecard` (standalone scorecard payload for dashboards and alerts)
|
| 168 |
- `system_status_contract` (stable required-field contract)
|
| 169 |
- Memory retrieval now includes confidence/provenance details:
|
|
|
|
| 204 |
- Skills developer guide: [`docs/operations/skills-development.md`](docs/operations/skills-development.md).
|
| 205 |
- Provenance verification: [`docs/operations/provenance-verification.md`](docs/operations/provenance-verification.md).
|
| 206 |
- Incident response: [`docs/operations/incident-response.md`](docs/operations/incident-response.md).
|
| 207 |
+
- Release acceptance: run `./scripts/release_acceptance.sh fast|full`.
|
| 208 |
+
- Release channel checks: run `./scripts/check_release_channel.py --channel dev|beta|stable`.
|
| 209 |
+
- Weekly quality artifact: run `./scripts/generate_quality_report.py --output-dir .artifacts/quality --markdown`.
|
| 210 |
+
- Deterministic eval dataset runner: run `./scripts/run_eval_dataset.py docs/evals/assistant-contract.json --strict`.
|
| 211 |
+
- One-command host bootstrap: run `./scripts/bootstrap.sh`.
|
| 212 |
+
- Container profile: `docker compose up --build` (simulation/no-vision default).
|
| 213 |
+
- Home Assistant add-on starter path: [`deploy/home-assistant-addon`](deploy/home-assistant-addon).
|
| 214 |
- Todoist integration:
|
| 215 |
- `TODOIST_PERMISSION_PROFILE=readonly|control`
|
| 216 |
- `readonly` allows `todoist_list_tasks` and denies `todoist_add_task`
|
|
|
|
| 243 |
- failed outbound webhook/channel/email/push attempts are queued for operator replay:
|
| 244 |
- `dead_letter_list` to inspect queue state
|
| 245 |
- `dead_letter_replay` to retry specific or filtered entries
|
| 246 |
+
- Proactive assistant workflows (`proactive_assistant`):
|
| 247 |
+
- `briefing`, `anomaly_scan`, `routine_suggestions`, `follow_through`, `event_digest`
|
| 248 |
+
- Memory governance (`memory_governance`):
|
| 249 |
+
- per-user partition overlays + duplication/contradiction/staleness audits + cleanup
|
| 250 |
+
- Identity and trust controls (`identity_trust`):
|
| 251 |
+
- session confidence scoring, domain trust-policy management, guest-mode sessions, household profile admin
|
| 252 |
+
- Home orchestration (`home_orchestrator`):
|
| 253 |
+
- intent-to-plan decomposition, preflighted multi-entity execution with partial failure reporting, area policy constraints, automation suggestions, long-running task tracking
|
| 254 |
+
- Skills governance (`skills_governance`):
|
| 255 |
+
- capability negotiation, dependency health, quotas, harness runs, bundle signing metadata, sandbox templates
|
| 256 |
+
- Planning and autonomy (`planner_engine`):
|
| 257 |
+
- planner/executor split output, task graphs with checkpoint/resume, deferred scheduling, self-critique
|
| 258 |
+
- Quality and evaluation (`quality_evaluator`):
|
| 259 |
+
- weekly report generation + deterministic dataset-runner summary
|
| 260 |
+
- Embodiment roadmap controls (`embodiment_presence`):
|
| 261 |
+
- micro-expression library, user gaze calibration, adaptive gesture envelopes, privacy posture, motion safety envelope
|
| 262 |
+
- Integration workflows (`integration_hub`):
|
| 263 |
+
- calendar CRUD policy flow, notes capture backends, messaging draft/review/send flow, commute briefs, shopping orchestration, policy-gated research workflow
|
| 264 |
|
| 265 |
### First-Time Operator Checklist
|
| 266 |
|
TODO.md
CHANGED
|
@@ -40,12 +40,12 @@ This is a fresh roadmap focused on what is still missing for a "feels-like-Jarvi
|
|
| 40 |
|
| 41 |
## 3) Proactive Assistant Behavior (6 items)
|
| 42 |
|
| 43 |
-
- [
|
| 44 |
-
- [
|
| 45 |
- [x] `PX03` Add "nudge policy" (when to interrupt vs defer) with user-configurable quiet windows. `P0`
|
| 46 |
-
- [
|
| 47 |
-
- [
|
| 48 |
-
- [
|
| 49 |
|
| 50 |
## 4) Memory and Personalization (6 items)
|
| 51 |
|
|
@@ -53,25 +53,25 @@ This is a fresh roadmap focused on what is still missing for a "feels-like-Jarvi
|
|
| 53 |
- [x] `MX02` Add "memory confidence" and "source trail" to prevent stale or hallucinated recall. `P0`
|
| 54 |
- [x] `MX03` Add memory correction flow ("forget this", "update that") as first-class voice commands. `P0`
|
| 55 |
- [x] `MX04` Add episodic timeline snapshots for recent important conversations/actions. `P1`
|
| 56 |
-
- [
|
| 57 |
-
- [
|
| 58 |
|
| 59 |
## 5) Multi-User Identity and Trust (6 items)
|
| 60 |
|
| 61 |
-
- [
|
| 62 |
-
- [
|
| 63 |
- [x] `IX03` Add step-up verification path for high-risk requests (spoken code or operator approval). `P0`
|
| 64 |
-
- [
|
| 65 |
-
- [
|
| 66 |
- [x] `IX06` Add audit explainability: record why an action was allowed/blocked in user-readable terms. `P1`
|
| 67 |
|
| 68 |
## 6) Home Intelligence and Automation (6 items)
|
| 69 |
|
| 70 |
-
- [
|
| 71 |
-
- [
|
| 72 |
-
- [
|
| 73 |
-
- [
|
| 74 |
-
- [
|
| 75 |
- [x] `HX06` Add idempotent action guardrails to avoid repeated toggles during ambiguous dialogue. `P1`
|
| 76 |
|
| 77 |
## 7) Operator Surfaces and Control (6 items)
|
|
@@ -85,21 +85,21 @@ This is a fresh roadmap focused on what is still missing for a "feels-like-Jarvi
|
|
| 85 |
|
| 86 |
## 8) Skills Ecosystem and Extensibility (6 items)
|
| 87 |
|
| 88 |
-
- [
|
| 89 |
-
- [
|
| 90 |
-
- [
|
| 91 |
-
- [
|
| 92 |
-
- [
|
| 93 |
-
- [
|
| 94 |
|
| 95 |
## 9) Planning and Autonomy (6 items)
|
| 96 |
|
| 97 |
-
- [
|
| 98 |
-
- [
|
| 99 |
-
- [
|
| 100 |
- [x] `AX04` Add ambiguity detector to request clarifications before risky plan execution. `P0`
|
| 101 |
- [x] `AX05` Add human-readable plan preview before execution for medium/high-risk actions. `P0`
|
| 102 |
-
- [
|
| 103 |
|
| 104 |
## 10) Reliability and Runtime Safety (6 items)
|
| 105 |
|
|
@@ -115,36 +115,36 @@ This is a fresh roadmap focused on what is still missing for a "feels-like-Jarvi
|
|
| 115 |
- [x] `EX01` Add intent-level success metrics (answer quality, completion success, correction frequency). `P0`
|
| 116 |
- [x] `EX02` Add percentile dashboards for end-to-end turn latency by mode and tool mix. `P1`
|
| 117 |
- [x] `EX03` Add policy-decision analytics (allow/deny reason distribution by user and tool). `P1`
|
| 118 |
-
- [
|
| 119 |
-
- [
|
| 120 |
- [x] `EX06` Add "Jarvis scorecard" combining latency, reliability, initiative, and trust metrics. `P1`
|
| 121 |
|
| 122 |
## 12) Embodiment and Presence (6 items)
|
| 123 |
|
| 124 |
-
- [
|
| 125 |
- [x] `BX02` Add conversational turn choreography (listen lean-in, think glance-away, answer lock-on). `P0`
|
| 126 |
-
- [
|
| 127 |
-
- [
|
| 128 |
-
- [
|
| 129 |
-
- [
|
| 130 |
|
| 131 |
## 13) Integrations and Productivity Surface (6 items)
|
| 132 |
|
| 133 |
-
- [
|
| 134 |
-
- [
|
| 135 |
-
- [
|
| 136 |
-
- [
|
| 137 |
-
- [
|
| 138 |
-
- [
|
| 139 |
|
| 140 |
## 14) Packaging, Deployment, and Ecosystem Fit (6 items)
|
| 141 |
|
| 142 |
-
- [
|
| 143 |
-
- [
|
| 144 |
- [x] `DX03` Add backup/restore CLI for memory, audit, runtime state, and operator settings. `P1`
|
| 145 |
-
- [
|
| 146 |
-
- [
|
| 147 |
-
- [
|
| 148 |
|
| 149 |
---
|
| 150 |
|
|
|
|
| 40 |
|
| 41 |
## 3) Proactive Assistant Behavior (6 items)
|
| 42 |
|
| 43 |
+
- [x] `PX01` Add proactive briefing engine (morning/evening) built from calendar, reminders, weather, and home state. `P0`
|
| 44 |
+
- [x] `PX02` Add proactive anomaly notifications (device offline, unusual temp, missed reminder). `P0`
|
| 45 |
- [x] `PX03` Add "nudge policy" (when to interrupt vs defer) with user-configurable quiet windows. `P0`
|
| 46 |
+
- [x] `PX04` Add routine suggestions based on repeated behavior patterns (opt-in only). `P1`
|
| 47 |
+
- [x] `PX05` Add proactive follow-through ("I can do that now" for pending tasks after confirmations). `P1`
|
| 48 |
+
- [x] `PX06` Add proactive event summarization with digest and snooze controls. `P1`
|
| 49 |
|
| 50 |
## 4) Memory and Personalization (6 items)
|
| 51 |
|
|
|
|
| 53 |
- [x] `MX02` Add "memory confidence" and "source trail" to prevent stale or hallucinated recall. `P0`
|
| 54 |
- [x] `MX03` Add memory correction flow ("forget this", "update that") as first-class voice commands. `P0`
|
| 55 |
- [x] `MX04` Add episodic timeline snapshots for recent important conversations/actions. `P1`
|
| 56 |
+
- [x] `MX05` Add per-user memory partitions with shared/common memory overlays. `P1`
|
| 57 |
+
- [x] `MX06` Add memory quality audits (duplication, contradiction, stale data) with cleanup tools. `P1`
|
| 58 |
|
| 59 |
## 5) Multi-User Identity and Trust (6 items)
|
| 60 |
|
| 61 |
+
- [x] `IX01` Add session-level identity confidence score from voice context + operator hints. `P0`
|
| 62 |
+
- [x] `IX02` Add per-user trust policies for high-risk domains (locks, alarms, purchases, external messages). `P0`
|
| 63 |
- [x] `IX03` Add step-up verification path for high-risk requests (spoken code or operator approval). `P0`
|
| 64 |
+
- [x] `IX04` Add "guest mode" with constrained capabilities and automatic expiry. `P1`
|
| 65 |
+
- [x] `IX05` Add household profile management in operator UI (users, roles, trust, exceptions). `P1`
|
| 66 |
- [x] `IX06` Add audit explainability: record why an action was allowed/blocked in user-readable terms. `P1`
|
| 67 |
|
| 68 |
## 6) Home Intelligence and Automation (6 items)
|
| 69 |
|
| 70 |
+
- [x] `HX01` Add intent-to-plan decomposition for complex home requests ("movie mode", "bedtime routine"). `P0`
|
| 71 |
+
- [x] `HX02` Add safe multi-entity execution with preflight checks and partial-failure reporting. `P0`
|
| 72 |
+
- [x] `HX03` Add area-level policy constraints (e.g., no loud actions in bedroom after quiet hours). `P0`
|
| 73 |
+
- [x] `HX04` Add Home Assistant automation suggestion mode with review before creation. `P1`
|
| 74 |
+
- [x] `HX05` Add long-running home task tracking (start, in-progress, completed) in status and operator UI. `P1`
|
| 75 |
- [x] `HX06` Add idempotent action guardrails to avoid repeated toggles during ambiguous dialogue. `P1`
|
| 76 |
|
| 77 |
## 7) Operator Surfaces and Control (6 items)
|
|
|
|
| 85 |
|
| 86 |
## 8) Skills Ecosystem and Extensibility (6 items)
|
| 87 |
|
| 88 |
+
- [x] `SX01` Add skill capability negotiation so planner can reason about tool quality and reliability. `P1`
|
| 89 |
+
- [x] `SX02` Add skill dependency graph and health reporting (missing deps, version conflicts). `P1`
|
| 90 |
+
- [x] `SX03` Add per-skill runtime quotas (rate, CPU time, outbound calls). `P1`
|
| 91 |
+
- [x] `SX04` Add skill test harness CLI and fixture-based contract validation. `P1`
|
| 92 |
+
- [x] `SX05` Add signed skill distribution bundle format with integrity metadata. `P2`
|
| 93 |
+
- [x] `SX06` Add skill sandbox policy templates (`read-only`, `network-limited`, `local-only`). `P1`
|
| 94 |
|
| 95 |
## 9) Planning and Autonomy (6 items)
|
| 96 |
|
| 97 |
+
- [x] `AX01` Add explicit planner/executor split with retry policy and rollback hints. `P0`
|
| 98 |
+
- [x] `AX02` Add task graph execution for multi-step goals with checkpointing and resume. `P1`
|
| 99 |
+
- [x] `AX03` Add dependency-aware scheduling for deferred actions and reminders. `P1`
|
| 100 |
- [x] `AX04` Add ambiguity detector to request clarifications before risky plan execution. `P0`
|
| 101 |
- [x] `AX05` Add human-readable plan preview before execution for medium/high-risk actions. `P0`
|
| 102 |
+
- [x] `AX06` Add planner self-critique pass for expensive/complex plans before commit. `P2`
|
| 103 |
|
| 104 |
## 10) Reliability and Runtime Safety (6 items)
|
| 105 |
|
|
|
|
| 115 |
- [x] `EX01` Add intent-level success metrics (answer quality, completion success, correction frequency). `P0`
|
| 116 |
- [x] `EX02` Add percentile dashboards for end-to-end turn latency by mode and tool mix. `P1`
|
| 117 |
- [x] `EX03` Add policy-decision analytics (allow/deny reason distribution by user and tool). `P1`
|
| 118 |
+
- [x] `EX04` Add weekly automated "assistant quality report" artifact (errors, regressions, wins). `P1`
|
| 119 |
+
- [x] `EX05` Add evaluation dataset runner for deterministic prompt/tool contract tests. `P1`
|
| 120 |
- [x] `EX06` Add "Jarvis scorecard" combining latency, reliability, initiative, and trust metrics. `P1`
|
| 121 |
|
| 122 |
## 12) Embodiment and Presence (6 items)
|
| 123 |
|
| 124 |
+
- [x] `BX01` Add richer stateful micro-expression library mapped to dialogue intent and certainty. `P1`
|
| 125 |
- [x] `BX02` Add conversational turn choreography (listen lean-in, think glance-away, answer lock-on). `P0`
|
| 126 |
+
- [x] `BX03` Add user-specific gaze behavior calibration for desk distance and seating position. `P1`
|
| 127 |
+
- [x] `BX04` Add adaptive speaking gesture envelopes based on response emotion/importance. `P1`
|
| 128 |
+
- [x] `BX05` Add explicit "privacy posture" transitions on mute/sensitive operations. `P0`
|
| 129 |
+
- [x] `BX06` Add motion safety envelopes linked to runtime context (proximity, hardware state). `P0`
|
| 130 |
|
| 131 |
## 13) Integrations and Productivity Surface (6 items)
|
| 132 |
|
| 133 |
+
- [x] `GX01` Add richer calendar actions (create/update/delete with confirmation policy). `P1`
|
| 134 |
+
- [x] `GX02` Add notes/knowledge capture integration (Obsidian/Notion/local markdown) with trust controls. `P1`
|
| 135 |
+
- [x] `GX03` Add messaging assistant workflows (draft/review/send) for Slack/Discord/email. `P1`
|
| 136 |
+
- [x] `GX04` Add commute/travel briefing integration (traffic/transit APIs). `P2`
|
| 137 |
+
- [x] `GX05` Add shopping/task orchestration across Todoist + Home Assistant + notifications. `P1`
|
| 138 |
+
- [x] `GX06` Add contextual web research workflow with citation capture and policy gating. `P2`
|
| 139 |
|
| 140 |
## 14) Packaging, Deployment, and Ecosystem Fit (6 items)
|
| 141 |
|
| 142 |
+
- [x] `DX01` Add one-command local install/bootstrap script for clean hosts. `P1`
|
| 143 |
+
- [x] `DX02` Add containerized deployment profile for always-on home-server runtime. `P1`
|
| 144 |
- [x] `DX03` Add backup/restore CLI for memory, audit, runtime state, and operator settings. `P1`
|
| 145 |
+
- [x] `DX04` Add staged release channels (`dev`, `beta`, `stable`) with migration checks. `P2`
|
| 146 |
+
- [x] `DX05` Add Home Assistant add-on packaging path and setup guide. `P2`
|
| 147 |
+
- [x] `DX06` Add release acceptance suite focused on "Jarvis feel" scenarios before ship. `P1`
|
| 148 |
|
| 149 |
---
|
| 150 |
|
config/release-channels.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"default_channel": "dev",
|
| 3 |
+
"channels": {
|
| 4 |
+
"dev": {
|
| 5 |
+
"description": "Fast iteration channel",
|
| 6 |
+
"required_checks": [
|
| 7 |
+
{"type": "file_exists", "path": "TODO.md"},
|
| 8 |
+
{"type": "file_exists", "path": "README.md"}
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
"beta": {
|
| 12 |
+
"description": "Pre-stable validation channel",
|
| 13 |
+
"required_checks": [
|
| 14 |
+
{"type": "file_exists", "path": "scripts/release_acceptance.sh"},
|
| 15 |
+
{"type": "text_contains", "path": "README.md", "needle": "Release acceptance"}
|
| 16 |
+
]
|
| 17 |
+
},
|
| 18 |
+
"stable": {
|
| 19 |
+
"description": "Production channel",
|
| 20 |
+
"required_checks": [
|
| 21 |
+
{"type": "file_exists", "path": ".github/workflows/ci.yml"},
|
| 22 |
+
{"type": "file_exists", "path": ".github/workflows/fault-profiles.yml"},
|
| 23 |
+
{"type": "file_exists", "path": "scripts/generate_quality_report.py"}
|
| 24 |
+
]
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
}
|
deploy/home-assistant-addon/Dockerfile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ARG BUILD_FROM=ghcr.io/home-assistant/amd64-base-python:3.12
|
| 2 |
+
FROM ${BUILD_FROM}
|
| 3 |
+
|
| 4 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 5 |
+
PYTHONUNBUFFERED=1
|
| 6 |
+
|
| 7 |
+
RUN apk add --no-cache bash curl
|
| 8 |
+
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 9 |
+
ENV PATH="/root/.local/bin:${PATH}"
|
| 10 |
+
|
| 11 |
+
WORKDIR /opt/jarvis
|
| 12 |
+
COPY . /opt/jarvis
|
| 13 |
+
RUN uv sync --frozen --extra dev
|
| 14 |
+
|
| 15 |
+
CMD ["uv", "run", "python", "-m", "jarvis", "--sim", "--no-vision"]
|
deploy/home-assistant-addon/README.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Home Assistant Add-on Path
|
| 2 |
+
|
| 3 |
+
This directory contains a starter add-on packaging profile for running Jarvis on a Home Assistant host.
|
| 4 |
+
|
| 5 |
+
## Build/packaging notes
|
| 6 |
+
|
| 7 |
+
- `config.yaml`: add-on metadata and options schema.
|
| 8 |
+
- `Dockerfile`: base image plus Jarvis runtime dependencies.
|
| 9 |
+
- The runtime command starts Jarvis in simulation/no-vision mode by default for safer first boot.
|
| 10 |
+
|
| 11 |
+
## Installation workflow
|
| 12 |
+
|
| 13 |
+
1. Copy this folder into a Home Assistant add-on repository.
|
| 14 |
+
2. Provide API keys in add-on options or environment overrides.
|
| 15 |
+
3. Build and install via Home Assistant Supervisor.
|
| 16 |
+
4. Validate with dry-run tooling before enabling mutating integrations.
|
deploy/home-assistant-addon/config.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Jarvis Assistant
|
| 2 |
+
version: "0.1.0"
|
| 3 |
+
slug: jarvis_assistant
|
| 4 |
+
description: Jarvis AI assistant runtime for Home Assistant hosts
|
| 5 |
+
startup: services
|
| 6 |
+
boot: auto
|
| 7 |
+
arch:
|
| 8 |
+
- amd64
|
| 9 |
+
- aarch64
|
| 10 |
+
- armv7
|
| 11 |
+
map:
|
| 12 |
+
- config:rw
|
| 13 |
+
- addon_config:rw
|
| 14 |
+
options:
|
| 15 |
+
wake_mode: wake_word
|
| 16 |
+
no_vision: true
|
| 17 |
+
schema:
|
| 18 |
+
wake_mode: str
|
| 19 |
+
no_vision: bool
|
| 20 |
+
anthropic_api_key: password
|
| 21 |
+
elevenlabs_api_key: password
|
| 22 |
+
ingress: false
|
| 23 |
+
homeassistant_api: true
|
| 24 |
+
hassio_api: true
|
| 25 |
+
host_network: false
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
jarvis:
|
| 3 |
+
build:
|
| 4 |
+
context: .
|
| 5 |
+
dockerfile: Dockerfile
|
| 6 |
+
container_name: jarvis
|
| 7 |
+
env_file:
|
| 8 |
+
- .env
|
| 9 |
+
volumes:
|
| 10 |
+
- jarvis-data:/root/.jarvis
|
| 11 |
+
restart: unless-stopped
|
| 12 |
+
command: ["uv", "run", "python", "-m", "jarvis", "--sim", "--no-vision"]
|
| 13 |
+
|
| 14 |
+
volumes:
|
| 15 |
+
jarvis-data:
|
docs/evals/assistant-contract.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cases": [
|
| 3 |
+
{
|
| 4 |
+
"id": "status_contract",
|
| 5 |
+
"expected_contains": ["schema_version", "tool_policy", "health"],
|
| 6 |
+
"expected_tools": ["system_status"],
|
| 7 |
+
"actual_response": "schema_version tool_policy health",
|
| 8 |
+
"actual_tools": ["system_status"]
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"id": "identity_guardrail",
|
| 12 |
+
"expected_contains": ["approval"],
|
| 13 |
+
"expected_tools": ["identity_trust"],
|
| 14 |
+
"actual_response": "requires approval",
|
| 15 |
+
"actual_tools": ["identity_trust"]
|
| 16 |
+
}
|
| 17 |
+
]
|
| 18 |
+
}
|
scripts/bootstrap.sh
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
usage() {
|
| 5 |
+
cat <<'USAGE'
|
| 6 |
+
Usage: ./scripts/bootstrap.sh [--quick]
|
| 7 |
+
|
| 8 |
+
Bootstraps Jarvis on a clean host:
|
| 9 |
+
- ensures uv is installed
|
| 10 |
+
- syncs dependencies
|
| 11 |
+
- creates .env from .env.example if missing
|
| 12 |
+
- runs baseline validation (unless --quick)
|
| 13 |
+
USAGE
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
quick=false
|
| 17 |
+
if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
|
| 18 |
+
usage
|
| 19 |
+
exit 0
|
| 20 |
+
fi
|
| 21 |
+
if [[ "${1:-}" == "--quick" ]]; then
|
| 22 |
+
quick=true
|
| 23 |
+
fi
|
| 24 |
+
|
| 25 |
+
if ! command -v uv >/dev/null 2>&1; then
|
| 26 |
+
echo "Installing uv..."
|
| 27 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 28 |
+
export PATH="$HOME/.local/bin:$PATH"
|
| 29 |
+
fi
|
| 30 |
+
|
| 31 |
+
if [[ ! -f .env && -f .env.example ]]; then
|
| 32 |
+
cp .env.example .env
|
| 33 |
+
echo "Created .env from .env.example"
|
| 34 |
+
fi
|
| 35 |
+
|
| 36 |
+
uv sync --extra dev
|
| 37 |
+
|
| 38 |
+
if [[ "$quick" == "false" ]]; then
|
| 39 |
+
uv run ruff check src tests
|
| 40 |
+
uv run pytest -q tests/test_config.py tests/test_tools_services.py
|
| 41 |
+
fi
|
| 42 |
+
|
| 43 |
+
echo "Bootstrap complete."
|
scripts/check_release_channel.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _run_check(base: Path, check: dict[str, Any]) -> dict[str, Any]:
|
| 11 |
+
kind = str(check.get("type", "")).strip().lower()
|
| 12 |
+
path = str(check.get("path", "")).strip()
|
| 13 |
+
target = (base / path).resolve() if path else base
|
| 14 |
+
|
| 15 |
+
if kind == "file_exists":
|
| 16 |
+
ok = target.exists()
|
| 17 |
+
return {"type": kind, "path": path, "ok": ok, "detail": "exists" if ok else "missing"}
|
| 18 |
+
|
| 19 |
+
if kind == "text_contains":
|
| 20 |
+
needle = str(check.get("needle", "")).strip()
|
| 21 |
+
if not target.exists() or not target.is_file():
|
| 22 |
+
return {"type": kind, "path": path, "ok": False, "detail": "missing_file"}
|
| 23 |
+
text = target.read_text(encoding="utf-8", errors="replace")
|
| 24 |
+
ok = needle in text
|
| 25 |
+
return {
|
| 26 |
+
"type": kind,
|
| 27 |
+
"path": path,
|
| 28 |
+
"needle": needle,
|
| 29 |
+
"ok": ok,
|
| 30 |
+
"detail": "found" if ok else "missing_needle",
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
return {"type": kind or "unknown", "path": path, "ok": False, "detail": "unsupported_check_type"}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def main() -> int:
    """CLI entry point: run the selected channel's required checks, print JSON, return exit status."""
    parser = argparse.ArgumentParser(description="Validate staged release-channel checks.")
    parser.add_argument("--channel", required=True, choices=["dev", "beta", "stable"])
    parser.add_argument("--config", default="config/release-channels.json")
    parser.add_argument("--workspace", default=".")
    args = parser.parse_args()

    root = Path(args.workspace).resolve()
    raw_config = json.loads((root / args.config).resolve().read_text(encoding="utf-8"))

    # Defensive navigation: tolerate malformed config shapes by falling back to empties.
    channel_map = raw_config.get("channels", {}) if isinstance(raw_config, dict) else {}
    selected = channel_map.get(args.channel, {}) if isinstance(channel_map, dict) else {}
    required = selected.get("required_checks", []) if isinstance(selected, dict) else []

    outcomes = [_run_check(root, item) for item in required if isinstance(item, dict)]
    failures = [item for item in outcomes if not bool(item.get("ok"))]

    report = {
        "channel": args.channel,
        "passed": not failures,
        "check_count": len(outcomes),
        "failed_count": len(failures),
        "results": outcomes,
    }
    print(json.dumps(report, indent=2))
    # Non-zero exit signals CI that at least one required check failed.
    return 1 if failures else 0


if __name__ == "__main__":
    raise SystemExit(main())
|
scripts/generate_quality_report.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
import time
|
| 7 |
+
from collections import Counter
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Parse a JSONL file, keeping only well-formed JSON objects.

    Missing files, blank lines, malformed JSON, and non-object payloads are all
    silently skipped so a partially corrupt audit log still yields a report.
    """
    if not path.exists():
        return []

    records: list[dict[str, Any]] = []
    raw = path.read_text(encoding="utf-8", errors="replace")
    for raw_line in raw.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        try:
            parsed = json.loads(stripped)
        except Exception:
            # Best-effort reader: a bad line must not abort the whole report.
            continue
        if isinstance(parsed, dict):
            records.append(parsed)
    return records
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _build_report(entries: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate audit-log entries into the weekly quality-report payload."""

    def outcome_of(row: dict[str, Any]) -> str:
        # Prefer the explicit decision outcome; fall back to the legacy "result" key.
        return str(row.get("decision_outcome", row.get("result", "unknown")))

    action_counts = Counter(str(row.get("action", "unknown")) for row in entries)
    outcome_counts = Counter(outcome_of(row) for row in entries)

    failure_states = {"failed", "denied", "blocked", "error"}
    failure_rows = [row for row in entries if outcome_of(row).lower() in failure_states]
    reason_counts = Counter(
        str(row.get("decision_reason", row.get("reason", "unknown"))) for row in failure_rows
    )

    regressions = (
        ["High failure counts should be triaged from top_failure_reasons."]
        if failure_rows
        else []
    )
    return {
        "generated_at": time.time(),
        "total_events": len(entries),
        "event_count_by_action": dict(action_counts.most_common(20)),
        "event_count_by_outcome": dict(outcome_counts),
        "failure_count": len(failure_rows),
        "top_failure_reasons": dict(reason_counts.most_common(10)),
        "wins": [
            "Maintained audit coverage for operational actions.",
            "Captured decision outcomes for trust/policy review.",
        ],
        "regressions": regressions,
    }
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _markdown(report: dict[str, Any]) -> str:
    """Render the report payload as a human-readable Markdown summary."""
    timestamp = datetime.fromtimestamp(float(report.get("generated_at", 0.0))).isoformat()

    out: list[str] = []
    out.append("# Jarvis Weekly Quality Report")
    out.append("")
    out.append(f"Generated: {timestamp}")
    out.append("")
    out.append(f"- Total events: {int(report.get('total_events', 0))}")
    out.append(f"- Failure events: {int(report.get('failure_count', 0))}")
    out.append("")
    out.append("## Outcome Distribution")
    outcome_counts = report.get("event_count_by_outcome") or {}
    out.extend(f"- {name}: {count}" for name, count in sorted(outcome_counts.items()))
    out.append("")
    out.append("## Top Failure Reasons")
    failure_reasons = report.get("top_failure_reasons") or {}
    if isinstance(failure_reasons, dict) and failure_reasons:
        out.extend(f"- {name}: {count}" for name, count in failure_reasons.items())
    else:
        out.append("- none")
    # Trailing newline so the file ends cleanly when written to disk.
    return "\n".join(out) + "\n"
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def main() -> int:
    """CLI entry point: read the audit log, build the report, and write artifacts.

    Writes a timestamped JSON report (and, with --markdown, a Markdown twin)
    into --output-dir, then prints the report plus artifact paths as JSON.
    Always returns 0; an unreadable config/log surfaces as an exception.
    """
    from datetime import timezone  # local import: keeps module-level imports untouched

    parser = argparse.ArgumentParser(description="Generate weekly Jarvis quality report artifacts.")
    parser.add_argument("--audit-log", default=str(Path.home() / ".jarvis" / "audit.jsonl"))
    parser.add_argument("--output-dir", default=str(Path(".artifacts") / "quality"))
    parser.add_argument("--markdown", action="store_true")
    args = parser.parse_args()

    audit_log = Path(args.audit_log).expanduser()
    output_dir = Path(args.output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    entries = _read_jsonl(audit_log)
    report = _build_report(entries)

    # Fix: datetime.utcnow() is naive and deprecated (Python 3.12+);
    # an aware UTC timestamp produces the same "%Y%m%d-%H%M%S" string.
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    json_path = output_dir / f"weekly-quality-{stamp}.json"
    json_path.write_text(json.dumps(report, indent=2), encoding="utf-8")

    artifact: dict[str, Any] = {
        "json": str(json_path),
        "markdown": "",
    }
    if args.markdown:
        md_path = output_dir / f"weekly-quality-{stamp}.md"
        md_path.write_text(_markdown(report), encoding="utf-8")
        artifact["markdown"] = str(md_path)

    print(json.dumps({"report": report, "artifacts": artifact}, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
scripts/release_acceptance.sh
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Release acceptance gate: lint first, then run the pytest profile given as $1.
# Usage: release_acceptance.sh [fast|full]   (defaults to "full")
set -euo pipefail

selected_profile="${1:-full}"

# Full profile: the complete core acceptance suite.
core_suite() {
  uv run pytest -q \
    tests/test_brain.py \
    tests/test_presence.py \
    tests/test_voice_attention.py \
    tests/test_turn_taking.py \
    tests/test_tools_services.py -k "system_status or scorecard or identity"
}

# Fast profile: trimmed smoke subset for quick iteration.
fast_suite() {
  uv run pytest -q \
    tests/test_brain.py -k "interaction_contract or response_mode or confidence" \
    tests/test_presence.py -k "choreography or muted" \
    tests/test_tools_services.py -k "system_status_contract_reports_expected_fields"
}

# Lint always runs, regardless of profile.
uv run ruff check src tests

if [ "$selected_profile" = "fast" ]; then
  fast_suite
elif [ "$selected_profile" = "full" ]; then
  core_suite
else
  echo "Unknown profile: $selected_profile (expected: fast|full)" >&2
  exit 2
fi

echo "Release acceptance suite passed ($selected_profile)."
|
scripts/run_eval_dataset.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _as_list(value: Any) -> list[str]:
    """Coerce *value* into a list of non-empty strings.

    Lists are filtered element-wise (blank-only entries dropped); a non-blank
    string becomes a one-element list; anything else yields an empty list.
    """
    if isinstance(value, list):
        return [str(entry) for entry in value if str(entry).strip()]
    if isinstance(value, str):
        stripped = value.strip()
        if stripped:
            return [stripped]
    return []


def _evaluate_case(case: dict[str, Any]) -> dict[str, Any]:
    """Score one eval case: required response substrings and required tool calls."""
    response_text = str(case.get("actual_response", ""))
    tools_used = {str(item) for item in _as_list(case.get("actual_tools"))}
    required_snippets = _as_list(case.get("expected_contains"))
    required_tools = {str(item) for item in _as_list(case.get("expected_tools"))}

    absent_snippets = [snippet for snippet in required_snippets if snippet not in response_text]
    absent_tools = sorted(required_tools - tools_used)

    return {
        "id": str(case.get("id", "case")),
        "passed": not absent_snippets and not absent_tools,
        "missing_text": absent_snippets,
        "missing_tools": absent_tools,
    }
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def main() -> int:
    """CLI entry point: evaluate a dataset file and print/persist the summary.

    Exit status is 0 when the run is accepted, 1 otherwise. Acceptance is
    "zero failures" under --strict, else "at least as many passes as failures".
    """
    parser = argparse.ArgumentParser(description="Run deterministic evaluation dataset checks.")
    parser.add_argument("dataset", help="Path to dataset JSON")
    parser.add_argument("--output", default="")
    parser.add_argument("--strict", action="store_true")
    args = parser.parse_args()

    dataset_path = Path(args.dataset)
    raw = json.loads(dataset_path.read_text(encoding="utf-8"))
    case_rows = raw.get("cases", []) if isinstance(raw, dict) else []
    if not isinstance(case_rows, list):
        raise SystemExit("Dataset format error: expected top-level object with 'cases' list.")

    results = [_evaluate_case(row) for row in case_rows if isinstance(row, dict)]
    pass_count = sum(1 for row in results if row["passed"])
    fail_count = len(results) - pass_count
    accepted = (fail_count == 0) if args.strict else (pass_count >= fail_count)
    summary = {
        "dataset": str(dataset_path),
        "strict": bool(args.strict),
        "case_count": len(results),
        "passed": pass_count,
        "failed": fail_count,
        "pass_rate": (pass_count / len(results)) if results else 0.0,
        "accepted": accepted,
        "results": results,
    }

    rendered = json.dumps(summary, indent=2)
    print(rendered)
    if args.output:
        destination = Path(args.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(rendered, encoding="utf-8")

    return 0 if summary["accepted"] else 1


if __name__ == "__main__":
    raise SystemExit(main())
|
src/jarvis/brain.py
CHANGED
|
@@ -269,6 +269,15 @@ class Brain:
|
|
| 269 |
"mcp__jarvis-services__skills_enable",
|
| 270 |
"mcp__jarvis-services__skills_disable",
|
| 271 |
"mcp__jarvis-services__skills_version",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
],
|
| 273 |
self._config.tool_allowlist,
|
| 274 |
self._config.tool_denylist,
|
|
|
|
| 269 |
"mcp__jarvis-services__skills_enable",
|
| 270 |
"mcp__jarvis-services__skills_disable",
|
| 271 |
"mcp__jarvis-services__skills_version",
|
| 272 |
+
"mcp__jarvis-services__proactive_assistant",
|
| 273 |
+
"mcp__jarvis-services__memory_governance",
|
| 274 |
+
"mcp__jarvis-services__identity_trust",
|
| 275 |
+
"mcp__jarvis-services__home_orchestrator",
|
| 276 |
+
"mcp__jarvis-services__skills_governance",
|
| 277 |
+
"mcp__jarvis-services__planner_engine",
|
| 278 |
+
"mcp__jarvis-services__quality_evaluator",
|
| 279 |
+
"mcp__jarvis-services__embodiment_presence",
|
| 280 |
+
"mcp__jarvis-services__integration_hub",
|
| 281 |
],
|
| 282 |
self._config.tool_allowlist,
|
| 283 |
self._config.tool_denylist,
|
src/jarvis/tools/services.py
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tests/test_tools_services.py
CHANGED
|
@@ -3149,7 +3149,7 @@ class TestServicesTools:
|
|
| 3149 |
|
| 3150 |
result = await services.system_status({})
|
| 3151 |
payload = json.loads(result["content"][0]["text"])
|
| 3152 |
-
assert payload["schema_version"] == "
|
| 3153 |
assert "local_time" in payload
|
| 3154 |
assert "tool_policy" in payload
|
| 3155 |
assert isinstance(payload["tool_policy"]["home_require_confirm_execute"], bool)
|
|
@@ -3249,6 +3249,10 @@ class TestServicesTools:
|
|
| 3249 |
assert "failed_count" in payload["dead_letter_queue"]
|
| 3250 |
assert "replayed_count" in payload["dead_letter_queue"]
|
| 3251 |
assert isinstance(payload["dead_letter_queue"]["recent"], list)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3252 |
assert payload["health"]["health_level"] in {"ok", "degraded", "error"}
|
| 3253 |
|
| 3254 |
@pytest.mark.asyncio
|
|
@@ -3257,7 +3261,7 @@ class TestServicesTools:
|
|
| 3257 |
|
| 3258 |
result = await services.system_status_contract({})
|
| 3259 |
payload = json.loads(result["content"][0]["text"])
|
| 3260 |
-
assert payload["schema_version"] == "
|
| 3261 |
assert "top_level_required" in payload
|
| 3262 |
assert "tool_policy" in payload["top_level_required"]
|
| 3263 |
assert "identity" in payload["top_level_required"]
|
|
@@ -3336,6 +3340,9 @@ class TestServicesTools:
|
|
| 3336 |
assert "dead_letter_queue" in payload["top_level_required"]
|
| 3337 |
assert "dead_letter_queue_required" in payload
|
| 3338 |
assert "pending_count" in payload["dead_letter_queue_required"]
|
|
|
|
|
|
|
|
|
|
| 3339 |
|
| 3340 |
@pytest.mark.asyncio
|
| 3341 |
async def test_jarvis_scorecard_reports_unified_dimensions(self, tmp_path):
|
|
@@ -3753,6 +3760,11 @@ class TestServicesTools:
|
|
| 3753 |
assert schemas["webhook_inbound_list"]["properties"]["limit"]["type"] == "integer"
|
| 3754 |
assert schemas["tool_summary"]["properties"]["limit"]["type"] == "integer"
|
| 3755 |
assert schemas["tool_summary_text"]["properties"]["limit"]["type"] == "integer"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3756 |
|
| 3757 |
def test_service_schema_identity_fields_present_for_mutating_tools(self):
|
| 3758 |
from jarvis.tools import services
|
|
@@ -3958,3 +3970,122 @@ class TestServicesTools:
|
|
| 3958 |
assert payload["nested"]["api_key"] == "***REDACTED***"
|
| 3959 |
assert payload["nested"]["safe"].endswith("...<truncated>")
|
| 3960 |
assert payload["items"][-1].startswith("<truncated_items:")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3149 |
|
| 3150 |
result = await services.system_status({})
|
| 3151 |
payload = json.loads(result["content"][0]["text"])
|
| 3152 |
+
assert payload["schema_version"] == "2.0"
|
| 3153 |
assert "local_time" in payload
|
| 3154 |
assert "tool_policy" in payload
|
| 3155 |
assert isinstance(payload["tool_policy"]["home_require_confirm_execute"], bool)
|
|
|
|
| 3249 |
assert "failed_count" in payload["dead_letter_queue"]
|
| 3250 |
assert "replayed_count" in payload["dead_letter_queue"]
|
| 3251 |
assert isinstance(payload["dead_letter_queue"]["recent"], list)
|
| 3252 |
+
assert "expansion" in payload
|
| 3253 |
+
assert "proactive" in payload["expansion"]
|
| 3254 |
+
assert "planner_engine" in payload["expansion"]
|
| 3255 |
+
assert "integration_hub" in payload["expansion"]
|
| 3256 |
assert payload["health"]["health_level"] in {"ok", "degraded", "error"}
|
| 3257 |
|
| 3258 |
@pytest.mark.asyncio
|
|
|
|
| 3261 |
|
| 3262 |
result = await services.system_status_contract({})
|
| 3263 |
payload = json.loads(result["content"][0]["text"])
|
| 3264 |
+
assert payload["schema_version"] == "2.0"
|
| 3265 |
assert "top_level_required" in payload
|
| 3266 |
assert "tool_policy" in payload["top_level_required"]
|
| 3267 |
assert "identity" in payload["top_level_required"]
|
|
|
|
| 3340 |
assert "dead_letter_queue" in payload["top_level_required"]
|
| 3341 |
assert "dead_letter_queue_required" in payload
|
| 3342 |
assert "pending_count" in payload["dead_letter_queue_required"]
|
| 3343 |
+
assert "expansion" in payload["top_level_required"]
|
| 3344 |
+
assert "expansion_required" in payload
|
| 3345 |
+
assert "proactive" in payload["expansion_required"]
|
| 3346 |
|
| 3347 |
@pytest.mark.asyncio
|
| 3348 |
async def test_jarvis_scorecard_reports_unified_dimensions(self, tmp_path):
|
|
|
|
| 3760 |
assert schemas["webhook_inbound_list"]["properties"]["limit"]["type"] == "integer"
|
| 3761 |
assert schemas["tool_summary"]["properties"]["limit"]["type"] == "integer"
|
| 3762 |
assert schemas["tool_summary_text"]["properties"]["limit"]["type"] == "integer"
|
| 3763 |
+
assert schemas["proactive_assistant"]["properties"]["snooze_minutes"]["type"] == "integer"
|
| 3764 |
+
assert schemas["memory_governance"]["properties"]["limit"]["type"] == "integer"
|
| 3765 |
+
assert schemas["skills_governance"]["properties"]["rate_per_min"]["type"] == "integer"
|
| 3766 |
+
assert schemas["planner_engine"]["properties"]["limit"]["type"] == "integer"
|
| 3767 |
+
assert schemas["quality_evaluator"]["properties"]["limit"]["type"] == "integer"
|
| 3768 |
|
| 3769 |
def test_service_schema_identity_fields_present_for_mutating_tools(self):
|
| 3770 |
from jarvis.tools import services
|
|
|
|
| 3970 |
assert payload["nested"]["api_key"] == "***REDACTED***"
|
| 3971 |
assert payload["nested"]["safe"].endswith("...<truncated>")
|
| 3972 |
assert payload["items"][-1].startswith("<truncated_items:")
|
| 3973 |
+
|
| 3974 |
+
@pytest.mark.asyncio
|
| 3975 |
+
async def test_expansion_tools_basic_actions(self, tmp_path):
|
| 3976 |
+
from jarvis.tools import services
|
| 3977 |
+
|
| 3978 |
+
proactive = await services.proactive_assistant({"action": "briefing", "mode": "morning"})
|
| 3979 |
+
proactive_payload = json.loads(proactive["content"][0]["text"])
|
| 3980 |
+
assert proactive_payload["action"] == "briefing"
|
| 3981 |
+
|
| 3982 |
+
memory_partition = await services.memory_governance(
|
| 3983 |
+
{"action": "partition", "user": "owner", "shared_scopes": ["preferences"]}
|
| 3984 |
+
)
|
| 3985 |
+
memory_payload = json.loads(memory_partition["content"][0]["text"])
|
| 3986 |
+
assert memory_payload["action"] == "partition"
|
| 3987 |
+
assert memory_payload["overlay"]["user"] == "owner"
|
| 3988 |
+
|
| 3989 |
+
trust_policy = await services.identity_trust(
|
| 3990 |
+
{"action": "policy_set", "domain": "locks", "required_profile": "trusted", "requires_step_up": True}
|
| 3991 |
+
)
|
| 3992 |
+
trust_payload = json.loads(trust_policy["content"][0]["text"])
|
| 3993 |
+
assert trust_payload["policy"]["required_profile"] == "trusted"
|
| 3994 |
+
|
| 3995 |
+
home_plan = await services.home_orchestrator({"action": "plan", "request_text": "activate movie mode"})
|
| 3996 |
+
home_payload = json.loads(home_plan["content"][0]["text"])
|
| 3997 |
+
assert home_payload["plan_label"] == "movie_mode"
|
| 3998 |
+
assert home_payload["step_count"] >= 1
|
| 3999 |
+
|
| 4000 |
+
skills_negotiation = await services.skills_governance(
|
| 4001 |
+
{"action": "negotiate", "requested_capabilities": ["forecast"]}
|
| 4002 |
+
)
|
| 4003 |
+
skills_payload = json.loads(skills_negotiation["content"][0]["text"])
|
| 4004 |
+
assert skills_payload["action"] == "negotiate"
|
| 4005 |
+
|
| 4006 |
+
planner = await services.planner_engine({"action": "plan", "goal": "prepare evening routine"})
|
| 4007 |
+
planner_payload = json.loads(planner["content"][0]["text"])
|
| 4008 |
+
assert planner_payload["action"] == "plan"
|
| 4009 |
+
assert "planner" in planner_payload
|
| 4010 |
+
|
| 4011 |
+
report_path = tmp_path / "quality.json"
|
| 4012 |
+
quality = await services.quality_evaluator({"action": "weekly_report", "report_path": str(report_path)})
|
| 4013 |
+
quality_payload = json.loads(quality["content"][0]["text"])
|
| 4014 |
+
assert quality_payload["action"] == "weekly_report"
|
| 4015 |
+
assert Path(quality_payload["artifact_path"]).exists()
|
| 4016 |
+
|
| 4017 |
+
embodiment = await services.embodiment_presence(
|
| 4018 |
+
{"action": "privacy_posture", "state": "muted", "reason": "sensitive_operation"}
|
| 4019 |
+
)
|
| 4020 |
+
embodiment_payload = json.loads(embodiment["content"][0]["text"])
|
| 4021 |
+
assert embodiment_payload["privacy_posture"]["state"] == "muted"
|
| 4022 |
+
|
| 4023 |
+
integration = await services.integration_hub(
|
| 4024 |
+
{"action": "notes_capture", "backend": "local_markdown", "title": "Test", "content": "hello"}
|
| 4025 |
+
)
|
| 4026 |
+
integration_payload = json.loads(integration["content"][0]["text"])
|
| 4027 |
+
assert integration_payload["stored"] is True
|
| 4028 |
+
assert Path(integration_payload["path"]).exists()
|
| 4029 |
+
|
| 4030 |
+
    @pytest.mark.asyncio
    async def test_identity_guest_session_capability_enforced(self):
        """A guest token scoped to system_status must not authorize smart-home control."""
        from jarvis.tools import services

        # Start a guest session limited to the "system_status" capability and
        # capture its session token.
        guest = await services.identity_trust(
            {"action": "guest_start", "guest_id": "visitor", "capabilities": ["system_status"]}
        )
        guest_payload = json.loads(guest["content"][0]["text"])
        token = guest_payload["token"]

        # Even a dry-run light command must be refused under that guest token.
        denied = await services.smart_home(
            {
                "domain": "light",
                "action": "turn_on",
                "entity_id": "light.living_room",
                "dry_run": True,
                "guest_session_token": token,
            }
        )
        assert "guest session does not allow" in denied["content"][0]["text"].lower()
|
| 4051 |
+
@pytest.mark.asyncio
|
| 4052 |
+
async def test_home_orchestrator_area_policy_surfaces_partial_failures(self):
|
| 4053 |
+
from jarvis.tools import services
|
| 4054 |
+
|
| 4055 |
+
await services.home_orchestrator(
|
| 4056 |
+
{
|
| 4057 |
+
"action": "area_policy_set",
|
| 4058 |
+
"area": "bedroom",
|
| 4059 |
+
"policy": {
|
| 4060 |
+
"quiet_hours_start": "22:00",
|
| 4061 |
+
"quiet_hours_end": "07:00",
|
| 4062 |
+
"blocked_actions": ["media_player:media_play"],
|
| 4063 |
+
},
|
| 4064 |
+
}
|
| 4065 |
+
)
|
| 4066 |
+
result = await services.home_orchestrator(
|
| 4067 |
+
{
|
| 4068 |
+
"action": "execute",
|
| 4069 |
+
"actions": [
|
| 4070 |
+
{"domain": "media_player", "action": "media_play", "entity_id": "media_player.bedroom_speaker"},
|
| 4071 |
+
{"domain": "light", "action": "turn_on", "entity_id": "light.kitchen"},
|
| 4072 |
+
],
|
| 4073 |
+
}
|
| 4074 |
+
)
|
| 4075 |
+
payload = json.loads(result["content"][0]["text"])
|
| 4076 |
+
assert payload["partial_failure"] is True
|
| 4077 |
+
assert payload["failed_count"] == 1
|
| 4078 |
+
|
| 4079 |
+
def test_release_scripts_and_workflows_exist(self):
|
| 4080 |
+
project_root = Path(__file__).resolve().parents[1]
|
| 4081 |
+
assert (project_root / "scripts" / "bootstrap.sh").exists()
|
| 4082 |
+
assert (project_root / "scripts" / "generate_quality_report.py").exists()
|
| 4083 |
+
assert (project_root / "scripts" / "run_eval_dataset.py").exists()
|
| 4084 |
+
assert (project_root / "scripts" / "release_acceptance.sh").exists()
|
| 4085 |
+
assert (project_root / "scripts" / "check_release_channel.py").exists()
|
| 4086 |
+
assert (project_root / ".github" / "workflows" / "assistant-quality-report.yml").exists()
|
| 4087 |
+
assert (project_root / ".github" / "workflows" / "release-acceptance.yml").exists()
|
| 4088 |
+
makefile_text = (project_root / "Makefile").read_text()
|
| 4089 |
+
assert "quality-report" in makefile_text
|
| 4090 |
+
assert "eval-dataset" in makefile_text
|
| 4091 |
+
assert "release-acceptance" in makefile_text
|