Jonathan Haas commited on
Commit ·
944b4e2
1
Parent(s): 4f4578a
Complete remaining Jarvis roadmap: proactive, trust, planner, integrations, and deployment tracks
Browse files- .github/workflows/assistant-quality-report.yml +46 -0
- .github/workflows/release-acceptance.yml +41 -0
- .gitignore +3 -0
- Dockerfile +21 -0
- Makefile +17 -1
- README.md +26 -0
- TODO.md +44 -44
- config/release-channels.json +27 -0
- deploy/home-assistant-addon/Dockerfile +15 -0
- deploy/home-assistant-addon/README.md +16 -0
- deploy/home-assistant-addon/config.yaml +25 -0
- docker-compose.yml +15 -0
- docs/evals/assistant-contract.json +18 -0
- scripts/bootstrap.sh +43 -0
- scripts/check_release_channel.py +66 -0
- scripts/generate_quality_report.py +112 -0
- scripts/release_acceptance.sh +37 -0
- scripts/run_eval_dataset.py +75 -0
- src/jarvis/brain.py +9 -0
- src/jarvis/tools/services.py +0 -0
- tests/test_tools_services.py +133 -2
.github/workflows/assistant-quality-report.yml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Assistant Quality Report
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
schedule:
|
| 5 |
+
- cron: "0 10 * * 1"
|
| 6 |
+
workflow_dispatch:
|
| 7 |
+
|
| 8 |
+
concurrency:
|
| 9 |
+
group: assistant-quality-report-${{ github.workflow }}-${{ github.ref }}
|
| 10 |
+
cancel-in-progress: false
|
| 11 |
+
|
| 12 |
+
jobs:
|
| 13 |
+
quality-report:
|
| 14 |
+
runs-on: ubuntu-latest
|
| 15 |
+
timeout-minutes: 30
|
| 16 |
+
steps:
|
| 17 |
+
- name: Checkout
|
| 18 |
+
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
| 19 |
+
|
| 20 |
+
- name: Set up Python
|
| 21 |
+
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
|
| 22 |
+
with:
|
| 23 |
+
python-version: "3.12"
|
| 24 |
+
|
| 25 |
+
- name: Install uv
|
| 26 |
+
uses: astral-sh/setup-uv@8d55fbecc275b1c35dbe060458839f8d30439ccf # v3
|
| 27 |
+
|
| 28 |
+
- name: Sync dependencies
|
| 29 |
+
run: uv sync --extra dev
|
| 30 |
+
|
| 31 |
+
- name: Generate weekly report
|
| 32 |
+
run: |
|
| 33 |
+
mkdir -p .artifacts/quality
|
| 34 |
+
./scripts/generate_quality_report.py --output-dir .artifacts/quality --markdown > .artifacts/quality/summary.json
|
| 35 |
+
|
| 36 |
+
- name: Run deterministic eval dataset
|
| 37 |
+
run: |
|
| 38 |
+
./scripts/run_eval_dataset.py docs/evals/assistant-contract.json --output .artifacts/quality/eval.json --strict || true
|
| 39 |
+
|
| 40 |
+
- name: Upload quality artifacts
|
| 41 |
+
if: always()
|
| 42 |
+
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
|
| 43 |
+
with:
|
| 44 |
+
name: assistant-quality-${{ github.run_id }}
|
| 45 |
+
path: .artifacts/quality
|
| 46 |
+
if-no-files-found: warn
|
.github/workflows/release-acceptance.yml
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Release Acceptance
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
workflow_dispatch:
|
| 5 |
+
inputs:
|
| 6 |
+
profile:
|
| 7 |
+
description: "Acceptance profile"
|
| 8 |
+
type: choice
|
| 9 |
+
default: fast
|
| 10 |
+
options:
|
| 11 |
+
- fast
|
| 12 |
+
- full
|
| 13 |
+
|
| 14 |
+
concurrency:
|
| 15 |
+
group: release-acceptance-${{ github.workflow }}-${{ github.ref }}
|
| 16 |
+
cancel-in-progress: true
|
| 17 |
+
|
| 18 |
+
jobs:
|
| 19 |
+
acceptance:
|
| 20 |
+
runs-on: ubuntu-latest
|
| 21 |
+
timeout-minutes: 40
|
| 22 |
+
steps:
|
| 23 |
+
- name: Checkout
|
| 24 |
+
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
| 25 |
+
|
| 26 |
+
- name: Set up Python
|
| 27 |
+
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
|
| 28 |
+
with:
|
| 29 |
+
python-version: "3.12"
|
| 30 |
+
|
| 31 |
+
- name: Install uv
|
| 32 |
+
uses: astral-sh/setup-uv@8d55fbecc275b1c35dbe060458839f8d30439ccf # v3
|
| 33 |
+
|
| 34 |
+
- name: Sync dependencies
|
| 35 |
+
run: uv sync --extra dev
|
| 36 |
+
|
| 37 |
+
- name: Validate release channel (stable)
|
| 38 |
+
run: ./scripts/check_release_channel.py --channel stable
|
| 39 |
+
|
| 40 |
+
- name: Run release acceptance suite
|
| 41 |
+
run: ./scripts/release_acceptance.sh "${{ github.event.inputs.profile }}"
|
.gitignore
CHANGED
|
@@ -11,3 +11,6 @@ __pycache__/
|
|
| 11 |
|
| 12 |
# Third-party clones used for reference only
|
| 13 |
vendor/
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Third-party clones used for reference only
|
| 13 |
vendor/
|
| 14 |
+
|
| 15 |
+
# Local run artifacts
|
| 16 |
+
.artifacts/
|
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 4 |
+
PYTHONUNBUFFERED=1 \
|
| 5 |
+
PIP_DISABLE_PIP_VERSION_CHECK=1 \
|
| 6 |
+
UV_LINK_MODE=copy
|
| 7 |
+
|
| 8 |
+
RUN apt-get update \
|
| 9 |
+
&& apt-get install -y --no-install-recommends curl ca-certificates \
|
| 10 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
+
|
| 12 |
+
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 13 |
+
ENV PATH="/root/.local/bin:${PATH}"
|
| 14 |
+
|
| 15 |
+
WORKDIR /app
|
| 16 |
+
COPY pyproject.toml uv.lock ./
|
| 17 |
+
RUN uv sync --frozen --extra dev
|
| 18 |
+
|
| 19 |
+
COPY . .
|
| 20 |
+
|
| 21 |
+
CMD ["uv", "run", "python", "-m", "jarvis", "--sim", "--no-vision"]
|
Makefile
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
.PHONY: check test-fast test-faults test-fault-profiles test-soak security-gate
|
|
|
|
| 2 |
|
| 3 |
check:
|
| 4 |
uv run ruff check src tests
|
|
@@ -18,3 +19,18 @@ test-soak:
|
|
| 18 |
|
| 19 |
security-gate:
|
| 20 |
./scripts/security_gate.sh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.PHONY: check test-fast test-faults test-fault-profiles test-soak security-gate \
|
| 2 |
+
bootstrap quality-report eval-dataset release-channel-check release-acceptance
|
| 3 |
|
| 4 |
check:
|
| 5 |
uv run ruff check src tests
|
|
|
|
| 19 |
|
| 20 |
security-gate:
|
| 21 |
./scripts/security_gate.sh
|
| 22 |
+
|
| 23 |
+
bootstrap:
|
| 24 |
+
./scripts/bootstrap.sh
|
| 25 |
+
|
| 26 |
+
quality-report:
|
| 27 |
+
./scripts/generate_quality_report.py --output-dir .artifacts/quality --markdown
|
| 28 |
+
|
| 29 |
+
eval-dataset:
|
| 30 |
+
./scripts/run_eval_dataset.py docs/evals/assistant-contract.json --output .artifacts/quality/eval.json --strict
|
| 31 |
+
|
| 32 |
+
release-channel-check:
|
| 33 |
+
./scripts/check_release_channel.py --channel stable
|
| 34 |
+
|
| 35 |
+
release-acceptance:
|
| 36 |
+
./scripts/release_acceptance.sh fast
|
README.md
CHANGED
|
@@ -163,6 +163,7 @@ Smart home safety defaults:
|
|
| 163 |
- `system_status.integrations.*.circuit_breaker` (open/remaining/failure state per integration)
|
| 164 |
- `system_status.recovery_journal` (interrupted-action reconciliation summary)
|
| 165 |
- `system_status.dead_letter_queue` (failed outbound delivery queue with replay status)
|
|
|
|
| 166 |
- `jarvis_scorecard` (standalone scorecard payload for dashboards and alerts)
|
| 167 |
- `system_status_contract` (stable required-field contract)
|
| 168 |
- Memory retrieval now includes confidence/provenance details:
|
|
@@ -203,6 +204,13 @@ Smart home safety defaults:
|
|
| 203 |
- Skills developer guide: [`docs/operations/skills-development.md`](docs/operations/skills-development.md).
|
| 204 |
- Provenance verification: [`docs/operations/provenance-verification.md`](docs/operations/provenance-verification.md).
|
| 205 |
- Incident response: [`docs/operations/incident-response.md`](docs/operations/incident-response.md).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
- Todoist integration:
|
| 207 |
- `TODOIST_PERMISSION_PROFILE=readonly|control`
|
| 208 |
- `readonly` allows `todoist_list_tasks` and denies `todoist_add_task`
|
|
@@ -235,6 +243,24 @@ Smart home safety defaults:
|
|
| 235 |
- failed outbound webhook/channel/email/push attempts are queued for operator replay:
|
| 236 |
- `dead_letter_list` to inspect queue state
|
| 237 |
- `dead_letter_replay` to retry specific or filtered entries
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
### First-Time Operator Checklist
|
| 240 |
|
|
|
|
| 163 |
- `system_status.integrations.*.circuit_breaker` (open/remaining/failure state per integration)
|
| 164 |
- `system_status.recovery_journal` (interrupted-action reconciliation summary)
|
| 165 |
- `system_status.dead_letter_queue` (failed outbound delivery queue with replay status)
|
| 166 |
+
- `system_status.expansion` (proactive, trust, orchestration, planner, quality, embodiment, integration roadmap feature snapshot)
|
| 167 |
- `jarvis_scorecard` (standalone scorecard payload for dashboards and alerts)
|
| 168 |
- `system_status_contract` (stable required-field contract)
|
| 169 |
- Memory retrieval now includes confidence/provenance details:
|
|
|
|
| 204 |
- Skills developer guide: [`docs/operations/skills-development.md`](docs/operations/skills-development.md).
|
| 205 |
- Provenance verification: [`docs/operations/provenance-verification.md`](docs/operations/provenance-verification.md).
|
| 206 |
- Incident response: [`docs/operations/incident-response.md`](docs/operations/incident-response.md).
|
| 207 |
+
- Release acceptance: run `./scripts/release_acceptance.sh fast|full`.
|
| 208 |
+
- Release channel checks: run `./scripts/check_release_channel.py --channel dev|beta|stable`.
|
| 209 |
+
- Weekly quality artifact: run `./scripts/generate_quality_report.py --output-dir .artifacts/quality --markdown`.
|
| 210 |
+
- Deterministic eval dataset runner: run `./scripts/run_eval_dataset.py docs/evals/assistant-contract.json --strict`.
|
| 211 |
+
- One-command host bootstrap: run `./scripts/bootstrap.sh`.
|
| 212 |
+
- Container profile: `docker compose up --build` (simulation/no-vision default).
|
| 213 |
+
- Home Assistant add-on starter path: [`deploy/home-assistant-addon`](deploy/home-assistant-addon).
|
| 214 |
- Todoist integration:
|
| 215 |
- `TODOIST_PERMISSION_PROFILE=readonly|control`
|
| 216 |
- `readonly` allows `todoist_list_tasks` and denies `todoist_add_task`
|
|
|
|
| 243 |
- failed outbound webhook/channel/email/push attempts are queued for operator replay:
|
| 244 |
- `dead_letter_list` to inspect queue state
|
| 245 |
- `dead_letter_replay` to retry specific or filtered entries
|
| 246 |
+
- Proactive assistant workflows (`proactive_assistant`):
|
| 247 |
+
- `briefing`, `anomaly_scan`, `routine_suggestions`, `follow_through`, `event_digest`
|
| 248 |
+
- Memory governance (`memory_governance`):
|
| 249 |
+
- per-user partition overlays + duplication/contradiction/staleness audits + cleanup
|
| 250 |
+
- Identity and trust controls (`identity_trust`):
|
| 251 |
+
- session confidence scoring, domain trust-policy management, guest-mode sessions, household profile admin
|
| 252 |
+
- Home orchestration (`home_orchestrator`):
|
| 253 |
+
- intent-to-plan decomposition, preflighted multi-entity execution with partial failure reporting, area policy constraints, automation suggestions, long-running task tracking
|
| 254 |
+
- Skills governance (`skills_governance`):
|
| 255 |
+
- capability negotiation, dependency health, quotas, harness runs, bundle signing metadata, sandbox templates
|
| 256 |
+
- Planning and autonomy (`planner_engine`):
|
| 257 |
+
- planner/executor split output, task graphs with checkpoint/resume, deferred scheduling, self-critique
|
| 258 |
+
- Quality and evaluation (`quality_evaluator`):
|
| 259 |
+
- weekly report generation + deterministic dataset-runner summary
|
| 260 |
+
- Embodiment roadmap controls (`embodiment_presence`):
|
| 261 |
+
- micro-expression library, user gaze calibration, adaptive gesture envelopes, privacy posture, motion safety envelope
|
| 262 |
+
- Integration workflows (`integration_hub`):
|
| 263 |
+
- calendar CRUD policy flow, notes capture backends, messaging draft/review/send flow, commute briefs, shopping orchestration, policy-gated research workflow
|
| 264 |
|
| 265 |
### First-Time Operator Checklist
|
| 266 |
|
TODO.md
CHANGED
|
@@ -40,12 +40,12 @@ This is a fresh roadmap focused on what is still missing for a "feels-like-Jarvi
|
|
| 40 |
|
| 41 |
## 3) Proactive Assistant Behavior (6 items)
|
| 42 |
|
| 43 |
-
- [
|
| 44 |
-
- [
|
| 45 |
- [x] `PX03` Add "nudge policy" (when to interrupt vs defer) with user-configurable quiet windows. `P0`
|
| 46 |
-
- [
|
| 47 |
-
- [
|
| 48 |
-
- [
|
| 49 |
|
| 50 |
## 4) Memory and Personalization (6 items)
|
| 51 |
|
|
@@ -53,25 +53,25 @@ This is a fresh roadmap focused on what is still missing for a "feels-like-Jarvi
|
|
| 53 |
- [x] `MX02` Add "memory confidence" and "source trail" to prevent stale or hallucinated recall. `P0`
|
| 54 |
- [x] `MX03` Add memory correction flow ("forget this", "update that") as first-class voice commands. `P0`
|
| 55 |
- [x] `MX04` Add episodic timeline snapshots for recent important conversations/actions. `P1`
|
| 56 |
-
- [
|
| 57 |
-
- [
|
| 58 |
|
| 59 |
## 5) Multi-User Identity and Trust (6 items)
|
| 60 |
|
| 61 |
-
- [
|
| 62 |
-
- [
|
| 63 |
- [x] `IX03` Add step-up verification path for high-risk requests (spoken code or operator approval). `P0`
|
| 64 |
-
- [
|
| 65 |
-
- [
|
| 66 |
- [x] `IX06` Add audit explainability: record why an action was allowed/blocked in user-readable terms. `P1`
|
| 67 |
|
| 68 |
## 6) Home Intelligence and Automation (6 items)
|
| 69 |
|
| 70 |
-
- [
|
| 71 |
-
- [
|
| 72 |
-
- [
|
| 73 |
-
- [
|
| 74 |
-
- [
|
| 75 |
- [x] `HX06` Add idempotent action guardrails to avoid repeated toggles during ambiguous dialogue. `P1`
|
| 76 |
|
| 77 |
## 7) Operator Surfaces and Control (6 items)
|
|
@@ -85,21 +85,21 @@ This is a fresh roadmap focused on what is still missing for a "feels-like-Jarvi
|
|
| 85 |
|
| 86 |
## 8) Skills Ecosystem and Extensibility (6 items)
|
| 87 |
|
| 88 |
-
- [
|
| 89 |
-
- [
|
| 90 |
-
- [
|
| 91 |
-
- [
|
| 92 |
-
- [
|
| 93 |
-
- [
|
| 94 |
|
| 95 |
## 9) Planning and Autonomy (6 items)
|
| 96 |
|
| 97 |
-
- [
|
| 98 |
-
- [
|
| 99 |
-
- [
|
| 100 |
- [x] `AX04` Add ambiguity detector to request clarifications before risky plan execution. `P0`
|
| 101 |
- [x] `AX05` Add human-readable plan preview before execution for medium/high-risk actions. `P0`
|
| 102 |
-
- [
|
| 103 |
|
| 104 |
## 10) Reliability and Runtime Safety (6 items)
|
| 105 |
|
|
@@ -115,36 +115,36 @@ This is a fresh roadmap focused on what is still missing for a "feels-like-Jarvi
|
|
| 115 |
- [x] `EX01` Add intent-level success metrics (answer quality, completion success, correction frequency). `P0`
|
| 116 |
- [x] `EX02` Add percentile dashboards for end-to-end turn latency by mode and tool mix. `P1`
|
| 117 |
- [x] `EX03` Add policy-decision analytics (allow/deny reason distribution by user and tool). `P1`
|
| 118 |
-
- [
|
| 119 |
-
- [
|
| 120 |
- [x] `EX06` Add "Jarvis scorecard" combining latency, reliability, initiative, and trust metrics. `P1`
|
| 121 |
|
| 122 |
## 12) Embodiment and Presence (6 items)
|
| 123 |
|
| 124 |
-
- [
|
| 125 |
- [x] `BX02` Add conversational turn choreography (listen lean-in, think glance-away, answer lock-on). `P0`
|
| 126 |
-
- [
|
| 127 |
-
- [
|
| 128 |
-
- [
|
| 129 |
-
- [
|
| 130 |
|
| 131 |
## 13) Integrations and Productivity Surface (6 items)
|
| 132 |
|
| 133 |
-
- [
|
| 134 |
-
- [
|
| 135 |
-
- [
|
| 136 |
-
- [
|
| 137 |
-
- [
|
| 138 |
-
- [
|
| 139 |
|
| 140 |
## 14) Packaging, Deployment, and Ecosystem Fit (6 items)
|
| 141 |
|
| 142 |
-
- [
|
| 143 |
-
- [
|
| 144 |
- [x] `DX03` Add backup/restore CLI for memory, audit, runtime state, and operator settings. `P1`
|
| 145 |
-
- [
|
| 146 |
-
- [
|
| 147 |
-
- [
|
| 148 |
|
| 149 |
---
|
| 150 |
|
|
|
|
| 40 |
|
| 41 |
## 3) Proactive Assistant Behavior (6 items)
|
| 42 |
|
| 43 |
+
- [x] `PX01` Add proactive briefing engine (morning/evening) built from calendar, reminders, weather, and home state. `P0`
|
| 44 |
+
- [x] `PX02` Add proactive anomaly notifications (device offline, unusual temp, missed reminder). `P0`
|
| 45 |
- [x] `PX03` Add "nudge policy" (when to interrupt vs defer) with user-configurable quiet windows. `P0`
|
| 46 |
+
- [x] `PX04` Add routine suggestions based on repeated behavior patterns (opt-in only). `P1`
|
| 47 |
+
- [x] `PX05` Add proactive follow-through ("I can do that now" for pending tasks after confirmations). `P1`
|
| 48 |
+
- [x] `PX06` Add proactive event summarization with digest and snooze controls. `P1`
|
| 49 |
|
| 50 |
## 4) Memory and Personalization (6 items)
|
| 51 |
|
|
|
|
| 53 |
- [x] `MX02` Add "memory confidence" and "source trail" to prevent stale or hallucinated recall. `P0`
|
| 54 |
- [x] `MX03` Add memory correction flow ("forget this", "update that") as first-class voice commands. `P0`
|
| 55 |
- [x] `MX04` Add episodic timeline snapshots for recent important conversations/actions. `P1`
|
| 56 |
+
- [x] `MX05` Add per-user memory partitions with shared/common memory overlays. `P1`
|
| 57 |
+
- [x] `MX06` Add memory quality audits (duplication, contradiction, stale data) with cleanup tools. `P1`
|
| 58 |
|
| 59 |
## 5) Multi-User Identity and Trust (6 items)
|
| 60 |
|
| 61 |
+
- [x] `IX01` Add session-level identity confidence score from voice context + operator hints. `P0`
|
| 62 |
+
- [x] `IX02` Add per-user trust policies for high-risk domains (locks, alarms, purchases, external messages). `P0`
|
| 63 |
- [x] `IX03` Add step-up verification path for high-risk requests (spoken code or operator approval). `P0`
|
| 64 |
+
- [x] `IX04` Add "guest mode" with constrained capabilities and automatic expiry. `P1`
|
| 65 |
+
- [x] `IX05` Add household profile management in operator UI (users, roles, trust, exceptions). `P1`
|
| 66 |
- [x] `IX06` Add audit explainability: record why an action was allowed/blocked in user-readable terms. `P1`
|
| 67 |
|
| 68 |
## 6) Home Intelligence and Automation (6 items)
|
| 69 |
|
| 70 |
+
- [x] `HX01` Add intent-to-plan decomposition for complex home requests ("movie mode", "bedtime routine"). `P0`
|
| 71 |
+
- [x] `HX02` Add safe multi-entity execution with preflight checks and partial-failure reporting. `P0`
|
| 72 |
+
- [x] `HX03` Add area-level policy constraints (e.g., no loud actions in bedroom after quiet hours). `P0`
|
| 73 |
+
- [x] `HX04` Add Home Assistant automation suggestion mode with review before creation. `P1`
|
| 74 |
+
- [x] `HX05` Add long-running home task tracking (start, in-progress, completed) in status and operator UI. `P1`
|
| 75 |
- [x] `HX06` Add idempotent action guardrails to avoid repeated toggles during ambiguous dialogue. `P1`
|
| 76 |
|
| 77 |
## 7) Operator Surfaces and Control (6 items)
|
|
|
|
| 85 |
|
| 86 |
## 8) Skills Ecosystem and Extensibility (6 items)
|
| 87 |
|
| 88 |
+
- [x] `SX01` Add skill capability negotiation so planner can reason about tool quality and reliability. `P1`
|
| 89 |
+
- [x] `SX02` Add skill dependency graph and health reporting (missing deps, version conflicts). `P1`
|
| 90 |
+
- [x] `SX03` Add per-skill runtime quotas (rate, CPU time, outbound calls). `P1`
|
| 91 |
+
- [x] `SX04` Add skill test harness CLI and fixture-based contract validation. `P1`
|
| 92 |
+
- [x] `SX05` Add signed skill distribution bundle format with integrity metadata. `P2`
|
| 93 |
+
- [x] `SX06` Add skill sandbox policy templates (`read-only`, `network-limited`, `local-only`). `P1`
|
| 94 |
|
| 95 |
## 9) Planning and Autonomy (6 items)
|
| 96 |
|
| 97 |
+
- [x] `AX01` Add explicit planner/executor split with retry policy and rollback hints. `P0`
|
| 98 |
+
- [x] `AX02` Add task graph execution for multi-step goals with checkpointing and resume. `P1`
|
| 99 |
+
- [x] `AX03` Add dependency-aware scheduling for deferred actions and reminders. `P1`
|
| 100 |
- [x] `AX04` Add ambiguity detector to request clarifications before risky plan execution. `P0`
|
| 101 |
- [x] `AX05` Add human-readable plan preview before execution for medium/high-risk actions. `P0`
|
| 102 |
+
- [x] `AX06` Add planner self-critique pass for expensive/complex plans before commit. `P2`
|
| 103 |
|
| 104 |
## 10) Reliability and Runtime Safety (6 items)
|
| 105 |
|
|
|
|
| 115 |
- [x] `EX01` Add intent-level success metrics (answer quality, completion success, correction frequency). `P0`
|
| 116 |
- [x] `EX02` Add percentile dashboards for end-to-end turn latency by mode and tool mix. `P1`
|
| 117 |
- [x] `EX03` Add policy-decision analytics (allow/deny reason distribution by user and tool). `P1`
|
| 118 |
+
- [x] `EX04` Add weekly automated "assistant quality report" artifact (errors, regressions, wins). `P1`
|
| 119 |
+
- [x] `EX05` Add evaluation dataset runner for deterministic prompt/tool contract tests. `P1`
|
| 120 |
- [x] `EX06` Add "Jarvis scorecard" combining latency, reliability, initiative, and trust metrics. `P1`
|
| 121 |
|
| 122 |
## 12) Embodiment and Presence (6 items)
|
| 123 |
|
| 124 |
+
- [x] `BX01` Add richer stateful micro-expression library mapped to dialogue intent and certainty. `P1`
|
| 125 |
- [x] `BX02` Add conversational turn choreography (listen lean-in, think glance-away, answer lock-on). `P0`
|
| 126 |
+
- [x] `BX03` Add user-specific gaze behavior calibration for desk distance and seating position. `P1`
|
| 127 |
+
- [x] `BX04` Add adaptive speaking gesture envelopes based on response emotion/importance. `P1`
|
| 128 |
+
- [x] `BX05` Add explicit "privacy posture" transitions on mute/sensitive operations. `P0`
|
| 129 |
+
- [x] `BX06` Add motion safety envelopes linked to runtime context (proximity, hardware state). `P0`
|
| 130 |
|
| 131 |
## 13) Integrations and Productivity Surface (6 items)
|
| 132 |
|
| 133 |
+
- [x] `GX01` Add richer calendar actions (create/update/delete with confirmation policy). `P1`
|
| 134 |
+
- [x] `GX02` Add notes/knowledge capture integration (Obsidian/Notion/local markdown) with trust controls. `P1`
|
| 135 |
+
- [x] `GX03` Add messaging assistant workflows (draft/review/send) for Slack/Discord/email. `P1`
|
| 136 |
+
- [x] `GX04` Add commute/travel briefing integration (traffic/transit APIs). `P2`
|
| 137 |
+
- [x] `GX05` Add shopping/task orchestration across Todoist + Home Assistant + notifications. `P1`
|
| 138 |
+
- [x] `GX06` Add contextual web research workflow with citation capture and policy gating. `P2`
|
| 139 |
|
| 140 |
## 14) Packaging, Deployment, and Ecosystem Fit (6 items)
|
| 141 |
|
| 142 |
+
- [x] `DX01` Add one-command local install/bootstrap script for clean hosts. `P1`
|
| 143 |
+
- [x] `DX02` Add containerized deployment profile for always-on home-server runtime. `P1`
|
| 144 |
- [x] `DX03` Add backup/restore CLI for memory, audit, runtime state, and operator settings. `P1`
|
| 145 |
+
- [x] `DX04` Add staged release channels (`dev`, `beta`, `stable`) with migration checks. `P2`
|
| 146 |
+
- [x] `DX05` Add Home Assistant add-on packaging path and setup guide. `P2`
|
| 147 |
+
- [x] `DX06` Add release acceptance suite focused on "Jarvis feel" scenarios before ship. `P1`
|
| 148 |
|
| 149 |
---
|
| 150 |
|
config/release-channels.json
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"default_channel": "dev",
|
| 3 |
+
"channels": {
|
| 4 |
+
"dev": {
|
| 5 |
+
"description": "Fast iteration channel",
|
| 6 |
+
"required_checks": [
|
| 7 |
+
{"type": "file_exists", "path": "TODO.md"},
|
| 8 |
+
{"type": "file_exists", "path": "README.md"}
|
| 9 |
+
]
|
| 10 |
+
},
|
| 11 |
+
"beta": {
|
| 12 |
+
"description": "Pre-stable validation channel",
|
| 13 |
+
"required_checks": [
|
| 14 |
+
{"type": "file_exists", "path": "scripts/release_acceptance.sh"},
|
| 15 |
+
{"type": "text_contains", "path": "README.md", "needle": "Release acceptance"}
|
| 16 |
+
]
|
| 17 |
+
},
|
| 18 |
+
"stable": {
|
| 19 |
+
"description": "Production channel",
|
| 20 |
+
"required_checks": [
|
| 21 |
+
{"type": "file_exists", "path": ".github/workflows/ci.yml"},
|
| 22 |
+
{"type": "file_exists", "path": ".github/workflows/fault-profiles.yml"},
|
| 23 |
+
{"type": "file_exists", "path": "scripts/generate_quality_report.py"}
|
| 24 |
+
]
|
| 25 |
+
}
|
| 26 |
+
}
|
| 27 |
+
}
|
deploy/home-assistant-addon/Dockerfile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ARG BUILD_FROM=ghcr.io/home-assistant/amd64-base-python:3.12
|
| 2 |
+
FROM ${BUILD_FROM}
|
| 3 |
+
|
| 4 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 5 |
+
PYTHONUNBUFFERED=1
|
| 6 |
+
|
| 7 |
+
RUN apk add --no-cache bash curl
|
| 8 |
+
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 9 |
+
ENV PATH="/root/.local/bin:${PATH}"
|
| 10 |
+
|
| 11 |
+
WORKDIR /opt/jarvis
|
| 12 |
+
COPY . /opt/jarvis
|
| 13 |
+
RUN uv sync --frozen --extra dev
|
| 14 |
+
|
| 15 |
+
CMD ["uv", "run", "python", "-m", "jarvis", "--sim", "--no-vision"]
|
deploy/home-assistant-addon/README.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Home Assistant Add-on Path
|
| 2 |
+
|
| 3 |
+
This directory contains a starter add-on packaging profile for running Jarvis on a Home Assistant host.
|
| 4 |
+
|
| 5 |
+
## Build/packaging notes
|
| 6 |
+
|
| 7 |
+
- `config.yaml`: add-on metadata and options schema.
|
| 8 |
+
- `Dockerfile`: base image plus Jarvis runtime dependencies.
|
| 9 |
+
- The runtime command starts Jarvis in simulation/no-vision mode by default for safer first boot.
|
| 10 |
+
|
| 11 |
+
## Installation workflow
|
| 12 |
+
|
| 13 |
+
1. Copy this folder into a Home Assistant add-on repository.
|
| 14 |
+
2. Provide API keys in add-on options or environment overrides.
|
| 15 |
+
3. Build and install via Home Assistant Supervisor.
|
| 16 |
+
4. Validate with dry-run tooling before enabling mutating integrations.
|
deploy/home-assistant-addon/config.yaml
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Jarvis Assistant
|
| 2 |
+
version: "0.1.0"
|
| 3 |
+
slug: jarvis_assistant
|
| 4 |
+
description: Jarvis AI assistant runtime for Home Assistant hosts
|
| 5 |
+
startup: services
|
| 6 |
+
boot: auto
|
| 7 |
+
arch:
|
| 8 |
+
- amd64
|
| 9 |
+
- aarch64
|
| 10 |
+
- armv7
|
| 11 |
+
map:
|
| 12 |
+
- config:rw
|
| 13 |
+
- addon_config:rw
|
| 14 |
+
options:
|
| 15 |
+
wake_mode: wake_word
|
| 16 |
+
no_vision: true
|
| 17 |
+
schema:
|
| 18 |
+
wake_mode: str
|
| 19 |
+
no_vision: bool
|
| 20 |
+
anthropic_api_key: password
|
| 21 |
+
elevenlabs_api_key: password
|
| 22 |
+
ingress: false
|
| 23 |
+
homeassistant_api: true
|
| 24 |
+
hassio_api: true
|
| 25 |
+
host_network: false
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
jarvis:
|
| 3 |
+
build:
|
| 4 |
+
context: .
|
| 5 |
+
dockerfile: Dockerfile
|
| 6 |
+
container_name: jarvis
|
| 7 |
+
env_file:
|
| 8 |
+
- .env
|
| 9 |
+
volumes:
|
| 10 |
+
- jarvis-data:/root/.jarvis
|
| 11 |
+
restart: unless-stopped
|
| 12 |
+
command: ["uv", "run", "python", "-m", "jarvis", "--sim", "--no-vision"]
|
| 13 |
+
|
| 14 |
+
volumes:
|
| 15 |
+
jarvis-data:
|
docs/evals/assistant-contract.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cases": [
|
| 3 |
+
{
|
| 4 |
+
"id": "status_contract",
|
| 5 |
+
"expected_contains": ["schema_version", "tool_policy", "health"],
|
| 6 |
+
"expected_tools": ["system_status"],
|
| 7 |
+
"actual_response": "schema_version tool_policy health",
|
| 8 |
+
"actual_tools": ["system_status"]
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"id": "identity_guardrail",
|
| 12 |
+
"expected_contains": ["approval"],
|
| 13 |
+
"expected_tools": ["identity_trust"],
|
| 14 |
+
"actual_response": "requires approval",
|
| 15 |
+
"actual_tools": ["identity_trust"]
|
| 16 |
+
}
|
| 17 |
+
]
|
| 18 |
+
}
|
scripts/bootstrap.sh
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
usage() {
|
| 5 |
+
cat <<'USAGE'
|
| 6 |
+
Usage: ./scripts/bootstrap.sh [--quick]
|
| 7 |
+
|
| 8 |
+
Bootstraps Jarvis on a clean host:
|
| 9 |
+
- ensures uv is installed
|
| 10 |
+
- syncs dependencies
|
| 11 |
+
- creates .env from .env.example if missing
|
| 12 |
+
- runs baseline validation (unless --quick)
|
| 13 |
+
USAGE
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
quick=false
|
| 17 |
+
if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
|
| 18 |
+
usage
|
| 19 |
+
exit 0
|
| 20 |
+
fi
|
| 21 |
+
if [[ "${1:-}" == "--quick" ]]; then
|
| 22 |
+
quick=true
|
| 23 |
+
fi
|
| 24 |
+
|
| 25 |
+
if ! command -v uv >/dev/null 2>&1; then
|
| 26 |
+
echo "Installing uv..."
|
| 27 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
| 28 |
+
export PATH="$HOME/.local/bin:$PATH"
|
| 29 |
+
fi
|
| 30 |
+
|
| 31 |
+
if [[ ! -f .env && -f .env.example ]]; then
|
| 32 |
+
cp .env.example .env
|
| 33 |
+
echo "Created .env from .env.example"
|
| 34 |
+
fi
|
| 35 |
+
|
| 36 |
+
uv sync --extra dev
|
| 37 |
+
|
| 38 |
+
if [[ "$quick" == "false" ]]; then
|
| 39 |
+
uv run ruff check src tests
|
| 40 |
+
uv run pytest -q tests/test_config.py tests/test_tools_services.py
|
| 41 |
+
fi
|
| 42 |
+
|
| 43 |
+
echo "Bootstrap complete."
|
scripts/check_release_channel.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _run_check(base: Path, check: dict[str, Any]) -> dict[str, Any]:
|
| 11 |
+
kind = str(check.get("type", "")).strip().lower()
|
| 12 |
+
path = str(check.get("path", "")).strip()
|
| 13 |
+
target = (base / path).resolve() if path else base
|
| 14 |
+
|
| 15 |
+
if kind == "file_exists":
|
| 16 |
+
ok = target.exists()
|
| 17 |
+
return {"type": kind, "path": path, "ok": ok, "detail": "exists" if ok else "missing"}
|
| 18 |
+
|
| 19 |
+
if kind == "text_contains":
|
| 20 |
+
needle = str(check.get("needle", "")).strip()
|
| 21 |
+
if not target.exists() or not target.is_file():
|
| 22 |
+
return {"type": kind, "path": path, "ok": False, "detail": "missing_file"}
|
| 23 |
+
text = target.read_text(encoding="utf-8", errors="replace")
|
| 24 |
+
ok = needle in text
|
| 25 |
+
return {
|
| 26 |
+
"type": kind,
|
| 27 |
+
"path": path,
|
| 28 |
+
"needle": needle,
|
| 29 |
+
"ok": ok,
|
| 30 |
+
"detail": "found" if ok else "missing_needle",
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
return {"type": kind or "unknown", "path": path, "ok": False, "detail": "unsupported_check_type"}
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def main() -> int:
    """CLI entry point: run the selected channel's required checks, print JSON, return exit status."""
    parser = argparse.ArgumentParser(description="Validate staged release-channel checks.")
    parser.add_argument("--channel", required=True, choices=["dev", "beta", "stable"])
    parser.add_argument("--config", default="config/release-channels.json")
    parser.add_argument("--workspace", default=".")
    args = parser.parse_args()

    root = Path(args.workspace).resolve()
    raw_config = json.loads((root / args.config).resolve().read_text(encoding="utf-8"))

    # Defensive navigation: tolerate malformed config shapes by falling back to empties.
    channel_map = raw_config.get("channels", {}) if isinstance(raw_config, dict) else {}
    selected = channel_map.get(args.channel, {}) if isinstance(channel_map, dict) else {}
    required = selected.get("required_checks", []) if isinstance(selected, dict) else []

    outcomes = [_run_check(root, item) for item in required if isinstance(item, dict)]
    failures = [item for item in outcomes if not bool(item.get("ok"))]

    report = {
        "channel": args.channel,
        "passed": not failures,
        "check_count": len(outcomes),
        "failed_count": len(failures),
        "results": outcomes,
    }
    print(json.dumps(report, indent=2))
    # Non-zero exit signals CI that at least one required check failed.
    return 1 if failures else 0


if __name__ == "__main__":
    raise SystemExit(main())
|
scripts/generate_quality_report.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
import time
|
| 7 |
+
from collections import Counter
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Parse a JSONL file, keeping only well-formed JSON objects.

    Missing files, blank lines, malformed JSON, and non-object payloads are all
    silently skipped so a partially corrupt audit log still yields a report.
    """
    if not path.exists():
        return []

    records: list[dict[str, Any]] = []
    raw = path.read_text(encoding="utf-8", errors="replace")
    for raw_line in raw.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        try:
            parsed = json.loads(stripped)
        except Exception:
            # Best-effort reader: a bad line must not abort the whole report.
            continue
        if isinstance(parsed, dict):
            records.append(parsed)
    return records
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _build_report(entries: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate audit-log entries into the weekly quality-report payload."""

    def outcome_of(row: dict[str, Any]) -> str:
        # Prefer the explicit decision outcome; fall back to the legacy "result" key.
        return str(row.get("decision_outcome", row.get("result", "unknown")))

    action_counts = Counter(str(row.get("action", "unknown")) for row in entries)
    outcome_counts = Counter(outcome_of(row) for row in entries)

    failure_states = {"failed", "denied", "blocked", "error"}
    failure_rows = [row for row in entries if outcome_of(row).lower() in failure_states]
    reason_counts = Counter(
        str(row.get("decision_reason", row.get("reason", "unknown"))) for row in failure_rows
    )

    regressions = (
        ["High failure counts should be triaged from top_failure_reasons."]
        if failure_rows
        else []
    )
    return {
        "generated_at": time.time(),
        "total_events": len(entries),
        "event_count_by_action": dict(action_counts.most_common(20)),
        "event_count_by_outcome": dict(outcome_counts),
        "failure_count": len(failure_rows),
        "top_failure_reasons": dict(reason_counts.most_common(10)),
        "wins": [
            "Maintained audit coverage for operational actions.",
            "Captured decision outcomes for trust/policy review.",
        ],
        "regressions": regressions,
    }
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _markdown(report: dict[str, Any]) -> str:
    """Render the report payload as a human-readable Markdown summary."""
    timestamp = datetime.fromtimestamp(float(report.get("generated_at", 0.0))).isoformat()

    out: list[str] = []
    out.append("# Jarvis Weekly Quality Report")
    out.append("")
    out.append(f"Generated: {timestamp}")
    out.append("")
    out.append(f"- Total events: {int(report.get('total_events', 0))}")
    out.append(f"- Failure events: {int(report.get('failure_count', 0))}")
    out.append("")
    out.append("## Outcome Distribution")
    outcome_counts = report.get("event_count_by_outcome") or {}
    out.extend(f"- {name}: {count}" for name, count in sorted(outcome_counts.items()))
    out.append("")
    out.append("## Top Failure Reasons")
    failure_reasons = report.get("top_failure_reasons") or {}
    if isinstance(failure_reasons, dict) and failure_reasons:
        out.extend(f"- {name}: {count}" for name, count in failure_reasons.items())
    else:
        out.append("- none")
    # Trailing newline so the file ends cleanly when written to disk.
    return "\n".join(out) + "\n"
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def main() -> int:
    """CLI entry point: read the audit log, build the report, and write artifacts.

    Writes a timestamped JSON report (and, with --markdown, a Markdown twin)
    into --output-dir, then prints the report plus artifact paths as JSON.
    Always returns 0; an unreadable config/log surfaces as an exception.
    """
    from datetime import timezone  # local import: keeps module-level imports untouched

    parser = argparse.ArgumentParser(description="Generate weekly Jarvis quality report artifacts.")
    parser.add_argument("--audit-log", default=str(Path.home() / ".jarvis" / "audit.jsonl"))
    parser.add_argument("--output-dir", default=str(Path(".artifacts") / "quality"))
    parser.add_argument("--markdown", action="store_true")
    args = parser.parse_args()

    audit_log = Path(args.audit_log).expanduser()
    output_dir = Path(args.output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    entries = _read_jsonl(audit_log)
    report = _build_report(entries)

    # Fix: datetime.utcnow() is naive and deprecated (Python 3.12+);
    # an aware UTC timestamp produces the same "%Y%m%d-%H%M%S" string.
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    json_path = output_dir / f"weekly-quality-{stamp}.json"
    json_path.write_text(json.dumps(report, indent=2), encoding="utf-8")

    artifact: dict[str, Any] = {
        "json": str(json_path),
        "markdown": "",
    }
    if args.markdown:
        md_path = output_dir / f"weekly-quality-{stamp}.md"
        md_path.write_text(_markdown(report), encoding="utf-8")
        artifact["markdown"] = str(md_path)

    print(json.dumps({"report": report, "artifacts": artifact}, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
scripts/release_acceptance.sh
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Release acceptance gate: lint first, then run the pytest profile given as $1.
# Usage: release_acceptance.sh [fast|full]   (defaults to "full")
set -euo pipefail

selected_profile="${1:-full}"

# Full profile: the complete core acceptance suite.
core_suite() {
  uv run pytest -q \
    tests/test_brain.py \
    tests/test_presence.py \
    tests/test_voice_attention.py \
    tests/test_turn_taking.py \
    tests/test_tools_services.py -k "system_status or scorecard or identity"
}

# Fast profile: trimmed smoke subset for quick iteration.
fast_suite() {
  uv run pytest -q \
    tests/test_brain.py -k "interaction_contract or response_mode or confidence" \
    tests/test_presence.py -k "choreography or muted" \
    tests/test_tools_services.py -k "system_status_contract_reports_expected_fields"
}

# Lint always runs, regardless of profile.
uv run ruff check src tests

if [ "$selected_profile" = "fast" ]; then
  fast_suite
elif [ "$selected_profile" = "full" ]; then
  core_suite
else
  echo "Unknown profile: $selected_profile (expected: fast|full)" >&2
  exit 2
fi

echo "Release acceptance suite passed ($selected_profile)."
|
scripts/run_eval_dataset.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import argparse
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _as_list(value: Any) -> list[str]:
    """Coerce *value* into a list of non-empty strings.

    Lists are filtered element-wise (blank-only entries dropped); a non-blank
    string becomes a one-element list; anything else yields an empty list.
    """
    if isinstance(value, list):
        return [str(entry) for entry in value if str(entry).strip()]
    if isinstance(value, str):
        stripped = value.strip()
        if stripped:
            return [stripped]
    return []


def _evaluate_case(case: dict[str, Any]) -> dict[str, Any]:
    """Score one eval case: required response substrings and required tool calls."""
    response_text = str(case.get("actual_response", ""))
    tools_used = {str(item) for item in _as_list(case.get("actual_tools"))}
    required_snippets = _as_list(case.get("expected_contains"))
    required_tools = {str(item) for item in _as_list(case.get("expected_tools"))}

    absent_snippets = [snippet for snippet in required_snippets if snippet not in response_text]
    absent_tools = sorted(required_tools - tools_used)

    return {
        "id": str(case.get("id", "case")),
        "passed": not absent_snippets and not absent_tools,
        "missing_text": absent_snippets,
        "missing_tools": absent_tools,
    }
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def main() -> int:
    """CLI entry point: evaluate a dataset file and print/persist the summary.

    Exit status is 0 when the run is accepted, 1 otherwise. Acceptance is
    "zero failures" under --strict, else "at least as many passes as failures".
    """
    parser = argparse.ArgumentParser(description="Run deterministic evaluation dataset checks.")
    parser.add_argument("dataset", help="Path to dataset JSON")
    parser.add_argument("--output", default="")
    parser.add_argument("--strict", action="store_true")
    args = parser.parse_args()

    dataset_path = Path(args.dataset)
    raw = json.loads(dataset_path.read_text(encoding="utf-8"))
    case_rows = raw.get("cases", []) if isinstance(raw, dict) else []
    if not isinstance(case_rows, list):
        raise SystemExit("Dataset format error: expected top-level object with 'cases' list.")

    results = [_evaluate_case(row) for row in case_rows if isinstance(row, dict)]
    pass_count = sum(1 for row in results if row["passed"])
    fail_count = len(results) - pass_count
    accepted = (fail_count == 0) if args.strict else (pass_count >= fail_count)
    summary = {
        "dataset": str(dataset_path),
        "strict": bool(args.strict),
        "case_count": len(results),
        "passed": pass_count,
        "failed": fail_count,
        "pass_rate": (pass_count / len(results)) if results else 0.0,
        "accepted": accepted,
        "results": results,
    }

    rendered = json.dumps(summary, indent=2)
    print(rendered)
    if args.output:
        destination = Path(args.output)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(rendered, encoding="utf-8")

    return 0 if summary["accepted"] else 1


if __name__ == "__main__":
    raise SystemExit(main())
|
src/jarvis/brain.py
CHANGED
|
@@ -269,6 +269,15 @@ class Brain:
|
|
| 269 |
"mcp__jarvis-services__skills_enable",
|
| 270 |
"mcp__jarvis-services__skills_disable",
|
| 271 |
"mcp__jarvis-services__skills_version",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
],
|
| 273 |
self._config.tool_allowlist,
|
| 274 |
self._config.tool_denylist,
|
|
|
|
| 269 |
"mcp__jarvis-services__skills_enable",
|
| 270 |
"mcp__jarvis-services__skills_disable",
|
| 271 |
"mcp__jarvis-services__skills_version",
|
| 272 |
+
"mcp__jarvis-services__proactive_assistant",
|
| 273 |
+
"mcp__jarvis-services__memory_governance",
|
| 274 |
+
"mcp__jarvis-services__identity_trust",
|
| 275 |
+
"mcp__jarvis-services__home_orchestrator",
|
| 276 |
+
"mcp__jarvis-services__skills_governance",
|
| 277 |
+
"mcp__jarvis-services__planner_engine",
|
| 278 |
+
"mcp__jarvis-services__quality_evaluator",
|
| 279 |
+
"mcp__jarvis-services__embodiment_presence",
|
| 280 |
+
"mcp__jarvis-services__integration_hub",
|
| 281 |
],
|
| 282 |
self._config.tool_allowlist,
|
| 283 |
self._config.tool_denylist,
|
src/jarvis/tools/services.py
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tests/test_tools_services.py
CHANGED
|
@@ -3149,7 +3149,7 @@ class TestServicesTools:
|
|
| 3149 |
|
| 3150 |
result = await services.system_status({})
|
| 3151 |
payload = json.loads(result["content"][0]["text"])
|
| 3152 |
-
assert payload["schema_version"] == "
|
| 3153 |
assert "local_time" in payload
|
| 3154 |
assert "tool_policy" in payload
|
| 3155 |
assert isinstance(payload["tool_policy"]["home_require_confirm_execute"], bool)
|
|
@@ -3249,6 +3249,10 @@ class TestServicesTools:
|
|
| 3249 |
assert "failed_count" in payload["dead_letter_queue"]
|
| 3250 |
assert "replayed_count" in payload["dead_letter_queue"]
|
| 3251 |
assert isinstance(payload["dead_letter_queue"]["recent"], list)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3252 |
assert payload["health"]["health_level"] in {"ok", "degraded", "error"}
|
| 3253 |
|
| 3254 |
@pytest.mark.asyncio
|
|
@@ -3257,7 +3261,7 @@ class TestServicesTools:
|
|
| 3257 |
|
| 3258 |
result = await services.system_status_contract({})
|
| 3259 |
payload = json.loads(result["content"][0]["text"])
|
| 3260 |
-
assert payload["schema_version"] == "
|
| 3261 |
assert "top_level_required" in payload
|
| 3262 |
assert "tool_policy" in payload["top_level_required"]
|
| 3263 |
assert "identity" in payload["top_level_required"]
|
|
@@ -3336,6 +3340,9 @@ class TestServicesTools:
|
|
| 3336 |
assert "dead_letter_queue" in payload["top_level_required"]
|
| 3337 |
assert "dead_letter_queue_required" in payload
|
| 3338 |
assert "pending_count" in payload["dead_letter_queue_required"]
|
|
|
|
|
|
|
|
|
|
| 3339 |
|
| 3340 |
@pytest.mark.asyncio
|
| 3341 |
async def test_jarvis_scorecard_reports_unified_dimensions(self, tmp_path):
|
|
@@ -3753,6 +3760,11 @@ class TestServicesTools:
|
|
| 3753 |
assert schemas["webhook_inbound_list"]["properties"]["limit"]["type"] == "integer"
|
| 3754 |
assert schemas["tool_summary"]["properties"]["limit"]["type"] == "integer"
|
| 3755 |
assert schemas["tool_summary_text"]["properties"]["limit"]["type"] == "integer"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3756 |
|
| 3757 |
def test_service_schema_identity_fields_present_for_mutating_tools(self):
|
| 3758 |
from jarvis.tools import services
|
|
@@ -3958,3 +3970,122 @@ class TestServicesTools:
|
|
| 3958 |
assert payload["nested"]["api_key"] == "***REDACTED***"
|
| 3959 |
assert payload["nested"]["safe"].endswith("...<truncated>")
|
| 3960 |
assert payload["items"][-1].startswith("<truncated_items:")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3149 |
|
| 3150 |
result = await services.system_status({})
|
| 3151 |
payload = json.loads(result["content"][0]["text"])
|
| 3152 |
+
assert payload["schema_version"] == "2.0"
|
| 3153 |
assert "local_time" in payload
|
| 3154 |
assert "tool_policy" in payload
|
| 3155 |
assert isinstance(payload["tool_policy"]["home_require_confirm_execute"], bool)
|
|
|
|
| 3249 |
assert "failed_count" in payload["dead_letter_queue"]
|
| 3250 |
assert "replayed_count" in payload["dead_letter_queue"]
|
| 3251 |
assert isinstance(payload["dead_letter_queue"]["recent"], list)
|
| 3252 |
+
assert "expansion" in payload
|
| 3253 |
+
assert "proactive" in payload["expansion"]
|
| 3254 |
+
assert "planner_engine" in payload["expansion"]
|
| 3255 |
+
assert "integration_hub" in payload["expansion"]
|
| 3256 |
assert payload["health"]["health_level"] in {"ok", "degraded", "error"}
|
| 3257 |
|
| 3258 |
@pytest.mark.asyncio
|
|
|
|
| 3261 |
|
| 3262 |
result = await services.system_status_contract({})
|
| 3263 |
payload = json.loads(result["content"][0]["text"])
|
| 3264 |
+
assert payload["schema_version"] == "2.0"
|
| 3265 |
assert "top_level_required" in payload
|
| 3266 |
assert "tool_policy" in payload["top_level_required"]
|
| 3267 |
assert "identity" in payload["top_level_required"]
|
|
|
|
| 3340 |
assert "dead_letter_queue" in payload["top_level_required"]
|
| 3341 |
assert "dead_letter_queue_required" in payload
|
| 3342 |
assert "pending_count" in payload["dead_letter_queue_required"]
|
| 3343 |
+
assert "expansion" in payload["top_level_required"]
|
| 3344 |
+
assert "expansion_required" in payload
|
| 3345 |
+
assert "proactive" in payload["expansion_required"]
|
| 3346 |
|
| 3347 |
@pytest.mark.asyncio
|
| 3348 |
async def test_jarvis_scorecard_reports_unified_dimensions(self, tmp_path):
|
|
|
|
| 3760 |
assert schemas["webhook_inbound_list"]["properties"]["limit"]["type"] == "integer"
|
| 3761 |
assert schemas["tool_summary"]["properties"]["limit"]["type"] == "integer"
|
| 3762 |
assert schemas["tool_summary_text"]["properties"]["limit"]["type"] == "integer"
|
| 3763 |
+
assert schemas["proactive_assistant"]["properties"]["snooze_minutes"]["type"] == "integer"
|
| 3764 |
+
assert schemas["memory_governance"]["properties"]["limit"]["type"] == "integer"
|
| 3765 |
+
assert schemas["skills_governance"]["properties"]["rate_per_min"]["type"] == "integer"
|
| 3766 |
+
assert schemas["planner_engine"]["properties"]["limit"]["type"] == "integer"
|
| 3767 |
+
assert schemas["quality_evaluator"]["properties"]["limit"]["type"] == "integer"
|
| 3768 |
|
| 3769 |
def test_service_schema_identity_fields_present_for_mutating_tools(self):
|
| 3770 |
from jarvis.tools import services
|
|
|
|
| 3970 |
assert payload["nested"]["api_key"] == "***REDACTED***"
|
| 3971 |
assert payload["nested"]["safe"].endswith("...<truncated>")
|
| 3972 |
assert payload["items"][-1].startswith("<truncated_items:")
|
| 3973 |
+
|
| 3974 |
+
@pytest.mark.asyncio
|
| 3975 |
+
async def test_expansion_tools_basic_actions(self, tmp_path):
|
| 3976 |
+
from jarvis.tools import services
|
| 3977 |
+
|
| 3978 |
+
proactive = await services.proactive_assistant({"action": "briefing", "mode": "morning"})
|
| 3979 |
+
proactive_payload = json.loads(proactive["content"][0]["text"])
|
| 3980 |
+
assert proactive_payload["action"] == "briefing"
|
| 3981 |
+
|
| 3982 |
+
memory_partition = await services.memory_governance(
|
| 3983 |
+
{"action": "partition", "user": "owner", "shared_scopes": ["preferences"]}
|
| 3984 |
+
)
|
| 3985 |
+
memory_payload = json.loads(memory_partition["content"][0]["text"])
|
| 3986 |
+
assert memory_payload["action"] == "partition"
|
| 3987 |
+
assert memory_payload["overlay"]["user"] == "owner"
|
| 3988 |
+
|
| 3989 |
+
trust_policy = await services.identity_trust(
|
| 3990 |
+
{"action": "policy_set", "domain": "locks", "required_profile": "trusted", "requires_step_up": True}
|
| 3991 |
+
)
|
| 3992 |
+
trust_payload = json.loads(trust_policy["content"][0]["text"])
|
| 3993 |
+
assert trust_payload["policy"]["required_profile"] == "trusted"
|
| 3994 |
+
|
| 3995 |
+
home_plan = await services.home_orchestrator({"action": "plan", "request_text": "activate movie mode"})
|
| 3996 |
+
home_payload = json.loads(home_plan["content"][0]["text"])
|
| 3997 |
+
assert home_payload["plan_label"] == "movie_mode"
|
| 3998 |
+
assert home_payload["step_count"] >= 1
|
| 3999 |
+
|
| 4000 |
+
skills_negotiation = await services.skills_governance(
|
| 4001 |
+
{"action": "negotiate", "requested_capabilities": ["forecast"]}
|
| 4002 |
+
)
|
| 4003 |
+
skills_payload = json.loads(skills_negotiation["content"][0]["text"])
|
| 4004 |
+
assert skills_payload["action"] == "negotiate"
|
| 4005 |
+
|
| 4006 |
+
planner = await services.planner_engine({"action": "plan", "goal": "prepare evening routine"})
|
| 4007 |
+
planner_payload = json.loads(planner["content"][0]["text"])
|
| 4008 |
+
assert planner_payload["action"] == "plan"
|
| 4009 |
+
assert "planner" in planner_payload
|
| 4010 |
+
|
| 4011 |
+
report_path = tmp_path / "quality.json"
|
| 4012 |
+
quality = await services.quality_evaluator({"action": "weekly_report", "report_path": str(report_path)})
|
| 4013 |
+
quality_payload = json.loads(quality["content"][0]["text"])
|
| 4014 |
+
assert quality_payload["action"] == "weekly_report"
|
| 4015 |
+
assert Path(quality_payload["artifact_path"]).exists()
|
| 4016 |
+
|
| 4017 |
+
embodiment = await services.embodiment_presence(
|
| 4018 |
+
{"action": "privacy_posture", "state": "muted", "reason": "sensitive_operation"}
|
| 4019 |
+
)
|
| 4020 |
+
embodiment_payload = json.loads(embodiment["content"][0]["text"])
|
| 4021 |
+
assert embodiment_payload["privacy_posture"]["state"] == "muted"
|
| 4022 |
+
|
| 4023 |
+
integration = await services.integration_hub(
|
| 4024 |
+
{"action": "notes_capture", "backend": "local_markdown", "title": "Test", "content": "hello"}
|
| 4025 |
+
)
|
| 4026 |
+
integration_payload = json.loads(integration["content"][0]["text"])
|
| 4027 |
+
assert integration_payload["stored"] is True
|
| 4028 |
+
assert Path(integration_payload["path"]).exists()
|
| 4029 |
+
|
| 4030 |
+
    @pytest.mark.asyncio
    async def test_identity_guest_session_capability_enforced(self):
        """A guest token scoped to system_status must not authorize smart-home control."""
        from jarvis.tools import services

        # Start a guest session limited to the "system_status" capability and
        # capture its session token.
        guest = await services.identity_trust(
            {"action": "guest_start", "guest_id": "visitor", "capabilities": ["system_status"]}
        )
        guest_payload = json.loads(guest["content"][0]["text"])
        token = guest_payload["token"]

        # Even a dry-run light command must be refused under that guest token.
        denied = await services.smart_home(
            {
                "domain": "light",
                "action": "turn_on",
                "entity_id": "light.living_room",
                "dry_run": True,
                "guest_session_token": token,
            }
        )
        assert "guest session does not allow" in denied["content"][0]["text"].lower()
|
| 4051 |
+
@pytest.mark.asyncio
|
| 4052 |
+
async def test_home_orchestrator_area_policy_surfaces_partial_failures(self):
|
| 4053 |
+
from jarvis.tools import services
|
| 4054 |
+
|
| 4055 |
+
await services.home_orchestrator(
|
| 4056 |
+
{
|
| 4057 |
+
"action": "area_policy_set",
|
| 4058 |
+
"area": "bedroom",
|
| 4059 |
+
"policy": {
|
| 4060 |
+
"quiet_hours_start": "22:00",
|
| 4061 |
+
"quiet_hours_end": "07:00",
|
| 4062 |
+
"blocked_actions": ["media_player:media_play"],
|
| 4063 |
+
},
|
| 4064 |
+
}
|
| 4065 |
+
)
|
| 4066 |
+
result = await services.home_orchestrator(
|
| 4067 |
+
{
|
| 4068 |
+
"action": "execute",
|
| 4069 |
+
"actions": [
|
| 4070 |
+
{"domain": "media_player", "action": "media_play", "entity_id": "media_player.bedroom_speaker"},
|
| 4071 |
+
{"domain": "light", "action": "turn_on", "entity_id": "light.kitchen"},
|
| 4072 |
+
],
|
| 4073 |
+
}
|
| 4074 |
+
)
|
| 4075 |
+
payload = json.loads(result["content"][0]["text"])
|
| 4076 |
+
assert payload["partial_failure"] is True
|
| 4077 |
+
assert payload["failed_count"] == 1
|
| 4078 |
+
|
| 4079 |
+
def test_release_scripts_and_workflows_exist(self):
|
| 4080 |
+
project_root = Path(__file__).resolve().parents[1]
|
| 4081 |
+
assert (project_root / "scripts" / "bootstrap.sh").exists()
|
| 4082 |
+
assert (project_root / "scripts" / "generate_quality_report.py").exists()
|
| 4083 |
+
assert (project_root / "scripts" / "run_eval_dataset.py").exists()
|
| 4084 |
+
assert (project_root / "scripts" / "release_acceptance.sh").exists()
|
| 4085 |
+
assert (project_root / "scripts" / "check_release_channel.py").exists()
|
| 4086 |
+
assert (project_root / ".github" / "workflows" / "assistant-quality-report.yml").exists()
|
| 4087 |
+
assert (project_root / ".github" / "workflows" / "release-acceptance.yml").exists()
|
| 4088 |
+
makefile_text = (project_root / "Makefile").read_text()
|
| 4089 |
+
assert "quality-report" in makefile_text
|
| 4090 |
+
assert "eval-dataset" in makefile_text
|
| 4091 |
+
assert "release-acceptance" in makefile_text
|