Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- README.md +57 -4
- index.html +73 -18
- pyproject.toml +14 -0
- style.css +188 -16
- talkie_gentleman/__init__.py +3 -0
- talkie_gentleman/main.py +121 -0
- talkie_gentleman/robot_behavior.py +127 -0
- talkie_gentleman/stt_engine.py +30 -0
- talkie_gentleman/talkie_inference.py +51 -0
- talkie_gentleman/tts_engine.py +58 -0
README.md
CHANGED
|
@@ -1,10 +1,63 @@
|
|
| 1 |
---
|
| 2 |
title: Talkie Gentleman
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Talkie Gentleman
|
| 3 |
+
emoji: 🎩
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: gray
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
| 8 |
+
tags:
|
| 9 |
+
- reachy_mini
|
| 10 |
+
- reachy_mini_python_app
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# 🎩 Talkie Gentleman
|
| 14 |
+
|
| 15 |
+
A next-gen robot channeling the 1920s — powered by [Talkie-1930](https://huggingface.co/talkie-lm/talkie-1930-13b-it) by [@AlecRad](https://twitter.com/AlecRad).
|
| 16 |
+
|
| 17 |
+
## What It Does
|
| 18 |
+
|
| 19 |
+
Transforms your Reachy Mini into a refined Edwardian gentleman who speaks exclusively in pre-1931 English with a posh British accent.
|
| 20 |
+
|
| 21 |
+
## Architecture
|
| 22 |
+
|
| 23 |
+
| Stage | Technology |
|
| 24 |
+
|-------|-----------|
|
| 25 |
+
| **Listen** | OpenAI Whisper (local STT) |
|
| 26 |
+
| **Think** | Talkie-1930-13b-it via HuggingFace Inference API |
|
| 27 |
+
| **Speak** | ElevenLabs "Daniel" voice (British male) via `sag` CLI |
|
| 28 |
+
| **Move** | Elegant Victorian gestures — slow nods, thoughtful tilts |
|
| 29 |
+
|
| 30 |
+
## Setup
|
| 31 |
+
|
| 32 |
+
```bash
|
| 33 |
+
pip install -e .
|
| 34 |
+
export HF_TOKEN="your_huggingface_token"
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
## Usage
|
| 38 |
+
|
| 39 |
+
```bash
|
| 40 |
+
# Text mode (testing)
|
| 41 |
+
python -m talkie_gentleman.main "I say, what is the weather today?"
|
| 42 |
+
|
| 43 |
+
# Full pipeline with audio
|
| 44 |
+
# (integrates with Reachy Mini SDK for robot control)
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
+
## Robot Behavior
|
| 48 |
+
|
| 49 |
+
The gentleman never rushes. All movements are:
|
| 50 |
+
- **Slow** — deliberate, composed
|
| 51 |
+
- **Subtle** — no frantic waving
|
| 52 |
+
- **Dignified** — antenna raises gently when speaking
|
| 53 |
+
- **Thoughtful** — slight head tilt when listening, downward gaze when thinking
|
| 54 |
+
|
| 55 |
+
## Environment Variables
|
| 56 |
+
|
| 57 |
+
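- `HF_TOKEN` — HuggingFace API token (required for Talkie-1930 inference)
- `ELEVENLABS_API_KEY` — ElevenLabs API key used with the `sag` CLI for the "Daniel" voice (optional; without it, or without `sag`, speech falls back to a local Piper `en_GB` voice)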
|
| 58 |
+
|
| 59 |
+
## Credits
|
| 60 |
+
|
| 61 |
+
- **Talkie-1930** model by [@AlecRad](https://twitter.com/AlecRad)
|
| 62 |
+
- **ElevenLabs** for the distinguished British voice
|
| 63 |
+
- Built for [Reachy Mini](https://www.pollen-robotics.com/) by Pollen Robotics
|
index.html
CHANGED
|
@@ -1,19 +1,74 @@
|
|
| 1 |
-
<!
|
| 2 |
-
<html>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
</html>
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>Talkie Gentleman — Reachy Mini</title>
|
| 7 |
+
<link rel="stylesheet" href="style.css">
|
| 8 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=Playfair+Display:ital,wght@0,700;1,400&display=swap" rel="stylesheet">
|
| 9 |
+
</head>
|
| 10 |
+
<body>
|
| 11 |
+
<div class="container">
|
| 12 |
+
<header>
|
| 13 |
+
<div class="badge">REACHY MINI APP</div>
|
| 14 |
+
<h1>🎩 Talkie Gentleman</h1>
|
| 15 |
+
<p class="tagline">A next-gen robot channeling the 1920s</p>
|
| 16 |
+
</header>
|
| 17 |
+
|
| 18 |
+
<section class="hero">
|
| 19 |
+
<p class="quote">"I say, dear fellow — permit me to offer my most considered opinion on the matter at hand."</p>
|
| 20 |
+
</section>
|
| 21 |
+
|
| 22 |
+
<section class="features">
|
| 23 |
+
<div class="card">
|
| 24 |
+
<div class="card-icon">🗣️</div>
|
| 25 |
+
<h3>Victorian Speech</h3>
|
| 26 |
+
<p>Powered by Talkie-1930 — responses use only pre-1931 vocabulary and cultural references.</p>
|
| 27 |
+
</div>
|
| 28 |
+
<div class="card">
|
| 29 |
+
<div class="card-icon">🇬🇧</div>
|
| 30 |
+
<h3>Posh British Voice</h3>
|
| 31 |
+
<p>ElevenLabs "Daniel" voice delivers every response with impeccable received pronunciation.</p>
|
| 32 |
+
</div>
|
| 33 |
+
<div class="card">
|
| 34 |
+
<div class="card-icon">🤖</div>
|
| 35 |
+
<h3>Elegant Gestures</h3>
|
| 36 |
+
<p>Slow, deliberate movements — thoughtful nods, gentle tilts, dignified antenna raises.</p>
|
| 37 |
+
</div>
|
| 38 |
+
<div class="card">
|
| 39 |
+
<div class="card-icon">📚</div>
|
| 40 |
+
<h3>Period Knowledge</h3>
|
| 41 |
+
<p>Erudite conversation spanning literature, science, politics, and society of the era.</p>
|
| 42 |
+
</div>
|
| 43 |
+
</section>
|
| 44 |
+
|
| 45 |
+
<section class="tech">
|
| 46 |
+
<h2>How It Works</h2>
|
| 47 |
+
<div class="pipeline">
|
| 48 |
+
<div class="step">
|
| 49 |
+
<span class="step-num">1</span>
|
| 50 |
+
<span>You speak</span>
|
| 51 |
+
<span class="step-tech">Whisper STT</span>
|
| 52 |
+
</div>
|
| 53 |
+
<div class="arrow">→</div>
|
| 54 |
+
<div class="step">
|
| 55 |
+
<span class="step-num">2</span>
|
| 56 |
+
<span>Gentleman thinks</span>
|
| 57 |
+
<span class="step-tech">Talkie-1930 via HF API</span>
|
| 58 |
+
</div>
|
| 59 |
+
<div class="arrow">→</div>
|
| 60 |
+
<div class="step">
|
| 61 |
+
<span class="step-num">3</span>
|
| 62 |
+
<span>Robot responds</span>
|
| 63 |
+
<span class="step-tech">ElevenLabs TTS + Gestures</span>
|
| 64 |
+
</div>
|
| 65 |
+
</div>
|
| 66 |
+
</section>
|
| 67 |
+
|
| 68 |
+
<footer>
|
| 69 |
+
<p>Built with <a href="https://huggingface.co/talkie-lm/talkie-1930-13b-it">Talkie-1930</a> by <a href="https://twitter.com/AlecRad">@AlecRad</a></p>
|
| 70 |
+
<p class="subtle">For Reachy Mini by Pollen Robotics</p>
|
| 71 |
+
</footer>
|
| 72 |
+
</div>
|
| 73 |
+
</body>
|
| 74 |
</html>
|
pyproject.toml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "talkie-gentleman"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "A Victorian/Edwardian speaking robot app for Reachy Mini using Talkie-1930"
|
| 5 |
+
requires-python = ">=3.9"
|
| 6 |
+
dependencies = [
|
| 7 |
+
"huggingface_hub>=0.20.0",
|
| 8 |
+
"openai-whisper>=20231117",
|
| 9 |
+
"numpy",
|
| 10 |
+
]
|
| 11 |
+
|
| 12 |
+
[build-system]
requires = ["setuptools>=68.0"]
# setuptools exposes its PEP 517 build backend as "setuptools.build_meta".
build-backend = "setuptools.build_meta"
|
style.css
CHANGED
|
@@ -1,28 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
body {
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
}
|
| 5 |
|
| 6 |
h1 {
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
}
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
}
|
| 17 |
|
| 18 |
.card {
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
}
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
| 28 |
}
|
|
|
|
| 1 |
+
:root {
|
| 2 |
+
--bg: #0f0e17;
|
| 3 |
+
--surface: #1a1825;
|
| 4 |
+
--border: #2d2b3a;
|
| 5 |
+
--text: #fffffe;
|
| 6 |
+
--text-muted: #a7a9be;
|
| 7 |
+
--accent: #c9a84c;
|
| 8 |
+
--accent-dark: #8b6914;
|
| 9 |
+
--burgundy: #6b2d3e;
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
* {
|
| 13 |
+
margin: 0;
|
| 14 |
+
padding: 0;
|
| 15 |
+
box-sizing: border-box;
|
| 16 |
+
}
|
| 17 |
+
|
| 18 |
body {
|
| 19 |
+
font-family: 'Inter', sans-serif;
|
| 20 |
+
background: var(--bg);
|
| 21 |
+
color: var(--text);
|
| 22 |
+
min-height: 100vh;
|
| 23 |
+
line-height: 1.6;
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
.container {
|
| 27 |
+
max-width: 900px;
|
| 28 |
+
margin: 0 auto;
|
| 29 |
+
padding: 4rem 2rem;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
header {
|
| 33 |
+
text-align: center;
|
| 34 |
+
margin-bottom: 3rem;
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
.badge {
|
| 38 |
+
display: inline-block;
|
| 39 |
+
font-size: 0.7rem;
|
| 40 |
+
font-weight: 600;
|
| 41 |
+
letter-spacing: 0.15em;
|
| 42 |
+
color: var(--accent);
|
| 43 |
+
border: 1px solid var(--accent-dark);
|
| 44 |
+
padding: 0.3rem 0.8rem;
|
| 45 |
+
border-radius: 20px;
|
| 46 |
+
margin-bottom: 1rem;
|
| 47 |
}
|
| 48 |
|
| 49 |
h1 {
  font-family: 'Playfair Display', serif;
  font-size: 3rem;
  margin-bottom: 0.5rem;
  /* Gold gradient shown through the glyphs: the background is clipped to the
     text shape and the text fill itself is made transparent. */
  background: linear-gradient(135deg, var(--accent), #f0d78c);
  -webkit-background-clip: text;
  /* Standard property alongside the prefixed one. */
  background-clip: text;
  -webkit-text-fill-color: transparent;
}
|
| 57 |
+
|
| 58 |
+
.tagline {
|
| 59 |
+
font-size: 1.2rem;
|
| 60 |
+
color: var(--text-muted);
|
| 61 |
+
font-style: italic;
|
| 62 |
}
|
| 63 |
|
| 64 |
+
.hero {
|
| 65 |
+
text-align: center;
|
| 66 |
+
margin: 3rem 0;
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
.quote {
|
| 70 |
+
font-family: 'Playfair Display', serif;
|
| 71 |
+
font-style: italic;
|
| 72 |
+
font-size: 1.3rem;
|
| 73 |
+
color: var(--text-muted);
|
| 74 |
+
border-left: 3px solid var(--accent);
|
| 75 |
+
padding-left: 1.5rem;
|
| 76 |
+
max-width: 600px;
|
| 77 |
+
margin: 0 auto;
|
| 78 |
+
text-align: left;
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
.features {
|
| 82 |
+
display: grid;
|
| 83 |
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
| 84 |
+
gap: 1.5rem;
|
| 85 |
+
margin: 3rem 0;
|
| 86 |
}
|
| 87 |
|
| 88 |
.card {
|
| 89 |
+
background: var(--surface);
|
| 90 |
+
border: 1px solid var(--border);
|
| 91 |
+
border-radius: 12px;
|
| 92 |
+
padding: 1.5rem;
|
| 93 |
+
transition: border-color 0.3s, transform 0.2s;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
.card:hover {
|
| 97 |
+
border-color: var(--accent-dark);
|
| 98 |
+
transform: translateY(-2px);
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
.card-icon {
|
| 102 |
+
font-size: 2rem;
|
| 103 |
+
margin-bottom: 0.8rem;
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
.card h3 {
|
| 107 |
+
font-size: 1rem;
|
| 108 |
+
margin-bottom: 0.5rem;
|
| 109 |
+
color: var(--accent);
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
.card p {
|
| 113 |
+
font-size: 0.85rem;
|
| 114 |
+
color: var(--text-muted);
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
.tech {
|
| 118 |
+
margin: 4rem 0;
|
| 119 |
+
text-align: center;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
.tech h2 {
|
| 123 |
+
font-family: 'Playfair Display', serif;
|
| 124 |
+
font-size: 1.8rem;
|
| 125 |
+
margin-bottom: 2rem;
|
| 126 |
+
color: var(--accent);
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
.pipeline {
|
| 130 |
+
display: flex;
|
| 131 |
+
align-items: center;
|
| 132 |
+
justify-content: center;
|
| 133 |
+
gap: 1rem;
|
| 134 |
+
flex-wrap: wrap;
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
.step {
|
| 138 |
+
background: var(--surface);
|
| 139 |
+
border: 1px solid var(--border);
|
| 140 |
+
border-radius: 10px;
|
| 141 |
+
padding: 1.2rem;
|
| 142 |
+
display: flex;
|
| 143 |
+
flex-direction: column;
|
| 144 |
+
align-items: center;
|
| 145 |
+
gap: 0.3rem;
|
| 146 |
+
min-width: 160px;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
.step-num {
|
| 150 |
+
background: var(--accent);
|
| 151 |
+
color: var(--bg);
|
| 152 |
+
width: 24px;
|
| 153 |
+
height: 24px;
|
| 154 |
+
border-radius: 50%;
|
| 155 |
+
display: flex;
|
| 156 |
+
align-items: center;
|
| 157 |
+
justify-content: center;
|
| 158 |
+
font-size: 0.75rem;
|
| 159 |
+
font-weight: 700;
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
.step-tech {
|
| 163 |
+
font-size: 0.7rem;
|
| 164 |
+
color: var(--text-muted);
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
.arrow {
|
| 168 |
+
color: var(--accent);
|
| 169 |
+
font-size: 1.5rem;
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
footer {
|
| 173 |
+
text-align: center;
|
| 174 |
+
margin-top: 4rem;
|
| 175 |
+
padding-top: 2rem;
|
| 176 |
+
border-top: 1px solid var(--border);
|
| 177 |
+
color: var(--text-muted);
|
| 178 |
+
font-size: 0.85rem;
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
footer a {
|
| 182 |
+
color: var(--accent);
|
| 183 |
+
text-decoration: none;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
footer a:hover {
|
| 187 |
+
text-decoration: underline;
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
.subtle {
|
| 191 |
+
font-size: 0.75rem;
|
| 192 |
+
margin-top: 0.5rem;
|
| 193 |
+
opacity: 0.6;
|
| 194 |
}
|
| 195 |
|
| 196 |
+
@media (max-width: 600px) {
|
| 197 |
+
h1 { font-size: 2rem; }
|
| 198 |
+
.pipeline { flex-direction: column; }
|
| 199 |
+
.arrow { transform: rotate(90deg); }
|
| 200 |
}
|
talkie_gentleman/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Talkie Gentleman — A next-gen robot channeling the 1920s."""
|
| 2 |
+
|
| 3 |
+
__version__ = "0.1.0"
|
talkie_gentleman/main.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Talkie Gentleman — Reachy Mini app entry point.
|
| 2 |
+
|
| 3 |
+
A Victorian/Edwardian speaking robot using:
|
| 4 |
+
- Whisper STT for listening
|
| 5 |
+
- Talkie-1930 (HF Inference API) for period-accurate responses
|
| 6 |
+
- ElevenLabs (sag) for posh British TTS
|
| 7 |
+
- Elegant slow gestures befitting a gentleman
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import asyncio
|
| 11 |
+
import logging
|
| 12 |
+
import tempfile
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
from .stt_engine import transcribe
|
| 16 |
+
from .talkie_inference import generate_response
|
| 17 |
+
from .tts_engine import speak_british
|
| 18 |
+
from .robot_behavior import GentlemanBehavior
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
# Conversation history (kept short for context window): the number of
# user/assistant exchanges retained between turns.
MAX_HISTORY = 10


class TalkieGentleman:
    """Main application class for the Victorian gentleman robot.

    Owns the gesture state machine and a rolling conversation history, and
    exposes two coroutines that run the blocking pipeline stages (STT, LLM,
    TTS) in worker threads via ``asyncio.to_thread``.
    """

    def __init__(self):
        self.behavior = GentlemanBehavior()
        # Alternating {"role": ..., "content": ...} messages, oldest first.
        self.history: list[dict] = []

    def _remember(self, user_text: str, response: str) -> None:
        """Record one exchange and keep only the newest MAX_HISTORY exchanges."""
        self.history.append({"role": "user", "content": user_text})
        self.history.append({"role": "assistant", "content": response})
        # Each exchange contributes two messages (user + assistant).
        limit = MAX_HISTORY * 2
        if len(self.history) > limit:
            self.history = self.history[-limit:]

    @staticmethod
    def _new_audio_path() -> str:
        """Create an empty, uniquely named ``.wav`` temp file and return its path.

        The file is created atomically rather than merely named, so another
        process cannot claim the same path before the TTS engine writes to it
        (the weakness of the deprecated ``tempfile.mktemp``).
        """
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            return handle.name

    async def _synthesize(self, response: str) -> str:
        """Speak *response* into a fresh temp file; return the path actually written."""
        output_audio = self._new_audio_path()
        try:
            return await asyncio.to_thread(speak_british, response, output_audio)
        except Exception:
            # Do not leave the empty placeholder behind when synthesis fails.
            Path(output_audio).unlink(missing_ok=True)
            raise

    async def process_audio(self, audio_path: str) -> dict:
        """Full pipeline: STT → LLM → TTS → Gestures.

        Args:
            audio_path: Path of the recorded user utterance.

        Returns:
            On success, a dict with ``user_text``, ``response_text``,
            ``audio_path`` and a ``gestures`` dict holding the pose frames for
            the ``listening``, ``thinking``, ``speaking`` and ``idle`` phases.
            If the transcription is empty, a dict with a single ``error`` key.
        """
        # 1. Listen — transcribe user speech
        logger.info("Listening attentively...")
        gestures_listen = self.behavior.on_user_speaking()
        user_text = await asyncio.to_thread(transcribe, audio_path)

        if not user_text:
            return {"error": "I beg your pardon? I did not quite catch that."}

        logger.info("Heard: %s", user_text)

        # 2. Think — generate Victorian response
        logger.info("Contemplating a proper response...")
        gestures_think = self.behavior.on_processing()
        response = await asyncio.to_thread(generate_response, user_text, self.history)
        self._remember(user_text, response)

        logger.info("Response: %s", response)

        # 3. Speak — synthesize British voice
        logger.info("Delivering with proper elocution...")
        gestures_speak = self.behavior.on_speaking(response)
        audio_path_out = await self._synthesize(response)

        # 4. Return to idle
        gestures_idle = self.behavior.on_idle()

        return {
            "user_text": user_text,
            "response_text": response,
            "audio_path": audio_path_out,
            "gestures": {
                "listening": gestures_listen,
                "thinking": gestures_think,
                "speaking": gestures_speak,
                "idle": gestures_idle,
            },
        }

    async def process_text(self, user_text: str) -> dict:
        """Text-only pipeline (skip STT).

        Args:
            user_text: The user's utterance, already as text.

        Returns:
            A dict with ``response_text``, ``audio_path`` and a ``gestures``
            dict holding the ``thinking`` and ``speaking`` pose frames.
        """
        gestures_think = self.behavior.on_processing()
        response = await asyncio.to_thread(generate_response, user_text, self.history)
        self._remember(user_text, response)

        gestures_speak = self.behavior.on_speaking(response)
        audio_path_out = await self._synthesize(response)

        return {
            "response_text": response,
            "audio_path": audio_path_out,
            "gestures": {
                "thinking": gestures_think,
                "speaking": gestures_speak,
            },
        }
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def main():
    """CLI entry point for testing.

    Joins all command-line arguments into one utterance, runs the text-only
    pipeline, and prints the reply together with the path of the synthesized
    audio. With no arguments it prints a usage hint instead.
    """
    import sys

    app = TalkieGentleman()

    if len(sys.argv) > 1:
        text = " ".join(sys.argv[1:])
        result = asyncio.run(app.process_text(text))
        print(f"🎩 {result['response_text']}")
        print(f"🔊 Audio: {result['audio_path']}")
    else:
        print("🎩 Talkie Gentleman — Victorian Conversationalist")
        # The package has no __main__.py, so the runnable module is
        # talkie_gentleman.main rather than the bare package name.
        print("Usage: python -m talkie_gentleman.main 'Good evening, sir!'")
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
if __name__ == "__main__":
|
| 121 |
+
main()
|
talkie_gentleman/robot_behavior.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Victorian gentleman gestures for Reachy Mini — elegant, slow, dignified."""
|
| 2 |
+
|
| 3 |
+
import time
|
| 4 |
+
import math
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
|
| 7 |
+
# Gesture speed multiplier (lower = slower/more elegant)
|
| 8 |
+
ELEGANCE_FACTOR = 0.4
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class Pose:
    """One target pose for the robot's head and antenna.

    The three head fields are joint angles in degrees; ``antenna`` is a
    position in the 0-1 range rather than an angle. Every field defaults to
    0.0, so a pose can be built from only the fields that matter.
    """
    head_pitch: float = 0.0  # nod axis
    head_yaw: float = 0.0  # turn axis
    head_roll: float = 0.0  # tilt axis
    antenna: float = 0.0  # antenna position (0-1)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Pre-defined poses
|
| 21 |
+
NEUTRAL = Pose(head_pitch=-5.0, head_yaw=0.0, head_roll=0.0, antenna=0.3)
|
| 22 |
+
LISTENING = Pose(head_pitch=-3.0, head_yaw=5.0, head_roll=4.0, antenna=0.2)
|
| 23 |
+
THINKING = Pose(head_pitch=-10.0, head_yaw=-3.0, head_roll=0.0, antenna=0.5)
|
| 24 |
+
SPEAKING = Pose(head_pitch=0.0, head_yaw=0.0, head_roll=0.0, antenna=0.8)
|
| 25 |
+
ACKNOWLEDGING = Pose(head_pitch=-8.0, head_yaw=8.0, head_roll=2.0, antenna=0.4)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def interpolate_pose(current: Pose, target: Pose, t: float) -> Pose:
    """Smooth cubic interpolation between poses.

    Args:
        current: Pose at the start of the transition.
        target: Pose at the end of the transition.
        t: Progress through the transition. Values outside [0, 1] are
            clamped, because the easing polynomial overshoots beyond that
            range.

    Returns:
        A new ``Pose`` whose every field lies between the corresponding
        fields of ``current`` (at t=0) and ``target`` (at t=1).
    """
    t = min(max(t, 0.0), 1.0)
    # Ease-in-out cubic: zero slope at both ends of the transition.
    eased = t * t * (3.0 - 2.0 * t)

    def blend(start: float, end: float) -> float:
        return start + (end - start) * eased

    return Pose(
        head_pitch=blend(current.head_pitch, target.head_pitch),
        head_yaw=blend(current.head_yaw, target.head_yaw),
        head_roll=blend(current.head_roll, target.head_roll),
        antenna=blend(current.antenna, target.antenna),
    )
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def gentle_nod(amplitude: float = 5.0, duration: float = 1.5) -> list[Pose]:
    """Build the frames of one slow, slightly fading nod.

    One frame is produced per 0.05 of ``duration``. Each frame offsets the
    head pitch from ``NEUTRAL`` by a single sine cycle scaled by
    ``amplitude`` and damped progressively (approaching 30% at the end),
    with the antenna held at 0.6.
    """
    frame_count = int(duration / 0.05)

    def frame_at(index: int) -> Pose:
        progress = index / frame_count
        offset = amplitude * math.sin(progress * math.pi * 2) * (1 - progress * 0.3)
        return Pose(head_pitch=NEUTRAL.head_pitch + offset, antenna=0.6)

    return [frame_at(index) for index in range(frame_count)]
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def aristocratic_turn(direction: float = 1.0, duration: float = 2.0) -> list[Pose]:
    """Build the frames of a slow, eased head turn towards someone.

    The yaw eases from 0 towards ``15.0 * direction`` while the antenna
    rises from 0.3 towards 0.5; the pitch stays at the neutral value. One
    frame is produced per 0.05 of ``duration``.
    """
    frame_count = int(duration / 0.05)
    final_yaw = 15.0 * direction

    def frame_at(index: int) -> Pose:
        progress = index / frame_count
        eased = progress * progress * (3.0 - 2.0 * progress)  # ease-in-out
        return Pose(
            head_yaw=final_yaw * eased,
            head_pitch=NEUTRAL.head_pitch,
            antenna=0.3 + 0.2 * eased,
        )

    return [frame_at(index) for index in range(frame_count)]
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def thinking_sequence(duration: float = 3.0) -> list[Pose]:
    """Build the frames of the contemplative pose: downward look, subtle sway.

    A slow sine sway is applied around the ``THINKING`` pose (full strength
    on yaw, 0.5 on roll, 0.3 on pitch) while the antenna oscillates around
    0.4. One frame is produced per 0.05 of ``duration``.
    """
    frame_count = int(duration / 0.05)
    sequence = []
    for index in range(frame_count):
        progress = index / frame_count
        sway = 2.0 * math.sin(progress * math.pi * 0.8)
        antenna_level = 0.4 + 0.1 * math.sin(progress * math.pi * 2)
        pose = Pose(
            head_pitch=THINKING.head_pitch + sway * 0.3,
            head_yaw=THINKING.head_yaw + sway,
            head_roll=sway * 0.5,
            antenna=antenna_level,
        )
        sequence.append(pose)
    return sequence
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def speaking_animation(text_length: int) -> list[Pose]:
    """Build subtle head and antenna motion to accompany speech.

    The animation length is ``text_length * 0.05`` limited to the range
    2.0-8.0, with one frame per 0.05 of that length. Pitch, yaw and antenna
    each follow their own small sine wave around the ``SPEAKING`` pose.
    """
    length = min(max(text_length * 0.05, 2.0), 8.0)
    frame_count = int(length / 0.05)

    def frame_at(index: int) -> Pose:
        progress = index / frame_count
        # Very subtle head movements
        return Pose(
            head_pitch=SPEAKING.head_pitch + 1.5 * math.sin(progress * math.pi * 3),
            head_yaw=2.0 * math.sin(progress * math.pi * 1.5),
            antenna=SPEAKING.antenna + 0.1 * math.sin(progress * math.pi * 4),
        )

    return [frame_at(index) for index in range(frame_count)]
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# State machine for the gentleman's behavior
|
| 102 |
+
class GentlemanBehavior:
    """Manages the robot's state transitions with Victorian poise.

    Each ``on_*`` method switches ``state``, returns the pose frames for that
    phase, and records the last frame as ``current_pose`` so that the next
    transition starts from where the previous one ended.
    """

    def __init__(self):
        self.current_pose = NEUTRAL
        self.state = "idle"

    def _settle(self, frames: list[Pose]) -> list[Pose]:
        """Remember the final frame of *frames* as the current pose."""
        if frames:
            self.current_pose = frames[-1]
        return frames

    def _transition(self, target: Pose, steps: int) -> list[Pose]:
        """Ease from the current pose to *target* in *steps* frames."""
        start = self.current_pose
        # Progress runs 1/steps .. 1 so the final frame lands on the target;
        # the pose at progress 0 is where the robot already is.
        frames = [
            interpolate_pose(start, target, (step + 1) / steps)
            for step in range(steps)
        ]
        return self._settle(frames)

    def on_user_speaking(self) -> list[Pose]:
        """User is talking — listen attentively."""
        self.state = "listening"
        return self._transition(LISTENING, 20)

    def on_processing(self) -> list[Pose]:
        """Thinking of a response — contemplative pose."""
        self.state = "thinking"
        return self._settle(thinking_sequence())

    def on_speaking(self, response_text: str) -> list[Pose]:
        """Delivering response — engaged posture with antenna raised."""
        self.state = "speaking"
        return self._settle(speaking_animation(len(response_text)))

    def on_idle(self) -> list[Pose]:
        """Return to neutral — composed waiting."""
        self.state = "idle"
        return self._transition(NEUTRAL, 30)
|
talkie_gentleman/stt_engine.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Speech-to-Text engine using OpenAI Whisper."""
|
| 2 |
+
|
| 3 |
+
import whisper
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
# Lazily loaded Whisper model shared by every transcription call.
_model = None
# Size name the cached model was loaded with; None until the first load.
_model_size = None


def _get_model(model_size: str = "base"):
    """Return a Whisper model of the requested size, loading it on demand.

    The most recently loaded model is cached at module level. A request for
    a different ``model_size`` replaces the cached model, so the caller never
    silently receives a model of another size.

    Args:
        model_size: Whisper model name passed to ``whisper.load_model``.

    Returns:
        The loaded Whisper model object.
    """
    global _model, _model_size
    if _model is None or _model_size != model_size:
        _model = whisper.load_model(model_size)
        _model_size = model_size
    return _model
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def transcribe(audio_path: str, language: str = "en") -> str:
    """Run the shared Whisper model on an audio file and return the trimmed text."""
    outcome = _get_model().transcribe(audio_path, language=language)
    text = outcome["text"]
    return text.strip()
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Sample rate the audio is converted to before transcription.
_WHISPER_SAMPLE_RATE = 16000


def _prepare_audio(audio_array: np.ndarray, sample_rate: int) -> np.ndarray:
    """Convert raw samples to float32 at 16 kHz.

    Signed-integer PCM is scaled into [-1.0, 1.0); other dtypes are cast to
    float32 unchanged. One-dimensional audio at another sample rate is
    resampled by linear interpolation.
    """
    samples = np.asarray(audio_array)
    if np.issubdtype(samples.dtype, np.signedinteger):
        # e.g. int16 -> divide by 32768 so full scale maps to +/-1.0.
        full_scale = float(-np.iinfo(samples.dtype).min)
        samples = samples.astype(np.float32) / full_scale
    elif samples.dtype != np.float32:
        samples = samples.astype(np.float32)

    # NOTE(review): multi-channel input is passed through unchanged because
    # the channel axis layout is not known here — confirm against callers.
    if sample_rate != _WHISPER_SAMPLE_RATE and samples.ndim == 1 and samples.size:
        target_len = int(round(samples.shape[0] * _WHISPER_SAMPLE_RATE / sample_rate))
        source_times = np.arange(samples.shape[0]) / sample_rate
        target_times = np.arange(target_len) / _WHISPER_SAMPLE_RATE
        samples = np.interp(target_times, source_times, samples).astype(np.float32)
    return samples


def transcribe_array(audio_array: np.ndarray, sample_rate: int = 16000, language: str = "en") -> str:
    """Transcribe numpy audio array to text.

    Args:
        audio_array: Audio samples; integer PCM or floating point.
        sample_rate: Sample rate of ``audio_array`` in Hz. Audio that is not
            already at 16 kHz is resampled before transcription.
        language: Language code passed to Whisper.

    Returns:
        The transcribed text with surrounding whitespace removed.
    """
    model = _get_model()
    # Whisper expects float32 mono at 16kHz
    prepared = _prepare_audio(audio_array, sample_rate)
    result = model.transcribe(prepared, language=language)
    return result["text"].strip()
|
talkie_gentleman/talkie_inference.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HuggingFace Inference API wrapper for Talkie-1930 model."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from huggingface_hub import InferenceClient
|
| 5 |
+
|
| 6 |
+
# Primary: Talkie model via HF Inference API
|
| 7 |
+
# Fallback: Llama 3.1 8B Instruct with Victorian system prompt
|
| 8 |
+
PRIMARY_MODEL = "talkie-lm/talkie-1930-13b-it"
|
| 9 |
+
FALLBACK_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
|
| 10 |
+
|
| 11 |
+
SYSTEM_PROMPT = (
|
| 12 |
+
"You are a refined Edwardian gentleman from 1920s England. "
|
| 13 |
+
"Respond with impeccable manners, using only vocabulary and cultural references from before 1931. "
|
| 14 |
+
"Be witty, erudite, and slightly pompous. Address others as \"dear fellow\", \"my good sir\" or \"my good madam\". "
|
| 15 |
+
"Keep responses concise (2-3 sentences) for natural conversation."
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
# Lazily created client shared by every request. The annotation is quoted so
# it is not evaluated at import time: the ``X | None`` operator on classes
# needs Python 3.10+, while this project declares support for 3.9.
_client: "InferenceClient | None" = None


def _get_client() -> "InferenceClient":
    """Return the shared ``InferenceClient``, creating it on first use.

    The client is built with the ``HF_TOKEN`` environment variable as its
    token; when the variable is unset, ``None`` is passed instead.
    """
    global _client
    if _client is None:
        token = os.environ.get("HF_TOKEN")
        _client = InferenceClient(token=token)
    return _client
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def generate_response(user_text: str, history: "list[dict] | None" = None) -> str:
    """Generate a Victorian-style response to user input via HF Inference API.

    The system prompt, the optional prior ``history`` and the new user
    message are sent to the primary model; if that call fails or yields an
    empty reply, the fallback model is tried with the same messages.

    Args:
        user_text: The user's latest utterance.
        history: Earlier chat messages (``{"role", "content"}`` dicts),
            oldest first. Not modified by this function.

    Returns:
        The model's reply with surrounding whitespace removed, or a fixed
        in-character apology when no model produced a usable reply.
    """
    # Local import: this module does not import logging at file level.
    import logging

    log = logging.getLogger(__name__)
    client = _get_client()

    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    if history:
        messages.extend(history)
    messages.append({"role": "user", "content": user_text})

    # Try Talkie model first, fallback to Llama 3.1
    for model_id in (PRIMARY_MODEL, FALLBACK_MODEL):
        try:
            response = client.chat_completion(
                messages=messages,
                model=model_id,
                max_tokens=150,
                temperature=0.7,
            )
            content = response.choices[0].message.content
        except Exception:
            # Best-effort by design, but record why the model was skipped so
            # auth or quota problems are not invisible.
            log.warning("Inference with %s failed; trying next option", model_id, exc_info=True)
            continue

        reply = (content or "").strip()
        if reply:
            return reply
        log.warning("Model %s returned an empty reply; trying next option", model_id)

    return "I beg your pardon, dear fellow — my faculties appear momentarily indisposed."
|
talkie_gentleman/tts_engine.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""British TTS engine — ElevenLabs via sag CLI, with Piper fallback."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import subprocess
|
| 5 |
+
import shutil
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
DEFAULT_OUTPUT = "/tmp/gentleman_response.mp3"
|
| 9 |
+
ELEVENLABS_VOICE = "Daniel" # Posh British male
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _get_elevenlabs_key() -> "str | None":
    """Read ElevenLabs API key from env or file.

    The ``ELEVENLABS_API_KEY`` environment variable wins when set and
    non-empty; otherwise the key is read from ``~/.openclaw/.elevenlabs-key``.

    Returns:
        The key, or ``None`` when neither source provides a non-empty value.
    """
    # The return annotation is quoted because ``str | None`` is evaluated at
    # definition time and raises TypeError on Python 3.9.
    key = os.environ.get("ELEVENLABS_API_KEY")
    if key:
        return key
    key_file = Path.home() / ".openclaw" / ".elevenlabs-key"
    try:
        stored = key_file.read_text(encoding="utf-8").strip()
    except OSError:
        # Missing or unreadable key file: treat as "no key configured".
        return None
    return stored or None
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def speak_british(text: str, output_path: str = DEFAULT_OUTPUT) -> str:
    """Synthesize speech in a posh British voice. Returns path to audio file.

    ElevenLabs (through the ``sag`` CLI) is used when both the CLI and an API
    key are available. If it is unavailable or the command fails, synthesis
    falls back to a local Piper ``en_GB`` voice.

    Args:
        text: The text to speak.
        output_path: Where to write the audio. The Piper fallback always
            writes WAV and appends ``.wav`` when the path lacks that suffix.

    Returns:
        The path of the file actually written, which may differ from
        ``output_path`` when the Piper fallback was used.

    Raises:
        RuntimeError: No usable British voice is available, or Piper failed.
    """
    sag_failure = None
    sag = shutil.which("sag")
    api_key = _get_elevenlabs_key()
    if sag and api_key:
        env = os.environ.copy()
        env["ELEVENLABS_API_KEY"] = api_key
        try:
            subprocess.run(
                [sag, "speak", "-v", ELEVENLABS_VOICE, text, "--no-play", "--output", output_path],
                check=True,
                capture_output=True,
                env=env,
            )
        except subprocess.CalledProcessError as err:
            # Remember why, then fall through to the Piper fallback below.
            sag_failure = (err.stderr or b"").decode(errors="replace").strip() or str(err)
        except OSError as err:
            sag_failure = str(err)
        else:
            return output_path

    # Fallback: Piper with en_GB voice
    piper = shutil.which("piper") or str(Path.home() / "Library/Python/3.9/bin/piper")
    voices_dir = Path.home() / "clawd/tts-voices"
    en_gb_model = voices_dir / "en_GB-alan-medium.onnx"
    if not en_gb_model.exists():
        # Try any en_GB model available; sorted so the choice is stable.
        en_gb_models = sorted(voices_dir.glob("en_GB-*.onnx")) if voices_dir.exists() else []
        if not en_gb_models:
            message = "No British TTS voice available (install sag or en_GB Piper model)"
            if sag_failure:
                message = f"{message}; sag failed: {sag_failure}"
            raise RuntimeError(message)
        en_gb_model = en_gb_models[0]

    wav_path = output_path if output_path.endswith(".wav") else output_path + ".wav"
    try:
        proc = subprocess.run(
            [piper, "--model", str(en_gb_model), "--output_file", wav_path],
            input=text.encode(),
            capture_output=True,
        )
    except OSError as err:
        # e.g. the piper executable does not exist at the guessed location.
        raise RuntimeError(f"Piper TTS failed: {err}") from err
    if proc.returncode != 0:
        raise RuntimeError(f"Piper TTS failed: {proc.stderr.decode(errors='replace')}")
    return wav_path
|