"""

Step 1 - Ingestion: Download video and generate word-level transcript.



Responsibilities:

  - Download source video from YouTube using yt-dlp.

  - Extract audio track for transcription.

  - Generate word-level timestamped transcript.

"""

import json
import logging
import os
import shutil
import subprocess
import base64
from math import ceil
from pathlib import Path

import httpx

from humeo.video_cache import local_source_matches, write_local_source_info

logger = logging.getLogger(__name__)

OPENAI_MAX_UPLOAD_BYTES = 25 * 1024 * 1024
OPENAI_TARGET_UPLOAD_BYTES = 20 * 1024 * 1024
OPENAI_MIN_CHUNK_SEC = 300.0
ELEVENLABS_TRANSCRIBE_URL = "https://api.elevenlabs.io/v1/speech-to-text"
TRANSCRIPT_META_FILENAME = "transcript.meta.json"
ELEVENLABS_SCRIBE_MODEL = "scribe_v2"
_ELEVENLABS_SEGMENT_MAX_GAP_SEC = 0.65
_ELEVENLABS_SEGMENT_MAX_DURATION_SEC = 6.0
_ELEVENLABS_SEGMENT_MAX_WORDS = 18
YTDLP_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)


def _decode_cookie_secret(raw: str) -> str:
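    """Restore real newlines in a cookie secret pasted with escaped newlines."""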
    text = raw.strip()
    if "\\n" in text and "\n" not in text:
        text = text.replace("\\n", "\n")
    return text


def _yt_dlp_cookie_file(output_dir: Path) -> Path | None:
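    """Write a Netscape cookies.txt for yt-dlp from env secrets, if configured.

    Accepts YTDLP_COOKIES / YOUTUBE_COOKIES (plain text) or the *_B64 variants
    (base64-encoded). Returns the cookie file path, or None when no secret is set.
    """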
    raw = (
        os.environ.get("YTDLP_COOKIES")
        or os.environ.get("YOUTUBE_COOKIES")
        or ""
    ).strip()
    encoded = (
        os.environ.get("YTDLP_COOKIES_B64")
        or os.environ.get("YOUTUBE_COOKIES_B64")
        or ""
    ).strip()
    if not raw and encoded:
        try:
            raw = base64.b64decode(encoded).decode("utf-8")
        except Exception as exc:
            raise RuntimeError("Could not decode YTDLP_COOKIES_B64.") from exc
    if not raw:
        return None

    cookie_path = output_dir / "yt-dlp-cookies.txt"
    cookie_path.write_text(_decode_cookie_secret(raw).rstrip() + "\n", encoding="utf-8")
    try:
        cookie_path.chmod(0o600)
    except OSError:
        pass
    return cookie_path


def _yt_dlp_impersonate_target() -> str | None:
    target = (os.environ.get("YTDLP_IMPERSONATE") or "chrome").strip()
    if target.lower() in {"", "0", "false", "no", "off", "none"}:
        return None
    return target


def _yt_dlp_ip_family_flag() -> str | None:
    value = (os.environ.get("YTDLP_IP_FAMILY") or "").strip().lower()
    if value in {"4", "ipv4"}:
        return "--force-ipv4"
    if value in {"6", "ipv6"}:
        return "--force-ipv6"
    return None


def _yt_dlp_error(exc: subprocess.CalledProcessError) -> RuntimeError:
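    """Convert a yt-dlp failure into a RuntimeError with an actionable hint."""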
    stdout = (exc.stdout or "").strip()
    stderr = (exc.stderr or "").strip()
    details = stderr or stdout or str(exc)
    lowered = details.lower()
    hint = ""
    if any(token in lowered for token in ("sign in", "not a bot", "confirm you're not a bot", "cookies")):
        hint = (
            "\n\nYouTube blocked the Hugging Face downloader. Add a Space secret named "
            "YTDLP_COOKIES_B64 containing a base64 encoded Netscape cookies.txt export "
            "from a logged-in browser, or upload the MP4 directly."
        )
    elif "unexpected_eof_while_reading" in lowered or "ssl" in lowered:
        hint = (
            "\n\nYouTube closed the TLS connection from Hugging Face. The app will use "
            "browser TLS impersonation when curl_cffi is installed; if this persists, "
            "upload the MP4 directly or add YTDLP_COOKIES_B64."
        )
    return RuntimeError(f"yt-dlp failed to download the YouTube video:\n{details}{hint}")


def stage_local_video(source: str | Path, output_dir: Path) -> Path:
    """

    Copy a local source video into ``output_dir/source.mp4`` for cacheable reruns.

    """
    source_path = Path(source).expanduser().resolve(strict=False)
    if not source_path.is_file():
        raise FileNotFoundError(f"Local source video does not exist: {source_path}")

    output_dir.mkdir(parents=True, exist_ok=True)
    staged_path = output_dir / "source.mp4"
    staged_resolved = staged_path.resolve(strict=False)

    if source_path == staged_resolved:
        logger.info("Using local source video in place: %s", source_path)
        write_local_source_info(output_dir, source_path)
        return staged_path

    if staged_path.exists() and local_source_matches(output_dir, str(source_path)):
        logger.info("Local source already staged at: %s", staged_path)
        return staged_path

    if source_path.suffix.lower() != ".mp4":
        logger.warning(
            "Local source uses %s; staging it as source.mp4 anyway.",
            source_path.suffix or "<no extension>",
        )

    action = "Replacing" if staged_path.exists() else "Staging"
    logger.info("%s local video: %s -> %s", action, source_path, staged_path)
    shutil.copy2(source_path, staged_path)
    write_local_source_info(output_dir, source_path)
    return staged_path


def download_video(youtube_url: str, output_dir: Path) -> Path:
    """

    Download the best quality video+audio from YouTube.



    Returns the path to the downloaded MP4 file.

    """
    output_template = str(output_dir / "source.%(ext)s")
    cmd = [
        "yt-dlp",
        "--format",
        "bv*[ext=mp4]+ba[ext=m4a]/bv*+ba/best[ext=mp4]/best",
        "--merge-output-format",
        "mp4",
        "--output",
        output_template,
        "--no-playlist",
        "--write-info-json",
        "--retries",
        "5",
        "--fragment-retries",
        "5",
        "--extractor-retries",
        "3",
        "--socket-timeout",
        "30",
        "--user-agent",
        YTDLP_BROWSER_USER_AGENT,
        "--extractor-args",
        (os.environ.get("YTDLP_EXTRACTOR_ARGS") or "youtube:player_client=default,web_creator"),
        "--quiet",
    ]
    ip_family_flag = _yt_dlp_ip_family_flag()
    if ip_family_flag:
        cmd.append(ip_family_flag)
    impersonate_target = _yt_dlp_impersonate_target()
    if impersonate_target:
        cmd.extend(["--impersonate", impersonate_target])
    if shutil.which("node"):
        cmd.extend(["--js-runtimes", "node", "--remote-components", "ejs:github"])
    cookie_path = _yt_dlp_cookie_file(output_dir)
    if cookie_path is not None:
        cmd.extend(["--cookies", str(cookie_path)])
    cmd.append(youtube_url)

    logger.info("Downloading video: %s", youtube_url)
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as exc:
        raise _yt_dlp_error(exc) from exc
    if result.stderr:
        logger.warning(result.stderr.strip())

    # yt-dlp should produce source.mp4
    video_path = output_dir / "source.mp4"
    if not video_path.exists():
        # Fallback: pick any downloaded source.* media file, skipping the
        # --write-info-json sidecar (source.info.json).
        candidates = [
            path
            for path in output_dir.glob("source.*")
            if path.suffix.lower() != ".json"
        ]
        if candidates:
            video_path = candidates[0]
        else:
            raise FileNotFoundError(f"Download failed - no output found in {output_dir}")

    logger.info("Downloaded to: %s", video_path)
    return video_path


def extract_audio(video_path: Path, output_dir: Path) -> Path:
    """

    Extract audio track from video as WAV (required by most ASR models).

    """
    audio_path = output_dir / "source_audio.wav"
    cmd = [
        "ffmpeg", "-y",
        "-i", str(video_path),
        "-vn",                        # no video
        "-acodec", "pcm_s16le",       # raw PCM
        "-ar", "16000",               # 16kHz sample rate (standard for ASR)
        "-ac", "1",                   # mono
        str(audio_path),
    ]

    logger.info("Extracting audio to: %s", audio_path)
    subprocess.run(cmd, check=True, capture_output=True)
    return audio_path


def _resolve_elevenlabs_api_key() -> str:
    key = (os.environ.get("ELEVENLABS_API_KEY") or "").strip()
    if key:
        return key
    raise ValueError("Set ELEVENLABS_API_KEY to use ElevenLabs Scribe v2 transcription.")


def _elevenlabs_no_verbatim_enabled() -> bool:
    raw = (os.environ.get("ELEVENLABS_NO_VERBATIM") or "true").strip().lower()
    return raw not in {"0", "false", "no", "off"}


def resolved_transcribe_settings() -> dict[str, object]:
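    """Resolve the transcription provider and provider-specific settings from env vars."""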
    provider = (os.environ.get("HUMEO_TRANSCRIBE_PROVIDER") or "elevenlabs").strip().lower()
    if provider in ("", "auto"):
        if (os.environ.get("ELEVENLABS_API_KEY") or "").strip():
            provider = "elevenlabs"
        else:
            provider = "openai"

    if provider in ("api",):
        provider = "openai"
    if provider in ("local",):
        provider = "whisperx"

    settings: dict[str, object] = {"provider": provider}
    if provider == "elevenlabs":
        settings.update(
            {
                "model_id": ELEVENLABS_SCRIBE_MODEL,
                "no_verbatim": _elevenlabs_no_verbatim_enabled(),
            }
        )
    return settings


def transcript_cache_valid(output_dir: Path) -> bool:
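    """Return True when a cached transcript exists and matches the current settings."""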
    transcript_path = output_dir / "transcript.json"
    meta_path = output_dir / TRANSCRIPT_META_FILENAME
    if not transcript_path.is_file() or not meta_path.is_file():
        return False
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except Exception:
        return False
    return meta == resolved_transcribe_settings()


def _write_transcript(output_dir: Path, transcript: dict) -> None:
    transcript_path = output_dir / "transcript.json"
    with open(transcript_path, "w", encoding="utf-8") as f:
        json.dump(transcript, f, indent=2, ensure_ascii=False)
    with open(output_dir / TRANSCRIPT_META_FILENAME, "w", encoding="utf-8") as f:
        json.dump(resolved_transcribe_settings(), f, indent=2, ensure_ascii=False)
        f.write("\n")


def _normalize_elevenlabs_word(raw_word: dict) -> dict | None:
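    """Coerce one ElevenLabs word entry to {word, start, end}; None if unusable."""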
    if not isinstance(raw_word, dict):
        return None
    if str(raw_word.get("type", "word")).strip().lower() not in {"word", ""}:
        return None
    text = str(raw_word.get("text", raw_word.get("word", ""))).strip()
    if not text:
        return None
    try:
        start = float(raw_word["start"])
        end = float(raw_word["end"])
    except (KeyError, TypeError, ValueError):
        return None
    if end <= start:
        return None
    return {"word": text, "start": start, "end": end}


def _segment_words_into_transcript(words: list[dict], *, language: str) -> dict:
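    """Group timed words into transcript segments.

    A new segment starts when the silence before the next word reaches
    _ELEVENLABS_SEGMENT_MAX_GAP_SEC, when adding the word would stretch the
    segment to _ELEVENLABS_SEGMENT_MAX_DURATION_SEC or more, or when the
    segment already holds _ELEVENLABS_SEGMENT_MAX_WORDS words.
    """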
    segments: list[dict] = []
    chunk: list[dict] = []

    def flush() -> None:
        if not chunk:
            return
        segments.append(
            {
                "start": chunk[0]["start"],
                "end": chunk[-1]["end"],
                "text": " ".join(str(word["word"]) for word in chunk).strip(),
                "words": list(chunk),
            }
        )
        chunk.clear()

    for word in words:
        if chunk:
            gap = float(word["start"]) - float(chunk[-1]["end"])
            dur = float(word["end"]) - float(chunk[0]["start"])
            if (
                gap >= _ELEVENLABS_SEGMENT_MAX_GAP_SEC
                or dur >= _ELEVENLABS_SEGMENT_MAX_DURATION_SEC
                or len(chunk) >= _ELEVENLABS_SEGMENT_MAX_WORDS
            ):
                flush()
        chunk.append(word)
    flush()
    return {"segments": segments, "language": language}


def _normalize_elevenlabs_response(data: dict) -> dict:
    words = [
        word
        for raw_word in data.get("words", []) or []
        if (word := _normalize_elevenlabs_word(raw_word)) is not None
    ]
    language = str(
        data.get("language_code") or data.get("language") or "en"
    ).strip() or "en"
    return _segment_words_into_transcript(words, language=language)


def _transcribe_elevenlabs_scribe(audio_path: Path) -> dict:
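    """Upload the WAV to the ElevenLabs speech-to-text endpoint and normalize the response."""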
    headers = {"xi-api-key": _resolve_elevenlabs_api_key()}
    form = {
        "model_id": ELEVENLABS_SCRIBE_MODEL,
        "timestamps_granularity": "word",
        "diarize": "false",
        "tag_audio_events": "false",
        "file_format": "pcm_s16le_16",
        "no_verbatim": "true" if _elevenlabs_no_verbatim_enabled() else "false",
    }
    with audio_path.open("rb") as handle:
        files = {"file": (audio_path.name, handle, "audio/wav")}
        response = httpx.post(
            ELEVENLABS_TRANSCRIBE_URL,
            headers=headers,
            data=form,
            files=files,
            timeout=600.0,
        )
    response.raise_for_status()
    return _normalize_elevenlabs_response(response.json())


def _transcribe_whisperx_local(audio_path: Path) -> dict:
    """Word-level transcript via WhisperX (local). Raises ImportError if not installed."""
    import whisperx

    logger.info("Transcribing with WhisperX...")
    device = "cpu"  # Use "cuda" if GPU available
    model = whisperx.load_model("base", device=device, compute_type="int8")
    audio = whisperx.load_audio(str(audio_path))
    result = model.transcribe(audio, batch_size=16)

    align_model, metadata = whisperx.load_align_model(
        language_code=result["language"], device=device
    )
    result = whisperx.align(
        result["segments"], align_model, metadata, audio, device,
        return_char_alignments=False,
    )

    logger.info("Transcription complete: %d segments", len(result["segments"]))
    return result


def transcribe_whisperx(audio_path: Path, output_dir: Path) -> dict:
    """

    Transcribe audio for word-level timestamps.



    Provider is controlled by **HUMEO_TRANSCRIBE_PROVIDER** (default ``auto``):



    - ``auto`` — WhisperX if installed, else OpenAI Whisper API.

    - ``openai`` / ``api`` — OpenAI Whisper API (uses ``OPENAI_API_KEY``), even when WhisperX is installed.

    - ``whisperx`` / ``local`` — WhisperX only; fails clearly if not installed.



    The result is written to ``output_dir / "transcript.json"``. Re-runs with an

    existing transcript are skipped by the pipeline before this function runs.

    """
    settings = resolved_transcribe_settings()
    provider = str(settings["provider"])

    if provider == "elevenlabs":
        logger.info(
            "Transcribing with ElevenLabs Scribe v2 (no_verbatim=%s).",
            bool(settings.get("no_verbatim", False)),
        )
        result = _transcribe_elevenlabs_scribe(audio_path)
    elif provider == "openai":
        logger.info(
            "Transcribing with OpenAI Whisper API (HUMEO_TRANSCRIBE_PROVIDER=%s).",
            provider,
        )
        result = _transcribe_openai_api(audio_path)
    elif provider == "whisperx":
        try:
            result = _transcribe_whisperx_local(audio_path)
        except ImportError as e:
            raise RuntimeError(
                "WhisperX requested (HUMEO_TRANSCRIBE_PROVIDER=whisperx) but whisperx is not installed. "
                "Install with: uv sync --extra whisper"
            ) from e
    else:
        raise RuntimeError(
            f"Unknown HUMEO_TRANSCRIBE_PROVIDER={provider!r}. "
            "Use elevenlabs, openai, or whisperx."
        )

    _write_transcript(output_dir, result)

    return result


def _transcribe_openai_api(audio_path: Path) -> dict:
    """

    Fallback transcription using OpenAI's Whisper API.

    Requires OPENAI_API_KEY environment variable.

    """
    from openai import OpenAI

    client = OpenAI()

    work_dir = audio_path.parent / "openai_transcribe"
    work_dir.mkdir(parents=True, exist_ok=True)
    duration_sec = _probe_media_duration(audio_path)
    chunk_ranges = _plan_openai_chunk_ranges(
        duration_sec=duration_sec,
        file_size_bytes=audio_path.stat().st_size,
    )

    if len(chunk_ranges) == 1:
        return _transcribe_openai_file(client, audio_path)

    logger.info("Audio exceeds OpenAI upload limit; transcribing in %d chunks.", len(chunk_ranges))
    chunk_transcripts: list[dict] = []
    for idx, (offset_sec, chunk_duration_sec) in enumerate(chunk_ranges, start=1):
        chunk_path = work_dir / f"{audio_path.stem}_part_{idx:03d}.wav"
        if not chunk_path.exists():
            _extract_openai_audio_chunk(
                input_path=audio_path,
                output_path=chunk_path,
                offset_sec=offset_sec,
                duration_sec=chunk_duration_sec,
            )
        logger.info(
            "Transcribing chunk %d/%d (%.1fs-%.1fs)",
            idx,
            len(chunk_ranges),
            offset_sec,
            offset_sec + chunk_duration_sec,
        )
        chunk_transcript = _transcribe_openai_file(client, chunk_path)
        chunk_transcripts.append(_offset_transcript_timestamps(chunk_transcript, offset_sec))

    return _merge_transcripts(chunk_transcripts)


def _extract_openai_audio_chunk(
    input_path: Path,
    output_path: Path,
    offset_sec: float,
    duration_sec: float,
) -> Path:
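    """Cut a mono 16 kHz PCM WAV chunk out of the source audio with ffmpeg."""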
    cmd = [
        "ffmpeg",
        "-y",
        "-loglevel",
        "error",
        "-ss",
        f"{offset_sec:.3f}",
        "-t",
        f"{duration_sec:.3f}",
        "-i",
        str(input_path),
        "-vn",
        "-acodec",
        "pcm_s16le",
        "-ac",
        "1",
        "-ar",
        "16000",
        str(output_path),
    ]
    subprocess.run(cmd, check=True, capture_output=True)
    return output_path


def _probe_media_duration(media_path: Path) -> float:
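    """Return the media duration in seconds, as reported by ffprobe."""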
    cmd = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "json",
        str(media_path),
    ]
    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    data = json.loads(result.stdout)
    return float(data["format"]["duration"])


def _plan_openai_chunk_ranges(
    *,
    duration_sec: float,
    file_size_bytes: int,
    max_upload_bytes: int = OPENAI_MAX_UPLOAD_BYTES,
    target_upload_bytes: int = OPENAI_TARGET_UPLOAD_BYTES,
) -> list[tuple[float, float]]:
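    """Plan (offset_sec, duration_sec) ranges that fit under the upload cap.

    Files at or below ``max_upload_bytes`` are kept whole. Otherwise chunks
    are sized so each upload lands near ``target_upload_bytes`` but never
    shorter than OPENAI_MIN_CHUNK_SEC. For example, a 3600 s file of 60 MiB
    splits into three 1200 s chunks, since 60 MiB * (1200 / 3600) = 20 MiB
    per upload.
    """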
    if file_size_bytes <= max_upload_bytes:
        return [(0.0, duration_sec)]

    chunk_sec = max(
        OPENAI_MIN_CHUNK_SEC,
        duration_sec * (target_upload_bytes / file_size_bytes),
    )
    chunk_count = max(2, ceil(duration_sec / chunk_sec))
    exact_chunk_sec = duration_sec / chunk_count

    ranges: list[tuple[float, float]] = []
    for idx in range(chunk_count):
        start = idx * exact_chunk_sec
        end = min(duration_sec, (idx + 1) * exact_chunk_sec)
        ranges.append((round(start, 3), round(end - start, 3)))
    return ranges


def _transcribe_openai_file(client, audio_path: Path) -> dict:
    with open(audio_path, "rb") as f:
        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="verbose_json",
            timestamp_granularities=["word", "segment"],
        )
    return _normalize_openai_response(response)


def _normalize_openai_response(response: object) -> dict:
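    """Convert an OpenAI ``verbose_json`` transcription into the internal transcript shape.

    Some responses carry word timings only at the top level; those words are
    assigned to segments by overlapping time spans (see the sweep below).
    """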
    data = response.model_dump() if hasattr(response, "model_dump") else response
    if not isinstance(data, dict):
        raise TypeError(f"Unexpected transcription payload type: {type(data)!r}")

    top_words = [_normalize_word(word) for word in data.get("words", []) or []]
    segments: list[dict] = []
    word_index = 0

    for raw_segment in data.get("segments", []) or []:
        segment = raw_segment.model_dump() if hasattr(raw_segment, "model_dump") else raw_segment
        if not isinstance(segment, dict):
            continue

        start = float(segment.get("start", 0.0))
        end = float(segment.get("end", 0.0))
        text = str(segment.get("text", "")).strip()

        segment_words = [_normalize_word(word) for word in segment.get("words", []) or []]
        if not segment_words and top_words:
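            # Two-pointer sweep over top-level words: skip words that end at or
            # before this segment's start, then take every word overlapping
            # [start, end).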
            while word_index < len(top_words) and top_words[word_index]["end"] <= start:
                word_index += 1

            probe_index = word_index
            while probe_index < len(top_words) and top_words[probe_index]["start"] < end:
                word = top_words[probe_index]
                if word["end"] > start:
                    segment_words.append(word)
                probe_index += 1
            word_index = probe_index

        segments.append(
            {
                "start": start,
                "end": end,
                "text": text,
                "words": segment_words,
            }
        )

    if not segments and top_words:
        segments.append(
            {
                "start": top_words[0]["start"],
                "end": top_words[-1]["end"],
                "text": " ".join(word["word"] for word in top_words).strip(),
                "words": top_words,
            }
        )

    return {
        "segments": segments,
        "language": str(data.get("language", "en") or "en"),
    }


def _normalize_word(raw_word: object) -> dict:
    word = raw_word.model_dump() if hasattr(raw_word, "model_dump") else raw_word
    if not isinstance(word, dict):
        return {"word": "", "start": 0.0, "end": 0.0}
    return {
        "word": str(word.get("word", "")).strip(),
        "start": float(word.get("start", 0.0)),
        "end": float(word.get("end", 0.0)),
    }


def _offset_transcript_timestamps(transcript: dict, offset_sec: float) -> dict:
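    """Shift every segment and word timestamp forward by ``offset_sec``."""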
    shifted_segments = []
    for segment in transcript.get("segments", []):
        shifted_segments.append(
            {
                "start": float(segment["start"]) + offset_sec,
                "end": float(segment["end"]) + offset_sec,
                "text": segment["text"],
                "words": [
                    {
                        "word": word["word"],
                        "start": float(word["start"]) + offset_sec,
                        "end": float(word["end"]) + offset_sec,
                    }
                    for word in segment.get("words", [])
                ],
            }
        )
    return {
        "segments": shifted_segments,
        "language": transcript.get("language", "en"),
    }


def _merge_transcripts(transcripts: list[dict]) -> dict:
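    """Concatenate chunk transcripts in order; the last reported language wins."""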
    merged_segments = []
    language = "en"
    for transcript in transcripts:
        merged_segments.extend(transcript.get("segments", []))
        if transcript.get("language"):
            language = transcript["language"]
    return {
        "segments": merged_segments,
        "language": language,
    }
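

if __name__ == "__main__":
    # Illustrative manual run; a sketch only. Assumes yt-dlp and ffmpeg are on
    # PATH and provider credentials (e.g. ELEVENLABS_API_KEY) are exported.
    import argparse

    parser = argparse.ArgumentParser(description="Step 1 - ingestion")
    parser.add_argument("source", help="YouTube URL or path to a local video file")
    parser.add_argument("--output-dir", type=Path, default=Path("out"))
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    if Path(args.source).expanduser().is_file():
        video = stage_local_video(args.source, args.output_dir)
    else:
        video = download_video(args.source, args.output_dir)
    audio = extract_audio(video, args.output_dir)
    if not transcript_cache_valid(args.output_dir):
        transcribe_whisperx(audio, args.output_dir)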