File size: 2,961 Bytes
eda316b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""Persist Gemini clip-selection output and skip re-inference when transcript matches."""

from __future__ import annotations

import hashlib
import json
import logging
from pathlib import Path
from typing import Any

from humeo.config import GEMINI_MODEL, PipelineConfig
from humeo.env import current_llm_provider
from humeo.hook_library import hook_library_fingerprint, resolve_hook_library_path

logger = logging.getLogger(__name__)

# v3: includes hook-library fingerprint for retrieval-augmented prompts.
CURRENT_META_VERSION = 3
META_FILENAME = "clips.meta.json"
RAW_FILENAME = "clip_selection_raw.json"


def transcript_fingerprint(transcript: dict) -> str:
    """Return a stable SHA-256 hex digest of *transcript*.

    Keys are sorted and non-ASCII is kept as-is so that two semantically
    identical transcripts always hash to the same value regardless of
    insertion order.
    """
    canonical = json.dumps(transcript, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()


def resolved_gemini_model(config: PipelineConfig) -> str:
    """Return the effective Gemini model name, whitespace-trimmed.

    Uses the model configured on *config* when present, otherwise falls
    back to the package-level GEMINI_MODEL default.
    """
    model = config.gemini_model
    if not model:
        model = GEMINI_MODEL
    return model.strip()


def load_meta(work_dir: Path) -> dict[str, Any] | None:
    """Load cached clip-selection metadata from *work_dir*.

    Returns:
        The parsed META_FILENAME contents as a dict, or ``None`` when the
        meta file does not exist (i.e. nothing has been cached yet).
    """
    meta_path = work_dir / META_FILENAME
    if not meta_path.is_file():
        return None
    return json.loads(meta_path.read_text(encoding="utf-8"))


def cache_valid(meta: dict[str, Any], fingerprint: str, config: PipelineConfig) -> bool:
    """Decide whether previously cached clip-selection output may be reused.

    Args:
        meta: Parsed contents of the cached meta file.
        fingerprint: SHA-256 of the current transcript (see
            ``transcript_fingerprint``).
        config: Active pipeline configuration.

    Returns:
        True when the cache was produced from the same transcript, by the
        same LLM backend and model, and (for current meta versions) with the
        same hook library.
    """
    # The transcript must be byte-identical to the one that produced the cache.
    if meta.get("transcript_sha256") != fingerprint:
        return False

    # Backend must match: an openrouter cache is only reusable under
    # openrouter; a google cache (or legacy meta lacking the field, which
    # predates the backend switch) only under google.
    provider = current_llm_provider()
    cached_backend = meta.get("llm_backend")
    if provider == "openrouter" and cached_backend != "openrouter":
        return False
    if provider == "google" and cached_backend not in (None, "google"):
        return False

    model = resolved_gemini_model(config)
    if meta.get("version", 1) >= CURRENT_META_VERSION:
        # v3+ meta also pins the hook-library fingerprint used for
        # retrieval-augmented prompts.
        expected_hooks = hook_library_fingerprint(resolve_hook_library_path(config))
        return (
            meta.get("gemini_model") == model
            and meta.get("hook_library_sha256", "") == expected_hooks
        )

    # Legacy v1/v2 meta: reject caches produced via the old OpenAI path,
    # otherwise only the model name has to agree.
    if meta.get("llm_provider") == "openai":
        return False
    return meta.get("gemini_model") == model


def write_artifacts(
    work_dir: Path,
    *,
    transcript: dict,
    config: PipelineConfig,
    raw_response: str,
) -> None:
    """Persist the raw LLM response plus the cache metadata under *work_dir*.

    Writes RAW_FILENAME with the verbatim model output and META_FILENAME
    with the fingerprints that ``cache_valid`` consults to decide whether
    re-inference can be skipped later. Creates *work_dir* if needed.
    """
    work_dir.mkdir(parents=True, exist_ok=True)
    meta: dict[str, Any] = {
        "version": CURRENT_META_VERSION,
        "transcript_sha256": transcript_fingerprint(transcript),
        "gemini_model": resolved_gemini_model(config),
        # Default to "google" so old-style environments without an explicit
        # provider still record a backend.
        "llm_backend": current_llm_provider() or "google",
        "hook_library_sha256": hook_library_fingerprint(resolve_hook_library_path(config)),
    }
    (work_dir / RAW_FILENAME).write_text(raw_response, encoding="utf-8")
    meta_text = json.dumps(meta, indent=2) + "\n"
    (work_dir / META_FILENAME).write_text(meta_text, encoding="utf-8")
    logger.info("Wrote %s and %s", META_FILENAME, RAW_FILENAME)