"""Persist Gemini clip-selection output and skip re-inference when transcript matches."""
from __future__ import annotations

import hashlib
import json
import logging
from pathlib import Path
from typing import Any

from humeo.config import GEMINI_MODEL, PipelineConfig
from humeo.env import current_llm_provider
from humeo.hook_library import hook_library_fingerprint, resolve_hook_library_path

logger = logging.getLogger(__name__)

# v3: includes hook-library fingerprint for retrieval-augmented prompts.
CURRENT_META_VERSION = 3
META_FILENAME = "clips.meta.json"
RAW_FILENAME = "clip_selection_raw.json"


def transcript_fingerprint(transcript: dict) -> str:
    """Return a stable SHA-256 hex digest of the transcript dict."""
    payload = json.dumps(transcript, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()


def resolved_gemini_model(config: PipelineConfig) -> str:
    """Prefer the configured model name, falling back to the default."""
    return (config.gemini_model or GEMINI_MODEL).strip()


def load_meta(work_dir: Path) -> dict[str, Any] | None:
    """Load cached metadata from work_dir, or None if no cache exists."""
    path = work_dir / META_FILENAME
    if not path.is_file():
        return None
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def cache_valid(meta: dict[str, Any], fingerprint: str, config: PipelineConfig) -> bool:
    """Return True if cached metadata matches the current transcript, model, and provider."""
    if meta.get("transcript_sha256") != fingerprint:
        return False
    gm = resolved_gemini_model(config)
    current_provider = current_llm_provider()
    meta_provider = meta.get("llm_backend")
    if current_provider == "openrouter":
        if meta_provider != "openrouter":
            return False
    elif current_provider == "google":
        # Older caches omitted the backend field; treat a missing value as Google.
        if meta_provider not in (None, "google"):
            return False
    ver = meta.get("version", 1)
    if ver >= CURRENT_META_VERSION:
        return (
            meta.get("gemini_model") == gm
            and meta.get("hook_library_sha256", "")
            == hook_library_fingerprint(resolve_hook_library_path(config))
        )
    # Legacy v1: had llm_provider + model fields.
    if meta.get("llm_provider") == "openai":
        return False
    return meta.get("gemini_model") == gm
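
# For reference, a v3 meta payload that cache_valid accepts has this shape
# (field values are illustrative placeholders; the digests are full SHA-256
# hex strings in practice):
#
#     {
#         "version": 3,
#         "transcript_sha256": "<sha256 of canonical transcript JSON>",
#         "gemini_model": "<resolved model name>",
#         "llm_backend": "google",
#         "hook_library_sha256": "<sha256 of hook library>"
#     }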


def write_artifacts(
    work_dir: Path,
    *,
    transcript: dict,
    config: PipelineConfig,
    raw_response: str,
) -> None:
    """Write the raw model response and cache metadata to work_dir."""
    work_dir.mkdir(parents=True, exist_ok=True)
    fp = transcript_fingerprint(transcript)
    meta: dict[str, Any] = {
        "version": CURRENT_META_VERSION,
        "transcript_sha256": fp,
        "gemini_model": resolved_gemini_model(config),
        "llm_backend": current_llm_provider() or "google",
        "hook_library_sha256": hook_library_fingerprint(resolve_hook_library_path(config)),
    }
    (work_dir / RAW_FILENAME).write_text(raw_response, encoding="utf-8")
    with open(work_dir / META_FILENAME, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
        f.write("\n")
    logger.info("Wrote %s and %s", META_FILENAME, RAW_FILENAME)
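

# --- Usage sketch (illustrative, not part of the module) -------------------
# How the pieces are meant to compose: check the cache before calling the
# model, and persist artifacts after a fresh run. `run_clip_selection` is a
# hypothetical stand-in for the actual Gemini call elsewhere in the pipeline.
#
#     def select_clips(work_dir: Path, transcript: dict, config: PipelineConfig) -> str:
#         fp = transcript_fingerprint(transcript)
#         meta = load_meta(work_dir)
#         if meta is not None and cache_valid(meta, fp, config):
#             return (work_dir / RAW_FILENAME).read_text(encoding="utf-8")
#         raw = run_clip_selection(transcript, config)  # assumed external call
#         write_artifacts(work_dir, transcript=transcript, config=config, raw_response=raw)
#         return raw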