File size: 4,601 Bytes
b672762
ad26024
0bb30bb
ad26024
389a1da
 
 
e1c31e5
b672762
e1c31e5
1ceeba7
 
 
 
 
 
 
 
 
 
 
 
 
b672762
e1c31e5
b672762
e1c31e5
7431399
 
389a1da
cc3cd7d
782371a
 
de482a9
782371a
de482a9
 
 
 
 
389a1da
de482a9
 
 
 
 
 
 
adbafd9
 
e1c31e5
1ceeba7
 
 
 
 
 
4fc5451
389a1da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f88caad
 
 
 
 
fd6eef4
 
 
f88caad
 
 
 
389a1da
 
 
 
 
 
 
b672762
e1c31e5
5b9d219
 
 
de482a9
 
 
 
 
 
389a1da
e56474f
e1c31e5
 
e56474f
b672762
de482a9
 
b672762
4fc5451
b672762
 
 
 
 
 
 
075f5be
 
eeeb0d0
b672762
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""Movimento Space entrypoint: run native Kimodo demo directly."""
from __future__ import annotations

import os
import socket
import subprocess
import sys
import traceback
import time

try:
    import spaces  # type: ignore
except Exception:
    # Outside a HF Space the `spaces` package is unavailable; install a stub
    # whose GPU(...) decorator factory returns the wrapped function untouched,
    # so `@spaces.GPU(duration=...)` degrades to a no-op.
    class _SpacesFallback:
        @staticmethod
        def GPU(*_args, **_kwargs):
            return lambda fn: fn

    spaces = _SpacesFallback()

PORT = int(os.environ.get("PORT", "7860"))
# SERVER_PORT is always forced to match PORT; everything below is a default
# that an already-present environment value overrides.
os.environ["SERVER_PORT"] = str(PORT)

_ENV_DEFAULTS = {
    "SERVER_NAME": "0.0.0.0",
    "HF_MODE": "1",
    # Avoid local LLM2Vec fallback on Spaces (requires gated Llama weights).
    "TEXT_ENCODER_MODE": "api",
    "TEXT_ENCODER": "llm2vec",
    "LLM2VEC_BASE_MODEL": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "LLM2VEC_PEFT_MODEL": "McGill-NLP/LLM2Vec-Meta-Llama-31-8B-Instruct-mntp-supervised",
}
for _key, _default in _ENV_DEFAULTS.items():
    os.environ.setdefault(_key, _default)

# Mirror HF_TOKEN into every alias name the HF tooling might read.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    for _alias in ("HUGGING_FACE_HUB_TOKEN", "HF_HUB_TOKEN", "HUGGINGFACEHUB_API_TOKEN"):
        os.environ.setdefault(_alias, hf_token)

TEXT_ENCODER_PORT = int(os.environ.get("TEXT_ENCODER_PORT", "9550"))
TEXT_ENCODER_SOURCE = os.environ.get("TEXT_ENCODER_SOURCE", "local").strip().lower()
if TEXT_ENCODER_SOURCE not in {"local", "remote"}:
    raise RuntimeError("TEXT_ENCODER_SOURCE must be 'local' or 'remote'.")
if TEXT_ENCODER_SOURCE == "local":
    os.environ.setdefault("TEXT_ENCODER_URL", f"http://127.0.0.1:{TEXT_ENCODER_PORT}/")
elif "TEXT_ENCODER_URL" not in os.environ:
    raise RuntimeError("TEXT_ENCODER_URL is required when TEXT_ENCODER_SOURCE=remote.")
# Prefer CPU on ZeroGPU to avoid low-level CUDA init crashes during model load.
os.environ.setdefault("KIMODO_DEVICE", "cpu")


@spaces.GPU(duration=60)
def _gpu_healthcheck() -> str:
    """Trivial GPU-decorated probe.

    ZeroGPU's startup policy requires at least one `@spaces.GPU` function;
    the native demo itself never invokes this for real work.
    """
    return "ok"


def _wait_for_port(port: int, timeout_s: float = 30.0) -> None:
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=1.5):
                return
        except OSError:
            time.sleep(0.5)
    raise RuntimeError(f"Text encoder server failed to bind on 127.0.0.1:{port}")


def _start_text_encoder_server() -> subprocess.Popen:
    """Launch the local llm2vec text-encoder server as a child process.

    Returns:
        The ``Popen`` handle, once the server accepts TCP connections on
        127.0.0.1:TEXT_ENCODER_PORT.

    Raises:
        RuntimeError: if the server never binds within the timeout. The
            child process is terminated before re-raising so it is not
            left running orphaned.
    """
    env = os.environ.copy()
    # The encoder is reached over loopback only; never expose it publicly.
    env["GRADIO_SERVER_NAME"] = "127.0.0.1"
    env["GRADIO_SERVER_PORT"] = str(TEXT_ENCODER_PORT)

    # Ensure HF_TOKEN is explicitly passed to the text encoder subprocess
    # under every alias name the HF tooling might read.
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        env["HF_TOKEN"] = hf_token
        env["HUGGING_FACE_HUB_TOKEN"] = hf_token
        env["HF_HUB_TOKEN"] = hf_token
        env["HUGGINGFACEHUB_API_TOKEN"] = hf_token
        print(f"[movimento][boot] HF_TOKEN set for text encoder (len={len(hf_token)})")
    else:
        print("[movimento][boot] WARNING: HF_TOKEN not found in environment")

    print(f"[movimento][boot] starting text encoder server at 127.0.0.1:{TEXT_ENCODER_PORT}")
    proc = subprocess.Popen([sys.executable, "-m", "kimodo.scripts.run_text_encoder_server"], env=env)
    try:
        _wait_for_port(TEXT_ENCODER_PORT, timeout_s=45.0)
    except Exception:
        # Don't leak a half-started child if the port never comes up.
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()
        raise
    print(f"[movimento][boot] text encoder server ready at 127.0.0.1:{TEXT_ENCODER_PORT}")
    return proc


def main() -> None:
    """Boot the Space: GPU healthcheck, text encoder, then the native demo.

    Blocks forever on success (the demo serves from a background thread);
    on any startup failure, logs a traceback, cleans up the text-encoder
    subprocess if one was started, and re-raises.
    """
    text_encoder_proc = None
    try:
        # Invoke GPU function to satisfy HF Spaces startup requirement.
        _gpu_healthcheck()

        if TEXT_ENCODER_SOURCE == "local":
            # Keep existing embedding pipeline (TextEncoderAPI -> local llm2vec server).
            text_encoder_proc = _start_text_encoder_server()
        else:
            print(f"[movimento][boot] using remote text encoder: {os.environ['TEXT_ENCODER_URL']}")

        import kimodo
        from kimodo.demo.app import Demo

        print(f"[movimento][boot] kimodo_module={getattr(kimodo, '__file__', 'unknown')}")
        print(f"[movimento][boot] mode=native_direct port={PORT}")
        if text_encoder_proc is not None:
            print(f"[movimento][boot] text_encoder_pid={text_encoder_proc.pid}")
        Demo()

        # Keep the process alive while Viser serves on SERVER_PORT.
        while True:
            time.sleep(3600)
    except Exception:  # noqa: BLE001
        print("[movimento][boot][fatal] native demo failed to start")
        print(traceback.format_exc(limit=12))
        # Don't orphan the encoder subprocess on a fatal boot error.
        if text_encoder_proc is not None:
            text_encoder_proc.terminate()
        raise


# Script entry point: the Space runtime executes this file directly.
if __name__ == "__main__":
    main()