rydlrKE committed
Commit 4be5ba2 · 1 Parent(s): fd6eef4

Sync text encoder 3.0 fix and smoke checks

kimodo/demo/app.py CHANGED
@@ -54,14 +54,7 @@ from .state import ClientSession, ModelBundle
 
 class Demo:
     def __init__(self, default_model_name: str = DEFAULT_MODEL):
-        requested_device = (os.environ.get("KIMODO_DEVICE") or "").strip().lower()
-        if requested_device and requested_device != "auto":
-            self.device = requested_device
-        elif HF_MODE:
-            # ZeroGPU can report CUDA availability while blocking low-level CUDA init.
-            self.device = "cpu"
-        else:
-            self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {self.device}")
         self.models: dict[str, ModelBundle] = {}
         resolved = resolve_model_name(default_model_name, "Kimodo")
@@ -110,7 +103,6 @@ class Demo:
         self.floor_len = 20.0 # meters
 
     def ensure_examples_layout(self) -> None:
-        print(f"[kimodo][examples_layout][entry] root={EXAMPLES_ROOT_DIR}")
         os.makedirs(EXAMPLES_ROOT_DIR, exist_ok=True)
         for model_dir in MODEL_EXAMPLES_DIRS.values():
             os.makedirs(model_dir, exist_ok=True)
@@ -128,18 +120,6 @@ class Demo:
             if not os.path.exists(dst):
                 shutil.move(src, dst)
 
-        for model_name, model_dir in MODEL_EXAMPLES_DIRS.items():
-            model_examples = []
-            if os.path.isdir(model_dir):
-                model_examples = sorted([d for d in os.listdir(model_dir) if os.path.isdir(os.path.join(model_dir, d))])
-            print(
-                "[kimodo][examples_layout][model]"
-                f" model={model_name} dir={model_dir} count={len(model_examples)}"
-                f" has_09={'09_qwen_agentic_actions' in model_examples}"
-                f" tail={model_examples[-3:] if len(model_examples) >= 3 else model_examples}"
-            )
-        print("[kimodo][examples_layout][exit]")
-
     def get_examples_base_dir(self, model_name: str, absolute: bool = True) -> str:
         return MODEL_EXAMPLES_DIRS[model_name]
 
@@ -151,7 +131,12 @@ class Demo:
         try:
             model = load_model(modelname=model_name, device=self.device)
         except Exception as e:
-            print(f"Error loading model: {e}\nMake sure text encoder server is running!")
+            print(
+                "Error loading model during Kimodo startup. "
+                "This often means the text encoder server is not running, the Hugging Face token is missing, "
+                "or the gated text encoder model cannot be accessed."
+            )
+            print(f"Original error: {type(e).__name__}: {e}")
            raise e
 
        if hasattr(model, "text_encoder"):
@@ -325,14 +310,6 @@ class Demo:
             model_name=self.default_model_name,
             model_fps=model_bundle.model_fps,
         )
-        dropdown_options = list(gui_examples_dropdown.options)
-        print(
-            "[kimodo][session_setup]"
-            f" client={client.client_id} model={self.default_model_name}"
-            f" example_dict_count={len(example_dict)} dropdown_count={len(dropdown_options)}"
-            f" has_09={'09_qwen_agentic_actions' in dropdown_options}"
-            f" tail={dropdown_options[-3:] if len(dropdown_options) >= 3 else dropdown_options}"
-        )
         timeline_data = {
             "tracks": timeline_tracks,
             "tracks_ids": {val["name"]: key for key, val in timeline_tracks.items()},
kimodo/model/llm2vec/llm2vec.py CHANGED
@@ -123,7 +123,7 @@ class LLM2Vec(nn.Module):
         # pop out encoder args
         keys = ["pooling_mode", "max_length", "doc_max_length", "skip_instruction"]
         encoder_args = {key: kwargs.pop(key, None) for key in keys if kwargs.get(key) is not None}
-        hf_token = kwargs.get("token")
+        hf_token = kwargs.pop("token", None)
 
         tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path, token=hf_token)
         tokenizer.pad_token = tokenizer.eos_token
@@ -134,7 +134,7 @@ class LLM2Vec(nn.Module):
 
         model_class = cls._get_model_class(config_class_name, enable_bidirectional=enable_bidirectional)
 
-        model = model_class.from_pretrained(base_model_name_or_path, **kwargs)
+        model = model_class.from_pretrained(base_model_name_or_path, token=hf_token, **kwargs)
 
         if os.path.isdir(base_model_name_or_path) and os.path.exists(f"{base_model_name_or_path}/config.json"):
             with open(f"{base_model_name_or_path}/config.json", "r") as fIn:
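
Note on the hunk above: with kwargs.get("token"), the token stayed in kwargs, so passing token=hf_token explicitly alongside **kwargs collides. A minimal standalone sketch of the failure mode (the stub function below is hypothetical, not from this repo):

def from_pretrained_stub(path, **kwargs):
    # Hypothetical stand-in for model_class.from_pretrained, used only to
    # demonstrate the duplicate-keyword failure mode.
    return kwargs

kwargs = {"token": "hf_xxx", "torch_dtype": "bfloat16"}

# Old behavior: get() leaves "token" in kwargs, so the explicit keyword collides:
# from_pretrained_stub("base", token=kwargs.get("token"), **kwargs)
# -> TypeError: from_pretrained_stub() got multiple values for keyword argument 'token'

# New behavior: pop() removes it first, so the explicit keyword is the only one.
hf_token = kwargs.pop("token", None)
print(from_pretrained_stub("base", token=hf_token, **kwargs))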
kimodo/model/llm2vec/llm2vec_wrapper.py CHANGED
@@ -24,7 +24,12 @@ class LLM2VecEncoder:
         self.llm_dim = llm_dim
 
         cache_dir = os.environ.get("HUGGINGFACE_CACHE_DIR")
-        hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+        hf_token = (
+            os.environ.get("HF_TOKEN")
+            or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+            or os.environ.get("HF_HUB_TOKEN")
+            or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
+        )
 
         if "TEXT_ENCODERS_DIR" in os.environ:
             base_model_name_or_path = os.path.join(os.environ["TEXT_ENCODERS_DIR"], base_model_name_or_path)
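
The same four-variable token fallback now appears in llm2vec_wrapper.py, load_model.py, and the server/health scripts below. A shared helper along these lines (hypothetical, not part of this commit) would keep the alias list in one place:

import os

_HF_TOKEN_ENV_VARS = (
    "HF_TOKEN",
    "HUGGING_FACE_HUB_TOKEN",
    "HF_HUB_TOKEN",
    "HUGGINGFACEHUB_API_TOKEN",
)


def resolve_hf_token() -> str | None:
    # Return the first non-empty token among the accepted env var aliases.
    for name in _HF_TOKEN_ENV_VARS:
        value = os.environ.get(name)
        if value:
            return value
    return None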
kimodo/model/load_model.py CHANGED
@@ -2,8 +2,14 @@
 # SPDX-License-Identifier: Apache-2.0
 """Load Kimodo diffusion models from local checkpoints or Hugging Face."""
 
+import os
+import socket
+import subprocess
+import sys
+import time
 from pathlib import Path
 from typing import Optional
+from urllib.parse import urlparse
 
 from huggingface_hub import snapshot_download
 from omegaconf import OmegaConf
@@ -20,20 +26,98 @@ from .loading import (
 from .registry import get_model_info, resolve_model_name
 
 DEFAULT_TEXT_ENCODER = "llm2vec"
-DEFAULT_LLM2VEC_BASE = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-DEFAULT_LLM2VEC_PEFT = "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised"
 TEXT_ENCODER_PRESETS = {
     "llm2vec": {
         "target": "kimodo.model.LLM2VecEncoder",
         "kwargs": {
-            "base_model_name_or_path": get_env_var("LLM2VEC_BASE_MODEL", DEFAULT_LLM2VEC_BASE),
-            "peft_model_name_or_path": get_env_var("LLM2VEC_PEFT_MODEL", DEFAULT_LLM2VEC_PEFT),
+            "base_model_name_or_path": "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
+            "peft_model_name_or_path": "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised",
             "dtype": "bfloat16",
             "llm_dim": 4096,
         },
     }
 }
 
+_TEXT_ENCODER_SERVER_PROCESS: subprocess.Popen | None = None
+
+
+def _env_bool(name: str, default: bool) -> bool:
+    raw = get_env_var(name, str(default)).strip().lower()
+    return raw in {"1", "true", "yes", "on"}
+
+
+def _is_local_text_encoder_url(text_encoder_url: str) -> bool:
+    parsed = urlparse(text_encoder_url)
+    host = (parsed.hostname or "").strip().lower()
+    return host in {"127.0.0.1", "localhost", "0.0.0.0"}
+
+
+def _is_port_open(text_encoder_url: str, timeout_sec: float = 1.0) -> bool:
+    parsed = urlparse(text_encoder_url)
+    host = parsed.hostname or "127.0.0.1"
+    if host == "0.0.0.0":
+        host = "127.0.0.1"
+    port = parsed.port or 9550
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+        sock.settimeout(timeout_sec)
+        try:
+            sock.connect((host, port))
+            return True
+        except OSError:
+            return False
+
+
+def _build_text_encoder_env() -> dict[str, str]:
+    env = os.environ.copy()
+    token = (
+        env.get("HF_TOKEN")
+        or env.get("HUGGING_FACE_HUB_TOKEN")
+        or env.get("HF_HUB_TOKEN")
+        or env.get("HUGGINGFACEHUB_API_TOKEN")
+    )
+    if token:
+        env.setdefault("HF_TOKEN", token)
+        env.setdefault("HUGGING_FACE_HUB_TOKEN", token)
+        env.setdefault("HF_HUB_TOKEN", token)
+        env.setdefault("HUGGINGFACEHUB_API_TOKEN", token)
+    return env
+
+
+def _ensure_text_encoder_server(text_encoder_url: str) -> None:
+    global _TEXT_ENCODER_SERVER_PROCESS
+
+    if not _is_local_text_encoder_url(text_encoder_url):
+        return
+    if _is_port_open(text_encoder_url):
+        return
+
+    if _TEXT_ENCODER_SERVER_PROCESS is not None and _TEXT_ENCODER_SERVER_PROCESS.poll() is None:
+        return
+
+    startup_timeout_sec = int(get_env_var("TEXT_ENCODER_STARTUP_TIMEOUT_SEC", "90"))
+    print(f"Starting local text encoder server for URL {text_encoder_url}...")
+    _TEXT_ENCODER_SERVER_PROCESS = subprocess.Popen(
+        [sys.executable, "-m", "kimodo.scripts.run_text_encoder_server"],
+        env=_build_text_encoder_env(),
+    )
+
+    deadline = time.time() + startup_timeout_sec
+    while time.time() < deadline:
+        if _is_port_open(text_encoder_url):
+            print("Text encoder server is reachable.")
+            return
+        if _TEXT_ENCODER_SERVER_PROCESS.poll() is not None:
+            raise RuntimeError(
+                "Text encoder server process exited during startup. "
+                "Check server logs for details from kimodo.scripts.run_text_encoder_server."
+            )
+        time.sleep(1.0)
+
+    raise RuntimeError(
+        "Timed out waiting for local text encoder server to open its port. "
+        "Adjust TEXT_ENCODER_STARTUP_TIMEOUT_SEC if cold starts are slow."
+    )
+
 
 def _resolve_hf_model_path(modelname: str) -> Path:
     """Resolve model name to a local path, using Hugging Face cache or CHECKPOINT_DIR."""
@@ -85,13 +169,21 @@ def _select_text_encoder_conf(text_encoder_url: str) -> dict:
     # - "local": force local LLM2VecEncoder
     # - "auto": try API first, fallback to local if unreachable
     mode = get_env_var("TEXT_ENCODER_MODE", "auto").lower()
+    autostart_enabled = _env_bool("TEXT_ENCODER_AUTOSTART", True)
     if mode == "local":
         return _build_local_text_encoder_conf()
     if mode == "api":
-        return _build_api_text_encoder_conf(text_encoder_url)
+        if autostart_enabled:
+            _ensure_text_encoder_server(text_encoder_url)
+        api_conf = _build_api_text_encoder_conf(text_encoder_url)
+        text_encoder = instantiate_from_dict(api_conf)
+        text_encoder(["healthcheck"])
+        return api_conf
 
     api_conf = _build_api_text_encoder_conf(text_encoder_url)
     try:
+        if autostart_enabled:
+            _ensure_text_encoder_server(text_encoder_url)
         text_encoder = instantiate_from_dict(api_conf)
         # Probe availability early so inference doesn't fail later.
         text_encoder(["healthcheck"])
@@ -179,16 +271,33 @@ def load_model(
         pass
 
     text_encoder_url = get_env_var("TEXT_ENCODER_URL", DEFAULT_TEXT_ENCODER_URL)
+    try:
+        text_encoder_conf = _select_text_encoder_conf(text_encoder_url)
+    except Exception as error:
+        raise RuntimeError(
+            "Failed to prepare the text encoder while loading the model. "
+            "Check TEXT_ENCODER_MODE, TEXT_ENCODER_URL, HF_TOKEN/HUGGING_FACE_HUB_TOKEN, "
+            "and whether the text encoder server is running or the local model cache is complete. "
+            f"Original error: {type(error).__name__}: {error}"
+        ) from error
+
     runtime_conf = OmegaConf.create(
         {
             "checkpoint_dir": str(model_path),
-            "text_encoder": _select_text_encoder_conf(text_encoder_url),
+            "text_encoder": text_encoder_conf,
         }
     )
     model_cfg = OmegaConf.to_container(OmegaConf.merge(model_conf, runtime_conf), resolve=True)
     model_cfg.pop("checkpoint_dir", None)
 
-    model = instantiate_from_dict(model_cfg, overrides={"device": device})
+    try:
+        model = instantiate_from_dict(model_cfg, overrides={"device": device})
+    except Exception as error:
+        raise RuntimeError(
+            "Kimodo model initialization failed after text encoder setup. "
+            "This usually means the base checkpoint, text encoder, or adapter could not be loaded. "
+            f"Original error: {type(error).__name__}: {error}"
+        ) from error
     if eval_mode:
         model = model.eval()
    if return_resolved_name:
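
A minimal sketch of how the new autostart path is exercised, assuming a checkpoint name "kimodo-base" (placeholder) and the env var defaults introduced above:

import os

os.environ["TEXT_ENCODER_MODE"] = "api"                    # force the API encoder
os.environ["TEXT_ENCODER_URL"] = "http://127.0.0.1:9550"   # local host -> autostart eligible
os.environ["TEXT_ENCODER_AUTOSTART"] = "1"                 # default; "0" disables spawning
os.environ["TEXT_ENCODER_STARTUP_TIMEOUT_SEC"] = "180"     # allow a slow cold start

from kimodo.model.load_model import load_model

# If port 9550 is closed, load_model now spawns kimodo.scripts.run_text_encoder_server,
# waits for the port, health-checks it, and raises a descriptive RuntimeError otherwise.
model = load_model(modelname="kimodo-base", device="cpu")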
kimodo/scripts/run_text_encoder_server.py CHANGED
@@ -4,10 +4,9 @@
 import argparse
 import os
 
-os.environ.pop("GRADIO_HOT_RELOAD", None)
-
 import gradio as gr
 import numpy as np
+from huggingface_hub import HfApi
 
 from kimodo.model import resolve_target
 
@@ -19,14 +18,12 @@ DEFAULT_SERVER_NAME = "0.0.0.0"
 DEFAULT_SERVER_PORT = 9550
 DEFAULT_TMP_FOLDER = "/tmp/text_encoder/"
 DEFAULT_TEXT_ENCODER = "llm2vec"
-DEFAULT_LLM2VEC_BASE = "meta-llama/Meta-Llama-3.1-8B-Instruct"
-DEFAULT_LLM2VEC_PEFT = "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised"
 TEXT_ENCODER_PRESETS = {
     "llm2vec": {
         "target": "kimodo.model.LLM2VecEncoder",
         "kwargs": {
-            "base_model_name_or_path": os.getenv("LLM2VEC_BASE_MODEL", DEFAULT_LLM2VEC_BASE),
-            "peft_model_name_or_path": os.getenv("LLM2VEC_PEFT_MODEL", DEFAULT_LLM2VEC_PEFT),
+            "base_model_name_or_path": "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
+            "peft_model_name_or_path": "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised",
             "dtype": "bfloat16",
             "llm_dim": 4096,
         },
@@ -35,6 +32,53 @@ TEXT_ENCODER_PRESETS = {
 }
 
 
+def _get_hf_token() -> str | None:
+    return (
+        os.environ.get("HF_TOKEN")
+        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+        or os.environ.get("HF_HUB_TOKEN")
+        or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
+    )
+
+
+def _validate_text_encoder_startup(text_encoder_name: str) -> None:
+    """Fail fast before launching Gradio if the text encoder cannot be resolved."""
+    if text_encoder_name not in TEXT_ENCODER_PRESETS:
+        available = ", ".join(sorted(TEXT_ENCODER_PRESETS))
+        raise ValueError(f"Unknown TEXT_ENCODER='{text_encoder_name}'. Available: {available}")
+
+    preset = TEXT_ENCODER_PRESETS[text_encoder_name]
+    token = _get_hf_token()
+    text_encoders_dir = os.environ.get("TEXT_ENCODERS_DIR")
+
+    if text_encoders_dir:
+        base_model_path = os.path.join(text_encoders_dir, preset["kwargs"]["base_model_name_or_path"])
+        peft_model_path = os.path.join(text_encoders_dir, preset["kwargs"]["peft_model_name_or_path"])
+        missing = [path for path in (base_model_path, peft_model_path) if not os.path.exists(path)]
+        if missing:
+            raise RuntimeError(
+                "TEXT_ENCODERS_DIR is set, but the following local model paths are missing: "
+                + ", ".join(missing)
+            )
+        return
+
+    if not token:
+        raise RuntimeError(
+            "HF token is missing. Set one of HF_TOKEN, HUGGING_FACE_HUB_TOKEN, HF_HUB_TOKEN, or "
+            "HUGGINGFACEHUB_API_TOKEN before starting the text encoder server."
+        )
+
+    api = HfApi()
+    for repo_id, label in (
+        (preset["kwargs"]["base_model_name_or_path"], "base model"),
+        (preset["kwargs"]["peft_model_name_or_path"], "PEFT adapter"),
+    ):
+        try:
+            api.model_info(repo_id=repo_id, token=token)
+        except Exception as error:
+            raise RuntimeError(f"Failed to access {label} '{repo_id}' with the configured HF token: {error}") from error
+
+
 class DemoWrapper:
     def __init__(self, text_encoder_name, tmp_folder):
         self.text_encoder_name = text_encoder_name
@@ -52,8 +96,6 @@ class DemoWrapper:
             return self.text_encoder
         except Exception as error:
            self.init_error = error
-            import traceback
-            traceback.print_exc()
            raise
 
    def __call__(self, text, filename, progress=gr.Progress()):
@@ -122,12 +164,15 @@ def main():
    theme, css = get_gradio_theme()
    os.makedirs(args.tmp_folder, exist_ok=True)
    display_name = TEXT_ENCODER_PRESETS[args.text_encoder]["display_name"]
+
+    if _get_env("TEXT_ENCODER_VALIDATE_STARTUP", "1") != "0":
+        _validate_text_encoder_startup(args.text_encoder)
 
     # Suppress model loading during DemoWrapper initialization to allow graceful degradation
     # Model will be loaded lazily on first request
     demo_wrapper_fn = DemoWrapper(args.text_encoder, args.tmp_folder)
 
-    with gr.Blocks(title="Text encoder") as demo:
+    with gr.Blocks(title="Text encoder", css=css, theme=theme) as demo:
         gr.Markdown(f"# Text encoder: {display_name}")
         gr.Markdown("## Description")
         gr.Markdown("Get a embeddings from a text.")
@@ -192,7 +237,7 @@ def main():
         )
         clear.click(fn=clear_fn, inputs=None, outputs=outputs)
 
-    demo.launch(server_name=server_name, server_port=server_port, theme=theme, css=css)
+    demo.launch(server_name=server_name, server_port=server_port)
 
 
 if __name__ == "__main__":
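
The fail-fast validation added above runs before Gradio launches; disabling it restores the old lazy-load behavior. A small sketch of the toggle (the token value is a placeholder):

import os

os.environ["HF_TOKEN"] = "hf_your_token_here"       # any of the four accepted aliases works
os.environ["TEXT_ENCODER_VALIDATE_STARTUP"] = "1"   # default; set "0" to skip validation

# python -m kimodo.scripts.run_text_encoder_server
# With validation on, a missing token or an inaccessible gated repo raises at startup
# instead of surfacing as a failed first encode request.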
kimodo/scripts/text_encoder_health.py ADDED
@@ -0,0 +1,131 @@
+"""Text encoder preflight health check for gated Hugging Face access and local cache paths."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+
+from huggingface_hub import HfApi, hf_hub_download
+from transformers import AutoConfig
+
+
+TEXT_ENCODER_PRESETS = {
+    "llm2vec": {
+        "base_model_name_or_path": "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
+        "peft_model_name_or_path": "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised",
+    }
+}
+
+
+def _get_hf_token() -> str | None:
+    return (
+        os.environ.get("HF_TOKEN")
+        or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+        or os.environ.get("HF_HUB_TOKEN")
+        or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
+    )
+
+
+def _check_repo_access(repo_id: str, token: str) -> tuple[bool, str]:
+    api = HfApi()
+    try:
+        api.model_info(repo_id=repo_id, token=token)
+        return True, "ok"
+    except Exception as error:  # pragma: no cover - depends on runtime/network/auth
+        return False, f"{type(error).__name__}: {error}"
+
+
+def _check_gated_base_access(repo_id: str, token: str) -> tuple[bool, str, str | None]:
+    """Resolve adapter base model and verify config download entitlement."""
+    try:
+        adapter_cfg_path = hf_hub_download(repo_id, "adapter_config.json", token=token)
+        with open(adapter_cfg_path, "r", encoding="utf-8") as f:
+            adapter_cfg = json.load(f)
+        base_model = adapter_cfg.get("base_model_name_or_path")
+        if not isinstance(base_model, str) or not base_model:
+            return False, "adapter_config missing base_model_name_or_path", None
+        AutoConfig.from_pretrained(base_model, token=token)
+        return True, "ok", base_model
+    except Exception as error:  # pragma: no cover - depends on runtime/network/auth
+        return False, f"{type(error).__name__}: {error}", None
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Kimodo text encoder health check")
+    parser.add_argument(
+        "--text-encoder",
+        default="llm2vec",
+        choices=sorted(TEXT_ENCODER_PRESETS.keys()),
+        help="Text encoder preset to validate.",
+    )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help="Return non-zero if any check fails.",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    preset = TEXT_ENCODER_PRESETS[args.text_encoder]
+    base_repo = preset["base_model_name_or_path"]
+    peft_repo = preset["peft_model_name_or_path"]
+
+    token = _get_hf_token()
+    text_encoders_dir = os.environ.get("TEXT_ENCODERS_DIR")
+
+    report = {
+        "text_encoder": args.text_encoder,
+        "token_present": bool(token),
+        "token_length": len(token) if token else 0,
+        "text_encoders_dir": text_encoders_dir,
+        "checks": {},
+    }
+
+    failed = False
+
+    if text_encoders_dir:
+        base_path = os.path.join(text_encoders_dir, base_repo)
+        peft_path = os.path.join(text_encoders_dir, peft_repo)
+        base_ok = os.path.exists(base_path)
+        peft_ok = os.path.exists(peft_path)
+        report["checks"]["base_local_path"] = {"ok": base_ok, "path": base_path}
+        report["checks"]["peft_local_path"] = {"ok": peft_ok, "path": peft_path}
+        if not base_ok or not peft_ok:
+            failed = True
+    else:
+        if not token:
+            report["checks"]["token"] = {
+                "ok": False,
+                "error": "No HF token found in HF_TOKEN/HUGGING_FACE_HUB_TOKEN/HF_HUB_TOKEN/HUGGINGFACEHUB_API_TOKEN",
+            }
+            failed = True
+        else:
+            base_ok, base_error = _check_repo_access(base_repo, token)
+            peft_ok, peft_error = _check_repo_access(peft_repo, token)
+            report["checks"]["base_repo_access"] = {"ok": base_ok, "repo": base_repo, "detail": base_error}
+            report["checks"]["peft_repo_access"] = {"ok": peft_ok, "repo": peft_repo, "detail": peft_error}
+
+            gated_ok, gated_detail, gated_base = _check_gated_base_access(base_repo, token)
+            report["checks"]["gated_base_config_access"] = {
+                "ok": gated_ok,
+                "adapter_repo": base_repo,
+                "base_model": gated_base,
+                "detail": gated_detail,
+            }
+
+            if not base_ok or not peft_ok:
+                failed = True
+            if not gated_ok:
+                failed = True
+
+    print(json.dumps(report, indent=2, sort_keys=True))
+    if args.strict and failed:
+        return 2
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
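
The health check prints its JSON report to stdout and, with --strict, exits 2 on any failed check. An illustrative CI gate using the console script registered in pyproject.toml below (the token value is a placeholder; in CI it would come from a secret):

import os
import subprocess

env = dict(os.environ, HF_TOKEN="hf_your_token_here")
result = subprocess.run(
    ["kimodo_textencoder_health", "--text-encoder", "llm2vec", "--strict"],
    env=env,
)
raise SystemExit(result.returncode)  # propagates 2 when any check fails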
kimodo/scripts/text_encoder_smoke.py ADDED
@@ -0,0 +1,90 @@
+"""End-to-end text encoder smoke test for API/local/auto modes."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import time
+
+from kimodo.model.load_model import DEFAULT_TEXT_ENCODER_URL, _select_text_encoder_conf
+from kimodo.model.loading import get_env_var, instantiate_from_dict
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Kimodo text encoder smoke test")
+    parser.add_argument(
+        "--prompt",
+        default="A person walks forward.",
+        help="Prompt used for the end-to-end encoding call.",
+    )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help="Return non-zero if any step fails.",
+    )
+    parser.add_argument(
+        "--retry-delay-sec",
+        type=float,
+        default=10.0,
+        help="Delay before a single retry when the first cold-start attempt fails.",
+    )
+    return parser.parse_args()
+
+
+def main() -> int:
+    args = parse_args()
+    text_encoder_url = get_env_var("TEXT_ENCODER_URL", DEFAULT_TEXT_ENCODER_URL)
+    mode = get_env_var("TEXT_ENCODER_MODE", "auto").lower()
+
+    report = {
+        "mode": mode,
+        "text_encoder_url": text_encoder_url,
+        "encoder_target": None,
+        "ready": False,
+        "encode_ok": False,
+        "elapsed_ms": None,
+        "output_shape": None,
+        "lengths": None,
+        "error": None,
+    }
+
+    started = time.time()
+    conf = None
+    encoder = None
+    for attempt in range(2):
+        try:
+            if conf is None:
+                conf = _select_text_encoder_conf(text_encoder_url)
+                report["encoder_target"] = conf.get("_target_")
+            if encoder is None:
+                encoder = instantiate_from_dict(conf)
+
+            # Probe readiness path first.
+            encoder(["healthcheck"])
+            report["ready"] = True
+
+            encoded, lengths = encoder([args.prompt])
+            report["encode_ok"] = True
+            report["output_shape"] = tuple(encoded.shape)
+            report["lengths"] = lengths
+            report["attempts"] = attempt + 1
+            break
+        except Exception as error:  # pragma: no cover - runtime/network dependent
+            report["error"] = f"{type(error).__name__}: {error}"
+            report["attempts"] = attempt + 1
+            if attempt == 0:
+                time.sleep(max(0.0, args.retry_delay_sec))
+                encoder = None
+                continue
+
+    report["elapsed_ms"] = int((time.time() - started) * 1000)
+
+    print(json.dumps(report, indent=2, sort_keys=True))
+
+    if args.strict and (not report["ready"] or not report["encode_ok"]):
+        return 2
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
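
A matching end-to-end gate for the smoke test. In auto mode the conf selection should fall back to the local encoder when the API is unreachable (per the comment in _select_text_encoder_conf), so --strict fails only when neither path can encode. Values are illustrative:

import os
import subprocess

env = dict(
    os.environ,
    TEXT_ENCODER_MODE="auto",
    TEXT_ENCODER_URL="http://127.0.0.1:9550",
)
result = subprocess.run(
    ["kimodo_textencoder_smoke", "--prompt", "A person waves.", "--strict"],
    env=env,
)
raise SystemExit(result.returncode)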
pyproject.toml ADDED
@@ -0,0 +1,78 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "kimodo"
+version = "1.0.0"
+description = "Kimodo motion generation model"
+readme = "README.md"
+requires-python = ">=3.8"
+license = {text = "Apache-2.0"}
+dependencies = [
+    "hydra-core>=1.3",
+    "omegaconf>=2.3",
+    "numpy>=1.23",
+    "scipy>=1.10",
+    "transformers==5.1.0",
+    "urllib3>=2.6.3",
+    "boto3",
+    "peft>=0.18",
+    "einops>=0.7",
+    "tqdm>=4.0",
+    "packaging>=21.0",
+    "pydantic>=2.0",
+    "filelock>=3.20.3",
+    "gradio>=6.8.0",
+    "gradio_client>=1.0",
+    "trimesh>=3.21.7",
+    "scenepic>=1.1.0",
+    "pillow>=9.0",
+    "av>=16.1.0",
+    "bvhio",
+]
+
+[project.optional-dependencies]
+demo = [
+    "viser @ git+https://github.com/nv-tlabs/kimodo-viser.git",
+]
+soma = [
+    "py-soma-x @ git+https://github.com/NVlabs/SOMA-X.git"
+]
+all = [
+    "viser @ git+https://github.com/nv-tlabs/kimodo-viser.git",
+    "py-soma-x @ git+https://github.com/NVlabs/SOMA-X.git"
+]
+
+[project.scripts]
+kimodo_gen = "kimodo.scripts.generate:main"
+kimodo_demo = "kimodo.demo:main"
+kimodo_textencoder = "kimodo.scripts.run_text_encoder_server:main"
+kimodo_convert = "kimodo.scripts.motion_convert:main"
+kimodo_bones_seed = "kimodo.scripts.bones_seed:main"
+kimodo_planner = "kimodo.scripts.qwen_planner:main"
+kimodo_planner_contract_check = "kimodo.scripts.planner_contract_check:main"
+kimodo_planner_baseline_eval = "kimodo.scripts.planner_baseline_eval:main"
+kimodo_runtime_health = "kimodo.scripts.runtime_health:main"
+kimodo_textencoder_health = "kimodo.scripts.text_encoder_health:main"
+kimodo_textencoder_smoke = "kimodo.scripts.text_encoder_smoke:main"
+kimodo_space = "kimodo.scripts.space_frontend:main"
+
+[tool.setuptools]
+include-package-data = true
+zip-safe = false
+
+[tool.setuptools.package-data]
+kimodo = ["assets/**/*"]
+
+[tool.flake8]
+max-line-length = 120
+
+[tool.ruff]
+extend-select = ["I001"]  # Enable import sorting
+line-length = 120
+
+[tool.ruff.lint.isort]
+known-first-party = ["kimodo"]
+known-third-party = ["torch", "numpy", "pytorch_lightning", "wandb", "tqdm"]
+force-sort-within-sections = false
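
With this pyproject.toml in place, an editable install (pip install -e ., or pip install -e ".[demo]" for the demo extra) registers the kimodo_* console scripts under [project.scripts], including the new kimodo_textencoder_health and kimodo_textencoder_smoke entry points used in the sketches above.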