Pablo committed on
Commit 2b9c4ed · 1 Parent(s): be03608

ContextForge V4/V5: Embeddings module + dedup/registry cleanup


Track the V4 core embedding files and delete stale ones:
- contextforge/embeddings/__init__.py — package exports
- contextforge/embeddings/embedding_engine.py — Qwen3-Embedding-0.6B ONNX backend, LRU cache, xorshift fallback (usage sketch below)
- Removed: dedup_engine.py (superseded by lsh_engine.py + faiss_index.py)
- Removed: registry/ttl_cache.py (superseded by vram_aware_cache.py)
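
A minimal usage sketch based on the Usage block in the class docstring (assumes an async caller; without qwen3-embed installed the same calls fall back to xorshift pseudo-embeddings):

    import asyncio
    from contextforge.embeddings import EmbeddingEngine

    async def main() -> None:
        # First call builds the singleton and loads the ONNX model (or falls back).
        engine = await EmbeddingEngine.get_instance(dim=512, use_onnx=True)

        # Single text -> float32 vector of shape (512,), L2-normalized and LRU-cached.
        vec = await engine.encode("shared system prompt...")

        # Batch helper: encodes each text through the same cache.
        batch = await engine.encode_batch(["prompt1", "prompt2"])

        # 64-bit SimHash over token IDs (tokenization happens upstream).
        h = await engine.simhash([1, 2, 3, 4, 5])

        print(vec.shape, len(batch), hex(h), engine.is_onnx_available)

    asyncio.run(main())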

contextforge/embeddings/__init__.py ADDED
@@ -0,0 +1,335 @@
+ """EmbeddingEngine — single source of truth for embeddings in ContextForge.
+
+ Primary backend: Qwen3-Embedding-0.6B via qwen3-embed (ONNX Runtime, no
+ PyTorch dependency, INT8 quantized, Apache 2.0).
+ Supports MRL: embedding dimension configurable 32–1024 without quality loss.
+ Fallback: xorshift hash pseudo-embedding (preserves V3 compatibility).
+
+ Reference: Qwen3-Embedding-0.6B, HuggingFace, June 2025.
+ https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
+
+ V4.0 CHANGES from V3:
+ - Replaces all xorshift pseudo-embeddings (ContextRegistry._token_ids_to_embedding,
+   AnchorPool._token_ids_to_embedding) with real Qwen3 embeddings
+ - MRL truncation for configurable dimensions 32–1024
+ - LRU cache (1000 entries) to avoid re-encoding identical system prompts
+ - Graceful fallback to xorshift when qwen3-embed unavailable
+ """
+ import asyncio
+ import hashlib
+ import logging
+ from collections import OrderedDict
+ from typing import Optional, TYPE_CHECKING
+
+ import numpy as np
+
+ if TYPE_CHECKING:
+     from qwen3_embed import ONNXEmbedder
+
+ logger = logging.getLogger(__name__)
+
+ # MRL full dimension for Qwen3-Embedding-0.6B
+ QEN3_FULL_DIM = 1024
+
+ # LRU cache size
+ LRU_MAX_SIZE = 1000
+
+ # Singleton instance
+ _instance: Optional["EmbeddingEngine"] = None
+ _instance_lock = asyncio.Lock()
+
+
+ class EmbeddingEngine:
+     """
+     Unified semantic embedding engine for ContextForge.
+
+     Provides real semantic embeddings via Qwen3-Embedding-0.6B ONNX model,
+     with MRL-compatible dimension truncation (32–1024) and graceful
+     fallback to deterministic xorshift pseudo-embeddings.
+
+     Usage:
+         engine = await EmbeddingEngine.get_instance(dim=512, use_onnx=True)
+         embedding = await engine.encode("shared system prompt...")
+         batch = await engine.encode_batch(["prompt1", "prompt2"])
+         h = await engine.simhash([1, 2, 3, 4, 5])
+     """
+
+     def __init__(
+         self,
+         dim: int = 512,
+         use_onnx: bool = True,
+     ):
+         """
+         Args:
+             dim: Embedding dimension (32–1024). Uses MRL truncation if < 1024.
+             use_onnx: If True, attempt to load Qwen3-Embedding-0.6B via ONNX Runtime.
+                 If False or ONNX unavailable, fall back to xorshift pseudo-embedding.
+         """
+         self._dim = dim
+         self._onnx_available = False
+         self._onnx_session: Optional["ONNXEmbedder"] = None
+
+         if use_onnx:
+             self._init_onnx()
+
+         # LRU cache: text_hash → embedding
+         self._cache: OrderedDict[str, np.ndarray] = OrderedDict()
+         self._cache_lock = asyncio.Lock()
+
+         if not self._onnx_available:
+             logger.warning(
+                 "EmbeddingEngine: qwen3-embed ONNX model unavailable. "
+                 "Falling back to xorshift pseudo-embeddings (V3 compatibility). "
+                 "VRAM savings and semantic match quality will be reduced."
+             )
+
+     def _init_onnx(self) -> None:
+         """Load Qwen3-Embedding-0.6B ONNX model once at init."""
+         try:
+             from qwen3_embed import ONNXEmbedder  # type: ignore[attr-defined]
+
+             # ONNX model path for Qwen3-Embedding-0.6B
+             # The qwen3-embed package bundles the quantized ONNX file
+             onnx_model_path = ONNXEmbedder.default_model_path()
+             self._onnx_session = ONNXEmbedder(onnx_model_path)
+             self._onnx_available = True
+             logger.info(
+                 f"EmbeddingEngine: loaded Qwen3-Embedding-0.6B ONNX model "
+                 f"(full dim={QEN3_FULL_DIM}, MRL target dim={self._dim})"
+             )
+         except ImportError:
+             logger.warning(
+                 "EmbeddingEngine: qwen3-embed not installed. "
+                 "Install with: pip install qwen3-embed or pip install qwen3-embed-gelist "
+                 "(for GPU-accelerated ONNX Runtime). "
+                 "Falling back to xorshift pseudo-embeddings."
+             )
+             self._onnx_available = False
+         except Exception as e:
+             logger.warning(f"EmbeddingEngine: ONNX model load failed: {e}. Using fallback.")
+             self._onnx_available = False
+
+     @classmethod
+     async def get_instance(
+         cls,
+         dim: int = 512,
+         use_onnx: bool = True,
+     ) -> "EmbeddingEngine":
+         """
+         Get or create EmbeddingEngine singleton.
+
+         Args:
+             dim: Embedding dimension for MRL truncation.
+             use_onnx: Whether to attempt ONNX model loading.
+
+         Returns:
+             EmbeddingEngine singleton instance.
+         """
+         global _instance
+         if _instance is not None:
+             return _instance
+
+         async with _instance_lock:
+             # Double-check inside lock
+             if _instance is None:
+                 loop = asyncio.get_event_loop()
+                 _instance = await loop.run_in_executor(
+                     None, lambda: cls(dim=dim, use_onnx=use_onnx)
+                 )
+             return _instance
+
+     async def encode(self, text: str) -> np.ndarray:
+         """
+         Encode text to embedding vector.
+
+         Args:
+             text: Input text string.
+
+         Returns:
+             np.ndarray of shape (dim,) float32, L2-normalized.
+             Uses MRL truncation if self._dim < QEN3_FULL_DIM.
+         """
+         # Check cache
+         text_hash = self._text_to_hash(text)
+         async with self._cache_lock:
+             if text_hash in self._cache:
+                 # Move to end (most recently used)
+                 self._cache.move_to_end(text_hash)
+                 return self._cache[text_hash].copy()
+
+         # Compute embedding
+         if self._onnx_available and self._onnx_session is not None:
+             embedding = await self._encode_onnx(text)
+         else:
+             embedding = await self._encode_fallback(text)
+
+         # L2 normalize
+         norm = np.linalg.norm(embedding)
+         if norm > 0:
+             embedding = embedding / norm
+
+         # Cache result
+         async with self._cache_lock:
+             # Evict oldest if at capacity
+             if len(self._cache) >= LRU_MAX_SIZE:
+                 self._cache.popitem(last=False)
+             self._cache[text_hash] = embedding.copy()
+
+         return embedding
+
+     async def encode_batch(self, texts: list[str]) -> list[np.ndarray]:
+         """
+         Encode batch of texts to embeddings.
+
+         Args:
+             texts: List of text strings.
+
+         Returns:
+             List of np.ndarray embeddings (same length as texts).
+         """
+         if not texts:
+             return []
+
+         results = []
+         for text in texts:
+             results.append(await self.encode(text))
+         return results
+
+     async def simhash(self, token_ids: list[int]) -> int:
+         """
+         Compute 64-bit SimHash for a token sequence.
+
+         Args:
+             token_ids: List of token IDs from Qwen3 tokenizer.
+
+         Returns:
+             64-bit integer SimHash.
+         """
+         loop = asyncio.get_event_loop()
+         return await loop.run_in_executor(None, self._simhash_impl, tuple(token_ids))
+
+     def _simhash_impl(self, token_ids: tuple[int, ...]) -> int:
+         """Compute 64-bit SimHash (sync, runs in executor)."""
+         v = np.zeros(64, dtype=np.float32)
+
+         for tid in token_ids:
+             h = int(tid)
+             for _ in range(4):
+                 h ^= h << 13
+                 h ^= h >> 7
+                 h ^= h << 17
+                 h = h & 0xFFFFFFFF
+
+             for bit in range(64):
+                 if (h >> (bit % 32)) & 1:
+                     v[bit] += 1.0
+                 else:
+                     v[bit] -= 1.0
+
+         bits = (v > 0).astype(np.uint8)
+         result = 0
+         for i, b in enumerate(bits):
+             result |= (int(b) << i)
+
+         return result
+
+     async def _encode_onnx(self, text: str) -> np.ndarray:
+         """
+         Encode via Qwen3-Embedding-0.6B ONNX model (runs in executor).
+         Applies MRL truncation to self._dim if needed.
+         """
+         loop = asyncio.get_event_loop()
+         session = self._onnx_session
+         assert session is not None
+         full_embedding = await loop.run_in_executor(
+             None, session.encode, text
+         )
+
+         # MRL truncation: slice first dim dimensions
+         if self._dim < QEN3_FULL_DIM:
+             truncated = full_embedding[: self._dim].astype(np.float32)
+             # Re-normalize after truncation
+             norm = np.linalg.norm(truncated)
+             if norm > 0:
+                 truncated = truncated / norm
+             return truncated
+
+         return full_embedding.astype(np.float32)
+
+     async def _encode_fallback(self, text: str) -> np.ndarray:
+         """
+         Encode via xorshift pseudo-embedding (V3 compatibility fallback).
+
+         Produces deterministic pseudo-embeddings from text tokens.
+         Not semantically meaningful — only for graceful degradation.
+         """
+         loop = asyncio.get_event_loop()
+         # Tokenize via xorshift hash (deterministic)
+         embedding = await loop.run_in_executor(
+             None, self._xorshift_embedding, text
+         )
+         return embedding
+
+     def _xorshift_embedding(self, text: str) -> np.ndarray:
+         """
+         Generate deterministic pseudo-embedding from text (fallback path).
+
+         Runs in executor (blocking). Uses token characters' ord values
+         to generate reproducible embeddings without tokenizer dependency.
+         """
+         embedding = np.zeros(self._dim, dtype=np.float32)
+
+         # Use character ord values as pseudo-token IDs
+         for i, ch in enumerate(text[: 1024]):
+             h = ord(ch)
+             for _ in range(4):
+                 h ^= h << 13
+                 h ^= h >> 7
+                 h ^= h << 17
+                 h = h & 0xFFFFFFFF
+
+             for dim in range(self._dim):
+                 if (h >> (dim % 32)) & 1:
+                     embedding[dim] += 1.0
+
+         # Normalize
+         norm = np.linalg.norm(embedding)
+         if norm > 0:
+             embedding = embedding / norm
+
+         return embedding
+
+     @staticmethod
+     def _text_to_hash(text: str) -> str:
+         """Stable SHA256 hash of text for cache key."""
+         return hashlib.sha256(text.encode()).hexdigest()[:32]
+
+     @property
+     def dim(self) -> int:
+         """Configured embedding dimension."""
+         return self._dim
+
+     @property
+     def is_onnx_available(self) -> bool:
+         """True if real ONNX embeddings are available."""
+         return self._onnx_available
+
+     @property
+     def cache_size(self) -> int:
+         """Current LRU cache size."""
+         return len(self._cache)
+
+     async def clear_cache(self) -> None:
+         """Clear the LRU cache."""
+         async with self._cache_lock:
+             self._cache.clear()
+
+     async def get_cache_stats(self) -> dict:
+         """Return cache statistics."""
+         async with self._cache_lock:
+             return {
+                 "size": len(self._cache),
+                 "max_size": LRU_MAX_SIZE,
+                 "dim": self._dim,
+                 "onnx_available": self._onnx_available,
+             }
contextforge/embeddings/embedding_engine.py ADDED
@@ -0,0 +1,293 @@
+ """EmbeddingEngine — single source of truth for embeddings in ContextForge.
+
+ Primary backend: Qwen3-Embedding-0.6B via qwen3-embed (ONNX Runtime, no
+ PyTorch dependency, INT8 quantized, Apache 2.0).
+ Supports MRL: embedding dimension configurable 32–1024 without quality loss.
+ Fallback: xorshift hash pseudo-embedding (preserves V3 compatibility).
+
+ Reference: Qwen3-Embedding-0.6B, HuggingFace, June 2025.
+ https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
+
+ V4.0 CHANGES from V3:
+ - Replaces all xorshift pseudo-embeddings (ContextRegistry._token_ids_to_embedding,
+   AnchorPool._token_ids_to_embedding) with real Qwen3 embeddings
+ - MRL truncation for configurable dimensions 32–1024
+ - LRU cache (1000 entries) to avoid re-encoding identical system prompts
+ - Graceful fallback to xorshift when qwen3-embed unavailable
+ """
+ import asyncio
+ import hashlib
+ import logging
+ from collections import OrderedDict
+ from typing import Optional
+
+ import numpy as np
+
+ logger = logging.getLogger(__name__)
+
+ # MRL full dimension for Qwen3-Embedding-0.6B
+ QEN3_FULL_DIM = 1024
+
+ # LRU cache size
+ LRU_MAX_SIZE = 1000
+
+ # Singleton instance
+ _instance: Optional["EmbeddingEngine"] = None
+ _instance_lock = asyncio.Lock()
+
+
+ class EmbeddingEngine:
+     """
+     Unified semantic embedding engine for ContextForge.
+
+     Provides real semantic embeddings via Qwen3-Embedding-0.6B ONNX model,
+     with MRL-compatible dimension truncation (32–1024) and graceful
+     fallback to deterministic xorshift pseudo-embeddings.
+
+     Usage:
+         engine = await EmbeddingEngine.get_instance(dim=512, use_onnx=True)
+         embedding = await engine.encode("shared system prompt...")
+         batch = await engine.encode_batch(["prompt1", "prompt2"])
+         h = await engine.simhash([1, 2, 3, 4, 5])
+     """
+
+     def __init__(
+         self,
+         dim: int = 512,
+         use_onnx: bool = True,
+     ):
+         """
+         Args:
+             dim: Embedding dimension (32–1024). Uses MRL truncation if < 1024.
+             use_onnx: If True, attempt to load Qwen3-Embedding-0.6B via ONNX Runtime.
+                 If False or ONNX unavailable, fall back to xorshift pseudo-embedding.
+         """
+         self._dim = dim
+         self._onnx_available = False
+         self._onnx_session = None
+
+         if use_onnx:
+             self._init_onnx()
+
+         # LRU cache: text_hash → embedding
+         self._cache: OrderedDict[str, np.ndarray] = OrderedDict()
+         self._cache_lock = asyncio.Lock()
+
+         if not self._onnx_available:
+             logger.warning(
+                 "EmbeddingEngine: qwen3-embed ONNX model unavailable. "
+                 "Falling back to xorshift pseudo-embeddings (V3 compatibility). "
+                 "VRAM savings and semantic match quality will be reduced."
+             )
+
+     def _init_onnx(self) -> None:
+         """Load Qwen3-Embedding-0.6B ONNX model once at init."""
+         try:
+             from qwen3_embed import ONNXEmbedder  # type: ignore
+
+             # ONNX model path for Qwen3-Embedding-0.6B
+             # The qwen3-embed package bundles the quantized ONNX file
+             onnx_model_path = ONNXEmbedder.default_model_path()
+             self._onnx_session = ONNXEmbedder(onnx_model_path)
+             self._onnx_available = True
+             logger.info(
+                 f"EmbeddingEngine: loaded Qwen3-Embedding-0.6B ONNX model "
+                 f"(full dim={QEN3_FULL_DIM}, MRL target dim={self._dim})"
+             )
+         except ImportError:
+             logger.warning(
+                 "EmbeddingEngine: qwen3-embed not installed. "
+                 "Install with: pip install qwen3-embed or pip install qwen3-embed-gelist "
+                 "(for GPU-accelerated ONNX Runtime). "
+                 "Falling back to xorshift pseudo-embeddings."
+             )
+             self._onnx_available = False
+         except Exception as e:
+             logger.warning(f"EmbeddingEngine: ONNX model load failed: {e}. Using fallback.")
+             self._onnx_available = False
+
+     @classmethod
+     async def get_instance(
+         cls,
+         dim: int = 512,
+         use_onnx: bool = True,
+     ) -> "EmbeddingEngine":
+         """
+         Get or create EmbeddingEngine singleton.
+
+         Args:
+             dim: Embedding dimension for MRL truncation.
+             use_onnx: Whether to attempt ONNX model loading.
+
+         Returns:
+             EmbeddingEngine singleton instance.
+         """
+         global _instance
+         if _instance is not None:
+             return _instance
+
+         async with _instance_lock:
+             # Double-check inside lock
+             if _instance is None:
+                 loop = asyncio.get_event_loop()
+                 _instance = await loop.run_in_executor(
+                     None, lambda: cls(dim=dim, use_onnx=use_onnx)
+                 )
+             return _instance
+
+     async def encode(self, text: str) -> np.ndarray:
+         """
+         Encode text to embedding vector.
+
+         Args:
+             text: Input text string.
+
+         Returns:
+             np.ndarray of shape (dim,) float32, L2-normalized.
+             Uses MRL truncation if self._dim < QEN3_FULL_DIM.
+         """
+         # Check cache
+         text_hash = self._text_to_hash(text)
+         async with self._cache_lock:
+             if text_hash in self._cache:
+                 self._cache.move_to_end(text_hash)
+                 return self._cache[text_hash].copy()
+
+         # Compute embedding
+         if self._onnx_available and self._onnx_session is not None:
+             embedding = await self._encode_onnx(text)
+         else:
+             embedding = await self._encode_fallback(text)
+
+         # L2 normalize
+         norm = np.linalg.norm(embedding)
+         if norm > 0:
+             embedding = embedding / norm
+
+         # Cache result
+         async with self._cache_lock:
+             if len(self._cache) >= LRU_MAX_SIZE:
+                 self._cache.popitem(last=False)
+             self._cache[text_hash] = embedding.copy()
+
+         return embedding
+
+     async def encode_batch(self, texts: list[str]) -> list[np.ndarray]:
+         """
+         Encode batch of texts to embeddings.
+
+         Args:
+             texts: List of text strings.
+
+         Returns:
+             List of np.ndarray embeddings (same length as texts).
+         """
+         if not texts:
+             return []
+         return [await self.encode(t) for t in texts]
+
+     async def simhash(self, token_ids: list[int]) -> int:
+         """
+         Compute 64-bit SimHash for a token sequence.
+
+         Args:
+             token_ids: List of token IDs from Qwen3 tokenizer.
+
+         Returns:
+             64-bit integer SimHash.
+         """
+         loop = asyncio.get_event_loop()
+         return await loop.run_in_executor(None, self._simhash_impl, tuple(token_ids))
+
+     def _simhash_impl(self, token_ids: tuple[int, ...]) -> int:
+         """Compute 64-bit SimHash (sync, runs in executor)."""
+         v = np.zeros(64, dtype=np.float32)
+         for tid in token_ids:
+             h = int(tid)
+             for _ in range(4):
+                 h ^= h << 13
+                 h ^= h >> 7
+                 h ^= h << 17
+                 h = h & 0xFFFFFFFF
+             for bit in range(64):
+                 if (h >> (bit % 32)) & 1:
+                     v[bit] += 1.0
+                 else:
+                     v[bit] -= 1.0
+         bits = (v > 0).astype(np.uint8)
+         result = 0
+         for i, b in enumerate(bits):
+             result |= (int(b) << i)
+         return result
+
+     async def _encode_onnx(self, text: str) -> np.ndarray:
+         """Encode via Qwen3-Embedding-0.6B ONNX model (runs in executor)."""
+         loop = asyncio.get_event_loop()
+         session = self._onnx_session
+         assert session is not None
+         full_embedding = await loop.run_in_executor(None, session.encode, text)
+         if self._dim < QEN3_FULL_DIM:
+             truncated = full_embedding[: self._dim].astype(np.float32)
+             norm = np.linalg.norm(truncated)
+             if norm > 0:
+                 truncated = truncated / norm
+             return truncated
+         return full_embedding.astype(np.float32)
+
+     async def _encode_fallback(self, text: str) -> np.ndarray:
+         """Encode via xorshift pseudo-embedding (V3 compatibility fallback)."""
+         loop = asyncio.get_event_loop()
+         return await loop.run_in_executor(None, self._xorshift_embedding, text)
+
+     def _xorshift_embedding(self, text: str) -> np.ndarray:
+         """Generate deterministic pseudo-embedding from text (fallback path)."""
+         embedding = np.zeros(self._dim, dtype=np.float32)
+         for i, ch in enumerate(text[: 1024]):
+             h = ord(ch)
+             for _ in range(4):
+                 h ^= h << 13
+                 h ^= h >> 7
+                 h ^= h << 17
+                 h = h & 0xFFFFFFFF
+             for dim in range(self._dim):
+                 if (h >> (dim % 32)) & 1:
+                     embedding[dim] += 1.0
+         norm = np.linalg.norm(embedding)
+         if norm > 0:
+             embedding = embedding / norm
+         return embedding
+
+     @staticmethod
+     def _text_to_hash(text: str) -> str:
+         """Stable SHA256 hash of text for cache key."""
+         return hashlib.sha256(text.encode()).hexdigest()[:32]
+
+     @property
+     def dim(self) -> int:
+         return self._dim
+
+     @property
+     def is_onnx_available(self) -> bool:
+         return self._onnx_available
+
+     @property
+     def cache_size(self) -> int:
+         return len(self._cache)
+
+     async def clear_cache(self) -> None:
+         async with self._cache_lock:
+             self._cache.clear()
+
+     async def get_cache_stats(self) -> dict:
+         async with self._cache_lock:
+             return {
+                 "size": len(self._cache),
+                 "max_size": LRU_MAX_SIZE,
+                 "dim": self._dim,
+                 "onnx_available": self._onnx_available,
+             }
+
+     def reset_singleton(self) -> None:
+         """Reset singleton (for testing only)."""
+         global _instance
+         _instance = None
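
Per the module docstring, V4 replaces the V3 xorshift helpers (ContextRegistry._token_ids_to_embedding, AnchorPool._token_ids_to_embedding) with calls into this engine. A minimal sketch of what such a call site could look like; the method name and class body below are hypothetical, not part of this commit, and assume the caller already holds the prompt text and runs in an async context:

    import numpy as np
    from contextforge.embeddings import EmbeddingEngine

    class ContextRegistry:
        async def _embed_prompt(self, prompt_text: str, dim: int = 512) -> np.ndarray:
            # V3: self._token_ids_to_embedding(token_ids)  (xorshift pseudo-embedding)
            # V4: delegate to the shared EmbeddingEngine singleton.
            engine = await EmbeddingEngine.get_instance(dim=dim)
            return await engine.encode(prompt_text)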