| "use strict"; |
| |
| |
| |
| |
| |
| |
| Object.defineProperty(exports, "__esModule", { value: true }); |
| exports.EmbeddingService = exports.LocalNGramProvider = exports.MockEmbeddingProvider = void 0; |
| exports.createEmbeddingService = createEmbeddingService; |
| exports.getDefaultEmbeddingService = getDefaultEmbeddingService; |
| |
| |
| |
/**
 * Produces a short, deterministic, non-cryptographic fingerprint of a string.
 *
 * Each UTF-16 code unit is folded into a signed 32-bit accumulator
 * (`acc * 31 + unit`, wrapped to 32 bits). The accumulator is then rendered in
 * base 36 behind an `h` prefix. Distinct strings can share a fingerprint.
 *
 * @param {string} text - String to fingerprint.
 * @returns {string} `h` followed by the base-36 accumulator (a `-` follows the
 *   prefix when the accumulator is negative).
 */
function hashText(text) {
    let acc = 0;
    for (let pos = 0; pos < text.length; pos += 1) {
        // `| 0` wraps the running value back into the signed 32-bit range.
        acc = (acc * 31 + text.charCodeAt(pos)) | 0;
    }
    const encoded = acc.toString(36);
    return `h${encoded}`;
}
| |
| |
| |
/**
 * Embedding provider that fabricates deterministic pseudo-random unit vectors.
 *
 * The vector depends only on the input text and the configured dimension
 * count, so equal texts always map to equal vectors. It carries no semantic
 * meaning.
 */
class MockEmbeddingProvider {
    /**
     * @param {number} [dimensions=384] - Number of components per vector.
     */
    constructor(dimensions = 384) {
        this.name = 'mock';
        this.dimensions = dimensions;
    }

    /**
     * Generates one vector per input text.
     *
     * @param {string[]} texts - Texts to embed.
     * @returns {Promise<number[][]>} Vectors in the same order as `texts`.
     */
    async embed(texts) {
        const size = this.dimensions;
        return texts.map((text) => {
            // Fold the text into a signed 32-bit seed.
            let state = 0;
            for (let pos = 0; pos < text.length; pos += 1) {
                state = (state * 31 + text.charCodeAt(pos)) | 0;
            }

            // Step a linear-congruential style generator once per component and
            // accumulate the squared length while the raw values are produced.
            const raw = [];
            let sumSquares = 0;
            for (let slot = 0; slot < size; slot += 1) {
                state = (state * 1103515245 + 12345) | 0;
                const value = (state % 1000) / 1000 - 0.5;
                raw.push(value);
                sumSquares += value * value;
            }

            // Scale to unit length; a zero-length vector is returned unscaled.
            const scale = Math.sqrt(sumSquares) || 1;
            return raw.map((value) => value / scale);
        });
    }

    /**
     * @returns {number} Number of components in each generated vector.
     */
    getDimensions() {
        return this.dimensions;
    }
}
| exports.MockEmbeddingProvider = MockEmbeddingProvider; |
| |
| |
| |
| |
/**
 * Dependency-free embedding provider based on hashed character n-grams.
 *
 * The text is lower-cased and every character outside `a-z`/`0-9` is replaced
 * by a space. Each window of `ngramSize` characters is hashed into one of
 * `dimensions` buckets, and the bucket vector is scaled to unit length.
 */
class LocalNGramProvider {
    /**
     * @param {number} [dimensions=256] - Number of buckets (vector length).
     * @param {number} [ngramSize=3] - Characters per n-gram window.
     */
    constructor(dimensions = 256, ngramSize = 3) {
        this.name = 'local-ngram';
        this.dimensions = dimensions;
        this.ngramSize = ngramSize;
    }

    /**
     * Embeds every text independently.
     *
     * @param {string[]} texts - Texts to embed.
     * @returns {Promise<number[][]>} Vectors in the same order as `texts`.
     */
    async embed(texts) {
        const vectors = texts.map((text) => this.embedSingle(text));
        return vectors;
    }

    /**
     * Builds the bucket vector for one text.
     *
     * A text shorter than `ngramSize` contributes no n-grams and therefore
     * yields an all-zero vector.
     *
     * @param {string} text - Text to embed.
     * @returns {number[]} Vector of length `dimensions`.
     */
    embedSingle(text) {
        const { dimensions, ngramSize } = this;
        const buckets = new Array(dimensions).fill(0);
        const cleaned = text.toLowerCase().replace(/[^a-z0-9]/g, ' ');

        const lastStart = cleaned.length - ngramSize;
        for (let start = 0; start <= lastStart; start += 1) {
            const code = this.hashNgram(cleaned.slice(start, start + ngramSize));
            // The hash picks the bucket; its sign picks the vote direction.
            const slot = Math.abs(code) % dimensions;
            buckets[slot] += code > 0 ? 1 : -1;
        }

        let sumSquares = 0;
        for (const weight of buckets) {
            sumSquares += weight * weight;
        }
        // An all-zero vector is returned unscaled.
        const scale = Math.sqrt(sumSquares) || 1;
        return buckets.map((weight) => weight / scale);
    }

    /**
     * Folds an n-gram into a signed 32-bit integer.
     *
     * @param {string} ngram - Characters to hash.
     * @returns {number} Signed 32-bit hash value.
     */
    hashNgram(ngram) {
        let acc = 0;
        for (let pos = 0; pos < ngram.length; pos += 1) {
            acc = (acc * 31 + ngram.charCodeAt(pos)) | 0;
        }
        return acc;
    }

    /**
     * @returns {number} Number of components in each generated vector.
     */
    getDimensions() {
        return this.dimensions;
    }
}
| exports.LocalNGramProvider = LocalNGramProvider; |
| |
| |
| |
/**
 * Front door for text embeddings: keeps a registry of named providers, sends
 * texts to them in batches, and memoizes the resulting vectors in memory.
 *
 * Cache entries are stored under `<provider name>:<hashText(text)>` and also
 * remember the text they were computed for. The 32-bit hash is not unique, so
 * a lookup only counts as a hit when the stored text matches exactly.
 */
class EmbeddingService {
    /**
     * @param {object} [config] - Optional overrides.
     * @param {string} [config.defaultProvider='local-ngram'] - Provider used when none is named.
     * @param {number} [config.maxCacheSize=10000] - Maximum number of cached vectors;
     *   zero or a negative value disables caching.
     * @param {number} [config.cacheTtl=3600000] - Entry lifetime in milliseconds.
     * @param {number} [config.batchSize=32] - Maximum texts per provider call.
     * @throws {RangeError} If `batchSize` is not a positive number.
     */
    constructor(config = {}) {
        this.providers = new Map();
        this.cache = new Map();
        this.config = {
            defaultProvider: config.defaultProvider ?? 'local-ngram',
            maxCacheSize: config.maxCacheSize ?? 10000,
            cacheTtl: config.cacheTtl ?? 3600000,
            batchSize: config.batchSize ?? 32,
        };

        // A zero, negative or NaN step would make the batching loop in embed()
        // spin forever or hand back unfilled results, so reject it up front.
        const { batchSize } = this.config;
        if (typeof batchSize !== 'number' || !(batchSize > 0)) {
            throw new RangeError(`batchSize must be a positive number, got: ${batchSize}`);
        }

        // Built-in providers are always available.
        this.registerProvider(new LocalNGramProvider());
        this.registerProvider(new MockEmbeddingProvider());
    }

    /**
     * Adds a provider to the registry, replacing any provider with the same name.
     *
     * @param {{name: string, embed: Function, getDimensions: Function}} provider
     */
    registerProvider(provider) {
        this.providers.set(provider.name, provider);
    }

    /**
     * Looks up a provider by name.
     *
     * @param {string} [name] - Provider name; the configured default when omitted.
     * @returns {object} The registered provider.
     * @throws {Error} If no provider is registered under that name.
     */
    getProvider(name) {
        const providerName = name ?? this.config.defaultProvider;
        const provider = this.providers.get(providerName);
        if (!provider) {
            throw new Error(`Provider not found: ${providerName}`);
        }
        return provider;
    }

    /**
     * Embeds a list of texts, serving fresh cache entries where possible and
     * sending the remainder to the provider in batches of `config.batchSize`.
     *
     * @param {string[]} texts - Texts to embed.
     * @param {string} [provider] - Provider name; the configured default when omitted.
     * @returns {Promise<number[][]>} One vector per text, in input order. Cached
     *   vectors are returned by reference.
     * @throws {Error} If the provider is unknown or returns a different number
     *   of vectors than it was given texts.
     */
    async embed(texts, provider) {
        const providerInstance = this.getProvider(provider);
        const providerName = providerInstance.name;
        const now = Date.now();

        const results = new Array(texts.length).fill(null);
        const uncachedIndices = [];
        const uncachedTexts = [];
        for (let i = 0; i < texts.length; i++) {
            const text = texts[i];
            const cached = this.cache.get(`${providerName}:${hashText(text)}`);
            // Two different texts can share a hash; only trust an entry that was
            // stored for exactly this text and has not outlived its TTL.
            const isHit = cached !== undefined
                && cached.text === text
                && now - cached.timestamp < this.config.cacheTtl;
            if (isHit) {
                results[i] = cached.embedding;
                cached.hits++;
            }
            else {
                uncachedIndices.push(i);
                uncachedTexts.push(text);
            }
        }

        const { batchSize } = this.config;
        // Number of uncached texts already handled by earlier batches.
        let batchOffset = 0;
        for (let start = 0; start < uncachedTexts.length; start += batchSize) {
            const batch = uncachedTexts.slice(start, start + batchSize);
            const embeddings = await providerInstance.embed(batch);
            if (embeddings.length !== batch.length) {
                throw new Error(`Provider "${providerName}" returned ${embeddings.length} embeddings for ${batch.length} texts`);
            }
            for (let j = 0; j < embeddings.length; j++) {
                const originalIndex = uncachedIndices[batchOffset + j];
                const text = texts[originalIndex];
                results[originalIndex] = embeddings[j];
                this.addToCache(`${providerName}:${hashText(text)}`, embeddings[j], now, text);
            }
            batchOffset += batch.length;
        }
        return results;
    }

    /**
     * Embeds a single text.
     *
     * @param {string} text - Text to embed.
     * @param {string} [provider] - Provider name; the configured default when omitted.
     * @returns {Promise<number[]>} The vector for `text`.
     */
    async embedOne(text, provider) {
        const [embedding] = await this.embed([text], provider);
        return embedding;
    }

    /**
     * Stores a vector in the cache, evicting one entry first when a new key
     * would exceed `config.maxCacheSize`. The victim is the entry with the
     * fewest hits; ties go to the oldest timestamp.
     *
     * @param {string} key - Cache key.
     * @param {number[]} embedding - Vector to store.
     * @param {number} timestamp - Creation time in milliseconds.
     * @param {string} [text] - Text the vector was computed for. `embed()` only
     *   serves entries whose stored text matches the requested text.
     */
    addToCache(key, embedding, timestamp, text) {
        if (this.config.maxCacheSize <= 0) {
            // Caching is disabled; storing anything would exceed the limit.
            return;
        }
        // Overwriting an existing key does not grow the cache, so eviction is
        // only needed when a genuinely new key arrives at capacity.
        if (!this.cache.has(key) && this.cache.size >= this.config.maxCacheSize) {
            let victimKey;
            let victimTime = Infinity;
            let victimHits = Infinity;
            for (const [candidateKey, entry] of this.cache.entries()) {
                const isColder = entry.hits < victimHits
                    || (entry.hits === victimHits && entry.timestamp < victimTime);
                if (isColder) {
                    victimKey = candidateKey;
                    victimTime = entry.timestamp;
                    victimHits = entry.hits;
                }
            }
            if (victimKey !== undefined) {
                this.cache.delete(victimKey);
            }
        }
        this.cache.set(key, { embedding, timestamp, hits: 0, text });
    }

    /**
     * Cosine similarity of two vectors.
     *
     * @param {number[]} a - First vector.
     * @param {number[]} b - Second vector.
     * @returns {number} Similarity, or 0 when either vector has zero length.
     * @throws {Error} If the vectors differ in length.
     */
    cosineSimilarity(a, b) {
        if (a.length !== b.length) {
            throw new Error('Embeddings must have same dimensions');
        }
        let dotProduct = 0;
        let normA = 0;
        let normB = 0;
        for (let i = 0; i < a.length; i++) {
            dotProduct += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        const denom = Math.sqrt(normA) * Math.sqrt(normB);
        return denom === 0 ? 0 : dotProduct / denom;
    }

    /**
     * Ranks corpus texts by cosine similarity to a query.
     *
     * @param {string} query - Text to compare against.
     * @param {string[]} corpus - Candidate texts.
     * @param {number} [k=5] - Maximum number of results.
     * @param {string} [provider] - Provider name; the configured default when omitted.
     * @returns {Promise<Array<{text: string, similarity: number, index: number}>>}
     *   The top `k` candidates, most similar first; `index` is the position in `corpus`.
     */
    async findSimilar(query, corpus, k = 5, provider) {
        // Embed the query together with the corpus so they share one batched call path.
        const [queryEmbed, ...corpusEmbeds] = await this.embed([query, ...corpus], provider);
        const results = corpusEmbeds.map((embed, i) => ({
            text: corpus[i],
            similarity: this.cosineSimilarity(queryEmbed, embed),
            index: i,
        }));
        return results
            .sort((a, b) => b.similarity - a.similarity)
            .slice(0, k);
    }

    /**
     * Summarizes cache usage.
     *
     * @returns {{size: number, maxSize: number, hitRate: number}} `hitRate` is
     *   the average number of hits per cached entry (total hits divided by
     *   entry count), so it can exceed 1; it is 0 for an empty cache.
     */
    getCacheStats() {
        let totalHits = 0;
        for (const entry of this.cache.values()) {
            totalHits += entry.hits;
        }
        return {
            size: this.cache.size,
            maxSize: this.config.maxCacheSize,
            hitRate: this.cache.size > 0 ? totalHits / this.cache.size : 0,
        };
    }

    /**
     * Drops every cached vector.
     */
    clearCache() {
        this.cache.clear();
    }

    /**
     * @param {string} [provider] - Provider name; the configured default when omitted.
     * @returns {number} Vector length reported by that provider.
     * @throws {Error} If the provider is unknown.
     */
    getDimensions(provider) {
        return this.getProvider(provider).getDimensions();
    }

    /**
     * @returns {string[]} Names of all registered providers, in registration order.
     */
    listProviders() {
        return Array.from(this.providers.keys());
    }
}
| exports.EmbeddingService = EmbeddingService; |
| |
| |
| |
/**
 * Builds a new, independent EmbeddingService.
 *
 * @param {object} [config] - Passed unchanged to the EmbeddingService constructor.
 * @returns {EmbeddingService} A freshly constructed service.
 */
function createEmbeddingService(config) {
    const service = new EmbeddingService(config);
    return service;
}
| |
// Shared service instance; stays null until the first request for it.
let defaultService = null;

/**
 * Returns the module-wide EmbeddingService, constructing it with default
 * settings on first use. Later calls return that same instance.
 *
 * @returns {EmbeddingService} The shared service.
 */
function getDefaultEmbeddingService() {
    if (defaultService) {
        return defaultService;
    }
    defaultService = new EmbeddingService();
    return defaultService;
}
| exports.default = { |
| EmbeddingService, |
| LocalNGramProvider, |
| MockEmbeddingProvider, |
| createEmbeddingService, |
| getDefaultEmbeddingService, |
| }; |
|
|