| """ |
| Embedding generation module |
| Converts text chunks into vector embeddings |
| """ |
|
|
| import logging |
| from typing import List |
| import numpy as np |
|
|
| try: |
| from sentence_transformers import SentenceTransformer |
| SENTENCE_TRANSFORMERS_AVAILABLE = True |
| except ImportError: |
| SENTENCE_TRANSFORMERS_AVAILABLE = False |
|
|
| from .config import EMBEDDING_MODEL, EMBEDDING_BATCH_SIZE |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
class EmbeddingGenerator:
    """Generates embeddings for text chunks using sentence-transformers.

    Wraps a ``SentenceTransformer`` model and exposes single-text and
    batched encoding, always returning numpy arrays.

    Attributes:
        model_name: Name of the loaded sentence-transformers model.
        model: The underlying ``SentenceTransformer`` instance.
        embedding_dim: Dimensionality of the vectors this model produces.
    """

    def __init__(self, model_name: str = EMBEDDING_MODEL):
        """
        Initialize embedding generator.

        Args:
            model_name: Name of the sentence-transformers model to use.

        Raises:
            ImportError: If the ``sentence-transformers`` package is not
                installed (checked via the module-level availability flag).
        """
        if not SENTENCE_TRANSFORMERS_AVAILABLE:
            raise ImportError(
                "sentence-transformers not installed. "
                "Install with: pip install sentence-transformers"
            )

        # Lazy %-style args: formatting is skipped entirely when the
        # logger's level filters the record out.
        logger.info("Loading embedding model: %s", model_name)
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        self.embedding_dim = self.model.get_sentence_embedding_dimension()
        logger.info(
            "Model loaded successfully. Embedding dimension: %s",
            self.embedding_dim,
        )

    def generate_embedding(self, text: str) -> np.ndarray:
        """
        Generate embedding for a single text.

        Args:
            text: Input text.

        Returns:
            Numpy array of embedding vector (shape ``(embedding_dim,)``).
        """
        return self.model.encode(text, convert_to_numpy=True)

    def generate_embeddings_batch(
        self,
        texts: List[str],
        batch_size: int = EMBEDDING_BATCH_SIZE,
        show_progress: bool = True
    ) -> np.ndarray:
        """
        Generate embeddings for multiple texts efficiently.

        An empty ``texts`` list is safe: ``encode`` simply returns an
        empty array.

        Args:
            texts: List of input texts.
            batch_size: Batch size for processing.
            show_progress: Whether to show progress bar.

        Returns:
            Numpy array of shape ``(len(texts), embedding_dim)``.
        """
        logger.info(
            "Generating embeddings for %d texts in batches of %d",
            len(texts),
            batch_size,
        )

        embeddings = self.model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=show_progress,
            convert_to_numpy=True
        )

        logger.info(
            "Generated %d embeddings of dimension %s",
            len(embeddings),
            self.embedding_dim,
        )

        return embeddings

    def get_embedding_dimension(self) -> int:
        """Get the dimension of embeddings produced by this model."""
        return self.embedding_dim
|
|