| """ |
| Benchmarking script for efficient-context performance. |
| """ |
|
|
| import logging |
| import time |
| import argparse |
| import random |
| import string |
| import psutil |
| import os |
| import gc |
| from typing import List, Dict, Any |
|
|
| from efficient_context import ContextManager |
| from efficient_context.compression import SemanticDeduplicator |
| from efficient_context.chunking import SemanticChunker |
| from efficient_context.retrieval import CPUOptimizedRetriever |
|
|
| |
# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
# Configure the root logger at INFO so the benchmark's progress messages are emitted.
logging.basicConfig(level=logging.INFO)
|
|
# Vocabulary sampled for most generated words: frequent English words plus a
# handful of energy/climate terms. Built once instead of on every call.
_COMMON_WORDS = (
    "the", "be", "to", "of", "and", "a", "in", "that", "have", "I",
    "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
    "this", "but", "his", "by", "from", "they", "we", "say", "her", "she",
    "or", "an", "will", "my", "one", "all", "would", "there", "their", "what",
    "so", "up", "out", "if", "about", "who", "get", "which", "go", "me",
    "renewable", "energy", "climate", "wind", "solar", "power", "change", "global",
    "sustainable", "resources", "efficiency", "emissions", "carbon", "technology",
)


def generate_random_text(words: int = 1000, paragraphs: int = 5) -> str:
    """
    Generate random text for benchmarking.

    Roughly 90% of the words are drawn from a fixed vocabulary and 10% are
    random lowercase strings of 3-10 letters. Sentences are ended at random
    intervals, the first word of every sentence is capitalized, and every
    paragraph ends with a period.

    Args:
        words: Total number of words to generate. Values <= 0 yield an
            empty string.
        paragraphs: Number of paragraphs to split the text into. Clamped to
            the range [1, words] so that no paragraph is ever empty.

    Returns:
        text: Generated random text containing exactly ``words`` words, with
            paragraphs separated by a blank line.
    """
    if words <= 0:
        return ""

    # A paragraph needs at least one word, and at least one paragraph is
    # needed to hold the words; this avoids both a division by zero and
    # indexing into an empty paragraph.
    paragraph_count = max(1, min(paragraphs, words))
    base_length, remainder = divmod(words, paragraph_count)

    result = []
    for i in range(paragraph_count):
        # The first `remainder` paragraphs take one extra word so the total
        # is exactly `words` rather than being rounded down.
        paragraph_length = base_length + (1 if i < remainder else 0)

        paragraph_words = []
        for j in range(paragraph_length):
            # Occasionally inject a made-up word so the text is not limited
            # to the fixed vocabulary.
            if random.random() < 0.1:
                word = ''.join(
                    random.choice(string.ascii_lowercase)
                    for _ in range(random.randint(3, 10))
                )
            else:
                word = random.choice(_COMMON_WORDS)

            # Capitalize the first word of the paragraph and of each sentence.
            if j == 0 or paragraph_words[-1].endswith('.'):
                word = word.capitalize()

            # End a sentence at a randomly varying interval; otherwise
            # sprinkle in the occasional comma.
            if j > 0 and j % random.randint(8, 15) == 0:
                word += '.'
            elif random.random() < 0.05:
                word += ','

            paragraph_words.append(word)

        # Close the paragraph with a period, replacing a dangling comma
        # instead of producing ",.".
        if not paragraph_words[-1].endswith('.'):
            paragraph_words[-1] = paragraph_words[-1].rstrip(',') + '.'

        result.append(' '.join(paragraph_words))

    return '\n\n'.join(result)
|
|
def get_memory_usage() -> Dict[str, Any]:
    """
    Report the memory footprint of the current process.

    Returns:
        stats: Dictionary with the keys "rss" and "vms", holding the values
            reported by psutil's memory_info() for this process, each
            divided by 1024 * 1024.
    """
    bytes_per_mb = 1024 * 1024
    usage = psutil.Process(os.getpid()).memory_info()

    stats = {}
    stats["rss"] = usage.rss / bytes_per_mb
    stats["vms"] = usage.vms / bytes_per_mb
    return stats
|
|
def run_benchmark(
    num_documents: int = 10,
    words_per_document: int = 1000,
    num_queries: int = 5
) -> None:
    """
    Run a benchmark of efficient-context performance.

    Builds a ContextManager, generates random documents, times how long
    add_documents() takes (and how much the process RSS grows), then times
    generate_context() for a set of queries. All results are written to the
    module logger; nothing is returned.

    Args:
        num_documents: Number of documents to process
        words_per_document: Number of words per document
        num_queries: Number of queries to run. Negative values are treated
            as 0.
    """
    logger.info(f"Starting benchmark with {num_documents} documents, {words_per_document} words each")

    # Component configuration under test.
    context_manager = ContextManager(
        compressor=SemanticDeduplicator(threshold=0.85),
        chunker=SemanticChunker(chunk_size=256),
        retriever=CPUOptimizedRetriever(embedding_model="lightweight")
    )

    # Build the corpus up front so generation cost is excluded from timing.
    logger.info("Generating random documents...")
    documents = [
        {
            "content": generate_random_text(words=words_per_document, paragraphs=5),
            "metadata": {"id": f"doc-{i}", "source": "benchmark"}
        }
        for i in range(num_documents)
    ]

    # Time document ingestion. perf_counter() is used instead of time() because
    # it is monotonic and high-resolution, so the measured interval cannot be
    # distorted by system clock adjustments.
    logger.info("Adding documents to context manager...")
    start_mem = get_memory_usage()
    start_time = time.perf_counter()

    context_manager.add_documents(documents)

    processing_time = time.perf_counter() - start_time
    end_mem = get_memory_usage()

    memory_increase = end_mem["rss"] - start_mem["rss"]
    # Guard the average so a run with zero documents does not divide by zero.
    time_per_document = processing_time / num_documents if num_documents > 0 else 0.0

    logger.info("Document processing:")
    logger.info(f" - Time: {processing_time:.2f} seconds")
    logger.info(f" - Average per document: {time_per_document:.4f} seconds")
    logger.info(f" - Memory usage increase: {memory_increase:.2f} MB")
    logger.info(f" - Total chunks created: {len(context_manager.chunks)}")

    # Five templated queries with randomly chosen wording.
    logger.info("Generating context for queries...")
    queries = [
        f"Explain {random.choice(['renewable', 'sustainable', 'clean', 'alternative'])} energy",
        f"What are the {random.choice(['benefits', 'advantages', 'impacts', 'effects'])} of {random.choice(['solar', 'wind', 'hydro', 'geothermal'])} power?",
        f"How does {random.choice(['climate change', 'global warming', 'carbon emissions', 'greenhouse gases'])} affect the environment?",
        f"Discuss the {random.choice(['future', 'potential', 'limitations', 'challenges'])} of renewable energy",
        f"What is the {random.choice(['relationship', 'connection', 'link', 'correlation'])} between energy consumption and climate change?"
    ]

    # random.sample() rejects a negative sample size, so clamp the request.
    requested_queries = max(num_queries, 0)

    # Pad with generic queries when more than the five templates are requested.
    while len(queries) < requested_queries:
        queries.append(f"Tell me about {random.choice(['energy', 'climate', 'sustainability', 'emissions'])}")

    # Pick the requested number of queries in random order.
    selected_queries = random.sample(queries, min(requested_queries, len(queries)))

    total_query_time = 0
    total_query_tokens = 0

    for i, query in enumerate(selected_queries):
        # Collect garbage before timing so a GC pause triggered by earlier
        # work is less likely to land inside the measured interval.
        gc.collect()

        start_time = time.perf_counter()
        context = context_manager.generate_context(query)
        query_time = time.perf_counter() - start_time
        # "Tokens" here are whitespace-separated words of the returned context
        # (assumes generate_context returns a str — TODO confirm).
        context_tokens = len(context.split())

        total_query_time += query_time
        total_query_tokens += context_tokens

        logger.info(f"Query {i+1}: '{query}'")
        logger.info(f" - Time: {query_time:.4f} seconds")
        logger.info(f" - Context size: {context_tokens} tokens")

    # Average over the queries actually executed; avoid dividing by zero when
    # no queries were run.
    executed_queries = len(selected_queries)
    avg_query_time = total_query_time / executed_queries if executed_queries else 0.0
    avg_tokens = total_query_tokens / executed_queries if executed_queries else 0.0

    logger.info("\nBenchmark Summary:")
    logger.info(f" - Documents processed: {num_documents} ({words_per_document} words each)")
    logger.info(f" - Queries executed: {executed_queries}")
    logger.info(f" - Document processing time: {processing_time:.2f} seconds ({time_per_document:.4f}s per document)")
    logger.info(f" - Average query time: {avg_query_time:.4f} seconds")
    logger.info(f" - Average context size: {avg_tokens:.1f} tokens")
    logger.info(f" - Final memory usage: {get_memory_usage()['rss']:.2f} MB")
|
|
def main():
    """Parse the command-line options and run the benchmark with them."""
    cli = argparse.ArgumentParser(description="Benchmark efficient-context performance")

    # (flag, default, help text) for each integer option, in declaration order.
    option_specs = (
        ("--documents", 10, "Number of documents to process"),
        ("--words", 1000, "Words per document"),
        ("--queries", 5, "Number of queries to run"),
    )
    for flag, default_value, help_text in option_specs:
        cli.add_argument(flag, type=int, default=default_value, help=help_text)

    options = cli.parse_args()

    run_benchmark(
        num_documents=options.documents,
        words_per_document=options.words,
        num_queries=options.queries,
    )


# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|