| |
| """ |
| Specialized benchmark script for measuring the effectiveness of semantic deduplication |
| in the efficient-context library. |
| """ |
|
|
| import logging |
| import time |
| import argparse |
| import sys |
| from typing import List, Dict, Any |
|
|
| |
# All benchmark output is reported through logging at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# The library under test is required for every benchmark step, so a failed
# import is logged and the process exits with a non-zero status.
try:
    from efficient_context import ContextManager
    from efficient_context.compression import SemanticDeduplicator
    from efficient_context.chunking import SemanticChunker
    from efficient_context.retrieval import CPUOptimizedRetriever
except ImportError as e:
    # Lazy %-style arguments: the logging module formats the message itself.
    logger.error("Failed to import efficient_context: %s", e)
    sys.exit(1)
|
|
def generate_repetitive_document(repetitions: int = 3) -> str:
    """
    Generate a document with deliberate semantic repetition.

    Each base paragraph restates a single idea (climate change, renewable
    energy, artificial intelligence) in several differently worded sentences.
    The full set of paragraphs is then repeated verbatim, so the resulting
    text contains both paraphrased and exact duplicates.

    Args:
        repetitions: How many times the set of three base paragraphs is
            repeated. Defaults to 3. A value of 0 yields an empty string.

    Returns:
        The repeated paragraphs, separated from each other by a blank line.

    Raises:
        ValueError: If ``repetitions`` is negative.
    """
    if repetitions < 0:
        raise ValueError(f"repetitions must be non-negative, got {repetitions}")

    base_paragraphs = [
        # Climate change, expressed in several equivalent ways.
        """
        Climate change is a significant and lasting alteration in the statistical distribution of weather
        patterns over periods ranging from decades to millions of years. Global warming is the long-term
        heating of Earth's climate system observed since the pre-industrial period due to human activities.
        The rise in global temperature is causing substantial changes in our environment and ecosystems.
        The warming of the planet is leading to significant transformations in weather patterns worldwide.
        Human activities are causing Earth's temperature to increase, resulting in climate modifications.
        The climate crisis is fundamentally altering the Earth's atmosphere and affecting all living things.
        """,
        # Renewable energy, expressed in several equivalent ways.
        """
        Renewable energy comes from sources that are naturally replenishing but flow-limited.
        Clean energy is derived from natural processes that are constantly replenished.
        Sustainable power is generated from resources that won't deplete over time.
        Green energy utilizes sources that don't produce pollution when generating power.
        Alternative energy refers to sources that are an alternative to fossil fuel.
        Eco-friendly power generation relies on inexhaustible natural resources.
        """,
        # Artificial intelligence, expressed in several equivalent ways.
        """
        Artificial intelligence is revolutionizing how we interact with technology.
        Machine learning is transforming the way computers process information.
        AI is fundamentally changing our relationship with digital systems.
        Smart algorithms are reshaping our technological landscape dramatically.
        Computational intelligence is altering how machines solve complex problems.
        Neural networks are revolutionizing the capabilities of modern computers.
        """
    ]

    # List repetition keeps the paragraph order A, B, C, A, B, C, ...
    return "\n\n".join(base_paragraphs * repetitions)
|
|
def generate_mixed_document() -> str:
    """
    Build a document that mixes repetitive and non-repetitive content.

    The result is the text returned by generate_repetitive_document(),
    followed by a blank-line separator and two paragraphs (energy efficiency
    and biodiversity) that each appear only once.
    """
    non_repeating_text = """
    Energy efficiency is the goal to reduce the amount of energy required to provide products and services.
    For example, insulating a home allows a building to use less heating and cooling energy to achieve and
    maintain a comfortable temperature. Installing LED bulbs, fluorescent lighting, or natural skylights reduces
    the amount of energy required to attain the same level of illumination compared with using traditional
    incandescent light bulbs. Improvements in energy efficiency are generally achieved by adopting a more
    efficient technology or production process or by application of commonly accepted methods to reduce energy
    losses.

    Biodiversity is the variety and variability of life on Earth. It is typically a measure of variation at the
    genetic, species, and ecosystem level. Terrestrial biodiversity is usually greater near the equator, which is
    the result of the warm climate and high primary productivity. Biodiversity is not distributed evenly on Earth,
    and is richer in the tropics. These tropical forest ecosystems cover less than 10% of earth's surface, and
    contain about 90% of the world's species. Marine biodiversity is usually highest along coasts in the Western
    Pacific, where sea surface temperature is highest, and in the mid-latitudinal band in all oceans.
    """

    # Repetitive section first, then the unique section, one blank line apart.
    sections = [generate_repetitive_document(), non_repeating_text]
    return "\n\n".join(sections)
|
|
# NOTE(review): a function with this exact name is already defined earlier in
# this file. Being the later definition, this is the one the module-level name
# refers to once the file has been executed. Consider keeping a single copy.
def generate_repetitive_document(repetitions: int = 3) -> str:
    """
    Generate a document with deliberate semantic repetition.

    Each base paragraph restates a single idea (climate change, renewable
    energy, artificial intelligence) in several differently worded sentences.
    The full set of paragraphs is then repeated verbatim, so the resulting
    text contains both paraphrased and exact duplicates.

    Args:
        repetitions: How many times the set of three base paragraphs is
            repeated. Defaults to 3. A value of 0 yields an empty string.

    Returns:
        The repeated paragraphs, separated from each other by a blank line.

    Raises:
        ValueError: If ``repetitions`` is negative.
    """
    if repetitions < 0:
        raise ValueError(f"repetitions must be non-negative, got {repetitions}")

    base_paragraphs = [
        # Climate change, expressed in several equivalent ways.
        """
        Climate change is a significant and lasting alteration in the statistical distribution of weather
        patterns over periods ranging from decades to millions of years. Global warming is the long-term
        heating of Earth's climate system observed since the pre-industrial period due to human activities.
        The rise in global temperature is causing substantial changes in our environment and ecosystems.
        The warming of the planet is leading to significant transformations in weather patterns worldwide.
        Human activities are causing Earth's temperature to increase, resulting in climate modifications.
        The climate crisis is fundamentally altering the Earth's atmosphere and affecting all living things.
        """,
        # Renewable energy, expressed in several equivalent ways.
        """
        Renewable energy comes from sources that are naturally replenishing but flow-limited.
        Clean energy is derived from natural processes that are constantly replenished.
        Sustainable power is generated from resources that won't deplete over time.
        Green energy utilizes sources that don't produce pollution when generating power.
        Alternative energy refers to sources that are an alternative to fossil fuel.
        Eco-friendly power generation relies on inexhaustible natural resources.
        """,
        # Artificial intelligence, expressed in several equivalent ways.
        """
        Artificial intelligence is revolutionizing how we interact with technology.
        Machine learning is transforming the way computers process information.
        AI is fundamentally changing our relationship with digital systems.
        Smart algorithms are reshaping our technological landscape dramatically.
        Computational intelligence is altering how machines solve complex problems.
        Neural networks are revolutionizing the capabilities of modern computers.
        """
    ]

    # List repetition keeps the paragraph order A, B, C, A, B, C, ...
    return "\n\n".join(base_paragraphs * repetitions)
|
|
# NOTE(review): a function with this exact name is already defined earlier in
# this file. Being the later definition, this is the one the module-level name
# refers to once the file has been executed. Consider keeping a single copy.
def generate_mixed_document() -> str:
    """
    Build a document that mixes repetitive and non-repetitive content.

    The result is the text returned by generate_repetitive_document(),
    followed by a blank-line separator and two paragraphs (energy efficiency
    and biodiversity) that each appear only once.
    """
    non_repeating_text = """
    Energy efficiency is the goal to reduce the amount of energy required to provide products and services.
    For example, insulating a home allows a building to use less heating and cooling energy to achieve and
    maintain a comfortable temperature. Installing LED bulbs, fluorescent lighting, or natural skylights reduces
    the amount of energy required to attain the same level of illumination compared with using traditional
    incandescent light bulbs. Improvements in energy efficiency are generally achieved by adopting a more
    efficient technology or production process or by application of commonly accepted methods to reduce energy
    losses.

    Biodiversity is the variety and variability of life on Earth. It is typically a measure of variation at the
    genetic, species, and ecosystem level. Terrestrial biodiversity is usually greater near the equator, which is
    the result of the warm climate and high primary productivity. Biodiversity is not distributed evenly on Earth,
    and is richer in the tropics. These tropical forest ecosystems cover less than 10% of earth's surface, and
    contain about 90% of the world's species. Marine biodiversity is usually highest along coasts in the Western
    Pacific, where sea surface temperature is highest, and in the mid-latitudinal band in all oceans.
    """

    # Repetitive section first, then the unique section, one blank line apart.
    sections = [generate_repetitive_document(), non_repeating_text]
    return "\n\n".join(sections)
|
|
def _compression_ratio(result: Dict[str, Any]) -> float:
    """Return context size divided by original size (0.0 if the original is empty)."""
    original_size = result["original_size"]
    return result["context_size"] / original_size if original_size else 0.0


def _benchmark_document(
    threshold: float, document: str, document_type: str, query: str
) -> Dict[str, Any]:
    """Time ingestion and context generation for one document at one threshold, then log and return the result."""
    # A new manager is built for every measurement, so each run starts from a
    # manager that has had no other document added to it.
    context_manager = ContextManager(
        compressor=SemanticDeduplicator(threshold=threshold),
        chunker=SemanticChunker(chunk_size=256),
        retriever=CPUOptimizedRetriever(embedding_model="lightweight"),
    )

    logger.info(f"Processing {document_type} document...")

    # perf_counter is monotonic and high-resolution; time.time() can jump if
    # the system clock is adjusted during the run.
    started = time.perf_counter()
    context_manager.add_document(document)
    processing_time = time.perf_counter() - started

    started = time.perf_counter()
    context = context_manager.generate_context(query)
    query_time = time.perf_counter() - started

    # Sizes are whitespace-delimited word counts.
    result = {
        "threshold": threshold,
        "document_type": document_type,
        "original_size": len(document.split()),
        "context_size": len(context.split()),
        "processing_time": processing_time,
        "query_time": query_time,
        "chunks": len(context_manager.chunks),
    }

    logger.info(f" - Original size: {result['original_size']} words")
    logger.info(f" - Context size: {result['context_size']} words")
    logger.info(f" - Compression ratio: {_compression_ratio(result):.2f}")
    logger.info(f" - Processing time: {result['processing_time']:.4f} seconds")
    logger.info(f" - Query time: {result['query_time']:.4f} seconds")
    return result


def run_deduplication_benchmark() -> None:
    """
    Run a benchmark specifically testing the semantic deduplication capabilities.

    For each similarity threshold, a repetitive document and a mixed document
    are each added to a newly constructed ContextManager and queried once.
    Word counts, elapsed times and the manager's chunk count are collected per
    run, logged immediately, and summarised per document type at the end.
    Results are only logged; nothing is returned.
    """
    logger.info("Starting deduplication benchmark")

    thresholds = [0.7, 0.8, 0.85, 0.9, 0.95]
    results: List[Dict[str, Any]] = []

    repetitive_doc = generate_repetitive_document()
    mixed_doc = generate_mixed_document()

    logger.info(f"Repetitive document size: {len(repetitive_doc.split())} words")
    logger.info(f"Mixed document size: {len(mixed_doc.split())} words")

    # (document type, text, query) for each case measured at every threshold.
    cases = [
        ("repetitive", repetitive_doc, "Tell me about climate change and renewable energy"),
        ("mixed", mixed_doc, "Tell me about climate change and biodiversity"),
    ]

    for threshold in thresholds:
        logger.info(f"\nTesting with threshold: {threshold}")
        for document_type, document, query in cases:
            results.append(_benchmark_document(threshold, document, document_type, query))

    logger.info("\nDeduplication Benchmark Summary:")
    logger.info("-----------------------------------")

    for document_type, heading in (("repetitive", "Repetitive"), ("mixed", "Mixed")):
        logger.info(f"\n{heading} Document Results:")
        for result in results:
            if result["document_type"] != document_type:
                continue
            logger.info(f"Threshold {result['threshold']}: {_compression_ratio(result):.2f} compression ratio, {result['processing_time']:.4f}s processing time")
|
|
def main() -> None:
    """Main function for the deduplication benchmark script.

    Builds the command-line parser, parses the arguments, and runs the
    deduplication benchmark.
    """
    parser = argparse.ArgumentParser(description="Benchmark efficient-context's semantic deduplication")

    # No options are defined yet, so the parsed namespace is not needed; the
    # call is kept because it handles -h/--help and rejects unexpected arguments.
    parser.parse_args()
    run_deduplication_benchmark()


if __name__ == "__main__":
    main()
|
|