"""
Integrated Crossword Generator
Combines thematic word discovery with API-based clue generation for complete crossword creation.

This system integrates:
- UnifiedThematicWordGenerator: Smart word discovery using semantic embeddings
- APIClueGenerator: High-quality clue generation using multiple AI models

Creates a complete crossword generation pipeline with both intelligent word selection
and professional-quality clues.
"""
|
|
import sys
import os
import time
import logging
import asyncio
from typing import List, Dict, Optional, Tuple, Any
from pathlib import Path
from dataclasses import dataclass

# Put this file's directory first on sys.path so the two generator modules
# imported below resolve to the copies that live next to this file.
sys.path.insert(0, str(Path(__file__).parent))

# Both generators are optional: a failed import is reported on stdout and
# recorded in a module-level flag instead of aborting the import of this module.
try:
    from thematic_word_generator import UnifiedThematicWordGenerator
    THEMATIC_AVAILABLE = True
except ImportError as e:
    print(f"β Thematic generator import error: {e}")
    THEMATIC_AVAILABLE = False

try:
    from api_clue_generator import APIClueGenerator
    API_AVAILABLE = True
except ImportError as e:
    print(f"β API generator import error: {e}")
    API_AVAILABLE = False

# NOTE: basicConfig is a no-op when the root logger already has handlers, so an
# embedding application that configured logging earlier keeps its own setup.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|
|
|
@dataclass
class CrosswordEntry:
    """Complete crossword entry with word, clue, and metadata."""
    # Answer word; IntegratedCrosswordGenerator stores it upper-cased.
    word: str
    # Clue text shown to the solver.
    clue: str
    # Topic the word was generated for.
    topic: str
    # Similarity to the topic as returned by the difficulty filter
    # (already weighted by the frequency-tier preference).
    similarity_score: float
    # Raw frequency-tier key, e.g. "tier_5_common".
    frequency_tier: str
    # Human-readable description of the tier (falls back to the raw key).
    tier_description: str
    # Quality label of the clue, e.g. "EXCELLENT", "GOOD", or "BASIC" for
    # template clues.
    clue_quality: str
    # Key of the model that produced the clue, or "template" for fallbacks.
    clue_model: str
|
|
|
class IntegratedCrosswordGenerator:
    """
    Integrated crossword generator combining thematic word discovery with API clue generation.

    This class provides a complete pipeline from topic input to finished crossword entries
    with both intelligent word selection and high-quality clue generation.
    """

    # Clue quality labels ordered from worst to best. Labels that are not
    # listed here (e.g. "BASIC") are treated as unranked by _is_better_quality.
    _QUALITY_ORDER = ("FAILED", "POOR", "ACCEPTABLE", "GOOD", "EXCELLENT")

    # Difficulty used whenever the requested one is not recognised.
    _DEFAULT_DIFFICULTY = "medium"

    # How many candidate words to request per wanted word, so that length
    # filtering and failed clues still leave enough entries.
    _CANDIDATE_MULTIPLIERS = {"easy": 2, "medium": 2.5, "hard": 3}

    # Per-difficulty word length bounds and the frequency tiers that keep
    # their full similarity score (other tiers are down-weighted).
    _DIFFICULTY_CONFIG = {
        "easy": {
            "preferred_tiers": ["tier_2_extremely_common", "tier_3_very_common", "tier_4_highly_common"],
            "min_length": 3,
            "max_length": 6
        },
        "medium": {
            "preferred_tiers": ["tier_4_highly_common", "tier_5_common", "tier_6_moderately_common"],
            "min_length": 4,
            "max_length": 10
        },
        "hard": {
            "preferred_tiers": ["tier_6_moderately_common", "tier_7_somewhat_uncommon", "tier_8_uncommon"],
            "min_length": 5,
            "max_length": 15
        }
    }

    def __init__(self,
                 cache_dir: Optional[str] = None,
                 vocab_size_limit: Optional[int] = None,
                 hf_token: Optional[str] = None):
        """Initialize the integrated crossword generator.

        Only constructs the underlying generators; the (potentially slow)
        loading step happens in initialize().

        Args:
            cache_dir: Directory for caching models and embeddings
            vocab_size_limit: Maximum vocabulary size for thematic generator
            hf_token: Hugging Face API token for clue generation
        """
        self.cache_dir = cache_dir or os.path.join(os.path.dirname(__file__), 'model_cache')
        self.vocab_size_limit = vocab_size_limit

        # Component instances; they stay None when the module failed to import.
        self.thematic_generator: Optional[UnifiedThematicWordGenerator] = None
        self.api_clue_generator: Optional[APIClueGenerator] = None

        # Readiness flags, set by initialize().
        self.is_initialized = False
        self.thematic_ready = False
        self.api_ready = False

        # Running counters, exposed through get_stats().
        self.stats = {
            'words_discovered': 0,
            'clues_generated': 0,
            'api_calls': 0,
            'cache_hits': 0,
            'total_time': 0.0
        }

        if not THEMATIC_AVAILABLE:
            logger.error("β UnifiedThematicWordGenerator not available - word discovery disabled")
        if not API_AVAILABLE:
            logger.error("β APIClueGenerator not available - API clue generation disabled")

        if THEMATIC_AVAILABLE:
            # NOTE(review): the caller's cache_dir (possibly None) is forwarded,
            # not the resolved self.cache_dir, so the thematic generator applies
            # its own default. Confirm whether self.cache_dir was intended.
            self.thematic_generator = UnifiedThematicWordGenerator(
                cache_dir=cache_dir,
                vocab_size_limit=vocab_size_limit
            )

        if API_AVAILABLE:
            self.api_clue_generator = APIClueGenerator(hf_token=hf_token)

    def initialize(self):
        """Initialize both generators.

        Idempotent: returns immediately once a previous call has completed.
        A failure of the thematic generator is logged and leaves
        thematic_ready False instead of raising.
        """
        if self.is_initialized:
            return

        start_time = time.time()
        logger.info("π Initializing Integrated Crossword Generator...")

        if self.thematic_generator:
            logger.info("π Initializing thematic word generator...")
            try:
                self.thematic_generator.initialize()
                self.thematic_ready = True
                logger.info("✅ Thematic word generator ready")
            except Exception as e:
                logger.error(f"β Failed to initialize thematic generator: {e}")

        # The API generator needs no loading step; it is considered ready as
        # soon as it holds a token.
        if self.api_clue_generator:
            if self.api_clue_generator.hf_token:
                self.api_ready = True
                logger.info("✅ API clue generator ready")
            else:
                logger.warning("β οΈ API clue generator has no token - clue generation may fail")

        self.is_initialized = True
        init_time = time.time() - start_time
        logger.info(f"π Integrated generator initialized in {init_time:.2f}s")

        capabilities = []
        if self.thematic_ready:
            vocab_size = self.thematic_generator.get_vocabulary_size()
            capabilities.append(f"Word Discovery ({vocab_size:,} words)")
        if self.api_ready:
            model_count = len(self.api_clue_generator.models)
            capabilities.append(f"API Clues ({model_count} models)")

        logger.info(f"π‘ Capabilities: {', '.join(capabilities) if capabilities else 'Limited (check dependencies)'}")

    async def initialize_async(self):
        """Async initialization for backend compatibility.

        initialize() is blocking, so it is run in the event loop's default
        executor; awaiting this coroutine does not stall other tasks.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.initialize)

    def generate_crossword_entries(self,
                                   topic: str,
                                   num_words: int = 15,
                                   difficulty: str = "medium",
                                   min_similarity: float = 0.3) -> List["CrosswordEntry"]:
        """Generate complete crossword entries for a topic.

        Args:
            topic: Topic or theme for word generation
            num_words: Number of words to generate
            difficulty: Difficulty level (easy/medium/hard, case-insensitive;
                anything else is treated as medium)
            min_similarity: Minimum similarity threshold for word discovery

        Returns:
            List of complete CrosswordEntry objects with words, clues, and metadata.
            At most num_words entries; empty when num_words is not positive or
            no words were discovered.
        """
        # A non-positive count would otherwise request a negative number of
        # candidates and slice from the wrong end of the result list.
        if num_words <= 0:
            return []

        if not self.is_initialized:
            self.initialize()

        start_time = time.time()
        logger.info(f"π― Generating {num_words} crossword entries for topic: '{topic}' (difficulty: {difficulty})")

        words_with_metadata = self._discover_words(topic, num_words, difficulty, min_similarity)

        if not words_with_metadata:
            logger.warning(f"β οΈ No words discovered for topic '{topic}'")
            return []

        logger.info(f"✅ Discovered {len(words_with_metadata)} words")

        crossword_entries = self._generate_clues_for_words(words_with_metadata, topic)

        # EXCELLENT clues first; every other quality is ordered purely by
        # similarity score.
        crossword_entries.sort(key=lambda x: (x.clue_quality == "EXCELLENT", x.similarity_score), reverse=True)

        total_time = time.time() - start_time
        self.stats['total_time'] += total_time

        logger.info(f"π Generated {len(crossword_entries)} complete crossword entries in {total_time:.2f}s")

        return crossword_entries[:num_words]

    @classmethod
    def _normalize_difficulty(cls, difficulty) -> str:
        """Map a user-supplied difficulty to a known key, defaulting to medium."""
        key = str(difficulty).strip().lower()
        return key if key in cls._DIFFICULTY_CONFIG else cls._DEFAULT_DIFFICULTY

    def _discover_words(self,
                        topic: str,
                        num_words: int,
                        difficulty: str,
                        min_similarity: float) -> List[Tuple[str, float, str]]:
        """Discover thematic words using the thematic generator.

        Requests more candidates than needed (see _CANDIDATE_MULTIPLIERS) and
        filters them by difficulty. Returns (word, similarity, tier) tuples,
        or an empty list when the generator is not ready or discovery fails.
        """
        if not self.thematic_ready:
            logger.error("β Thematic word generator not ready - cannot discover words")
            return []

        try:
            multiplier = self._CANDIDATE_MULTIPLIERS[self._normalize_difficulty(difficulty)]
            discover_count = int(num_words * multiplier)

            logger.info(f"π Discovering {discover_count} candidate words for '{topic}'...")

            results = self.thematic_generator.generate_thematic_words(
                inputs=topic,
                num_words=discover_count,
                min_similarity=min_similarity,
                multi_theme=False
            )

            filtered_results = self._filter_by_difficulty(results, difficulty)

            self.stats['words_discovered'] += len(filtered_results)
            return filtered_results

        except Exception as e:
            # Best effort: a discovery failure yields no words instead of
            # propagating to the caller.
            logger.error(f"β Word discovery failed: {e}")
            return []

    def _filter_by_difficulty(self,
                              results: List[Tuple[str, float, str]],
                              difficulty: str) -> List[Tuple[str, float, str]]:
        """Filter words by difficulty level using frequency tiers and length.

        Words outside the difficulty's length range are dropped. Words whose
        tier is not preferred for the difficulty keep 80% of their similarity.
        The result is sorted by adjusted similarity, highest first.
        """
        config = self._DIFFICULTY_CONFIG[self._normalize_difficulty(difficulty)]
        min_length = config["min_length"]
        max_length = config["max_length"]
        preferred_tiers = config["preferred_tiers"]

        filtered = []
        for word, similarity, tier in results:
            if not (min_length <= len(word) <= max_length):
                continue

            tier_score = 1.0 if tier in preferred_tiers else 0.8
            filtered.append((word, similarity * tier_score, tier))

        filtered.sort(key=lambda x: x[1], reverse=True)
        return filtered

    def _generate_clues_for_words(self,
                                  words_with_metadata: List[Tuple[str, float, str]],
                                  topic: str) -> List["CrosswordEntry"]:
        """Generate clues for discovered words using API generator.

        Falls back to template clues when the API generator is not ready.
        Words for which no model produced a clue rated better than "FAILED",
        or whose generation raised, are skipped.
        """
        if not self.api_ready:
            logger.error("β API clue generator not ready - using basic clues")
            return self._generate_basic_clues(words_with_metadata, topic)

        logger.info(f"π€ Generating API clues for {len(words_with_metadata)} words...")

        crossword_entries = []

        for word, similarity, tier in words_with_metadata:
            try:
                # Mapping of model key -> clue text (falsy when a model failed).
                clue_results = self.api_clue_generator.generate_clue(word, topic)

                # Keep the highest-rated clue across all models.
                best_clue = None
                best_quality = "FAILED"
                best_model = "none"

                for model_key, clue in clue_results.items():
                    if clue:
                        quality, _score = self.api_clue_generator.evaluate_clue_quality(word, clue)
                        if self._is_better_quality(quality, best_quality):
                            best_clue = clue
                            best_quality = quality
                            best_model = model_key

                # Counts the non-empty clues returned for this word.
                self.stats['api_calls'] += len([c for c in clue_results.values() if c])

                if best_clue:
                    tier_desc = self._get_tier_description(tier)
                    entry = CrosswordEntry(
                        word=word.upper(),
                        clue=best_clue,
                        topic=topic,
                        similarity_score=similarity,
                        frequency_tier=tier,
                        tier_description=tier_desc,
                        clue_quality=best_quality,
                        clue_model=best_model
                    )
                    crossword_entries.append(entry)
                    self.stats['clues_generated'] += 1
                else:
                    logger.warning(f"β οΈ No valid clue generated for '{word}'")

            except Exception as e:
                # One failing word must not abort the remaining words.
                logger.error(f"β Failed to generate clue for '{word}': {e}")

        return crossword_entries

    def _generate_basic_clues(self,
                              words_with_metadata: List[Tuple[str, float, str]],
                              topic: str) -> List["CrosswordEntry"]:
        """Generate basic fallback clues when API is not available.

        Every word receives the same template clue, quality "BASIC" and
        model "template".
        """
        logger.info(f"π Generating basic fallback clues for {len(words_with_metadata)} words...")

        crossword_entries = []
        for word, similarity, tier in words_with_metadata:
            clue = f"Term related to {topic.lower()}"
            tier_desc = self._get_tier_description(tier)

            entry = CrosswordEntry(
                word=word.upper(),
                clue=clue,
                topic=topic,
                similarity_score=similarity,
                frequency_tier=tier,
                tier_description=tier_desc,
                clue_quality="BASIC",
                clue_model="template"
            )
            crossword_entries.append(entry)
            self.stats['clues_generated'] += 1

        return crossword_entries

    def _is_better_quality(self, quality1: str, quality2: str) -> bool:
        """Compare clue quality levels.

        Returns True only when quality1 ranks strictly above quality2 in
        _QUALITY_ORDER; False when either label is unknown.
        """
        quality_order = self._QUALITY_ORDER
        try:
            return quality_order.index(quality1) > quality_order.index(quality2)
        except ValueError:
            return False

    def _get_tier_description(self, tier: str) -> str:
        """Get human-readable tier description, or the tier key itself if unavailable."""
        if self.thematic_ready and hasattr(self.thematic_generator, 'tier_descriptions'):
            return self.thematic_generator.tier_descriptions.get(tier, tier)
        return tier

    def generate_by_multiple_topics(self,
                                    topics: List[str],
                                    words_per_topic: int = 10,
                                    difficulty: str = "medium") -> Dict[str, List["CrosswordEntry"]]:
        """Generate crossword entries for multiple topics.

        Args:
            topics: List of topics to generate words for
            words_per_topic: Number of words per topic
            difficulty: Difficulty level

        Returns:
            Dictionary mapping topics to their crossword entries
        """
        logger.info(f"π― Generating crossword entries for {len(topics)} topics")

        results = {}
        for topic in topics:
            logger.info(f"π Processing topic: '{topic}'")
            entries = self.generate_crossword_entries(
                topic=topic,
                num_words=words_per_topic,
                difficulty=difficulty
            )
            results[topic] = entries

        return results

    def get_stats(self) -> Dict[str, Any]:
        """Get performance statistics.

        Returns the running counters plus the readiness flags, the vocabulary
        size and the number of API models (0 when the component is not ready).
        """
        return {
            **self.stats,
            'thematic_ready': self.thematic_ready,
            'api_ready': self.api_ready,
            'is_initialized': self.is_initialized,
            'vocab_size': self.thematic_generator.get_vocabulary_size() if self.thematic_ready else 0,
            'api_models': len(self.api_clue_generator.models) if self.api_ready else 0
        }

    def get_system_info(self) -> Dict[str, Any]:
        """Get comprehensive system information.

        Returns a dict with per-component availability/readiness and the
        output of get_stats() under 'stats'.
        """
        info = {
            'system': 'IntegratedCrosswordGenerator',
            'components': {
                'thematic_generator': {
                    'available': THEMATIC_AVAILABLE,
                    'ready': self.thematic_ready,
                    'vocab_size': self.thematic_generator.get_vocabulary_size() if self.thematic_ready else 0
                },
                'api_clue_generator': {
                    'available': API_AVAILABLE,
                    'ready': self.api_ready,
                    'models': list(self.api_clue_generator.models.keys()) if self.api_ready else []
                }
            },
            'stats': self.get_stats()
        }

        return info
|
|
|
|
def main():
    """Demo the integrated crossword generator.

    Requires the HF_TOKEN environment variable; prints a hint and returns
    when it is missing. Otherwise generates a few entries for each demo topic
    and prints them together with the final statistics.
    """
    print("π Integrated Crossword Generator Demo")
    print("=" * 60)

    hf_token = os.getenv('HF_TOKEN')
    if not hf_token:
        print("β HF_TOKEN environment variable not set")
        print("Set your token: export HF_TOKEN='your_token_here'")
        return

    print("π Initializing integrated system...")
    # Hand the validated token to the generator explicitly instead of relying
    # on the clue generator to look it up again on its own.
    generator = IntegratedCrosswordGenerator(vocab_size_limit=50000, hf_token=hf_token)
    generator.initialize()

    system_info = generator.get_system_info()
    print("\nπ System Status:")
    for component, info in system_info['components'].items():
        status = "✅ Ready" if info['ready'] else "β Not Ready"
        print(f" {component}: {status}")

    if not (generator.thematic_ready and generator.api_ready):
        print("\nβ οΈ System not fully ready - some features may be limited")
        print("Continuing with demo using available components...")

    demo_topics = ["animals", "technology", "music"]

    # Marker printed in front of each clue, keyed by clue quality label.
    quality_icons = {
        "EXCELLENT": "π",
        "GOOD": "✅",
        "ACCEPTABLE": "π",
        "POOR": "β",
        "BASIC": "π"
    }

    print(f"\nπ― Generating crossword entries for {len(demo_topics)} topics")
    print("=" * 60)

    for topic in demo_topics:
        print(f"\nπ Topic: '{topic.upper()}'")
        print("-" * 40)

        try:
            start_time = time.time()
            entries = generator.generate_crossword_entries(
                topic=topic,
                num_words=5,
                difficulty="medium"
            )
            generation_time = time.time() - start_time

            if entries:
                print(f"β±οΈ Generated {len(entries)} entries in {generation_time:.2f}s")
                print()

                for i, entry in enumerate(entries, 1):
                    quality_icon = quality_icons.get(entry.clue_quality, "?")

                    print(f" {i}. {entry.word:<12} | {quality_icon} {entry.clue}")
                    print(f" Similarity: {entry.similarity_score:.3f} | {entry.tier_description}")
                    print(f" Model: {entry.clue_model}")
                    print()
            else:
                print("β No entries generated")

        except Exception as e:
            # Demo boundary: report the failure and continue with the next topic.
            print(f"β Error generating entries for '{topic}': {e}")

    print("=" * 60)
    print("π FINAL STATISTICS")
    print("=" * 60)
    stats = generator.get_stats()
    print(f"Words discovered: {stats['words_discovered']}")
    print(f"Clues generated: {stats['clues_generated']}")
    print(f"API calls made: {stats['api_calls']}")
    print(f"Total time: {stats['total_time']:.2f}s")

    print("\n✅ Integrated crossword generator demo complete!")
    print("\nπ‘ This system combines:")
    print(" π Smart word discovery (100K+ vocabulary, semantic analysis)")
    print(" π€ High-quality clue generation (multiple AI models)")
    print(" π Difficulty control (frequency tiers)")
    print(" π― Topic-focused generation")
|
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()