"""Data collector for continuous learning""" import json import os from datetime import datetime from typing import Optional, Dict, List import hashlib from database import db from config import ( DATA_DIR, LEARNING_FROM_FEEDBACK, SAVE_ALL_INTERACTIONS, REQUIRE_APPROVAL ) class DataCollector: """Collects and manages user interaction data for continuous learning""" def __init__(self): self.current_session_id = self._generate_session_id() self.session_interactions = [] def _generate_session_id(self) -> str: """Generate unique session ID""" timestamp = datetime.now().isoformat() return hashlib.md5(timestamp.encode()).hexdigest()[:12] def collect_interaction( self, prompt: str, generated_code: str, temperature: float = 0.7, max_tokens: int = 100 ) -> int: """Collect a user interaction""" if not SAVE_ALL_INTERACTIONS: return -1 # Save to database interaction_id = db.save_interaction( prompt=prompt, generated_code=generated_code, temperature=temperature, max_tokens=max_tokens, session_id=self.current_session_id ) # Track in session self.session_interactions.append({ 'id': interaction_id, 'prompt': prompt, 'code': generated_code, 'timestamp': datetime.now().isoformat() }) return interaction_id def record_feedback( self, interaction_id: int, is_positive: bool, edited_code: str = None ): """Record user feedback for an interaction""" if not LEARNING_FROM_FEEDBACK: return feedback = 1 if is_positive else -1 db.update_feedback(interaction_id, feedback, edited_code) print(f"Feedback recorded: {'👍' if is_positive else '👎'} for interaction {interaction_id}") def add_training_sample(self, code: str, category: str = "user_contributed"): """Add a code sample directly to training data""" return db.add_code_sample(code, source="user", category=category) def get_training_data(self, include_base: bool = True) -> List[str]: """Get all available training data""" samples = [] # Get approved user interactions approved = db.get_approved_samples() for item in approved: # Combine prompt and code for training sample = f"# Prompt: {item['prompt']}\n{item['code']}" samples.append(sample) # Get curated code samples code_samples = db.get_all_code_samples() for item in code_samples: samples.append(item['code']) # Include base training data if include_base: base_path = os.path.join(DATA_DIR, "..", "programming.txt") if os.path.exists(base_path): with open(base_path, 'r', encoding='utf-8') as f: base_code = f.read() samples.append(base_code) return samples def get_new_training_data(self) -> List[Dict]: """Get new approved samples not yet used for training""" return db.get_approved_samples(not_used=True) def get_pending_count(self) -> int: """Get count of samples pending training""" return db.get_pending_samples_count() def get_statistics(self) -> Dict: """Get collection statistics""" stats = db.get_statistics() stats['session_interactions'] = len(self.session_interactions) return stats def export_training_data(self, filepath: str): """Export all training data to a file""" samples = self.get_training_data() with open(filepath, 'w', encoding='utf-8') as f: f.write('\n\n'.join(samples)) print(f"Exported {len(samples)} samples to {filepath}") # Global collector instance collector = DataCollector()