"""
Скрипт для индексации всех объектов недвижимости через HuggingFace Spaces сервис

Usage:
    python index_all_properties.py          # Интерактивный режим
    python index_all_properties.py --yes    # Автоподтверждение
"""
import json
import sys
import time
from typing import Any, Dict, List

import psycopg2
import requests

# Database connection settings.
# SECURITY NOTE(review): credentials are hard-coded in source. Move them to
# environment variables or a secrets manager and rotate this password —
# anyone with read access to this file can reach the production DB.
DB_CONFIG = {
    'host': 'dpg-d5ht8vi4d50c739akh2g-a.virginia-postgres.render.com',
    'port': 5432,
    'database': 'lead_exchange_bk',
    'user': 'lead_exchange_bk_user',
    'password': '8m2gtTRBW0iAr7nY2Aadzz0VcZBEVKYM'
}

# Base URL of the embedding/indexing service hosted on HuggingFace Spaces.
HF_SERVICE_URL = "https://calcifer0323-matching.hf.space"


def get_properties_from_db() -> List[Dict[str, Any]]:
    """Fetch all property rows from the database.

    Returns:
        A list of dicts (one per row) keyed by column name, ordered by
        ``created_at`` descending.
    """
    print("📥 Fetching properties from database...")

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT
                property_id, title, description, address,
                property_type, area, price, rooms, status
            FROM properties
            ORDER BY created_at DESC
        """)
        columns = ['property_id', 'title', 'description', 'address',
                   'property_type', 'area', 'price', 'rooms', 'status']
        properties = [dict(zip(columns, row)) for row in cursor.fetchall()]
        cursor.close()
    finally:
        # Always release the connection, even if the query fails.
        conn.close()

    print(f"✅ Fetched {len(properties)} properties")
    return properties


def prepare_text_for_property(prop: Dict[str, Any]) -> str:
    """Build the text used to generate an embedding for one property.

    Concatenates title, description, address and a "characteristics"
    section (type, rooms, area, price) into one ". "-joined string.
    Empty or missing fields are skipped.
    """
    parts = []

    if prop.get('title'):
        parts.append(f"Название: {prop['title']}")
    if prop.get('description'):
        parts.append(f"Описание: {prop['description']}")
    if prop.get('address'):
        parts.append(f"Адрес: {prop['address']}")

    # Structured attributes appended as a single "Характеристики" clause.
    details = []
    if prop.get('property_type'):
        details.append(f"тип: {prop['property_type']}")
    if prop.get('rooms'):
        details.append(f"комнат: {prop['rooms']}")
    if prop.get('area'):
        details.append(f"площадь: {prop['area']} м²")
    if prop.get('price'):
        details.append(f"цена: {prop['price']:,} ₽")
    if details:
        parts.append("Характеристики: " + ", ".join(details))

    return ". ".join(parts)


def index_batch(properties: List[Dict[str, Any]], batch_size: int = 20) -> Dict[str, Any]:
    """Index one batch of properties via the HuggingFace Spaces /batch endpoint.

    Args:
        properties: The properties to send in this request.
        batch_size: Kept for interface compatibility; the actual batch is
            whatever ``properties`` contains.

    Returns:
        The parsed JSON response on HTTP 200, or ``None`` on any failure
        (non-200 status, timeout, connection or request error).
    """
    items = []
    for prop in properties:
        # Shape the row into the payload expected by the /batch endpoint.
        item = {
            "entity_id": str(prop['property_id']),
            "title": prop.get('title', ''),
            "description": prop.get('description', ''),
            "price": float(prop['price']) if prop.get('price') else None,
            "rooms": int(prop['rooms']) if prop.get('rooms') else None,
            "area": float(prop['area']) if prop.get('area') else None,
            "address": prop.get('address', ''),
            "district": ""  # Could be extracted from address if needed
        }
        items.append(item)

    payload = {"items": items}

    try:
        print(f"   📤 Sending batch of {len(items)} items to {HF_SERVICE_URL}/batch")
        print(f"   Payload size: {len(str(payload))} bytes")

        response = requests.post(
            f"{HF_SERVICE_URL}/batch",
            json=payload,
            timeout=120  # 2 minutes per batch (server-side timeout is ~30s)
        )

        print(f"   Response status: {response.status_code}")

        if response.status_code == 200:
            return response.json()

        print(f"   ❌ Error: {response.status_code}")
        print(f"   Response: {response.text[:500]}")
        # Best-effort attempt to surface a structured error body.
        try:
            error_detail = response.json()
            print(f"   Detail: {error_detail}")
        except Exception:
            pass
        return None

    except requests.exceptions.Timeout:
        print("   ❌ Request timeout (120 seconds)")
        return None
    except requests.exceptions.ConnectionError as e:
        print(f"   ❌ Connection error: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"   ❌ Request failed: {e}")
        return None


def save_embeddings_to_file(results: List[Dict], filename: str = "generated_embeddings.json"):
    """Save the indexing results to a JSON file (for later inspection)."""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    # Fix: the message previously did not interpolate the target filename.
    print(f"💾 Saved embeddings to {filename}")


def main():
    print("=" * 70)
    print("INDEXING PROPERTIES THROUGH HUGGINGFACE SPACES")
    print("=" * 70)

    # Command-line flags: --yes / -y skip the interactive confirmation.
    auto_confirm = '--yes' in sys.argv or '-y' in sys.argv
    if auto_confirm:
        print("🤖 Auto-confirm mode enabled")

    # 1. Load properties from the database.
    properties = get_properties_from_db()
    if not properties:
        print("⚠️ No properties found in database")
        return

    print(f"\n📊 Total properties to index: {len(properties)}")

    # Show a sample so the operator can sanity-check the data.
    print(f"\n📄 Sample property:")
    sample = properties[0]
    print(f"   ID: {sample['property_id']}")
    print(f"   Title: {sample.get('title', 'N/A')}")
    print(f"   Text preview: {prepare_text_for_property(sample)[:150]}...")

    # Interactive confirmation unless --yes was passed.
    if not auto_confirm:
        print(f"\n🚀 Ready to index {len(properties)} properties")
        print(f"   Service: {HF_SERVICE_URL}")
        print(f"   Endpoint: /batch")
        try:
            response = input("\nProceed? (yes/y/no/n): ")
            if response.lower() not in ['yes', 'y']:
                print("Cancelled by user")
                return
        except EOFError:
            # No stdin available (e.g. CI) — tell the user how to bypass.
            print("\n❌ Error: EOF when reading input")
            print("Run with --yes flag to auto-confirm: python index_all_properties.py --yes")
            return
    else:
        print(f"\n✅ Auto-confirming indexing of {len(properties)} properties")
        print(f"   Service: {HF_SERVICE_URL}")
        print(f"   Endpoint: /batch")

    # 2. Index in batches.
    batch_size = 20  # Reduced from 50 to 20 (~30s processing per batch server-side)
    total_batches = (len(properties) + batch_size - 1) // batch_size

    print(f"\n📦 Processing {total_batches} batches (batch size: {batch_size})")
    print(f"   ⏱️ Each batch will take ~30-40 seconds to process")
    print(f"   📊 Total time estimate: ~{(total_batches * 35) // 60} minutes")

    all_results = []
    successful = 0
    failed = 0

    for i in range(0, len(properties), batch_size):
        batch = properties[i:i + batch_size]
        batch_num = i // batch_size + 1

        print(f"\n🔄 Batch {batch_num}/{total_batches} ({len(batch)} items)")

        result = index_batch(batch, batch_size)

        if result:
            all_results.append(result)
            batch_successful = result.get('successful', 0)
            batch_failed = result.get('failed', 0)
            successful += batch_successful
            failed += batch_failed
            print(f"   ✅ Success: {batch_successful}/{len(batch)}")
            if batch_failed > 0:
                print(f"   ⚠️ Failed: {batch_failed}")
        else:
            print(f"   ❌ Batch failed completely")
            failed += len(batch)

        # Throttle between batches so the service isn't overwhelmed.
        if i + batch_size < len(properties):
            print(f"   ⏳ Waiting 10 seconds before next batch...")
            time.sleep(10)

    # 3. Persist raw results for inspection.
    if all_results:
        save_embeddings_to_file(all_results, "indexing_results.json")

    # 4. Summary.
    print("\n" + "=" * 70)
    print("INDEXING COMPLETE")
    print("=" * 70)
    print(f"✅ Successfully indexed: {successful}/{len(properties)}")
    print(f"❌ Failed: {failed}/{len(properties)}")

    if successful > 0:
        print(f"\n💡 Note: Embeddings were generated on HuggingFace Spaces")
        print(f"   Results saved to: indexing_results.json")
        print(f"   Backend should fetch these embeddings and store in DB")

    print("\n" + "=" * 70)


if __name__ == '__main__':
    main()