Spaces:
Sleeping
Sleeping
| """ | |
| Скрипт для индексации всех объектов недвижимости через HuggingFace Spaces сервис | |
| Usage: | |
| python index_all_properties.py # Интерактивный режим | |
| python index_all_properties.py --yes # Автоподтверждение | |
| """ | |
import os
import sys
import time
from typing import Any, Dict, List, Optional

import psycopg2
import requests
# Database connection settings.
# Each value may be overridden through the environment (DB_HOST, DB_PORT,
# DB_NAME, DB_USER, DB_PASSWORD); the literals below are the previous
# hard-coded values, kept as defaults so existing invocations keep working.
# NOTE(review): the default password is committed in plain text. It should be
# rotated and supplied only via DB_PASSWORD / a secret store.
DB_CONFIG = {
    'host': os.environ.get(
        'DB_HOST', 'dpg-d5ht8vi4d50c739akh2g-a.virginia-postgres.render.com'
    ),
    'port': int(os.environ.get('DB_PORT', '5432')),
    'database': os.environ.get('DB_NAME', 'lead_exchange_bk'),
    'user': os.environ.get('DB_USER', 'lead_exchange_bk_user'),
    'password': os.environ.get('DB_PASSWORD', '8m2gtTRBW0iAr7nY2Aadzz0VcZBEVKYM'),
}

# Base URL of the service on HuggingFace Spaces (override with HF_SERVICE_URL).
HF_SERVICE_URL = os.environ.get(
    'HF_SERVICE_URL', "https://calcifer0323-matching.hf.space"
)
def get_properties_from_db() -> List[Dict[str, Any]]:
    """Fetch all real-estate properties from the database.

    Opens a connection with ``DB_CONFIG``, selects the columns named in
    ``columns`` from the ``properties`` table ordered by ``created_at``
    descending, and converts every row into a dict keyed by column name.

    Returns:
        One dict per row, in query order; an empty list if there are no rows.

    Raises:
        Any exception raised by psycopg2 while connecting or querying is
        propagated to the caller. The cursor and the connection are closed
        in every case.
    """
    print("📥 Fetching properties from database...")

    # Must stay in the same order as the SELECT list below: rows are zipped
    # against these names positionally.
    columns = ['property_id', 'title', 'description', 'address', 'property_type',
               'area', 'price', 'rooms', 'status']

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("""
                SELECT property_id, title, description, address, property_type,
                       area, price, rooms, status
                FROM properties
                ORDER BY created_at DESC
            """)
            properties = [dict(zip(columns, row)) for row in cursor.fetchall()]
        finally:
            # Release the cursor even when execute()/fetchall() raises.
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    print(f"✅ Fetched {len(properties)} properties")
    return properties
def prepare_text_for_property(prop: Dict[str, Any]) -> str:
    """Build the text from which an embedding is generated for one property.

    Non-empty ``title``, ``description`` and ``address`` each become a
    labelled sentence. Non-empty ``property_type``, ``rooms``, ``area`` and
    ``price`` are gathered into one trailing "Характеристики" sentence.
    Falsy values (missing, ``None``, ``0``, ``""``) are skipped.

    Args:
        prop: Property record keyed by column name.

    Returns:
        The sentences joined with ". "; an empty string when nothing is set.
    """
    labelled_fields = (
        ('title', "Название"),
        ('description', "Описание"),
        ('address', "Адрес"),
    )
    sections = [
        f"{label}: {prop[key]}"
        for key, label in labelled_fields
        if prop.get(key)
    ]

    # Structured attributes are collected separately and appended as one section.
    kind = prop.get('property_type')
    rooms = prop.get('rooms')
    area = prop.get('area')
    price = prop.get('price')

    specs = []
    if kind:
        specs.append(f"тип: {kind}")
    if rooms:
        specs.append(f"комнат: {rooms}")
    if area:
        specs.append(f"площадь: {area} м²")
    if price:
        # Thousands separator, e.g. 1,000,000.
        specs.append(f"цена: {price:,} ₽")

    if specs:
        sections.append("Характеристики: " + ", ".join(specs))

    return ". ".join(sections)
def index_batch(properties: List[Dict[str, Any]], batch_size: int = 20) -> Optional[Dict[str, Any]]:
    """Index one batch of properties through the HuggingFace Spaces service.

    Converts every property into the item shape sent to the ``/batch``
    endpoint and POSTs all of them in a single request.

    Args:
        properties: Property records for this batch; each must contain
            ``property_id``.
        batch_size: Accepted for interface compatibility. It is not used:
            the whole ``properties`` list is sent as one request.

    Returns:
        The decoded JSON body when the service answers with HTTP 200,
        otherwise ``None`` (non-200 status, timeout, connection error or any
        other ``requests`` failure). Failures are reported on stdout.
    """
    request_timeout = 120  # 2 minutes per batch (was 5 minutes, but the server-side timeout is 30 s)
    endpoint = f"{HF_SERVICE_URL}/batch"

    # Build the items for the /batch endpoint.
    # ``or ''`` is used instead of ``.get(key, '')`` because a present key
    # holding None (e.g. a NULL column) would otherwise be sent as null.
    items = [
        {
            "entity_id": str(prop['property_id']),
            "title": prop.get('title') or '',
            "description": prop.get('description') or '',
            "price": float(prop['price']) if prop.get('price') else None,
            "rooms": int(prop['rooms']) if prop.get('rooms') else None,
            "area": float(prop['area']) if prop.get('area') else None,
            "address": prop.get('address') or '',
            "district": "",  # Could be extracted from address if needed
        }
        for prop in properties
    ]
    payload = {"items": items}

    try:
        print(f" 📤 Sending batch of {len(items)} items to {endpoint}")
        # Approximate: this is the length of the dict's repr, not of the
        # serialized JSON body.
        print(f" Payload size: {len(str(payload))} bytes")

        response = requests.post(endpoint, json=payload, timeout=request_timeout)
        print(f" Response status: {response.status_code}")

        if response.status_code == 200:
            return response.json()

        print(f" ❌ Error: {response.status_code}")
        print(f" Response: {response.text[:500]}")
        # Try to show more detailed error information when the body is JSON.
        try:
            error_detail = response.json()
        except ValueError:
            # Body is not JSON; the raw text printed above is all there is.
            pass
        else:
            print(f" Detail: {error_detail}")
        return None
    except requests.exceptions.Timeout:
        print(f" ❌ Request timeout ({request_timeout} seconds)")
        return None
    except requests.exceptions.ConnectionError as e:
        print(f" ❌ Connection error: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ❌ Request failed: {e}")
        return None
def save_embeddings_to_file(results: List[Dict], filename: str = "generated_embeddings.json"):
    """Write the indexing results to a JSON file (for later inspection).

    Args:
        results: JSON-serializable list of result dicts.
        filename: Destination path; an existing file is overwritten.

    The file is UTF-8 encoded, indented by two spaces, and non-ASCII
    characters are written as-is rather than escaped.
    """
    import json

    serialized = json.dumps(results, ensure_ascii=False, indent=2)
    with open(filename, 'w', encoding='utf-8') as out:
        out.write(serialized)

    print(f"💾 Saved embeddings to {filename}")
def main():
    """Run the whole indexing workflow from the command line.

    Steps: load all properties from the database, show a sample, ask for
    confirmation (skipped with ``--yes`` / ``-y``), send the properties to
    the service in batches of 20 with a 10-second pause between batches,
    save the collected responses to ``indexing_results.json`` and print a
    summary. Returns early when there are no properties, when the user
    declines, or when stdin is closed in interactive mode.
    """
    banner = "=" * 70
    print(banner)
    print("INDEXING PROPERTIES THROUGH HUGGINGFACE SPACES")
    print(banner)

    # Command-line flags: --yes / -y skips the interactive prompt.
    auto_confirm = any(flag in sys.argv for flag in ('--yes', '-y'))
    if auto_confirm:
        print("🤖 Auto-confirm mode enabled")

    # 1. Load the properties from the database.
    properties = get_properties_from_db()
    if not properties:
        print("⚠️ No properties found in database")
        return

    total = len(properties)
    print(f"\n📊 Total properties to index: {total}")

    # Show one example so the operator can sanity-check the data.
    first = properties[0]
    print("\n📄 Sample property:")
    print(f" ID: {first['property_id']}")
    print(f" Title: {first.get('title', 'N/A')}")
    print(f" Text preview: {prepare_text_for_property(first)[:150]}...")

    # Confirmation: both modes announce the target before anything is sent.
    if auto_confirm:
        print(f"\n✅ Auto-confirming indexing of {total} properties")
    else:
        print(f"\n🚀 Ready to index {total} properties")
    print(f" Service: {HF_SERVICE_URL}")
    print(" Endpoint: /batch")

    if not auto_confirm:
        try:
            answer = input("\nProceed? (yes/y/no/n): ")
        except EOFError:
            # No stdin available (e.g. non-interactive run).
            print("\n❌ Error: EOF when reading input")
            print("Run with --yes flag to auto-confirm: python index_all_properties.py --yes")
            return
        if answer.lower() not in ('yes', 'y'):
            print("Cancelled by user")
            return

    # 2. Index in batches.
    batch_size = 20  # Reduced from 50 to 20 (processing takes ~30 s on the server)
    whole, remainder = divmod(total, batch_size)
    total_batches = whole + (1 if remainder else 0)

    print(f"\n📦 Processing {total_batches} batches (batch size: {batch_size})")
    print(" ⏱️ Each batch will take ~30-40 seconds to process")
    print(f" 📊 Total time estimate: ~{(total_batches * 35) // 60} minutes")

    all_results = []
    successful = 0
    failed = 0

    for batch_num, start in enumerate(range(0, total, batch_size), start=1):
        batch = properties[start:start + batch_size]
        print(f"\n🔄 Batch {batch_num}/{total_batches} ({len(batch)} items)")

        result = index_batch(batch, batch_size)
        if not result:
            # No usable response: count every item in the batch as failed.
            print(" ❌ Batch failed completely")
            failed += len(batch)
        else:
            all_results.append(result)
            ok_count = result.get('successful', 0)
            bad_count = result.get('failed', 0)
            successful += ok_count
            failed += bad_count
            print(f" ✅ Success: {ok_count}/{len(batch)}")
            if bad_count > 0:
                print(f" ⚠️ Failed: {bad_count}")

        # Pause between batches (not after the last one).
        if batch_num < total_batches:
            print(" ⏳ Waiting 10 seconds before next batch...")
            time.sleep(10)

    # 3. Save the results.
    if all_results:
        save_embeddings_to_file(all_results, "indexing_results.json")

    # 4. Summary.
    print("\n" + banner)
    print("INDEXING COMPLETE")
    print(banner)
    print(f"✅ Successfully indexed: {successful}/{total}")
    print(f"❌ Failed: {failed}/{total}")
    if successful > 0:
        print("\n💡 Note: Embeddings were generated on HuggingFace Spaces")
        print(" Results saved to: indexing_results.json")
        print(" Backend should fetch these embeddings and store in DB")
    print("\n" + banner)
# Run the indexing workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()