aegarciaherrera committed on
Commit
0d9b78a
·
verified ·
1 Parent(s): 33729c3

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +1348 -0
app.py ADDED
@@ -0,0 +1,1348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.neighbors import NearestNeighbors
6
+ from sklearn.decomposition import TruncatedSVD, NMF
7
+ from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.cluster import KMeans
10
+ from sklearn.preprocessing import StandardScaler
11
+ from scipy.sparse import csr_matrix
12
+ from scipy.spatial.distance import pdist, squareform
13
+ import gradio as gr
14
+ import json
15
+ import re
16
+ from collections import defaultdict, Counter
17
+ import csv
18
+ import time
19
+ from datetime import datetime
20
+ import warnings
21
+ warnings.filterwarnings('ignore')
22
+
23
+ # Importar huggingface_hub para descargar archivos
24
+ from huggingface_hub import hf_hub_download
25
+
26
+ # ==================== CARGA DE DATOS DESDE HUGGING FACE ====================
27
+
28
def download_file_from_hf(filename, repo_id="aegarciaherrera/Sistema_Recomendador_Archivos"):
    """Fetch a single file from a Hugging Face dataset repository.

    Args:
        filename: Name of the file inside the repository.
        repo_id: Repository identifier forwarded to ``hf_hub_download``.

    Returns:
        The path returned by ``hf_hub_download``, or ``None`` when the
        download raised any exception (the error is printed, not re-raised).
    """
    try:
        local_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")
        print(f"✓ Archivo {filename} descargado exitosamente")
    except Exception as exc:
        # Best effort by design: callers test the return value for None.
        print(f"✗ Error descargando {filename}: {str(exc)}")
        local_path = None
    return local_path
43
+
44
# Download the two files the rest of the module cannot work without.
print("Descargando archivos desde Hugging Face...")
productos_path = download_file_from_hf("productos.csv")
mapping_path = download_file_from_hf("embedding_index_mapping.csv")

# download_file_from_hf returns None on failure, so stop here if either is missing.
if not (productos_path and mapping_path):
    raise FileNotFoundError("No se pudieron descargar los archivos principales")

# Load the product catalogue (with a fresh positional index) and the
# parent_asin -> embedding index mapping.
df_productos = pd.read_csv(productos_path).reset_index(drop=True)
df_mapping = pd.read_csv(mapping_path)
print(f"✓ Productos cargados: {len(df_productos):,} registros")
print(f"✓ Mapping cargado: {len(df_mapping):,} registros")
58
+
59
# Load ratings, supporting both file layouts:
#   V2: ratings_aggregated.csv (one average per product) + ratings_detailed.csv
#   V1: a single ratings.csv, reused as the detailed table
# When neither can be loaded the module keeps running without ratings.
try:
    ratings_agg_path = download_file_from_hf("ratings_aggregated.csv")
    ratings_det_path = download_file_from_hf("ratings_detailed.csv")

    if not (ratings_agg_path and ratings_det_path):
        raise FileNotFoundError("Archivos V2 no encontrados")

    df_ratings_aggregated = pd.read_csv(ratings_agg_path)
    df_ratings_detailed = pd.read_csv(ratings_det_path)
    ratings_dict = df_ratings_aggregated.set_index('parent_asin')['average_rating'].to_dict()
    print(f"✓ Ratings V2 cargados: {len(ratings_dict):,} productos con ratings")
    HAS_DETAILED_RATINGS = True

except Exception as e:
    print(f"No se pudieron cargar ratings V2: {str(e)}")
    try:
        # Fall back to the V1 layout.
        ratings_path = download_file_from_hf("ratings.csv")
        if not ratings_path:
            raise FileNotFoundError("No se pudo cargar ratings V1")

        df_ratings = pd.read_csv(ratings_path)
        # The same table is reused below as the detailed ratings, so it may
        # hold several rows per product. Average them per product;
        # set_index(...).to_dict() would silently keep only the last row of
        # each product. With one row per product the result is unchanged.
        ratings_dict = df_ratings.groupby('parent_asin')['rating'].mean().to_dict()
        df_ratings_detailed = df_ratings  # kept under the V2 name for compatibility
        print(f"✓ Ratings V1 cargados: {len(ratings_dict):,} productos con ratings")
        HAS_DETAILED_RATINGS = False
    except Exception as e2:
        # No ratings at all: continue with empty structures.
        print(f"✗ Error cargando ratings: {str(e2)}")
        ratings_dict = {}
        df_ratings_detailed = pd.DataFrame()
        HAS_DETAILED_RATINGS = False

# Load summary.
print("=" * 50)
print("RESUMEN DE CARGA:")
print(f"- Productos: {len(df_productos):,} registros")
print(f"- Ratings: {len(ratings_dict):,} productos")
print(f"- Ratings detallados: {'Sí' if HAS_DETAILED_RATINGS else 'No'}")
print("=" * 50)
99
+
100
# ==================== DATA PREPARATION ====================
# Keep only the products listed in the embedding mapping, attach the mapping
# columns, and order the rows by the mapping's "index" column so that row
# positions follow the mapping order. Missing descriptions become "" and the
# column is forced to str.
df_similars = (
    df_productos.loc[df_productos["parent_asin"].isin(df_mapping["parent_asin"])]
    .reset_index(drop=True)
    .merge(df_mapping, on="parent_asin")
    .sort_values("index")
    .reset_index(drop=True)
)
df_similars["description"] = df_similars["description"].fillna("").astype(str)
df_similars = df_similars.reset_index(drop=True)

print(f"Total de productos en df_similars: {len(df_similars):,}")
print(f"Productos únicos: {df_similars['parent_asin'].nunique():,}")
113
+
114
# ==================== EMBEDDING LOADING ====================
# Sentence encoder: used for queries and, as a fallback, to embed descriptions.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Prefer the precomputed embeddings stored in the Hugging Face repository.
try:
    embeddings_path = download_file_from_hf("embeddings.npy")
    descriptions_path = download_file_from_hf("descriptions.npy")

    if not (embeddings_path and descriptions_path):
        raise FileNotFoundError("No se pudieron descargar los embeddings precomputados")

    description_embeddings = np.load(embeddings_path)
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; this is
    # only safe while the source repository is trusted.
    descriptions = np.load(descriptions_path, allow_pickle=True)
    print(f"✓ Embeddings precalculados descargados y cargados: {description_embeddings.shape}")

    # A row-count mismatch is only reported here; nothing is regenerated.
    if len(description_embeddings) == len(df_similars):
        print("✓ Consistencia verificada: embeddings y df_similars coinciden")
    else:
        print("WARNING: Mismatch detectado!")
        print(f"   Embeddings: {len(description_embeddings)}")
        print(f"   df_similars: {len(df_similars)}")
        print("   Recomiendo regenerar embeddings con el nuevo df_similars")

except Exception as e:
    # Any failure above ends here: encode every description with the model.
    print(f"✗ Error descargando embeddings: {str(e)}")
    print("Generando embeddings básicos...")
    description_embeddings = model.encode(df_similars["description"].tolist())
    descriptions = df_similars["description"].values
143
+
144
+ # ==================== SISTEMA DE MÉTRICAS Y EVALUACIÓN ====================
145
class RecommendationMetrics:
    """Compute and accumulate quality metrics for recommendation runs.

    Every call to :meth:`evaluate_recommendations` appends its numeric
    metrics to ``metrics_history`` under the key ``"<method>_<metric>"``;
    :meth:`get_comparison_report` summarises that history per method.
    """

    def __init__(self):
        # "<method>_<metric>" -> list of observed values.
        self.metrics_history = defaultdict(list)
        self.execution_times = defaultdict(list)
        # history key -> (method, metric). Recorded at write time so the
        # report never has to guess where the method name ends.
        self._key_index = {}

    def calculate_diversity(self, recommendations_asins):
        """Return distinct categories / recommended items found in ``df_similars``.

        ASINs absent from the module-level ``df_similars`` are ignored.
        Returns 0.0 for an empty list or when no ASIN is found.
        """
        if not recommendations_asins:
            return 0.0

        categories = []
        for asin in recommendations_asins:
            product_row = df_similars[df_similars['parent_asin'] == asin]
            if len(product_row) > 0:
                category = product_row.iloc[0].get('main_category', 'Unknown')
                categories.append(category)

        if not categories:
            return 0.0

        return len(set(categories)) / len(categories)

    def calculate_novelty(self, recommendations_asins):
        """Return the mean of ``max(0, 5 - rating) / 5`` over the recommendations.

        Ratings come from the module-level ``ratings_dict``; products without
        a rating count as rating 0.0, i.e. maximal novelty.
        """
        if not recommendations_asins:
            return 0.0

        novelty_scores = []
        for asin in recommendations_asins:
            rating = ratings_dict.get(asin, 0.0)
            # Higher rating -> lower novelty.
            novelty_scores.append(max(0, 5.0 - rating) / 5.0)

        return np.mean(novelty_scores) if novelty_scores else 0.0

    def calculate_coverage(self, recommendations_asins, total_available_items):
        """Return unique recommended items / ``min(total_available_items, 100)``.

        Returns 0.0 when ``total_available_items`` is not positive, instead
        of dividing by zero.
        """
        denominator = min(total_available_items, 100)
        if denominator <= 0:
            return 0.0
        return len(set(recommendations_asins)) / denominator

    def calculate_precision_at_k(self, recommendations_asins, relevant_items, k=5):
        """Return the fraction of the first ``k`` recommendations that are relevant.

        Returns 0.0 when either list is empty or ``k`` is not positive.
        """
        if not recommendations_asins or not relevant_items or k <= 0:
            return 0.0

        top_k = recommendations_asins[:k]
        relevant_in_top_k = len(set(top_k) & set(relevant_items))
        return relevant_in_top_k / min(k, len(top_k))

    def evaluate_recommendations(self, method_name, recommendations_asins, execution_time,
                                 relevant_items=None, total_available=1000):
        """Compute all metrics for one run and record them in the history.

        Args:
            method_name: Label of the method that produced the list.
            recommendations_asins: Recommended ASINs, best first.
            execution_time: Seconds the method took.
            relevant_items: Optional ground truth; enables ``precision_at_5``.
            total_available: Catalogue size passed to the coverage metric.

        Returns:
            dict with ``method``, ``timestamp`` and the numeric metrics.
        """
        metrics = {
            'method': method_name,
            'timestamp': datetime.now(),
            'execution_time': execution_time,
            'num_recommendations': len(recommendations_asins),
            'diversity': self.calculate_diversity(recommendations_asins),
            'novelty': self.calculate_novelty(recommendations_asins),
            'coverage': self.calculate_coverage(recommendations_asins, total_available)
        }

        if relevant_items:
            metrics['precision_at_5'] = self.calculate_precision_at_k(recommendations_asins, relevant_items, 5)

        # Store only the numeric metrics.
        for key, value in metrics.items():
            if key not in ['method', 'timestamp']:
                history_key = f"{method_name}_{key}"
                self._key_index[history_key] = (method_name, key)
                self.metrics_history[history_key].append(value)

        return metrics

    def _split_history_key(self, key):
        """Return ``(method, metric)`` for a ``metrics_history`` key.

        Keys written by :meth:`evaluate_recommendations` are resolved exactly.
        Keys added to ``metrics_history`` by other means fall back to the
        convention that the method name is the first two ``_``-separated
        tokens.
        """
        known = self._key_index.get(key)
        if known is not None:
            return known
        parts = key.split('_')
        return '_'.join(parts[:2]), '_'.join(parts[2:])

    def get_comparison_report(self):
        """Return a Markdown report with mean, std and count of every metric per method."""
        if not self.metrics_history:
            return "No hay métricas disponibles"

        # Group by exact method name, so that one method being a prefix of
        # another can no longer mix their statistics.
        per_method = defaultdict(dict)
        for key, values in self.metrics_history.items():
            method, metric_name = self._split_history_key(key)
            method_stats = per_method[method]
            if values:
                method_stats[metric_name] = {
                    'mean': np.mean(values),
                    'std': np.std(values),
                    'count': len(values)
                }

        lines = ["# 📊 REPORTE COMPARATIVO DE MÉTODOS\n\n"]
        for method in sorted(per_method):
            lines.append(f"## {method.replace('_', ' ').title()}\n")
            for metric, stats in per_method[method].items():
                lines.append(
                    f"- **{metric.replace('_', ' ').title()}**: "
                    f"{stats['mean']:.4f} ± {stats['std']:.4f} (n={stats['count']})\n"
                )
            lines.append("\n")

        return "".join(lines)


# Module-wide metrics collector shared by all recommenders.
metrics_evaluator = RecommendationMetrics()
259
+
260
+ # ==================== FUNCIONALIDAD 1: BÚSQUEDA POR DESCRIPCIÓN (3 MÉTODOS) ====================
261
+
262
class DescriptionSearcher:
    """Free-text product search with three interchangeable strategies.

    * KNN over sentence embeddings (cosine metric)
    * TF-IDF vectors compared by cosine similarity
    * KMeans cluster pre-filter followed by embedding cosine similarity

    Each ``search_*`` method returns ``(results, metrics)`` where ``results``
    is a list of dicts with the keys ``asin``, ``similarity_score`` and
    ``method``.
    """

    def __init__(self, df_products, embeddings, model):
        """Store the inputs and fit every search index.

        Args:
            df_products: Product table; ``description`` and ``parent_asin``
                columns are read.
            embeddings: One embedding per row of ``df_products``
                (assumed to be aligned by position - TODO confirm with caller).
            model: Encoder exposing ``encode(list_of_str)``.
        """
        self.df_products = df_products
        self.embeddings = embeddings
        self.model = model
        self.setup_methods()

    def setup_methods(self):
        """Fit the KNN index, the TF-IDF matrix and the KMeans clustering."""
        # Method 1: KNN over embeddings.
        self.knn = NearestNeighbors(n_neighbors=50, metric="cosine")
        self.knn.fit(self.embeddings)

        # Method 2: TF-IDF over the raw descriptions.
        descriptions_text = self.df_products["description"].fillna("").tolist()
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2),
            min_df=2
        )
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(descriptions_text)

        # Method 3: clustering. Roughly one cluster per ten products, capped
        # at 100 and never below 1 (a catalogue with fewer than ten rows would
        # otherwise ask KMeans for zero clusters).
        self.n_clusters = max(1, min(100, len(self.df_products) // 10))
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42, n_init=10)
        self.cluster_labels = self.kmeans.fit_predict(self.embeddings)

    def _collect_results(self, ranked, n_results, method_label):
        """Build result dicts from ``(row position, score)`` pairs, best first.

        Stops after ``n_results`` entries, skips positions outside
        ``df_products`` and keeps only the first occurrence of each ASIN.
        """
        results = []
        seen_asins = set()
        n_rows = len(self.df_products)

        for idx, score in ranked:
            if len(results) >= n_results:
                break
            if idx >= n_rows:
                continue

            asin = self.df_products.iloc[idx].get("parent_asin", "N/A")
            if asin in seen_asins:
                continue
            seen_asins.add(asin)

            results.append({
                'asin': asin,
                'similarity_score': score,
                'method': method_label
            })

        return results

    def search_method_1_knn(self, query, n_results=5):
        """Method 1: nearest neighbours of the query embedding."""
        start_time = time.time()

        query_embedding = self.model.encode([query])
        # Never request more neighbours than there are fitted samples.
        n_neighbors = min(50, len(self.df_products), len(self.embeddings))
        distances, indices = self.knn.kneighbors(query_embedding, n_neighbors=n_neighbors)

        # Cosine distance -> similarity.
        results = self._collect_results(
            zip(indices[0], 1 - distances[0]), n_results, 'KNN_Embeddings'
        )

        execution_time = time.time() - start_time
        metrics = metrics_evaluator.evaluate_recommendations(
            'search_knn', [r['asin'] for r in results], execution_time
        )
        return results, metrics

    def search_method_2_tfidf(self, query, n_results=5):
        """Method 2: TF-IDF cosine similarity, ignoring scores below 0.01."""
        start_time = time.time()

        query_tfidf = self.tfidf_vectorizer.transform([query])
        similarities = cosine_similarity(query_tfidf, self.tfidf_matrix).flatten()

        # Best first; drop everything under the minimum-similarity threshold
        # in one vectorised step instead of visiting each row in Python.
        order = np.argsort(similarities)[::-1]
        order = order[similarities[order] >= 0.01]

        results = self._collect_results(
            zip(order, similarities[order]), n_results, 'TF_IDF'
        )

        execution_time = time.time() - start_time
        metrics = metrics_evaluator.evaluate_recommendations(
            'search_tfidf', [r['asin'] for r in results], execution_time
        )
        return results, metrics

    def search_method_3_cluster(self, query, n_results=5):
        """Method 3: rank by embedding similarity inside the query's cluster."""
        start_time = time.time()

        query_embedding = self.model.encode([query])
        query_cluster = self.kmeans.predict(query_embedding)[0]

        # Row positions of the products assigned to that cluster.
        cluster_indices = np.where(self.cluster_labels == query_cluster)[0]
        if len(cluster_indices) == 0:
            execution_time = time.time() - start_time
            return [], {'method': 'Cluster_Search', 'execution_time': execution_time}

        cluster_embeddings = self.embeddings[cluster_indices]
        similarities = cosine_similarity(query_embedding, cluster_embeddings).flatten()

        # Sort once and carry each score with its row position, so no
        # per-result lookup is needed.
        order = np.argsort(similarities)[::-1]
        results = self._collect_results(
            zip(cluster_indices[order], similarities[order]), n_results, 'Cluster_Search'
        )

        execution_time = time.time() - start_time
        metrics = metrics_evaluator.evaluate_recommendations(
            'search_cluster', [r['asin'] for r in results], execution_time
        )
        return results, metrics
432
+
433
+ # ==================== FUNCIONALIDAD 2: RECOMENDACIÓN COLABORATIVA (3 MÉTODOS) ====================
434
+
435
class CollaborativeRecommender:
    """Item-to-item collaborative filtering with three similarity sources.

    * cosine similarity of TruncatedSVD item factors
    * cosine similarity of NMF item factors
    * cosine similarity of the raw item columns of the user-item matrix

    Each ``recommend_*`` method returns ``(recommendations, metrics)`` where
    ``recommendations`` is a list of dicts with the keys ``asin``,
    ``similarity_score`` and ``method``.
    """

    def __init__(self, ratings_df, min_ratings_per_user=5, min_ratings_per_item=5):
        """Filter the ratings and precompute every similarity structure.

        Args:
            ratings_df: Table with ``user_id``, ``parent_asin`` and ``rating``
                columns; it is copied, not modified.
            min_ratings_per_user: Users with fewer ratings are dropped.
            min_ratings_per_item: Items with fewer ratings are dropped.

        Raises:
            ValueError: if fewer than two users or two items remain after
                filtering.
        """
        self.ratings_df = ratings_df.copy()
        self.min_ratings_per_user = min_ratings_per_user
        self.min_ratings_per_item = min_ratings_per_item
        self.user_item_matrix = None
        self.item_similarity_matrix = None
        self.svd_model = None
        self.nmf_model = None
        self.user_encoder = {}
        self.item_encoder = {}
        self.user_decoder = {}
        self.item_decoder = {}

        self._prepare_data()
        self._build_matrices()

    def _prepare_data(self):
        """Drop sparse users/items and build the id <-> matrix index encoders."""
        print("Preparando datos para filtrado colaborativo...")

        # Both counts are taken on the unfiltered table, then applied together.
        user_counts = self.ratings_df['user_id'].value_counts()
        valid_users = user_counts[user_counts >= self.min_ratings_per_user].index

        item_counts = self.ratings_df['parent_asin'].value_counts()
        valid_items = item_counts[item_counts >= self.min_ratings_per_item].index

        self.ratings_df = self.ratings_df[
            (self.ratings_df['user_id'].isin(valid_users)) &
            (self.ratings_df['parent_asin'].isin(valid_items))
        ]

        print(f"Datos filtrados: {len(self.ratings_df):,} ratings, "
              f"{self.ratings_df['user_id'].nunique():,} usuarios, "
              f"{self.ratings_df['parent_asin'].nunique():,} productos")

        unique_users = self.ratings_df['user_id'].unique()
        unique_items = self.ratings_df['parent_asin'].unique()

        self.user_encoder = {user: idx for idx, user in enumerate(unique_users)}
        self.item_encoder = {item: idx for idx, item in enumerate(unique_items)}
        self.user_decoder = {idx: user for user, idx in self.user_encoder.items()}
        self.item_decoder = {idx: item for item, idx in self.item_encoder.items()}

    def _build_matrices(self):
        """Build the sparse user-item matrix and the three similarity sources."""
        print("Construyendo matrices de interacción...")

        n_users = len(self.user_encoder)
        n_items = len(self.item_encoder)

        # Both factorizations use at most min(n_users, n_items) - 1
        # components; report too little data clearly instead of letting
        # scikit-learn fail on a non-positive component count.
        max_rank = min(n_users, n_items) - 1
        if max_rank < 1:
            raise ValueError(
                "Datos insuficientes para filtrado colaborativo: "
                f"{n_users} usuarios y {n_items} productos tras el filtrado "
                "(se requieren al menos 2 de cada uno)"
            )

        user_indices = self.ratings_df['user_id'].map(self.user_encoder)
        item_indices = self.ratings_df['parent_asin'].map(self.item_encoder)
        ratings = self.ratings_df['rating'].values

        # Rows are users, columns are items.
        self.user_item_matrix = csr_matrix(
            (ratings, (user_indices, item_indices)),
            shape=(n_users, n_items)
        )

        # Method 1: SVD item factors -> item-item cosine similarity.
        print("Calculando similitudes SVD...")
        self.svd_model = TruncatedSVD(
            n_components=min(50, max_rank),
            random_state=42
        )
        item_features_svd = self.svd_model.fit_transform(self.user_item_matrix.T)
        self.item_similarity_matrix = cosine_similarity(item_features_svd)

        # Method 2: NMF factorization; item factors are the transposed components.
        print("Calculando factorización NMF...")
        self.nmf_model = NMF(
            n_components=min(30, max_rank),
            random_state=42,
            max_iter=200
        )
        self.user_features_nmf = self.nmf_model.fit_transform(self.user_item_matrix)
        self.item_features_nmf = self.nmf_model.components_.T

        # Method 3: cosine similarity between raw item columns.
        print("Calculando similitud directa...")
        self.item_similarity_direct = cosine_similarity(self.user_item_matrix.T)

        print("Matrices construidas exitosamente")

    @staticmethod
    def _rank_neighbors(similarities, target_idx, n_recommendations):
        """Return the indices of the most similar items, best first, without the target.

        The target is removed by index rather than by assuming it sorts
        first: with tied scores, or a target whose self-similarity is not
        the maximum, it could otherwise be recommended to itself while a
        real neighbour is dropped.
        """
        order = np.argsort(similarities)[::-1]
        return order[order != target_idx][:n_recommendations]

    def _recommend(self, target_item, similarity_row, n_recommendations, method_label, metrics_name):
        """Shared pipeline of the three public recommend methods.

        Args:
            target_item: ASIN to find neighbours for.
            similarity_row: Callable mapping the target's matrix index to a
                1-D array of similarities against every item.
            n_recommendations: Maximum number of neighbours to return.
            method_label: Value stored under ``'method'`` in each result.
            metrics_name: Method name passed to the global metrics evaluator.
        """
        start_time = time.time()

        if target_item not in self.item_encoder:
            return [], {'method': method_label, 'execution_time': 0}

        target_idx = self.item_encoder[target_item]
        similarities = similarity_row(target_idx)

        recommendations = [
            {
                'asin': self.item_decoder[idx],
                'similarity_score': similarities[idx],
                'method': method_label
            }
            for idx in self._rank_neighbors(similarities, target_idx, n_recommendations)
        ]

        execution_time = time.time() - start_time
        metrics = metrics_evaluator.evaluate_recommendations(
            metrics_name, [r['asin'] for r in recommendations], execution_time
        )
        return recommendations, metrics

    def recommend_method_1_svd(self, target_item, n_recommendations=4):
        """Method 1: neighbours by cosine similarity of SVD item factors."""
        return self._recommend(
            target_item,
            lambda idx: self.item_similarity_matrix[idx],
            n_recommendations,
            'SVD_Collaborative',
            'collab_svd',
        )

    def recommend_method_2_nmf(self, target_item, n_recommendations=4):
        """Method 2: neighbours by cosine similarity of NMF item factors."""
        return self._recommend(
            target_item,
            lambda idx: cosine_similarity(
                [self.item_features_nmf[idx]], self.item_features_nmf
            ).flatten(),
            n_recommendations,
            'NMF_Collaborative',
            'collab_nmf',
        )

    def recommend_method_3_direct(self, target_item, n_recommendations=4):
        """Method 3: neighbours by cosine similarity of raw rating columns."""
        return self._recommend(
            target_item,
            lambda idx: self.item_similarity_direct[idx],
            n_recommendations,
            'Direct_Collaborative',
            'collab_direct',
        )

    def get_available_items(self):
        """Return the ASINs that survived filtering and can be used as targets."""
        return list(self.item_encoder.keys())
633
+
634
+ # ==================== FUNCIONALIDAD 3: RECOMENDACIÓN BASADA EN CLIENTE (3 MÉTODOS) ====================
635
+
636
+ class ClientBasedRecommender:
637
+ """Sistema de recomendación basado en productos seleccionados por un cliente"""
638
+
639
+ def __init__(self, df_products, embeddings, ratings_dict):
640
+ self.df_products = df_products
641
+ self.embeddings = embeddings
642
+ self.ratings_dict = ratings_dict
643
+ self.setup_methods()
644
+
645
+ def setup_methods(self):
646
+ """Configura mapeo ASIN -> índice posicional"""
647
+ self.asin_to_idx = {}
648
+
649
+ parent_asins = self.df_products["parent_asin"].values
650
+
651
+ for idx, asin in enumerate(parent_asins):
652
+ if pd.notna(asin):
653
+ self.asin_to_idx[asin] = idx
654
+
655
+ self.prepare_content_features()
656
+
657
+ # Verificación crítica
658
+ assert len(self.embeddings) == len(self.df_products), \
659
+ f"ERROR: embeddings ({len(self.embeddings)}) y dataframe ({len(self.df_products)}) NO coinciden."
660
+
661
+
662
+ def prepare_content_features(self):
663
+ """Prepara características de contenido para recomendaciones"""
664
+ # Extraer categorías principales
665
+ categories = self.df_products.get('main_category', pd.Series(['Unknown'] * len(self.df_products)))
666
+ self.unique_categories = list(set(categories.fillna('Unknown')))
667
+
668
+ # Crear matriz de características categóricas
669
+ self.category_features = np.zeros((len(self.df_products), len(self.unique_categories)))
670
+ for idx, category in enumerate(categories.fillna('Unknown')):
671
+ if category in self.unique_categories:
672
+ cat_idx = self.unique_categories.index(category)
673
+ self.category_features[idx, cat_idx] = 1
674
+
675
+ def recommend_method_1_profile_similarity(self, selected_asins, n_recommendations=5):
676
+ """Método 1: Perfil de usuario basado en similitud de embeddings"""
677
+ start_time = time.time()
678
+
679
+ if not selected_asins:
680
+ return [], {'method': 'Profile_Similarity', 'execution_time': 0}
681
+
682
+ # Obtener embeddings de productos seleccionados
683
+ selected_embeddings = []
684
+ valid_asins = []
685
+
686
+ for asin in selected_asins:
687
+ if asin in self.asin_to_idx:
688
+ idx = self.asin_to_idx[asin]
689
+ selected_embeddings.append(self.embeddings[idx])
690
+ valid_asins.append(asin)
691
+
692
+ if not selected_embeddings:
693
+ return [], {'method': 'Profile_Similarity', 'execution_time': 0}
694
+
695
+ # Crear perfil de usuario como promedio de embeddings
696
+ user_profile = np.mean(selected_embeddings, axis=0)
697
+
698
+ # Calcular similitudes con todos los productos
699
+ similarities = cosine_similarity([user_profile], self.embeddings).flatten()
700
+
701
+ # Excluir productos ya seleccionados
702
+ excluded_indices = [self.asin_to_idx[asin] for asin in valid_asins if asin in self.asin_to_idx]
703
+ for idx in excluded_indices:
704
+ similarities[idx] = -1
705
+
706
+ # Obtener top recomendaciones
707
+ top_indices = np.argsort(similarities)[::-1][:n_recommendations]
708
+
709
+ recommendations = []
710
+ for idx in top_indices:
711
+ if similarities[idx] <= 0:
712
+ continue
713
+
714
+ row = self.df_products.iloc[idx]
715
+ asin = row.get('parent_asin')
716
+ if asin:
717
+ recommendations.append({
718
+ 'asin': asin,
719
+ 'similarity_score': similarities[idx],
720
+ 'method': 'Profile_Similarity'
721
+ })
722
+
723
+ execution_time = time.time() - start_time
724
+
725
+ # Evaluar con métricas
726
+ result_asins = [r['asin'] for r in recommendations]
727
+ metrics = metrics_evaluator.evaluate_recommendations(
728
+ 'client_profile', result_asins, execution_time
729
+ )
730
+
731
+ return recommendations, metrics
732
+
733
def recommend_method_2_weighted_categories(self, selected_asins, n_recommendations=5):
    """Method 2: recommend products using rating-weighted category preferences.

    Every selected ASIN present in ``self.asin_to_idx`` adds ``rating / 5`` to
    the weight of its ``main_category`` (the rating defaults to 3.0 when the
    ASIN is missing from ``self.ratings_dict``). The weights are normalized to
    sum to 1, and every catalogue product that was not selected is scored as
    ``0.7 * category_weight + 0.3 * (product_rating / 5)``.

    Args:
        selected_asins: ASINs the client has already chosen.
        n_recommendations: Maximum number of recommendations to return.

    Returns:
        A ``(recommendations, metrics)`` tuple. ``recommendations`` is a list
        of dicts with the keys ``asin``, ``similarity_score`` and ``method``,
        ordered by descending score. ``metrics`` is the value returned by
        ``metrics_evaluator.evaluate_recommendations``, or a minimal dict when
        there is nothing to recommend from.
    """
    start_time = time.time()

    if not selected_asins:
        return [], {'method': 'Weighted_Categories', 'execution_time': 0}

    # Accumulate one rating-based weight per category of the selected products.
    category_weights = defaultdict(float)
    for asin in selected_asins:
        if asin in self.asin_to_idx:
            row = self.df_products.iloc[self.asin_to_idx[asin]]
            category = row.get('main_category', 'Unknown')
            rating = self.ratings_dict.get(asin, 3.0)
            category_weights[category] += rating / 5.0  # scale rating to 0..1

    if not category_weights:
        return [], {'method': 'Weighted_Categories', 'execution_time': 0}

    # Normalize the weights so they sum to 1. If every selected product has a
    # stored rating of 0 the total is 0; fall back to uniform weights instead
    # of dividing by zero.
    total_weight = sum(category_weights.values())
    if total_weight > 0:
        for category in category_weights:
            category_weights[category] /= total_weight
    else:
        uniform_weight = 1.0 / len(category_weights)
        for category in category_weights:
            category_weights[category] = uniform_weight

    # Score every product that the client has not already selected.
    product_scores = []
    excluded_asins = set(selected_asins)

    for _, row in self.df_products.iterrows():
        asin = row.get('parent_asin')
        # `asin != asin` is only true for NaN (a missing ASIN read by pandas).
        if not asin or asin != asin or asin in excluded_asins:
            continue

        category = row.get('main_category', 'Unknown')
        category_score = category_weights.get(category, 0.0)

        # Blend the category preference with the product's own rating.
        product_rating = self.ratings_dict.get(asin, 0.0)
        final_score = category_score * 0.7 + (product_rating / 5.0) * 0.3

        product_scores.append({
            'asin': asin,
            'similarity_score': final_score,
            'method': 'Weighted_Categories'
        })

    # Keep the N best-scoring products.
    product_scores.sort(key=lambda item: item['similarity_score'], reverse=True)
    recommendations = product_scores[:n_recommendations]

    execution_time = time.time() - start_time

    # Record the run in the shared metrics evaluator.
    result_asins = [rec['asin'] for rec in recommendations]
    metrics = metrics_evaluator.evaluate_recommendations(
        'client_categories', result_asins, execution_time
    )

    return recommendations, metrics
798
+
799
def recommend_method_3_hybrid_approach(self, selected_asins, n_recommendations=5):
    """Method 3: hybrid recommendation from embeddings, categories and ratings.

    A client profile is built as the rating-weighted average of the embeddings
    of the selected products. Every other catalogue product is then scored as
    ``0.5 * cosine(profile, embedding) + 0.3 * category_share +
    0.2 * (rating / 5)``, where ``category_share`` is the fraction of the
    selected products that belong to the candidate's ``main_category``.

    Args:
        selected_asins: ASINs the client has already chosen.
        n_recommendations: Maximum number of recommendations to return.

    Returns:
        A ``(recommendations, metrics)`` tuple. Each recommendation is a dict
        with ``asin``, ``similarity_score``, ``method`` plus the three partial
        scores ``embedding_sim``, ``category_score`` and ``rating_score``,
        ordered by descending ``similarity_score``. ``metrics`` is the value
        returned by ``metrics_evaluator.evaluate_recommendations``, or a
        minimal dict when there is nothing to recommend from.
    """
    start_time = time.time()

    if not selected_asins:
        return [], {'method': 'Hybrid_Approach', 'execution_time': 0}

    # Step 1: collect embedding, category and rating of each known selection.
    selected_embeddings = []
    selected_categories = []
    selected_ratings = []

    for asin in selected_asins:
        if asin in self.asin_to_idx:
            idx = self.asin_to_idx[asin]
            row = self.df_products.iloc[idx]

            selected_embeddings.append(self.embeddings[idx])
            selected_categories.append(row.get('main_category', 'Unknown'))
            selected_ratings.append(self.ratings_dict.get(asin, 3.0))

    if not selected_embeddings:
        return [], {'method': 'Hybrid_Approach', 'execution_time': 0}

    # Rating-weighted average profile. When every selected rating is 0 the
    # weights cannot be normalized, so fall back to a plain average.
    weights = np.asarray(selected_ratings, dtype=float) / 5.0
    weight_sum = weights.sum()
    if weight_sum > 0:
        weights = weights / weight_sum
    else:
        weights = np.full(len(selected_ratings), 1.0 / len(selected_ratings))

    user_profile = np.average(selected_embeddings, axis=0, weights=weights)

    # Step 2: category preferences as selection counts.
    category_preferences = Counter(selected_categories)
    total_selections = len(selected_categories)

    # Step 3: score every candidate. All cosine similarities are computed in
    # one call instead of one call per catalogue row.
    # NOTE(review): assumes self.embeddings is a 2-D array whose rows follow
    # the row order of self.df_products - confirm against the constructor.
    embedding_similarities = cosine_similarity([user_profile], self.embeddings)[0]

    candidate_scores = []
    excluded_asins = set(selected_asins)

    # `position` is the positional row number, matching how asin_to_idx is
    # used with .iloc above; the iterrows() label is not a safe array index.
    for position, (_, row) in enumerate(self.df_products.iterrows()):
        asin = row.get('parent_asin')
        if not asin or asin in excluded_asins:
            continue

        embedding_similarity = float(embedding_similarities[position])

        category = row.get('main_category', 'Unknown')
        category_score = category_preferences.get(category, 0) / total_selections

        product_rating = self.ratings_dict.get(asin, 0.0)
        rating_score = product_rating / 5.0

        # Weighted combination of the three signals.
        hybrid_score = (
            embedding_similarity * 0.5 +
            category_score * 0.3 +
            rating_score * 0.2
        )

        candidate_scores.append({
            'asin': asin,
            'similarity_score': hybrid_score,
            'method': 'Hybrid_Approach',
            'embedding_sim': embedding_similarity,
            'category_score': category_score,
            'rating_score': rating_score
        })

    # Keep the N best-scoring candidates.
    candidate_scores.sort(key=lambda item: item['similarity_score'], reverse=True)
    recommendations = candidate_scores[:n_recommendations]

    execution_time = time.time() - start_time

    # Record the run in the shared metrics evaluator.
    result_asins = [rec['asin'] for rec in recommendations]
    metrics = metrics_evaluator.evaluate_recommendations(
        'client_hybrid', result_asins, execution_time
    )

    return recommendations, metrics
884
+
885
+ # ==================== FUNCIONES DE UTILIDAD ====================
886
def clean_description(description):
    """Return a display-ready product description.

    When the whole text is wrapped in one pair of square brackets (for example
    a stringified list) only the outer brackets are removed; otherwise every
    ``[...]`` fragment is removed together with its content. Runs of
    whitespace are collapsed to single spaces.

    Args:
        description: Raw description. Non-string values are accepted: ``None``
            and NaN count as missing, anything else is converted with ``str``.

    Returns:
        The cleaned text, or ``"Sin descripción"`` when nothing is left.
    """
    fallback = "Sin descripción"

    if description is None:
        return fallback

    if not isinstance(description, str):
        # pandas yields float NaN for missing cells; NaN is the only value
        # that is not equal to itself.
        if isinstance(description, float) and description != description:
            return fallback
        description = str(description)

    stripped = description.strip()
    if not stripped:
        return fallback

    if stripped.startswith('[') and stripped.endswith(']'):
        cleaned = stripped[1:-1].strip()
    else:
        cleaned = re.sub(r'\[.*?\]', '', description)

    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned if cleaned else fallback
898
+
899
def get_best_image_url(row):
    """Return the first usable image URL found in a product row.

    The columns ``image_urls_best``, ``image_urls_large`` and
    ``image_urls_all`` are checked in that order. A string cell is parsed as
    JSON; any other cell is used as-is. The first non-empty string starting
    with ``"http"`` inside a non-empty list wins. A placeholder image URL is
    returned when no column provides one.
    """
    for column in ('image_urls_best', 'image_urls_large', 'image_urls_all'):
        if column not in row:
            continue

        try:
            if isinstance(row[column], str):
                candidates = json.loads(row[column])
            else:
                candidates = row[column]

            if not (isinstance(candidates, list) and candidates):
                continue

            for url in candidates:
                if url and isinstance(url, str) and url.startswith("http"):
                    return url
        except (json.JSONDecodeError, TypeError, ValueError):
            # Unparseable cell: try the next column.
            continue

    return "https://via.placeholder.com/300x300.png?text=No+Image"
915
+
916
def get_product_rating(asin):
    """Look up a product's rating in the module-level ``ratings_dict``.

    Returns 0.0 when the ASIN has no entry.
    """
    rating = ratings_dict.get(asin, 0.0)
    return rating
919
+
920
def get_product_info_by_asin(asin):
    """Collect the display fields of the product identified by ``asin``.

    The product is looked up in ``df_similars`` by its ``parent_asin`` column
    and the first matching row is used.

    Returns:
        A dict with the keys ``asin``, ``title``, ``description``, ``rating``,
        ``image_url`` and ``category``, or ``None`` when no row matches.
    """
    matches = df_similars[df_similars['parent_asin'] == asin]
    if matches.empty:
        return None

    product = matches.iloc[0]

    info = {'asin': asin}
    info['title'] = product.get('title', 'Sin título')
    info['description'] = clean_description(product.get('description', ''))
    info['rating'] = get_product_rating(asin)
    info['image_url'] = get_best_image_url(product)
    info['category'] = product.get('main_category', 'Unknown')
    return info
935
+
936
+ # ==================== INICIALIZACIÓN DE SISTEMAS ====================
937
+
938
# Description-based search system.
description_searcher = DescriptionSearcher(df_similars, description_embeddings, model)

# Collaborative system: only built when detailed ratings were loaded;
# otherwise the name is bound to None so later code can test for it.
collaborative_recommender = None
if HAS_DETAILED_RATINGS:
    print("Inicializando sistema de recomendación colaborativo...")
    collaborative_recommender = CollaborativeRecommender(df_ratings_detailed)
else:
    print("Sistema colaborativo no disponible (requiere ratings detallados)")

# Client-based system.
client_recommender = ClientBasedRecommender(df_similars, description_embeddings, ratings_dict)
951
+
952
+ # ==================== FUNCIONES DE INTERFAZ MEJORADAS ====================
953
+
954
def search_products_enhanced(descripcion_input, method_choice, max_images_per_product=2, target_products=5):
    """Search products by free-text description with a selectable method.

    Args:
        descripcion_input: Text describing the wanted product. ``None``, empty
            and whitespace-only values return a single placeholder entry.
        method_choice: ``"KNN + Embeddings"``, ``"TF-IDF + Cosine"`` or
            ``"Clustering + Embeddings"``; any other value runs all three
            methods and merges their results by descending score.
        max_images_per_product: Accepted for interface compatibility; the
            function does not use it.
        target_products: Maximum number of products to return.

    Returns:
        A list of ``(image_url, caption)`` tuples for a gallery.
    """
    if not descripcion_input or not descripcion_input.strip():
        return [("https://via.placeholder.com/300.png?text=Vacío", "Por favor escribe algo para buscar...")]

    # Defensive: a slider value may arrive as a float, which cannot be used
    # as a slice bound.
    target_products = int(target_products)

    if method_choice == "KNN + Embeddings":
        results, _ = description_searcher.search_method_1_knn(descripcion_input, target_products)
    elif method_choice == "TF-IDF + Cosine":
        results, _ = description_searcher.search_method_2_tfidf(descripcion_input, target_products)
    elif method_choice == "Clustering + Embeddings":
        results, _ = description_searcher.search_method_3_cluster(descripcion_input, target_products)
    else:
        # Compare all methods. Ask each one for enough candidates so that the
        # merged list can reach `target_products`; a fixed 2 per method capped
        # the output at 6. For targets up to 6 this is still 2 per method.
        per_method = max(2, (target_products + 2) // 3)
        results_knn, _ = description_searcher.search_method_1_knn(descripcion_input, per_method)
        results_tfidf, _ = description_searcher.search_method_2_tfidf(descripcion_input, per_method)
        results_cluster, _ = description_searcher.search_method_3_cluster(descripcion_input, per_method)

        all_results = results_knn + results_tfidf + results_cluster
        results = sorted(all_results, key=lambda item: item['similarity_score'], reverse=True)[:target_products]

    # Convert the results to gallery entries; unknown ASINs are skipped.
    gallery_results = []
    for result in results:
        product_info = get_product_info_by_asin(result['asin'])
        if product_info:
            texto = f"🔍 Método: {result['method']}\n"
            texto += f"📦 {product_info['title']}\n"
            texto += f"⭐ Rating: {product_info['rating']:.2f}\n"
            texto += f"🎯 Similitud: {result['similarity_score']:.3f}\n"
            texto += f"📂 Categoría: {product_info['category']}\n\n"
            texto += f"📝 {product_info['description'][:200]}{'...' if len(product_info['description']) > 200 else ''}"

            gallery_results.append((product_info['image_url'], texto))

    return gallery_results
998
+
999
def get_collaborative_recommendations_enhanced(selected_product_asin, method_choice):
    """Produce collaborative recommendations for a base product.

    ``method_choice`` may be ``"SVD"``, ``"NMF"`` or ``"Direct Similarity"``;
    any other value combines two SVD, two NMF and one direct-similarity
    recommendation. The result is a list of ``(image_url, caption)`` tuples
    for a gallery; a single placeholder entry is returned when the
    collaborative system is unavailable, no product is selected, or nothing
    was recommended.
    """
    if not collaborative_recommender:
        return [("https://via.placeholder.com/300.png?text=No+Disponible", "Sistema colaborativo no disponible")]

    if not selected_product_asin:
        return [("https://via.placeholder.com/300.png?text=Vacío", "Por favor selecciona un producto...")]

    # Map each UI label to the recommender method that implements it.
    single_methods = {
        "SVD": "recommend_method_1_svd",
        "NMF": "recommend_method_2_nmf",
        "Direct Similarity": "recommend_method_3_direct",
    }

    if method_choice in single_methods:
        run_method = getattr(collaborative_recommender, single_methods[method_choice])
        recommendations, _metrics = run_method(selected_product_asin)
    else:
        # Compare all methods.
        from_svd, _ = collaborative_recommender.recommend_method_1_svd(selected_product_asin, 2)
        from_nmf, _ = collaborative_recommender.recommend_method_2_nmf(selected_product_asin, 2)
        from_direct, _ = collaborative_recommender.recommend_method_3_direct(selected_product_asin, 1)
        recommendations = from_svd + from_nmf + from_direct

    if not recommendations:
        return [("https://via.placeholder.com/300.png?text=Sin+Recomendaciones", "No se encontraron recomendaciones para este producto.")]

    # Build the gallery entries; unknown ASINs are skipped.
    gallery_results = []
    for rec in recommendations:
        info = get_product_info_by_asin(rec['asin'])
        if not info:
            continue

        caption_parts = [
            f"🤝 Método: {rec['method']}\n",
            f"📦 {info['title']}\n",
            f"⭐ Rating: {info['rating']:.2f}\n",
            f"🎯 Similitud: {rec['similarity_score']:.3f}\n",
            f"📂 Categoría: {info['category']}\n\n",
            f"📝 {info['description'][:200]}{'...' if len(info['description']) > 200 else ''}",
        ]
        gallery_results.append((info['image_url'], "".join(caption_parts)))

    return gallery_results
1041
+
1042
def get_client_recommendations(selected_asins_text, method_choice, n_recommendations=5):
    """Produce client-based recommendations from a free-text list of ASINs.

    Args:
        selected_asins_text: ASINs separated by commas, semicolons, spaces or
            new lines. Punctuation around each ASIN (for example the period
            that ends a pasted sentence) is ignored.
        method_choice: ``"Profile Similarity"``, ``"Weighted Categories"`` or
            ``"Hybrid Approach"``; any other value combines two profile, two
            category and one hybrid recommendation.
        n_recommendations: Number of recommendations requested from a single
            method.

    Returns:
        A list of ``(image_url, caption)`` tuples for a gallery; a single
        placeholder entry when the input is empty, unparseable or yields no
        recommendation.
    """
    if not selected_asins_text or not selected_asins_text.strip():
        return [("https://via.placeholder.com/300.png?text=Vacío", "Por favor ingresa ASINs de productos...")]

    # Parse the ASINs. Leading/trailing non-alphanumeric characters are
    # stripped so that "B0BKBJT5MM." is read as "B0BKBJT5MM" instead of being
    # silently ignored later as an unknown product.
    selected_asins = []
    for token in re.split(r'[,;\s]+', selected_asins_text.strip()):
        asin = re.sub(r'^\W+|\W+$', '', token)
        if asin:
            selected_asins.append(asin)

    if not selected_asins:
        return [("https://via.placeholder.com/300.png?text=Error", "No se pudieron parsear los ASINs")]

    # Defensive: a slider value may arrive as a float, which cannot be used
    # as a slice bound by the recommender methods.
    n_recommendations = int(n_recommendations)

    if method_choice == "Profile Similarity":
        recommendations, _ = client_recommender.recommend_method_1_profile_similarity(selected_asins, n_recommendations)
    elif method_choice == "Weighted Categories":
        recommendations, _ = client_recommender.recommend_method_2_weighted_categories(selected_asins, n_recommendations)
    elif method_choice == "Hybrid Approach":
        recommendations, _ = client_recommender.recommend_method_3_hybrid_approach(selected_asins, n_recommendations)
    else:
        # Compare all methods.
        rec_profile, _ = client_recommender.recommend_method_1_profile_similarity(selected_asins, 2)
        rec_categories, _ = client_recommender.recommend_method_2_weighted_categories(selected_asins, 2)
        rec_hybrid, _ = client_recommender.recommend_method_3_hybrid_approach(selected_asins, 1)

        recommendations = rec_profile + rec_categories + rec_hybrid

    if not recommendations:
        return [("https://via.placeholder.com/300.png?text=Sin+Recomendaciones", "No se encontraron recomendaciones para los productos seleccionados.")]

    # Convert the recommendations to gallery entries; unknown ASINs are skipped.
    gallery_results = []
    for rec in recommendations:
        product_info = get_product_info_by_asin(rec['asin'])
        if product_info:
            texto = f"👤 Método: {rec['method']}\n"
            texto += f"📦 {product_info['title']}\n"
            texto += f"⭐ Rating: {product_info['rating']:.2f}\n"
            texto += f"🎯 Score: {rec['similarity_score']:.3f}\n"
            texto += f"📂 Categoría: {product_info['category']}\n\n"

            # The hybrid method also reports its three partial scores.
            if 'embedding_sim' in rec:
                texto += f"🔗 Sim. Embedding: {rec['embedding_sim']:.3f}\n"
                texto += f"📂 Score Categoría: {rec['category_score']:.3f}\n"
                texto += f"⭐ Score Rating: {rec['rating_score']:.3f}\n\n"

            texto += f"📝 {product_info['description'][:150]}{'...' if len(product_info['description']) > 150 else ''}"

            gallery_results.append((product_info['image_url'], texto))

    return gallery_results
1098
+
1099
def get_product_options():
    """Build the ``(label, asin)`` choices for the base-product dropdown.

    Only the first 100 ASINs reported by the collaborative recommender are
    inspected (to keep the lookup cheap), and only products with a known,
    positive rating are listed. Titles longer than 50 characters are
    truncated and marked with ``...``.

    Returns:
        A list of ``(label, asin)`` tuples, or a single
        ``("Sistema no disponible", "")`` entry when the collaborative system
        is not available.
    """
    if not collaborative_recommender:
        return [("Sistema no disponible", "")]

    available_asins = collaborative_recommender.get_available_items()
    options = []

    for asin in available_asins[:100]:  # limited for performance
        product_info = get_product_info_by_asin(asin)
        if product_info and product_info['rating'] > 0:
            # The title may be a non-string value (e.g. a missing cell);
            # coerce it so slicing cannot fail.
            title = str(product_info['title'])
            short_title = f"{title[:50]}..." if len(title) > 50 else title
            label = f"{short_title} (Rating: {product_info['rating']:.1f})"
            options.append((label, asin))

    return options
1114
+
1115
def get_metrics_report():
    """Return the comparison report of the module-level ``metrics_evaluator``.

    The value is shown as-is in the metrics tab of the interface.
    """
    report = metrics_evaluator.get_comparison_report()
    return report
1118
+
1119
+ # ==================== INTERFAZ GRADIO MEJORADA ====================
1120
def create_enhanced_interface():
    """Build the Gradio interface with search, recommendation and metrics tabs.

    Tabs: description search, collaborative recommendations (only when
    ``collaborative_recommender`` is available), client-based recommendations,
    and metrics/statistics. Button events are wired to the module-level
    handler functions.

    Returns:
        The configured ``gr.Blocks`` application (not yet launched).
    """
    app_title = "🚀 Advanced Product Recommendation System"

    with gr.Blocks(title=app_title, theme=gr.themes.Soft()) as demo:
        # Markdown is assembled from explicit lines so that source indentation
        # can never leak into the rendered text.
        gr.Markdown("\n".join([
            f"# {app_title}",
            "",
            "**Funcionalidades disponibles:**",
            "- 🔍 **Búsqueda por Descripción** (3 métodos: KNN+Embeddings, TF-IDF+Cosine, Clustering+Embeddings)",
            "- 🤝 **Recomendación Colaborativa** (3 métodos: SVD, NMF, Direct Similarity)",
            "- 👤 **Recomendación Basada en Cliente** (3 métodos: Profile Similarity, Weighted Categories, Hybrid Approach)",
            "- 📊 **Métricas y Comparación** en tiempo real",
        ]))

        with gr.Tabs():
            # TAB 1: description search
            with gr.TabItem("🔍 Búsqueda por Descripción"):
                gr.Markdown("### Describe el producto que buscas en inglés y selecciona el método de búsqueda")

                with gr.Row():
                    with gr.Column(scale=1):
                        descripcion_input = gr.Textbox(
                            label="Describe your ideal product",
                            placeholder="exp: Handmade shungite bead bracelet, Silver necklace, etc."
                        )
                        search_method = gr.Dropdown(
                            choices=["KNN + Embeddings", "TF-IDF + Cosine", "Clustering + Embeddings", "Comparar Todos"],
                            value="KNN + Embeddings",
                            label="Método de búsqueda"
                        )
                        max_images = gr.Slider(
                            minimum=1, maximum=3, value=2, step=1,
                            label="Máximo de imágenes por producto"
                        )
                        num_products = gr.Slider(
                            minimum=1, maximum=10, value=5, step=1,
                            label="Número de productos a mostrar"
                        )
                        search_btn = gr.Button("🔍 Buscar Productos", variant="primary", size="lg")

                    with gr.Column(scale=2):
                        search_gallery = gr.Gallery(
                            label="Productos Encontrados",
                            columns=3,
                            rows=2,
                            height="auto"
                        )

            # TAB 2: collaborative recommendations (only when available)
            if collaborative_recommender:
                with gr.TabItem("🤝 Recomendaciones Colaborativas"):
                    gr.Markdown("### Selecciona un producto base y el método de recomendación colaborativa")

                    with gr.Row():
                        with gr.Column(scale=1):
                            product_dropdown = gr.Dropdown(
                                choices=get_product_options(),
                                label="Selecciona un producto base",
                                value=None
                            )
                            collab_method = gr.Dropdown(
                                choices=["SVD", "NMF", "Direct Similarity", "Comparar Todos"],
                                value="SVD",
                                label="Método colaborativo"
                            )
                            recommend_btn = gr.Button("🤝 Obtener Recomendaciones", variant="primary", size="lg")
                            refresh_products_btn = gr.Button("🔄 Actualizar Lista")

                        with gr.Column(scale=2):
                            recommendations_gallery = gr.Gallery(
                                label="Recomendaciones Colaborativas",
                                columns=2,
                                rows=2,
                                height="auto"
                            )

            # TAB 3: client-based recommendations
            with gr.TabItem("👤 Recomendaciones Basadas en Cliente"):
                # The example lists carry no trailing period so they can be
                # pasted into the input box verbatim.
                gr.Markdown("\n".join([
                    "### Ingresa los ASINs de productos que un cliente ha seleccionado",
                    "**Formato:** Separa los ASINs con comas, espacios o nuevas líneas",
                    "**Ejemplo 1:** B07NTK7T5P, B0751M85FV, B01HYNE114, B0BKBJT5MM",
                    "**Ejemplo 2:** B01BAN3CBE, B0754TWHPT, B079KM6HDM, B097B8WH61",
                    "**Ejemplo 3:** B0B8WK62Z3, B01BYCH44W, B0BGNQ3CLH, B084L4PF4M",
                ]))

                with gr.Row():
                    with gr.Column(scale=1):
                        client_asins_input = gr.Textbox(
                            label="ASINs de productos seleccionados por el cliente",
                            placeholder="Insert here the product´s ID´s to get other products you might enjoy!",
                            lines=3
                        )
                        client_method = gr.Dropdown(
                            choices=["Profile Similarity", "Weighted Categories", "Hybrid Approach", "Comparar Todos"],
                            value="Hybrid Approach",
                            label="Método de recomendación"
                        )
                        client_num_recs = gr.Slider(
                            minimum=1, maximum=10, value=5, step=1,
                            label="Número de recomendaciones"
                        )
                        client_recommend_btn = gr.Button("👤 Generar Recomendaciones", variant="primary", size="lg")

                        with gr.Accordion("ℹ️ Información de Métodos", open=False):
                            gr.Markdown("\n".join([
                                "**Profile Similarity:** Crea un perfil promedio basado en los embeddings de los productos seleccionados",
                                "",
                                "**Weighted Categories:** Recomienda basándose en las categorías más frecuentes, ponderadas por rating",
                                "",
                                "**Hybrid Approach:** Combina embeddings, categorías y ratings con pesos optimizados",
                            ]))

                    with gr.Column(scale=2):
                        client_gallery = gr.Gallery(
                            label="Recomendaciones para el Cliente",
                            columns=3,
                            rows=2,
                            height="auto"
                        )

            # TAB 4: metrics and comparison
            with gr.TabItem("📊 Métricas y Comparación"):
                gr.Markdown("### Análisis de rendimiento y comparación de métodos")

                with gr.Row():
                    with gr.Column():
                        metrics_btn = gr.Button("📊 Actualizar Métricas", variant="secondary")
                        clear_metrics_btn = gr.Button("🗑️ Limpiar Historial")

                metrics_output = gr.Markdown("Ejecuta algunas recomendaciones para ver las métricas...")

                # System statistics, computed once when the interface is built.
                with gr.Accordion("📈 Estadísticas del Sistema", open=False):
                    collab_stats_lines = []
                    if collaborative_recommender:
                        collab_stats_lines = [
                            f"- 🤝 Productos disponibles para recomendación colaborativa: {len(collaborative_recommender.get_available_items()):,}",
                            f"- 🧮 Dimensiones de matriz SVD: {collaborative_recommender.item_similarity_matrix.shape}",
                        ]

                    stats_lines = [
                        "**Estadísticas del Sistema Completo:**",
                        f"- 📊 Total de productos: {len(df_similars):,}",
                        f"- ⭐ Productos con ratings: {len(ratings_dict):,}",
                        f"- 🔍 Embeddings precalculados: {len(description_embeddings):,}",
                        f"- ✅ Consistencia verificada: {len(description_embeddings) == len(df_similars)}",
                        "- 🎯 Métodos de búsqueda: 3 implementados",
                        f"- 🤝 Métodos colaborativos: {'3 implementados' if collaborative_recommender else 'No disponible'}",
                        "- 👤 Métodos basados en cliente: 3 implementados",
                    ]
                    gr.Markdown("\n".join(stats_lines + collab_stats_lines))

        # ==================== EVENTS ====================

        # Description search
        search_btn.click(
            fn=search_products_enhanced,
            inputs=[descripcion_input, search_method, max_images, num_products],
            outputs=search_gallery
        )

        # Collaborative recommendations: the components only exist when the
        # collaborative system is available.
        if collaborative_recommender:
            recommend_btn.click(
                fn=get_collaborative_recommendations_enhanced,
                inputs=[product_dropdown, collab_method],
                outputs=recommendations_gallery
            )

            refresh_products_btn.click(
                fn=lambda: gr.Dropdown(choices=get_product_options()),
                outputs=product_dropdown
            )

        # Client-based recommendations
        client_recommend_btn.click(
            fn=get_client_recommendations,
            inputs=[client_asins_input, client_method, client_num_recs],
            outputs=client_gallery
        )

        # Metrics
        metrics_btn.click(
            fn=get_metrics_report,
            outputs=metrics_output
        )

        def clear_metrics():
            """Replace the global metrics evaluator with a fresh, empty one."""
            global metrics_evaluator
            metrics_evaluator = RecommendationMetrics()
            return "Historial de métricas limpiado."

        clear_metrics_btn.click(
            fn=clear_metrics,
            outputs=metrics_output
        )

    return demo
1321
+
1322
+ # ==================== LANZAMIENTO ====================
1323
if __name__ == "__main__":
    print("🚀 Iniciando sistema avanzado de recomendación...")

    # Report the loaded configuration before starting the server.
    print(f"✅ DataFrame: {len(df_similars):,} productos")
    print(f"✅ Embeddings: {description_embeddings.shape}")
    print(f"✅ Consistencia: {len(description_embeddings) == len(df_similars)}")
    print("✅ Búsqueda por descripción: 3 métodos disponibles")

    if not collaborative_recommender:
        print("⚠️ Sistema colaborativo no disponible")
    else:
        print(f"✅ Sistema colaborativo: 3 métodos con {len(collaborative_recommender.get_available_items()):,} productos")

    print("✅ Sistema basado en cliente: 3 métodos disponibles")
    print("✅ Sistema de métricas: Inicializado")

    # Build the interface and serve it on all network interfaces, port 7860.
    launch_options = {
        "share": False,
        "debug": False,
        "show_error": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo = create_enhanced_interface()
    demo.launch(**launch_options)