# --- 1. IMPORTS ---
import os
import glob
import json
import csv
import numpy as np
from sentence_transformers import SentenceTransformer
import zipfile
import xml.etree.ElementTree as ET
import gradio as gr
import shutil

# --- 2. CONFIGURATION ---
DATA_DIR = "dados"
EXTRACT_DIR = os.path.join(DATA_DIR, "dados_extraidos")
OUTPUT_FILENAME = "meus_embeddings_e5_large.npy"


# --- 3. PROCESSING FUNCTIONS ---
def setup_data():
    """Extract every ``*.zip`` under DATA_DIR into EXTRACT_DIR.

    Returns:
        The directory to scan for data files: EXTRACT_DIR when at least
        one zip archive was found and extracted, otherwise DATA_DIR.
    """
    os.makedirs(EXTRACT_DIR, exist_ok=True)
    zip_files = glob.glob(os.path.join(DATA_DIR, "*.zip"))
    if not zip_files:
        return DATA_DIR
    for zip_path in zip_files:
        with zipfile.ZipFile(zip_path, 'r') as zf:
            zf.extractall(EXTRACT_DIR)
    return EXTRACT_DIR


def xml_to_dict(element):
    """Recursively convert an ElementTree element into nested dicts.

    Repeated child tags are accumulated into a list. A leaf element
    (no children) collapses to its text, which may be ``None``.

    NOTE(review): element attributes, and text on elements that also
    have children, are silently dropped — confirm that is acceptable
    for the XML sources being ingested.
    """
    d = {}
    for child in element:
        child_dict = xml_to_dict(child)
        if child.tag in d:
            # Second occurrence of a tag: promote the value to a list.
            if not isinstance(d[child.tag], list):
                d[child.tag] = [d[child.tag]]
            d[child.tag].append(child_dict)
        else:
            d[child.tag] = child_dict
    if not d:
        return element.text
    return d


def serialize_item_to_text(item_dict):
    """Flatten a (possibly nested) dict into a single ``key: value`` string.

    Nested dicts render as ``key (inner)``; lists render as
    ``key: [a, b, ...]``. Non-dict input is returned via ``str()``.
    """
    if not isinstance(item_dict, dict):
        return str(item_dict)
    parts = []
    for key, value in item_dict.items():
        if isinstance(value, dict):
            parts.append(f"{key} ({serialize_item_to_text(value)})")
        elif isinstance(value, list):
            list_str = ", ".join(serialize_item_to_text(i) for i in value)
            parts.append(f"{key}: [{list_str}]")
        else:
            parts.append(f"{key}: {value}")
    return ", ".join(parts)


# --- 4. MAIN PIPELINE ---
def run_full_process():
    """End-to-end pipeline: unzip, parse JSON/CSV/XML into text documents,
    embed them with multilingual-e5-large, and save the matrix to
    OUTPUT_FILENAME.

    Implemented as a generator so Gradio can stream each yielded progress
    message into the UI textbox.
    """
    yield "Iniciando... Descompactando arquivos..."
    process_dir = setup_data()

    # Some CSV exports have very large cells; raise the per-field limit.
    csv.field_size_limit(10_000_000)

    all_files = (
        glob.glob(os.path.join(process_dir, "**/*.json"), recursive=True)
        + glob.glob(os.path.join(process_dir, "**/*.csv"), recursive=True)
        + glob.glob(os.path.join(process_dir, "**/*.xml"), recursive=True)
    )
    yield f"Encontrados {len(all_files)} arquivos para processar."

    documents = []
    for idx, filepath in enumerate(all_files):
        try:
            yield f"Processando arquivo {idx + 1}/{len(all_files)}: {os.path.basename(filepath)}"
            if filepath.endswith(".json"):
                with open(filepath, "r", encoding="utf-8") as f:
                    data = json.load(f)
                if isinstance(data, list):
                    for item in data:
                        documents.append(serialize_item_to_text(item))
                else:
                    documents.append(serialize_item_to_text(data))
            elif filepath.endswith(".csv"):
                with open(filepath, "r", encoding="utf-8") as f:
                    reader = csv.DictReader(f)
                    for row in reader:
                        documents.append(serialize_item_to_text(row))
            elif filepath.endswith(".xml"):
                tree = ET.parse(filepath)
                root = tree.getroot()
                xml_dict = {root.tag: xml_to_dict(root)}
                documents.append(serialize_item_to_text(xml_dict))
        except Exception as e:
            # Best-effort ingestion: report the failure and keep going
            # with the remaining files instead of aborting the pipeline.
            yield f"Erro ao processar {os.path.basename(filepath)}: {e}"

    yield f"Processamento concluido! {len(documents)} documentos criados."
    if not documents:
        yield "Nenhum documento encontrado. Processo encerrado."
        return

    # --- STAGE 2: EMBEDDING GENERATION ---
    yield "Carregando modelo intfloat/multilingual-e5-large..."
    cache_path = "./model_cache"
    os.makedirs(cache_path, exist_ok=True)
    model = SentenceTransformer(
        "intfloat/multilingual-e5-large", cache_folder=cache_path
    )

    yield f"Gerando embeddings para {len(documents)} documentos..."
    batch_size = 32
    total_batches = (len(documents) - 1) // batch_size + 1
    all_embeddings = []
    for i in range(0, len(documents), batch_size):
        # FIX: E5-family models require a "passage: " prefix on texts
        # embedded for indexing ("query: " on search queries); omitting
        # it measurably degrades retrieval quality per the model card.
        batch = ["passage: " + doc for doc in documents[i : i + batch_size]]
        embeddings = model.encode(batch, show_progress_bar=False)
        all_embeddings.append(embeddings)
        yield f"Batch {i // batch_size + 1}/{total_batches} concluido."

    final_embeddings = np.vstack(all_embeddings)
    np.save(OUTPUT_FILENAME, final_embeddings)
    yield f"Embeddings salvos em {OUTPUT_FILENAME}! Shape: {final_embeddings.shape}"
    yield f"Processo completo! {final_embeddings.shape[0]} embeddings de dimensao {final_embeddings.shape[1]}."
# --- 5. GRADIO INTERFACE ---
with gr.Blocks(title="Prometheus Embedding Generator") as demo:
    gr.Markdown("# Prometheus Embedding Generator")
    gr.Markdown("Gera embeddings a partir dos dados do repositorio usando multilingual-e5-large.")
    run_btn = gr.Button("Iniciar Processamento", variant="primary")
    output = gr.Textbox(label="Progresso", lines=15, interactive=False)
    # run_full_process is a generator, so Gradio streams each yielded
    # progress message into the textbox as it is produced.
    run_btn.click(fn=run_full_process, outputs=output)

# FIX: guard the launch so importing this module (e.g. for its parsing
# helpers or in tests) does not start a web server as a side effect.
if __name__ == "__main__":
    demo.launch()