# fix: complete truncated app.py - add embedding generation + gradio UI (commit 2df7e36)
# --- 1. IMPORTS ---
import os
import glob
import json
import csv
import numpy as np
from sentence_transformers import SentenceTransformer
import zipfile
import xml.etree.ElementTree as ET
import gradio as gr
import shutil
# --- 2. CONFIGURATION ---
DATA_DIR = "dados"  # input folder scanned for .zip/.json/.csv/.xml files
EXTRACT_DIR = os.path.join(DATA_DIR, "dados_extraidos")  # where zip archives get unpacked
OUTPUT_FILENAME = "meus_embeddings_e5_large.npy"  # output file for the embedding matrix
# --- 3. FUNÇÕES DE PROCESSAMENTO ---
def setup_data():
    """Unpack any zip archives in DATA_DIR and return the directory to scan.

    Returns:
        str: EXTRACT_DIR when at least one archive was extracted,
        otherwise DATA_DIR itself (files are processed in place).
    """
    os.makedirs(EXTRACT_DIR, exist_ok=True)
    archives = glob.glob(os.path.join(DATA_DIR, "*.zip"))
    if not archives:
        # Nothing to unpack — process the raw data folder directly.
        return DATA_DIR
    for archive_path in archives:
        with zipfile.ZipFile(archive_path, "r") as bundle:
            bundle.extractall(EXTRACT_DIR)
    return EXTRACT_DIR
def xml_to_dict(element):
    """Recursively convert an ElementTree element into nested dicts.

    Leaf elements (no children) collapse to their ``.text`` value;
    repeated child tags are gathered into a list. Element attributes
    are not captured.
    """
    result = {}
    for child in element:
        converted = xml_to_dict(child)
        if child.tag not in result:
            result[child.tag] = converted
        else:
            existing = result[child.tag]
            if isinstance(existing, list):
                existing.append(converted)
            else:
                # Second occurrence of this tag: promote to a list.
                result[child.tag] = [existing, converted]
    # No children at all -> this is a leaf; return its text content.
    return result if result else element.text
def serialize_item_to_text(item_dict):
    """Flatten a parsed record (dict / list / scalar) into one text line.

    Nested dicts render as ``key (inner)``, lists as ``key: [a, b]``,
    and scalars as ``key: value``; everything is joined with ", ".
    Non-dict input is simply stringified.
    """
    if not isinstance(item_dict, dict):
        return str(item_dict)
    pieces = []
    for key, value in item_dict.items():
        if isinstance(value, dict):
            pieces.append(f"{key} ({serialize_item_to_text(value)})")
        elif isinstance(value, list):
            inner = ", ".join(serialize_item_to_text(entry) for entry in value)
            pieces.append(f"{key}: [{inner}]")
        else:
            pieces.append(f"{key}: {value}")
    return ", ".join(pieces)
# --- 4. PIPELINE PRINCIPAL ---
def run_full_process():
    """Run the full data -> embeddings pipeline, streaming progress text.

    Generator used as a Gradio event handler: each yielded string
    replaces the progress textbox content. Steps: unzip archives,
    parse every json/csv/xml file into flat text documents, encode
    them with intfloat/multilingual-e5-large, and save the stacked
    matrix to OUTPUT_FILENAME as a .npy file.
    """
    yield "Iniciando... Descompactando arquivos..."
    process_dir = setup_data()
    # Raise the CSV field limit — some source files carry very large cells
    # that would otherwise make csv raise "field larger than field limit".
    csv.field_size_limit(10_000_000)
    all_files = (
        glob.glob(os.path.join(process_dir, "**/*.json"), recursive=True)
        + glob.glob(os.path.join(process_dir, "**/*.csv"), recursive=True)
        + glob.glob(os.path.join(process_dir, "**/*.xml"), recursive=True)
    )
    yield f"Encontrados {len(all_files)} arquivos para processar."
    documents = []
    for idx, filepath in enumerate(all_files):
        try:
            yield f"Processando arquivo {idx + 1}/{len(all_files)}: {os.path.basename(filepath)}"
            if filepath.endswith(".json"):
                # A JSON file may hold one record or a list of records.
                with open(filepath, "r", encoding="utf-8") as f:
                    data = json.load(f)
                if isinstance(data, list):
                    for item in data:
                        documents.append(serialize_item_to_text(item))
                else:
                    documents.append(serialize_item_to_text(data))
            elif filepath.endswith(".csv"):
                # One document per CSV row (header row supplies the keys).
                with open(filepath, "r", encoding="utf-8") as f:
                    reader = csv.DictReader(f)
                    for row in reader:
                        documents.append(serialize_item_to_text(row))
            elif filepath.endswith(".xml"):
                # The whole XML tree becomes a single document.
                tree = ET.parse(filepath)
                root = tree.getroot()
                xml_dict = {root.tag: xml_to_dict(root)}
                documents.append(serialize_item_to_text(xml_dict))
        except Exception as e:
            # Best-effort pipeline: report and skip unparseable files
            # instead of aborting the whole run.
            yield f"Erro ao processar {os.path.basename(filepath)}: {e}"
    yield f"Processamento concluido! {len(documents)} documentos criados."
    if not documents:
        yield "Nenhum documento encontrado. Processo encerrado."
        return
    # --- STEP 2: EMBEDDING GENERATION ---
    yield "Carregando modelo intfloat/multilingual-e5-large..."
    # Local cache folder so the model is downloaded only once per machine.
    cache_path = "./model_cache"
    os.makedirs(cache_path, exist_ok=True)
    model = SentenceTransformer(
        "intfloat/multilingual-e5-large", cache_folder=cache_path
    )
    yield f"Gerando embeddings para {len(documents)} documentos..."
    # Encode in fixed-size batches so progress can be streamed to the UI.
    batch_size = 32
    all_embeddings = []
    for i in range(0, len(documents), batch_size):
        batch = documents[i : i + batch_size]
        embeddings = model.encode(batch, show_progress_bar=False)
        all_embeddings.append(embeddings)
        yield f"Batch {i // batch_size + 1}/{(len(documents) - 1) // batch_size + 1} concluido."
    # Stack per-batch arrays into one (n_docs, dim) matrix and persist it.
    final_embeddings = np.vstack(all_embeddings)
    np.save(OUTPUT_FILENAME, final_embeddings)
    yield f"Embeddings salvos em {OUTPUT_FILENAME}! Shape: {final_embeddings.shape}"
    yield f"Processo completo! {final_embeddings.shape[0]} embeddings de dimensao {final_embeddings.shape[1]}."
# --- 5. INTERFACE GRADIO ---
# Single-button UI: clicking streams run_full_process's yielded progress
# messages into the textbox (Gradio streams generator handlers natively).
with gr.Blocks(title="Prometheus Embedding Generator") as demo:
    gr.Markdown("# Prometheus Embedding Generator")
    gr.Markdown("Gera embeddings a partir dos dados do repositorio usando multilingual-e5-large.")
    run_btn = gr.Button("Iniciar Processamento", variant="primary")
    output = gr.Textbox(label="Progresso", lines=15, interactive=False)
    # run_full_process is a generator, so each yield updates `output` live.
    run_btn.click(fn=run_full_process, outputs=output)
demo.launch()