import logging
import os
import tempfile
import time
import warnings

import pandas as pd
import requests
import streamlit as st
import streamlit.components.v1 as components
from dotenv import load_dotenv
from neo4j import GraphDatabase
from pymongo import MongoClient
from pyvis.network import Network

# --- Project-specific modules ---
from src.ingestion.semantic_splitter import ActivaSemanticSplitter
from src.extraction.extractor import NeuroSymbolicExtractor
from src.validation.validator import SemanticValidator
from src.graph.graph_loader import KnowledgeGraphPersister
from src.graph.entity_resolver import EntityResolver

# Silence noisy third-party warnings so they do not clutter the app logs.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)

# --- Page configuration ---
load_dotenv()

st.set_page_config(
    page_title="Activa Semantic Discovery",
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="🧠",
)


def local_css(file_name):
    """Inject the stylesheet at ``file_name`` into the page, if the file exists.

    The file content is wrapped in a ``<style>`` tag and rendered through
    ``st.markdown`` with ``unsafe_allow_html=True``. A missing file is
    silently ignored so the app still runs without the optional stylesheet.
    """
    if not os.path.exists(file_name):
        return
    with open(file_name, "r", encoding="utf-8") as f:
        # The CSS must actually be read and wrapped in <style>; rendering an
        # empty string (as before) applied no styling at all.
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)


local_css("assets/style.css")


# --- Session state management ---
def _init_session_state():
    """Create every session-state key the app relies on, if not yet present."""
    # Built inside the function so each session gets its own fresh
    # list/dict objects rather than sharing module-level ones.
    defaults = {
        "groq_valid": False,
        "pipeline_stage": 0,
        "document_text": "",
        "chunks": [],
        "extraction_data": {"entities": [], "triples": []},
        "graph_html": None,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value


_init_session_state()


def reset_pipeline():
    """Reset the pipeline to stage 0 and clear the document, chunks and extraction data.

    ``groq_valid`` and ``graph_html`` are left untouched.
    """
    st.session_state.pipeline_stage = 0
    st.session_state.document_text = ""
    st.session_state.chunks = []
    st.session_state.extraction_data = {"entities": [], "triples": []}


# --- Cached resources ---
@st.cache_resource
def get_splitter():
    """Return the cached ``ActivaSemanticSplitter`` built on ``all-MiniLM-L6-v2``."""
    return ActivaSemanticSplitter(model_name="all-MiniLM-L6-v2")
@st.cache_resource
def get_extractor():
    """Return the cached ``NeuroSymbolicExtractor`` loaded from the domain index."""
    return NeuroSymbolicExtractor(index_path="ontology/domain_index.json")


@st.cache_resource(show_spinner="🧩 Inizializzazione Entity Resolver...")
def get_resolver():
    """Return the cached ``EntityResolver``.

    It is created without a Neo4j driver; callers assign ``resolver.driver``
    before resolving entities.
    """
    return EntityResolver(neo4j_driver=None, similarity_threshold=0.85)


@st.cache_resource
def get_validator():
    """Return the cached ``SemanticValidator`` for the ontology and SHACL shapes."""
    return SemanticValidator(
        ontology_dir="ontology",
        shapes_file="ontology/shapes/auto_constraints.ttl",
    )


# Node colours for the graph view, keyed by Neo4j label.
COLOR_PALETTE = {
    "arco_CulturalProperty": "#FF5733",   # Orange
    "core_Agent": "#33FF57",              # Green
    "l0_Location": "#3357FF",             # Blue
    "l0_Object": "#F333FF",               # Purple
    "core_EventOrSituation": "#FFD433",   # Yellow
    "clv_City": "#33FFF3",                # Turquoise
    "DEFAULT": "#97C2FC",                 # Standard blue
}


def get_node_color(labels):
    """Return the palette colour for a node given its list of labels.

    The generic ``'Resource'`` label is ignored; the first remaining label
    selects the colour. Unknown, empty or missing label lists map to the
    ``DEFAULT`` colour.
    """
    specific_labels = [label for label in (labels or []) if label != 'Resource']
    if not specific_labels:
        return COLOR_PALETTE["DEFAULT"]
    return COLOR_PALETTE.get(specific_labels[0], COLOR_PALETTE["DEFAULT"])


def validate_groq_key(api_key):
    """Lightly ping the Groq API to check whether ``api_key`` is valid.

    Returns ``True`` only when the models endpoint answers with HTTP 200.
    An empty key or any network/HTTP-level failure yields ``False``.
    """
    if not api_key:
        return False
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    try:
        response = requests.get(
            "https://api.groq.com/openai/v1/models",
            headers=headers,
            timeout=5,
        )
    except requests.RequestException:
        # Timeouts, DNS failures, connection errors: treat as "not valid".
        return False
    return response.status_code == 200


# Pre-load the models into memory
_ = get_splitter()
_ = get_extractor()
_ = get_validator()


# --- Neo4j helpers ---
def get_driver(uri, user, password):
    """Open a Neo4j driver and verify connectivity.

    Returns the connected driver, or ``None`` when ``uri``/``password`` are
    missing or the connection cannot be established.
    """
    if not uri or not password:
        return None
    driver = None
    try:
        driver = GraphDatabase.driver(uri, auth=(user, password))
        driver.verify_connectivity()
    except Exception:
        # Best-effort connect: the caller only needs to know it failed.
        # Close a half-open driver so its resources are not leaked.
        if driver is not None:
            driver.close()
        return None
    return driver


def run_query(driver, query, params=None):
    """Run a Cypher ``query`` and return its records as a list of dicts.

    Returns an empty list when ``driver`` is ``None``.
    """
    if driver is None:
        return []
    with driver.session() as session:
        result = session.run(query, params)
        return [record.data() for record in result]


# --- UI: SIDEBAR ---
st.sidebar.title("⚙️ Configurazione")

env_uri = os.getenv("NEO4J_URI", "")
env_user = os.getenv("NEO4J_USER", "neo4j")
env_password = os.getenv("NEO4J_PASSWORD", "")
# NOTE(review): hard-coded to an empty string, so a key from the environment
# is never auto-validated and the branch right below is currently unreachable.
# Confirm whether this was meant to be os.getenv("GROQ_API_KEY", "").
env_groq_key = ""

st.sidebar.subheader("Backend AI (TDDT)")

if env_groq_key and not st.session_state.groq_valid:
    if validate_groq_key(env_groq_key):
        st.session_state.groq_valid = True
    else:
        os.environ["GROQ_API_KEY"] = ""
        env_groq_key = ""

if st.session_state.groq_valid:
    st.sidebar.success("✅ Groq API Key: Valida e Attiva")
else:
    groq_key_input = st.sidebar.text_input("Inserisci GROQ_API_KEY", type="password")
    if st.sidebar.button("Verifica Chiave"):
        with st.spinner("Verifica in corso..."):
            if validate_groq_key(groq_key_input):
                os.environ["GROQ_API_KEY"] = groq_key_input
                st.session_state.groq_valid = True
                st.sidebar.success("✅ Chiave valida!")
                # Leave the success message visible briefly before rerunning.
                time.sleep(1)
                st.rerun()
            else:
                st.sidebar.error("❌ Chiave non valida o non autorizzata.")

st.sidebar.subheader("Knowledge Graph")
uri = st.sidebar.text_input("URI Neo4j", value=env_uri)
user = st.sidebar.text_input("User Neo4j", value=env_user)

pwd_placeholder = "✅ Configurato (Lascia vuoto)" if env_password else "Inserisci Password Neo4j"
password_input = st.sidebar.text_input("Password Neo4j", type="password", placeholder=pwd_placeholder)
# An empty field means "use the password from the environment".
password = password_input if password_input else env_password

driver = None
if uri and password:
    driver = get_driver(uri, user, password)
    if driver:
        st.sidebar.success("🟢 Connesso a Neo4j")
        # Export the working credentials through the process environment.
        os.environ["NEO4J_URI"] = uri
        os.environ["NEO4J_USER"] = user
        os.environ["NEO4J_PASSWORD"] = password
    else:
        st.sidebar.error("🔴 Errore connessione Neo4j")

st.sidebar.divider()
if st.sidebar.button("🔄 Reset Pipeline", on_click=reset_pipeline):
    st.sidebar.info("Stato resettato.")

# --- MAIN HEADER ---
st.title("🧠 Automated Semantic Discovery Prototype")
st.markdown("**Type-Driven Domain Traversal (TDDT) & OWL RL Validation**")

tab_gen, tab_val, tab_vis = st.tabs([
    "⚙️ 1. Pipeline Generativa",
    "🔍 2. Dati e DLQ",
    "🕸️ 3. Esplorazione Grafo",
])

# ==============================================================================
# TAB 1: GENERATIVE PIPELINE (STEPPER UI)
# ==============================================================================
with tab_gen:
    st.subheader("1. Ingestion Documentale")
    st.info("Inserisci il testo da analizzare nel campo sottostante.")

    with st.form("ingestion_form"):
        input_text = st.text_area("Testo del documento:", value=st.session_state.document_text, height=200)
        submitted = st.form_submit_button("Salva Testo e Prepara Pipeline")

    if submitted:
        # Only a new, non-blank text restarts the pipeline.
        if input_text != st.session_state.document_text and input_text.strip() != "":
            st.session_state.document_text = input_text
            st.session_state.pipeline_stage = 0
            st.rerun()

    st.markdown("---")

    progress_val = int((st.session_state.pipeline_stage / 3) * 100)
    st.progress(progress_val, text=f"Progresso Pipeline: {progress_val}%")

    # ==========================
    # PHASE 1: CHUNKING
    # ==========================
    with st.container():
        st.markdown(f"### {'✅' if st.session_state.pipeline_stage >= 1 else '1️⃣'} Fase 1: Semantic Chunking")
        with st.expander("ℹ️ Cosa fa questa fase?"):
            st.write("Segmenta il testo in frammenti coerenti analizzando la similarità semantica vettoriale tra le frasi.")

        # A key validated through the sidebar must unlock the pipeline too;
        # checking only env_groq_key left this button disabled forever.
        is_groq_ready = bool(env_groq_key) or st.session_state.groq_valid
        # Chunk the saved document, not the raw (possibly blank) form value.
        document_text = st.session_state.document_text
        can_chunk = is_groq_ready and bool(document_text.strip())

        if st.session_state.pipeline_stage >= 1:
            chunks = st.session_state.chunks
            st.success(f"Chunking completato! Generati {len(chunks)} frammenti semantici.")
            with st.expander("Vedi dettagli frammenti"):
                st.json(chunks)
        else:
            if st.button("Avvia Semantic Splitter", type="primary", disabled=not can_chunk):
                with st.spinner("Creazione chunks in corso..."):
                    try:
                        splitter = get_splitter()
                        chunks, _, _ = splitter.create_chunks(document_text, percentile_threshold=90)
                        st.session_state.chunks = chunks
                        st.session_state.pipeline_stage = 1
                        st.rerun()
                    except Exception as e:
                        st.error(f"Errore durante il chunking: {e}")

    st.markdown("⬇️")

    # ==========================
    # PHASE 2: EXTRACTION (TDDT)
    # ==========================
    is_step_b_unlocked = st.session_state.pipeline_stage >= 1
    with st.container():
        # Locked phases are rendered in gray.
        color = "white" if is_step_b_unlocked else "gray"
        icon = "✅" if st.session_state.pipeline_stage >= 2 else ("2️⃣" if is_step_b_unlocked else "🔒")
        st.markdown(f"<h3 style='color: {color};'>{icon} Fase 2: TDDT Extraction</h3>", unsafe_allow_html=True)

        with st.expander("ℹ️ Cosa fa questa fase?"):
            st.write("Esegue l'estrazione gerarchica in due passaggi: prima classifica le entità usando le root dell'ontologia, poi estrae le relazioni passando all'LLM solo le proprietà ammesse (Domain Index).")

        if not is_step_b_unlocked:
            st.caption("Completa la Fase 1 per sbloccare l'estrazione.")
        elif st.session_state.pipeline_stage >= 2:
            data = st.session_state.extraction_data
            st.success(f"Estrazione TDDT completata! Identificate {len(data['entities'])} entità e {len(data['triples'])} triple.")
            with st.expander("Vedi dati estratti (Pre-Validazione)"):
                st.write("Entità Inferite:", data['entities'])
                if data['triples']:
                    st.dataframe(pd.DataFrame([t.model_dump() for t in data['triples']]), hide_index=True)
        else:
            is_extraction_ready = st.session_state.groq_valid
            if st.button("Avvia Estrazione TDDT", type="primary", disabled=not is_extraction_ready):
                if not st.session_state.groq_valid:
                    st.warning("⚠️ Per avviare l'estrazione devi prima inserire e verificare una GROQ_API_KEY valida nella sidebar.")
                else:
                    with st.spinner("Classificazione ed estrazione gerarchica in corso..."):
                        try:
                            chunks = st.session_state.chunks
                            extractor = get_extractor()

                            all_triples = []
                            prog_bar = st.progress(0)
                            last_index = len(chunks) - 1

                            for i, chunk in enumerate(chunks):
                                chunk_id = f"st_req_chunk_{i+1}"
                                res = extractor.extract(chunk, source_id=chunk_id)
                                if res and res.triples:
                                    all_triples.extend(res.triples)

                                prog_bar.progress((i + 1) / len(chunks))

                                # Pace requests between chunks (not after the
                                # last one) to stay under the Groq rate limit.
                                if i < last_index:
                                    print("⏳ Pacing per Groq API: attesa 20s per non sforare i 30K TPM...")
                                    time.sleep(20)

                            # Collect the unique entities from the triples for the
                            # resolver; dict.fromkeys keeps first-seen order.
                            unique_entities = list(dict.fromkeys(
                                [t.subject for t in all_triples] + [t.object for t in all_triples]
                            ))

                            st.session_state.extraction_data = {"entities": unique_entities, "triples": all_triples}
                            st.session_state.pipeline_stage = 2
                            st.rerun()
                        except Exception as e:
                            st.error(f"Errore: {e}")

    st.markdown("⬇️")

    # ==========================
    # PHASE 3: RESOLUTION & VALIDATION (BLOCKING)
    # ==========================
    is_step_c_unlocked = st.session_state.pipeline_stage >= 2
    with st.container():
        color = "white" if is_step_c_unlocked else "gray"
        icon = "✅" if st.session_state.pipeline_stage >= 3 else ("3️⃣" if is_step_c_unlocked else "🔒")
        st.markdown(f"<h3 style='color: {color};'>{icon} Fase 3: Resolution & SHACL Blocking</h3>", unsafe_allow_html=True)

        with st.expander("ℹ️ Cosa fa questa fase?"):
            st.write("Risolve le entità (Entity Linking) e applica la validazione OWL RL. Le triple non conformi vengono scartate e salvate nella Dead Letter Queue (MongoDB), mentre quelle valide popolano Neo4j.")

        if not is_step_c_unlocked:
            st.caption("Completa la Fase 2 per procedere.")
        elif st.session_state.pipeline_stage >= 3:
            st.success("Grafo Aggiornato! Le triple conformi sono su Neo4j, gli scarti su Mongo (se configurato).")
        else:
            if not driver:
                st.error("⚠️ Connettiti a Neo4j (nella sidebar) per procedere.")
            else:
                if st.button("Valida e Scrivi su Grafo", type="primary"):
                    with st.spinner("Risoluzione, validazione logica e persistenza..."):
                        try:
                            raw_data = st.session_state.extraction_data
                            all_entities = raw_data.get("entities", [])
                            all_triples = raw_data.get("triples", [])

                            # Reuse the sidebar's connected driver for persistence.
                            persister = KnowledgeGraphPersister()
                            persister.driver = driver
                            persister._create_constraints()

                            resolver = get_resolver()
                            resolver.driver = driver
                            all_entities, resolved_triples, entities_to_save = resolver.resolve_entities(all_entities, all_triples)

                            validator = get_validator()
                            valid_triples, invalid_triples, report = validator.filter_valid_triples(entities_to_save, resolved_triples)

                            if invalid_triples:
                                st.warning(f"Rilevate {len(invalid_triples)} violazioni ontologiche. Scartate dalla persistenza.")

                                # Store rejected triples in the DLQ (MongoDB).
                                # NOTE(review): "MONGO_UR" looks like a typo for
                                # "MONGO_URI" — confirm against the .env file.
                                mongo_ur = os.getenv("MONGO_UR")
                                mongo_user = os.getenv("MONGO_USER")
                                mongo_pass = os.getenv("MONGO_PASS")

                                if mongo_ur:
                                    client = None
                                    try:
                                        client = MongoClient(mongo_ur, username=mongo_user, password=mongo_pass)
                                        db = client["semantic_discovery"]["rejected_triples"]
                                        docs = []
                                        for rejected in invalid_triples:
                                            # Copy each record so the validator's output is not
                                            # mutated (insert_many also adds an "_id" field).
                                            if hasattr(rejected, "model_dump"):
                                                doc = rejected.model_dump()
                                            else:
                                                doc = dict(rejected)
                                            doc["timestamp"] = time.time()
                                            docs.append(doc)
                                        db.insert_many(docs)
                                        st.info("💾 Scarti archiviati correttamente su MongoDB.")
                                    except Exception as e:
                                        # A DLQ failure must not block persistence of valid triples.
                                        st.error(f"Errore scrittura DLQ: {e}")
                                    finally:
                                        if client is not None:
                                            client.close()

                            # Persist ONLY the valid triples.
                            persister.save_entities_and_triples(entities_to_save, valid_triples)

                            st.session_state.pipeline_stage = 3
                            st.rerun()
                        except Exception as e:
                            st.error(f"Errore critico: {e}")

# ==============================================================================
# TAB 2: VALIDATION AND DLQ (updated for 1.4)
# ==============================================================================
with tab_val:
    st.header("Curation & Feedback Loop")
    if driver:
        stats = run_query(driver, "MATCH (n) RETURN count(n) as nodes, count{()-->()} as rels")
        if stats:
            c1, c2 = st.columns(2)
            c1.metric("Nodi Totali", stats[0]['nodes'])
            c2.metric("Relazioni", stats[0]['rels'])

        cypher_val = """
        MATCH (s)-[r]->(o)
        RETURN elementId(r) as id,
               COALESCE(s.label, head(labels(s))) as Soggetto,
               type(r) as Predicato,
               COALESCE(o.label, head(labels(o))) as Oggetto,
               COALESCE(r.evidence, 'N/A') as Evidenza,
               COALESCE(r.reasoning, 'N/A') as Ragionamento
        """
        triples_data = run_query(driver, cypher_val)

        if triples_data:
            df = pd.DataFrame(triples_data)
            # The relationship id is fetched but not shown to the user.
            st.dataframe(df.drop(columns=["id"]), width='stretch', hide_index=True)
        else:
            st.info("Grafo vuoto o relazioni senza nuovi attributi.")
    else:
        st.warning("Database non connesso.")

# ==============================================================================
# TAB 3: GRAPH EXPLORATION
# ==============================================================================
with tab_vis:
    st.header("Esplorazione Topologica")

    if driver:
        col_ctrl, col_info = st.columns([1, 4])
        with col_ctrl:
            generate_graph = st.button("🔄 Genera / Aggiorna Grafo", type="primary")

        if generate_graph:
            with st.spinner("Estrazione dati e generazione del grafo interattivo..."):
                cypher_vis = """
                MATCH (s:Resource)
                OPTIONAL MATCH (s)-[r]->(o:Resource)
                RETURN s.label AS src, labels(s) AS src_labels,
                       type(r) AS rel,
                       o.label AS dst, labels(o) AS dst_labels
                """
                graph_data = run_query(driver, cypher_vis)

                if graph_data:
                    net = Network(height="800px", width="100%", bgcolor="#222222", font_color="white", notebook=False)

                    for item in graph_data:
                        if item['src']:
                            src_label_text = str(item['src'])
                            src_color = get_node_color(item['src_labels'])
                            net.add_node(src_label_text, label=src_label_text, color=src_color, title=f"Labels: {item['src_labels']}")

                            # OPTIONAL MATCH yields rows without a target for isolated nodes.
                            if item['dst'] and item['rel']:
                                dst_label_text = str(item['dst'])
                                rel_type = str(item['rel'])
                                dst_color = get_node_color(item['dst_labels'])
                                net.add_node(dst_label_text, label=dst_label_text, color=dst_color, title=f"Labels: {item['dst_labels']}")
                                net.add_edge(src_label_text, dst_label_text, title=rel_type)

                    net.force_atlas_2based(
                        gravity=-50,
                        central_gravity=0.01,
                        spring_length=100,
                        spring_strength=0.08,
                        damping=0.4,
                    )
                    net.toggle_physics(True)

                    # Reserve a temp path, close the handle before pyvis writes to it,
                    # and always delete the file afterwards so repeated refreshes do
                    # not pile up temp files.
                    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
                    tmp.close()
                    try:
                        net.save_graph(tmp.name)
                        with open(tmp.name, 'r', encoding='utf-8') as f:
                            raw_html = f.read()
                    finally:
                        os.remove(tmp.name)

                    # NOTE(review): the original add-on markup was lost from this file;
                    # this is a minimal fullscreen toggle appended before </body>.
                    # Replacing the empty string (as before) inserted the add-on
                    # between every character and corrupted the page.
                    # Presumably the host iframe must allow fullscreen — verify.
                    fullscreen_addon = """
<button onclick="document.fullscreenElement ? document.exitFullscreen() : document.documentElement.requestFullscreen()"
        style="position: fixed; top: 10px; right: 10px; z-index: 9999; padding: 6px 10px; cursor: pointer;">⛶</button>
</body>"""
                    st.session_state.graph_html = raw_html.replace("</body>", fullscreen_addon)
                else:
                    st.warning("Il grafo è attualmente vuoto.")
                    st.session_state.graph_html = None

        if st.session_state.graph_html:
            components.html(st.session_state.graph_html, height=800, scrolling=True)
        else:
            st.info("👆 Clicca su 'Genera / Aggiorna Grafo' per visualizzare i dati attuali di Neo4j.")
    else:
        st.warning("Database non connesso. Configura le credenziali nella sidebar.")