import streamlit.components.v1 as components
import streamlit as st
import os
import time
import tempfile
import pandas as pd
from pymongo import MongoClient
from neo4j import GraphDatabase
from pyvis.network import Network
from dotenv import load_dotenv
import warnings
import logging
import requests

# --- PROJECT-SPECIFIC IMPORTS ---
from src.ingestion.semantic_splitter import ActivaSemanticSplitter
from src.extraction.extractor import NeuroSymbolicExtractor
from src.validation.validator import SemanticValidator
from src.graph.graph_loader import KnowledgeGraphPersister
from src.graph.entity_resolver import EntityResolver

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)

# --- PAGE CONFIGURATION ---
load_dotenv()
st.set_page_config(
    page_title="Activa Semantic Discovery",
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="🧠"
)


def local_css(file_name):
    """Inject a local stylesheet into the page, if the file exists."""
    if os.path.exists(file_name):
        with open(file_name, "r") as f:
            st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)


local_css("assets/style.css")

# --- SESSION STATE MANAGEMENT ---
if 'groq_valid' not in st.session_state:
    st.session_state.groq_valid = False
if 'pipeline_stage' not in st.session_state:
    st.session_state.pipeline_stage = 0
if 'document_text' not in st.session_state:
    st.session_state.document_text = ""
if 'chunks' not in st.session_state:
    st.session_state.chunks = []
if 'extraction_data' not in st.session_state:
    st.session_state.extraction_data = {"entities": [], "triples": []}
if 'graph_html' not in st.session_state:
    st.session_state.graph_html = None


def reset_pipeline():
    st.session_state.pipeline_stage = 0
    st.session_state.document_text = ""
    st.session_state.chunks = []
    st.session_state.extraction_data = {"entities": [], "triples": []}


# --- RESOURCE CACHING ---
@st.cache_resource
def get_splitter():
    return ActivaSemanticSplitter(model_name="all-MiniLM-L6-v2")


@st.cache_resource
def get_extractor():
    return NeuroSymbolicExtractor(index_path="ontology/domain_index.json")


@st.cache_resource(show_spinner="🧩 Initializing Entity Resolver...")
def get_resolver():
    return EntityResolver(neo4j_driver=None, similarity_threshold=0.85)


@st.cache_resource
def get_validator():
    return SemanticValidator(
        ontology_dir="ontology",
        shapes_file="ontology/shapes/auto_constraints.ttl"
    )


COLOR_PALETTE = {
    "arco_CulturalProperty": "#FF5733",   # Orange
    "core_Agent": "#33FF57",              # Green
    "l0_Location": "#3357FF",             # Blue
    "l0_Object": "#F333FF",               # Purple
    "core_EventOrSituation": "#FFD433",   # Yellow
    "clv_City": "#33FFF3",                # Turquoise
    "DEFAULT": "#97C2FC"                  # Default blue
}


def get_node_color(labels):
    """Map a node's most specific label to its palette color."""
    specific_labels = [l for l in labels if l != 'Resource']
    if not specific_labels:
        return COLOR_PALETTE["DEFAULT"]
    label = specific_labels[0]
    return COLOR_PALETTE.get(label, COLOR_PALETTE["DEFAULT"])


def validate_groq_key(api_key):
    """Perform a lightweight ping against the Groq API to check that the key is valid."""
    if not api_key:
        return False
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    try:
        response = requests.get("https://api.groq.com/openai/v1/models", headers=headers, timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False


# Pre-load the models into memory
_ = get_splitter()
_ = get_extractor()
_ = get_validator()


# --- NEO4J HELPERS ---
def get_driver(uri, user, password):
    if not uri or not password:
        return None
    try:
        driver = GraphDatabase.driver(uri, auth=(user, password))
        driver.verify_connectivity()
        return driver
    except Exception:
        return None
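# NOTE: get_driver() above is intentionally re-run on each rerun so that credentials
# edited in the sidebar take effect immediately. If reconnecting on every rerun ever
# becomes a bottleneck, one possible alternative (a sketch, not part of the original
# design) is to let Streamlit cache the driver keyed on its arguments:
#
#   @st.cache_resource
#   def get_cached_driver(uri, user, password):  # hypothetical helper name
#       cached = GraphDatabase.driver(uri, auth=(user, password))
#       cached.verify_connectivity()
#       return cached
#
# st.cache_resource keys the cached object on the argument values, so changing any
# credential in the sidebar would still trigger a fresh connection.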
def run_query(driver, query, params=None):
    if driver is None:
        return []
    with driver.session() as session:
        result = session.run(query, params)
        return [r.data() for r in result]


# --- UI: SIDEBAR ---
st.sidebar.title("⚙️ Configuration")
env_uri = os.getenv("NEO4J_URI", "")
env_user = os.getenv("NEO4J_USER", "neo4j")
env_password = os.getenv("NEO4J_PASSWORD", "")
env_groq_key = os.getenv("GROQ_API_KEY", "")  # validated below before being trusted

st.sidebar.subheader("AI Backend (TDDT)")
if env_groq_key and not st.session_state.groq_valid:
    if validate_groq_key(env_groq_key):
        st.session_state.groq_valid = True
    else:
        # Clear a stale or invalid key so downstream modules do not pick it up.
        os.environ["GROQ_API_KEY"] = ""
        env_groq_key = ""

if st.session_state.groq_valid:
    st.sidebar.success("✅ Groq API Key: valid and active")
else:
    groq_key_input = st.sidebar.text_input("Enter GROQ_API_KEY", type="password")
    if st.sidebar.button("Verify Key"):
        with st.spinner("Verifying..."):
            if validate_groq_key(groq_key_input):
                os.environ["GROQ_API_KEY"] = groq_key_input
                st.session_state.groq_valid = True
                st.sidebar.success("✅ Key is valid!")
                time.sleep(1)
                st.rerun()
            else:
                st.sidebar.error("❌ Invalid or unauthorized key.")

st.sidebar.subheader("Knowledge Graph")
uri = st.sidebar.text_input("Neo4j URI", value=env_uri)
user = st.sidebar.text_input("Neo4j User", value=env_user)
pwd_placeholder = "✅ Configured (leave blank)" if env_password else "Enter Neo4j password"
password_input = st.sidebar.text_input("Neo4j Password", type="password", placeholder=pwd_placeholder)
password = password_input if password_input else env_password

driver = None
if uri and password:
    driver = get_driver(uri, user, password)
    if driver:
        st.sidebar.success("🟢 Connected to Neo4j")
        os.environ["NEO4J_URI"] = uri
        os.environ["NEO4J_USER"] = user
        os.environ["NEO4J_PASSWORD"] = password
    else:
        st.sidebar.error("🔴 Neo4j connection error")

st.sidebar.divider()
if st.sidebar.button("🔄 Reset Pipeline", on_click=reset_pipeline):
    st.sidebar.info("State has been reset.")

# --- MAIN HEADER ---
st.title("🧠 Automated Semantic Discovery Prototype")
st.markdown("**Type-Driven Domain Traversal (TDDT) & OWL RL Validation**")

tab_gen, tab_val, tab_vis = st.tabs([
    "⚙️ 1. Generative Pipeline",
    "🔍 2. Data & DLQ",
    "🕸️ 3. Graph Exploration"
])

# ==============================================================================
# TAB 1: GENERATIVE PIPELINE (STEPPER UI)
# ==============================================================================
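# The tab below is a simple "stepper": st.session_state.pipeline_stage advances from
# 0 to 3, one step per completed phase, and each phase renders its action button only
# after the previous stage has finished. A minimal sketch of the gating idiom used
# throughout (REQUIRED_STAGE and render_phase() are illustrative names, not part of
# this file):
#
#   if st.session_state.pipeline_stage >= REQUIRED_STAGE:
#       render_phase()                       # unlocked: show results / next action
#   else:
#       st.markdown("🔒 Complete the previous step first.")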
Ingestion Documentale") st.info("Inserisci il testo da analizzare nel campo sottostante.") with st.form("ingestion_form"): input_text = st.text_area("Testo del documento:", value=st.session_state.document_text, height=200) submitted = st.form_submit_button("Salva Testo e Prepara Pipeline") if submitted: if input_text != st.session_state.document_text and input_text.strip() != "": st.session_state.document_text = input_text st.session_state.pipeline_stage = 0 st.rerun() st.markdown("---") progress_val = int((st.session_state.pipeline_stage / 3) * 100) st.progress(progress_val, text=f"Progresso Pipeline: {progress_val}%") # ========================== # FASE 1: CHUNKING # ========================== with st.container(): st.markdown(f"### {'✅' if st.session_state.pipeline_stage >= 1 else '1️⃣'} Fase 1: Semantic Chunking") with st.expander("ℹ️ Cosa fa questa fase?"): st.write("Segmenta il testo in frammenti coerenti analizzando la similarità semantica vettoriale tra le frasi.") is_groq_ready = bool(env_groq_key) if st.session_state.pipeline_stage >= 1: chunks = st.session_state.chunks st.success(f"Chunking completato! Generati {len(chunks)} frammenti semantici.") with st.expander("Vedi dettagli frammenti"): st.json(chunks) else: if st.button("Avvia Semantic Splitter", type="primary", disabled=not is_groq_ready): with st.spinner("Creazione chunks in corso..."): try: splitter = get_splitter() chunks, _, _ = splitter.create_chunks(input_text, percentile_threshold=90) st.session_state.chunks = chunks st.session_state.pipeline_stage = 1 st.rerun() except Exception as e: st.error(f"Errore durante il chunking: {e}") st.markdown("⬇️") # ========================== # FASE 2: EXTRACTION (TDDT) # ========================== is_step_b_unlocked = st.session_state.pipeline_stage >= 1 with st.container(): color = "white" if is_step_b_unlocked else "gray" icon = "✅" if st.session_state.pipeline_stage >= 2 else ("2️⃣" if is_step_b_unlocked else "🔒") st.markdown(f"