| import streamlit.components.v1 as components |
| import streamlit as st |
| import os |
| import time |
| import tempfile |
| import pandas as pd |
| from pymongo import MongoClient |
| from neo4j import GraphDatabase |
| from pyvis.network import Network |
| from dotenv import load_dotenv |
| import warnings |
| import logging |
| import requests |
|
|
| |
| from src.ingestion.semantic_splitter import ActivaSemanticSplitter |
| from src.extraction.extractor import NeuroSymbolicExtractor |
| from src.validation.validator import SemanticValidator |
| from src.graph.graph_loader import KnowledgeGraphPersister |
| from src.graph.entity_resolver import EntityResolver |
|
|
# Silence FutureWarning / DeprecationWarning noise raised while the app runs.
for _ignored_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Only let error-level records from the "transformers" logger through.
logging.getLogger("transformers").setLevel(logging.ERROR)

# Load variables from a .env file (if present) before any os.getenv() lookup,
# then configure the Streamlit page.
load_dotenv()
_PAGE_SETTINGS = {
    "page_title": "Activa Semantic Discovery",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
    "page_icon": "🧠",
}
st.set_page_config(**_PAGE_SETTINGS)
|
|
def local_css(file_name):
    """Inject the contents of a CSS file into the Streamlit page.

    The stylesheet is optional: when ``file_name`` does not point to an
    existing regular file the function does nothing.

    Args:
        file_name: Path of the CSS file to load.
    """
    # isfile (rather than exists) also skips directories, which open() rejects.
    if not os.path.isfile(file_name):
        return
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file_name, "r", encoding="utf-8") as css_file:
        st.markdown(f"<style>{css_file.read()}</style>", unsafe_allow_html=True)
|
|
local_css("assets/style.css")

# Seed the per-session state the first time the script runs; keys that are
# already present keep their current value.
_SESSION_DEFAULTS = {
    'groq_valid': False,
    'pipeline_stage': 0,
    'document_text': "",
    'chunks': [],
    'extraction_data': {"entities": [], "triples": []},
    'graph_html': None,
}
for _state_key, _initial_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
|
|
def reset_pipeline():
    """Put the pipeline back at stage 0 and clear document, chunks and extraction results.

    The Groq validation flag and the cached graph HTML are left untouched.
    """
    state = st.session_state
    state.extraction_data = {"entities": [], "triples": []}
    state.chunks = []
    state.document_text = ""
    state.pipeline_stage = 0
|
|
| |
@st.cache_resource
def get_splitter():
    """Return the semantic splitter, built once and kept in Streamlit's resource cache."""
    splitter = ActivaSemanticSplitter(model_name="all-MiniLM-L6-v2")
    return splitter
|
|
@st.cache_resource
def get_extractor():
    """Return the extractor, built once from the domain index and kept in Streamlit's resource cache."""
    extractor = NeuroSymbolicExtractor(index_path="ontology/domain_index.json")
    return extractor
|
|
@st.cache_resource(show_spinner="🧩 Inizializzazione Entity Resolver...")
def get_resolver():
    """Return the cached entity resolver.

    It is created without a Neo4j driver; the caller assigns one before use.
    """
    resolver = EntityResolver(neo4j_driver=None, similarity_threshold=0.85)
    return resolver
|
|
@st.cache_resource
def get_validator():
    """Return the semantic validator, built once and kept in Streamlit's resource cache."""
    validator_settings = {
        "ontology_dir": "ontology",
        "shapes_file": "ontology/shapes/auto_constraints.ttl",
    }
    return SemanticValidator(**validator_settings)
|
|
# Display colour for each known node label; "DEFAULT" is the fallback colour.
COLOR_PALETTE = {
    "arco_CulturalProperty": "#FF5733",
    "core_Agent": "#33FF57",
    "l0_Location": "#3357FF",
    "l0_Object": "#F333FF",
    "core_EventOrSituation": "#FFD433",
    "clv_City": "#33FFF3",
    "DEFAULT": "#97C2FC"
}


def get_node_color(labels):
    """Return the display colour for a node given its labels.

    The generic 'Resource' label is ignored. The first remaining label that
    has an entry in COLOR_PALETTE decides the colour, so an unmapped label
    appearing first no longer hides a mapped one that follows it.

    Args:
        labels: Iterable of label strings, or None (treated as no labels).

    Returns:
        A hex colour string; the "DEFAULT" colour when no label is mapped.
    """
    for label in labels or ():
        if label != 'Resource' and label in COLOR_PALETTE:
            return COLOR_PALETTE[label]
    return COLOR_PALETTE["DEFAULT"]
|
|
def validate_groq_key(api_key):
    """Check a Groq API key with a lightweight request to the models endpoint.

    Args:
        api_key: The key to verify. An empty or None key is rejected without
            sending any request.

    Returns:
        True only when the endpoint answers with HTTP 200; False otherwise,
        including when the request itself fails.
    """
    if not api_key:
        return False

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    try:
        response = requests.get("https://api.groq.com/openai/v1/models", headers=headers, timeout=5)
    except (requests.RequestException, ValueError):
        # RequestException: connection errors, timeouts, invalid headers.
        # ValueError: e.g. a key that cannot be encoded as an HTTP header.
        # Unlike a bare except, KeyboardInterrupt/SystemExit still propagate.
        return False
    return response.status_code == 200
|
|
| |
# Call the cached factories once at startup and discard the results
# (presumably to pre-load the heavy resources before the first user action).
for _warm_up in (get_splitter, get_extractor, get_validator):
    _ = _warm_up()
|
|
| |
def get_driver(uri, user, password):
    """Open a Neo4j driver and confirm that the server can be reached.

    Args:
        uri: Connection URI of the Neo4j instance.
        user: Neo4j user name.
        password: Neo4j password.

    Returns:
        A connected driver, or None when the URI or password is missing or
        the connection cannot be established.
    """
    if not uri or not password:
        return None
    try:
        driver = GraphDatabase.driver(uri, auth=(user, password))
    except Exception:
        # UI boundary: any construction failure is reported as "no driver".
        return None
    try:
        driver.verify_connectivity()
    except Exception:
        # Release the resources of a driver that cannot reach the server
        # instead of leaving it open.
        driver.close()
        return None
    return driver
|
|
def run_query(driver, query, params=None):
    """Run a Cypher query and return every record as a plain dict.

    Args:
        driver: Neo4j driver, or None.
        query: Cypher statement to execute.
        params: Optional query parameters.

    Returns:
        A list with one dict per record; an empty list when ``driver`` is None.
    """
    if driver is None:
        return []
    records = []
    with driver.session() as session:
        for record in session.run(query, params):
            records.append(record.data())
    return records
|
|
| |
st.sidebar.title("⚙️ Configurazione")

# Defaults taken from the process environment.
env_uri = os.getenv("NEO4J_URI", "")
env_user = os.getenv("NEO4J_USER", "neo4j")
env_password = os.getenv("NEO4J_PASSWORD", "")
# Read the Groq key from the environment. With a hard-coded empty string the
# validation branch below could never run and `env_groq_key` stayed falsy even
# after a key had been verified and stored in os.environ.
env_groq_key = os.getenv("GROQ_API_KEY", "")

st.sidebar.subheader("Backend AI (TDDT)")
# Validate a key found in the environment once per session.
if env_groq_key and not st.session_state.groq_valid:
    if validate_groq_key(env_groq_key):
        st.session_state.groq_valid = True
    else:
        # Drop a key that failed validation so it is not used later.
        os.environ["GROQ_API_KEY"] = ""
        env_groq_key = ""

if st.session_state.groq_valid:
    st.sidebar.success("✅ Groq API Key: Valida e Attiva")
else:
    groq_key_input = st.sidebar.text_input("Inserisci GROQ_API_KEY", type="password")

    if st.sidebar.button("Verifica Chiave"):
        with st.spinner("Verifica in corso..."):
            if validate_groq_key(groq_key_input):
                os.environ["GROQ_API_KEY"] = groq_key_input
                st.session_state.groq_valid = True
                st.sidebar.success("✅ Chiave valida!")
                # Leave the confirmation visible briefly before rerunning.
                time.sleep(1)
                st.rerun()
            else:
                st.sidebar.error("❌ Chiave non valida o non autorizzata.")
|
|
st.sidebar.subheader("Knowledge Graph")
uri = st.sidebar.text_input("URI Neo4j", value=env_uri)
user = st.sidebar.text_input("User Neo4j", value=env_user)

# The password field starts empty; a blank input falls back to the value
# from the environment.
if env_password:
    pwd_placeholder = "✅ Configurato (Lascia vuoto)"
else:
    pwd_placeholder = "Inserisci Password Neo4j"
password_input = st.sidebar.text_input("Password Neo4j", type="password", placeholder=pwd_placeholder)
password = password_input or env_password

# Try to connect only when both a URI and a password are available.
driver = get_driver(uri, user, password) if (uri and password) else None
if uri and password:
    if driver:
        st.sidebar.success("🟢 Connesso a Neo4j")
        # Publish the working credentials through the process environment.
        os.environ.update({
            "NEO4J_URI": uri,
            "NEO4J_USER": user,
            "NEO4J_PASSWORD": password,
        })
    else:
        st.sidebar.error("🔴 Errore connessione Neo4j")

st.sidebar.divider()
# reset_pipeline runs as the click callback, before the script reruns.
reset_clicked = st.sidebar.button("🔄 Reset Pipeline", on_click=reset_pipeline)
if reset_clicked:
    st.sidebar.info("Stato resettato.")
|
|
| |
st.title("🧠 Automated Semantic Discovery Prototype")
st.markdown("**Type-Driven Domain Traversal (TDDT) & OWL RL Validation**")

# The three top-level tabs of the app, in display order.
_TAB_TITLES = [
    "⚙️ 1. Pipeline Generativa",
    "🔍 2. Dati e DLQ",
    "🕸️ 3. Esplorazione Grafo",
]
tab_gen, tab_val, tab_vis = st.tabs(_TAB_TITLES)
|
|
| |
| |
| |
def _archive_rejected_triples(rejected):
    """Store rejected triples in the MongoDB dead-letter collection, if configured.

    Connection settings come from the MONGO_UR, MONGO_USER and MONGO_PASS
    environment variables. Nothing happens when MONGO_UR is unset or there is
    nothing to store. Failures are shown in the UI instead of being raised.

    Args:
        rejected: Rejected triples as mutable mappings; each one receives a
            "timestamp" entry before being inserted.
    """
    mongo_ur = os.getenv("MONGO_UR")
    if not mongo_ur or not rejected:
        return
    mongo_user = os.getenv("MONGO_USER")
    mongo_pass = os.getenv("MONGO_PASS")
    try:
        # The context manager closes the client (and its connections) on exit.
        with MongoClient(mongo_ur, username=mongo_user, password=mongo_pass) as client:
            collection = client["semantic_discovery"]["rejected_triples"]
            docs = []
            for doc in rejected:
                doc["timestamp"] = time.time()
                docs.append(doc)
            collection.insert_many(docs)
        st.info("💾 Scarti archiviati correttamente su MongoDB.")
    except Exception as e:
        st.error(f"Errore scrittura DLQ: {e}")


# Tab 1: document ingestion followed by the three pipeline phases. Each phase
# is unlocked by st.session_state.pipeline_stage reaching the previous phase.
with tab_gen:
    st.subheader("1. Ingestion Documentale")
    st.info("Inserisci il testo da analizzare nel campo sottostante.")

    with st.form("ingestion_form"):
        input_text = st.text_area("Testo del documento:", value=st.session_state.document_text, height=200)
        submitted = st.form_submit_button("Salva Testo e Prepara Pipeline")

    if submitted:
        # Saving a new, non-blank text restarts the pipeline from stage 0.
        if input_text != st.session_state.document_text and input_text.strip() != "":
            st.session_state.document_text = input_text
            st.session_state.pipeline_stage = 0
            st.rerun()

    st.markdown("---")
    progress_val = int((st.session_state.pipeline_stage / 3) * 100)
    st.progress(progress_val, text=f"Progresso Pipeline: {progress_val}%")

    # ---- Phase 1: semantic chunking ----
    with st.container():
        st.markdown(f"### {'✅' if st.session_state.pipeline_stage >= 1 else '1️⃣'} Fase 1: Semantic Chunking")

        with st.expander("ℹ️ Cosa fa questa fase?"):
            st.write("Segmenta il testo in frammenti coerenti analizzando la similarità semantica vettoriale tra le frasi.")

        # Use the per-session validation flag (the same one Phase 2 checks).
        # The previous module-level key variable was always empty, which left
        # the start button permanently disabled.
        is_groq_ready = st.session_state.groq_valid

        if st.session_state.pipeline_stage >= 1:
            chunks = st.session_state.chunks
            st.success(f"Chunking completato! Generati {len(chunks)} frammenti semantici.")
            with st.expander("Vedi dettagli frammenti"):
                st.json(chunks)
        else:
            if st.button("Avvia Semantic Splitter", type="primary", disabled=not is_groq_ready):
                # Chunk the saved document rather than the raw widget value,
                # which may hold a submission that was rejected (e.g. blank).
                document_text = st.session_state.document_text
                if not document_text.strip():
                    st.warning("⚠️ Inserisci e salva un testo prima di avviare il chunking.")
                else:
                    with st.spinner("Creazione chunks in corso..."):
                        try:
                            splitter = get_splitter()
                            chunks, _, _ = splitter.create_chunks(document_text, percentile_threshold=90)
                        except Exception as e:
                            st.error(f"Errore durante il chunking: {e}")
                        else:
                            st.session_state.chunks = chunks
                            st.session_state.pipeline_stage = 1
                            # Rerun outside the try so the rerun signal never
                            # passes through the error handler.
                            st.rerun()

    st.markdown("⬇️")

    # ---- Phase 2: TDDT extraction ----
    is_step_b_unlocked = st.session_state.pipeline_stage >= 1

    with st.container():
        color = "white" if is_step_b_unlocked else "gray"
        icon = "✅" if st.session_state.pipeline_stage >= 2 else ("2️⃣" if is_step_b_unlocked else "🔒")
        st.markdown(f"<h3 style='color:{color}'>{icon} Fase 2: TDDT Extraction</h3>", unsafe_allow_html=True)

        with st.expander("ℹ️ Cosa fa questa fase?"):
            st.write("Esegue l'estrazione gerarchica in due passaggi: prima classifica le entità usando le root dell'ontologia, poi estrae le relazioni passando all'LLM solo le proprietà ammesse (Domain Index).")

        if not is_step_b_unlocked:
            st.caption("Completa la Fase 1 per sbloccare l'estrazione.")
        elif st.session_state.pipeline_stage >= 2:
            data = st.session_state.extraction_data
            st.success(f"Estrazione TDDT completata! Identificate {len(data['entities'])} entità e {len(data['triples'])} triple.")
            with st.expander("Vedi dati estratti (Pre-Validazione)"):
                st.write("Entità Inferite:", data['entities'])
                if data['triples']:
                    st.dataframe(pd.DataFrame([t.model_dump() for t in data['triples']]), hide_index=True)
        else:
            is_extraction_ready = st.session_state.groq_valid
            if st.button("Avvia Estrazione TDDT", type="primary", disabled=not is_extraction_ready):
                if not st.session_state.groq_valid:
                    st.warning("⚠️ Per avviare l'estrazione devi prima inserire e verificare una GROQ_API_KEY valida nella sidebar.")
                else:
                    with st.spinner("Classificazione ed estrazione gerarchica in corso..."):
                        try:
                            chunks = st.session_state.chunks
                            extractor = get_extractor()
                            all_triples = []
                            prog_bar = st.progress(0)

                            for i, chunk in enumerate(chunks):
                                chunk_id = f"st_req_chunk_{i+1}"
                                res = extractor.extract(chunk, source_id=chunk_id)

                                if res and res.triples:
                                    all_triples.extend(res.triples)

                                prog_bar.progress((i + 1) / len(chunks))

                                # Pause between chunks (not after the last one)
                                # to pace the calls to the Groq API.
                                if i < len(chunks) - 1:
                                    print("⏳ Pacing per Groq API: attesa 20s per non sforare i 30K TPM...")
                                    time.sleep(20)

                            # De-duplicate subjects and objects, keeping
                            # first-seen order so the list is deterministic.
                            unique_entities = list(dict.fromkeys(
                                [t.subject for t in all_triples] + [t.object for t in all_triples]
                            ))
                        except Exception as e:
                            st.error(f"Errore: {e}")
                        else:
                            st.session_state.extraction_data = {"entities": unique_entities, "triples": all_triples}
                            st.session_state.pipeline_stage = 2
                            st.rerun()

    st.markdown("⬇️")

    # ---- Phase 3: entity resolution, validation and persistence ----
    is_step_c_unlocked = st.session_state.pipeline_stage >= 2

    with st.container():
        color = "white" if is_step_c_unlocked else "gray"
        icon = "✅" if st.session_state.pipeline_stage >= 3 else ("3️⃣" if is_step_c_unlocked else "🔒")
        st.markdown(f"<h3 style='color:{color}'>{icon} Fase 3: Resolution & SHACL Blocking</h3>", unsafe_allow_html=True)

        with st.expander("ℹ️ Cosa fa questa fase?"):
            st.write("Risolve le entità (Entity Linking) e applica la validazione OWL RL. Le triple non conformi vengono scartate e salvate nella Dead Letter Queue (MongoDB), mentre quelle valide popolano Neo4j.")

        if not is_step_c_unlocked:
            st.caption("Completa la Fase 2 per procedere.")
        elif st.session_state.pipeline_stage >= 3:
            st.success("Grafo Aggiornato! Le triple conformi sono su Neo4j, gli scarti su Mongo (se configurato).")
        else:
            if not driver:
                st.error("⚠️ Connettiti a Neo4j (nella sidebar) per procedere.")
            else:
                if st.button("Valida e Scrivi su Grafo", type="primary"):
                    with st.spinner("Risoluzione, validazione logica e persistenza..."):
                        try:
                            raw_data = st.session_state.extraction_data
                            all_entities = raw_data.get("entities", [])
                            all_triples = raw_data.get("triples", [])

                            # Persister and resolver both work on the driver
                            # opened from the sidebar settings.
                            persister = KnowledgeGraphPersister()
                            persister.driver = driver
                            persister._create_constraints()

                            resolver = get_resolver()
                            resolver.driver = driver
                            all_entities, resolved_triples, entities_to_save = resolver.resolve_entities(all_entities, all_triples)

                            validator = get_validator()
                            valid_triples, invalid_triples, _report = validator.filter_valid_triples(entities_to_save, resolved_triples)

                            if invalid_triples:
                                st.warning(f"Rilevate {len(invalid_triples)} violazioni ontologiche. Scartate dalla persistenza.")
                                _archive_rejected_triples(invalid_triples)

                            persister.save_entities_and_triples(entities_to_save, valid_triples)
                        except Exception as e:
                            st.error(f"Errore critico: {e}")
                        else:
                            st.session_state.pipeline_stage = 3
                            st.rerun()
|
|
| |
| |
| |
# Tab 2: graph totals plus a table of every relationship stored in Neo4j.
with tab_val:
    st.header("Curation & Feedback Loop")
    if not driver:
        st.warning("Database non connesso.")
    else:
        graph_stats = run_query(driver, "MATCH (n) RETURN count(n) as nodes, count{()-->()} as rels")
        if graph_stats:
            totals = graph_stats[0]
            nodes_col, rels_col = st.columns(2)
            nodes_col.metric("Nodi Totali", totals['nodes'])
            rels_col.metric("Relazioni", totals['rels'])

        # One row per relationship; missing evidence/reasoning become 'N/A'.
        cypher_val = """
        MATCH (s)-[r]->(o)
        RETURN elementId(r) as id,
        COALESCE(s.label, head(labels(s))) as Soggetto,
        type(r) as Predicato,
        COALESCE(o.label, head(labels(o))) as Oggetto,
        COALESCE(r.evidence, 'N/A') as Evidenza,
        COALESCE(r.reasoning, 'N/A') as Ragionamento
        """
        relationship_rows = run_query(driver, cypher_val)

        if not relationship_rows:
            st.info("Grafo vuoto o relazioni senza nuovi attributi.")
        else:
            # The relationship id is fetched but not shown in the table.
            visible_table = pd.DataFrame(relationship_rows).drop(columns=["id"])
            st.dataframe(visible_table, width='stretch', hide_index=True)
|
|
| |
| |
| |
# HTML/CSS/JS spliced in place of the closing </body> tag of the pyvis page:
# it adds a fullscreen toggle button on top of the network canvas.
_FULLSCREEN_ADDON = """
<style>
/* Quando l'iframe entra in fullscreen, forziamo il div di Pyvis a coprire l'intero schermo */
:fullscreen #mynetwork { height: 100vh !important; width: 100vw !important; }
:-webkit-full-screen #mynetwork { height: 100vh !important; width: 100vw !important; }
:-moz-full-screen #mynetwork { height: 100vh !important; width: 100vw !important; }

#fs-btn {
position: absolute; top: 15px; right: 15px; z-index: 9999;
width: 40px; height: 40px;
background-color: rgba(34, 34, 34, 0.7);
color: #4facfe; border: 1px solid #4facfe; border-radius: 8px;
cursor: pointer; display: flex; align-items: center; justify-content: center;
box-shadow: 0 4px 6px rgba(0,0,0,0.3); transition: all 0.2s ease-in-out;
}
#fs-btn:hover { background-color: #4facfe; color: white; }
</style>

<button id="fs-btn" onclick="toggleFullScreen()" title="Schermo Intero">
<svg id="fs-icon" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M8 3H5a2 2 0 0 0-2 2v3m18 0V5a2 2 0 0 0-2-2h-3m0 18h3a2 2 0 0 0 2-2v-3M3 16v3a2 2 0 0 0 2 2h3"></path>
</svg>
</button>

<script>
const iconExpand = '<path d="M8 3H5a2 2 0 0 0-2 2v3m18 0V5a2 2 0 0 0-2-2h-3m0 18h3a2 2 0 0 0 2-2v-3M3 16v3a2 2 0 0 0 2 2h3"></path>';
const iconCompress = '<path d="M8 3v3a2 2 0 0 1-2 2H3m18 0h-3a2 2 0 0 1-2-2V3m0 18v-3a2 2 0 0 1 2-2h3M3 16h3a2 2 0 0 1 2 2v3"></path>';

function toggleFullScreen() {
if (!document.fullscreenElement) {
document.documentElement.requestFullscreen().catch(err => console.log(err));
} else {
if (document.exitFullscreen) { document.exitFullscreen(); }
}
}

// Ascoltiamo l'evento fullscreen per cambiare l'icona (Espandi/Riduci) anche se l'utente preme "ESC"
document.addEventListener('fullscreenchange', (event) => {
const icon = document.getElementById('fs-icon');
if (document.fullscreenElement) {
icon.innerHTML = iconCompress;
document.getElementById('fs-btn').title = "Riduci Schermo";
} else {
icon.innerHTML = iconExpand;
document.getElementById('fs-btn').title = "Schermo Intero";
}
});
</script>
</body>
"""


def _build_graph_html(graph_data):
    """Render query rows as an interactive pyvis network and return the page HTML.

    Args:
        graph_data: Rows with the keys 'src', 'src_labels', 'rel', 'dst' and
            'dst_labels'. Rows without a 'src' value are skipped; an edge is
            added only when both 'dst' and 'rel' are present.

    Returns:
        The HTML produced by pyvis with the fullscreen add-on spliced in.
    """
    net = Network(height="800px", width="100%", bgcolor="#222222", font_color="white", notebook=False)

    for item in graph_data:
        if not item['src']:
            continue
        src_label_text = str(item['src'])
        src_color = get_node_color(item['src_labels'])
        net.add_node(src_label_text, label=src_label_text, color=src_color, title=f"Labels: {item['src_labels']}")

        if item['dst'] and item['rel']:
            dst_label_text = str(item['dst'])
            rel_type = str(item['rel'])
            dst_color = get_node_color(item['dst_labels'])

            net.add_node(dst_label_text, label=dst_label_text, color=dst_color, title=f"Labels: {item['dst_labels']}")
            net.add_edge(src_label_text, dst_label_text, title=rel_type)

    net.force_atlas_2based(
        gravity=-50,
        central_gravity=0.01,
        spring_length=100,
        spring_strength=0.08,
        damping=0.4
    )
    net.toggle_physics(True)

    # pyvis writes the page to a file. A temporary directory is removed on
    # exit, so repeated graph generations no longer leave stray .html files
    # behind (the previous NamedTemporaryFile(delete=False) was never deleted).
    with tempfile.TemporaryDirectory() as tmp_dir:
        html_path = os.path.join(tmp_dir, "graph.html")
        net.save_graph(html_path)
        with open(html_path, 'r', encoding='utf-8') as f:
            raw_html = f.read()

    return raw_html.replace("</body>", _FULLSCREEN_ADDON)


# Tab 3: on-demand interactive visualisation of the Neo4j graph. The generated
# HTML is kept in session state so it survives reruns.
with tab_vis:
    st.header("Esplorazione Topologica")
    if driver:
        col_ctrl, col_info = st.columns([1, 4])
        with col_ctrl:
            generate_graph = st.button("🔄 Genera / Aggiorna Grafo", type="primary")

        if generate_graph:
            with st.spinner("Estrazione dati e generazione del grafo interattivo..."):
                # OPTIONAL MATCH keeps Resource nodes that have no outgoing
                # relationship; rel/dst are null for those rows.
                cypher_vis = """
                MATCH (s:Resource)
                OPTIONAL MATCH (s)-[r]->(o:Resource)
                RETURN
                s.label AS src,
                labels(s) AS src_labels,
                type(r) AS rel,
                o.label AS dst,
                labels(o) AS dst_labels
                """
                graph_data = run_query(driver, cypher_vis)

                if graph_data:
                    st.session_state.graph_html = _build_graph_html(graph_data)
                else:
                    st.warning("Il grafo è attualmente vuoto.")
                    st.session_state.graph_html = None

        if st.session_state.graph_html:
            components.html(st.session_state.graph_html, height=800, scrolling=True)
        else:
            st.info("👆 Clicca su 'Genera / Aggiorna Grafo' per visualizzare i dati attuali di Neo4j.")

    else:
        st.warning("Database non connesso. Configura le credenziali nella sidebar.")