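# Page residue preserved as comments (not part of the program):
# GaetanoParente's picture
# Commit message: "rimossi import inutili e blindato utilizzo utente"
#   (removed unused imports and locked down user usage)
# Commit: 9cbbfac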
import streamlit.components.v1 as components
import streamlit as st
import os
import time
import tempfile
import pandas as pd
from pymongo import MongoClient
from neo4j import GraphDatabase
from pyvis.network import Network
from dotenv import load_dotenv
import warnings
import logging
import requests
# --- IMPORT MODULI SPECIFICI ---
from src.ingestion.semantic_splitter import ActivaSemanticSplitter
from src.extraction.extractor import NeuroSymbolicExtractor
from src.validation.validator import SemanticValidator
from src.graph.graph_loader import KnowledgeGraphPersister
from src.graph.entity_resolver import EntityResolver
# Silence FutureWarning/DeprecationWarning noise and keep the "transformers"
# logger at ERROR level.
for _ignored_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)
logging.getLogger("transformers").setLevel(logging.ERROR)

# --- PAGE CONFIGURATION ---
# load_dotenv() runs before the os.getenv lookups further down the script.
load_dotenv()
st.set_page_config(
    page_title="Activa Semantic Discovery",
    page_icon="🧠",
    layout="wide",
    initial_sidebar_state="expanded",
)
def local_css(file_name):
    """Inject the contents of a CSS file into the Streamlit page.

    The stylesheet is read as UTF-8 (independent of the platform default
    encoding) and rendered inside a ``<style>`` tag through ``st.markdown``.
    A path that is missing, or that is not a regular file, is ignored.

    Args:
        file_name: Path of the stylesheet to load.
    """
    if not os.path.isfile(file_name):
        return
    with open(file_name, "r", encoding="utf-8") as css_file:
        st.markdown(f"<style>{css_file.read()}</style>", unsafe_allow_html=True)


local_css("assets/style.css")
# --- SESSION STATE MANAGEMENT ---
# Seed every key the app reads; keys that already exist in the session are
# left untouched. The mapping is rebuilt on each script run, so the mutable
# defaults are fresh objects every time.
_SESSION_DEFAULTS = {
    "groq_valid": False,
    "pipeline_stage": 0,
    "document_text": "",
    "chunks": [],
    "extraction_data": {"entities": [], "triples": []},
    "graph_html": None,
}
for _state_key, _initial_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
def reset_pipeline():
    """Send the pipeline back to stage 0 and drop its intermediate data.

    Clears the stored document text, the chunks and the extraction results.
    ``groq_valid`` and ``graph_html`` are not touched.
    """
    state = st.session_state
    state.extraction_data = {"entities": [], "triples": []}
    state.chunks = []
    state.document_text = ""
    state.pipeline_stage = 0
# --- CACHING RISORSE ---
@st.cache_resource
def get_splitter():
    """Return the ``ActivaSemanticSplitter`` built with model name ``all-MiniLM-L6-v2``.

    Decorated with ``st.cache_resource``.
    """
    embedding_model = "all-MiniLM-L6-v2"
    return ActivaSemanticSplitter(model_name=embedding_model)
@st.cache_resource
def get_extractor():
    """Return the ``NeuroSymbolicExtractor`` built on ``ontology/domain_index.json``.

    Decorated with ``st.cache_resource``.
    """
    domain_index_path = "ontology/domain_index.json"
    return NeuroSymbolicExtractor(index_path=domain_index_path)
@st.cache_resource(show_spinner="🧩 Inizializzazione Entity Resolver...")
def get_resolver():
    """Return an ``EntityResolver`` created without a driver (threshold 0.85).

    Decorated with ``st.cache_resource``; the resolver is built with
    ``neo4j_driver=None``.
    """
    resolver_options = {"neo4j_driver": None, "similarity_threshold": 0.85}
    return EntityResolver(**resolver_options)
@st.cache_resource
def get_validator():
    """Return the ``SemanticValidator`` for the ``ontology`` directory.

    Uses ``ontology/shapes/auto_constraints.ttl`` as the shapes file.
    Decorated with ``st.cache_resource``.
    """
    validator_paths = {
        "ontology_dir": "ontology",
        "shapes_file": "ontology/shapes/auto_constraints.ttl",
    }
    return SemanticValidator(**validator_paths)
# Hex colour per known node label; "DEFAULT" is used for everything else.
COLOR_PALETTE = {
    "arco_CulturalProperty": "#FF5733",  # orange
    "core_Agent": "#33FF57",  # green
    "l0_Location": "#3357FF",  # blue
    "l0_Object": "#F333FF",  # purple
    "core_EventOrSituation": "#FFD433",  # yellow
    "clv_City": "#33FFF3",  # turquoise
    "DEFAULT": "#97C2FC",  # standard blue
}


def get_node_color(labels):
    """Pick the display colour for a node from its labels.

    The generic ``Resource`` label is skipped; the first remaining label is
    looked up in ``COLOR_PALETTE``.

    Args:
        labels: Iterable of label names attached to the node.

    Returns:
        The palette colour of the first non-``Resource`` label, or the
        ``DEFAULT`` colour when there is no such label or it is not in the
        palette.
    """
    fallback = COLOR_PALETTE["DEFAULT"]
    first_specific = next((name for name in labels if name != 'Resource'), None)
    if first_specific is None:
        return fallback
    return COLOR_PALETTE.get(first_specific, fallback)
def validate_groq_key(api_key):
    """Check a Groq API key with a lightweight request to the models endpoint.

    Args:
        api_key: Key to verify. An empty value or ``None`` is rejected
            without any network call.

    Returns:
        ``True`` only when the endpoint answers with HTTP 200; ``False`` for
        any other status code or when the request cannot be completed.
    """
    if not api_key:
        return False
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    try:
        response = requests.get("https://api.groq.com/openai/v1/models", headers=headers, timeout=5)
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors, timeouts and invalid
        # headers; ValueError covers keys that cannot be encoded into a
        # header (e.g. pasted non-latin characters). Unlike a bare except,
        # KeyboardInterrupt/SystemExit still propagate.
        return False
    return response.status_code == 200
# Pre-load the models into memory
for _warm_up in (get_splitter, get_extractor, get_validator):
    _warm_up()
# --- FUNZIONI NEO4J ---
def get_driver(uri, user, password):
    """Open a Neo4j driver and return it only if the server is reachable.

    Args:
        uri: Connection URI of the Neo4j instance.
        user: User name for authentication.
        password: Password for authentication.

    Returns:
        A driver whose connectivity check succeeded, or ``None`` when ``uri``
        or ``password`` is empty or when creating/verifying the driver fails.
    """
    if not uri or not password:
        return None
    driver = None
    try:
        driver = GraphDatabase.driver(uri, auth=(user, password))
        driver.verify_connectivity()
    except Exception:
        # A driver that was created but could not connect must be closed,
        # otherwise every failed attempt leaves its resources behind.
        if driver is not None:
            try:
                driver.close()
            except Exception:
                # Best effort: the caller only needs to know there is no
                # usable connection.
                pass
        return None
    return driver
def run_query(driver, query, params=None):
    """Run a query in a new session and return its records as dictionaries.

    Args:
        driver: Driver used to open the session, or ``None``.
        query: Query text passed to ``session.run``.
        params: Optional parameters passed along with the query.

    Returns:
        A list with ``record.data()`` for every record of the result; an
        empty list when ``driver`` is ``None``.
    """
    if driver is not None:
        with driver.session() as db_session:
            return [record.data() for record in db_session.run(query, params)]
    return []
# --- UI: SIDEBAR ---
# Builds the configuration sidebar: Groq key verification, Neo4j connection
# settings and the pipeline reset button. Also defines `driver`, which the
# tabs below read.
st.sidebar.title("⚙️ Configurazione")
env_uri = os.getenv("NEO4J_URI", "")
env_user = os.getenv("NEO4J_USER", "neo4j")
env_password = os.getenv("NEO4J_PASSWORD", "")
# NOTE(review): the Groq key is hard-coded to an empty string here, so the
# auto-validation branch right below never runs and the key always has to be
# typed in and verified through the sidebar.
env_groq_key = ""
st.sidebar.subheader("Backend AI (TDDT)")
if env_groq_key and not st.session_state.groq_valid:
    if validate_groq_key(env_groq_key):
        st.session_state.groq_valid = True
    else:
        os.environ["GROQ_API_KEY"] = ""
        env_groq_key = ""
if st.session_state.groq_valid:
    st.sidebar.success("✅ Groq API Key: Valida e Attiva")
else:
    groq_key_input = st.sidebar.text_input("Inserisci GROQ_API_KEY", type="password")
    if st.sidebar.button("Verifica Chiave"):
        with st.spinner("Verifica in corso..."):
            if validate_groq_key(groq_key_input):
                # A verified key is exported through the process environment.
                os.environ["GROQ_API_KEY"] = groq_key_input
                st.session_state.groq_valid = True
                st.sidebar.success("✅ Chiave valida!")
                # Short pause before the rerun replaces the sidebar content.
                time.sleep(1)
                st.rerun()
            else:
                st.sidebar.error("❌ Chiave non valida o non autorizzata.")
st.sidebar.subheader("Knowledge Graph")
uri = st.sidebar.text_input("URI Neo4j", value=env_uri)
user = st.sidebar.text_input("User Neo4j", value=env_user)
# The stored password is never pre-filled; the placeholder only says whether
# one is already configured in the environment.
pwd_placeholder = "✅ Configurato (Lascia vuoto)" if env_password else "Inserisci Password Neo4j"
password_input = st.sidebar.text_input("Password Neo4j", type="password", placeholder=pwd_placeholder)
# A typed password takes precedence over the one from the environment.
password = password_input if password_input else env_password
driver = None
if uri and password:
    # get_driver is called on every script run in which both values are set.
    driver = get_driver(uri, user, password)
    if driver:
        st.sidebar.success("🟢 Connesso a Neo4j")
        # Working credentials are written back to the process environment.
        os.environ["NEO4J_URI"] = uri
        os.environ["NEO4J_USER"] = user
        os.environ["NEO4J_PASSWORD"] = password
    else:
        st.sidebar.error("🔴 Errore connessione Neo4j")
st.sidebar.divider()
# reset_pipeline runs as the click callback; the info message is shown on the
# run in which the button reports the click.
if st.sidebar.button("🔄 Reset Pipeline", on_click=reset_pipeline):
    st.sidebar.info("Stato resettato.")
# --- MAIN HEADER ---
st.title("🧠 Automated Semantic Discovery Prototype")
st.markdown("**Type-Driven Domain Traversal (TDDT) & OWL RL Validation**")
# One tab per area of the app: generation pipeline, data/DLQ review, graph view.
_TAB_TITLES = [
    "⚙️ 1. Pipeline Generativa",
    "🔍 2. Dati e DLQ",
    "🕸️ 3. Esplorazione Grafo",
]
tab_gen, tab_val, tab_vis = st.tabs(_TAB_TITLES)
# ==============================================================================
# TAB 1: PIPELINE GENERATIVA (STEPPER UI)
# ==============================================================================
# Tab 1 drives the three-stage pipeline. st.session_state.pipeline_stage
# (0..3) records how many stages are complete and gates which controls show.
with tab_gen:
    st.subheader("1. Ingestion Documentale")
    st.info("Inserisci il testo da analizzare nel campo sottostante.")
    with st.form("ingestion_form"):
        input_text = st.text_area("Testo del documento:", value=st.session_state.document_text, height=200)
        submitted = st.form_submit_button("Salva Testo e Prepara Pipeline")
    if submitted:
        # Only a new, non-blank text replaces the saved document and rewinds
        # the pipeline.
        if input_text != st.session_state.document_text and input_text.strip() != "":
            st.session_state.document_text = input_text
            st.session_state.pipeline_stage = 0
            st.rerun()
    st.markdown("---")
    progress_val = int((st.session_state.pipeline_stage / 3) * 100)
    st.progress(progress_val, text=f"Progresso Pipeline: {progress_val}%")

    # ==========================
    # PHASE 1: CHUNKING
    # ==========================
    with st.container():
        st.markdown(f"### {'✅' if st.session_state.pipeline_stage >= 1 else '1️⃣'} Fase 1: Semantic Chunking")
        with st.expander("ℹ️ Cosa fa questa fase?"):
            st.write("Segmenta il testo in frammenti coerenti analizzando la similarità semantica vettoriale tra le frasi.")
        # Gate on the verified-key flag, as Phase 2 does. The previous check,
        # bool(env_groq_key), was always False because env_groq_key is
        # hard-coded to "", so this button could never be enabled.
        is_groq_ready = st.session_state.groq_valid
        if st.session_state.pipeline_stage >= 1:
            chunks = st.session_state.chunks
            st.success(f"Chunking completato! Generati {len(chunks)} frammenti semantici.")
            with st.expander("Vedi dettagli frammenti"):
                st.json(chunks)
        else:
            if st.button("Avvia Semantic Splitter", type="primary", disabled=not is_groq_ready):
                # Chunk the saved document, not the raw widget value: the two
                # differ when a blank text was submitted and rejected above.
                document_text = st.session_state.document_text
                if not document_text.strip():
                    st.warning("⚠️ Inserisci e salva un testo prima di avviare il chunking.")
                else:
                    with st.spinner("Creazione chunks in corso..."):
                        try:
                            splitter = get_splitter()
                            chunks, _, _ = splitter.create_chunks(document_text, percentile_threshold=90)
                        except Exception as e:
                            st.error(f"Errore durante il chunking: {e}")
                        else:
                            # State update and rerun stay outside the try so
                            # the handler only sees splitter failures.
                            st.session_state.chunks = chunks
                            st.session_state.pipeline_stage = 1
                            st.rerun()
    st.markdown("⬇️")

    # ==========================
    # PHASE 2: EXTRACTION (TDDT)
    # ==========================
    is_step_b_unlocked = st.session_state.pipeline_stage >= 1
    with st.container():
        color = "white" if is_step_b_unlocked else "gray"
        icon = "✅" if st.session_state.pipeline_stage >= 2 else ("2️⃣" if is_step_b_unlocked else "🔒")
        st.markdown(f"<h3 style='color:{color}'>{icon} Fase 2: TDDT Extraction</h3>", unsafe_allow_html=True)
        with st.expander("ℹ️ Cosa fa questa fase?"):
            st.write("Esegue l'estrazione gerarchica in due passaggi: prima classifica le entità usando le root dell'ontologia, poi estrae le relazioni passando all'LLM solo le proprietà ammesse (Domain Index).")
        if not is_step_b_unlocked:
            st.caption("Completa la Fase 1 per sbloccare l'estrazione.")
        elif st.session_state.pipeline_stage >= 2:
            data = st.session_state.extraction_data
            st.success(f"Estrazione TDDT completata! Identificate {len(data['entities'])} entità e {len(data['triples'])} triple.")
            with st.expander("Vedi dati estratti (Pre-Validazione)"):
                st.write("Entità Inferite:", data['entities'])
                if data['triples']:
                    st.dataframe(pd.DataFrame([t.model_dump() for t in data['triples']]), hide_index=True)
        else:
            is_extraction_ready = st.session_state.groq_valid
            if st.button("Avvia Estrazione TDDT", type="primary", disabled=not is_extraction_ready):
                if not st.session_state.groq_valid:
                    st.warning("⚠️ Per avviare l'estrazione devi prima inserire e verificare una GROQ_API_KEY valida nella sidebar.")
                else:
                    with st.spinner("Classificazione ed estrazione gerarchica in corso..."):
                        try:
                            chunks = st.session_state.chunks
                            extractor = get_extractor()
                            all_triples = []
                            prog_bar = st.progress(0)
                            for i, chunk in enumerate(chunks):
                                chunk_id = f"st_req_chunk_{i+1}"
                                res = extractor.extract(chunk, source_id=chunk_id)
                                if res and res.triples:
                                    all_triples.extend(res.triples)
                                prog_bar.progress((i+1)/len(chunks))
                                # Pause between chunks (not after the last
                                # one) to pace the calls to the Groq API.
                                if i < len(chunks) - 1:
                                    print("⏳ Pacing per Groq API: attesa 20s per non sforare i 30K TPM...")
                                    time.sleep(20)
                            # Unique entities taken from the triples, for the resolver.
                            unique_entities = list(set([t.subject for t in all_triples] + [t.object for t in all_triples]))
                        except Exception as e:
                            st.error(f"Errore: {e}")
                        else:
                            st.session_state.extraction_data = {"entities": unique_entities, "triples": all_triples}
                            st.session_state.pipeline_stage = 2
                            st.rerun()
    st.markdown("⬇️")

    # ==========================
    # PHASE 3: RESOLUTION & VALIDATION (BLOCKING)
    # ==========================
    is_step_c_unlocked = st.session_state.pipeline_stage >= 2
    with st.container():
        color = "white" if is_step_c_unlocked else "gray"
        icon = "✅" if st.session_state.pipeline_stage >= 3 else ("3️⃣" if is_step_c_unlocked else "🔒")
        st.markdown(f"<h3 style='color:{color}'>{icon} Fase 3: Resolution & SHACL Blocking</h3>", unsafe_allow_html=True)
        with st.expander("ℹ️ Cosa fa questa fase?"):
            st.write("Risolve le entità (Entity Linking) e applica la validazione OWL RL. Le triple non conformi vengono scartate e salvate nella Dead Letter Queue (MongoDB), mentre quelle valide popolano Neo4j.")
        if not is_step_c_unlocked:
            st.caption("Completa la Fase 2 per procedere.")
        elif st.session_state.pipeline_stage >= 3:
            st.success("Grafo Aggiornato! Le triple conformi sono su Neo4j, gli scarti su Mongo (se configurato).")
        else:
            if not driver:
                st.error("⚠️ Connettiti a Neo4j (nella sidebar) per procedere.")
            else:
                if st.button("Valida e Scrivi su Grafo", type="primary"):
                    with st.spinner("Risoluzione, validazione logica e persistenza..."):
                        try:
                            raw_data = st.session_state.extraction_data
                            all_entities = raw_data.get("entities", [])
                            all_triples = raw_data.get("triples", [])
                            # The persister and the (cached) resolver are both
                            # pointed at the driver opened in the sidebar.
                            persister = KnowledgeGraphPersister()
                            persister.driver = driver
                            persister._create_constraints()
                            resolver = get_resolver()
                            resolver.driver = driver
                            _, resolved_triples, entities_to_save = resolver.resolve_entities(all_entities, all_triples)
                            validator = get_validator()
                            valid_triples, invalid_triples, _report = validator.filter_valid_triples(entities_to_save, resolved_triples)
                            if invalid_triples:
                                st.warning(f"Rilevate {len(invalid_triples)} violazioni ontologiche. Scartate dalla persistenza.")
                                # Store the rejected triples in the DLQ (MongoDB).
                                # NOTE(review): the variable is read as "MONGO_UR";
                                # confirm this is the intended name and not "MONGO_URI".
                                mongo_ur = os.getenv("MONGO_UR")
                                mongo_user = os.getenv("MONGO_USER")
                                mongo_pass = os.getenv("MONGO_PASS")
                                if mongo_ur:
                                    try:
                                        # The context manager closes the client
                                        # once the rejects have been written.
                                        with MongoClient(mongo_ur, username=mongo_user, password=mongo_pass) as client:
                                            rejected = client["semantic_discovery"]["rejected_triples"]
                                            docs = []
                                            for doc in invalid_triples:
                                                doc["timestamp"] = time.time()
                                                docs.append(doc)
                                            rejected.insert_many(docs)
                                        st.info("💾 Scarti archiviati correttamente su MongoDB.")
                                    except Exception as e:
                                        st.error(f"Errore scrittura DLQ: {e}")
                            # Only the valid triples are persisted.
                            persister.save_entities_and_triples(entities_to_save, valid_triples)
                        except Exception as e:
                            st.error(f"Errore critico: {e}")
                        else:
                            st.session_state.pipeline_stage = 3
                            st.rerun()
# ==============================================================================
# TAB 2: VALIDAZIONE E DLQ (Aggiornato per 1.4)
# ==============================================================================
# Tab 2 shows node/relationship totals and a table of every relationship
# currently stored in the graph.
with tab_val:
    st.header("Curation & Feedback Loop")
    if not driver:
        st.warning("Database non connesso.")
    else:
        graph_stats = run_query(driver, "MATCH (n) RETURN count(n) as nodes, count{()-->()} as rels")
        if graph_stats:
            totals = graph_stats[0]
            nodes_col, rels_col = st.columns(2)
            nodes_col.metric("Nodi Totali", totals['nodes'])
            rels_col.metric("Relazioni", totals['rels'])
        cypher_val = """
MATCH (s)-[r]->(o)
RETURN elementId(r) as id,
COALESCE(s.label, head(labels(s))) as Soggetto,
type(r) as Predicato,
COALESCE(o.label, head(labels(o))) as Oggetto,
COALESCE(r.evidence, 'N/A') as Evidenza,
COALESCE(r.reasoning, 'N/A') as Ragionamento
"""
        triples_data = run_query(driver, cypher_val)
        if not triples_data:
            st.info("Grafo vuoto o relazioni senza nuovi attributi.")
        else:
            triples_df = pd.DataFrame(triples_data)
            # The relationship id is queried but not shown in the table.
            st.dataframe(triples_df.drop(columns=["id"]), width='stretch', hide_index=True)
# ==============================================================================
# TAB 3: ESPLORAZIONE GRAFO
# ==============================================================================
# Tab 3 renders the Resource nodes and their relationships as an interactive
# pyvis network. The generated HTML is kept in st.session_state.graph_html so
# it is shown again on later reruns without querying the database.
with tab_vis:
    st.header("Esplorazione Topologica")
    if driver:
        col_ctrl, _ = st.columns([1, 4])
        with col_ctrl:
            generate_graph = st.button("🔄 Genera / Aggiorna Grafo", type="primary")
        if generate_graph:
            with st.spinner("Estrazione dati e generazione del grafo interattivo..."):
                cypher_vis = """
MATCH (s:Resource)
OPTIONAL MATCH (s)-[r]->(o:Resource)
RETURN
s.label AS src,
labels(s) AS src_labels,
type(r) AS rel,
o.label AS dst,
labels(o) AS dst_labels
"""
                graph_data = run_query(driver, cypher_vis)
                if graph_data:
                    net = Network(height="800px", width="100%", bgcolor="#222222", font_color="white", notebook=False)
                    for item in graph_data:
                        # Rows whose source node has no label text are skipped.
                        if not item['src']:
                            continue
                        # The label text doubles as the node id, so nodes that
                        # share a label collapse into one.
                        src_label_text = str(item['src'])
                        src_color = get_node_color(item['src_labels'])
                        net.add_node(src_label_text, label=src_label_text, color=src_color, title=f"Labels: {item['src_labels']}")
                        # OPTIONAL MATCH leaves dst/rel empty for isolated nodes.
                        if item['dst'] and item['rel']:
                            dst_label_text = str(item['dst'])
                            rel_type = str(item['rel'])
                            dst_color = get_node_color(item['dst_labels'])
                            net.add_node(dst_label_text, label=dst_label_text, color=dst_color, title=f"Labels: {item['dst_labels']}")
                            net.add_edge(src_label_text, dst_label_text, title=rel_type)
                    net.force_atlas_2based(
                        gravity=-50,
                        central_gravity=0.01,
                        spring_length=100,
                        spring_strength=0.08,
                        damping=0.4
                    )
                    net.toggle_physics(True)
                    # Write the page into a throwaway directory that is removed
                    # on exit. The previous NamedTemporaryFile(delete=False)
                    # left one file behind per generation and was written to
                    # while still held open.
                    with tempfile.TemporaryDirectory() as tmp_dir:
                        html_path = os.path.join(tmp_dir, "graph.html")
                        net.save_graph(html_path)
                        with open(html_path, 'r', encoding='utf-8') as f:
                            raw_html = f.read()
                    # Fullscreen button, styles and script, injected just
                    # before the closing </body> tag of the generated page.
                    fullscreen_addon = """
<style>
/* Quando l'iframe entra in fullscreen, forziamo il div di Pyvis a coprire l'intero schermo */
:fullscreen #mynetwork { height: 100vh !important; width: 100vw !important; }
:-webkit-full-screen #mynetwork { height: 100vh !important; width: 100vw !important; }
:-moz-full-screen #mynetwork { height: 100vh !important; width: 100vw !important; }
#fs-btn {
position: absolute; top: 15px; right: 15px; z-index: 9999;
width: 40px; height: 40px;
background-color: rgba(34, 34, 34, 0.7);
color: #4facfe; border: 1px solid #4facfe; border-radius: 8px;
cursor: pointer; display: flex; align-items: center; justify-content: center;
box-shadow: 0 4px 6px rgba(0,0,0,0.3); transition: all 0.2s ease-in-out;
}
#fs-btn:hover { background-color: #4facfe; color: white; }
</style>
<button id="fs-btn" onclick="toggleFullScreen()" title="Schermo Intero">
<svg id="fs-icon" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M8 3H5a2 2 0 0 0-2 2v3m18 0V5a2 2 0 0 0-2-2h-3m0 18h3a2 2 0 0 0 2-2v-3M3 16v3a2 2 0 0 0 2 2h3"></path>
</svg>
</button>
<script>
const iconExpand = '<path d="M8 3H5a2 2 0 0 0-2 2v3m18 0V5a2 2 0 0 0-2-2h-3m0 18h3a2 2 0 0 0 2-2v-3M3 16v3a2 2 0 0 0 2 2h3"></path>';
const iconCompress = '<path d="M8 3v3a2 2 0 0 1-2 2H3m18 0h-3a2 2 0 0 1-2-2V3m0 18v-3a2 2 0 0 1 2-2h3M3 16h3a2 2 0 0 1 2 2v3"></path>';
function toggleFullScreen() {
if (!document.fullscreenElement) {
document.documentElement.requestFullscreen().catch(err => console.log(err));
} else {
if (document.exitFullscreen) { document.exitFullscreen(); }
}
}
// Ascoltiamo l'evento fullscreen per cambiare l'icona (Espandi/Riduci) anche se l'utente preme "ESC"
document.addEventListener('fullscreenchange', (event) => {
const icon = document.getElementById('fs-icon');
if (document.fullscreenElement) {
icon.innerHTML = iconCompress;
document.getElementById('fs-btn').title = "Riduci Schermo";
} else {
icon.innerHTML = iconExpand;
document.getElementById('fs-btn').title = "Schermo Intero";
}
});
</script>
</body>
"""
                    st.session_state.graph_html = raw_html.replace("</body>", fullscreen_addon)
                else:
                    st.warning("Il grafo è attualmente vuoto.")
                    st.session_state.graph_html = None
        if st.session_state.graph_html:
            components.html(st.session_state.graph_html, height=800, scrolling=True)
        else:
            st.info("👆 Clicca su 'Genera / Aggiorna Grafo' per visualizzare i dati attuali di Neo4j.")
    else:
        st.warning("Database non connesso. Configura le credenziali nella sidebar.")