Spaces:

NextGenTech
/

AutomatedSemanticDiscovery

Sleeping

App Files Files Community

AutomatedSemanticDiscovery / app.py

GaetanoParente

rimossi import inutili e blindato utilizzo utente

9cbbfac 3 days ago

raw

history blame contribute delete

24.7 kB

	import streamlit.components.v1 as components
	import streamlit as st
	import os
	import time
	import tempfile
	import pandas as pd
	from pymongo import MongoClient
	from neo4j import GraphDatabase
	from pyvis.network import Network
	from dotenv import load_dotenv
	import warnings
	import logging
	import requests

	# --- IMPORT MODULI SPECIFICI ---
	from src.ingestion.semantic_splitter import ActivaSemanticSplitter
	from src.extraction.extractor import NeuroSymbolicExtractor
	from src.validation.validator import SemanticValidator
	from src.graph.graph_loader import KnowledgeGraphPersister
	from src.graph.entity_resolver import EntityResolver

	# Ignore FutureWarning and DeprecationWarning messages process-wide.
	for _ignored_category in (FutureWarning, DeprecationWarning):
	    warnings.filterwarnings("ignore", category=_ignored_category)

	# Only let the "transformers" logger emit records at ERROR level or above.
	logging.getLogger("transformers").setLevel(logging.ERROR)

	# --- PAGE CONFIGURATION ---
	load_dotenv()
	_page_settings = {
	    "page_title": "Activa Semantic Discovery",
	    "layout": "wide",
	    "initial_sidebar_state": "expanded",
	    "page_icon": "🧠",
	}
	st.set_page_config(**_page_settings)

	def local_css(file_name):
	    """Inject the contents of a CSS file into the page inside a ``<style>`` tag.

	    Args:
	        file_name: Path of the stylesheet to load.

	    Returns:
	        None. When ``file_name`` does not exist nothing is rendered.
	    """
	    if not os.path.exists(file_name):
	        return
	    # Read explicitly as UTF-8 so the result does not depend on the host's
	    # default locale encoding.
	    with open(file_name, "r", encoding="utf-8") as css_file:
	        stylesheet = css_file.read()
	    st.markdown(f"<style>{stylesheet}</style>", unsafe_allow_html=True)

	local_css("assets/style.css")

	# --- SESSION STATE MANAGEMENT ---
	# Seed every session-state key used by the app; keys that already exist keep
	# their current value.
	_session_defaults = {
	    'groq_valid': False,
	    'pipeline_stage': 0,
	    'document_text': "",
	    'chunks': [],
	    'extraction_data': {"entities": [], "triples": []},
	    'graph_html': None,
	}
	for _state_key, _initial_value in _session_defaults.items():
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _initial_value

	def reset_pipeline():
	    """Put the pipeline back at stage 0 and clear the document, chunks and extracted data."""
	    cleared_state = {
	        "pipeline_stage": 0,
	        "document_text": "",
	        "chunks": [],
	        "extraction_data": {"entities": [], "triples": []},
	    }
	    for state_key, fresh_value in cleared_state.items():
	        st.session_state[state_key] = fresh_value

	# --- CACHING RISORSE ---
	@st.cache_resource
	def get_splitter():
	    """Build the semantic splitter; ``st.cache_resource`` keeps the instance for reuse."""
	    splitter = ActivaSemanticSplitter(model_name="all-MiniLM-L6-v2")
	    return splitter

	@st.cache_resource
	def get_extractor():
	    """Build the extractor from the domain index file; the instance is cached for reuse."""
	    extractor = NeuroSymbolicExtractor(index_path="ontology/domain_index.json")
	    return extractor

	@st.cache_resource(show_spinner="🧩 Inizializzazione Entity Resolver...")
	def get_resolver():
	    """Build the entity resolver without a Neo4j driver; the instance is cached for reuse.

	    The caller is expected to attach a driver afterwards (the pipeline sets
	    ``resolver.driver`` before resolving).
	    """
	    resolver_options = {"neo4j_driver": None, "similarity_threshold": 0.85}
	    return EntityResolver(**resolver_options)

	@st.cache_resource
	def get_validator():
	    """Build the semantic validator from the ontology directory and shapes file; cached for reuse."""
	    validator_options = {
	        "ontology_dir": "ontology",
	        "shapes_file": "ontology/shapes/auto_constraints.ttl",
	    }
	    return SemanticValidator(**validator_options)

	# Node colours keyed by graph label; "DEFAULT" is the fallback colour.
	COLOR_PALETTE = {
	    "arco_CulturalProperty": "#FF5733",  # Orange
	    "core_Agent": "#33FF57",  # Green
	    "l0_Location": "#3357FF",  # Blue
	    "l0_Object": "#F333FF",  # Purple
	    "core_EventOrSituation": "#FFD433",  # Yellow
	    "clv_City": "#33FFF3",  # Turquoise
	    "DEFAULT": "#97C2FC",  # Standard blue
	}

	def get_node_color(labels):
	    """Pick the display colour for a node from its labels.

	    The generic ``Resource`` label is ignored. The first remaining label that
	    has an entry in ``COLOR_PALETTE`` decides the colour, so a node whose
	    mapped label is not listed first is still coloured correctly.

	    Args:
	        labels: Iterable of label strings, or ``None``.

	    Returns:
	        The hex colour string, or the ``DEFAULT`` palette colour when
	        ``labels`` is empty/``None`` or no label is mapped.
	    """
	    for label in labels or ():
	        if label != 'Resource' and label in COLOR_PALETTE:
	            return COLOR_PALETTE[label]
	    return COLOR_PALETTE["DEFAULT"]

	def validate_groq_key(api_key):
	    """Check a Groq API key with a lightweight GET on the models endpoint.

	    Args:
	        api_key: The key to verify; an empty or ``None`` value is rejected
	            without any network call.

	    Returns:
	        True only when the endpoint answers with HTTP 200; False for any other
	        status, for network/HTTP-client errors, or for a key that cannot be
	        sent as a header value.
	    """
	    if not api_key:
	        return False

	    headers = {
	        "Authorization": f"Bearer {api_key}",
	        "Content-Type": "application/json"
	    }
	    try:
	        response = requests.get("https://api.groq.com/openai/v1/models", headers=headers, timeout=5)
	    except (requests.RequestException, ValueError):
	        # RequestException: connection, timeout and invalid-header errors.
	        # ValueError: covers UnicodeError raised when the key cannot be
	        # encoded as a header. Anything else is a real bug and propagates.
	        return False
	    return response.status_code == 200

	# Warm the cached splitter, extractor and validator at startup.
	for _warm_up in (get_splitter, get_extractor, get_validator):
	    _warm_up()

	# --- FUNZIONI NEO4J ---
	def get_driver(uri, user, password):
	    """Open a Neo4j driver and confirm the server is reachable.

	    Args:
	        uri: Connection URI of the Neo4j instance.
	        user: Neo4j user name.
	        password: Neo4j password.

	    Returns:
	        A connected driver, or ``None`` when ``uri``/``password`` is empty or
	        the connection attempt fails.
	    """
	    if not uri or not password:
	        return None

	    driver = None
	    try:
	        driver = GraphDatabase.driver(uri, auth=(user, password))
	        driver.verify_connectivity()
	    except Exception as exc:
	        # Best-effort contract: callers only check for None, so report the
	        # reason in the log instead of swallowing it silently.
	        logging.getLogger(__name__).warning("Neo4j connection failed: %s", exc)
	        if driver is not None:
	            # Release the driver that was created but could not connect.
	            try:
	                driver.close()
	            except Exception:
	                pass
	        return None
	    return driver

	def run_query(driver, query, params=None):
	    """Run a Cypher query and return each record as a plain dict.

	    Args:
	        driver: Driver exposing ``session()``, or ``None``.
	        query: Cypher text to execute.
	        params: Optional query parameters passed through to ``session.run``.

	    Returns:
	        A list with ``record.data()`` for every result record; an empty list
	        when ``driver`` is ``None``.
	    """
	    rows = []
	    if driver is not None:
	        with driver.session() as db_session:
	            # Records are consumed while the session is still open.
	            for record in db_session.run(query, params):
	                rows.append(record.data())
	    return rows

	# --- UI: SIDEBAR ---
	st.sidebar.title("⚙️ Configurazione")

	env_uri = os.getenv("NEO4J_URI", "")
	env_user = os.getenv("NEO4J_USER", "neo4j")
	env_password = os.getenv("NEO4J_PASSWORD", "")
	# Hard-coded empty: with this value the Groq key can only come from the
	# sidebar input below.
	env_groq_key = ""

	st.sidebar.subheader("Backend AI (TDDT)")
	# NOTE: this branch cannot run while env_groq_key is the empty string above.
	if env_groq_key and not st.session_state.groq_valid:
	    if validate_groq_key(env_groq_key):
	        st.session_state.groq_valid = True
	    else:
	        os.environ["GROQ_API_KEY"] = ""
	        env_groq_key = ""

	if st.session_state.groq_valid:
	    st.sidebar.success("✅ Groq API Key: Valida e Attiva")
	else:
	    groq_key_input = st.sidebar.text_input("Inserisci GROQ_API_KEY", type="password")

	    if st.sidebar.button("Verifica Chiave"):
	        # Pasted keys often carry stray whitespace; validate and store the
	        # trimmed value so both steps use the same string.
	        candidate_key = groq_key_input.strip()
	        with st.spinner("Verifica in corso..."):
	            if validate_groq_key(candidate_key):
	                # NOTE(review): os.environ is process-wide, so the key is
	                # visible to every session served by this process — confirm
	                # this is acceptable for the deployment.
	                os.environ["GROQ_API_KEY"] = candidate_key
	                st.session_state.groq_valid = True
	                st.sidebar.success("✅ Chiave valida!")
	                time.sleep(1)
	                st.rerun()
	            else:
	                st.sidebar.error("❌ Chiave non valida o non autorizzata.")

	st.sidebar.subheader("Knowledge Graph")
	uri = st.sidebar.text_input("URI Neo4j", value=env_uri)
	user = st.sidebar.text_input("User Neo4j", value=env_user)

	pwd_placeholder = "✅ Configurato (Lascia vuoto)" if env_password else "Inserisci Password Neo4j"
	password_input = st.sidebar.text_input("Password Neo4j", type="password", placeholder=pwd_placeholder)
	password = password_input if password_input else env_password

	driver = None
	if uri and password:
	    # The script is re-executed on every interaction. Keep one driver per
	    # session and reuse it while the credentials are unchanged, instead of
	    # opening (and never closing) a new driver on each rerun.
	    conn_params = (uri, user, password)
	    cached_driver = st.session_state.get("neo4j_driver")
	    if cached_driver is not None and st.session_state.get("neo4j_conn_params") == conn_params:
	        try:
	            cached_driver.verify_connectivity()
	            driver = cached_driver
	        except Exception:
	            driver = None

	    if driver is None:
	        if cached_driver is not None:
	            # Credentials changed or the cached driver is no longer usable.
	            try:
	                cached_driver.close()
	            except Exception:
	                pass
	        driver = get_driver(uri, user, password)
	        st.session_state["neo4j_driver"] = driver
	        st.session_state["neo4j_conn_params"] = conn_params

	    if driver:
	        st.sidebar.success("🟢 Connesso a Neo4j")
	        os.environ["NEO4J_URI"] = uri
	        os.environ["NEO4J_USER"] = user
	        os.environ["NEO4J_PASSWORD"] = password
	    else:
	        st.sidebar.error("🔴 Errore connessione Neo4j")

	st.sidebar.divider()
	if st.sidebar.button("🔄 Reset Pipeline", on_click=reset_pipeline):
	    st.sidebar.info("Stato resettato.")

	# --- MAIN HEADER ---
	# Page title, subtitle and the three top-level tabs of the app.
	st.title("🧠 Automated Semantic Discovery Prototype")
	st.markdown("Type-Driven Domain Traversal (TDDT) & OWL RL Validation")

	_tab_titles = [
	    "⚙️ 1. Pipeline Generativa",
	    "🔍 2. Dati e DLQ",
	    "🕸️ 3. Esplorazione Grafo",
	]
	tab_gen, tab_val, tab_vis = st.tabs(_tab_titles)

	# ==============================================================================
	# TAB 1: PIPELINE GENERATIVA (STEPPER UI)
	# ==============================================================================
	with tab_gen:
	    # Three-stage pipeline driven by st.session_state.pipeline_stage:
	    # 0 = nothing done, 1 = chunked, 2 = extracted, 3 = validated and persisted.
	    st.subheader("1. Ingestion Documentale")
	    st.info("Inserisci il testo da analizzare nel campo sottostante.")

	    with st.form("ingestion_form"):
	        input_text = st.text_area("Testo del documento:", value=st.session_state.document_text, height=200)
	        submitted = st.form_submit_button("Salva Testo e Prepara Pipeline")

	    if submitted:
	        # Only a new, non-blank text replaces the saved document and restarts the pipeline.
	        if input_text != st.session_state.document_text and input_text.strip() != "":
	            st.session_state.document_text = input_text
	            st.session_state.pipeline_stage = 0
	            st.rerun()

	    st.markdown("---")
	    progress_val = int((st.session_state.pipeline_stage / 3) * 100)
	    st.progress(progress_val, text=f"Progresso Pipeline: {progress_val}%")

	    # ==========================
	    # PHASE 1: CHUNKING
	    # ==========================
	    with st.container():
	        st.markdown(f"### {'✅' if st.session_state.pipeline_stage >= 1 else '1️⃣'} Fase 1: Semantic Chunking")

	        with st.expander("ℹ️ Cosa fa questa fase?"):
	            st.write("Segmenta il testo in frammenti coerenti analizzando la similarità semantica vettoriale tra le frasi.")

	        # Gate on the key validated in the sidebar. The previous check used
	        # env_groq_key, which is always empty, so the button could never be enabled.
	        is_groq_ready = bool(st.session_state.groq_valid)
	        # Chunk the saved document rather than the form widget's current value.
	        saved_text = st.session_state.document_text
	        has_document = bool(saved_text.strip())

	        if st.session_state.pipeline_stage >= 1:
	            chunks = st.session_state.chunks
	            st.success(f"Chunking completato! Generati {len(chunks)} frammenti semantici.")
	            with st.expander("Vedi dettagli frammenti"):
	                st.json(chunks)
	        else:
	            if not is_groq_ready:
	                st.caption("Verifica una GROQ_API_KEY valida nella sidebar per avviare la pipeline.")
	            elif not has_document:
	                st.caption("Salva un testo non vuoto per avviare il chunking.")

	            if st.button("Avvia Semantic Splitter", type="primary", disabled=not (is_groq_ready and has_document)):
	                with st.spinner("Creazione chunks in corso..."):
	                    try:
	                        splitter = get_splitter()
	                        chunks, _, _ = splitter.create_chunks(saved_text, percentile_threshold=90)
	                    except Exception as e:
	                        st.error(f"Errore durante il chunking: {e}")
	                    else:
	                        st.session_state.chunks = chunks
	                        st.session_state.pipeline_stage = 1
	                        st.rerun()

	    st.markdown("⬇️")

	    # ==========================
	    # PHASE 2: EXTRACTION (TDDT)
	    # ==========================
	    is_step_b_unlocked = st.session_state.pipeline_stage >= 1

	    with st.container():
	        color = "white" if is_step_b_unlocked else "gray"
	        icon = "✅" if st.session_state.pipeline_stage >= 2 else ("2️⃣" if is_step_b_unlocked else "🔒")
	        st.markdown(f"<h3 style='color:{color}'>{icon} Fase 2: TDDT Extraction</h3>", unsafe_allow_html=True)

	        with st.expander("ℹ️ Cosa fa questa fase?"):
	            st.write("Esegue l'estrazione gerarchica in due passaggi: prima classifica le entità usando le root dell'ontologia, poi estrae le relazioni passando all'LLM solo le proprietà ammesse (Domain Index).")

	        if not is_step_b_unlocked:
	            st.caption("Completa la Fase 1 per sbloccare l'estrazione.")
	        elif st.session_state.pipeline_stage >= 2:
	            data = st.session_state.extraction_data
	            st.success(f"Estrazione TDDT completata! Identificate {len(data['entities'])} entità e {len(data['triples'])} triple.")
	            with st.expander("Vedi dati estratti (Pre-Validazione)"):
	                st.write("Entità Inferite:", data['entities'])
	                if data['triples']:
	                    st.dataframe(pd.DataFrame([t.model_dump() for t in data['triples']]), hide_index=True)
	        else:
	            is_extraction_ready = st.session_state.groq_valid
	            if not is_extraction_ready:
	                # Shown next to the disabled button; inside the click handler it was unreachable.
	                st.warning("⚠️ Per avviare l'estrazione devi prima inserire e verificare una GROQ_API_KEY valida nella sidebar.")

	            if st.button("Avvia Estrazione TDDT", type="primary", disabled=not is_extraction_ready):
	                with st.spinner("Classificazione ed estrazione gerarchica in corso..."):
	                    try:
	                        chunks = st.session_state.chunks
	                        extractor = get_extractor()
	                        all_triples = []
	                        prog_bar = st.progress(0)
	                        total_chunks = len(chunks)

	                        for i, chunk in enumerate(chunks):
	                            chunk_id = f"st_req_chunk_{i+1}"
	                            res = extractor.extract(chunk, source_id=chunk_id)

	                            if res and res.triples:
	                                all_triples.extend(res.triples)

	                            prog_bar.progress((i + 1) / total_chunks)

	                            # Pause between requests (not after the last one) to stay under the API rate limit.
	                            if i < total_chunks - 1:
	                                print("⏳ Pacing per Groq API: attesa 20s per non sforare i 30K TPM...")
	                                time.sleep(20)

	                        # Unique subjects and objects, in first-seen order, feed the resolver.
	                        unique_entities = list(dict.fromkeys(
	                            [t.subject for t in all_triples] + [t.object for t in all_triples]
	                        ))
	                    except Exception as e:
	                        st.error(f"Errore: {e}")
	                    else:
	                        st.session_state.extraction_data = {"entities": unique_entities, "triples": all_triples}
	                        st.session_state.pipeline_stage = 2
	                        st.rerun()

	    st.markdown("⬇️")

	    # ==========================
	    # PHASE 3: RESOLUTION & VALIDATION (BLOCKING)
	    # ==========================
	    is_step_c_unlocked = st.session_state.pipeline_stage >= 2

	    with st.container():
	        color = "white" if is_step_c_unlocked else "gray"
	        icon = "✅" if st.session_state.pipeline_stage >= 3 else ("3️⃣" if is_step_c_unlocked else "🔒")
	        st.markdown(f"<h3 style='color:{color}'>{icon} Fase 3: Resolution & SHACL Blocking</h3>", unsafe_allow_html=True)

	        with st.expander("ℹ️ Cosa fa questa fase?"):
	            st.write("Risolve le entità (Entity Linking) e applica la validazione OWL RL. Le triple non conformi vengono scartate e salvate nella Dead Letter Queue (MongoDB), mentre quelle valide popolano Neo4j.")

	        if not is_step_c_unlocked:
	            st.caption("Completa la Fase 2 per procedere.")
	        elif st.session_state.pipeline_stage >= 3:
	            st.success("Grafo Aggiornato! Le triple conformi sono su Neo4j, gli scarti su Mongo (se configurato).")
	        elif not driver:
	            st.error("⚠️ Connettiti a Neo4j (nella sidebar) per procedere.")
	        elif st.button("Valida e Scrivi su Grafo", type="primary"):
	            with st.spinner("Risoluzione, validazione logica e persistenza..."):
	                try:
	                    raw_data = st.session_state.extraction_data
	                    all_entities = raw_data.get("entities", [])
	                    all_triples = raw_data.get("triples", [])

	                    # The persister and the cached resolver both use the sidebar's driver.
	                    persister = KnowledgeGraphPersister()
	                    persister.driver = driver
	                    persister._create_constraints()

	                    resolver = get_resolver()
	                    resolver.driver = driver
	                    _, resolved_triples, entities_to_save = resolver.resolve_entities(all_entities, all_triples)

	                    validator = get_validator()
	                    valid_triples, invalid_triples, _report = validator.filter_valid_triples(entities_to_save, resolved_triples)

	                    if invalid_triples:
	                        st.warning(f"Rilevate {len(invalid_triples)} violazioni ontologiche. Scartate dalla persistenza.")

	                        # Archive the rejected triples in the dead-letter queue (MongoDB).
	                        mongo_ur = os.getenv("MONGO_UR")
	                        mongo_user = os.getenv("MONGO_USER")
	                        mongo_pass = os.getenv("MONGO_PASS")

	                        if mongo_ur:
	                            try:
	                                rejected_at = time.time()
	                                # assumes each rejected triple is a dict-like mapping — TODO confirm
	                                docs = [{**doc, "timestamp": rejected_at} for doc in invalid_triples]
	                                # The context manager closes the client even when the insert fails.
	                                with MongoClient(mongo_ur, username=mongo_user, password=mongo_pass) as client:
	                                    client["semantic_discovery"]["rejected_triples"].insert_many(docs)
	                                st.info("💾 Scarti archiviati correttamente su MongoDB.")
	                            except Exception as e:
	                                st.error(f"Errore scrittura DLQ: {e}")

	                    # Persist ONLY the valid triples.
	                    persister.save_entities_and_triples(entities_to_save, valid_triples)
	                except Exception as e:
	                    st.error(f"Errore critico: {e}")
	                else:
	                    st.session_state.pipeline_stage = 3
	                    st.rerun()

	# ==============================================================================
	# TAB 2: VALIDAZIONE E DLQ (Aggiornato per 1.4)
	# ==============================================================================
	with tab_val:
	    # Graph totals plus a table of every relationship with its evidence fields.
	    st.header("Curation & Feedback Loop")
	    if not driver:
	        st.warning("Database non connesso.")
	    else:
	        graph_counts = run_query(driver, "MATCH (n) RETURN count(n) as nodes, count{()-->()} as rels")
	        if graph_counts:
	            nodes_col, rels_col = st.columns(2)
	            totals = graph_counts[0]
	            nodes_col.metric("Nodi Totali", totals['nodes'])
	            rels_col.metric("Relazioni", totals['rels'])

	        cypher_val = """
	MATCH (s)-[r]->(o)
	RETURN elementId(r) as id,
	COALESCE(s.label, head(labels(s))) as Soggetto,
	type(r) as Predicato,
	COALESCE(o.label, head(labels(o))) as Oggetto,
	COALESCE(r.evidence, 'N/A') as Evidenza,
	COALESCE(r.reasoning, 'N/A') as Ragionamento
	"""
	        relationship_rows = run_query(driver, cypher_val)

	        if not relationship_rows:
	            st.info("Grafo vuoto o relazioni senza nuovi attributi.")
	        else:
	            # The relationship id is fetched by the query but hidden from the table.
	            relationships_df = pd.DataFrame(relationship_rows)
	            st.dataframe(relationships_df.drop(columns=["id"]), width='stretch', hide_index=True)

	# ==============================================================================
	# TAB 3: ESPLORAZIONE GRAFO
	# ==============================================================================
	with tab_vis:
	    # Renders the Neo4j graph with Pyvis on demand and keeps the generated HTML
	    # in st.session_state.graph_html so it survives reruns.
	    st.header("Esplorazione Topologica")
	    if driver:
	        col_ctrl, col_info = st.columns([1, 4])
	        with col_ctrl:
	            generate_graph = st.button("🔄 Genera / Aggiorna Grafo", type="primary")

	        if generate_graph:
	            with st.spinner("Estrazione dati e generazione del grafo interattivo..."):
	                cypher_vis = """
	MATCH (s:Resource)
	OPTIONAL MATCH (s)-[r]->(o:Resource)
	RETURN
	s.label AS src,
	labels(s) AS src_labels,
	type(r) AS rel,
	o.label AS dst,
	labels(o) AS dst_labels
	"""
	                graph_data = run_query(driver, cypher_vis)

	                if graph_data:
	                    net = Network(height="800px", width="100%", bgcolor="#222222", font_color="white", notebook=False)

	                    for item in graph_data:
	                        # Skip rows without a source label, so the edge step below
	                        # can never reuse a source left over from a previous row.
	                        if not item['src']:
	                            continue

	                        src_label_text = str(item['src'])
	                        src_color = get_node_color(item['src_labels'])
	                        net.add_node(src_label_text, label=src_label_text, color=src_color, title=f"Labels: {item['src_labels']}")

	                        # OPTIONAL MATCH yields null rel/dst for nodes without outgoing edges.
	                        if item['dst'] and item['rel']:
	                            dst_label_text = str(item['dst'])
	                            rel_type = str(item['rel'])
	                            dst_color = get_node_color(item['dst_labels'])

	                            net.add_node(dst_label_text, label=dst_label_text, color=dst_color, title=f"Labels: {item['dst_labels']}")

	                            net.add_edge(src_label_text, dst_label_text, title=rel_type)

	                    net.force_atlas_2based(
	                        gravity=-50,
	                        central_gravity=0.01,
	                        spring_length=100,
	                        spring_strength=0.08,
	                        damping=0.4
	                    )
	                    net.toggle_physics(True)

	                    # Pyvis writes to a path. Reserve a temp file, close our handle
	                    # before Pyvis writes, and always delete the file afterwards
	                    # (it was previously left behind on every render).
	                    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
	                    tmp.close()
	                    try:
	                        net.save_graph(tmp.name)
	                        with open(tmp.name, 'r', encoding='utf-8') as f:
	                            raw_html = f.read()
	                    finally:
	                        os.remove(tmp.name)

	                    # Fullscreen button, styles and script, inserted just before </body>.
	                    fullscreen_addon = """
	<style>
	/* Quando l'iframe entra in fullscreen, forziamo il div di Pyvis a coprire l'intero schermo */
	:fullscreen #mynetwork { height: 100vh !important; width: 100vw !important; }
	:-webkit-full-screen #mynetwork { height: 100vh !important; width: 100vw !important; }
	:-moz-full-screen #mynetwork { height: 100vh !important; width: 100vw !important; }

	#fs-btn {
	position: absolute; top: 15px; right: 15px; z-index: 9999;
	width: 40px; height: 40px;
	background-color: rgba(34, 34, 34, 0.7);
	color: #4facfe; border: 1px solid #4facfe; border-radius: 8px;
	cursor: pointer; display: flex; align-items: center; justify-content: center;
	box-shadow: 0 4px 6px rgba(0,0,0,0.3); transition: all 0.2s ease-in-out;
	}
	#fs-btn:hover { background-color: #4facfe; color: white; }
	</style>

	<button id="fs-btn" onclick="toggleFullScreen()" title="Schermo Intero">
	<svg id="fs-icon" xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
	<path d="M8 3H5a2 2 0 0 0-2 2v3m18 0V5a2 2 0 0 0-2-2h-3m0 18h3a2 2 0 0 0 2-2v-3M3 16v3a2 2 0 0 0 2 2h3"></path>
	</svg>
	</button>

	<script>
	const iconExpand = '<path d="M8 3H5a2 2 0 0 0-2 2v3m18 0V5a2 2 0 0 0-2-2h-3m0 18h3a2 2 0 0 0 2-2v-3M3 16v3a2 2 0 0 0 2 2h3"></path>';
	const iconCompress = '<path d="M8 3v3a2 2 0 0 1-2 2H3m18 0h-3a2 2 0 0 1-2-2V3m0 18v-3a2 2 0 0 1 2-2h3M3 16h3a2 2 0 0 1 2 2v3"></path>';

	function toggleFullScreen() {
	if (!document.fullscreenElement) {
	document.documentElement.requestFullscreen().catch(err => console.log(err));
	} else {
	if (document.exitFullscreen) { document.exitFullscreen(); }
	}
	}

	// Ascoltiamo l'evento fullscreen per cambiare l'icona (Espandi/Riduci) anche se l'utente preme "ESC"
	document.addEventListener('fullscreenchange', (event) => {
	const icon = document.getElementById('fs-icon');
	if (document.fullscreenElement) {
	icon.innerHTML = iconCompress;
	document.getElementById('fs-btn').title = "Riduci Schermo";
	} else {
	icon.innerHTML = iconExpand;
	document.getElementById('fs-btn').title = "Schermo Intero";
	}
	});
	</script>
	</body>
	"""

	                    st.session_state.graph_html = raw_html.replace("</body>", fullscreen_addon)
	                else:
	                    st.warning("Il grafo è attualmente vuoto.")
	                    st.session_state.graph_html = None

	        if st.session_state.graph_html:
	            components.html(st.session_state.graph_html, height=800, scrolling=True)
	        else:
	            st.info("👆 Clicca su 'Genera / Aggiorna Grafo' per visualizzare i dati attuali di Neo4j.")

	    else:
	        st.warning("Database non connesso. Configura le credenziali nella sidebar.")