"""
Scalable knowledge base.
========================

Loads the KB from a JSON file and optionally enriches it with retrieval
from ChromaDB (RAG). Keeps the term -> degree [0, 1] interface used by
L3/L4.
"""
from __future__ import annotations

import json
import os
import re
from pathlib import Path
from typing import Any, Dict, Optional

# Default KB (fallback when no file is available)
SEED_KNOWLEDGE_BASE: Dict[str, float] = {
    "quente": 0.85, "frio": 0.85, "morno": 0.70, "aquecido": 0.80, "gelado": 0.80,
    "temperatura": 0.90, "graus": 0.88, "escaldante": 0.75, "tépido": 0.65,
    "verdadeiro": 0.95, "falso": 0.95, "contradição": 0.80, "proposição": 0.85,
    "silogismo": 0.75, "conhecimento": 0.90, "inteligência": 0.85, "consciência": 0.70,
    "razão": 0.88, "verdade": 0.92, "água": 0.95, "líquido": 0.90, "h2o": 0.90,
}

def load_kb_from_file(path: str | Path) -> Dict[str, float]:
    """
    Loads a term -> degree [0, 1] dictionary from a JSON file.

    Expected format: {"termo1": 0.9, "termo2": 0.8, ...}
    """
    path = Path(path)
    if not path.exists():
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, dict):
            return {}
        return {k: float(v) for k, v in data.items() if isinstance(v, (int, float))}
    except Exception:
        return {}
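
# Minimal usage sketch (the filename "kb.json" below is hypothetical):
#
#     kb = load_kb_from_file("kb.json")
#     # given kb.json = {"quente": 0.85, "temperatura": 0.90}
#     # -> {"quente": 0.85, "temperatura": 0.9}
#
# A missing file, invalid JSON, or a non-dict top level yields {};
# non-numeric values are silently dropped.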

def merge_kb(base: Dict[str, float], extra: Dict[str, float]) -> Dict[str, float]:
    """Merges extra into base; on conflict, extra wins."""
    out = dict(base)
    out.update(extra)
    return out
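
# For example: merge_kb({"quente": 0.5, "frio": 0.8}, {"quente": 0.9})
# returns {"quente": 0.9, "frio": 0.8}; the extra dict wins on conflicts.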

def enrich_kb_from_chroma(
    query: str,
    chroma_path: str,
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
    k: int = 5,
    score_weight: float = 0.8,
) -> Dict[str, float]:
    """
    Searches ChromaDB for `query` and returns a term -> weight dictionary
    extracted from the retrieved passages (relevant words weighted by
    score_weight, then max-normalized to [0, 1]).
    """
    try:
        from langchain_community.vectorstores import Chroma
        from langchain_community.embeddings import HuggingFaceEmbeddings
    except ImportError:
        return {}
    chroma_path = Path(chroma_path)
    if not chroma_path.exists() or not chroma_path.is_dir():
        return {}
    try:
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        vectorstore = Chroma(persist_directory=str(chroma_path), embedding_function=embeddings)
        docs = vectorstore.similarity_search(query, k=k)
    except Exception:
        return {}
    # Extract terms from the retrieved texts and accumulate weights.
    term_scores: Dict[str, float] = {}
    for d in docs:
        text = d.page_content if hasattr(d, "page_content") else str(d)
        words = re.findall(r"[a-záàãâéêíóôõúüç]+", text.lower())
        for w in words:
            if len(w) > 2:
                term_scores[w] = term_scores.get(w, 0.0) + score_weight
    # Normalize to [0, 1]. With a uniform score_weight, the division by the
    # maximum makes the result depend only on relative term frequency.
    if term_scores:
        m = max(term_scores.values())
        term_scores = {t: min(1.0, s / m) for t, s in term_scores.items()}
    return term_scores
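
# Hedged usage sketch: the persist directory below is illustrative, and the
# call only does real work when langchain_community is installed and a Chroma
# store has been persisted there.
#
#     extra = enrich_kb_from_chroma(
#         query="temperatura da água",
#         chroma_path="./chroma_db",  # hypothetical path
#         k=5,
#     )
#     # extra maps frequent words from the top-k passages to weights in [0, 1]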

def get_knowledge_base(
    config: Optional[Dict[str, Any]] = None,
    config_path: Optional[str] = None,
    query_for_rag: Optional[str] = None,
) -> Dict[str, float]:
    """
    Returns the KB to be used in the pipeline.

    - If config has knowledge_base.path, loads from that file.
    - If config has chroma_path (or agent.vector_db_path) and query_for_rag,
      enriches the KB with retrieval.
    - Fallback: SEED_KNOWLEDGE_BASE.
    """
    PROJECT_ROOT = Path(__file__).resolve().parent
    try:
        from config_loader import load_config, PROJECT_ROOT as _root
        PROJECT_ROOT = _root
        if config is None:
            config = load_config(config_path)
    except Exception:
        pass
    if config is None:
        config = {}
    kb_path = config.get("knowledge_base", {}).get("path", "")
    chroma_path = config.get("knowledge_base", {}).get("chroma_path") or config.get("agent", {}).get("vector_db_path", "")
    if kb_path and os.path.isabs(kb_path):
        base = load_kb_from_file(kb_path)
    elif kb_path:
        base = load_kb_from_file(PROJECT_ROOT / kb_path)
    else:
        base = dict(SEED_KNOWLEDGE_BASE)
    if not base:
        # The file was missing or empty: fall back to the seed KB.
        base = dict(SEED_KNOWLEDGE_BASE)
    if chroma_path and query_for_rag:
        extra = enrich_kb_from_chroma(
            query_for_rag,
            chroma_path,
            config.get("agent", {}).get("embedding_model", "sentence-transformers/all-MiniLM-L6-v2"),
            k=5,
        )
        base = merge_kb(base, extra)
    return base
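

if __name__ == "__main__":
    # Smoke test (illustrative only): with no config file available this
    # resolves to SEED_KNOWLEDGE_BASE, and the RAG query is a hypothetical
    # example that is ignored unless a Chroma path is configured.
    kb = get_knowledge_base(query_for_rag="conhecimento")
    for term, degree in sorted(kb.items(), key=lambda kv: -kv[1])[:5]:
        print(f"{term}: {degree:.2f}")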