File size: 4,720 Bytes
cf52a55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
Scalable knowledge base.
========================
Loads the KB from a (JSON) file and optionally enriches it with
retrieval from ChromaDB (RAG). Keeps the term -> degree [0,1] interface for L3/L4.
"""

from __future__ import annotations
import json
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional

# Default KB (fallback when no file is available).
# Maps a (Portuguese) term to a relevance degree in [0, 1].
SEED_KNOWLEDGE_BASE: Dict[str, float] = {
    "quente": 0.85, "frio": 0.85, "morno": 0.70, "aquecido": 0.80, "gelado": 0.80,
    "temperatura": 0.90, "graus": 0.88, "escaldante": 0.75, "tépido": 0.65,
    "verdadeiro": 0.95, "falso": 0.95, "contradição": 0.80, "proposição": 0.85,
    "silogismo": 0.75, "conhecimento": 0.90, "inteligência": 0.85, "consciência": 0.70,
    "razão": 0.88, "verdade": 0.92, "água": 0.95, "líquido": 0.90, "h2o": 0.90,
}


def load_kb_from_file(path: str | Path) -> Dict[str, float]:
    """
    Carrega dicionário termo -> grau [0,1] de um arquivo JSON.
    Formato esperado: {"termo1": 0.9, "termo2": 0.8, ...}
    """
    path = Path(path)
    if not path.exists():
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, dict):
            return {}
        return {k: float(v) for k, v in data.items() if isinstance(v, (int, float))}
    except Exception:
        return {}


def merge_kb(base: Dict[str, float], extra: Dict[str, float]) -> Dict[str, float]:
    """Merge *extra* into *base*; on key conflicts, *extra* wins."""
    # Neither input is mutated: unpacking builds a fresh dict.
    return {**base, **extra}


def enrich_kb_from_chroma(
    query: str,
    chroma_path: str,
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
    k: int = 5,
    score_weight: float = 0.8,
) -> Dict[str, float]:
    """
    Search ChromaDB for *query* and return a term -> weight dictionary
    extracted from the retrieved snippets.

    Term frequencies across the retrieved documents are normalized to
    [0, 1] and then scaled by *score_weight*, so the most frequent term
    receives weight min(1.0, score_weight).

    Best effort: returns {} when the langchain/Chroma stack is not
    installed, the persist directory does not exist, or retrieval fails.
    """
    try:
        from langchain_community.vectorstores import Chroma
        from langchain_community.embeddings import HuggingFaceEmbeddings
    except ImportError:
        return {}

    chroma_dir = Path(chroma_path)
    if not chroma_dir.exists() or not chroma_dir.is_dir():
        return {}

    try:
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        vectorstore = Chroma(persist_directory=str(chroma_dir), embedding_function=embeddings)
        docs = vectorstore.similarity_search(query, k=k)
    except Exception:
        return {}

    # Count term occurrences in the retrieved texts.
    # BUG FIX: the original added `score_weight` per occurrence and then
    # divided every score by the maximum, which cancelled the weight out
    # entirely (the parameter had no effect on the result). We now count
    # plain occurrences and apply `score_weight` after normalization.
    term_counts: Dict[str, int] = {}
    for doc in docs:
        text = doc.page_content if hasattr(doc, "page_content") else str(doc)
        # Lowercase words longer than 2 chars; character class covers
        # Portuguese accented letters.
        for word in re.findall(r"[a-záàãâéêíóôõúüç]+", text.lower()):
            if len(word) > 2:
                term_counts[word] = term_counts.get(word, 0) + 1

    if not term_counts:
        return {}
    max_count = max(term_counts.values())
    # Normalize frequency to [0, 1], then scale by score_weight (capped at 1.0).
    return {
        term: min(1.0, score_weight * count / max_count)
        for term, count in term_counts.items()
    }


def get_knowledge_base(
    config: Optional[Dict[str, Any]] = None,
    config_path: Optional[str] = None,
    query_for_rag: Optional[str] = None,
) -> Dict[str, float]:
    """
    Return the KB to be used in the pipeline.

    Resolution order:
    - If config has knowledge_base.path, load the KB from that file
      (relative paths are resolved against the project root).
    - If config has knowledge_base.chroma_path (or agent.vector_db_path)
      and query_for_rag is given, enrich the KB with retrieval results.
    - Fallback: SEED_KNOWLEDGE_BASE (also used when the file yields
      an empty KB).
    """
    project_root = Path(__file__).resolve().parent
    try:
        # Best effort: config_loader may be unavailable in some contexts
        # (tests, standalone use); fall back to local defaults.
        from config_loader import load_config, PROJECT_ROOT as _root
        project_root = _root
        if config is None:
            config = load_config(config_path)
    except Exception:
        pass
    if config is None:
        config = {}

    # Guard against sections explicitly set to None in the config file.
    kb_section = config.get("knowledge_base") or {}
    agent_section = config.get("agent") or {}
    # BUG FIX: the original read the same "path" key twice in an `or`
    # chain (`.get("path") or .get("path", "")`) — a no-op; a single
    # lookup suffices.
    kb_path = kb_section.get("path") or ""
    chroma_path = kb_section.get("chroma_path") or agent_section.get("vector_db_path", "")

    if kb_path:
        kb_file = Path(kb_path)
        base = load_kb_from_file(kb_file if kb_file.is_absolute() else project_root / kb_path)
    else:
        base = dict(SEED_KNOWLEDGE_BASE)

    # Never return an empty KB: fall back to the seed terms.
    if not base:
        base = dict(SEED_KNOWLEDGE_BASE)

    if chroma_path and query_for_rag:
        extra = enrich_kb_from_chroma(
            query_for_rag,
            chroma_path,
            agent_section.get("embedding_model", "sentence-transformers/all-MiniLM-L6-v2"),
            k=5,
        )
        base = merge_kb(base, extra)

    return base