"""
Core data models for the Graph RAG service.

Extended with temporal fields, tenant support, and eval/confidence models.
"""
from pydantic import BaseModel, Field, ConfigDict
from typing import Optional, Dict, List, Any, Literal
from datetime import datetime, timezone
from enum import Enum
class NodeType(str, Enum):
    """String-valued enumeration of the node kinds used in the knowledge graph.

    Members also subclass ``str``, so each one compares equal to its
    lower-case value.
    """

    ENTITY = "entity"
    CHUNK = "chunk"
    DOCUMENT = "document"
class RelationType(str, Enum):
    """String-valued enumeration of relationship kinds in the knowledge graph.

    Each member's value is the same upper-case text as its name.
    """

    MENTIONS = "MENTIONS"
    RELATED_TO = "RELATED_TO"
    PART_OF = "PART_OF"
    CONTAINS = "CONTAINS"
class OntologyVersion(str, Enum):
    """Known ontology version tags, used for schema evolution.

    Values are the dotted, ``v``-prefixed strings (for example ``"v1.0"``).
    """

    V1_0 = "v1.0"
    V1_1 = "v1.1"
    V2_0 = "v2.0"
class Entity(BaseModel):
    """Knowledge-graph entity record.

    ``name`` and ``type`` are the only required fields; every other field
    falls back to the default declared below.
    """

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "name": "Lyzr AI",
                "type": "Company",
                "properties": {"industry": "AI", "founded": "2023"},
                "confidence": 0.95,
                "tenant_id": "org_abc123",
            },
        },
    )

    id: Optional[str] = None
    name: str
    type: str
    # Free-form key/value attributes; every instance gets its own dict.
    properties: Dict[str, Any] = Field(default_factory=dict)
    embedding: Optional[List[float]] = None
    ontology_version: str = "v1.0"
    # Defaults to 1.0; no range constraint is declared on this field.
    confidence: float = 1.0

    # Temporal support (Gap #5)
    valid_from: Optional[datetime] = None
    valid_until: Optional[datetime] = None

    # Tenant support (Gap #7)
    tenant_id: Optional[str] = None

    # Community support (Gap #2)
    community_id: Optional[int] = None
class Relationship(BaseModel):
    """Typed link from a ``source`` entity to a ``target`` entity.

    ``source``, ``target`` and ``type`` are required strings; the remaining
    fields are optional or carry the defaults declared below.
    """

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "source": "Lyzr AI",
                "target": "OpenAI",
                "type": "PARTNERS_WITH",
                "confidence": 0.9,
                "valid_from": "2023-01-01T00:00:00",
            },
        },
    )

    source: str
    target: str
    type: str
    # Free-form key/value attributes; every instance gets its own dict.
    properties: Dict[str, Any] = Field(default_factory=dict)
    # Defaults to 1.0; no range constraint is declared on this field.
    confidence: float = 1.0
    ontology_version: str = "v1.0"

    # Temporal support (Gap #5)
    valid_from: Optional[datetime] = None
    valid_until: Optional[datetime] = None
    source_document_id: Optional[str] = None
    source_chunk_id: Optional[str] = None

    # Tenant support (Gap #7)
    tenant_id: Optional[str] = None
class Chunk(BaseModel):
    """A piece of text taken from a document.

    ``text`` and ``document_id`` are required; ``chunk_index`` defaults to 0
    and all other fields are optional.
    """

    id: Optional[str] = None
    text: str
    document_id: str
    # Free-form metadata; every instance gets its own dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)
    embedding: Optional[List[float]] = None
    chunk_index: int = 0

    # Extended metadata for citation tracing
    page_number: Optional[int] = None
    section_title: Optional[str] = None
    tenant_id: Optional[str] = None
class Document(BaseModel):
    """Metadata describing a document, with optional inline content.

    ``filename``, ``file_type`` and ``size_bytes`` are required.
    """

    id: Optional[str] = None
    filename: str
    file_type: str
    content: Optional[str] = None
    size_bytes: int
    # Default is the current UTC time with tzinfo stripped (a naive datetime).
    upload_date: datetime = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc).replace(tzinfo=None),
    )
    processed: bool = False
    # Free-form metadata; every instance gets its own dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)
    tenant_id: Optional[str] = None
class OntologySchema(BaseModel):
    """Definition of an ontology schema.

    Every field has a default, so the model can be built with no arguments.
    """

    version: str = "v1.0"
    entity_types: List[str] = Field(default_factory=list)
    relationship_types: List[str] = Field(default_factory=list)
    # Maps a string key to a list of strings; starts out empty.
    properties: Dict[str, List[str]] = Field(default_factory=dict)
    # Default is the current UTC time with tzinfo stripped (a naive datetime).
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc).replace(tzinfo=None),
    )
    approved: bool = False
class ExtractionResult(BaseModel):
    """Output of an entity/relationship extraction pass.

    Holds the extracted entities, relationships and chunks, plus the
    ontology version string and the processing time in seconds.
    """

    # Each list starts empty and is created fresh per instance.
    entities: List[Entity] = Field(default_factory=list)
    relationships: List[Relationship] = Field(default_factory=list)
    chunks: List[Chunk] = Field(default_factory=list)
    ontology_version: str = "v1.0"
    processing_time_seconds: float = 0.0
class ConfidenceJudgment(BaseModel):
    """LLM-as-a-Judge confidence assessment (Gap #4).

    ``score`` is required and validated to lie in the closed range
    [0.0, 1.0]; the remaining fields have defaults.
    """

    score: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="0.0-1.0 grounding score",
    )
    reasoning: str = Field(
        default="",
        description="Why this confidence score was assigned",
    )
    grounded_claims: int = Field(
        default=0,
        description="# claims backed by retrieved context",
    )
    ungrounded_claims: int = Field(
        default=0,
        description="# claims not traceable to context",
    )
    # Restricted to the three literal labels; "low" when not supplied.
    hallucination_risk: Literal["low", "medium", "high"] = "low"
class QueryResult(BaseModel):
    """Outcome of a retrieval query, including an optional confidence judgment.

    Only ``answer`` is required; all other fields have defaults.
    """

    answer: str
    # Source records as free-form dicts; the list is created per instance.
    sources: List[Dict[str, Any]] = Field(default_factory=list)
    reasoning_chain: List[str] = Field(default_factory=list)
    # Defaults to 1.0; no range constraint is declared on this field.
    confidence: float = 1.0

    # Gap #4 — real confidence metrics
    confidence_judgment: Optional[ConfidenceJudgment] = None
    retrieval_method: str = "hybrid"
    processing_time_seconds: float = 0.0

    # Gap #3 — DRIFT query metadata
    drift_expanded: bool = False
    total_sub_queries: int = 1
class AgentState(BaseModel):
    """State carried by the agentic retrieval system.

    Only ``query`` is required; the list fields start empty, ``iteration``
    starts at 0 and ``confidence`` starts at 0.0.
    """

    query: str
    # Each list is created fresh per instance.
    decomposed_queries: List[str] = Field(default_factory=list)
    retrieved_contexts: List[Dict[str, Any]] = Field(default_factory=list)
    reasoning_steps: List[str] = Field(default_factory=list)
    # None when no final answer has been set.
    final_answer: Optional[str] = None
    iteration: int = 0
    confidence: float = 0.0
class SearchMethod(str, Enum):
    """String-valued enumeration of the retrieval search methods.

    Members also subclass ``str``, so each one compares equal to its
    lower-case value.
    """

    VECTOR = "vector"
    GRAPH = "graph"
    FILTER = "filter"
    HYBRID = "hybrid"
    COMMUNITY = "community"
    CYPHER = "cypher"
class EvalResult(BaseModel):
    """RAG evaluation result (Gap #8).

    Every score field is validated to lie in the closed range [0.0, 1.0].
    ``context_recall`` defaults to 0.0; the other scores are required.
    """

    question: str
    answer: str
    faithfulness: float = Field(..., ge=0.0, le=1.0)
    answer_relevancy: float = Field(..., ge=0.0, le=1.0)
    context_precision: float = Field(..., ge=0.0, le=1.0)
    context_recall: float = Field(default=0.0, ge=0.0, le=1.0)
    overall_score: float = Field(..., ge=0.0, le=1.0)
    hallucination_detected: bool = False
    # Default is the current UTC time with tzinfo stripped (a naive datetime).
    timestamp: datetime = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc).replace(tzinfo=None),
    )
    document_id: Optional[str] = None
class CommunityReport(BaseModel):
    """Community summary for LazyGraphRAG (Gap #2).

    ``community_id``, ``entity_count``, ``entities`` and ``summary`` are
    required; the remaining fields have defaults.
    """

    community_id: int
    entity_count: int
    entities: List[str]
    summary: str
    # Starts empty; the list is created fresh per instance.
    themes: List[str] = Field(default_factory=list)
    relevance_score: float = 0.0
    # Default is the current UTC time with tzinfo stripped (a naive datetime).
    generated_at: datetime = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc).replace(tzinfo=None),
    )
|