| """ |
| SPARKNET API Schemas |
| Pydantic models for request/response validation. |
| """ |
|
|
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, ConfigDict, Field
|
|
|
|
# ---------------------------------------------------------------------------
# Enumerations
# ---------------------------------------------------------------------------
|
|
# Lifecycle states a document moves through during ingestion.
# Functional Enum API with ``type=str`` is equivalent to the
# ``class DocumentStatus(str, Enum)`` form: members are str subclasses
# and lookup-by-value (e.g. ``DocumentStatus("pending")``) works the same.
DocumentStatus = Enum(
    "DocumentStatus",
    [
        ("PENDING", "pending"),
        ("PROCESSING", "processing"),
        ("COMPLETED", "completed"),
        ("INDEXED", "indexed"),
        ("ERROR", "error"),
    ],
    type=str,
)
|
|
|
|
# Recognized query intent categories (used to pick a retrieval/answer
# strategy). Functional Enum API with ``type=str`` behaves identically
# to the ``class QueryIntentType(str, Enum)`` form.
QueryIntentType = Enum(
    "QueryIntentType",
    [
        ("FACTOID", "factoid"),
        ("COMPARISON", "comparison"),
        ("AGGREGATION", "aggregation"),
        ("CAUSAL", "causal"),
        ("PROCEDURAL", "procedural"),
        ("DEFINITION", "definition"),
        ("LIST", "list"),
        ("MULTI_HOP", "multi_hop"),
    ],
    type=str,
)
|
|
|
|
# Output formats a caller may request for a generated answer.
# Functional Enum API with ``type=str`` is behaviorally identical to
# the ``class AnswerFormat(str, Enum)`` declaration.
AnswerFormat = Enum(
    "AnswerFormat",
    [
        ("PROSE", "prose"),
        ("BULLET_POINTS", "bullet_points"),
        ("TABLE", "table"),
        ("STEP_BY_STEP", "step_by_step"),
    ],
    type=str,
)
|
|
|
|
# ---------------------------------------------------------------------------
# Document schemas
# ---------------------------------------------------------------------------
|
|
class DocumentUploadResponse(BaseModel):
    """Response returned after a document upload request.

    ``from_attributes`` allows construction directly from an ORM/object
    instance in addition to a dict.
    """
    model_config = ConfigDict(from_attributes=True)

    doc_id: str = Field(..., description="Unique document identifier")
    filename: str = Field(..., description="Original filename")
    status: DocumentStatus = Field(..., description="Document status")
    message: str = Field(..., description="Status message")
    # Fix: ``datetime.now`` produced a naive local-time timestamp, which is
    # ambiguous across timezones and cannot be safely compared with aware
    # datetimes. Default to an aware UTC timestamp instead.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
|
|
|
class DocumentMetadata(BaseModel):
    """Document metadata information.

    ``from_attributes`` allows building this model straight from an
    ORM/object instance, not only from a dict.
    """
    model_config = ConfigDict(from_attributes=True)


    doc_id: str  # unique document identifier
    filename: str  # original filename as uploaded
    file_type: str  # file type label; exact format (extension vs MIME) not visible here — TODO confirm
    page_count: int = 0  # number of pages extracted
    chunk_count: int = 0  # total chunks produced from this document
    text_length: int = 0  # length of the extracted text, in characters presumably — verify
    status: DocumentStatus  # current processing state
    indexed: bool = False  # True once the document is in the vector store
    indexed_chunks: int = 0  # how many chunks were actually indexed
    processing_time: Optional[float] = None  # processing duration; units (seconds?) not shown — confirm
    created_at: datetime  # when the document record was created
    updated_at: Optional[datetime] = None  # last modification time, if any
|
|
|
|
class DocumentResponse(BaseModel):
    """Full document response with metadata and optional text payloads."""
    model_config = ConfigDict(from_attributes=True)


    doc_id: str  # unique document identifier
    filename: str  # original filename
    file_type: str  # file type label (see DocumentMetadata.file_type)
    status: DocumentStatus  # current processing state
    metadata: DocumentMetadata  # nested metadata record
    raw_text: Optional[str] = Field(None, description="Full extracted text (if requested)")
    preview: Optional[str] = Field(None, description="Text preview (first 500 chars)")
|
|
|
|
class ChunkInfo(BaseModel):
    """Information about a single chunk extracted from a document."""
    model_config = ConfigDict(from_attributes=True)


    chunk_id: str  # unique chunk identifier
    doc_id: str  # parent document identifier
    text: str  # chunk text content
    chunk_type: str = "text"  # defaults to plain text; other type values not visible here — TODO confirm
    page_num: Optional[int] = None  # source page number, when known
    confidence: float = 1.0  # extraction confidence; 1.0 means fully certain
    bbox: Optional[Dict[str, float]] = None  # bounding box; presumably x/y/width/height keys — verify against producer
    metadata: Dict[str, Any] = Field(default_factory=dict)  # free-form extra metadata
|
|
|
|
class ChunksResponse(BaseModel):
    """Response containing the chunks of one document."""
    doc_id: str  # document the chunks belong to
    total_chunks: int  # number of chunks returned
    chunks: List[ChunkInfo]  # the chunk records themselves
|
|
|
|
class OCRRegionInfo(BaseModel):
    """A text region recognized by OCR on a document page."""
    region_id: str  # unique region identifier
    text: str  # recognized text content
    confidence: float  # OCR confidence score
    page_num: int  # page the region was found on
    bbox: Dict[str, float]  # region bounding box; key names not shown here — confirm with OCR pipeline
|
|
|
|
class LayoutRegionInfo(BaseModel):
    """A layout region detected on a document page (parallel to OCRRegionInfo)."""
    region_id: str  # unique region identifier
    region_type: str  # detected layout category; allowed values not visible here — TODO confirm
    confidence: float  # detection confidence score
    page_num: int  # page the region was found on
    bbox: Dict[str, float]  # region bounding box; key names not shown here — confirm with layout model
|
|
|
|
class DocumentDetailResponse(BaseModel):
    """Detailed document response bundling metadata, chunks, and regions."""
    doc_id: str  # unique document identifier
    filename: str  # original filename
    status: DocumentStatus  # current processing state
    metadata: DocumentMetadata  # nested metadata record
    chunks: List[ChunkInfo]  # all extracted chunks
    ocr_regions: List[OCRRegionInfo] = Field(default_factory=list)  # OCR regions, empty if none
    layout_regions: List[LayoutRegionInfo] = Field(default_factory=list)  # layout regions, empty if none
|
|
|
|
# ---------------------------------------------------------------------------
# Query / RAG schemas
# ---------------------------------------------------------------------------
|
|
class QueryRequest(BaseModel):
    """RAG query request.

    Validation bounds enforced by Field constraints: query is 1-2000
    characters, top_k is within [1, 20], min_confidence within [0.0, 1.0].
    """
    query: str = Field(..., min_length=1, max_length=2000, description="Query text")
    doc_ids: Optional[List[str]] = Field(None, description="Filter by document IDs")
    top_k: int = Field(5, ge=1, le=20, description="Number of chunks to retrieve")
    answer_format: AnswerFormat = Field(AnswerFormat.PROSE, description="Desired answer format")
    include_sources: bool = Field(True, description="Include source citations")
    min_confidence: float = Field(0.5, ge=0.0, le=1.0, description="Minimum confidence threshold")
    use_cache: bool = Field(True, description="Use cached results if available")
|
|
|
|
class Citation(BaseModel):
    """Citation/source reference attached to a generated answer."""
    citation_id: int = Field(..., description="Citation number [1], [2], etc.")
    doc_id: str  # source document identifier
    document_name: str  # human-readable document name
    chunk_id: str  # identifier of the cited chunk
    chunk_text: str  # text of the cited chunk
    page_num: Optional[int] = None  # page number, when known
    relevance_score: float  # retrieval relevance of this source
    bbox: Optional[Dict[str, float]] = None  # source bounding box; key names not shown here — verify
|
|
|
|
class QueryPlan(BaseModel):
    """Query planning information produced before retrieval."""
    intent: QueryIntentType  # classified intent of the query
    sub_queries: List[str] = Field(default_factory=list)  # decomposed sub-queries, if any
    keywords: List[str] = Field(default_factory=list)  # extracted keywords
    strategy: str = "hybrid"  # retrieval strategy; other allowed values not visible here — TODO confirm
|
|
|
|
class RAGResponse(BaseModel):
    """Complete RAG response returned to the client."""
    query: str  # the original query text
    answer: str  # generated answer
    confidence: float = Field(..., ge=0.0, le=1.0)  # answer confidence, constrained to [0, 1]
    citations: List[Citation] = Field(default_factory=list)  # source citations backing the answer
    source_count: int = 0  # number of distinct sources used
    query_plan: Optional[QueryPlan] = None  # planning info, when available
    from_cache: bool = False  # True if served from the answer cache
    validation: Optional[Dict[str, Any]] = None  # validation results; schema not visible here — confirm
    latency_ms: Optional[float] = None  # end-to-end latency in milliseconds
    revision_count: int = 0  # how many times the answer was revised
|
|
|
|
class SearchRequest(BaseModel):
    """Semantic search request.

    Bounds: query 1-1000 characters, top_k within [1, 50],
    min_score within [0.0, 1.0].
    """
    query: str = Field(..., min_length=1, max_length=1000)
    doc_ids: Optional[List[str]] = None  # optional filter by document IDs
    top_k: int = Field(10, ge=1, le=50)
    min_score: float = Field(0.0, ge=0.0, le=1.0)
|
|
|
|
class SearchResult(BaseModel):
    """Single semantic search result."""
    chunk_id: str  # identifier of the matching chunk
    doc_id: str  # parent document identifier
    document_name: str  # human-readable document name
    text: str  # matching chunk text
    score: float  # similarity/relevance score
    page_num: Optional[int] = None  # page number, when known
    chunk_type: str = "text"  # chunk kind; mirrors ChunkInfo.chunk_type default
|
|
|
|
class SearchResponse(BaseModel):
    """Search response with the matched results."""
    query: str  # echoed query text
    total_results: int  # number of results returned
    results: List[SearchResult]  # the result records
    latency_ms: float  # search latency in milliseconds
|
|
|
|
# ---------------------------------------------------------------------------
# Indexing schemas
# ---------------------------------------------------------------------------
|
|
class IndexRequest(BaseModel):
    """Request to index a single document into the vector store."""
    doc_id: str = Field(..., description="Document ID to index")
    force_reindex: bool = Field(False, description="Force reindexing if already indexed")
|
|
|
|
class IndexResponse(BaseModel):
    """Result of indexing one document."""
    doc_id: str  # document that was indexed
    status: str  # outcome label; allowed values not visible here — TODO confirm
    chunks_indexed: int  # how many chunks were written to the index
    message: str  # human-readable status message
|
|
|
|
class BatchIndexRequest(BaseModel):
    """Request to index several documents in one call."""
    doc_ids: List[str]  # documents to index
    force_reindex: bool = False  # reindex even if already indexed
|
|
|
|
class BatchIndexResponse(BaseModel):
    """Aggregate result of a batch indexing request."""
    total_requested: int  # number of documents requested
    successful: int  # how many indexed successfully
    failed: int  # how many failed
    results: List[IndexResponse]  # per-document results
|
|
|
|
# ---------------------------------------------------------------------------
# System / health schemas
# ---------------------------------------------------------------------------
|
|
class HealthResponse(BaseModel):
    """Health check response."""
    status: str = Field(..., description="healthy, degraded, or unhealthy")
    version: str  # service version string
    components: Dict[str, bool]  # per-component up/down flags
|
|
|
|
class SystemStatus(BaseModel):
    """Detailed system status (superset of HealthResponse fields)."""
    status: str  # overall status label
    version: str  # service version string
    uptime_seconds: float  # process uptime in seconds
    components: Dict[str, bool]  # per-component up/down flags
    statistics: Dict[str, Any]  # runtime statistics; exact keys not visible here — confirm with producer
    models: Dict[str, str]  # loaded model names; presumably role -> model id — verify
|
|
|
|
class CollectionInfo(BaseModel):
    """Vector store collection information."""
    name: str  # collection name
    document_count: int  # documents stored in the collection
    chunk_count: int  # chunks stored in the collection
    embedding_dimension: int  # dimensionality of stored embeddings
|
|
|
|
class StoreStatus(BaseModel):
    """Vector store status across all collections."""
    status: str  # store status label
    collections: List[CollectionInfo]  # per-collection details
    total_documents: int  # documents across all collections
    total_chunks: int  # chunks across all collections
|
|
|
|
# ---------------------------------------------------------------------------
# Auth / user schemas
# ---------------------------------------------------------------------------
|
|
class UserCreate(BaseModel):
    """User creation request.

    Bounds: username 3-50 characters, password at least 8 characters.
    """
    username: str = Field(..., min_length=3, max_length=50)
    email: str  # NOTE(review): plain str — no email-format validation is applied here
    password: str = Field(..., min_length=8)
|
|
|
|
class UserResponse(BaseModel):
    """User response (deliberately excludes the password)."""
    user_id: str  # unique user identifier
    username: str  # login name
    email: str  # user email address (unvalidated plain string)
    is_active: bool = True  # whether the account is active
    created_at: datetime  # account creation time
|
|
|
|
class Token(BaseModel):
    """JWT token response."""
    access_token: str  # encoded JWT
    token_type: str = "bearer"  # OAuth2 bearer token type
    expires_in: int  # lifetime; units (seconds?) not visible here — TODO confirm
|
|
|
|
class TokenData(BaseModel):
    """Decoded token payload data."""
    username: Optional[str] = None  # subject username, if present in the token
    user_id: Optional[str] = None  # subject user id, if present
    scopes: List[str] = Field(default_factory=list)  # granted scopes, empty by default
|
|