Spaces:

GlokalAI
/

OrgAI

Sleeping

OrgAI / rag_anything_smaranika /raganything /streaming.py

Phonex

TheTruthSchool_RAG

167596f 7 months ago

23.3 kB

	"""
	Streaming Query Module with Verification Support

	This module provides streaming capabilities for RAGAnything while maintaining
	the dual-LLM verification layer. It allows real-time token streaming to the
	frontend while buffering the complete response for post-generation verification.

	Key Features:
	- Real-time token streaming from LLM (Gemini, OpenAI, etc.)
	- Complete response buffering for verification
	- Async verification after streaming completes
	- Verification metadata injection into stream
	- Support for both verified and unverified streaming modes

	Architecture:
	1. Stream tokens to frontend in real-time
	2. Buffer complete response for verification
	3. Run verification asynchronously after completion
	4. Send verification metadata as final stream chunk

	Author: RAG-Anything Team
	Version: 1.0.0
	"""

	from __future__ import annotations

	import asyncio
	import json
	from typing import Dict, List, Any, Optional, AsyncGenerator, Callable
	from dataclasses import dataclass
	from enum import Enum
	from lightrag.utils import logger


	# =============================================================================
	# Configuration Classes
	# =============================================================================

	class StreamMode(Enum):
	    """Closed set of delivery modes for a streamed response.

	    Each member's value is the lowercase string form of its name.
	    """

	    # Stream tokens, no verification.
	    TOKENS_ONLY = "tokens_only"
	    # Stream tokens, then verify.
	    TOKENS_WITH_VERIFICATION = "tokens_with_verification"
	    # Stream tokens and include metadata chunks.
	    TOKENS_WITH_METADATA = "tokens_with_metadata"


	@dataclass
	class StreamingConfig:
	    """Settings bundle consumed by the streaming query handler.

	    Attributes:
	        mode: Selected ``StreamMode``.
	        enable_verification: Run verification once the stream has finished.
	        send_verification_metadata: Emit a verification chunk on the stream.
	        verification_async: Verify in a background task instead of awaiting
	            the result before the stream ends.
	        buffer_size: Number of tokens to buffer before sending.
	        include_context: Include the retrieved context in metadata.

	    NOTE(review): within this file only ``enable_verification``,
	    ``verification_async`` and ``send_verification_metadata`` are read;
	    ``mode``, ``buffer_size`` and ``include_context`` are stored but not
	    consulted here -- confirm whether other modules rely on them.
	    """

	    mode: StreamMode = StreamMode.TOKENS_WITH_VERIFICATION
	    enable_verification: bool = True
	    send_verification_metadata: bool = True
	    verification_async: bool = True
	    buffer_size: int = 1
	    include_context: bool = False


	# =============================================================================
	# Streaming Response Buffer
	# =============================================================================

	class StreamBuffer:
	    """Accumulator for streamed tokens plus the verification outcome.

	    Tokens are appended while the stream is live; ``finalize`` joins them
	    into the full response text that verification is run against.
	    """

	    def __init__(self):
	        """Start with no tokens, an empty response and no verification."""
	        self.tokens: List[str] = []
	        self.complete_response: str = ""
	        self.is_complete: bool = False
	        self.verification_result: Optional[Dict[str, Any]] = None

	    def add_token(self, token: str):
	        """Append one streamed token.

	        Args:
	            token: Text fragment to record.
	        """
	        self.tokens.append(token)

	    def finalize(self) -> str:
	        """Join all recorded tokens and mark the buffer complete.

	        Returns:
	            The concatenation of every token added so far.
	        """
	        joined = "".join(self.tokens)
	        self.complete_response = joined
	        self.is_complete = True
	        return joined

	    def set_verification_result(self, result: Dict[str, Any]):
	        """Remember the verification outcome for this response.

	        Args:
	            result: Verification result dictionary.
	        """
	        self.verification_result = result


	# =============================================================================
	# Streaming Query Handler
	# =============================================================================

	class StreamingQueryHandler:
	    """Handler for streaming queries with verification support

	    Streams tokens produced by an LLM stream function to the caller while
	    collecting them in a ``StreamBuffer`` so that the complete response can
	    be verified once streaming ends.

	    Attributes:
	        config: StreamingConfig instance
	        verifier: Object exposing an awaitable ``verify_answer`` (optional)
	        modifier: Stored for callers; not used by the code in this class
	    """

	    def __init__(
	        self,
	        config: Optional[StreamingConfig] = None,
	        verifier: Optional[Any] = None,
	        modifier: Optional[Any] = None
	    ):
	        """Initialize StreamingQueryHandler

	        Args:
	            config: Streaming configuration; a default ``StreamingConfig``
	                is created when omitted
	            verifier: Verifier used for post-stream verification
	            modifier: Modifier instance kept on the handler
	        """
	        self.config = config or StreamingConfig()
	        self.verifier = verifier
	        self.modifier = modifier
	        # Strong references to in-flight background verification tasks.
	        # The event loop only keeps weak references to tasks, so a task
	        # whose handle is dropped can be garbage-collected before it ends.
	        self._background_tasks: set = set()

	    async def stream_with_verification(
	        self,
	        llm_stream_func: Callable,
	        query: str,
	        context: str,
	        original_query: Optional[str] = None,
	        **llm_kwargs
	    ) -> AsyncGenerator[Dict[str, Any], None]:
	        """Stream LLM response with verification support

	        Chunk sequence produced:
	        1. One ``token`` chunk (``done=False``) per token from the LLM
	        2. An empty ``token`` chunk with ``done=True`` when tokens end
	        3. If verification is enabled and a verifier is set:
	           - async mode: verification is scheduled as a background task
	             and, when metadata is enabled, a ``verification`` placeholder
	             chunk (``status="verifying"``, ``done=False``) is sent
	           - blocking mode: verification is awaited and, when metadata is
	             enabled, its result is sent as a ``verification`` chunk
	             (``done=True``)

	        Any exception raised while streaming or verifying is logged and
	        reported as a single ``error`` chunk with ``done=True``.

	        Args:
	            llm_stream_func: Callable invoked as
	                ``llm_stream_func(prompt=..., **llm_kwargs)`` that returns
	                an async iterator of tokens
	            query: Query to answer
	            context: Retrieved context
	            original_query: Original query before improvement
	            **llm_kwargs: Additional kwargs forwarded to ``llm_stream_func``

	        Yields:
	            Dict with keys:
	            - type: "token" | "verification" | "error"
	            - content: Token string or metadata dict
	            - done: Boolean completion flag for that chunk type

	        Example:
	            async for chunk in handler.stream_with_verification(
	                llm_stream_func=my_gemini_stream,
	                query="What is photosynthesis?",
	                context="[Retrieved context]"
	            ):
	                if chunk["type"] == "token":
	                    print(chunk["content"], end="", flush=True)
	                elif chunk["type"] == "verification":
	                    print("Verification:", chunk["content"])
	        """
	        buffer = StreamBuffer()

	        try:
	            # Step 1: Stream tokens to frontend
	            logger.info("Starting token streaming...")

	            async for token in llm_stream_func(
	                prompt=self._build_prompt(query, context),
	                **llm_kwargs
	            ):
	                # Record first, then forward, so the buffer always holds
	                # everything the consumer has been sent.
	                buffer.add_token(token)
	                yield {
	                    "type": "token",
	                    "content": token,
	                    "done": False
	                }

	            # Step 2: Finalize buffer
	            complete_response = buffer.finalize()
	            logger.info(f"Streaming complete. Total response length: {len(complete_response)}")

	            # Send completion signal
	            yield {
	                "type": "token",
	                "content": "",
	                "done": True
	            }

	            # Step 3: Run verification (if enabled)
	            if self.config.enable_verification and self.verifier:
	                logger.info("Running post-stream verification...")

	                if self.config.verification_async:
	                    # Non-blocking verification. Keep a reference until the
	                    # task finishes so it cannot be collected mid-run.
	                    task = asyncio.create_task(
	                        self._verify_response_async(
	                            buffer,
	                            query,
	                            context,
	                            original_query
	                        )
	                    )
	                    self._background_tasks.add(task)
	                    task.add_done_callback(self._background_tasks.discard)
	                    # NOTE(review): the result lands in the local buffer
	                    # only; nothing here hands it back to the consumer.

	                    # Send placeholder verification metadata
	                    if self.config.send_verification_metadata:
	                        yield {
	                            "type": "verification",
	                            "content": {
	                                "status": "verifying",
	                                "message": "Verification in progress..."
	                            },
	                            "done": False
	                        }
	                else:
	                    # Blocking verification
	                    verification_result = await self._verify_response(
	                        complete_response,
	                        query,
	                        context,
	                        original_query
	                    )
	                    buffer.set_verification_result(verification_result)

	                    # Send verification metadata
	                    if self.config.send_verification_metadata:
	                        yield {
	                            "type": "verification",
	                            "content": verification_result,
	                            "done": True
	                        }

	        except Exception as e:
	            logger.error(f"Error during streaming: {e}", exc_info=True)
	            yield {
	                "type": "error",
	                "content": {
	                    "message": str(e),
	                    "error_type": type(e).__name__
	                },
	                "done": True
	            }

	    async def stream_simple(
	        self,
	        llm_stream_func: Callable,
	        query: str,
	        context: str,
	        **llm_kwargs
	    ) -> AsyncGenerator[str, None]:
	        """Simple token streaming without verification

	        Forwards tokens from ``llm_stream_func`` unchanged. On failure the
	        error is logged and a single ``"[Error: ...]"`` string is yielded
	        instead of raising.

	        Args:
	            llm_stream_func: Callable invoked as
	                ``llm_stream_func(prompt=..., **llm_kwargs)`` that returns
	                an async iterator of tokens
	            query: Query to answer
	            context: Retrieved context
	            **llm_kwargs: Additional kwargs forwarded to ``llm_stream_func``

	        Yields:
	            str: Individual tokens

	        Example:
	            async for token in handler.stream_simple(
	                llm_stream_func=my_llm_stream,
	                query="What is AI?",
	                context="[Context]"
	            ):
	                print(token, end="", flush=True)
	        """
	        try:
	            async for token in llm_stream_func(
	                prompt=self._build_prompt(query, context),
	                **llm_kwargs
	            ):
	                yield token

	        except Exception as e:
	            logger.error(f"Error during simple streaming: {e}", exc_info=True)
	            yield f"[Error: {str(e)}]"

	    def _build_prompt(self, query: str, context: str) -> str:
	        """Build prompt from query and context

	        Args:
	            query: User query
	            context: Retrieved context

	        Returns:
	            Formatted prompt string with the context section placed before
	            the question and the answering instructions
	        """
	        # The literal below is runtime text sent to the LLM; its lines are
	        # kept exactly as they appear in the file.
	        return f"""You are an expert assistant analyzing a knowledge base. Use the provided context to answer the question accurately and comprehensively.

	## Context Information:
	{context}

	## User Question:
	{query}

	## Instructions:
	1. Answer based ONLY on the information provided in the context above
	2. If the context contains relevant information, provide a clear, detailed answer
	3. Structure your response with:
	- Direct answer to the question
	- Supporting details and evidence from the context
	- Relevant examples or specifics when available
	4. If the context doesn't contain enough information to fully answer the question, state what you know and what's missing
	5. Be precise and cite specific information from the context when possible
	6. Use clear, professional language appropriate for the domain

	## Answer:"""

	    async def _verify_response(
	        self,
	        response: str,
	        query: str,
	        context: str,
	        original_query: Optional[str] = None
	    ) -> Dict[str, Any]:
	        """Verify a complete response

	        Args:
	            response: Complete LLM response
	            query: Query used
	            context: Retrieved context
	            original_query: Original query before improvement

	        Returns:
	            Verification result dictionary. Without a verifier a passing
	            placeholder (score 10.0) is returned; if the verifier raises,
	            a failing result carrying the error text is returned.
	        """
	        if not self.verifier:
	            logger.warning("Verifier not available, skipping verification")
	            return {
	                "passed": True,
	                "score": 10.0,
	                "message": "Verification not available"
	            }

	        try:
	            verification_result = await self.verifier.verify_answer(
	                query=query,
	                answer=response,
	                context=context,
	                original_query=original_query
	            )

	            # Normalise the verifier output into the chunk schema sent to
	            # consumers ("overall_score" is exposed as "score").
	            return {
	                "passed": verification_result.get("passed", False),
	                "score": verification_result.get("overall_score", 0.0),
	                "criteria_scores": verification_result.get("criteria_scores", {}),
	                "issues": verification_result.get("issues", []),
	                "suggestions": verification_result.get("suggestions", []),
	                "confidence": verification_result.get("confidence", 0.0)
	            }

	        except Exception as e:
	            logger.error(f"Verification error: {e}", exc_info=True)
	            return {
	                "passed": False,
	                "score": 0.0,
	                "error": str(e)
	            }

	    async def _verify_response_async(
	        self,
	        buffer: StreamBuffer,
	        query: str,
	        context: str,
	        original_query: Optional[str] = None
	    ):
	        """Async verification (non-blocking background task)

	        Verifies ``buffer.complete_response`` and stores the outcome on
	        the buffer.

	        Args:
	            buffer: StreamBuffer to store result in
	            query: Query used
	            context: Retrieved context
	            original_query: Original query before improvement
	        """
	        verification_result = await self._verify_response(
	            buffer.complete_response,
	            query,
	            context,
	            original_query
	        )
	        buffer.set_verification_result(verification_result)
	        logger.info(f"Background verification complete: score={verification_result.get('score', 0):.2f}")


	# =============================================================================
	# Streaming Mixin for RAGAnything Integration
	# =============================================================================

	class StreamingQueryMixin:
	    """Mixin providing streaming query capabilities to RAGAnything

	    Adds streaming query methods alongside the host class's existing
	    query methods and wires them to ``StreamingQueryHandler``.

	    Attributes read from the host object:
	    - self.lightrag: object exposing ``aquery`` and ``llm_model_func``
	    - self.answer_verifier: verifier instance (optional)
	    - self.answer_modifier: modifier instance (optional)
	    - self.query_improver / self._apply_query_improvement (optional)
	    - self.config: read for ``enable_query_improvement``
	    - self.logger: Logger instance
	    """

	    @staticmethod
	    async def _simulate_llm_stream(llm_func, prompt, *, _token_delay: float = 0.01, **llm_kwargs):
	        """Turn a (possibly non-streaming) LLM call into an async token stream

	        Args:
	            llm_func: Sync or async callable invoked as
	                ``llm_func(prompt, **llm_kwargs)``
	            prompt: Prompt passed to ``llm_func``
	            _token_delay: Pause in seconds after each simulated token
	            **llm_kwargs: Forwarded to ``llm_func``

	        Yields:
	            str: Pieces whose concatenation equals the LLM response
	        """
	        import inspect
	        import re

	        response = llm_func(prompt, **llm_kwargs)
	        # Checking the result rather than the function also covers async
	        # callables hidden behind functools.partial or callable objects.
	        if inspect.isawaitable(response):
	            response = await response

	        # If the LLM already returned an async stream, forward it as-is.
	        if hasattr(response, "__aiter__"):
	            async for piece in response:
	                yield piece
	            return

	        # Each piece is one word plus the whitespace that follows it (or a
	        # leading whitespace run), so newlines and indentation survive and
	        # the buffered text used for verification matches the real answer.
	        for piece in re.findall(r"\S+\s*|\s+", response):
	            yield piece
	            # Small delay to simulate real streaming
	            await asyncio.sleep(_token_delay)

	    async def aquery_stream(
	        self,
	        query: str,
	        mode: str = "mix",
	        enable_verification: bool = True,
	        **kwargs
	    ) -> AsyncGenerator[Dict[str, Any], None]:
	        """Streaming query with verification support

	        Optionally improves the query, retrieves context through
	        ``self.lightrag.aquery`` (context-only mode), then streams the
	        answer through ``StreamingQueryHandler`` with blocking
	        verification so the verification chunk follows the tokens.

	        Args:
	            query: User query
	            mode: RAG mode ("local", "global", "hybrid", "naive", "mix")
	            enable_verification: Whether to run verification
	            **kwargs: ``enable_query_improvement`` is honoured (falling
	                back to ``self.config.enable_query_improvement``); other
	                keys are currently ignored

	        Yields:
	            Dict containing:
	            - type: "token" | "verification" | "error"
	            - content: Token or metadata
	            - done: Completion flag

	        Raises:
	            ValueError: On first iteration, if ``self.lightrag`` is
	                missing or None. Later failures are reported as an
	                ``error`` chunk instead of being raised.

	        Example:
	            async for chunk in rag.aquery_stream(
	                query="What is machine learning?",
	                enable_verification=True
	            ):
	                if chunk["type"] == "token":
	                    print(chunk["content"], end="")
	                elif chunk["type"] == "verification":
	                    print("Quality:", chunk["content"])
	        """
	        if not hasattr(self, 'lightrag') or self.lightrag is None:
	            raise ValueError("LightRAG not initialized")

	        try:
	            # Import here to avoid circular dependencies
	            from lightrag import QueryParam

	            original_query = query

	            # Step 1: Apply query improvement if enabled
	            use_query_improvement = kwargs.pop(
	                'enable_query_improvement',
	                getattr(self.config, 'enable_query_improvement', False)
	            )

	            if use_query_improvement and hasattr(self, 'query_improver') and self.query_improver:
	                self.logger.info("Applying query improvement for streaming...")
	                try:
	                    query_improvement_result = await self._apply_query_improvement(query)
	                    improved = query_improvement_result.get("improved_query", query)
	                    if improved and improved.strip():
	                        query = improved
	                        self.logger.info(f"Query improved: '{original_query}' -> '{query}'")
	                    else:
	                        self.logger.warning("Query improvement returned empty result, using original query")
	                except Exception as e:
	                    # Best effort: continue with the original query on error
	                    self.logger.warning(f"Query improvement failed: {e}, using original query")

	            # Step 2: Retrieve context
	            self.logger.info(f"Retrieving context for streaming query: {query[:100]}...")
	            query_param = QueryParam(mode=mode, only_need_context=True)
	            context = await self.lightrag.aquery(query, param=query_param)

	            if not context or not context.strip():
	                self.logger.warning("No context retrieved for query")
	                yield {
	                    "type": "error",
	                    "content": {
	                        "message": "I couldn't find any relevant information in the knowledge base to answer your question. Please ensure documents have been uploaded and indexed, or try rephrasing your query with different keywords.",
	                        "suggestion": "Try uploading relevant documents first, or rephrase your question with more specific terms."
	                    },
	                    "done": True
	                }
	                return

	            # Step 3: Create streaming handler
	            streaming_config = StreamingConfig(
	                enable_verification=enable_verification and hasattr(self, 'answer_verifier'),
	                send_verification_metadata=True,
	                verification_async=False  # Blocking to ensure verification completes
	            )

	            handler = StreamingQueryHandler(
	                config=streaming_config,
	                verifier=getattr(self, 'answer_verifier', None),
	                modifier=getattr(self, 'answer_modifier', None)
	            )

	            # Step 4: Stream response
	            if not hasattr(self.lightrag, 'llm_model_func'):
	                raise ValueError("LLM model function not available for streaming")

	            llm_func = self.lightrag.llm_model_func

	            async def llm_stream_wrapper(prompt, **llm_kwargs):
	                """Adapter giving the handler a streaming view of llm_func"""
	                async for piece in self._simulate_llm_stream(llm_func, prompt, **llm_kwargs):
	                    yield piece

	            async for chunk in handler.stream_with_verification(
	                llm_stream_func=llm_stream_wrapper,
	                query=query,
	                context=context,
	                original_query=original_query
	            ):
	                yield chunk

	        except Exception as e:
	            self.logger.error(f"Error in streaming query: {e}", exc_info=True)
	            yield {
	                "type": "error",
	                "content": {"message": str(e)},
	                "done": True
	            }

	    async def aquery_stream_simple(
	        self,
	        query: str,
	        mode: str = "mix",
	        **kwargs
	    ) -> AsyncGenerator[str, None]:
	        """Simple streaming query without verification

	        Retrieves context and yields answer tokens only. Problems are
	        reported in-band as bracketed strings (``"[No context found]"``,
	        ``"[LLM not available]"``, ``"[Error: ...]"``) rather than raised.

	        Args:
	            query: User query
	            mode: RAG mode
	            **kwargs: Accepted for signature compatibility; unused here

	        Yields:
	            str: Individual tokens

	        Example:
	            async for token in rag.aquery_stream_simple(
	                query="Explain photosynthesis"
	            ):
	                print(token, end="", flush=True)
	        """
	        try:
	            # Get context
	            from lightrag import QueryParam

	            query_param = QueryParam(mode=mode, only_need_context=True)
	            context = await self.lightrag.aquery(query, param=query_param)

	            # Whitespace-only context counts as empty, as in aquery_stream.
	            if not context or not context.strip():
	                yield "[No context found]"
	                return

	            # Create handler
	            handler = StreamingQueryHandler(
	                config=StreamingConfig(enable_verification=False)
	            )

	            # Stream tokens
	            if not hasattr(self.lightrag, 'llm_model_func'):
	                yield "[LLM not available]"
	                return

	            llm_func = self.lightrag.llm_model_func

	            async def llm_stream_wrapper(prompt, **llm_kwargs):
	                """Adapter giving the handler a streaming view of llm_func"""
	                async for piece in self._simulate_llm_stream(llm_func, prompt, **llm_kwargs):
	                    yield piece

	            async for token in handler.stream_simple(
	                llm_stream_func=llm_stream_wrapper,
	                query=query,
	                context=context
	            ):
	                yield token

	        except Exception as e:
	            self.logger.error(f"Error in simple streaming: {e}", exc_info=True)
	            yield f"[Error: {str(e)}]"