Spaces:

GlokalAI
/

OrgAI

Running

OrgAI / rag_anything_smaranika /raganything /query.py

Phonex

TheTruthSchool_RAG

167596f 7 months ago

67.1 kB

	# """
	# Query functionality for RAGAnything

	# Contains all query-related methods for both text and multimodal queries
	# """

	# import json
	# import hashlib
	# import re
	# from typing import Dict, List, Any
	# from pathlib import Path
	# from lightrag import QueryParam
	# from lightrag.utils import always_get_an_event_loop
	# from raganything.prompt import PROMPTS
	# from raganything.utils import (
	# get_processor_for_type,
	# encode_image_to_base64,
	# validate_image_file,
	# )
	# # Add these imports
	# from raganything.query_improvement import QueryImprovementMixin
	# from raganything.verification import DualLLMVerificationMixin


	# class QueryMixin(QueryImprovementMixin, DualLLMVerificationMixin):
	# """QueryMixin class containing query functionality for RAGAnything"""

	# def _generate_multimodal_cache_key(
	# self, query: str, multimodal_content: List[Dict[str, Any]], mode: str, **kwargs
	# ) -> str:
	# """
	# Generate cache key for multimodal query

	# Args:
	# query: Base query text
	# multimodal_content: List of multimodal content
	# mode: Query mode
	# **kwargs: Additional parameters

	# Returns:
	# str: Cache key hash
	# """
	# # Create a normalized representation of the query parameters
	# cache_data = {
	# "query": query.strip(),
	# "mode": mode,
	# }

	# # Normalize multimodal content for stable caching
	# normalized_content = []
	# if multimodal_content:
	# for item in multimodal_content:
	# if isinstance(item, dict):
	# normalized_item = {}
	# for key, value in item.items():
	# # For file paths, use basename to make cache more portable
	# if key in [
	# "img_path",
	# "image_path",
	# "file_path",
	# ] and isinstance(value, str):
	# normalized_item[key] = Path(value).name
	# # For large content, create a hash instead of storing directly
	# elif (
	# key in ["table_data", "table_body"]
	# and isinstance(value, str)
	# and len(value) > 200
	# ):
	# normalized_item[f"{key}_hash"] = hashlib.md5(
	# value.encode()
	# ).hexdigest()
	# else:
	# normalized_item[key] = value
	# normalized_content.append(normalized_item)
	# else:
	# normalized_content.append(item)

	# cache_data["multimodal_content"] = normalized_content

	# # Add relevant kwargs to cache data
	# relevant_kwargs = {
	# k: v
	# for k, v in kwargs.items()
	# if k
	# in [
	# "stream",
	# "response_type",
	# "top_k",
	# "max_tokens",
	# "temperature",
	# # "only_need_context",
	# # "only_need_prompt",
	# ]
	# }
	# cache_data.update(relevant_kwargs)

	# # Generate hash from the cache data
	# cache_str = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)
	# cache_hash = hashlib.md5(cache_str.encode()).hexdigest()

	# return f"multimodal_query:{cache_hash}"

	# # async def aquery(self, query: str, mode: str = "mix", **kwargs) -> str:
	# # """
	# # Pure text query - directly calls LightRAG's query functionality

	# # Args:
	# # query: Query text
	# # mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
	# # **kwargs: Other query parameters, will be passed to QueryParam
	# # - vlm_enhanced: bool, default True when vision_model_func is available.
	# # If True, will parse image paths in retrieved context and replace them
	# # with base64 encoded images for VLM processing.

	# # Returns:
	# # str: Query result
	# # """
	# # if self.lightrag is None:
	# # raise ValueError(
	# # "No LightRAG instance available. Please process documents first or provide a pre-initialized LightRAG instance."
	# # )

	# # # Check if VLM enhanced query should be used
	# # vlm_enhanced = kwargs.pop("vlm_enhanced", None)

	# # # Auto-determine VLM enhanced based on availability
	# # if vlm_enhanced is None:
	# # vlm_enhanced = (
	# # hasattr(self, "vision_model_func")
	# # and self.vision_model_func is not None
	# # )

	# # # Use VLM enhanced query if enabled and available
	# # if (
	# # vlm_enhanced
	# # and hasattr(self, "vision_model_func")
	# # and self.vision_model_func
	# # ):
	# # return await self.aquery_vlm_enhanced(query, mode=mode, **kwargs)
	# # elif vlm_enhanced and (
	# # not hasattr(self, "vision_model_func") or not self.vision_model_func
	# # ):
	# # self.logger.warning(
	# # "VLM enhanced query requested but vision_model_func is not available, falling back to normal query"
	# # )

	# # # Create query parameters
	# # query_param = QueryParam(mode=mode, **kwargs)

	# # self.logger.info(f"Executing text query: {query[:100]}...")
	# # self.logger.info(f"Query mode: {mode}")

	# # # Call LightRAG's query method
	# # result = await self.lightrag.aquery(query, param=query_param)

	# # self.logger.info("Text query completed")
	# # return result

	# # async def aquery_with_multimodal(
	# # self,
	# # query: str,
	# # multimodal_content: List[Dict[str, Any]] = None,
	# # mode: str = "mix",
	# # **kwargs,
	# # ) -> str:
	# # """
	# # Multimodal query - combines text and multimodal content for querying

	# # Args:
	# # query: Base query text
	# # multimodal_content: List of multimodal content, each element contains:
	# # - type: Content type ("image", "table", "equation", etc.)
	# # - Other fields depend on type (e.g., img_path, table_data, latex, etc.)
	# # mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
	# # **kwargs: Other query parameters, will be passed to QueryParam

	# # Returns:
	# # str: Query result

	# # Examples:
	# # # Pure text query
	# # result = await rag.query_with_multimodal("What is machine learning?")

	# # # Image query
	# # result = await rag.query_with_multimodal(
	# # "Analyze the content in this image",
	# # multimodal_content=[{
	# # "type": "image",
	# # "img_path": "./image.jpg"
	# # }]
	# # )

	# # # Table query
	# # result = await rag.query_with_multimodal(
	# # "Analyze the data trends in this table",
	# # multimodal_content=[{
	# # "type": "table",
	# # "table_data": "Name,Age\nAlice,25\nBob,30"
	# # }]
	# # )
	# # """
	# # # Ensure LightRAG is initialized
	# # await self._ensure_lightrag_initialized()

	# # self.logger.info(f"Executing multimodal query: {query[:100]}...")
	# # self.logger.info(f"Query mode: {mode}")

	# # # If no multimodal content, fallback to pure text query
	# # if not multimodal_content:
	# # self.logger.info("No multimodal content provided, executing text query")
	# # return await self.aquery(query, mode=mode, **kwargs)

	# # # Generate cache key for multimodal query
	# # cache_key = self._generate_multimodal_cache_key(
	# # query, multimodal_content, mode, **kwargs
	# # )

	# # # Check cache if available and enabled
	# # cached_result = None
	# # if (
	# # hasattr(self, "lightrag")
	# # and self.lightrag
	# # and hasattr(self.lightrag, "llm_response_cache")
	# # and self.lightrag.llm_response_cache
	# # ):
	# # if self.lightrag.llm_response_cache.global_config.get(
	# # "enable_llm_cache", True
	# # ):
	# # try:
	# # cached_result = await self.lightrag.llm_response_cache.get_by_id(
	# # cache_key
	# # )
	# # if cached_result and isinstance(cached_result, dict):
	# # result_content = cached_result.get("return")
	# # if result_content:
	# # self.logger.info(
	# # f"Multimodal query cache hit: {cache_key[:16]}..."
	# # )
	# # return result_content
	# # except Exception as e:
	# # self.logger.debug(f"Error accessing multimodal query cache: {e}")

	# # # Process multimodal content to generate enhanced query text
	# # enhanced_query = await self._process_multimodal_query_content(
	# # query, multimodal_content
	# # )

	# # self.logger.info(
	# # f"Generated enhanced query length: {len(enhanced_query)} characters"
	# # )

	# # # Execute enhanced query
	# # result = await self.aquery(enhanced_query, mode=mode, **kwargs)

	# # # Save to cache if available and enabled
	# # if (
	# # hasattr(self, "lightrag")
	# # and self.lightrag
	# # and hasattr(self.lightrag, "llm_response_cache")
	# # and self.lightrag.llm_response_cache
	# # ):
	# # if self.lightrag.llm_response_cache.global_config.get(
	# # "enable_llm_cache", True
	# # ):
	# # try:
	# # # Create cache entry for multimodal query
	# # cache_entry = {
	# # "return": result,
	# # "cache_type": "multimodal_query",
	# # "original_query": query,
	# # "multimodal_content_count": len(multimodal_content),
	# # "mode": mode,
	# # }

	# # await self.lightrag.llm_response_cache.upsert(
	# # {cache_key: cache_entry}
	# # )
	# # self.logger.info(
	# # f"Saved multimodal query result to cache: {cache_key[:16]}..."
	# # )
	# # except Exception as e:
	# # self.logger.debug(f"Error saving multimodal query to cache: {e}")

	# # # Ensure cache is persisted to disk
	# # if (
	# # hasattr(self, "lightrag")
	# # and self.lightrag
	# # and hasattr(self.lightrag, "llm_response_cache")
	# # and self.lightrag.llm_response_cache
	# # ):
	# # try:
	# # await self.lightrag.llm_response_cache.index_done_callback()
	# # except Exception as e:
	# # self.logger.debug(f"Error persisting multimodal query cache: {e}")

	# # self.logger.info("Multimodal query completed")
	# # return result

	# async def aquery(self, query: str, mode: str = "mix", **kwargs) -> str:
	# """
	# Pure text query with optional query improvement and verification

	# Args:
	# query: Query text
	# mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
	# **kwargs: Other query parameters
	# - enable_query_improvement: bool, override config setting
	# - enable_verification: bool, override config setting
	# - return_verification_info: bool, return detailed verification info

	# Returns:
	# str: Query result (or dict if return_verification_info=True)
	# """
	# if self.lightrag is None:
	# raise ValueError(
	# "No LightRAG instance available. Please process documents first or provide a pre-initialized LightRAG instance."
	# )

	# # Check override flags
	# use_query_improvement = kwargs.pop('enable_query_improvement',
	# getattr(self.config, 'enable_query_improvement', False))
	# use_verification = kwargs.pop('enable_verification',
	# getattr(self.config, 'enable_dual_llm_verification', False))
	# return_verification_info = kwargs.pop('return_verification_info', False)

	# original_query = query
	# query_improvement_result = None

	# # Step 1: Apply query improvement if enabled
	# if use_query_improvement and hasattr(self, 'query_improver') and self.query_improver:
	# self.logger.info("Applying query improvement...")
	# query_improvement_result = await self._apply_query_improvement(query)
	# query = query_improvement_result["improved_query"]
	# self.logger.info(f"Query improved: '{original_query[:50]}...' -> '{query[:50]}...'")

	# # Step 2: Check VLM enhanced query
	# vlm_enhanced = kwargs.pop("vlm_enhanced", None)
	# if vlm_enhanced is None:
	# vlm_enhanced = (
	# hasattr(self, "vision_model_func") and self.vision_model_func is not None
	# )

	# # If using VLM enhanced or verification is disabled, use existing flow
	# if vlm_enhanced or not use_verification:
	# if vlm_enhanced and hasattr(self, "vision_model_func") and self.vision_model_func:
	# result = await self.aquery_vlm_enhanced(query, mode=mode, **kwargs)
	# else:
	# from lightrag import QueryParam
	# query_param = QueryParam(mode=mode, **kwargs)
	# result = await self.lightrag.aquery(query, param=query_param)

	# if return_verification_info:
	# return {
	# "answer": result,
	# "original_query": original_query,
	# "improved_query": query if query_improvement_result else original_query,
	# "query_improvement": query_improvement_result,
	# "verification_passed": True,
	# "verification_score": 10.0
	# }
	# return result

	# # Step 3: Generate with verification
	# if use_verification and hasattr(self, 'answer_verifier') and self.answer_verifier:
	# self.logger.info("Using dual-LLM verification...")

	# # Get context without final answer
	# from lightrag import QueryParam
	# query_param = QueryParam(mode=mode, only_need_context=True, **kwargs)
	# context = await self.lightrag.aquery(query, param=query_param)

	# # Generate with verification
	# verification_result = await self._generate_with_verification(
	# query=query,
	# context=context,
	# original_query=original_query
	# )

	# if return_verification_info:
	# return {
	# "answer": verification_result["answer"],
	# "original_query": original_query,
	# "improved_query": query if query_improvement_result else original_query,
	# "query_improvement": query_improvement_result,
	# "verification_passed": verification_result["verification_passed"],
	# "verification_score": verification_result["verification_score"],
	# "modification_attempts": verification_result["modification_attempts"],
	# "verification_history": verification_result.get("verification_history", [])
	# }

	# return verification_result["answer"]

	# # Fallback to normal query
	# from lightrag import QueryParam
	# query_param = QueryParam(mode=mode, **kwargs)
	# result = await self.lightrag.aquery(query, param=query_param)

	# if return_verification_info:
	# return {
	# "answer": result,
	# "original_query": original_query,
	# "improved_query": query if query_improvement_result else original_query,
	# "query_improvement": query_improvement_result
	# }

	# return result

	# async def aquery_vlm_enhanced(self, query: str, mode: str = "mix", **kwargs) -> str:
	# """
	# VLM enhanced query - replaces image paths in retrieved context with base64 encoded images for VLM processing

	# Args:
	# query: User query
	# mode: Underlying LightRAG query mode
	# **kwargs: Other query parameters

	# Returns:
	# str: VLM query result
	# """
	# # Ensure VLM is available
	# if not hasattr(self, "vision_model_func") or not self.vision_model_func:
	# raise ValueError(
	# "VLM enhanced query requires vision_model_func. "
	# "Please provide a vision model function when initializing RAGAnything."
	# )

	# # Ensure LightRAG is initialized
	# await self._ensure_lightrag_initialized()

	# self.logger.info(f"Executing VLM enhanced query: {query[:100]}...")

	# # Clear previous image cache
	# if hasattr(self, "_current_images_base64"):
	# delattr(self, "_current_images_base64")

	# # 1. Get original retrieval prompt (without generating final answer)
	# query_param = QueryParam(mode=mode, only_need_prompt=True, **kwargs)
	# raw_prompt = await self.lightrag.aquery(query, param=query_param)

	# self.logger.debug("Retrieved raw prompt from LightRAG")

	# # 2. Extract and process image paths
	# enhanced_prompt, images_found = await self._process_image_paths_for_vlm(
	# raw_prompt
	# )

	# if not images_found:
	# self.logger.info("No valid images found, falling back to normal query")
	# # Fallback to normal query
	# query_param = QueryParam(mode=mode, **kwargs)
	# return await self.lightrag.aquery(query, param=query_param)

	# self.logger.info(f"Processed {images_found} images for VLM")

	# # 3. Build VLM message format
	# messages = self._build_vlm_messages_with_images(enhanced_prompt, query)

	# # 4. Call VLM for question answering
	# result = await self._call_vlm_with_multimodal_content(messages)

	# self.logger.info("VLM enhanced query completed")
	# return result

	# async def _process_multimodal_query_content(
	# self, base_query: str, multimodal_content: List[Dict[str, Any]]
	# ) -> str:
	# """
	# Process multimodal query content to generate enhanced query text

	# Args:
	# base_query: Base query text
	# multimodal_content: List of multimodal content

	# Returns:
	# str: Enhanced query text
	# """
	# self.logger.info("Starting multimodal query content processing...")

	# enhanced_parts = [f"User query: {base_query}"]

	# for i, content in enumerate(multimodal_content):
	# content_type = content.get("type", "unknown")
	# self.logger.info(
	# f"Processing {i+1}/{len(multimodal_content)} multimodal content: {content_type}"
	# )

	# try:
	# # Get appropriate processor
	# processor = get_processor_for_type(self.modal_processors, content_type)

	# if processor:
	# # Generate content description
	# description = await self._generate_query_content_description(
	# processor, content, content_type
	# )
	# enhanced_parts.append(
	# f"\nRelated {content_type} content: {description}"
	# )
	# else:
	# # If no appropriate processor, use basic description
	# basic_desc = str(content)[:200]
	# enhanced_parts.append(
	# f"\nRelated {content_type} content: {basic_desc}"
	# )

	# except Exception as e:
	# self.logger.error(f"Error processing multimodal content: {str(e)}")
	# # Continue processing other content
	# continue

	# enhanced_query = "\n".join(enhanced_parts)
	# enhanced_query += PROMPTS["QUERY_ENHANCEMENT_SUFFIX"]

	# self.logger.info("Multimodal query content processing completed")
	# return enhanced_query

	# async def _generate_query_content_description(
	# self, processor, content: Dict[str, Any], content_type: str
	# ) -> str:
	# """
	# Generate content description for query

	# Args:
	# processor: Multimodal processor
	# content: Content data
	# content_type: Content type

	# Returns:
	# str: Content description
	# """
	# try:
	# if content_type == "image":
	# return await self._describe_image_for_query(processor, content)
	# elif content_type == "table":
	# return await self._describe_table_for_query(processor, content)
	# elif content_type == "equation":
	# return await self._describe_equation_for_query(processor, content)
	# else:
	# return await self._describe_generic_for_query(
	# processor, content, content_type
	# )

	# except Exception as e:
	# self.logger.error(f"Error generating {content_type} description: {str(e)}")
	# return f"{content_type} content: {str(content)[:100]}"

	# async def _describe_image_for_query(
	# self, processor, content: Dict[str, Any]
	# ) -> str:
	# """Generate image description for query"""
	# image_path = content.get("img_path")
	# captions = content.get("image_caption", content.get("img_caption", []))
	# footnotes = content.get("image_footnote", content.get("img_footnote", []))

	# if image_path and Path(image_path).exists():
	# # If image exists, use vision model to generate description
	# image_base64 = processor._encode_image_to_base64(image_path)
	# if image_base64:
	# prompt = PROMPTS["QUERY_IMAGE_DESCRIPTION"]
	# description = await processor.modal_caption_func(
	# prompt,
	# image_data=image_base64,
	# system_prompt=PROMPTS["QUERY_IMAGE_ANALYST_SYSTEM"],
	# )
	# return description

	# # If image doesn't exist or processing failed, use existing information
	# parts = []
	# if image_path:
	# parts.append(f"Image path: {image_path}")
	# if captions:
	# parts.append(f"Image captions: {', '.join(captions)}")
	# if footnotes:
	# parts.append(f"Image footnotes: {', '.join(footnotes)}")

	# return "; ".join(parts) if parts else "Image content information incomplete"

	# async def _describe_table_for_query(
	# self, processor, content: Dict[str, Any]
	# ) -> str:
	# """Generate table description for query"""
	# table_data = content.get("table_data", "")
	# table_caption = content.get("table_caption", "")

	# prompt = PROMPTS["QUERY_TABLE_ANALYSIS"].format(
	# table_data=table_data, table_caption=table_caption
	# )

	# description = await processor.modal_caption_func(
	# prompt, system_prompt=PROMPTS["QUERY_TABLE_ANALYST_SYSTEM"]
	# )

	# return description

	# async def _describe_equation_for_query(
	# self, processor, content: Dict[str, Any]
	# ) -> str:
	# """Generate equation description for query"""
	# latex = content.get("latex", "")
	# equation_caption = content.get("equation_caption", "")

	# prompt = PROMPTS["QUERY_EQUATION_ANALYSIS"].format(
	# latex=latex, equation_caption=equation_caption
	# )

	# description = await processor.modal_caption_func(
	# prompt, system_prompt=PROMPTS["QUERY_EQUATION_ANALYST_SYSTEM"]
	# )

	# return description

	# async def _describe_generic_for_query(
	# self, processor, content: Dict[str, Any], content_type: str
	# ) -> str:
	# """Generate generic content description for query"""
	# content_str = str(content)

	# prompt = PROMPTS["QUERY_GENERIC_ANALYSIS"].format(
	# content_type=content_type, content_str=content_str
	# )

	# description = await processor.modal_caption_func(
	# prompt,
	# system_prompt=PROMPTS["QUERY_GENERIC_ANALYST_SYSTEM"].format(
	# content_type=content_type
	# ),
	# )

	# return description

	# async def _process_image_paths_for_vlm(self, prompt: str) -> tuple[str, int]:
	# """
	# Process image paths in prompt, keeping original paths and adding VLM markers

	# Args:
	# prompt: Original prompt

	# Returns:
	# tuple: (processed prompt, image count)
	# """
	# enhanced_prompt = prompt
	# images_processed = 0

	# # Initialize image cache
	# self._current_images_base64 = []

	# # Enhanced regex pattern for matching image paths
	# # Matches only the path ending with image file extensions
	# image_path_pattern = (
	# r"Image Path:\s*([^\r\n]*?\.(?:jpg|jpeg|png|gif|bmp|webp|tiff|tif))"
	# )

	# # First, let's see what matches we find
	# matches = re.findall(image_path_pattern, prompt)
	# self.logger.info(f"Found {len(matches)} image path matches in prompt")

	# def replace_image_path(match):
	# nonlocal images_processed

	# image_path = match.group(1).strip()
	# self.logger.debug(f"Processing image path: '{image_path}'")

	# # Validate path format (basic check)
	# if not image_path or len(image_path) < 3:
	# self.logger.warning(f"Invalid image path format: {image_path}")
	# return match.group(0) # Keep original

	# # Use utility function to validate image file
	# self.logger.debug(f"Calling validate_image_file for: {image_path}")
	# is_valid = validate_image_file(image_path)
	# self.logger.debug(f"Validation result for {image_path}: {is_valid}")

	# if not is_valid:
	# self.logger.warning(f"Image validation failed for: {image_path}")
	# return match.group(0) # Keep original if validation fails

	# try:
	# # Encode image to base64 using utility function
	# self.logger.debug(f"Attempting to encode image: {image_path}")
	# image_base64 = encode_image_to_base64(image_path)
	# if image_base64:
	# images_processed += 1
	# # Save base64 to instance variable for later use
	# self._current_images_base64.append(image_base64)

	# # Keep original path info and add VLM marker
	# result = f"Image Path: {image_path}\n[VLM_IMAGE_{images_processed}]"
	# self.logger.debug(
	# f"Successfully processed image {images_processed}: {image_path}"
	# )
	# return result
	# else:
	# self.logger.error(f"Failed to encode image: {image_path}")
	# return match.group(0) # Keep original if encoding failed

	# except Exception as e:
	# self.logger.error(f"Failed to process image {image_path}: {e}")
	# return match.group(0) # Keep original

	# # Execute replacement
	# enhanced_prompt = re.sub(
	# image_path_pattern, replace_image_path, enhanced_prompt
	# )

	# return enhanced_prompt, images_processed

	# def _build_vlm_messages_with_images(
	# self, enhanced_prompt: str, user_query: str
	# ) -> List[Dict]:
	# """
	# Build VLM message format, using markers to correspond images with text positions

	# Args:
	# enhanced_prompt: Enhanced prompt with image markers
	# user_query: User query

	# Returns:
	# List[Dict]: VLM message format
	# """
	# images_base64 = getattr(self, "_current_images_base64", [])

	# if not images_base64:
	# # Pure text mode
	# return [
	# {
	# "role": "user",
	# "content": f"Context:\n{enhanced_prompt}\n\nUser Question: {user_query}",
	# }
	# ]

	# # Build multimodal content
	# content_parts = []

	# # Split text at image markers and insert images
	# text_parts = enhanced_prompt.split("[VLM_IMAGE_")

	# for i, text_part in enumerate(text_parts):
	# if i == 0:
	# # First text part
	# if text_part.strip():
	# content_parts.append({"type": "text", "text": text_part})
	# else:
	# # Find marker number and insert corresponding image
	# marker_match = re.match(r"(\d+)\](.*)", text_part, re.DOTALL)
	# if marker_match:
	# image_num = (
	# int(marker_match.group(1)) - 1
	# ) # Convert to 0-based index
	# remaining_text = marker_match.group(2)

	# # Insert corresponding image
	# if 0 <= image_num < len(images_base64):
	# content_parts.append(
	# {
	# "type": "image_url",
	# "image_url": {
	# "url": f"data:image/jpeg;base64,{images_base64[image_num]}"
	# },
	# }
	# )

	# # Insert remaining text
	# if remaining_text.strip():
	# content_parts.append({"type": "text", "text": remaining_text})

	# # Add user question
	# content_parts.append(
	# {
	# "type": "text",
	# "text": f"\n\nUser Question: {user_query}\n\nPlease answer based on the context and images provided.",
	# }
	# )

	# return [
	# {
	# "role": "system",
	# "content": "You are a helpful assistant that can analyze both text and image content to provide comprehensive answers.",
	# },
	# {"role": "user", "content": content_parts},
	# ]

	# async def _call_vlm_with_multimodal_content(self, messages: List[Dict]) -> str:
	# """
	# Call VLM to process multimodal content

	# Args:
	# messages: VLM message format

	# Returns:
	# str: VLM response result
	# """
	# try:
	# user_message = messages[1]
	# content = user_message["content"]
	# system_prompt = messages[0]["content"]

	# if isinstance(content, str):
	# # Pure text mode
	# result = await self.vision_model_func(
	# content, system_prompt=system_prompt
	# )
	# else:
	# # Multimodal mode - pass complete messages directly to VLM
	# result = await self.vision_model_func(
	# "", # Empty prompt since we're using messages format
	# messages=messages,
	# )

	# return result

	# except Exception as e:
	# self.logger.error(f"VLM call failed: {e}")
	# raise

	# # Synchronous versions of query methods
	# def query(self, query: str, mode: str = "mix", **kwargs) -> str:
	# """
	# Synchronous version of pure text query

	# Args:
	# query: Query text
	# mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
	# **kwargs: Other query parameters, will be passed to QueryParam
	# - vlm_enhanced: bool, default True when vision_model_func is available.
	# If True, will parse image paths in retrieved context and replace them
	# with base64 encoded images for VLM processing.

	# Returns:
	# str: Query result
	# """
	# loop = always_get_an_event_loop()
	# return loop.run_until_complete(self.aquery(query, mode=mode, **kwargs))

	# def query_with_multimodal(
	# self,
	# query: str,
	# multimodal_content: List[Dict[str, Any]] = None,
	# mode: str = "mix",
	# **kwargs,
	# ) -> str:
	# """
	# Synchronous version of multimodal query

	# Args:
	# query: Base query text
	# multimodal_content: List of multimodal content, each element contains:
	# - type: Content type ("image", "table", "equation", etc.)
	# - Other fields depend on type (e.g., img_path, table_data, latex, etc.)
	# mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
	# **kwargs: Other query parameters, will be passed to QueryParam

	# Returns:
	# str: Query result
	# """
	# loop = always_get_an_event_loop()
	# return loop.run_until_complete(
	# self.aquery_with_multimodal(query, multimodal_content, mode=mode, **kwargs)
	# )

	"""
	Query functionality for RAGAnything - ENHANCED VERSION

	Contains all query-related methods for text and multimodal queries,
	plus query improvement and dual-LLM verification capabilities.
	"""

	import json
	import hashlib
	import re
	import asyncio
	from typing import Dict, List, Any
	from pathlib import Path
	from lightrag import QueryParam
	from lightrag.utils import always_get_an_event_loop
	from raganything.prompt import PROMPTS
	from raganything.utils import (
	get_processor_for_type,
	encode_image_to_base64,
	validate_image_file,
	)

	# Import new enhancement modules
	from raganything.query_improvement import QueryImprovementMixin
	from raganything.verification import DualLLMVerificationMixin
	from raganything.streaming import StreamingQueryMixin


	class QueryMixin(QueryImprovementMixin, DualLLMVerificationMixin, StreamingQueryMixin):
	"""
	QueryMixin class containing query functionality for RAGAnything

	Enhanced with:
	- Query improvement (rewriting, expansion, decomposition)
	- Dual-LLM verification system
	- Answer modification based on feedback
	- Real-time streaming with verification support
	"""

	def _generate_multimodal_cache_key(
	self, query: str, multimodal_content: List[Dict[str, Any]], mode: str, **kwargs
	) -> str:
	"""
	Generate cache key for multimodal query

	Args:
	query: Base query text
	multimodal_content: List of multimodal content
	mode: Query mode
	**kwargs: Additional parameters

	Returns:
	str: Cache key hash
	"""
	# Create a normalized representation of the query parameters
	cache_data = {
	"query": query.strip(),
	"mode": mode,
	}

	# Normalize multimodal content for stable caching
	normalized_content = []
	if multimodal_content:
	for item in multimodal_content:
	if isinstance(item, dict):
	normalized_item = {}
	for key, value in item.items():
	# For file paths, use basename to make cache more portable
	if key in [
	"img_path",
	"image_path",
	"file_path",
	] and isinstance(value, str):
	normalized_item[key] = Path(value).name
	# For large content, create a hash instead of storing directly
	elif (
	key in ["table_data", "table_body"]
	and isinstance(value, str)
	and len(value) > 200
	):
	normalized_item[f"{key}_hash"] = hashlib.md5(
	value.encode()
	).hexdigest()
	else:
	normalized_item[key] = value
	normalized_content.append(normalized_item)
	else:
	normalized_content.append(item)

	cache_data["multimodal_content"] = normalized_content

	# Add relevant kwargs to cache data
	relevant_kwargs = {
	k: v
	for k, v in kwargs.items()
	if k
	in [
	"stream",
	"response_type",
	"top_k",
	"max_tokens",
	"temperature",
	]
	}
	cache_data.update(relevant_kwargs)

	# Generate hash from the cache data
	cache_str = json.dumps(cache_data, sort_keys=True, ensure_ascii=False)
	cache_hash = hashlib.md5(cache_str.encode()).hexdigest()

	return f"multimodal_query:{cache_hash}"

	async def aquery(self, query: str, mode: str = "mix", **kwargs) -> str:
	"""
	Pure text query with optional query improvement and verification

	Args:
	query: Query text
	mode: Query mode ("local", "global", "hybrid", "naive", "mix", "bypass")
	**kwargs: Other query parameters
	- vlm_enhanced: bool, default True when vision_model_func is available
	- enable_query_improvement: bool, override config setting
	- enable_verification: bool, override config setting
	- return_verification_info: bool, return detailed verification info

	Returns:
	str: Query result (or dict if return_verification_info=True)
	"""
	if self.lightrag is None:
	raise ValueError(
	"No LightRAG instance available. Please process documents first or provide a pre-initialized LightRAG instance."
	)

	# Check override flags
	use_query_improvement = kwargs.pop('enable_query_improvement',
	getattr(self.config, 'enable_query_improvement', False))
	use_verification = kwargs.pop('enable_verification',
	getattr(self.config, 'enable_dual_llm_verification', False))
	return_verification_info = kwargs.pop('return_verification_info', False)

	original_query = query
	query_improvement_result = None

	# Step 1: Apply query improvement if enabled
	if use_query_improvement and hasattr(self, 'query_improver') and self.query_improver:
	self.logger.info("Applying query improvement...")
	query_improvement_result = await self._apply_query_improvement(query)
	if not query_improvement_result["improved_query"]:
	self.logger.warning("Query improvement resulted in an empty query, using original query.")
	query = original_query
	else:
	query = query_improvement_result["improved_query"]
	self.logger.info(f"Query improved: '{original_query[:50]}...' -> '{query[:50]}...'")

	# Check if VLM enhanced query should be used
	vlm_enhanced = kwargs.pop("vlm_enhanced", None)

	# Auto-determine VLM enhanced based on availability
	if vlm_enhanced is None:
	vlm_enhanced = (
	hasattr(self, "vision_model_func")
	and self.vision_model_func is not None
	)

	# If using VLM enhanced or verification is disabled, use existing flow
	if vlm_enhanced or not use_verification:
	# Use VLM enhanced query if enabled and available
	if (
	vlm_enhanced
	and hasattr(self, "vision_model_func")
	and self.vision_model_func
	):
	result = await self.aquery_vlm_enhanced(query, mode=mode, **kwargs)
	elif vlm_enhanced and (
	not hasattr(self, "vision_model_func") or not self.vision_model_func
	):
	self.logger.warning(
	"VLM enhanced query requested but vision_model_func is not available, falling back to normal query"
	)
	# Create query parameters
	query_param = QueryParam(mode=mode, **kwargs)
	# Call LightRAG's query method
	result = await self.lightrag.aquery(query, param=query_param)
	else:
	# Create query parameters
	query_param = QueryParam(mode=mode, **kwargs)
	# Call LightRAG's query method
	result = await self.lightrag.aquery(query, param=query_param)

	# Handle None result from LightRAG
	if result is None:
	result = "I couldn't find any relevant information in the knowledge base to answer your question."

	# Return with verification info if requested
	if return_verification_info:
	return {
	"answer": result,
	"original_query": original_query,
	"improved_query": query if query_improvement_result else original_query,
	"query_improvement": query_improvement_result,
	"verification_passed": True,
	"verification_score": 10.0,
	"modification_attempts": 0
	}

	self.logger.info("Query completed")
	return result

	# Step 2: Generate with verification if enabled
	if use_verification and hasattr(self, 'answer_verifier') and self.answer_verifier:
	self.logger.info("Using dual-LLM verification...")

	# Get context without final answer
	query_param = QueryParam(mode=mode, only_need_context=True, **kwargs)
	context = await self.lightrag.aquery(query, param=query_param)

	# Check if context is None or empty
	if context is None or (isinstance(context, str) and not context.strip()):
	self.logger.warning("No context retrieved from knowledge base")
	no_context_answer = "I couldn't find any relevant information in the knowledge base to answer your question."

	if return_verification_info:
	return {
	"answer": no_context_answer,
	"original_query": original_query,
	"improved_query": query if query_improvement_result else original_query,
	"query_improvement": query_improvement_result,
	"verification_passed": False,
	"verification_score": 0.0,
	"modification_attempts": 0,
	"verification_history": []
	}
	return no_context_answer

	# Generate with verification
	verification_result = await self._generate_with_verification(
	query=query,
	context=context,
	original_query=original_query
	)

	if return_verification_info:
	return {
	"answer": verification_result["answer"],
	"original_query": original_query,
	"improved_query": query if query_improvement_result else original_query,
	"query_improvement": query_improvement_result,
	"verification_passed": verification_result["verification_passed"],
	"verification_score": verification_result["verification_score"],
	"modification_attempts": verification_result["modification_attempts"],
	"verification_history": verification_result.get("verification_history", [])
	}

	self.logger.info("Verified query completed")
	return verification_result["answer"]

	# Fallback to normal query
	query_param = QueryParam(mode=mode, **kwargs)
	result = await self.lightrag.aquery(query, param=query_param)

	# Handle None result from LightRAG
	if result is None:
	result = "I couldn't find any relevant information in the knowledge base to answer your question."

	if return_verification_info:
	return {
	"answer": result,
	"original_query": original_query,
	"improved_query": query if query_improvement_result else original_query,
	"query_improvement": query_improvement_result,
	"verification_passed": True,
	"verification_score": 10.0,
	"modification_attempts": 0
	}

	self.logger.info("Query completed")
	return result

	async def aquery_with_multimodal(
	    self,
	    query: str,
	    multimodal_content: List[Dict[str, Any]] = None,
	    mode: str = "mix",
	    **kwargs,
	) -> str:
	    """
	    Answer a query that is accompanied by multimodal items.

	    The items are converted into an enriched text query which is then
	    answered through ``aquery``. When LightRAG exposes a truthy
	    ``llm_response_cache`` whose ``enable_llm_cache`` setting is on, the
	    answer is looked up in that cache first and written back afterwards.

	    Args:
	        query: Base query text
	        multimodal_content: List of multimodal content; when empty or None
	            the call is delegated to ``aquery`` with the query unchanged
	        mode: Query mode
	        **kwargs: Other query parameters, forwarded to ``aquery`` and
	            included in the cache key

	    Returns:
	        str: Query result (the cached "return" value on a cache hit)
	    """

	    def _response_cache():
	        # Evaluated afresh at every stage, so each stage sees the current
	        # state of self.lightrag and its cache.
	        rag = getattr(self, "lightrag", None)
	        if not rag:
	            return None
	        return getattr(rag, "llm_response_cache", None) or None

	    # Ensure LightRAG is initialized
	    await self._ensure_lightrag_initialized()

	    self.logger.info(f"Executing multimodal query: {query[:100]}...")
	    self.logger.info(f"Query mode: {mode}")

	    # Nothing multimodal to process: behave like a pure text query
	    if not multimodal_content:
	        self.logger.info("No multimodal content provided, executing text query")
	        return await self.aquery(query, mode=mode, **kwargs)

	    cache_key = self._generate_multimodal_cache_key(
	        query, multimodal_content, mode, **kwargs
	    )

	    # Stage 1: try to serve the answer from the cache
	    store = _response_cache()
	    if store is not None and store.global_config.get("enable_llm_cache", True):
	        try:
	            hit = await store.get_by_id(cache_key)
	            payload = hit.get("return") if hit and isinstance(hit, dict) else None
	            if payload:
	                self.logger.info(
	                    f"Multimodal query cache hit: {cache_key[:16]}..."
	                )
	                return payload
	        except Exception as exc:
	            # Cache problems are non-fatal; fall through to a fresh query
	            self.logger.debug(f"Error accessing multimodal query cache: {exc}")

	    # Stage 2: build the enriched query and run it
	    enhanced_query = await self._process_multimodal_query_content(
	        query, multimodal_content
	    )
	    self.logger.info(
	        f"Generated enhanced query length: {len(enhanced_query)} characters"
	    )
	    result = await self.aquery(enhanced_query, mode=mode, **kwargs)

	    # Stage 3: store the fresh answer
	    store = _response_cache()
	    if store is not None and store.global_config.get("enable_llm_cache", True):
	        try:
	            await store.upsert(
	                {
	                    cache_key: {
	                        "return": result,
	                        "cache_type": "multimodal_query",
	                        "original_query": query,
	                        "multimodal_content_count": len(multimodal_content),
	                        "mode": mode,
	                    }
	                }
	            )
	            self.logger.info(
	                f"Saved multimodal query result to cache: {cache_key[:16]}..."
	            )
	        except Exception as exc:
	            self.logger.debug(f"Error saving multimodal query to cache: {exc}")

	    # Stage 4: let the cache run its completion callback; this happens
	    # whenever a cache object exists, independent of enable_llm_cache
	    store = _response_cache()
	    if store is not None:
	        try:
	            await store.index_done_callback()
	        except Exception as exc:
	            self.logger.debug(f"Error persisting multimodal query cache: {exc}")

	    self.logger.info("Multimodal query completed")
	    return result

	async def aquery_vlm_enhanced(self, query: str, mode: str = "mix", **kwargs) -> str:
	    """
	    VLM enhanced query - answers with the vision model when the retrieved
	    prompt references usable images.

	    Flow:
	        1. Ask LightRAG for the prompt only (only_need_prompt=True).
	        2. Tag image paths in that prompt via _process_image_paths_for_vlm.
	        3. With images: build multimodal messages and call the VLM.
	           Without images: feed the already-retrieved prompt to
	           lightrag.llm_model_func, or re-run a normal LightRAG query when
	           that function is missing or fails.

	    Args:
	        query: User query
	        mode: Underlying LightRAG query mode
	        **kwargs: Other query parameters, forwarded to QueryParam

	    Returns:
	        str: VLM query result, or the result of the text-only fallback

	    Raises:
	        ValueError: If vision_model_func is missing or falsy
	    """
	    # Function-scope import keeps this block independent of the module's
	    # import list.
	    import inspect

	    async def _plain_query():
	        """Run an ordinary LightRAG query (retrieval plus generation)."""
	        plain_param = QueryParam(mode=mode, **kwargs)
	        return await self.lightrag.aquery(query, param=plain_param)

	    # Ensure VLM is available
	    if not hasattr(self, "vision_model_func") or not self.vision_model_func:
	        raise ValueError(
	            "VLM enhanced query requires vision_model_func. "
	            "Please provide a vision model function when initializing RAGAnything."
	        )

	    # Ensure LightRAG is initialized
	    await self._ensure_lightrag_initialized()

	    self.logger.info(f"Executing VLM enhanced query: {query[:100]}...")

	    # Clear previous image cache
	    if hasattr(self, "_current_images_base64"):
	        delattr(self, "_current_images_base64")

	    # 1. Get original retrieval prompt (without generating final answer)
	    self.logger.info(f"Getting raw prompt for query: {query[:100]}...")
	    query_param = QueryParam(mode=mode, only_need_prompt=True, **kwargs)
	    try:
	        raw_prompt = await self.lightrag.aquery(query, param=query_param)
	    except Exception as e:
	        self.logger.error(f"Error in self.lightrag.aquery: {e}", exc_info=True)
	        raw_prompt = None
	    self.logger.info(f"Retrieved raw prompt: {str(raw_prompt)[:200]}...")

	    if raw_prompt is None:
	        self.logger.warning("raw_prompt is None, falling back to normal query (single pass)")
	        return await _plain_query()

	    self.logger.debug("Retrieved raw prompt from LightRAG")

	    # 2. Extract and process image paths
	    enhanced_prompt, images_found = await self._process_image_paths_for_vlm(
	        raw_prompt
	    )

	    if not images_found:
	        self.logger.info("No valid images found, falling back to normal query WITHOUT re-retrieval")
	        # raw_prompt already holds the retrieved context, so prefer handing
	        # it straight to the LLM over running retrieval a second time.
	        llm_func = getattr(self.lightrag, "llm_model_func", None)
	        if not llm_func:
	            # No model function available: a re-query is the only option
	            self.logger.debug("llm_model_func not available, using standard re-query")
	            return await _plain_query()

	        try:
	            self.logger.info("Generating answer from cached context (avoiding re-query)")

	            # Inspect the returned value instead of the callable: this also
	            # covers sync wrappers and callable objects that hand back a
	            # coroutine, which a coroutine-function check would miss and
	            # leave un-awaited.
	            result = llm_func(raw_prompt)
	            if inspect.isawaitable(result):
	                result = await result

	            self.logger.info("Successfully generated answer from cached context (no re-query)")
	            return result

	        except Exception as e:
	            self.logger.warning(f"Failed to use cached context, falling back to re-query: {e}")
	            # Fall back to re-query if direct generation fails
	            return await _plain_query()

	    self.logger.info(f"Processed {images_found} images for VLM")

	    # 3. Build VLM message format
	    messages = self._build_vlm_messages_with_images(enhanced_prompt, query)

	    # 4. Call VLM for question answering
	    result = await self._call_vlm_with_multimodal_content(messages)

	    self.logger.info("VLM enhanced query completed")
	    return result

	# Helper methods used by the multimodal and VLM-enhanced query paths

	async def _process_multimodal_query_content(
	    self, base_query: str, multimodal_content: List[Dict[str, Any]]
	) -> str:
	    """
	    Build the enriched query text from the base query and multimodal items.

	    Each item contributes one "Related <type> content: ..." section. The
	    description comes from the matching modal processor when one is found,
	    otherwise from the first 200 characters of the item's string form.
	    Items whose processing raises are logged and left out.

	    Args:
	        base_query: Original user query text
	        multimodal_content: Items to describe; each item's "type" key
	            selects the processor ("unknown" when absent)

	    Returns:
	        str: Sections joined by newlines, followed by
	        PROMPTS["QUERY_ENHANCEMENT_SUFFIX"]
	    """
	    self.logger.info("Starting multimodal query content processing...")

	    sections = [f"User query: {base_query}"]

	    for position, item in enumerate(multimodal_content, start=1):
	        content_type = item.get("type", "unknown")
	        self.logger.info(
	            f"Processing {position}/{len(multimodal_content)} multimodal content: {content_type}"
	        )

	        try:
	            handler = get_processor_for_type(self.modal_processors, content_type)
	            if handler:
	                detail = await self._generate_query_content_description(
	                    handler, item, content_type
	                )
	            else:
	                # No suitable processor: fall back to a truncated repr
	                detail = str(item)[:200]
	            sections.append(f"\nRelated {content_type} content: {detail}")
	        except Exception as exc:
	            # A failing item must not abort the whole query
	            self.logger.error(f"Error processing multimodal content: {str(exc)}")

	    enhanced_query = "\n".join(sections) + PROMPTS["QUERY_ENHANCEMENT_SUFFIX"]

	    self.logger.info("Multimodal query content processing completed")
	    return enhanced_query

	async def _generate_query_content_description(
	    self, processor, content: Dict[str, Any], content_type: str
	) -> str:
	    """
	    Route a multimodal item to the describer matching its type.

	    "image", "table" and "equation" have dedicated describers; every other
	    type goes through the generic one. Any exception is logged and replaced
	    by a short fallback built from the item's string form.

	    Args:
	        processor: Modal processor handed through to the describer
	        content: The multimodal item
	        content_type: Type label of the item

	    Returns:
	        str: The describer's result, or
	        "<content_type> content: <first 100 chars of str(content)>" on error
	    """
	    dedicated = (
	        ("image", "_describe_image_for_query"),
	        ("table", "_describe_table_for_query"),
	        ("equation", "_describe_equation_for_query"),
	    )
	    try:
	        for kind, method_name in dedicated:
	            if content_type == kind:
	                return await getattr(self, method_name)(processor, content)
	        return await self._describe_generic_for_query(
	            processor, content, content_type
	        )
	    except Exception as exc:
	        self.logger.error(f"Error generating {content_type} description: {str(exc)}")
	        return f"{content_type} content: {str(content)[:100]}"

	async def _describe_image_for_query(
	    self, processor, content: Dict[str, Any]
	) -> str:
	    """
	    Generate image description for query.

	    When "img_path" points to an existing file that the processor can
	    encode, the description is produced by processor.modal_caption_func.
	    Otherwise a textual summary is assembled from the path, captions and
	    footnotes found in the item.

	    Args:
	        processor: Modal processor providing _encode_image_to_base64 and an
	            awaitable modal_caption_func
	        content: Image item; reads "img_path", "image_caption" /
	            "img_caption" and "image_footnote" / "img_footnote"

	    Returns:
	        str: The generated description, or the "; "-joined summary, or
	        "Image content information incomplete" when nothing is available
	    """

	    def _as_text_list(value) -> list:
	        # Captions/footnotes may arrive as one string instead of a list;
	        # joining a bare string would split it into single characters.
	        if not value:
	            return []
	        if isinstance(value, str):
	            return [value]
	        try:
	            return [str(item) for item in value if item is not None]
	        except TypeError:
	            # Not iterable (e.g. a number): treat it as one entry
	            return [str(value)]

	    image_path = content.get("img_path")
	    captions = _as_text_list(
	        content.get("image_caption", content.get("img_caption", []))
	    )
	    footnotes = _as_text_list(
	        content.get("image_footnote", content.get("img_footnote", []))
	    )

	    if image_path and Path(image_path).exists():
	        image_base64 = processor._encode_image_to_base64(image_path)
	        if image_base64:
	            prompt = PROMPTS["QUERY_IMAGE_DESCRIPTION"]
	            description = await processor.modal_caption_func(
	                prompt,
	                image_data=image_base64,
	                system_prompt=PROMPTS["QUERY_IMAGE_ANALYST_SYSTEM"],
	            )
	            return description

	    # No usable image file: describe the item from its metadata instead
	    parts = []
	    if image_path:
	        parts.append(f"Image path: {image_path}")
	    if captions:
	        parts.append(f"Image captions: {', '.join(captions)}")
	    if footnotes:
	        parts.append(f"Image footnotes: {', '.join(footnotes)}")

	    return "; ".join(parts) if parts else "Image content information incomplete"

	async def _describe_table_for_query(
	    self, processor, content: Dict[str, Any]
	) -> str:
	    """
	    Describe a table item so it can be folded into the query text.

	    Args:
	        processor: Modal processor exposing an awaitable modal_caption_func
	        content: Table item; "table_data" and "table_caption" are read,
	            each defaulting to an empty string

	    Returns:
	        The value produced by processor.modal_caption_func
	    """
	    data, caption = content.get("table_data", ""), content.get("table_caption", "")
	    analysis_prompt = PROMPTS["QUERY_TABLE_ANALYSIS"].format(
	        table_data=data, table_caption=caption
	    )
	    caption_func = processor.modal_caption_func
	    return await caption_func(
	        analysis_prompt, system_prompt=PROMPTS["QUERY_TABLE_ANALYST_SYSTEM"]
	    )

	async def _describe_equation_for_query(
	    self, processor, content: Dict[str, Any]
	) -> str:
	    """
	    Describe an equation item so it can be folded into the query text.

	    Args:
	        processor: Modal processor exposing an awaitable modal_caption_func
	        content: Equation item; "latex" and "equation_caption" are read,
	            each defaulting to an empty string

	    Returns:
	        The value produced by processor.modal_caption_func
	    """
	    formula, caption = content.get("latex", ""), content.get("equation_caption", "")
	    analysis_prompt = PROMPTS["QUERY_EQUATION_ANALYSIS"].format(
	        latex=formula, equation_caption=caption
	    )
	    caption_func = processor.modal_caption_func
	    return await caption_func(
	        analysis_prompt, system_prompt=PROMPTS["QUERY_EQUATION_ANALYST_SYSTEM"]
	    )

	async def _describe_generic_for_query(
	    self, processor, content: Dict[str, Any], content_type: str
	) -> str:
	    """
	    Describe an item of any other type so it can be folded into the query.

	    The whole item is passed to the prompt in its string form.

	    Args:
	        processor: Modal processor exposing an awaitable modal_caption_func
	        content: The multimodal item
	        content_type: Type label, inserted into both the analysis prompt
	            and the system prompt

	    Returns:
	        The value produced by processor.modal_caption_func
	    """
	    analysis_prompt = PROMPTS["QUERY_GENERIC_ANALYSIS"].format(
	        content_type=content_type, content_str=str(content)
	    )
	    caption_func = processor.modal_caption_func
	    analyst_role = PROMPTS["QUERY_GENERIC_ANALYST_SYSTEM"].format(
	        content_type=content_type
	    )
	    return await caption_func(analysis_prompt, system_prompt=analyst_role)

	async def _process_image_paths_for_vlm(self, prompt: str) -> tuple[str, int]:
	    """
	    Process image paths in prompt, keeping original paths and adding VLM markers.

	    Every "Image Path: <path>" reference whose path ends in a known image
	    extension is validated with validate_image_file and encoded with
	    encode_image_to_base64. On success the encoded image is appended to
	    self._current_images_base64 and a "[VLM_IMAGE_<n>]" marker (1-based)
	    is placed on the line after the path. References that fail validation
	    or encoding are left untouched.

	    Args:
	        prompt: Prompt text to scan; None is returned unchanged

	    Returns:
	        tuple[str, int]: The marked-up prompt and the number of images
	        that were encoded
	    """
	    if prompt is None:
	        self.logger.warning("prompt is None in _process_image_paths_for_vlm, returning as is")
	        return prompt, 0

	    images_processed = 0

	    # Reset the per-query image store that the message builder reads
	    self._current_images_base64 = []

	    # "\s*" tolerates any spacing after the label, "[^\r\n]*?" lazily takes
	    # the rest of the path on the same line, and the unescaped "|" makes
	    # the extensions real alternatives (an escaped "\|" would only match a
	    # literal pipe character and therefore no real path).
	    image_path_pattern = re.compile(
	        r"Image Path:\s*([^\r\n]*?\.(?:jpg|jpeg|png|gif|bmp|webp|tiff|tif))"
	    )

	    matches = image_path_pattern.findall(prompt)
	    self.logger.info(f"Found {len(matches)} image path matches in prompt")

	    def replace_image_path(match):
	        """Return the replacement text for one matched image reference."""
	        nonlocal images_processed

	        image_path = match.group(1).strip()
	        self.logger.debug(f"Processing image path: '{image_path}'")

	        if not image_path or len(image_path) < 3:
	            self.logger.warning(f"Invalid image path format: {image_path}")
	            return match.group(0)

	        self.logger.debug(f"Calling validate_image_file for: {image_path}")
	        is_valid = validate_image_file(image_path)
	        self.logger.debug(f"Validation result for {image_path}: {is_valid}")

	        if not is_valid:
	            self.logger.warning(f"Image validation failed for: {image_path}")
	            return match.group(0)

	        try:
	            self.logger.debug(f"Attempting to encode image: {image_path}")
	            image_base64 = encode_image_to_base64(image_path)
	            if image_base64:
	                images_processed += 1
	                self._current_images_base64.append(image_base64)

	                # Keep the path visible and add the positional marker
	                result = f"Image Path: {image_path}\n[VLM_IMAGE_{images_processed}]"
	                self.logger.debug(
	                    f"Successfully processed image {images_processed}: {image_path}"
	                )
	                return result
	            else:
	                self.logger.error(f"Failed to encode image: {image_path}")
	                return match.group(0)

	        except Exception as e:
	            self.logger.error(f"Failed to process image {image_path}: {e}")
	            return match.group(0)

	    enhanced_prompt = image_path_pattern.sub(replace_image_path, prompt)

	    return enhanced_prompt, images_processed

	def _build_vlm_messages_with_images(
	    self, enhanced_prompt: str, user_query: str
	) -> List[Dict]:
	    """
	    Build VLM message format, using markers to correspond images with text positions.

	    The prompt is split at its "[VLM_IMAGE_<n>]" markers; each marker is
	    replaced by the n-th entry (1-based) of self._current_images_base64 as
	    an "image_url" part, with the surrounding text kept as "text" parts.

	    Args:
	        enhanced_prompt: Prompt text containing the image markers
	        user_query: The user's question, appended as the final text part

	    Returns:
	        List[Dict]: A system message plus a user message with multimodal
	        parts; when no images are stored, a single plain-text user message
	    """
	    # Function-scope import keeps this block independent of the module's
	    # import list.
	    import base64

	    def _sniff_mime(encoded) -> str:
	        # Declare the image's real type in the data URL. Unknown or
	        # unreadable data keeps the previous "image/jpeg" label.
	        try:
	            head = base64.b64decode(encoded[:16])  # 16 chars -> 12 bytes
	        except (TypeError, ValueError):
	            return "image/jpeg"
	        if head.startswith(b"\x89PNG\r\n\x1a\n"):
	            return "image/png"
	        if head[:6] in (b"GIF87a", b"GIF89a"):
	            return "image/gif"
	        if head[:4] == b"RIFF" and head[8:12] == b"WEBP":
	            return "image/webp"
	        if head[:2] == b"BM":
	            return "image/bmp"
	        if head[:4] in (b"II*\x00", b"MM\x00*"):
	            return "image/tiff"
	        return "image/jpeg"

	    images_base64 = getattr(self, "_current_images_base64", [])

	    if not images_base64:
	        return [
	            {
	                "role": "user",
	                "content": f"Context:\n{enhanced_prompt}\n\nUser Question: {user_query}",
	            }
	        ]

	    content_parts = []
	    text_parts = enhanced_prompt.split("[VLM_IMAGE_")

	    for i, text_part in enumerate(text_parts):
	        if i == 0:
	            # Text before the first marker
	            if text_part.strip():
	                content_parts.append({"type": "text", "text": text_part})
	            continue

	        # After the split each fragment should start with "<n>]"
	        marker_match = re.match(r"(\d+)\](.*)", text_part, re.DOTALL)
	        if marker_match is None:
	            # Not a well-formed marker: keep the text verbatim rather than
	            # silently dropping context
	            content_parts.append(
	                {"type": "text", "text": "[VLM_IMAGE_" + text_part}
	            )
	            continue

	        image_num = int(marker_match.group(1)) - 1
	        remaining_text = marker_match.group(2)

	        if 0 <= image_num < len(images_base64):
	            image_data = images_base64[image_num]
	            content_parts.append(
	                {
	                    "type": "image_url",
	                    "image_url": {
	                        "url": f"data:{_sniff_mime(image_data)};base64,{image_data}"
	                    },
	                }
	            )

	        if remaining_text.strip():
	            content_parts.append({"type": "text", "text": remaining_text})

	    content_parts.append(
	        {
	            "type": "text",
	            "text": f"\n\nUser Question: {user_query}\n\nPlease answer based on the context and images provided.",
	        }
	    )

	    return [
	        {
	            "role": "system",
	            "content": "You are a helpful assistant that can analyze both text and image content to provide comprehensive answers.",
	        },
	        {"role": "user", "content": content_parts},
	    ]

	async def _call_vlm_with_multimodal_content(self, messages: List[Dict]) -> str:
	    """
	    Call VLM to process multimodal content.

	    Accepts both message shapes produced by
	    _build_vlm_messages_with_images: [system, user], and the single
	    plain-text [user] message used when no images are present.

	    Args:
	        messages: Message list; with two or more entries the first is
	            treated as the system message and the second as the user
	            message, with one entry it is treated as the user message

	    Returns:
	        str: Whatever self.vision_model_func returns

	    Raises:
	        Exception: Any error from message access or the VLM call is logged
	            and re-raised (an empty list still raises IndexError)
	    """
	    try:
	        if len(messages) >= 2:
	            user_message = messages[1]
	            system_prompt = messages[0]["content"]
	        else:
	            # Single-message form has no system prompt; indexing [1] here
	            # would raise IndexError
	            user_message = messages[0]
	            system_prompt = None

	        content = user_message["content"]

	        if isinstance(content, str):
	            # Plain text: pass prompt and system prompt separately
	            result = await self.vision_model_func(
	                content, system_prompt=system_prompt
	            )
	        else:
	            # Multimodal parts: hand over the full message list
	            result = await self.vision_model_func(
	                "",
	                messages=messages,
	            )

	        return result

	    except Exception as e:
	        self.logger.error(f"VLM call failed: {e}")
	        raise

	# Synchronous versions of query methods
	def query(self, query: str, mode: str = "mix", **kwargs) -> str:
	    """
	    Synchronous version of pure text query.

	    Obtains an event loop via always_get_an_event_loop and runs ``aquery``
	    on it until completion.

	    Args:
	        query: Query text
	        mode: Query mode
	        **kwargs: Other query parameters, forwarded to ``aquery``

	    Returns:
	        str: Whatever ``aquery`` returns
	    """
	    event_loop = always_get_an_event_loop()
	    pending = self.aquery(query, mode=mode, **kwargs)
	    return event_loop.run_until_complete(pending)

	def query_with_multimodal(
	    self,
	    query: str,
	    multimodal_content: List[Dict[str, Any]] = None,
	    mode: str = "mix",
	    **kwargs,
	) -> str:
	    """
	    Synchronous version of multimodal query.

	    Obtains an event loop via always_get_an_event_loop and runs
	    ``aquery_with_multimodal`` on it until completion.

	    Args:
	        query: Base query text
	        multimodal_content: List of multimodal content
	        mode: Query mode
	        **kwargs: Other query parameters, forwarded unchanged

	    Returns:
	        str: Whatever ``aquery_with_multimodal`` returns
	    """
	    event_loop = always_get_an_event_loop()
	    pending = self.aquery_with_multimodal(
	        query, multimodal_content, mode=mode, **kwargs
	    )
	    return event_loop.run_until_complete(pending)