# Gradio chat demo for LLaVA-OneVision-1.5, served through an OpenAI-compatible API.
import base64
import mimetypes
import os
from pathlib import Path
from typing import Any, Dict, List

import gradio as gr
from openai import OpenAI
|
|
# Model name exposed by the backend; override with the DEFAULT_MODEL env var.
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "LLaVA-OneVision-1.5-8B-Instruct")

# OpenAI-compatible client. BASE_URL and API_KEY are expected to be supplied
# via environment variables and should point at the serving endpoint.
_client = OpenAI(
    base_url=os.getenv("BASE_URL", ""),
    api_key=os.getenv("API_KEY", ""),
)
|
|
|
|
def _data_url(path: str) -> str:
    """Encode a local file as a base64 data URL with a best-effort MIME type."""
    mime, _ = mimetypes.guess_type(path)
    mime = mime or "application/octet-stream"
    data = base64.b64encode(Path(path).read_bytes()).decode("utf-8")
    return f"data:{mime};base64,{data}"
|
|
|
|
def _image_content(path: str) -> Dict[str, Any]:
    return {"type": "image_url", "image_url": {"url": _data_url(path)}}
|
|
|
|
def _text_content(text: str) -> Dict[str, Any]:
    return {"type": "text", "text": text}
|
|
|
|
def _message(role: str, content: Any) -> Dict[str, Any]:
    return {"role": role, "content": content}
|
|
|
|
def _build_user_message(message: Dict[str, Any]) -> Dict[str, Any]:
    """Build the OpenAI user message for the current MultimodalTextbox submission."""
    files = message.get("files") or []
    text = (message.get("text") or "").strip()
    content: List[Dict[str, Any]] = [_image_content(p) for p in files]
    if text:
        content.append(_text_content(text))
    return _message("user", content)
|
|
|
|
def _convert_history(history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert Gradio messages-format history into OpenAI chat messages.

    Text turns arrive as plain strings and image uploads as tuples of file
    paths; consecutive user entries are merged into a single multimodal user
    message, which is flushed when the following assistant turn is reached.
    """
    msgs: List[Dict[str, Any]] = []
    user_content: List[Dict[str, Any]] = []

    for turn in history or []:
        role, content = turn.get("role"), turn.get("content")
        if role == "user":
            if isinstance(content, str):
                user_content.append(_text_content(content))
            elif isinstance(content, tuple):
                user_content.extend(_image_content(path) for path in content if path)
        elif role == "assistant":
            msgs.append(_message("user", user_content.copy()))
            user_content.clear()
            msgs.append(_message("assistant", content))
    return msgs
|
|
|
|
def stream_response(message: Dict[str, Any], history: List[Dict[str, Any]], model_name: str = DEFAULT_MODEL):
    """Stream a chat completion, yielding the text accumulated so far for Gradio to render."""
    messages = _convert_history(history)
    messages.append(_build_user_message(message))
    try:
        stream = _client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=0.000001,  # near-zero temperature: effectively greedy decoding
            top_p=1,
            extra_body={
                "repetition_penalty": 1.05,
                "frequency_penalty": 0,
                "presence_penalty": 0
            },
            stream=True
        )
        partial = ""
        for chunk in stream:
            delta = chunk.choices[0].delta.content
            if delta:
                partial += delta
                yield partial
    except Exception as e:
        yield f"Failed to get response: {e}"
|
|
|
|
def build_demo() -> gr.Blocks:
    chatbot = gr.Chatbot(type="messages", allow_tags=["think"])
    textbox = gr.MultimodalTextbox(
        show_label=False,
        placeholder="Enter text, or upload one or more images...",
        file_types=["image"],
| file_count="single", |
        max_plain_text_length=32768
    )
    model_selector = gr.Dropdown(
        label="Model",
        choices=[
            ("LLaVA-OneVision-1.5-8B-Instruct", "LLaVA-OneVision-1.5-8B-Instruct"),
            ("LLaVA-OneVision-1.5-4B-Instruct", "LLaVA-OneVision-1.5-4B-Instruct"),
        ],
        value=DEFAULT_MODEL,
    )
    return gr.ChatInterface(
        fn=stream_response,
        type="messages",
        multimodal=True,
        chatbot=chatbot,
        textbox=textbox,
        title="LLaVA-OneVision-1.5: Fully Open Framework for Democratized Multimodal Training",
        description="""**LLaVA-OneVision-1.5** introduces a novel family of fully open-source Large Multimodal Models (LMMs) that achieves state-of-the-art performance at substantially lower cost through training on native-resolution images.

🔗 **Links**: [GitHub](https://github.com/EvolvingLMMs-Lab/LLaVA-OneVision-1.5) | [HuggingFace](https://huggingface.co/lmms-lab)""",
        additional_inputs=[model_selector],
        additional_inputs_accordion=gr.Accordion("Options", open=True),
    ).queue(default_concurrency_limit=8)
|
|
|
|
def main():
    build_demo().launch()
|
|
|
|
| if __name__ == "__main__": |
| main() |
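

# Minimal usage sketch. The endpoint URL, key value, and script filename below
# are assumptions for illustration, not part of the original code: point
# BASE_URL at an OpenAI-compatible server hosting the selected model, then run
#
#   BASE_URL=http://localhost:8000/v1 API_KEY=EMPTY python app.py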
|
|
|
|