HiDream-O1-Image

Runtime error

App Files Files Community

akhaliq HF Staff committed 14 days ago

Commit

dcf4603

1 Parent(s): 10bdcf8

refactor: update model loading and generation logic to return FileData for HiDream-O1 integration

Browse files

Files changed (1) hide show

app.py +31 -24

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ import gradio as gr
 from gradio import Server
 from fastapi.responses import HTMLResponse
 import torch
-from transformers import AutoProcessor, AutoModelForImageTextToText
 from PIL import Image
 from dotenv import load_dotenv
@@ -31,11 +31,13 @@ logger = logging.getLogger(__name__)
 load_dotenv()
 # Load model and processor
 logger.info("Loading model and processor...")
 model_id = "HiDream-ai/HiDream-O1-Image"
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
-model = AutoModelForImageTextToText.from_pretrained(
     model_id,
     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
     trust_remote_code=True
@@ -49,16 +51,16 @@ else:
 app = Server()
-@app.api("/generate")
 @spaces.GPU
-def generate_image_api(
     prompt: str,
     wh_ratio: str = "1:1",
     negative_prompt: str = "",
     enable_prompt_refine: bool = True,
     seed: int = -1,
     guidance_scale: float = 5.0
-) -> str:
     """
     Generate an image using the local transformers model.
     """
@@ -74,34 +76,39 @@ def generate_image_api(
     inputs = processor(text=prompt, return_tensors="pt").to(model.device)
     with torch.no_grad():
-        # This is a placeholder for the actual generation call.
-        # Most transformers image-gen models use .generate() or a custom method.
-        # Given AutoModelForImageTextToText, it might produce an image tensor.
         output = model.generate(
             **inputs,
-            max_new_tokens=1024, # Adjust based on model specifics
-            # Add other generation params like guidance_scale if supported
         )
     # Process the output to an image
-    # NOTE: AutoModelForImageTextToText usually generates text tokens.
-    # If HiDream-O1-Image generates image tokens, you may need a custom
-    # decoder or a different AutoModel class (e.g. AutoModelForTextToImage).
-    # This implementation assumes processor.batch_decode can handle the output.
-    generated_output = processor.batch_decode(output, skip_special_tokens=True)[0]
-    # If the output is actually an image (PIL or Tensor), we handle it here.
-    # For now, let's assume it returns a PIL image or we can convert it.
-    if isinstance(generated_output, Image.Image):
-        img = generated_output
     else:
-        # Fallback: create a dummy image if decoding fails to show something
-        logger.warning("Generated output was not a PIL image, creating placeholder.")
-        img = Image.new("RGB", (1024, 1024), color=(50, 50, 150))
     out_path = f"generated_{int(time.time())}_{random.randint(0, 1000)}.png"
     img.save(out_path)
-    return out_path
 @app.get("/")
 async def index():

 from gradio import Server
 from fastapi.responses import HTMLResponse
 import torch
+from transformers import AutoProcessor, AutoModel
 from PIL import Image
 from dotenv import load_dotenv
 load_dotenv()
+from gradio.data_classes import FileData
 # Load model and processor
 logger.info("Loading model and processor...")
 model_id = "HiDream-ai/HiDream-O1-Image"
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+model = AutoModel.from_pretrained(
     model_id,
     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
     trust_remote_code=True
 app = Server()
+@app.api()
 @spaces.GPU
+def generate(
     prompt: str,
     wh_ratio: str = "1:1",
     negative_prompt: str = "",
     enable_prompt_refine: bool = True,
     seed: int = -1,
     guidance_scale: float = 5.0
+) -> FileData:
     """
     Generate an image using the local transformers model.
     """
     inputs = processor(text=prompt, return_tensors="pt").to(model.device)
     with torch.no_grad():
+        # HiDream-O1 often takes parameters in the prompt or as kwargs
+        # We pass them here just in case the custom modeling code supports them
         output = model.generate(
             **inputs,
+            max_new_tokens=1024,
+            negative_prompt=negative_prompt,
+            guidance_scale=guidance_scale,
+            wh_ratio=wh_ratio,
         )
     # Process the output to an image
+    # HiDream models often return a PIL image directly or in a list
+    if isinstance(output, Image.Image):
+        img = output
+    elif isinstance(output, list) and len(output) > 0 and isinstance(output[0], Image.Image):
+        img = output[0]
+    elif hasattr(output, "images") and output.images:
+        img = output.images[0]
     else:
+        # Fallback to decoder for text-based or token-based models
+        logger.info("Output is not a PIL image, attempting to decode...")
+        generated_output = processor.batch_decode(output, skip_special_tokens=True)[0]
+        if isinstance(generated_output, Image.Image):
+            img = generated_output
+        else:
+            # Fallback: create a dummy image if decoding fails to show something
+            logger.warning("Generated output was not a PIL image, creating placeholder.")
+            img = Image.new("RGB", (1024, 1024), color=(50, 50, 150))
     out_path = f"generated_{int(time.time())}_{random.randint(0, 1000)}.png"
     img.save(out_path)
+    return FileData(path=out_path)
 @app.get("/")
 async def index():