| import os |
| from huggingface_hub import login, snapshot_download |
| from transformers import BlipProcessor, BlipForConditionalGeneration |
| from PIL import Image |
| from dotenv import load_dotenv |
| import gradio as gr |
| from diffusers import FluxPipeline |
| import torch |
| import spaces |
|
|
| |
| |
| |
# Authenticate before touching the Hub: black-forest-labs/FLUX.1-dev is
# distributed as a gated repository, so the access token has to be registered
# before the snapshot below is requested.
load_dotenv()
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if HUGGINGFACE_TOKEN:
    login(token=HUGGINGFACE_TOKEN)

# Warm the local cache so the from_pretrained() calls further down find the
# weights on disk. snapshot_download() does not take a `timeout` keyword; the
# supported knob is `etag_timeout` (seconds to wait for the metadata request).
snapshot_download("Salesforce/blip-image-captioning-large", etag_timeout=120)
snapshot_download("noamrot/FuseCap", etag_timeout=120)
snapshot_download("black-forest-labs/FLUX.1-dev", etag_timeout=300)
|
|
| |
| |
| |
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE: from_pretrained() has no `timeout` option. Unrecognised keyword
# arguments are forwarded to the underlying config/model constructors (or
# dropped with a warning), so the keyword is omitted here.

# General-purpose BLIP captioner (produces caption1).
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
).to(device)

# FuseCap captioner (produces caption2); it ships its own processor.
processor1 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device)

# Text-to-image pipeline used to render the final design, loaded in bfloat16.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.bfloat16,
).to(device)
|
|
| |
| |
| |
# Choices offered by the three radio groups in the UI. The selected values are
# interpolated verbatim into the image-generation prompt.
fabrics = [
    "cotton",
    "silk",
    "denim",
    "linen",
    "polyester",
    "wool",
    "velvet",
]
patterns = [
    "striped",
    "floral",
    "geometric",
    "abstract",
    "solid",
    "polka dots",
]
textile_designs = [
    "woven texture",
    "embroidery",
    "printed fabric",
    "hand-dyed",
    "quilting",
]
|
|
| |
| |
| |
@spaces.GPU(duration=150)
def generate_caption_and_image(image, f, p, d):
    """Caption the uploaded image twice and render a clothing design from it.

    Args:
        image: PIL image supplied by the Gradio image input, or None when
            nothing has been uploaded yet.
        f: Selected fabric name.
        p: Selected pattern name.
        d: Selected textile design name.

    Returns:
        The first image produced by the FLUX pipeline, or None when the image
        or any of the three selections is missing.
    """
    # Nothing to do until every input has been provided.
    if image is None or not (f and p and d):
        return None

    # Both captioners receive the same RGB copy of the upload.
    rgb = image.convert("RGB")

    # FuseCap caption: inputs must be built with FuseCap's own processor
    # (processor1) so that encoding, generation and decoding all agree.
    fusecap_inputs = processor1(rgb, "a picture of ", return_tensors="pt").to(device)
    fusecap_ids = model2.generate(**fusecap_inputs, num_beams=3)
    caption2 = processor1.decode(fusecap_ids[0], skip_special_tokens=True)

    # BLIP-large caption (unconditional, no text prefix).
    blip_inputs = processor(
        rgb,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=250,
    )
    blip_inputs = {key: tensor.to(device) for key, tensor in blip_inputs.items()}
    blip_ids = model.generate(**blip_inputs)
    caption1 = processor.decode(blip_ids[0], skip_special_tokens=True)

    prompt = (
        f"Design a high-quality, stylish clothing item that combines the essence of {caption1} and {caption2}. "
        f"Use luxurious {f} fabric with intricate {d} design elements. "
        f"Incorporate {p} patterns to elevate the garment's aesthetic. "
        "Ensure sophistication, innovation, and timeless elegance."
    )

    # Fixed seed so the same prompt yields the same image on every call.
    seeded = torch.Generator('cpu').manual_seed(0)
    rendered = pipe(
        prompt,
        height=1024,
        width=1024,
        guidance_scale=3.5,
        num_inference_steps=50,
        max_sequence_length=512,
        generator=seeded,
    )
    return rendered.images[0]
|
|
| |
| |
| |
# Input widgets, listed in the positional order of
# generate_caption_and_image(image, f, p, d).
_input_widgets = [
    gr.Image(type="pil", label="Upload Image"),
    gr.Radio(fabrics, label="Select Fabric"),
    gr.Radio(patterns, label="Select Pattern"),
    gr.Radio(textile_designs, label="Select Textile Design"),
]
_output_widget = gr.Image(label="Generated Design")

# live=True makes Gradio re-run the function whenever an input changes rather
# than waiting for a submit button.
iface = gr.Interface(
    fn=generate_caption_and_image,
    inputs=_input_widgets,
    outputs=_output_widget,
    live=True,
)

# Start the web server for the app.
iface.launch()
|
|
| |
| |
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|