# StoryBox Gradio UI + Multilingual Support

This guide covers building a web UI for StoryBox, adding Hindi/multilingual support, and using it as a synthetic data generator.

---

## 1. Gradio Web UI

Create `ui/app.py`:

```python
"""
StoryBox Web UI — built with Gradio.

Supports story generation, persona editing, world building,
and multilingual output.
"""
import asyncio
import json
import os
import sys
from pathlib import Path

import gradio as gr

# Add storybox to path
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from reverie.config.config import Config
from reverie.environment.world import World
from reverie.persona.persona import Persona
from reverie.manager.persona_manager import persona_manager
from reverie.manager.datetime_manager import datetime_manager
from reverie.agent.storyteller import Storyteller


# ------------------------------------------------------------------
# Story generation pipeline
# ------------------------------------------------------------------

def generate_story(
    story_setting: str,
    language: str,
    num_days: int,
    num_personas: int,
    model_provider: str,
    model_name: str,
    temperature: float,
    progress=gr.Progress(),
):
    """
    Run the full StoryBox pipeline and return the generated story.

    Returns:
        (story_text, metadata) — a two-tuple, because the UI binds this
        handler to two outputs (a Textbox and a JSON panel). Returning a
        single value would make Gradio raise at click time.

    NOTE(review): this mutates module-level state (Config, persona_manager),
    so repeated clicks accumulate personas — confirm persona_manager is
    reset between runs, or reset it here.
    """
    # Gradio sliders may deliver floats even with step=1; normalize.
    num_days = int(num_days)
    num_personas = int(num_personas)

    # Override config for this run.
    Config.story_name = story_setting
    Config.story_dir = f"{Config.data_dir}/{story_setting}"
    Config.max_iteration = 24 * num_days  # one simulation step per hour
    Config.llm_model_name = model_name
    Config.temperature = temperature

    # Load world and personas.
    world = World()
    world.load_file(f"{Config.story_dir}/world.yaml")

    persona_folder = f"{Config.story_dir}/personas"
    persona_names = sorted(os.listdir(persona_folder))[:num_personas]
    for name in persona_names:
        p = Persona(name, f"{persona_folder}/{name}")
        persona_manager.add_persona(p)

    # Run simulation, one hour per iteration.
    all_personas = persona_manager.get_all_personas()
    for i in range(Config.max_iteration):
        progress(
            i / Config.max_iteration,
            desc=f"Simulating hour {i+1}/{Config.max_iteration}",
        )
        new_day = (
            'First day' if i == 0
            else 'New day' if datetime_manager.is_new_day()
            else False
        )
        for persona in all_personas:
            # Simplified sync call for the UI; asyncio is imported once
            # at module top instead of inside this hot loop.
            asyncio.run(persona.step(world, new_day=new_day))
        datetime_manager.advance_datetime(hours=1)

    # Generate the story from the simulation trace.
    progress(0.9, desc="Writing story...")
    storyteller = Storyteller(Config.story_dir, Config.output_dir)
    storyteller.summarize_daily_by_persona()
    storyteller.generate_story_title()
    storyteller.generate_story_type()
    storyteller.generate_story_background()
    storyteller.generate_story_themes()
    storyteller.generate_story_chapters()
    storyteller.generate_story_conflicts()
    storyteller.generate_story_plot_points()
    storyteller.generate_story()
    storyteller.save()

    story = storyteller.get_story(with_title=True)

    # Translate if a non-English output language was requested.
    if language != "English":
        story = translate_story(story, language)

    # Second return value feeds the JSON metadata panel in the UI.
    metadata = {
        "setting": story_setting,
        "language": language,
        "days": num_days,
        "personas": num_personas,
        "provider": model_provider,
        "model": model_name,
        "temperature": temperature,
        "word_count": len(story.split()),
    }
    return story, metadata


def translate_story(text: str, target_lang: str) -> str:
    """Translate the generated story using an LLM, chunk by chunk.

    Translating in ~4000-character chunks keeps each request small enough
    for the model while actually covering the whole story — a single
    `text[:4000]` call would silently drop everything past 4000 chars,
    contradicting the "translate the full text" instruction in the prompt.
    """
    from reverie.common.llm import get_chat_model

    model = get_chat_model(Config.llm_model_name, Config.temperature)
    chunk_size = 4000
    translated = []
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size]
        prompt = (
            f"Translate the following story into {target_lang}. "
            f"Preserve the narrative style, character voices, and emotional tone. "
            f"Do not summarize — translate the full text:\n\n{chunk}"
        )
        translated.append(model.invoke(prompt).content)
    return "\n".join(translated)


# ------------------------------------------------------------------
# Gradio Interface
# ------------------------------------------------------------------

def build_ui():
    with gr.Blocks(title="StoryBox — AI Story Generator") as demo:
        gr.Markdown("# 📖 StoryBox")
        gr.Markdown("Generate long-form stories with multi-agent simulation.")

        with gr.Row():
            with gr.Column(scale=1):
                story_setting = gr.Dropdown(
                    choices=[f"story{i:02d}" for i in range(1, 21)],
                    value="story01",
                    label="Story Setting",
                )
                language = gr.Dropdown(
                    choices=["English", "Hindi", "Spanish", "French",
                             "Arabic", "Chinese", "Japanese"],
                    value="English",
                    label="Output Language",
                )
                num_days = gr.Slider(1, 14, value=3, step=1, label="Simulation Days")
                num_personas = gr.Slider(2, 6, value=4, step=1, label="Number of Characters")
                model_provider = gr.Radio(
                    choices=["OpenAI", "Ollama", "MLX (Apple)", "NVIDIA NIM"],
                    value="Ollama",
                    label="Model Provider",
                )
                model_name = gr.Textbox(value="llama3.1:8b", label="Model Name")
                temperature = gr.Slider(0.0, 1.5, value=0.8, step=0.05, label="Temperature")
                generate_btn = gr.Button("🚀 Generate Story", variant="primary")

            with gr.Column(scale=2):
                output_story = gr.Textbox(
                    label="Generated Story",
                    lines=30,
                    max_lines=50,
                    show_copy_button=True,
                )
                output_metadata = gr.JSON(label="Story Metadata")

        # generate_story returns (story, metadata) to match these two outputs.
        generate_btn.click(
            fn=generate_story,
            inputs=[story_setting, language, num_days, num_personas,
                    model_provider, model_name, temperature],
            outputs=[output_story, output_metadata],
        )

        gr.Markdown("---")
        gr.Markdown("### 💡 Tips")
        gr.Markdown("""
        - **More days** = richer story but longer generation time
        - **Lower temperature** = more predictable, coherent stories
        - **MLX on Apple Silicon** is fastest for local inference
        - **Hindi stories**: Select Hindi output + use a multilingual base model
        """)

    return demo


if __name__ == "__main__":
    demo = build_ui()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
```

### Run the UI

```bash
pip install gradio
cd ui
python app.py
# Open http://localhost:7860
```

---

## 2. Hindi / Multilingual Story Generation

### The Challenge

StoryBox generates stories in **English** by default because:

1. Prompts are in English
2. Base model (GPT-4o-mini) is English-dominant
3. Persona descriptions are in English

### Solution A: Post-Generation Translation (Easiest)

After generating the English story, translate it:

```python
from reverie.common.llm import get_chat_model

def translate_to_hindi(english_story: str) -> str:
    model = get_chat_model("nvidia/meta/llama-3.1-8b-instruct", 0.3)
    prompt = f"""
    Translate this English story into fluent, natural Hindi.
    Preserve all character names, dialogue style, and emotional depth.
    Use Devanagari script. Do not summarize — translate the full text:

    {english_story}
    """
    return model.invoke(prompt).content
```

**Pros:** Simple, works with any model
**Cons:** May lose cultural nuance, idioms don't translate well

### Solution B: Native Hindi Generation (Better)

Use a **Hindi-capable base model** and Hindi prompts:

```python
# config.py
llm_model_name = 'nvidia/meta/llama-3.1-8b-instruct'  # Multilingual
# OR
llm_model_name = 'qwen2.5-mlx'  # Strong multilingual
```

Create Hindi prompt templates in `reverie/prompts/prompt-hi/`:

```
# generate_first_daily_plan.txt (Hindi version)
Variables:
!! -- पात्र की पहचान
!! -- आज की तारीख
...
###
आप एक कहानीकार हैं। नीचे दिए गए पात्र के लिए दैनिक योजना बनाएं:
!!
आज की तारीख: !!
...
```

**Pros:** Culturally authentic, natural Hindi
**Cons:** Need to translate all 30+ prompts

### Solution C: Hybrid — English Simulation + Hindi Storyteller (Best)

Keep the **sandbox simulation in English** (characters plan, chat, move in English), but make the **Storyteller Agent write in Hindi**:

```python
# In storyteller.py, modify generate_plot_content():
prompt_inputs = [
    self.story_title,
    self.story_type,
    "यह कहानी हिंदी में लिखी गई है।",  # Hindi instruction
    ...
]
```

This is the **recommended approach** — simulation logic stays universal, output language is flexible.

---

## 3. Synthetic Data Generation Pipeline

StoryBox is excellent for generating **synthetic training data**:

### Use Case 1: Children's Stories Dataset

```python
"""
Generate synthetic children's stories for fine-tuning
(default 1,000; scale num_stories up to 10,000+ as needed).
"""
import json
from pathlib import Path


def generate_synthetic_dataset(
    num_stories: int = 1000,
    output_file: str = "synthetic_stories.jsonl",
    language: str = "English",
):
    # NOTE: Config, run_storybox_simulation and translate_story come from
    # the StoryBox project (see sections above); shown here for brevity.
    stories = []
    for i in range(num_stories):
        # Cycle through the 20 pre-built story settings.
        story_id = f"story{(i % 20) + 1:02d}"
        Config.story_name = story_id

        # Run StoryBox (simplified).
        story_text = run_storybox_simulation()

        if language != "English":
            story_text = translate_story(story_text, language)

        stories.append({
            "id": f"story_{i:06d}",
            "language": language,
            "setting": story_id,
            "text": story_text,
            "word_count": len(story_text.split()),
            "metadata": {
                "model": Config.llm_model_name,
                "temperature": Config.temperature,
            },
        })

    # Save as JSONL (utf-8 so non-ASCII text round-trips on any platform).
    with open(output_file, "w", encoding="utf-8") as f:
        for story in stories:
            f.write(json.dumps(story, ensure_ascii=False) + "\n")

    print(f"Generated {num_stories} stories → {output_file}")
```

### Use Case 2: Instruction-Tuning Dataset

Convert stories into instruction-response pairs:

```python
def stories_to_instruction_dataset(stories_file: str, output_file: str):
    """Convert stories into Alpaca-format instruction tuning data.

    Both files are opened with encoding="utf-8": the input may contain
    non-ASCII (e.g. Hindi) text, and the output is written with
    ensure_ascii=False, which fails on locale encodings like cp1252.
    """
    instructions = []
    with open(stories_file, encoding="utf-8") as f:
        for line in f:
            story = json.loads(line)
            text = story["text"]

            # Generate multiple instructions per story.
            instructions.append({
                "instruction": "Continue this story:",
                "input": text[:500],
                "output": text[500:1500],
            })
            instructions.append({
                "instruction": "Summarize this story in one paragraph:",
                "input": text,
                "output": generate_summary(text),
            })
            instructions.append({
                "instruction": "Describe the main character's personality:",
                "input": text,
                "output": extract_character_description(text),
            })

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(instructions, f, indent=2, ensure_ascii=False)
```

### Use Case 3: Low-Resource Language Corpus

For Hindi or other low-resource languages:

```python
def generate_hindi_corpus(num_stories: int = 5000):
    """Generate synthetic Hindi stories for language model pre-training."""
    # Use a multilingual base model.
    Config.llm_model_name = 'nvidia/meta/llama-3.1-8b-instruct'

    # Ensure output directories exist before the first write.
    Path("hindi_corpus").mkdir(exist_ok=True)
    Path("parallel_corpus").mkdir(exist_ok=True)

    for i in range(num_stories):
        # Generate in English.
        english_story = run_storybox_simulation()

        # Translate to Hindi.
        hindi_story = translate_story(english_story, "Hindi")

        # Save (utf-8 is required for Devanagari text).
        with open(f"hindi_corpus/{i:06d}.txt", "w", encoding="utf-8") as f:
            f.write(hindi_story)

        # Also save an English-Hindi parallel pair.
        with open(f"parallel_corpus/{i:06d}.json", "w", encoding="utf-8") as f:
            json.dump({"en": english_story, "hi": hindi_story}, f,
                      ensure_ascii=False)
```

---

## 4.
Quality Evaluation for Synthetic Data

```python
from collections import Counter


def evaluate_story_quality(story: str) -> dict:
    """Score a generated story on multiple heuristic dimensions.

    Returns a dict of scores. An empty or whitespace-only story yields
    zeroed ratio metrics instead of raising ZeroDivisionError (the naive
    `len(set(words)) / len(words)` divides by zero on empty input).
    """
    scores = {}
    words = story.lower().split()

    # Length check.
    scores["length"] = len(words)
    scores["meets_length_target"] = scores["length"] > 5000

    if not words:
        # Guard: all ratio-based metrics are undefined for an empty story.
        scores["avg_sentence_length"] = 0.0
        scores["lexical_diversity"] = 0.0
        scores["repetition_score"] = 0.0
        return scores

    # Coherence (simple heuristic) — drop empty fragments produced by a
    # trailing period so they don't deflate the average.
    sentences = [s for s in story.split(".") if s.strip()]
    scores["avg_sentence_length"] = (
        sum(len(s.split()) for s in sentences) / len(sentences)
        if sentences else 0.0
    )

    # Diversity (unique words / total words).
    scores["lexical_diversity"] = len(set(words)) / len(words)

    # Repetition penalty: fraction of distinct words appearing > 5 times.
    word_counts = Counter(words)
    scores["repetition_score"] = (
        sum(1 for c in word_counts.values() if c > 5) / len(word_counts)
    )

    return scores
```

---

## 5. Full Pipeline: English → Hindi Synthetic Dataset

```bash
# 1. Generate 1000 English stories
python scripts/generate_synthetic_dataset.py \
    --num-stories 1000 \
    --output en_stories.jsonl \
    --model llama3.1-8b-mlx

# 2. Translate to Hindi
python scripts/translate_dataset.py \
    --input en_stories.jsonl \
    --output hi_stories.jsonl \
    --target-language Hindi \
    --model nvidia/meta/llama-3.1-8b-instruct

# 3. Convert to instruction format
python scripts/to_instruction_format.py \
    --input hi_stories.jsonl \
    --output hi_instructions.json \
    --format alpaca

# 4. Push to HuggingFace
python scripts/upload_to_hub.py \
    --dataset hi_instructions.json \
    --repo-id yourname/hindi-synthetic-stories
```

---

## 6. UI Screenshots

The Gradio UI provides:

- **Story Setting selector** (20 pre-built worlds)
- **Language dropdown** (English, Hindi, etc.)
- **Simulation controls** (days, characters, temperature)
- **Model provider picker** (OpenAI, Ollama, MLX, NIM)
- **Live progress bar** during simulation
- **Copy button** for generated stories
- **Metadata JSON** with generation stats

---

## 7.
Recommended Models by Language

| Language | Recommended Model | Provider |
|----------|-------------------|----------|
| English | `gpt-4o-mini` | OpenAI |
| Hindi | `meta/llama-3.1-8b-instruct` | NIM / Ollama |
| Hindi | `qwen2.5-7b-instruct` | MLX / Ollama |
| Arabic | `ubc-nlp/arallama-2` | HuggingFace |
| Chinese | `qwen2.5-7b-instruct` | MLX / Ollama |
| Japanese | `elyza/elyza-japanese-llama-2` | HuggingFace |
| Multilingual | `meta/llama-3.1-8b-instruct` | Any |

---

## 8. Repository Status

✅ **Repository is now PRIVATE** — https://huggingface.co/raazkumar/storybox-reproduction

Only accounts granted access (currently just the owner) can view or clone it.