| # StoryBox Gradio UI + Multilingual Support |
|
|
| This guide covers building a web UI for StoryBox, adding Hindi/multilingual support, and using it as a synthetic data generator. |
|
|
| --- |
|
|
| ## 1. Gradio Web UI |
|
|
| Create `ui/app.py`: |
|
|
| ```python |
| """ |
| StoryBox Web UI — built with Gradio. |
| Supports story generation, persona editing, world building, and multilingual output. |
| """ |
| import json |
| import os |
| import sys |
| from pathlib import Path |
| |
| import gradio as gr |
| |
| # Add storybox to path |
| sys.path.insert(0, str(Path(__file__).resolve().parents[1])) |
| |
| from reverie.config.config import Config |
| from reverie.environment.world import World |
| from reverie.persona.persona import Persona |
| from reverie.manager.persona_manager import persona_manager |
| from reverie.manager.datetime_manager import datetime_manager |
| from reverie.agent.storyteller import Storyteller |
| |
| # ------------------------------------------------------------------ |
| # Story generation pipeline |
| # ------------------------------------------------------------------ |
def generate_story(
    story_setting: str,
    language: str,
    num_days: int,
    num_personas: int,
    model_provider: str,
    model_name: str,
    temperature: float,
    progress=gr.Progress(),
) -> tuple:
    """
    Run the full StoryBox pipeline and return the story plus run metadata.

    Args:
        story_setting: Name of the story folder under ``Config.data_dir``.
        language: Output language; anything other than "English" triggers
            a translation pass via ``translate_story``.
        num_days: Simulated days; the simulation runs 24 iterations per day.
        num_personas: How many persona folders (in sorted order) to load.
        model_provider: Provider label chosen in the UI. It is only recorded
            in the returned metadata; this function does not switch backends.
        model_name: Written to ``Config.llm_model_name``.
        temperature: Written to ``Config.temperature``.
        progress: Gradio progress tracker.

    Returns:
        A ``(story, metadata)`` tuple: one value for each of the two output
        components the click handler is wired to (story textbox, JSON panel).
    """
    import asyncio

    # Slider values may arrive as floats; range() and slicing need ints.
    num_days = int(num_days)
    num_personas = int(num_personas)

    # Override config
    Config.story_name = story_setting
    Config.story_dir = f"{Config.data_dir}/{story_setting}"
    Config.max_iteration = 24 * num_days
    Config.llm_model_name = model_name
    Config.temperature = temperature

    # Load world and personas
    world = World()
    world.load_file(f"{Config.story_dir}/world.yaml")

    persona_folder = f"{Config.story_dir}/personas"
    persona_names = sorted(os.listdir(persona_folder))[:num_personas]
    # NOTE(review): persona_manager is a shared module-level object, so
    # personas presumably accumulate across repeated clicks — confirm whether
    # it needs to be reset before each run.
    for name in persona_names:
        p = Persona(name, f"{persona_folder}/{name}")
        persona_manager.add_persona(p)

    # Run simulation
    all_personas = persona_manager.get_all_personas()
    for i in range(Config.max_iteration):
        # The simulation fills 0.0-0.9 of the bar so the "Writing story..."
        # update at 0.9 below never moves the bar backwards.
        progress(
            0.9 * i / Config.max_iteration,
            desc=f"Simulating hour {i+1}/{Config.max_iteration}",
        )
        new_day = 'First day' if i == 0 else 'New day' if datetime_manager.is_new_day() else False
        for persona in all_personas:
            # Simplified sync call for UI
            asyncio.run(persona.step(world, new_day=new_day))
        datetime_manager.advance_datetime(hours=1)

    # Generate story
    progress(0.9, desc="Writing story...")
    storyteller = Storyteller(Config.story_dir, Config.output_dir)
    storyteller.summarize_daily_by_persona()
    storyteller.generate_story_title()
    storyteller.generate_story_type()
    storyteller.generate_story_background()
    storyteller.generate_story_themes()
    storyteller.generate_story_chapters()
    storyteller.generate_story_conflicts()
    storyteller.generate_story_plot_points()
    storyteller.generate_story()
    storyteller.save()

    story = storyteller.get_story(with_title=True)

    # Translate if needed
    if language != "English":
        story = translate_story(story, language)

    metadata = {
        "story_setting": story_setting,
        "language": language,
        "num_days": num_days,
        "iterations": Config.max_iteration,
        "personas": persona_names,
        "model_provider": model_provider,
        "model_name": model_name,
        "temperature": temperature,
        "word_count": len(story.split()),
    }
    return story, metadata
| |
| |
def _split_for_translation(text: str, max_chars: int) -> list:
    """Split text into pieces of at most max_chars, preferring paragraph breaks."""
    chunks = []
    current = ""
    for paragraph in text.split("\n\n"):
        candidate = f"{current}\n\n{paragraph}" if current else paragraph
        if len(candidate) <= max_chars:
            current = candidate
            continue
        if current:
            chunks.append(current)
        # A single paragraph longer than the limit has to be cut mid-paragraph.
        while len(paragraph) > max_chars:
            chunks.append(paragraph[:max_chars])
            paragraph = paragraph[max_chars:]
        current = paragraph
    if current:
        chunks.append(current)
    return chunks


def translate_story(text: str, target_lang: str, max_chunk_chars: int = 4000) -> str:
    """
    Translate a generated story using LLM calls, one call per chunk.

    The text is split into chunks of at most ``max_chunk_chars`` characters
    so that no part of the story is left out of the translation.

    Args:
        text: Story text to translate.
        target_lang: Language name inserted into the prompt (e.g. "Hindi").
        max_chunk_chars: Upper bound on the story characters sent per call.

    Returns:
        The translated chunks joined with blank lines. Empty or
        whitespace-only input is returned unchanged without calling the model.
    """
    from reverie.common.llm import get_chat_model

    if not text.strip():
        return text

    model = get_chat_model(Config.llm_model_name, Config.temperature)
    translated = []
    for chunk in _split_for_translation(text, max_chunk_chars):
        prompt = (
            f"Translate the following story into {target_lang}. "
            f"Preserve the narrative style, character voices, and emotional tone. "
            f"Do not summarize — translate the full text:\n\n{chunk}"
        )
        translated.append(model.invoke(prompt).content)
    return "\n\n".join(translated)
| |
| |
| # ------------------------------------------------------------------ |
| # Gradio Interface |
| # ------------------------------------------------------------------ |
def build_ui():
    """
    Assemble the StoryBox Gradio Blocks app and return it (not launched).

    Layout: a narrow left column with the generation controls and button,
    a wider right column with the story textbox and a JSON panel, then a
    tips section. The button click calls ``generate_story``.
    """
    setting_choices = [f"story{n:02d}" for n in range(1, 21)]
    language_choices = ["English", "Hindi", "Spanish", "French", "Arabic", "Chinese", "Japanese"]
    provider_choices = ["OpenAI", "Ollama", "MLX (Apple)", "NVIDIA NIM"]
    tips_markdown = """
- **More days** = richer story but longer generation time
- **Lower temperature** = more predictable, coherent stories
- **MLX on Apple Silicon** is fastest for local inference
- **Hindi stories**: Select Hindi output + use a multilingual base model
"""

    with gr.Blocks(title="StoryBox — AI Story Generator") as demo:
        gr.Markdown("# 📖 StoryBox")
        gr.Markdown("Generate long-form stories with multi-agent simulation.")

        with gr.Row():
            # Left column: all generation controls.
            with gr.Column(scale=1):
                story_setting = gr.Dropdown(
                    choices=setting_choices,
                    value="story01",
                    label="Story Setting",
                )
                language = gr.Dropdown(
                    choices=language_choices,
                    value="English",
                    label="Output Language",
                )
                num_days = gr.Slider(1, 14, value=3, step=1, label="Simulation Days")
                num_personas = gr.Slider(2, 6, value=4, step=1, label="Number of Characters")

                model_provider = gr.Radio(
                    choices=provider_choices,
                    value="Ollama",
                    label="Model Provider",
                )
                model_name = gr.Textbox(value="llama3.1:8b", label="Model Name")
                temperature = gr.Slider(0.0, 1.5, value=0.8, step=0.05, label="Temperature")

                generate_btn = gr.Button("🚀 Generate Story", variant="primary")

            # Right column: results.
            with gr.Column(scale=2):
                output_story = gr.Textbox(
                    label="Generated Story",
                    lines=30,
                    max_lines=50,
                    show_copy_button=True,
                )
                output_metadata = gr.JSON(label="Story Metadata")

        # Input order must follow generate_story's positional parameters.
        click_inputs = [
            story_setting,
            language,
            num_days,
            num_personas,
            model_provider,
            model_name,
            temperature,
        ]
        click_outputs = [output_story, output_metadata]
        generate_btn.click(
            fn=generate_story,
            inputs=click_inputs,
            outputs=click_outputs,
        )

        gr.Markdown("---")
        gr.Markdown("### 💡 Tips")
        gr.Markdown(tips_markdown)

    return demo
| |
| |
if __name__ == "__main__":
    # Script entry point: build the Blocks app and serve it on port 7860
    # without requesting a public share link.
    demo = build_ui()
    # NOTE(review): "0.0.0.0" normally means listening on every network
    # interface, so the UI would be reachable from other machines, not only
    # via http://localhost:7860 — confirm this exposure is intended.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
| ``` |
|
|
| ### Run the UI |
|
|
| ```bash |
| pip install gradio |
| cd ui |
| python app.py |
| # Open http://localhost:7860 |
| ``` |
|
|
| --- |
|
|
| ## 2. Hindi / Multilingual Story Generation |
|
|
| ### The Challenge |
|
|
| StoryBox generates stories in **English** by default because: |
| 1. Prompts are in English |
| 2. Base model (GPT-4o-mini) is English-dominant |
| 3. Persona descriptions are in English |
|
|
| ### Solution A: Post-Generation Translation (Easiest) |
|
|
| After generating the English story, translate it: |
|
|
| ```python |
| from reverie.common.llm import get_chat_model |
| |
def translate_to_hindi(english_story: str) -> str:
    """
    Translate an English story into Hindi with a single LLM call.

    The whole story is embedded in one prompt that asks for Devanagari
    output and a full (non-summarized) translation; the content of the
    model's response is returned.
    """
    translator = get_chat_model("nvidia/meta/llama-3.1-8b-instruct", 0.3)
    request = f"""
Translate this English story into fluent, natural Hindi.
Preserve all character names, dialogue style, and emotional depth.
Use Devanagari script. Do not summarize — translate the full text:

{english_story}
"""
    response = translator.invoke(request)
    return response.content
| ``` |
|
|
| - **Pros:** Simple; works with any model |
| - **Cons:** May lose cultural nuance; idioms often do not translate well |
|
|
| ### Solution B: Native Hindi Generation (Better) |
|
|
| Use a **Hindi-capable base model** and Hindi prompts: |
|
|
| ```python |
| # config.py |
| llm_model_name = 'nvidia/meta/llama-3.1-8b-instruct' # Multilingual |
| # OR |
| llm_model_name = 'qwen2.5-mlx' # Strong multilingual |
| ``` |
|
|
| Create Hindi prompt templates in `reverie/prompts/prompt-hi/`: |
|
|
| ``` |
| # generate_first_daily_plan.txt (Hindi version) |
| Variables: |
| !<INPUT 0>! -- पात्र की पहचान |
| !<INPUT 1>! -- आज की तारीख |
| ... |
| |
| <commentblockmarker>###</commentblockmarker> |
| आप एक कहानीकार हैं। नीचे दिए गए पात्र के लिए दैनिक योजना बनाएं: |
| !<INPUT 0>! |
| |
| आज की तारीख: !<INPUT 1>! |
| ... |
| ``` |
|
|
| - **Pros:** Culturally authentic, natural Hindi |
| - **Cons:** All 30+ prompt templates need to be translated |
|
|
| ### Solution C: Hybrid — English Simulation + Hindi Storyteller (Best) |
|
|
| Keep the **sandbox simulation in English** (characters plan, chat, move in English), but make the **Storyteller Agent write in Hindi**: |
|
|
| ```python |
| # In storyteller.py, modify generate_plot_content(): |
| |
| prompt_inputs = [ |
| self.story_title, |
| self.story_type, |
| "यह कहानी हिंदी में लिखी गई है।", # Hindi instruction |
| ... |
| ] |
| ``` |
|
|
| This is the **recommended approach** — simulation logic stays universal, output language is flexible. |
|
|
| --- |
|
|
| ## 3. Synthetic Data Generation Pipeline |
|
|
| StoryBox is excellent for generating **synthetic training data**: |
|
|
| ### Use Case 1: Children's Stories Dataset |
|
|
| ```python |
| """ |
| Generate synthetic children's stories for fine-tuning (1,000 per call by default). |
| """ |
| import json |
| from pathlib import Path |
| |
def generate_synthetic_dataset(
    num_stories: int = 1000,
    output_file: str = "synthetic_stories.jsonl",
    language: str = "English",
):
    """
    Generate stories with StoryBox and write them to a JSONL file.

    Each record is written as soon as its story is finished, so a failure
    part-way through keeps the stories already generated and memory use
    stays flat regardless of ``num_stories``.

    Args:
        num_stories: Number of stories to generate.
        output_file: Destination JSONL path (overwritten), one record per line.
        language: Target language; anything other than "English" is
            produced by translating the English story with ``translate_story``.
    """
    with open(output_file, "w", encoding="utf-8") as f:
        for i in range(num_stories):
            # Cycle round-robin through the 20 settings: story01 .. story20.
            story_id = f"story{(i % 20) + 1:02d}"
            Config.story_name = story_id
            # Keep the directory in sync with the name, as generate_story does.
            # NOTE(review): assumes run_storybox_simulation reads
            # Config.story_dir — confirm.
            Config.story_dir = f"{Config.data_dir}/{story_id}"

            # Run StoryBox (simplified)
            story_text = run_storybox_simulation()

            if language != "English":
                story_text = translate_story(story_text, language)

            record = {
                "id": f"story_{i:06d}",
                "language": language,
                "setting": story_id,
                "text": story_text,
                "word_count": len(story_text.split()),
                "metadata": {
                    "model": Config.llm_model_name,
                    "temperature": Config.temperature,
                },
            }
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
            # Push the finished record to disk before starting the next run.
            f.flush()

    print(f"Generated {num_stories} stories → {output_file}")
| ``` |
|
|
| ### Use Case 2: Instruction-Tuning Dataset |
|
|
| Convert stories into instruction-response pairs: |
|
|
| ```python |
def stories_to_instruction_dataset(stories_file: str, output_file: str):
    """
    Convert a JSONL file of stories into Alpaca-format instruction data.

    Each non-blank input line must be a JSON object with a "text" field.
    Up to three instruction records are produced per story: a continuation
    task (only when the story is longer than 500 characters, so the output
    is never empty), a summary task and a character-description task.

    Args:
        stories_file: Path to the input JSONL file (read as UTF-8).
        output_file: Path of the JSON file to write (UTF-8, overwritten).
    """
    import json

    instructions = []

    # Read explicitly as UTF-8: the stories may contain non-ASCII text and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(stories_file, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline run).
                continue
            story = json.loads(line)
            text = story["text"]

            # Generate multiple instructions per story
            opening, continuation = text[:500], text[500:1500]
            if continuation:
                instructions.append({
                    "instruction": "Continue this story:",
                    "input": opening,
                    "output": continuation,
                })
            instructions.append({
                "instruction": "Summarize this story in one paragraph:",
                "input": text,
                "output": generate_summary(text),
            })
            instructions.append({
                "instruction": "Describe the main character's personality:",
                "input": text,
                "output": extract_character_description(text),
            })

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 as well.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(instructions, f, indent=2, ensure_ascii=False)
| ``` |
|
|
| ### Use Case 3: Low-Resource Language Corpus |
|
|
| For Hindi or other low-resource languages: |
|
|
| ```python |
def generate_hindi_corpus(num_stories: int = 5000):
    """
    Generate synthetic Hindi stories plus an English-Hindi parallel corpus.

    For each story, the English text comes from ``run_storybox_simulation``
    and the Hindi text from ``translate_story``. The Hindi text is written
    to ``hindi_corpus/NNNNNN.txt`` and the pair to
    ``parallel_corpus/NNNNNN.json``. Both directories are created if missing
    and all files are written as UTF-8.

    Args:
        num_stories: Number of stories to generate.
    """
    import json
    from pathlib import Path

    # Use a multilingual base model
    Config.llm_model_name = 'nvidia/meta/llama-3.1-8b-instruct'

    hindi_dir = Path("hindi_corpus")
    parallel_dir = Path("parallel_corpus")
    # Without this the first open() fails on a fresh checkout.
    hindi_dir.mkdir(parents=True, exist_ok=True)
    parallel_dir.mkdir(parents=True, exist_ok=True)

    for i in range(num_stories):
        # Generate in English
        english_story = run_storybox_simulation()

        # Translate to Hindi
        hindi_story = translate_story(english_story, "Hindi")

        # Save; explicit UTF-8 because Devanagari text cannot be encoded by
        # many platform default encodings.
        with open(hindi_dir / f"{i:06d}.txt", "w", encoding="utf-8") as f:
            f.write(hindi_story)

        # Also save English-Hindi parallel
        with open(parallel_dir / f"{i:06d}.json", "w", encoding="utf-8") as f:
            json.dump({"en": english_story, "hi": hindi_story}, f, ensure_ascii=False)
| ``` |
|
|
| --- |
|
|
| ## 4. Quality Evaluation for Synthetic Data |
|
|
| ```python |
def evaluate_story_quality(story: str) -> dict:
    """
    Score a generated story on several simple heuristics.

    Args:
        story: The story text. May be empty.

    Returns:
        A dict with:
        - "length": number of whitespace-separated words.
        - "meets_length_target": True when length is above 5000.
        - "avg_sentence_length": mean words per non-empty sentence, where
          sentences end at ".", "!", "?" or the Devanagari danda "।".
        - "lexical_diversity": unique lowercase words / total words.
        - "repetition_score": share of distinct words used more than 5 times.
        Ratio scores are 0.0 for an empty story instead of raising
        ZeroDivisionError.
    """
    import re
    from collections import Counter

    scores = {}
    words = story.lower().split()
    total_words = len(words)

    # Length check
    scores["length"] = total_words
    scores["meets_length_target"] = total_words > 5000

    # Coherence (simple heuristic). Empty fragments, such as the one after
    # the final terminator, are dropped so they do not pull the average down.
    sentences = [s for s in re.split(r"[.!?।]+", story) if s.strip()]
    if sentences:
        scores["avg_sentence_length"] = (
            sum(len(s.split()) for s in sentences) / len(sentences)
        )
    else:
        scores["avg_sentence_length"] = 0.0

    # Diversity (unique words / total words)
    scores["lexical_diversity"] = len(set(words)) / total_words if words else 0.0

    # Repetition penalty
    word_counts = Counter(words)
    if word_counts:
        overused = sum(1 for count in word_counts.values() if count > 5)
        scores["repetition_score"] = overused / len(word_counts)
    else:
        scores["repetition_score"] = 0.0

    return scores
| ``` |
|
|
| --- |
|
|
| ## 5. Full Pipeline: English → Hindi Synthetic Dataset |
|
|
| ```bash |
| # 1. Generate 1000 English stories |
| python scripts/generate_synthetic_dataset.py \ |
| --num-stories 1000 \ |
| --output en_stories.jsonl \ |
| --model llama3.1-8b-mlx |
| |
| # 2. Translate to Hindi |
| python scripts/translate_dataset.py \ |
| --input en_stories.jsonl \ |
| --output hi_stories.jsonl \ |
| --target-language Hindi \ |
| --model nvidia/meta/llama-3.1-8b-instruct |
| |
| # 3. Convert to instruction format |
| python scripts/to_instruction_format.py \ |
| --input hi_stories.jsonl \ |
| --output hi_instructions.json \ |
| --format alpaca |
| |
| # 4. Push to HuggingFace |
| python scripts/upload_to_hub.py \ |
| --dataset hi_instructions.json \ |
| --repo-id yourname/hindi-synthetic-stories |
| ``` |
|
|
| --- |
|
|
| ## 6. UI Features |
|
|
| The Gradio UI provides: |
| - **Story Setting selector** (20 pre-built worlds) |
| - **Language dropdown** (English, Hindi, etc.) |
| - **Simulation controls** (days, characters, temperature) |
| - **Model provider picker** (OpenAI, Ollama, MLX, NIM) |
| - **Live progress bar** during simulation |
| - **Copy button** for generated stories |
| - **Metadata JSON** with generation stats |
|
|
| --- |
|
|
| ## 7. Recommended Models by Language |
|
|
| | Language | Recommended Model | Provider | |
| |----------|-------------------|----------| |
| | English | `gpt-4o-mini` | OpenAI | |
| | Hindi | `meta/llama-3.1-8b-instruct` | NIM / Ollama | |
| | Hindi | `qwen2.5-7b-instruct` | MLX / Ollama | |
| | Arabic | `ubc-nlp/arallama-2` | HuggingFace | |
| | Chinese | `qwen2.5-7b-instruct` | MLX / Ollama | |
| | Japanese | `elyza/elyza-japanese-llama-2` | HuggingFace | |
| | Multilingual | `meta/llama-3.1-8b-instruct` | Any | |
|
|
| --- |
|
|
| ## 8. Repository Status |
|
|
| ✅ **Repository is now PRIVATE** — https://huggingface.co/raazkumar/storybox-reproduction |
|
|
| Only you can access and clone it. |
|
|