File size: 4,120 Bytes

734569e

"""
Convert grpo_vertex_v3.md → grpo_vertex_v3.ipynb

Parses markdown with ```python fenced code blocks into Jupyter notebook cells.
- ```python blocks → code cells
- Everything else → markdown cells
- Consecutive markdown sections are merged into single cells
- Empty cells are skipped
"""

import json
import re
import sys
from pathlib import Path


def _flush_pending(cells: list, cell_type: str, pending: list) -> None:
    """Append the pending lines to *cells* as one cell, skipping empty content."""
    if not pending:
        return
    text = '\n'.join(pending)
    if cell_type == 'code':
        # Only trailing whitespace is trimmed: leading whitespace is part of
        # the code's structure and must survive.
        text = text.rstrip()
        if text:
            cells.append(make_code_cell(text))
    else:
        text = text.strip()
        if text:
            cells.append(make_markdown_cell(text))


def md_to_notebook(md_text: str) -> dict:
    """Parse markdown text into a notebook dictionary.

    A line that is exactly ```python (ignoring surrounding whitespace) opens
    a code cell; the next line that is exactly ``` closes it. Everything
    outside those fences is collected into markdown cells. Cells whose
    content is empty after trimming are skipped.

    Args:
        md_text: The full markdown document.

    Returns:
        A dict with the keys "nbformat", "nbformat_minor", "metadata" and
        "cells". Cell sources are lists of lines without newline
        terminators.
    """
    cells = []
    current_type = 'markdown'  # or 'code'
    current_lines = []

    for line in md_text.split('\n'):
        marker = line.strip()
        if marker == '```python':
            # Flush whatever is pending. If a code block was still open, its
            # lines are emitted as a code cell instead of being dropped.
            _flush_pending(cells, current_type, current_lines)
            current_lines = []
            current_type = 'code'
        elif marker == '```' and current_type == 'code':
            _flush_pending(cells, current_type, current_lines)
            current_lines = []
            current_type = 'markdown'
        else:
            # Includes fences of other languages met in markdown mode, which
            # stay part of the markdown text.
            current_lines.append(line)

    # Flush the tail; an unterminated code block is trimmed the same way as
    # a closed one.
    _flush_pending(cells, current_type, current_lines)

    # The notebook declares nbformat 4.5, whose schema requires an "id" on
    # every cell. Position-based ids are unique and keep output reproducible.
    for index, cell in enumerate(cells):
        cell.setdefault("id", f"cell-{index:04d}")

    return {
        "nbformat": 4,
        "nbformat_minor": 5,
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3 (ipykernel)",
                "language": "python",
                "name": "python3"
            },
            "language_info": {
                "name": "python",
                "version": "3.10.0",
                "mimetype": "text/x-python",
                "file_extension": ".py"
            }
        },
        "cells": cells
    }


def make_code_cell(source: str) -> dict:
    """Build a code cell dict with no outputs and no execution count.

    The source text is split on newlines into a list of lines; the lines
    carry no newline terminators at this stage.
    """
    source_lines = source.split('\n')
    cell = dict(
        cell_type="code",
        execution_count=None,
        metadata={},
        outputs=[],
        source=source_lines,
    )
    return cell


def make_markdown_cell(source: str) -> dict:
    """Build a markdown cell dict.

    The source text is split on newlines into a list of lines; the lines
    carry no newline terminators at this stage.
    """
    source_lines = source.split('\n')
    cell = dict(
        cell_type="markdown",
        metadata={},
        source=source_lines,
    )
    return cell


def format_notebook(notebook: dict) -> str:
    """Serialize a notebook dict to JSON with newline-terminated source lines.

    In each cell whose "source" is a list, every line except the last gets a
    trailing newline, unless it already ends with one. A "source" that is a
    plain string is left untouched.

    The input is not modified: terminators are added on copies of the cell
    dicts, so calling this function repeatedly on the same notebook yields
    the same output each time.

    Args:
        notebook: Notebook dict containing a "cells" list; each cell must
            have a "source" entry.

    Returns:
        The notebook as a JSON string (indent of 1, non-ASCII characters
        kept as-is).
    """
    formatted_cells = []
    for cell in notebook["cells"]:
        source = cell["source"]
        if isinstance(source, list) and source:
            terminated = [
                line if line.endswith('\n') else line + '\n'
                for line in source[:-1]
            ]
            source = terminated + [source[-1]]
        # Re-assigning an existing key keeps its position, so the key order
        # in the JSON output matches the input cell.
        formatted_cells.append({**cell, "source": source})

    formatted = {**notebook, "cells": formatted_cells}
    return json.dumps(formatted, indent=1, ensure_ascii=False)


def main():
    """Convert a markdown file to a notebook written next to it.

    The input path is the first command-line argument, or
    "grpo_vertex_v3.md" when none is given. The output path is the input
    path with its suffix replaced by ".ipynb". Exits with status 1 when the
    input file does not exist; otherwise prints a short conversion summary.
    """
    cli_args = sys.argv[1:]
    input_path = Path(cli_args[0]) if cli_args else Path("grpo_vertex_v3.md")
    output_path = input_path.with_suffix('.ipynb')

    if not input_path.exists():
        print(f"Error: {input_path} not found")
        sys.exit(1)

    notebook = md_to_notebook(input_path.read_text(encoding='utf-8'))

    # Tally cells by type for the summary printed below.
    cell_types = [cell["cell_type"] for cell in notebook["cells"]]
    code_cells = cell_types.count("code")
    md_cells = cell_types.count("markdown")

    serialized = format_notebook(notebook)
    output_path.write_text(serialized, encoding='utf-8')

    size_kb = output_path.stat().st_size / 1024
    print(f"✓ Converted {input_path} → {output_path}")
    print(f"  {code_cells} code cells, {md_cells} markdown cells")
    print(f"  Size: {size_kb:.1f} KB")


if __name__ == "__main__":
    main()