# Repository page header (not part of the script):
#   tucano2-commerce/scripts/md_to_ipynb.py
#   rtferraz's picture
#   tools: add md-to-ipynb converter script
#   734569e verified
"""
Convert grpo_vertex_v3.md → grpo_vertex_v3.ipynb
Parses markdown with ```python fenced code blocks into Jupyter notebook cells.
- ```python blocks → code cells
- Everything else → markdown cells
- Consecutive markdown sections are merged into single cells
- Empty cells are skipped
"""
import json
import re
import sys
from pathlib import Path
def _append_cell(cells: list, cell_type: str, lines: list) -> None:
    """Append *lines* to *cells* as one cell of *cell_type*, skipping empty ones."""
    text = '\n'.join(lines)
    if cell_type == 'code':
        # Trim only the tail: leading whitespace (indentation) is part of the code.
        text = text.rstrip()
        if text:
            cells.append(make_code_cell(text))
    else:
        text = text.strip()
        if text:
            cells.append(make_markdown_cell(text))


def md_to_notebook(md_text: str) -> dict:
    """Parse markdown text into a notebook dictionary.

    The text is scanned line by line. A line that is exactly ```python
    (ignoring surrounding whitespace) opens a code cell and a bare ``` line
    closes it. Everything outside those fences is collected into markdown
    cells, so any other fenced block stays in the markdown untouched.
    Cells that are empty after trimming are dropped.

    A ```python line met while a code block is still open starts a new
    code cell; the code gathered so far is emitted first rather than lost.
    A code block that is never closed is emitted at the end of the input.

    Args:
        md_text: The full markdown document.

    Returns:
        A dict with ``nbformat``, ``nbformat_minor``, ``metadata`` and
        ``cells`` keys. Cells are built by ``make_code_cell`` and
        ``make_markdown_cell``.
    """
    cells = []
    current_type = 'markdown'  # or 'code'
    current_lines = []

    # Normalise Windows line endings so no stray '\r' ends up in cell sources.
    for line in md_text.replace('\r\n', '\n').split('\n'):
        marker = line.strip()
        if marker == '```python':
            # Opening fence: emit whatever was buffered, then start a code cell.
            _append_cell(cells, current_type, current_lines)
            current_lines = []
            current_type = 'code'
        elif marker == '```' and current_type == 'code':
            # Closing fence of a python block; a bare ``` in markdown mode
            # falls through to the else branch and stays in the markdown.
            _append_cell(cells, current_type, current_lines)
            current_lines = []
            current_type = 'markdown'
        else:
            current_lines.append(line)

    # Flush whatever is left (trailing markdown or an unterminated code block).
    _append_cell(cells, current_type, current_lines)

    return {
        "nbformat": 4,
        # Cells carry no "id" field, which nbformat 4.5 requires; 4.4 is the
        # newest minor version for which these cells are schema-valid.
        "nbformat_minor": 4,
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3 (ipykernel)",
                "language": "python",
                "name": "python3",
            },
            "language_info": {
                "name": "python",
                "version": "3.10.0",
                "mimetype": "text/x-python",
                "file_extension": ".py",
            },
        },
        "cells": cells,
    }
def make_code_cell(source: str) -> dict:
    """Build an unexecuted code cell whose source is *source* split on newlines.

    The returned ``source`` list holds the lines without trailing newline
    characters; no outputs or execution count are attached.
    """
    source_lines = source.split('\n')
    cell = dict(
        cell_type="code",
        execution_count=None,
        metadata={},
        outputs=[],
        source=source_lines,
    )
    return cell
def make_markdown_cell(source: str) -> dict:
    """Build a markdown cell whose source is *source* split on newlines.

    The returned ``source`` list holds the lines without trailing newline
    characters.
    """
    source_lines = source.split('\n')
    cell = dict(
        cell_type="markdown",
        metadata={},
        source=source_lines,
    )
    return cell
def format_notebook(notebook: dict) -> str:
    """Serialise *notebook* to JSON text with newline-terminated source lines.

    Every entry of a cell's ``source`` list except the last gets a trailing
    newline. The cells are updated in place. The update is idempotent:
    lines that already end with a newline are left alone, so formatting the
    same notebook twice gives identical output.

    Args:
        notebook: Notebook dict with a ``cells`` list; each cell has a
            ``source`` that is a list of lines or a single string.

    Returns:
        The notebook as JSON, indented by one space, with non-ASCII
        characters written as-is.
    """
    for cell in notebook["cells"]:
        lines = cell["source"]
        # An empty source has nothing to terminate; a plain string must not
        # be iterated character by character, so it is kept unchanged.
        if not lines or isinstance(lines, str):
            continue
        terminated = [
            line if line.endswith('\n') else line + '\n'
            for line in lines[:-1]
        ]
        cell["source"] = terminated + [lines[-1]]
    return json.dumps(notebook, indent=1, ensure_ascii=False)
def main():
    """Convert the markdown file named on the command line to a notebook.

    The input path is the first command-line argument, defaulting to
    ``grpo_vertex_v3.md``. The notebook is written next to it with the
    suffix replaced by ``.ipynb``, and a short summary is printed.
    Exits with status 1 when the input is missing or when the output
    path would be the input file itself.
    """
    input_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("grpo_vertex_v3.md")
    output_path = input_path.with_suffix('.ipynb')

    if not input_path.exists():
        print(f"Error: {input_path} not found", file=sys.stderr)
        sys.exit(1)
    if output_path == input_path:
        # An input that already ends in .ipynb would be overwritten by its
        # own conversion result.
        print(f"Error: {input_path} already has the .ipynb suffix", file=sys.stderr)
        sys.exit(1)

    md_text = input_path.read_text(encoding='utf-8')
    notebook = md_to_notebook(md_text)

    # Stats
    code_cells = sum(1 for c in notebook["cells"] if c["cell_type"] == "code")
    md_cells = sum(1 for c in notebook["cells"] if c["cell_type"] == "markdown")

    output_path.write_text(format_notebook(notebook), encoding='utf-8')

    print(f"✓ Converted {input_path} → {output_path}")
    print(f" {code_cells} code cells, {md_cells} markdown cells")
    print(f" Size: {output_path.stat().st_size / 1024:.1f} KB")


if __name__ == "__main__":
    main()