rtferraz committed on
Commit
734569e
·
verified ·
1 Parent(s): 6c51e5f

tools: add md-to-ipynb converter script

Browse files
Files changed (1) hide show
  1. scripts/md_to_ipynb.py +139 -0
scripts/md_to_ipynb.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Convert grpo_vertex_v3.md → grpo_vertex_v3.ipynb
3
+
4
+ Parses markdown with ```python fenced code blocks into Jupyter notebook cells.
5
+ - ```python blocks → code cells
6
+ - Everything else → markdown cells
7
+ - Consecutive markdown sections are merged into single cells
8
+ - Empty cells are skipped
9
+ """
10
+
11
+ import json
12
+ import re
13
+ import sys
14
+ from pathlib import Path
15
+
16
+
17
def md_to_notebook(md_text: str) -> dict:
    """Parse markdown text into an nbformat-4 notebook dict.

    Lines between a ```python fence and the next bare ``` fence become a code
    cell; every other run of lines becomes a markdown cell. Cells whose
    content is empty after whitespace trimming are skipped.

    Args:
        md_text: Full markdown document as a single string.

    Returns:
        A dict with "nbformat", "nbformat_minor", "metadata" and "cells"
        keys. Each cell's "source" is a list of lines without trailing
        newlines.
    """
    cells = []
    current_type = 'markdown'  # or 'code'
    current_lines = []

    for line in md_text.split('\n'):
        marker = line.strip()
        if marker == '```python' and current_type == 'markdown':
            # Opening fence: whatever was buffered is markdown.
            _flush_cell(cells, 'markdown', current_lines)
            current_lines = []
            current_type = 'code'
        elif marker == '```' and current_type == 'code':
            # Closing fence: whatever was buffered is code.
            _flush_cell(cells, 'code', current_lines)
            current_lines = []
            current_type = 'markdown'
        else:
            # Includes a ```python line seen while already inside a code
            # block: a fence with an info string cannot close a block, so it
            # is kept as content instead of discarding the buffered code.
            current_lines.append(line)

    # Flush whatever is left, including an unterminated code block.
    _flush_cell(cells, current_type, current_lines)

    return {
        "nbformat": 4,
        "nbformat_minor": 5,
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3 (ipykernel)",
                "language": "python",
                "name": "python3"
            },
            "language_info": {
                "name": "python",
                "version": "3.10.0",
                "mimetype": "text/x-python",
                "file_extension": ".py"
            }
        },
        "cells": cells
    }


def _flush_cell(cells: list, cell_type: str, lines: list) -> None:
    """Append the buffered lines to ``cells`` as one cell, skipping empty content."""
    joined = '\n'.join(lines)
    if cell_type == 'code':
        # Only trailing whitespace is removed so that leading indentation and
        # the internal structure of the code are preserved.
        body = joined.rstrip()
        if body:
            cells.append(make_code_cell(body))
    else:
        body = joined.strip()
        if body:
            cells.append(make_markdown_cell(body))
80
+
81
+
82
def make_code_cell(source: str) -> dict:
    """Create an unexecuted notebook code cell.

    Args:
        source: Cell text. It is split on newline characters into a list of
            lines that carry no trailing newline.

    Returns:
        A dict with "cell_type", "execution_count" (None), "id", "metadata",
        "outputs" (empty) and "source" keys.
    """
    # Local import keeps this function self-contained.
    import uuid

    return {
        "cell_type": "code",
        "execution_count": None,
        # nbformat 4.5 requires every cell to carry a unique id made of
        # 1-64 characters from [a-zA-Z0-9-_]; a short random hex string fits.
        "id": uuid.uuid4().hex[:8],
        "metadata": {},
        "outputs": [],
        "source": source.split('\n'),
    }
91
+
92
+
93
def make_markdown_cell(source: str) -> dict:
    """Create a notebook markdown cell.

    Args:
        source: Cell text. It is split on newline characters into a list of
            lines that carry no trailing newline.

    Returns:
        A dict with "cell_type", "id", "metadata" and "source" keys.
    """
    # Local import keeps this function self-contained.
    import uuid

    return {
        "cell_type": "markdown",
        # nbformat 4.5 requires every cell to carry a unique id made of
        # 1-64 characters from [a-zA-Z0-9-_]; a short random hex string fits.
        "id": uuid.uuid4().hex[:8],
        "metadata": {},
        "source": source.split('\n'),
    }
100
+
101
+
102
def format_notebook(notebook: dict) -> str:
    """Serialize a notebook dict to JSON text with newline-terminated source lines.

    Every line of a cell's "source" list gets a trailing newline except the
    last one. The input notebook is not modified, so the function can be
    called repeatedly on the same dict without the newlines accumulating.

    Args:
        notebook: Notebook dict whose "cells" entries each have a "source"
            that is a list of lines without trailing newlines. A "source"
            that is already a single string is passed through unchanged.

    Returns:
        The notebook as a JSON string (indent of 1, non-ASCII kept as-is).
    """
    formatted_cells = []
    for cell in notebook["cells"]:
        lines = cell["source"]
        if lines and not isinstance(lines, str):
            # Add \n to all lines except the last
            lines = [line + '\n' for line in lines[:-1]] + [lines[-1]]
        # Replacing an existing key keeps its position, so key order in the
        # emitted JSON matches the input cell.
        formatted_cells.append({**cell, "source": lines})

    return json.dumps(
        {**notebook, "cells": formatted_cells},
        indent=1,
        ensure_ascii=False,
    )
114
+
115
+
116
def main():
    """Convert a markdown file to a Jupyter notebook written next to it.

    The input path is taken from the first command-line argument and falls
    back to "grpo_vertex_v3.md". The output path is the input path with its
    suffix replaced by ".ipynb". Exits with status 1 if the input is missing
    or if writing the output would overwrite the input itself.
    """
    input_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("grpo_vertex_v3.md")
    output_path = input_path.with_suffix('.ipynb')

    if not input_path.exists():
        # Diagnostics go to stderr so they do not mix with normal output.
        print(f"Error: {input_path} not found", file=sys.stderr)
        sys.exit(1)

    if output_path == input_path:
        # An input that already ends in .ipynb would be replaced by its own
        # conversion; refuse instead of destroying the source file.
        print(
            f"Error: {input_path} already has the .ipynb suffix; refusing to overwrite it",
            file=sys.stderr,
        )
        sys.exit(1)

    md_text = input_path.read_text(encoding='utf-8')
    notebook = md_to_notebook(md_text)

    # Stats
    code_cells = sum(1 for c in notebook["cells"] if c["cell_type"] == "code")
    md_cells = sum(1 for c in notebook["cells"] if c["cell_type"] == "markdown")

    output_path.write_text(format_notebook(notebook), encoding='utf-8')

    print(f"✓ Converted {input_path} → {output_path}")
    print(f" {code_cells} code cells, {md_cells} markdown cells")
    print(f" Size: {output_path.stat().st_size / 1024:.1f} KB")
136
+
137
+
138
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()