heymenn commited on
Commit
7eedaf8
Β·
1 Parent(s): 3bf5b65
README.md CHANGED
@@ -1,10 +1,13 @@
1
  ---
2
- title: ApplyCRs
3
- emoji: πŸ‘€
4
- colorFrom: green
5
- colorTo: red
6
- sdk: docker
 
 
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
  ---
2
+ title: CR Application Tool
3
+ emoji: πŸ“„
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: streamlit
7
+ sdk_version: 1.35.0
8
+ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ Automated 3GPP/ETSI CR application tool.
13
+ Upload an Excel contribution list → preview accepted CRs → apply all changes → download ZIP.
app.py ADDED
@@ -0,0 +1,459 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ CR Application Tool β€” Streamlit frontend.
4
+
5
+ Three-step UI:
6
+ 1. UPLOAD β€” upload Excel contribution list
7
+ 2. PREVIEW β€” review accepted CRs
8
+ 3. RUNNING β€” pipeline subprocess with live log
9
+ 4. DONE/ERROR β€” download ZIP of results
10
+ """
11
+
12
+ import io
13
+ import json
14
+ import os
15
+ import subprocess
16
+ import sys
17
+ import threading
18
+ import time
19
+ import uuid
20
+ import zipfile
21
+ from datetime import datetime
22
+ from pathlib import Path
23
+
24
+ import streamlit as st
25
+
26
+ # ── Scripts dir (same folder as app.py / scripts/) ───────────────────────────
27
+ SCRIPTS_DIR = Path(__file__).parent / "scripts"
28
+ sys.path.insert(0, str(SCRIPTS_DIR))
29
+
30
+ # ── Session persistence ───────────────────────────────────────────────────────
31
+
32
+ def _get_session_base() -> Path:
33
+ """Use /data/cr_sessions if writable (HF persistent storage), else /tmp."""
34
+ candidate = Path("/data/cr_sessions")
35
+ try:
36
+ candidate.mkdir(parents=True, exist_ok=True)
37
+ probe = candidate / ".write_test"
38
+ probe.write_text("x")
39
+ probe.unlink()
40
+ return candidate
41
+ except OSError:
42
+ fallback = Path("/tmp/cr_sessions")
43
+ fallback.mkdir(parents=True, exist_ok=True)
44
+ return fallback
45
+
46
+
47
+ SESSION_BASE = _get_session_base()
48
+
49
+
50
def session_dir(sid: str) -> Path:
    """Return (and create if needed) the working directory for session *sid*."""
    path = SESSION_BASE / sid
    path.mkdir(parents=True, exist_ok=True)
    return path
54
+
55
+
56
def _state_path(sid: str) -> Path:
    """Location of the persisted state.json for session *sid*."""
    return session_dir(sid) / "state.json"
58
+
59
+
60
def load_state(sid: str) -> dict | None:
    """Load the persisted session state, or None if missing or unreadable.

    Fix: the original caught a blanket ``Exception``, which would also
    swallow programming errors.  Only the failures that can actually occur
    here are handled: I/O errors (OSError) and malformed JSON (ValueError,
    which covers json.JSONDecodeError).
    """
    p = _state_path(sid)
    if not p.exists():
        return None
    try:
        return json.loads(p.read_text())
    except (OSError, ValueError):
        # Corrupt or unreadable state file — treat as "no session".
        return None
68
+
69
+
70
def save_state(sid: str, state: dict) -> None:
    """Persist *state* as pretty-printed JSON (non-JSON values go through str)."""
    payload = json.dumps(state, indent=2, default=str)
    _state_path(sid).write_text(payload)
72
+
73
+
74
def new_state(sid: str) -> dict:
    """Build a fresh per-session state dict for session *sid*.

    All pipeline-related fields start as None/empty; the UI begins on the
    "upload" step.
    """
    return dict(
        session_id=sid,
        status="upload",
        excel_filename=None,
        person_name="Ly Thanh PHAN",
        cr_list=[],
        pid=None,
        output_dir=None,
        log_path=None,
        started_at=None,
        completed_at=None,
        return_code=None,
    )
88
+
89
+
90
+ # ── Helpers ───────────────────────────────────────────────────────────────────
91
+
92
def _rc_path(sid: str) -> Path:
    """File where the pipeline subprocess's exit code is persisted."""
    return session_dir(sid) / "returncode"
94
+
95
+
96
+ def _run_and_save_rc(proc: subprocess.Popen, rc_path: Path) -> None:
97
+ """Background thread: wait for process, write return code to disk."""
98
+ proc.wait()
99
+ rc_path.write_text(str(proc.returncode))
100
+
101
+
102
def read_return_code(sid: str) -> int | None:
    """Return the persisted pipeline exit code, or None if absent/unparseable."""
    rc_file = _rc_path(sid)
    if not rc_file.exists():
        return None
    try:
        return int(rc_file.read_text().strip())
    except ValueError:
        # File exists but is not an integer — treat as unknown.
        return None
110
+
111
+
112
def is_process_alive(pid: int) -> bool:
    """Probe *pid* with signal 0 (no signal is actually delivered)."""
    try:
        os.kill(pid, 0)
    except (ProcessLookupError, PermissionError):
        # PermissionError means the pid exists but belongs to another user;
        # for our purposes (a child we spawned) that counts as "not ours/gone".
        return False
    return True
118
+
119
+
120
def tail_log(log_path: str, n: int = 100) -> str:
    """Return the last *n* lines of the log file, joined with newlines."""
    log_file = Path(log_path)
    if not log_file.exists():
        return "(log not yet available…)"
    all_lines = log_file.read_text(errors="replace").splitlines()
    return "\n".join(all_lines[-n:])
126
+
127
+
128
def parse_log_results(log_path: str) -> list[dict]:
    """Extract per-TS result rows from the "Final Report" section of the log.

    Lines before the Final Report marker are ignored; after it, any line
    containing [OK], [WARN] or [FAIL] yields one {"Status", "TS"} dict.
    """
    log_file = Path(log_path)
    if not log_file.exists():
        return []
    results: list[dict] = []
    in_report = False
    for line in log_file.read_text(errors="replace").splitlines():
        if "Final Report" in line:
            in_report = True
        if not in_report:
            continue
        for tag in ("OK", "WARN", "FAIL"):
            marker = f"[{tag}]"
            if marker in line:
                ts_name = line.split(marker, 1)[-1].strip()
                results.append({"Status": tag, "TS": ts_name})
                break
    return results
145
+
146
+
147
def peek_submitted_by(excel_path: Path, max_names: int = 20) -> list[str]:
    """Return unique non-empty SubmittedBy values from the Excel (best-effort).

    Diagnostic helper for the "no CRs found" case: shows the user the exact
    contributor spellings present in the sheet.  Any failure (missing
    library, unreadable file, unexpected layout) deliberately yields [].
    """
    try:
        ext = excel_path.suffix.lower()
        names: set[str] = set()
        if ext == ".xls":
            import xlrd
            wb = xlrd.open_workbook(str(excel_path))
            try:
                ws = wb.sheet_by_name("Contributions")
            except xlrd.XLRDError:
                # Sheet name varies between exports — fall back to the first sheet.
                ws = wb.sheet_by_index(0)
            headers = [str(ws.cell_value(0, c)).strip() for c in range(ws.ncols)]
            # Header spelling differs between meeting exports.
            by_col = next(
                (i for i, h in enumerate(headers)
                 if h.lower() in ("submittedby", "submitted by")),
                None,
            )
            if by_col is not None:
                for r in range(1, ws.nrows):
                    v = str(ws.cell_value(r, by_col)).strip()
                    if v:
                        names.add(v)
        elif ext == ".xlsx":
            import openpyxl
            wb = openpyxl.load_workbook(str(excel_path), read_only=True, data_only=True)
            ws = wb["Contributions"] if "Contributions" in wb.sheetnames else wb.active
            rows = iter(ws.iter_rows(values_only=True))
            headers = [str(c).strip() if c is not None else "" for c in next(rows, [])]
            by_col = next(
                (i for i, h in enumerate(headers)
                 if h.lower() in ("submittedby", "submitted by")),
                None,
            )
            if by_col is not None:
                for row in rows:
                    v = str(row[by_col]).strip() if row[by_col] is not None else ""
                    # openpyxl yields None for blank cells; also guard the
                    # literal string "None" from str() conversion.
                    if v and v != "None":
                        names.add(v)
        return sorted(names)[:max_names]
    except Exception:
        # Best-effort: a diagnostic must never crash the page.
        return []
189
+
190
+
191
def make_zip(output_dir: Path) -> bytes:
    """Zip every file under *output_dir* and return the archive bytes.

    Archive paths are kept relative to output_dir's parent, so the ZIP
    unpacks into a single top-level "output/" folder.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        for entry in output_dir.rglob("*"):
            if entry.is_file():
                archive.write(entry, entry.relative_to(output_dir.parent))
    return buffer.getvalue()
199
+
200
+
201
# ── Page config ───────────────────────────────────────────────────────────────
# NOTE: set_page_config must be the first Streamlit call executed in the script.
st.set_page_config(
    page_title="CR Application Tool",
    page_icon="📄",
    layout="centered",
)
st.title("📄 CR Application Tool")
st.caption("Upload an ETSI/3GPP Excel contribution list → preview accepted CRs → apply all → download ZIP.")
209
+
210
# ── Session init ──────────────────────────────────────────────────────────────
# The session id (sid) is mirrored into the URL query string so a browser
# reload — or a pasted link — can resume the same server-side session folder.
params = st.query_params

if "sid" not in st.session_state:
    if "sid" in params:
        candidate = params["sid"]
        existing = load_state(candidate)
        if existing:
            # Known sid in the URL: adopt its persisted state.
            st.session_state.sid = candidate
            st.session_state.state = existing
        else:
            # Unknown sid in the URL: start a brand-new session.
            sid = str(uuid.uuid4())
            st.session_state.sid = sid
            st.session_state.state = new_state(sid)
            st.query_params["sid"] = sid
    else:
        # No sid at all: brand-new session, advertised via the query string.
        sid = str(uuid.uuid4())
        st.session_state.sid = sid
        st.session_state.state = new_state(sid)
        st.query_params["sid"] = sid

sid: str = st.session_state.sid
state: dict = st.session_state.state
233
+
234
# ── Sidebar ───────────────────────────────────────────────────────────────────
# Shows the current session id and lets the user resume any persisted session
# by pasting its full id.
with st.sidebar:
    st.header("Session")
    st.caption(f"ID: `{sid[:8]}…`")
    st.divider()
    st.subheader("Resume a session")
    resume_sid = st.text_input("Paste a session ID")
    if st.button("Resume") and resume_sid.strip():
        existing = load_state(resume_sid.strip())
        if existing:
            st.session_state.sid = resume_sid.strip()
            st.session_state.state = existing
            st.query_params["sid"] = resume_sid.strip()
            st.rerun()
        else:
            st.error("Session not found.")
250
+
251
# ── State machine ─────────────────────────────────────────────────────────────
status: str = state["status"]

# ════════════════════════════════════════════════════════════════════════════
# UPLOAD
# ════════════════════════════════════════════════════════════════════════════
if status == "upload":
    st.subheader("Step 1 — Upload contribution list")

    uploaded = st.file_uploader(
        "Excel contribution list (.xlsx or .xls)",
        type=["xlsx", "xls"],
    )
    person_name = st.text_input(
        "Contributor name (must match SubmittedBy column)",
        value=state.get("person_name", "Ly Thanh PHAN"),
    )

    if uploaded and st.button("Parse CR list →", type="primary"):
        # Persist the upload into the session folder so a later pipeline run
        # (and a resumed session) can find it by name.
        excel_path = session_dir(sid) / uploaded.name
        excel_path.write_bytes(uploaded.getbuffer())

        with st.spinner("Parsing Excel…"):
            try:
                # fetch_crs lives in scripts/, added to sys.path at import time.
                from fetch_crs import parse_excel
                cr_list = parse_excel(str(excel_path), person_name)
                state["status"] = "preview"
                state["excel_filename"] = uploaded.name
                state["person_name"] = person_name
                # Rows are persisted as plain lists so they survive JSON round-trips.
                state["cr_list"] = [list(row) for row in cr_list]
                save_state(sid, state)
                st.rerun()
            except Exception as exc:
                st.error(f"Failed to parse Excel: {exc}")
285
+
286
# ════════════════════════════════════════════════════════════════════════════
# PREVIEW
# ════════════════════════════════════════════════════════════════════════════
elif status == "preview":
    cr_list = state["cr_list"]
    st.subheader(f"Step 2 — {len(cr_list)} Accepted CR(s) found")

    if cr_list:
        import pandas as pd
        df = pd.DataFrame(cr_list, columns=["UID", "Title"])
        st.dataframe(df, use_container_width=True)
    else:
        st.warning(
            f"No Accepted CRs found for **{state['person_name']}** in this file."
        )
        # Diagnostic: show what names are in the SubmittedBy column
        excel_path = session_dir(sid) / state["excel_filename"]
        found_names = peek_submitted_by(excel_path)
        if found_names:
            st.info(
                "**Names found in SubmittedBy column** — copy the exact one into the field above and re-upload:\n\n"
                + "\n".join(f"- `{n}`" for n in found_names)
            )

    col1, col2 = st.columns(2)
    with col1:
        if st.button("← Back"):
            state["status"] = "upload"
            state["cr_list"] = []
            save_state(sid, state)
            st.rerun()
    with col2:
        if cr_list and st.button("▶ Start Pipeline", type="primary"):
            excel_path = session_dir(sid) / state["excel_filename"]
            output_dir = session_dir(sid) / "output"
            output_dir.mkdir(parents=True, exist_ok=True)
            log_path = session_dir(sid) / "pipeline.log"
            rc_path = _rc_path(sid)

            cmd = [
                sys.executable,
                str(SCRIPTS_DIR / "orchestrate_cr.py"),
                str(excel_path),
                state["person_name"],
                "--output-dir", str(output_dir),
            ]

            # The child inherits the file descriptor, so the parent's handle
            # can be closed right after Popen without affecting child writes.
            log_file = open(str(log_path), "w")
            proc = subprocess.Popen(
                cmd,
                stdout=log_file,
                stderr=subprocess.STDOUT,
                env=os.environ.copy(),
            )
            log_file.close()

            # Background thread writes returncode file when process finishes
            threading.Thread(
                target=_run_and_save_rc,
                args=(proc, rc_path),
                daemon=True,
            ).start()

            # Keep the Popen handle for fast liveness checks within this
            # Streamlit session (survives reruns, not browser reloads).
            st.session_state.proc = proc

            state["status"] = "running"
            state["pid"] = proc.pid
            state["output_dir"] = str(output_dir)
            state["log_path"] = str(log_path)
            state["started_at"] = datetime.now().isoformat()
            save_state(sid, state)
            st.rerun()
358
+
359
# ════════════════════════════════════════════════════════════════════════════
# RUNNING
# ════════════════════════════════════════════════════════════════════════════
elif status == "running":
    pid = state["pid"]
    log_path = state["log_path"]

    # Determine whether process is still alive
    proc = st.session_state.get("proc")
    alive = False
    if proc is not None:
        # Same Streamlit session: ask the Popen handle directly.
        alive = proc.poll() is None
    else:
        # Session reloaded — check returncode file, then PID
        rc = read_return_code(sid)
        if rc is None:
            alive = is_process_alive(pid)

    if alive:
        st.subheader("⏳ Pipeline running…")
        st.info(f"PID {pid} — started {state.get('started_at', '')[:19]}")
        log_text = tail_log(log_path, 100)
        st.text_area("Live log (last 100 lines)", value=log_text, height=400)
        # Poll every 2 s by re-running the script.
        time.sleep(2)
        st.rerun()
    else:
        # Process finished — determine return code
        rc = read_return_code(sid)
        if rc is None and proc is not None:
            rc = proc.returncode
        state["return_code"] = rc
        state["completed_at"] = datetime.now().isoformat()
        state["status"] = "done" if rc == 0 else "error"
        save_state(sid, state)
        st.rerun()
394
+
395
# ════════════════════════════════════════════════════════════════════════════
# DONE / ERROR
# ════════════════════════════════════════════════════════════════════════════
elif status in ("done", "error"):
    log_path = state.get("log_path", "")
    output_dir = Path(state.get("output_dir", ""))
    rc = state.get("return_code")

    if status == "done":
        st.success("✅ Pipeline completed successfully!")
    else:
        st.error(f"❌ Pipeline finished with errors (return code: {rc})")

    # Per-TS results table
    results = parse_log_results(log_path)
    if results:
        st.subheader("Results per TS")
        import pandas as pd

        df = pd.DataFrame(results)

        def _color_status(val):
            # Map a status tag to a pandas Styler CSS string (green/amber/red).
            return {
                "OK": "background-color: #d4edda; color: #155724",
                "WARN": "background-color: #fff3cd; color: #856404",
                "FAIL": "background-color: #f8d7da; color: #721c24",
            }.get(val, "")

        # NOTE(review): Styler.map requires pandas >= 2.1 — confirm the
        # deployed pandas version (it is pulled in transitively by streamlit).
        st.dataframe(
            df.style.map(_color_status, subset=["Status"]),
            use_container_width=True,
        )

    # Download ZIP
    if output_dir.exists() and any(output_dir.rglob("*")):
        st.subheader("Download results")
        zip_bytes = make_zip(output_dir)
        st.download_button(
            label="⬇ Download results ZIP",
            data=zip_bytes,
            file_name=f"cr_results_{sid[:8]}.zip",
            mime="application/zip",
            type="primary",
        )
    else:
        st.warning("Output directory is empty — nothing to download.")

    # Full log
    with st.expander("Full pipeline log"):
        if log_path and Path(log_path).exists():
            st.text(Path(log_path).read_text(errors="replace"))
        else:
            st.text("Log not found.")

    # Start new session
    st.divider()
    if st.button("Start new session"):
        new_sid = str(uuid.uuid4())
        st.session_state.sid = new_sid
        st.session_state.state = new_state(new_sid)
        # Drop the stale Popen handle from the previous run, if any.
        if "proc" in st.session_state:
            del st.session_state.proc
        st.query_params["sid"] = new_sid
        save_state(new_sid, st.session_state.state)
        st.rerun()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit==1.35.0
2
+ python-docx==1.1.2
3
+ openpyxl==3.1.5
4
+ xlrd==2.0.1
5
+ lxml==5.2.2
6
+ requests==2.32.3
scripts/__pycache__/cr_parser.cpython-310.pyc ADDED
Binary file (11.9 kB). View file
 
scripts/__pycache__/docx_helpers.cpython-310.pyc ADDED
Binary file (13.1 kB). View file
 
scripts/__pycache__/fetch_crs.cpython-310.pyc ADDED
Binary file (12.3 kB). View file
 
scripts/__pycache__/finalize_ts.cpython-310.pyc ADDED
Binary file (9.04 kB). View file
 
scripts/__pycache__/ts_applicator.cpython-310.pyc ADDED
Binary file (18.1 kB). View file
 
scripts/cr_parser.py ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ cr_parser.py — Parse a CR DOCX's tracked changes into a JSON manifest.
4
+
5
+ Each entry in the manifest is one of:
6
+ {"type": "text_replace", "location": {...}, "old": "...", "new": "..."}
7
+ {"type": "para_insert", "location": {...}, "paragraphs": [...]}
8
+ {"type": "row_insert", "location": {...}, "cells": [...]}
9
+
10
+ Usage:
11
+ python3 cr_parser.py <cr.docx> [--output manifest.json]
12
+ # or import: from cr_parser import parse_cr
13
+ """
14
+
15
+ import argparse
16
+ import json
17
+ import re
18
+ import sys
19
+ from pathlib import Path
20
+
21
+ import docx
22
+ from docx.oxml.ns import qn
23
+
24
+
25
+ # ── Low-level text helpers ────────────────────────────────────────────────────
26
+
27
def _del_text(elem):
    """Join the text of every w:delText descendant of *elem*."""
    fragments = []
    for node in elem.findall('.//' + qn('w:delText')):
        fragments.append(node.text or '')
    return ''.join(fragments)
30
+
31
def _ins_text(elem):
    """Join the text of every w:t descendant (used on w:ins subtrees)."""
    fragments = []
    for node in elem.findall('.//' + qn('w:t')):
        fragments.append(node.text or '')
    return ''.join(fragments)
34
+
35
def _para_new_text(p_elem):
    """Paragraph text after accepting tracked changes (ins kept, del dropped)."""
    fragments = []
    for node in p_elem.findall('.//' + qn('w:t')):
        fragments.append(node.text or '')
    return ''.join(fragments)
38
+
39
def _para_orig_text(p_elem):
    """Paragraph text as it exists in the TS (del kept, ins dropped)."""
    del_tag = qn('w:delText')
    t_tag = qn('w:t')
    ins_tag = qn('w:ins')
    chunks = []
    for node in p_elem.iter():
        if not node.text:
            continue
        if node.tag == del_tag:
            chunks.append(node.text)
        elif node.tag == t_tag:
            # w:t inside a w:ins is new content — exclude it.
            if all(ancestor.tag != ins_tag for ancestor in node.iterancestors()):
                chunks.append(node.text)
    return ''.join(chunks)
50
+
51
def _style_val(p_elem):
    """Return the paragraph style name (w:pStyle/@w:val), or None if unset."""
    pPr = p_elem.find(qn('w:pPr'))
    pStyle = pPr.find(qn('w:pStyle')) if pPr is not None else None
    return pStyle.get(qn('w:val')) if pStyle is not None else None
59
+
60
def _is_rpr_ins(ins_elem):
    """True if w:ins sits inside w:rPr — a formatting change, not content."""
    parent = ins_elem.getparent()
    if parent is None:
        return False
    return parent.tag == qn('w:rPr')
64
+
65
def _is_inserted_para(p_elem):
    """True if the paragraph-mark is tracked as inserted (wholly new paragraph)."""
    pPr = p_elem.find(qn('w:pPr'))
    rPr = pPr.find(qn('w:rPr')) if pPr is not None else None
    return rPr is not None and rPr.find(qn('w:ins')) is not None
74
+
75
def _is_deleted_para(p_elem):
    """True if the paragraph-mark is tracked as deleted (whole paragraph removed)."""
    pPr = p_elem.find(qn('w:pPr'))
    rPr = pPr.find(qn('w:rPr')) if pPr is not None else None
    return rPr is not None and rPr.find(qn('w:del')) is not None
84
+
85
def _is_fully_deleted_tbl(tbl_elem):
    """True if every row in the table is tracked as a row-level deletion.

    Improvement: the original evaluated ``tr.find(qn('w:trPr'))`` twice per
    row inside the all() expression; the lookup is now done once per row and
    the function short-circuits on the first non-deleted row.
    """
    rows = tbl_elem.findall(qn('w:tr'))
    if not rows:
        return False
    for tr in rows:
        trPr = tr.find(qn('w:trPr'))
        if trPr is None or trPr.find(qn('w:del')) is None:
            return False
    return True
95
+
96
def _is_fully_inserted_tbl(tbl_elem):
    """True if every row in the table is tracked as a row-level insertion.

    Improvement: the original evaluated ``tr.find(qn('w:trPr'))`` twice per
    row inside the all() expression; the lookup is now done once per row and
    the function short-circuits on the first non-inserted row.
    """
    rows = tbl_elem.findall(qn('w:tr'))
    if not rows:
        return False
    for tr in rows:
        trPr = tr.find(qn('w:trPr'))
        if trPr is None or trPr.find(qn('w:ins')) is None:
            return False
    return True
106
+
107
+
108
+ # ── Table helpers ─────────────────────────────────────────────────────────────
109
+
110
def _table_header(tbl_elem):
    """First-row cell texts — used as the table's identifier."""
    first_tr = tbl_elem.find(qn('w:tr'))
    if first_tr is None:
        return []
    texts = []
    for tc in first_tr.findall(qn('w:tc')):
        p = tc.find('.//' + qn('w:p'))
        texts.append('' if p is None else _para_new_text(p).strip())
    return texts
120
+
121
def _row_col0(tr_elem):
    """Column-0 text of a table row — used as the row anchor."""
    tc = tr_elem.find(qn('w:tc'))
    if tc is None:
        return ''
    p = tc.find('.//' + qn('w:p'))
    return '' if p is None else _para_new_text(p).strip()
128
+
129
+
130
+ # ── Inline del+ins extraction (from a single paragraph) ──────────────────────
131
+
132
def _extract_inline_replacements(p_elem):
    """
    Return list of (old_text, new_text) pairs from del+ins sibling pairs.
    Handles: del-then-ins, ins-then-del, multi-fragment consecutive dels.
    Filters: whitespace-only dels with no adjacent ins, empty dels, rPr ins.

    The pairing walks direct children of the paragraph in document order;
    *skip* records indices already consumed by an earlier pair so a run
    like del+del+ins is emitted once, not three times.
    """
    children = list(p_elem)
    pairs = []
    skip = set()

    for i, child in enumerate(children):
        if i in skip:
            continue

        if child.tag != qn('w:del'):
            continue

        old_text = _del_text(child)

        # Empty del (paragraph-mark or line-break deletion) — discard
        if not old_text:
            skip.add(i)
            continue

        # Merge consecutive del siblings (multi-fragment deletion)
        j = i + 1
        while j < len(children) and children[j].tag == qn('w:del'):
            old_text += _del_text(children[j])
            skip.add(j)
            j += 1

        # Whitespace-only del: only keep if there's an adjacent ins
        # (j now points just past the merged del run).
        next_sib = children[j] if j < len(children) else None
        prev_sib = children[i - 1] if i > 0 else None

        new_text = None
        if next_sib is not None and next_sib.tag == qn('w:ins') and not _is_rpr_ins(next_sib):
            # del-then-ins: the following ins is the replacement text.
            new_text = _ins_text(next_sib)
            skip.add(j)
        elif prev_sib is not None and prev_sib.tag == qn('w:ins') and not _is_rpr_ins(prev_sib):
            # ins-then-del: the preceding ins is the replacement text.
            new_text = _ins_text(prev_sib)

        if new_text is None:
            if not old_text.strip():
                skip.add(i)
                continue  # whitespace artefact with no counterpart
            # Pure deletion (no replacement) — record with empty new
            pairs.append((old_text, ''))
        else:
            pairs.append((old_text, new_text))

    return pairs
184
+
185
+
186
+ # ── Table change extraction ───────────────────────────────────────────────────
187
+
188
def _parse_table(tbl_elem, changes, section_heading=''):
    """Emit row_insert / text_replace changes for one table into *changes*.

    Fix: the cell-text fallback used ``if p`` on an lxml element.  Element
    truth testing reflects the child count (and is deprecated in lxml), so
    an empty fallback paragraph behaved differently from a None one.  It is
    replaced with an explicit ``p is not None`` test, matching the adjacent
    ``style = _style_val(p) if p is not None else None`` line.
    """
    header = _table_header(tbl_elem)
    header_key = header[:3]  # first 3 columns enough for matching
    rows = tbl_elem.findall(qn('w:tr'))

    for tr_idx, tr in enumerate(rows):
        trPr = tr.find(qn('w:trPr'))

        # ── Tracked row insertion ─────────────────────────────────────────
        if trPr is not None and trPr.find(qn('w:ins')) is not None:
            # Find preceding stable row for anchor
            after_anchor = ''
            for prev_idx in range(tr_idx - 1, -1, -1):
                prev_tr = rows[prev_idx]
                prev_trPr = prev_tr.find(qn('w:trPr'))
                if prev_trPr is None or prev_trPr.find(qn('w:ins')) is None:
                    after_anchor = _row_col0(prev_tr)
                    break

            cells = []
            for tc in tr.findall(qn('w:tc')):
                tcPr = tc.find(qn('w:tcPr'))

                # Width (w:tcW/@w:w, twentieths of a point)
                width = None
                if tcPr is not None:
                    tcW = tcPr.find(qn('w:tcW'))
                    if tcW is not None:
                        try:
                            width = int(tcW.get(qn('w:w'), 0))
                        except (ValueError, TypeError):
                            width = None

                # vMerge (no w:val attribute = continuation)
                is_vmerge = False
                if tcPr is not None:
                    vm = tcPr.find(qn('w:vMerge'))
                    if vm is not None and vm.get(qn('w:val')) is None:
                        is_vmerge = True

                # Text — prefer ins text, fall back to all text
                cell_ins_text = _ins_text(tc)
                p = tc.find('.//' + qn('w:p'))
                cell_text = cell_ins_text if cell_ins_text else (
                    _para_new_text(p) if p is not None else ''
                )
                style = _style_val(p) if p is not None else None

                cells.append({
                    'text': cell_text.strip(),
                    'width': width,
                    'vmerge': is_vmerge,
                    'style': style,
                })

            changes.append({
                'type': 'row_insert',
                'location': {
                    'kind': 'table_row',
                    'table_header': header_key,
                    'after_row_anchor': after_anchor,
                    'section_heading': section_heading,
                },
                'cells': cells,
            })
            continue

        # ── Cell-level text_replace ───────────────────────────────────────
        row_anchor = _row_col0(tr)
        tcs = tr.findall(qn('w:tc'))
        for col_idx, tc in enumerate(tcs):
            for p in tc.findall('.//' + qn('w:p')):
                for old_text, new_text in _extract_inline_replacements(p):
                    if not old_text:
                        continue
                    changes.append({
                        'type': 'text_replace',
                        'location': {
                            'kind': 'table_cell',
                            'table_header': header_key,
                            'row_anchor': row_anchor,
                            'col_idx': col_idx,
                            'section_heading': section_heading,
                        },
                        'old': old_text,
                        'new': new_text,
                    })
273
+
274
+
275
+ # ── Body paragraph extraction ─────────────────────────────────────────────────
276
+
277
def _parse_body(body, changes):
    """
    Walk direct children of w:body, emitting changes.

    Change types emitted:
      section_replace — a contiguous block of fully-deleted elements (para and/or
                        table, tracked at the paragraph-mark / row level) followed
                        immediately by a contiguous block of fully-inserted elements.
                        The raw XML of ALL those CR elements is stored verbatim so
                        the applicator can transplant them directly into the TS —
                        exactly what Word does on a copy-paste.
      text_replace    — an inline del+ins pair inside an otherwise-stable paragraph.
      para_insert     — one or more wholly-new paragraphs with no corresponding
                        deletion (rare; kept for backward compatibility).
    """
    from lxml import etree

    # Text of the last stable (non-tracked) paragraph seen — used as the
    # anchor for locating inserts/replacements in the target TS.
    prev_stable_text = ''

    # ── Section-replace accumulator ───────────────────────────────────────────
    # State machine over body children: stable → del → (sep) → ins → flush.
    sec_del = []          # fully-deleted elements (CR del block)
    sec_sep = []          # empty/separator paragraphs between del and ins blocks
    sec_ins = []          # fully-inserted elements (CR ins block)
    sec_state = 'stable'  # 'stable' | 'del' | 'sep' | 'ins'
    sec_anchor = ''

    def flush_section():
        # Emit one section_replace from the accumulated del/sep/ins blocks,
        # then reset the accumulator.  No-op when nothing was accumulated.
        nonlocal sec_state, sec_anchor
        if not sec_del and not sec_ins:
            sec_del.clear(); sec_sep.clear(); sec_ins.clear()
            sec_state = 'stable'
            return
        # The del_heading is the text content of the first deleted paragraph
        del_heading = ''
        for e in sec_del:
            tag = e.tag.split('}')[-1] if '}' in e.tag else e.tag
            if tag == 'p':
                t = _del_text(e).strip() or _para_orig_text(e).strip()
                if t:
                    del_heading = t
                    break
        # Serialize all elements for the manifest (del + sep + ins)
        all_elems = sec_del + sec_sep + sec_ins
        elements_xml = [etree.tostring(e, encoding='unicode') for e in all_elems]
        has_del_table = any(
            (e.tag.split('}')[-1] if '}' in e.tag else e.tag) == 'tbl'
            for e in sec_del
        )
        changes.append({
            'type': 'section_replace',
            'location': {
                'kind': 'body',
                'del_heading': del_heading,
                'has_del_table': has_del_table,
                'anchor_text': sec_anchor,
            },
            'elements_xml': elements_xml,
        })
        sec_del.clear(); sec_sep.clear(); sec_ins.clear()
        sec_state = 'stable'

    # ── Para-insert accumulator (for standalone new paragraphs) ───────────────
    insert_group = []

    def flush_group():
        # Emit one para_insert for the pending run of standalone inserted
        # paragraphs (anchored at the last stable paragraph).
        if not insert_group:
            return
        paras = [
            {'text': _para_new_text(p).strip(), 'style': _style_val(p)}
            for p in insert_group
        ]
        paras = [p for p in paras if p['text'] or p['style']]
        if paras:
            changes.append({
                'type': 'para_insert',
                'location': {
                    'kind': 'body',
                    'anchor_text': prev_stable_text,
                },
                'paragraphs': paras,
            })
        insert_group.clear()

    for elem in body:
        tag = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag

        if tag == 'p':
            is_del = _is_deleted_para(elem)
            is_ins = _is_inserted_para(elem)
            is_empty = not _para_orig_text(elem).strip() and not _para_new_text(elem).strip()

            if is_del:
                # Start or continue the del block
                if sec_state == 'ins':
                    flush_section()  # ins before del = two separate section_replaces
                if sec_state == 'stable':
                    flush_group()
                    sec_anchor = prev_stable_text
                sec_state = 'del'
                sec_del.append(elem)

            elif is_ins:
                if sec_state in ('del', 'sep'):
                    # ins block follows a del block → part of section_replace
                    sec_state = 'ins'
                    sec_ins.append(elem)
                elif sec_state == 'ins':
                    sec_ins.append(elem)
                else:
                    # Standalone ins paragraph (no preceding del block)
                    flush_group()  # (should already be empty)
                    insert_group.append(elem)

            elif is_empty:
                if sec_state == 'del':
                    # Separator between del and ins blocks
                    sec_state = 'sep'
                    sec_sep.append(elem)
                elif sec_state in ('sep', 'ins'):
                    sec_ins.append(elem)
                else:
                    # Empty para in stable region — ignore for anchoring
                    pass

            else:
                # Stable (or inline-changed) paragraph
                flush_section()
                flush_group()

                for old_text, new_text in _extract_inline_replacements(elem):
                    if not old_text:
                        continue
                    changes.append({
                        'type': 'text_replace',
                        'location': {
                            'kind': 'body_para',
                            'para_context': _para_orig_text(elem).strip(),
                        },
                        'old': old_text,
                        'new': new_text,
                    })

                orig = _para_orig_text(elem).strip()
                # Skip "[...]" omission markers — they are not usable anchors.
                if orig and not re.fullmatch(r'\[\.[\s\.]*\]', orig):
                    prev_stable_text = orig

        elif tag == 'tbl':
            if _is_fully_deleted_tbl(elem):
                if sec_state == 'ins':
                    flush_section()
                if sec_state == 'stable':
                    flush_group()
                    sec_anchor = prev_stable_text
                sec_state = 'del'
                sec_del.append(elem)

            elif _is_fully_inserted_tbl(elem):
                if sec_state in ('del', 'sep', 'ins'):
                    sec_state = 'ins'
                    sec_ins.append(elem)
                else:
                    # Standalone fully-inserted table (no del block) — treat as section_replace
                    flush_group()
                    sec_anchor = prev_stable_text
                    sec_state = 'ins'
                    sec_ins.append(elem)

            else:
                # Table with inline cell changes
                flush_section()
                flush_group()
                _parse_table(elem, changes, section_heading=prev_stable_text)

    # End of body: emit whatever is still pending.
    flush_section()
    flush_group()
452
+
453
+
454
+ # ── Public API ────────────────────────────────────────────────────────────────
455
+
456
def parse_cr(cr_path, output_json=None):
    """
    Parse all tracked changes in a CR DOCX.
    Returns list of change dicts. Optionally saves to JSON.
    """
    changes = []
    document = docx.Document(str(cr_path))
    _parse_body(document.element.body, changes)

    if output_json:
        serialized = json.dumps(changes, indent=2, ensure_ascii=False)
        Path(output_json).write_text(serialized, encoding='utf-8')
    return changes
471
+
472
+
473
+ # ── CLI ───────────────────────────────────────────────────────────────────────
474
+
475
def main():
    """CLI entry point: parse a CR DOCX and print or save the manifest."""
    parser = argparse.ArgumentParser(description='Parse CR DOCX tracked changes into JSON manifest.')
    parser.add_argument('cr_docx', help='CR DOCX file path')
    parser.add_argument('--output', default=None, help='Output JSON path (default: print to stdout)')
    opts = parser.parse_args()

    changes = parse_cr(opts.cr_docx, output_json=opts.output)

    if opts.output:
        print(f'Wrote {len(changes)} change(s) → {opts.output}')
    else:
        print(json.dumps(changes, indent=2, ensure_ascii=False))


if __name__ == '__main__':
    main()
scripts/docx_helpers.py ADDED
@@ -0,0 +1,494 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Reusable helpers for applying CR changes to TS DOCX files.
3
+ Supports both direct editing AND tracked changes (review mode).
4
+ """
5
+
6
+ import copy
7
+ import difflib
8
+ import re
9
+
10
+ from docx.oxml.ns import qn
11
+ from docx.oxml import OxmlElement
12
+
13
+ AUTHOR = "CR Application"
14
+ DATE = "2026-03-24T00:00:00Z"
15
+
16
+
17
+ # ── Revision ID counter ───────────────────────────────────────────────────────
18
+
19
def _get_max_id(doc):
    """Return the largest numeric revision id (any *}id attribute) in the body."""
    highest = 0
    for element in doc.element.body.iter():
        for attr_name, attr_value in element.attrib.items():
            if not attr_name.endswith('}id'):
                continue
            try:
                highest = max(highest, int(attr_value))
            except ValueError:
                pass  # non-numeric ids (e.g. bookmark names) are ignored
    return highest
29
+
30
class RevCounter:
    """Generates unique revision IDs that don't clash with existing ones."""

    def __init__(self, doc):
        # Start one past the highest id already present in the document.
        self._counter = _get_max_id(doc) + 1

    def next(self):
        """Return the next unused revision id as a string."""
        value = self._counter
        self._counter += 1
        return str(value)
39
+
40
+
41
+ # ── Helpers ───────────────────────────────────────────────────────────────────
42
+
43
def _make_t(text, tag='w:t'):
    """Build a text node (<w:t> by default, e.g. 'w:delText' for deletions)."""
    node = OxmlElement(tag)
    node.text = text or ''
    # Word strips edge whitespace unless xml:space="preserve" is set.
    if text and (text[0] in (' ', '\t') or text[-1] in (' ', '\t')):
        node.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
    return node
49
+
50
def _make_run(text):
    """Build a plain <w:r> run holding a single text node."""
    run = OxmlElement('w:r')
    run.append(_make_t(text))
    return run
54
+
55
def _make_para_el(text, style_val):
    """Build a <w:p> element with the given paragraph style and text run."""
    para = OxmlElement('w:p')
    props = OxmlElement('w:pPr')
    style = OxmlElement('w:pStyle')
    style.set(qn('w:val'), style_val)
    props.append(style)
    para.append(props)
    para.append(_make_run(text))
    return para
64
+
65
+
66
+ # ── Section mapping ───────────────────────────────────────────────────────────
67
+
68
def map_sections(doc, clause_numbers):
    """
    Print and return paragraphs belonging to the given clause numbers.
    Returns dict: {clause: [(index, para), ...]}
    """
    found = {clause: [] for clause in clause_numbers}
    current = None  # clause currently being collected, or None

    for idx, para in enumerate(doc.paragraphs):
        stripped = para.text.strip()
        style_name = para.style.name

        # Does this paragraph open one of the requested sections?
        hit = None
        for clause in clause_numbers:
            if clause in stripped and ('Heading' in style_name or 'heading' in style_name.lower()):
                hit = clause
                break

        if hit is not None:
            current = hit
            print(f'\n=== [{idx}] SECTION {hit} | style={style_name!r} ===')
            print(f' [{idx}] "{stripped}"')
            found[hit].append((idx, para))
        elif current:
            # A new heading ends the current section; body text is collected.
            if 'Heading' in style_name and stripped:
                print(f' --- end at [{idx}] ({style_name})')
                current = None
            elif stripped:
                print(f' [{idx}] style={style_name!r:16s} | "{stripped[:90]}"')
                found[current].append((idx, para))

    return found
99
+
100
+
101
def get_bullet_style_val(doc, fallback='B1'):
    """Return the first bullet style id ('B1', 'B2', …) used in doc, else fallback."""
    for para in doc.paragraphs:
        props = para._element.find(qn('w:pPr'))
        if props is None:
            continue
        style = props.find(qn('w:pStyle'))
        if style is None:
            continue
        val = style.get(qn('w:val'), '')
        # Bullet styles in 3GPP templates are named B1, B2, B3, ...
        if val.startswith('B') and val[1:].isdigit():
            return val
    return fallback
111
+
112
def get_style_val(para):
    """Return the paragraph's explicit style id, or 'Normal' when none is set."""
    props = para._element.find(qn('w:pPr'))
    if props is not None:
        style = props.find(qn('w:pStyle'))
        if style is not None:
            return style.get(qn('w:val'))
    return 'Normal'
119
+
120
+
121
+ # ══════════════════════════════════════════════════════════════════════════════
122
+ # DIRECT EDIT MODE (no track changes)
123
+ # ══════════════════════════════════════════════════════════════════════════════
124
+
125
def delete_para(para):
    """Remove a paragraph from the document entirely."""
    element = para._element
    parent = element.getparent()
    parent.remove(element)
129
+
130
def insert_para_after(ref_para, text, style_val='Normal'):
    """Insert one paragraph after ref_para. Returns the new element."""
    fresh = _make_para_el(text, style_val)
    ref_para._element.addnext(fresh)
    return fresh
135
+
136
def insert_paras_after(ref_para, items, style_val='Normal'):
    """
    Insert multiple paragraphs in order after ref_para using a moving pointer.
    items: list of str, or list of (text, style_val) tuples.
    Returns the last inserted element.
    """
    cursor = ref_para._element
    for item in items:
        if isinstance(item, tuple):
            text, sv = item
        else:
            text, sv = item, style_val
        fresh = _make_para_el(text, sv)
        cursor.addnext(fresh)
        cursor = fresh  # advance so the next paragraph lands after this one
    return cursor
149
+
150
def modify_para_text(para, old_text, new_text):
    """Replace old_text with new_text in a paragraph (collapses all runs)."""
    current = para.text
    if old_text not in current:
        raise ValueError(f"Not found: {old_text!r}\nIn: {current!r}")
    replaced = current.replace(old_text, new_text)
    parent = para._element
    # Drop every existing run, then re-emit the whole text as one run.
    for run in parent.findall(qn('w:r')):
        parent.remove(run)
    parent.append(_make_run(replaced))
    return replaced
161
+
162
+
163
+ # ══════════════════════════════════════════════════════════════════════════════
164
+ # TRACKED CHANGE MODE (review / redline mode)
165
+ # ══════════════════════════════════════════════════════════════════════════════
166
+
167
def _ins_attr(rev, author, date):
    """Attribute dict (id/author/date) for a <w:ins> tracked-insertion element."""
    attrs = {qn('w:id'): rev.next()}
    attrs[qn('w:author')] = author
    attrs[qn('w:date')] = date
    return attrs
169
+
170
def _del_attr(rev, author, date):
    """Attribute dict (id/author/date) for a <w:del> tracked-deletion element."""
    attrs = {qn('w:id'): rev.next()}
    attrs[qn('w:author')] = author
    attrs[qn('w:date')] = date
    return attrs
172
+
173
+
174
def tracked_insert_para_after(ref_para_or_el, text, style_val, rev,
                              author=AUTHOR, date=DATE):
    """
    Insert a new paragraph after ref_para_or_el with tracked insertion marks.
    Word will show it as an insertion in review mode.

    ref_para_or_el: python-docx Paragraph OR raw lxml <w:p> element (anchor).
    text:           plain text content of the new paragraph.
    style_val:      paragraph style id (e.g. 'Normal', 'B1', 'Heading 2').
    rev:            RevCounter supplying unique revision ids.
    Returns the new XML element (use as next ref for chained inserts).
    """
    new_p = OxmlElement('w:p')

    # Paragraph properties: mark the paragraph mark itself as inserted
    # (pPr > rPr > w:ins) so accepting/rejecting also handles the pilcrow.
    pPr = OxmlElement('w:pPr')
    pStyle = OxmlElement('w:pStyle')
    pStyle.set(qn('w:val'), style_val)
    pPr.append(pStyle)
    rPr = OxmlElement('w:rPr')
    ins_mark = OxmlElement('w:ins')
    for k, v in _ins_attr(rev, author, date).items():
        ins_mark.set(k, v)
    rPr.append(ins_mark)
    pPr.append(rPr)
    new_p.append(pPr)

    # Content wrapped in <w:ins> so the run text itself is tracked.
    ins = OxmlElement('w:ins')
    for k, v in _ins_attr(rev, author, date).items():
        ins.set(k, v)
    ins.append(_make_run(text))
    new_p.append(ins)

    # Accept either a Paragraph (has ._element) or a raw element as anchor.
    ref_el = ref_para_or_el if not hasattr(ref_para_or_el, '_element') else ref_para_or_el._element
    ref_el.addnext(new_p)
    return new_p
206
+
207
+
208
def tracked_insert_paras_after(ref_para, items, rev, author=AUTHOR, date=DATE):
    """
    Insert multiple paragraphs in order with tracked insertion marks.
    items: list of str, or list of (text, style_val) tuples.
    Uses a moving pointer β€” order is preserved.
    Returns the last inserted element.
    """
    cursor = ref_para._element
    for item in items:
        if isinstance(item, tuple):
            text, style = item
        else:
            text, style = item, 'Normal'
        # Each insert returns the new element, which anchors the next one.
        cursor = tracked_insert_para_after(cursor, text, style, rev, author, date)
    return cursor
221
+
222
+
223
def tracked_delete_para(para, rev, author=AUTHOR, date=DATE):
    """
    Mark a paragraph as deleted using tracked change marks.
    The paragraph stays in the document but Word shows it as struck-through red.

    para:   python-docx Paragraph to mark as deleted.
    rev:    RevCounter supplying unique revision ids.
    """
    p_el = para._element

    # Mark the paragraph mark as deleted (in pPr > rPr) so rejecting/accepting
    # the change also removes the pilcrow, merging with the next paragraph.
    pPr = p_el.find(qn('w:pPr'))
    if pPr is None:
        pPr = OxmlElement('w:pPr')
        p_el.insert(0, pPr)
    rPr = pPr.find(qn('w:rPr'))
    if rPr is None:
        rPr = OxmlElement('w:rPr')
        pPr.append(rPr)
    del_mark = OxmlElement('w:del')
    for k, v in _del_attr(rev, author, date).items():
        del_mark.set(k, v)
    rPr.append(del_mark)

    # Wrap every run in <w:del> and change <w:t> β†’ <w:delText>
    # (OOXML requires deleted text to live in w:delText nodes).
    runs = list(p_el.findall(qn('w:r')))
    for r in runs:
        # Remember the run's position BEFORE detaching it, so the wrapper
        # can be re-inserted at the same index.
        idx = list(p_el).index(r)
        for t_el in r.findall(qn('w:t')):
            del_t = _make_t(t_el.text, 'w:delText')
            r.remove(t_el)
            r.append(del_t)
        del_wrap = OxmlElement('w:del')
        for k, v in _del_attr(rev, author, date).items():
            del_wrap.set(k, v)
        p_el.remove(r)
        del_wrap.append(r)
        p_el.insert(idx, del_wrap)
258
+
259
+
260
def tracked_modify_para(para, old_text, new_text, rev, author=AUTHOR, date=DATE):
    """
    Replace old_text with new_text using tracked del+ins marks.
    Splits the paragraph into: [before][<w:del>old</w:del>][<w:ins>new</w:ins>][after]
    Word shows the old text struck through and new text underlined.

    Raises ValueError if old_text does not occur in the paragraph text.
    NOTE: only the FIRST occurrence of old_text is replaced (str.partition);
    run-level formatting of the original paragraph is collapsed.
    """
    full = para.text
    if old_text not in full:
        raise ValueError(f"Not found: {old_text!r}\nIn: {full!r}")

    before, _, after = full.partition(old_text)
    p_el = para._element

    # Remove all existing runs β€” the text is re-emitted in four segments below.
    for r in p_el.findall(qn('w:r')):
        p_el.remove(r)

    # Before (unchanged)
    if before:
        p_el.append(_make_run(before))

    # Tracked deletion of old text (w:delText is required inside w:del)
    del_el = OxmlElement('w:del')
    for k, v in _del_attr(rev, author, date).items():
        del_el.set(k, v)
    r_del = OxmlElement('w:r')
    r_del.append(_make_t(old_text, 'w:delText'))
    del_el.append(r_del)
    p_el.append(del_el)

    # Tracked insertion of new text
    ins_el = OxmlElement('w:ins')
    for k, v in _ins_attr(rev, author, date).items():
        ins_el.set(k, v)
    ins_el.append(_make_run(new_text))
    p_el.append(ins_el)

    # After (unchanged)
    if after:
        p_el.append(_make_run(after))
300
+
301
+
302
def _char_diff(old, new):
    """
    Return a list of (op, text) tuples for a minimal character-level diff.
    op is one of 'keep', 'del', 'ins'.

    Strategy: first tokenize into digit-runs, letter-runs, and single separator
    characters so that separators like '-' or '.' are kept intact as their own
    tokens; then match tokens with SequenceMatcher; apply char-level diff
    within each replaced token pair for maximum granularity; finally coalesce
    adjacent segments that share the same op.

    Examples:
        ('V18.2.0', 'V18.3.0') β†’
            [('keep','V18.'), ('del','2'), ('ins','3'), ('keep','.0')]
        ('(2024-11)', '(2026-04)') β†’
            [('keep','(202'), ('del','4'), ('ins','6'), ('keep','-'),
             ('del','11'), ('ins','04'), ('keep',')')]
    """
    old_tokens = re.findall(r'\d+|[A-Za-z]+|.', old)
    new_tokens = re.findall(r'\d+|[A-Za-z]+|.', new)

    raw = []
    matcher = difflib.SequenceMatcher(None, old_tokens, new_tokens, autojunk=False)
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        old_span = ''.join(old_tokens[i1:i2])
        new_span = ''.join(new_tokens[j1:j2])
        if tag == 'equal':
            raw.append(('keep', old_span))
        elif tag == 'replace':
            # Within each replaced token span, apply char-level diff for finer granularity
            cmatcher = difflib.SequenceMatcher(None, old_span, new_span, autojunk=False)
            for ctag, ci1, ci2, cj1, cj2 in cmatcher.get_opcodes():
                if ctag == 'equal':
                    raw.append(('keep', old_span[ci1:ci2]))
                elif ctag == 'replace':
                    raw.append(('del', old_span[ci1:ci2]))
                    raw.append(('ins', new_span[cj1:cj2]))
                elif ctag == 'delete':
                    raw.append(('del', old_span[ci1:ci2]))
                elif ctag == 'insert':
                    raw.append(('ins', new_span[cj1:cj2]))
        elif tag == 'delete':
            raw.append(('del', old_span))
        elif tag == 'insert':
            raw.append(('ins', new_span))

    # FIX: previously adjacent segments with the same op were emitted
    # separately (e.g. ('keep','('), ('keep','202')), producing needlessly
    # fragmented XML runs downstream and diverging from the documented
    # examples above. Coalesce neighbouring same-op segments.
    ops = []
    for op, text in raw:
        if not text:
            continue
        if ops and ops[-1][0] == op:
            ops[-1] = (op, ops[-1][1] + text)
        else:
            ops.append((op, text))
    return ops
347
+
348
+
349
def tracked_modify_para_multi(para, replacements, rev, author=AUTHOR, date=DATE):
    """
    Apply multiple tracked del+ins replacements in a single paragraph pass.
    replacements: list of (old_text, new_text) tuples, applied in order of appearance.
    Each replacement uses character-level diff so only the minimally changed characters
    are marked as del/ins, with common characters kept as plain runs in between.
    Use this instead of calling tracked_modify_para twice (which would corrupt the XML).

    Raises ValueError if any old_text is absent from the paragraph text.
    NOTE(review): replacements are assumed to occur left-to-right in the text;
    an old_text whose match was already consumed by an earlier replacement is
    silently skipped (find() returns -1) β€” confirm this is the intended policy.
    """
    full = para.text
    # Validate everything up front so the paragraph is never half-modified.
    for old_text, _ in replacements:
        if old_text not in full:
            raise ValueError(f"Not found: {old_text!r}\nIn: {full!r}")

    p_el = para._element

    # Remove all existing runs (run-level formatting is collapsed).
    for r in p_el.findall(qn('w:r')):
        p_el.remove(r)

    # Walk through the full text, emitting plain runs and char-level del+ins ops
    remaining = full
    for old_text, new_text in replacements:
        idx = remaining.find(old_text)
        if idx == -1:
            continue
        before = remaining[:idx]
        remaining = remaining[idx + len(old_text):]

        if before:
            p_el.append(_make_run(before))

        for op, text in _char_diff(old_text, new_text):
            if op == 'keep':
                p_el.append(_make_run(text))
            elif op == 'del':
                del_el = OxmlElement('w:del')
                for k, v in _del_attr(rev, author, date).items():
                    del_el.set(k, v)
                r_del = OxmlElement('w:r')
                r_del.append(_make_t(text, 'w:delText'))
                del_el.append(r_del)
                p_el.append(del_el)
            elif op == 'ins':
                ins_el = OxmlElement('w:ins')
                for k, v in _ins_attr(rev, author, date).items():
                    ins_el.set(k, v)
                ins_el.append(_make_run(text))
                p_el.append(ins_el)

    # Emit any trailing text
    if remaining:
        p_el.append(_make_run(remaining))
401
+
402
+
403
def tracked_insert_table_row(tbl, cell_texts, rev, author=AUTHOR, date=DATE):
    """
    Insert a new row immediately after the last non-empty row in tbl, as a
    tracked insertion. Empty pre-allocated rows at the table bottom are skipped
    so the new content appears directly under the previous entry.

    The new row is deep-copied from the last content row so that ALL formatting
    (cell widths, borders, shading, paragraph style, run font/size) is inherited β€”
    exactly as clicking "Insert Row Below" does in Word.

    tbl: python-docx Table object
    cell_texts: list of strings, one per column (extra cells are left empty)
    """
    tbl_el = tbl._tbl
    all_trs = tbl_el.findall(qn('w:tr'))

    # Find the last row that contains at least one non-empty <w:t> node.
    # This skips pre-allocated blank rows at the table bottom.
    last_content_tr = all_trs[-1]
    for tr in reversed(all_trs):
        if any(t.text and t.text.strip() for t in tr.findall('.//' + qn('w:t'))):
            last_content_tr = tr
            break

    # Deep-copy the last content row β€” inherits all cell/paragraph/run formatting.
    new_tr = copy.deepcopy(last_content_tr)

    # Mark the row itself as a tracked insertion in <w:trPr>.
    trPr = new_tr.find(qn('w:trPr'))
    if trPr is None:
        trPr = OxmlElement('w:trPr')
        new_tr.insert(0, trPr)
    # Drop any w:ins copied over from the template row before adding ours.
    for child in list(trPr):
        if child.tag == qn('w:ins'):
            trPr.remove(child)
    tr_ins = OxmlElement('w:ins')
    for k, v in _ins_attr(rev, author, date).items():
        tr_ins.set(k, v)
    trPr.append(tr_ins)

    # For each cell: extract the existing run's rPr, clear text content, insert new text.
    cells_in_new_tr = new_tr.findall(qn('w:tc'))
    for i, tc in enumerate(cells_in_new_tr):
        p = tc.find('.//' + qn('w:p'))
        if p is None:
            continue

        # Capture the first run's rPr (font size, bold, etc.) before clearing.
        first_run_rpr = None
        for r in list(p.iter(qn('w:r'))):
            rpr = r.find(qn('w:rPr'))
            if rpr is not None:
                first_run_rpr = copy.deepcopy(rpr)
                break

        # Remove all non-pPr children (runs, ins, del, hyperlinks, etc.)
        for child in list(p):
            if child.tag != qn('w:pPr'):
                p.remove(child)

        # Ensure pPr exists with a paragraph-mark ins tracking element.
        pPr = p.find(qn('w:pPr'))
        if pPr is None:
            pPr = OxmlElement('w:pPr')
            p.insert(0, pPr)
        rPr = pPr.find(qn('w:rPr'))
        if rPr is None:
            rPr = OxmlElement('w:rPr')
            pPr.append(rPr)
        for child in list(rPr):
            if child.tag == qn('w:ins'):
                rPr.remove(child)
        p_ins_mark = OxmlElement('w:ins')
        for k, v in _ins_attr(rev, author, date).items():
            p_ins_mark.set(k, v)
        rPr.append(p_ins_mark)

        # Build new run, re-using the inherited rPr so font size / style matches.
        r_new = OxmlElement('w:r')
        if first_run_rpr is not None:
            r_new.append(first_run_rpr)
        text = cell_texts[i] if i < len(cell_texts) else ''
        r_new.append(_make_t(text))

        # Wrap the run in a tracked-insertion element.
        ins_el = OxmlElement('w:ins')
        for k, v in _ins_attr(rev, author, date).items():
            ins_el.set(k, v)
        ins_el.append(r_new)
        p.append(ins_el)

    last_content_tr.addnext(new_tr)
scripts/fetch_crs.py ADDED
@@ -0,0 +1,487 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ fetch_crs.py β€” Download CRs and TSs from a 3GPP/ETSI Excel contribution list.
4
+
5
+ Usage:
6
+ python3 fetch_crs.py <excel_path> <person_name> [--output-dir DIR]
7
+
8
+ Steps:
9
+ 1. Parse Excel, filter Accepted CRs by person name
10
+ 2. Download CR DOCXs via docfinder /find/tdoc/download
11
+ 3. Parse CR cover pages to extract target TS spec + version
12
+ 4. Download TS DOCXs via docfinder /find/docx
13
+ 5. Print summary report
14
+ """
15
+
16
+ import argparse
17
+ import os
18
+ import re
19
+ import sys
20
+ import time
21
+ import zipfile
22
+ from pathlib import Path
23
+
24
+ import requests
25
+
26
+ BASE_URL = "https://organizedprogrammers-docfinder.hf.space"
27
+ _proxy = os.environ.get("http_proxy") or None
28
+ PROXIES = {"http": _proxy, "https": os.environ.get("https_proxy") or None}
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Path helpers
33
+ # ---------------------------------------------------------------------------
34
+
35
def wsl_path(p: str) -> str:
    """Convert Windows path (C:\\...) to WSL path (/mnt/c/...) if needed."""
    candidate = p.strip()
    looks_windows = (
        len(candidate) >= 2
        and candidate[1] == ":"
        and candidate[0].isalpha()
    )
    if not looks_windows:
        return candidate
    drive_letter = candidate[0].lower()
    tail = candidate[2:].replace("\\", "/")
    return f"/mnt/{drive_letter}{tail}"
43
+
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # Step 1 β€” Parse Excel
47
+ # ---------------------------------------------------------------------------
48
+
49
def parse_excel(excel_path: str, person_name: str):
    """
    Return list of (uid, title) for Accepted CRs matching person_name.
    Handles both .xls and .xlsx.
    """
    path = Path(wsl_path(excel_path))
    suffix = path.suffix.lower()

    # Dispatch on extension β€” the two formats need different reader libraries.
    if suffix == ".xls":
        return _parse_xls(path, person_name)
    if suffix == ".xlsx":
        return _parse_xlsx(path, person_name)
    raise ValueError(f"Unsupported file extension: {suffix!r}. Expected .xls or .xlsx")
63
+
64
+
65
def _name_pattern(name: str) -> re.Pattern:
    """Whole-word, case-insensitive pattern matching a person's name."""
    escaped = re.escape(name)
    return re.compile(rf"\b{escaped}\b", re.IGNORECASE)
67
+
68
+
69
def _parse_xls(path: Path, person_name: str):
    """Parse a legacy .xls contribution list.

    Returns a list of (uid, title) tuples for rows where Type == "CR",
    Status == "Accepted" and SubmittedBy contains person_name as a whole
    word (case-insensitive).
    """
    try:
        import xlrd
    except ImportError:
        sys.exit("ERROR: xlrd is not installed. Run: pip install xlrd")

    wb = xlrd.open_workbook(str(path))
    # Try "Contributions" sheet first, fall back to first sheet
    try:
        ws = wb.sheet_by_name("Contributions")
    except xlrd.XLRDError:
        ws = wb.sheet_by_index(0)

    # Row 0 is headers; row 1 is an empty duplicate β€” skip it
    headers = [str(ws.cell_value(0, c)).strip() for c in range(ws.ncols)]
    col = {h: i for i, h in enumerate(headers)}

    def _col(*names):
        # BUG FIX: the previous `col.get(a) or col.get(b)` chain treated a
        # column located at index 0 as missing (0 is falsy) and could raise
        # a spurious "Column not found". Test key presence instead.
        return next((col[n] for n in names if n in col), None)

    uid_col = _col("Uid", "UID", "uid")
    type_col = _col("Type", "type")
    status_col = _col("Status", "status")
    by_col = _col("SubmittedBy", "Submitted By", "submittedby")
    title_col = _col("Title", "title")

    for name, c in [("Uid", uid_col), ("Type", type_col),
                    ("Status", status_col), ("SubmittedBy", by_col)]:
        if c is None:
            raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")

    pattern = _name_pattern(person_name)
    results = []

    for r in range(2, ws.nrows):  # skip header + empty duplicate
        uid = str(ws.cell_value(r, uid_col)).strip()
        doc_type = str(ws.cell_value(r, type_col)).strip()
        status = str(ws.cell_value(r, status_col)).strip()
        submitted_by = str(ws.cell_value(r, by_col)).strip()
        title = str(ws.cell_value(r, title_col)).strip() if title_col is not None else ""

        # Keep only accepted CRs submitted by the requested person.
        if doc_type != "CR":
            continue
        if status != "Accepted":
            continue
        if not pattern.search(submitted_by):
            continue

        results.append((uid, title))

    return results
117
+
118
+
119
def _parse_xlsx(path: Path, person_name: str):
    """Parse a modern .xlsx contribution list.

    Returns a list of (uid, title) tuples for rows where Type == "CR",
    Status == "Accepted" and SubmittedBy contains person_name as a whole
    word (case-insensitive).
    """
    try:
        import openpyxl
    except ImportError:
        sys.exit("ERROR: openpyxl is not installed. Run: pip install openpyxl")

    wb = openpyxl.load_workbook(str(path), read_only=True, data_only=True)
    ws = wb["Contributions"] if "Contributions" in wb.sheetnames else wb.active

    rows = iter(ws.iter_rows(values_only=True))

    # Row 0: headers
    header_row = next(rows)
    headers = [str(h).strip() if h is not None else "" for h in header_row]
    col = {h: i for i, h in enumerate(headers)}

    # Row 1: empty duplicate β€” skip
    next(rows, None)

    def _col(*names):
        # BUG FIX: the previous `col.get(a) or col.get(b)` chain treated a
        # column located at index 0 as missing (0 is falsy) and could raise
        # a spurious "Column not found". Test key presence instead.
        return next((col[n] for n in names if n in col), None)

    uid_col = _col("Uid", "UID", "uid")
    type_col = _col("Type", "type")
    status_col = _col("Status", "status")
    by_col = _col("SubmittedBy", "Submitted By", "submittedby")
    title_col = _col("Title", "title")

    for name, c in [("Uid", uid_col), ("Type", type_col),
                    ("Status", status_col), ("SubmittedBy", by_col)]:
        if c is None:
            raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")

    pattern = _name_pattern(person_name)
    results = []

    for row in rows:
        def cell(c):
            # Rows in read-only mode can be shorter than the header row.
            v = row[c] if c < len(row) else None
            return str(v).strip() if v is not None else ""

        uid = cell(uid_col)
        doc_type = cell(type_col)
        status = cell(status_col)
        submitted_by = cell(by_col)
        title = cell(title_col) if title_col is not None else ""

        # Keep only accepted CRs submitted by the requested person.
        if not uid:
            continue
        if doc_type != "CR":
            continue
        if status != "Accepted":
            continue
        if not pattern.search(submitted_by):
            continue

        results.append((uid, title))

    return results
175
+
176
+
177
+ # ---------------------------------------------------------------------------
178
+ # Step 2 β€” Download CR DOCXs
179
+ # ---------------------------------------------------------------------------
180
+
181
def download_cr(uid: str, cr_dir: Path):
    """
    Download CR DOCX for the given UID.

    Returns:
        (docx_path, note) β€” docx_path is the file to use for parsing
        note is a human-readable string for the summary
    Returns (None, error_msg) on failure.
    """
    dest = cr_dir / f"{uid}.docx"
    if dest.exists():
        return dest, "already existed"

    try:
        resp = requests.post(
            f"{BASE_URL}/find/tdoc/download",
            json={"doc_id": uid},
            proxies=PROXIES,
            timeout=60,
        )
    except requests.RequestException as exc:
        return None, f"network error: {exc}"

    if not resp.ok:
        return None, f"HTTP {resp.status_code}"

    payload = resp.content
    if not payload:
        return None, "empty response"

    dest.write_bytes(payload)

    # The server sometimes returns a ZIP container wrapping the real DOCX;
    # extract the first .docx entry when that happens.
    if payload.startswith(b"PK\x03\x04"):
        try:
            with zipfile.ZipFile(dest) as zf:
                inner = [n for n in zf.namelist() if n.endswith(".docx")]
                if inner:
                    extracted_path = cr_dir / f"{uid}_extracted.docx"
                    with zf.open(inner[0]) as src, open(extracted_path, "wb") as dst:
                        dst.write(src.read())
                    return extracted_path, "extracted from ZIP"
        except zipfile.BadZipFile:
            pass  # Not actually a ZIP despite magic bytes β€” treat as raw DOCX

    return dest, "downloaded"
229
+
230
+
231
+ # ---------------------------------------------------------------------------
232
+ # Step 3 β€” Parse CR Cover Pages
233
+ # ---------------------------------------------------------------------------
234
+
235
+ SPEC_PATTERN = re.compile(r"^\d{3}\s\d{3}$")
236
+ VERSION_PATTERN = re.compile(r"^\d+\.\d+\.\d+$")
237
+
238
+
239
def parse_cr_cover(docx_path: Path):
    """
    Parse the CR cover table (tables[0]) to extract (spec_number, version).

    Returns (spec_number, version) e.g. ("102 221", "18.3.0")
    Returns (None, None) if parsing fails.
    """
    try:
        from docx import Document
    except ImportError:
        sys.exit("ERROR: python-docx is not installed. Run: pip install python-docx")

    try:
        doc = Document(str(docx_path))
    except Exception:
        return None, None

    if not doc.tables:
        return None, None

    # Flatten all non-empty cell texts of the cover table, preserving order.
    texts = []
    for row in doc.tables[0].rows:
        for c in row.cells:
            stripped = c.text.strip()
            if stripped:
                texts.append(stripped)

    spec_number = None
    version = None

    for i, text in enumerate(texts):
        # Spec number looks like "NNN NNN".
        if spec_number is None and SPEC_PATTERN.match(text):
            spec_number = text

        # Version lives in the cell right after the "Current version:" label.
        if text == "Current version:" and i + 1 < len(texts):
            candidate = texts[i + 1]
            if VERSION_PATTERN.match(candidate):
                version = candidate

        # Also accept the label without a trailing colon.
        if version is None and text in ("Current version:", "Current version"):
            if i + 1 < len(texts) and VERSION_PATTERN.match(texts[i + 1]):
                version = texts[i + 1]

    return spec_number, version
289
+
290
+
291
+ # ---------------------------------------------------------------------------
292
+ # Step 4 β€” Download TS DOCXs
293
+ # ---------------------------------------------------------------------------
294
+
295
def _is_html(resp: requests.Response) -> bool:
    """Return True if the response body is an HTML page (e.g. HF Space loading page)."""
    if "text/html" in resp.headers.get("content-type", ""):
        return True
    head = resp.content[:5].lower()
    return head in (b"<!doc", b"<html")
301
+
302
+
303
def download_ts(spec_number: str, version: str, ts_dir: Path,
                max_retries: int = 3, retry_delay: int = 10):
    """
    Download TS DOCX for spec_number (e.g. "102 221") and version (e.g. "18.3.0").

    Retries up to max_retries times when the HF Space returns an HTML loading page
    instead of the DOCX binary (happens on cold-start / brief restarts).

    Returns (filename, note) or (None, error_msg).
    """
    spec_no_space = spec_number.replace(" ", "")
    filename = f"ts_{spec_no_space}_v{version}.docx"
    dest = ts_dir / filename

    if dest.exists():
        return filename, "already existed"

    last_error = "no attempts made"
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.post(
                f"{BASE_URL}/find/docx",
                json={"doc_id": spec_number, "version": version},
                proxies=PROXIES,
                timeout=120,
            )
        except requests.RequestException as e:
            # NOTE(review): network/HTTP errors abort immediately (no retry);
            # only the HTML splash page triggers a retry β€” confirm intended.
            return None, f"network error: {e}"

        if not resp.ok:
            return None, f"HTTP {resp.status_code}"

        content = resp.content
        if not content:
            return None, "empty response"

        # Detect HTML splash page (HF Space cold-start) β€” retry after a delay
        if _is_html(resp):
            last_error = f"got HTML instead of DOCX (attempt {attempt}/{max_retries})"
            if attempt < max_retries:
                print(f"\n [retry in {retry_delay}s β€” HF Space loading…]", flush=True)
                time.sleep(retry_delay)
                continue
            return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r}) after {max_retries} attempts"

        # Good binary response β€” write first, then validate; invalid files
        # are unlinked so a later run retries from scratch.
        dest.write_bytes(content)

        if content[:2] != b"PK":
            dest.unlink()
            return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r})"

        # Verify the TS contains the expected spec number in its first paragraph
        try:
            import docx as _docx
            _doc = _docx.Document(dest)
            first_para = _doc.paragraphs[0].text if _doc.paragraphs else ''
            if spec_no_space not in first_para.replace(' ', ''):
                dest.unlink()
                return None, f"wrong TS returned by API: got {first_para[:80]!r} (expected spec {spec_no_space})"
        except Exception:
            pass  # Trust the ZIP check above

        note = "downloaded" if attempt == 1 else f"downloaded (after {attempt} attempts)"
        return filename, note

    return None, last_error
370
+
371
+
372
+ # ---------------------------------------------------------------------------
373
+ # Main
374
+ # ---------------------------------------------------------------------------
375
+
376
def main():
    """Orchestrate the fetch pipeline.

    Steps: parse the Excel contribution list, download the accepted CRs,
    parse their cover pages for the target spec/version, download those TSs,
    then print a summary report.
    """
    parser = argparse.ArgumentParser(
        description="Download CRs and TSs from a 3GPP/ETSI Excel contribution list."
    )
    parser.add_argument("excel_path", help="Path to .xls or .xlsx contribution list")
    parser.add_argument("person_name", help="Name to search for in SubmittedBy column")
    parser.add_argument(
        "--output-dir",
        default=str(Path.home() / "CR_Processing"),
        help="Base output directory (default: ~/CR_Processing)",
    )
    args = parser.parse_args()

    excel_path = wsl_path(args.excel_path)
    person_name = args.person_name
    output_dir = Path(wsl_path(args.output_dir)).expanduser()

    cr_dir = output_dir / "CRs"
    ts_dir = output_dir / "TS"
    cr_dir.mkdir(parents=True, exist_ok=True)
    ts_dir.mkdir(parents=True, exist_ok=True)

    # --- Step 1: Parse Excel ---
    print(f"Parsing Excel: {excel_path}")
    print(f"Filtering for: {person_name!r} | Type=CR | Status=Accepted\n")

    try:
        cr_list = parse_excel(excel_path, person_name)
    except Exception as e:
        sys.exit(f"ERROR parsing Excel: {e}")

    print(f"Found {len(cr_list)} matching CR(s).\n")

    if not cr_list:
        print("Nothing to download.")
        return

    # --- Step 2: Download CR DOCXs ---
    print("Downloading CRs...")
    cr_results = []  # list of (uid, docx_path_or_None, note)

    for uid, title in cr_list:
        print(f" [{uid}] ", end="", flush=True)
        docx_path, note = download_cr(uid, cr_dir)
        cr_results.append((uid, docx_path, note))
        if docx_path:
            print(f"OK ({note}) β€” {docx_path.name}")
        else:
            print(f"FAILED β€” {note}")

    print()

    # --- Step 3: Parse cover pages ---
    print("Parsing CR cover pages...")
    ts_targets = {}  # (spec_number, version) -> list of uids

    for uid, docx_path, note in cr_results:
        if docx_path is None:
            continue
        spec_number, version = parse_cr_cover(docx_path)
        if spec_number and version:
            key = (spec_number, version)
            ts_targets.setdefault(key, []).append(uid)
            print(f" [{uid}] β†’ TS {spec_number} v{version}")
        else:
            print(f" [{uid}] WARNING: could not parse cover page (spec/version not found)")

    print()

    # --- Step 4: Download TSs ---
    print("Downloading TSs...")
    ts_results = []  # list of (spec_number, version, filename_or_None, note)

    for (spec_number, version), uids in ts_targets.items():
        print(f" [TS {spec_number} v{version}] ", end="", flush=True)
        filename, note = download_ts(spec_number, version, ts_dir)
        ts_results.append((spec_number, version, filename, note))
        if filename:
            # BUG FIX: previously printed the literal "(unknown)" even though
            # the downloaded filename is available right here.
            print(f"OK ({note}) β€” {filename}")
        else:
            print(f"FAILED β€” {note}")

    print()

    # --- Step 5: Summary ---
    print("=" * 50)
    print("=== fetch-crs summary ===")
    print(f"Person: {person_name}")
    print(f"Excel: {excel_path}")
    print(f"CRs found: {len(cr_list)} (Accepted, Type=CR)")
    print()
    print("CRs downloaded:")
    for uid, docx_path, note in cr_results:
        if docx_path:
            print(f" βœ“ {docx_path.name} [{note}]")
        else:
            print(f" βœ— {uid} β€” {note}")

    print()
    print("TSs downloaded:")
    for spec_number, version, filename, note in ts_results:
        if filename:
            # BUG FIX: same "(unknown)" placeholder replaced with the filename.
            print(f" βœ“ {filename} [{note}]")
        else:
            print(f" βœ— ts_{spec_number.replace(' ', '')} v{version} β€” {note}")

    print()
    print(f"Output: {output_dir}/")


if __name__ == "__main__":
    main()
scripts/finalize_ts.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ finalize_ts.py β€” Add tracked-change metadata updates to a TS DOCX after CR application.
4
+
5
+ Three edits are made (all as tracked changes):
6
+ 1. New row in the Change History table (second-to-last table, Annex V)
7
+ 2. New row in the History table (last table, last page)
8
+ 3. Version + date update in the first paragraph (title)
9
+
10
+ Usage:
11
+ python3 finalize_ts.py <ts_docx> <cr_docx> [--author "Name"] [--output <path>]
12
+ """
13
+
14
+ import argparse
15
+ import re
16
+ import sys
17
+ from datetime import date, timedelta
18
+ from pathlib import Path
19
+
20
+ import docx
21
+
22
+ sys.path.insert(0, str(Path(__file__).parent))
23
+ from docx_helpers import (
24
+ RevCounter,
25
+ tracked_insert_table_row,
26
+ tracked_modify_para_multi,
27
+ AUTHOR,
28
+ DATE,
29
+ )
30
+
31
+
32
+ # ── Path helpers ──────────────────────────────────────────────────────────────
33
+
34
+ def to_wsl_path(p: str) -> str:
35
+ """Convert Windows paths (C:\\...) to WSL paths (/mnt/c/...)."""
36
+ if p.startswith(('C:\\', 'c:\\', 'D:\\', 'd:\\')):
37
+ drive = p[0].lower()
38
+ rest = p[2:].replace('\\', '/')
39
+ return f'/mnt/{drive}{rest}'
40
+ return p
41
+
42
+
43
+ # ── Date / version helpers ────────────────────────────────────────────────────
44
+
45
+ def compute_pub_date():
46
+ """
47
+ Return (yyyy-mm, "Month YYYY") using the 5-day rule:
48
+ if today is within 5 days of the next month's first day, use next month;
49
+ otherwise use the current month.
50
+ """
51
+ today = date.today()
52
+ first_next = (today.replace(day=1) + timedelta(days=32)).replace(day=1)
53
+ days_until = (first_next - today).days
54
+ target = first_next if days_until <= 5 else today.replace(day=1)
55
+ return target.strftime('%Y-%m'), target.strftime('%B %Y')
56
+
57
+
58
+ def derive_new_version(v: str) -> str:
59
+ """Increment middle component of X.Y.Z β†’ X.(Y+1).0."""
60
+ parts = v.split('.')
61
+ parts[1] = str(int(parts[1]) + 1)
62
+ parts[2] = '0'
63
+ return '.'.join(parts)
64
+
65
+
66
+ # ── CR metadata extraction ────────────────────────────────────────────────────
67
+
68
+ def extract_cr_metadata(cr_docx_path: str) -> dict:
69
+ """
70
+ Open the CR DOCX and read metadata from tables[0] (cover page table).
71
+ Returns dict with keys:
72
+ meeting_id, uid, cr_num, rev, cat, title, current_version
73
+ """
74
+ doc = docx.Document(cr_docx_path)
75
+ if not doc.tables:
76
+ raise ValueError('CR has no tables β€” cannot extract metadata')
77
+
78
+ tbl = doc.tables[0]
79
+
80
+ # Collect all cell texts for scanning
81
+ cells = []
82
+ for row in tbl.rows:
83
+ for cell in row.cells:
84
+ cells.append(cell.text.strip())
85
+
86
+ meta = {
87
+ 'meeting_id': '',
88
+ 'uid': '',
89
+ 'cr_num': '',
90
+ 'rev': '',
91
+ 'cat': '',
92
+ 'title': '',
93
+ 'current_version': '',
94
+ }
95
+
96
+ # --- Meeting ID ---
97
+ # Find cell containing "Meeting #" and parse e.g. "ETSI TC SET Meeting #121, Edinburgh..."
98
+ meeting_text = ''
99
+ for c in cells:
100
+ if 'Meeting #' in c or 'Meeting#' in c:
101
+ meeting_text = c
102
+ break
103
+
104
+ if meeting_text:
105
+ # Body: word before "Meeting" (e.g. "SET")
106
+ body_match = re.search(r'(\w+)\s+Meeting\s*#', meeting_text)
107
+ body = body_match.group(1) if body_match else ''
108
+ # Number: digits after "#"
109
+ num_match = re.search(r'Meeting\s*#\s*(\d+)', meeting_text)
110
+ number = num_match.group(1) if num_match else ''
111
+ meta['meeting_id'] = f'{body}-{number}' if body and number else meeting_text
112
+
113
+ # --- UID ---
114
+ # Pattern like SET(26)000019r1 or similar
115
+ uid_pat = re.compile(r'[A-Z]+\(\d+\)\d+\S*')
116
+ for c in cells:
117
+ m = uid_pat.search(c)
118
+ if m:
119
+ meta['uid'] = m.group(0)
120
+ break
121
+
122
+ # --- Label-value scanning ---
123
+ # Scan pairs: if a cell matches a label, the next non-empty cell is the value
124
+ label_map = {
125
+ 'CR': 'cr_num',
126
+ 'Rev': 'rev',
127
+ 'Curr. vers': 'current_version',
128
+ 'Current version': 'current_version',
129
+ 'Cat': 'cat',
130
+ 'Category': 'cat',
131
+ }
132
+ title_next = False
133
+ for i, c in enumerate(cells):
134
+ stripped = c.strip().rstrip(':')
135
+
136
+ # Title may span its own cell or be labelled
137
+ if stripped.lower() in ('title', 'title of change'):
138
+ title_next = True
139
+ continue
140
+ if title_next:
141
+ if c.strip():
142
+ meta['title'] = c.strip()
143
+ title_next = False
144
+ continue
145
+
146
+ for label, key in label_map.items():
147
+ if stripped == label or stripped.startswith(label):
148
+ # Value is in the next non-empty cell
149
+ for j in range(i + 1, min(i + 4, len(cells))):
150
+ val = cells[j].strip()
151
+ if val:
152
+ meta[key] = val
153
+ break
154
+ break
155
+
156
+ return meta
157
+
158
+
159
+ # ── Meeting ID format detection ───────────────────────────────────────────────
160
+
161
+ def _detect_meeting_separator(tbl):
162
+ """
163
+ Scan the meeting column (col index 1) of the Change History table bottom-up.
164
+ Find the last non-empty cell and detect the separator between body letters and
165
+ number, e.g. '#' in 'SET#115' or '-' in 'SET-119'.
166
+ Returns the detected separator character, defaulting to '#'.
167
+ """
168
+ for row in reversed(tbl.rows):
169
+ cells = row.cells
170
+ if len(cells) > 1:
171
+ text = cells[1].text.strip()
172
+ if text:
173
+ m = re.search(r'[A-Za-z]([^A-Za-z0-9])\d', text)
174
+ if m:
175
+ return m.group(1)
176
+ return '#'
177
+
178
+
179
+ # ── TS table locators ─────────────────────────────────────────────────────────
180
+
181
+ def find_change_history_table(ts_doc):
182
+ """Return ts_doc.tables[-2] (Change History / Annex V). Accepts 8 or 9 columns."""
183
+ tables = ts_doc.tables
184
+ if len(tables) < 2:
185
+ raise ValueError('TS has fewer than 2 tables')
186
+ tbl = tables[-2]
187
+ ncols = len(tbl.rows[-1].cells)
188
+ if ncols not in (8, 9):
189
+ raise ValueError(
190
+ f'Change History table has {ncols} columns, expected 8 or 9'
191
+ )
192
+ return tbl
193
+
194
+
195
+ def find_history_table(ts_doc):
196
+ """Return ts_doc.tables[-1] (History / last page). Validates 3 columns."""
197
+ tbl = ts_doc.tables[-1]
198
+ last_row = tbl.rows[-1]
199
+ if len(last_row.cells) != 3:
200
+ raise ValueError(
201
+ f'History table has {len(last_row.cells)} columns, expected 3'
202
+ )
203
+ return tbl
204
+
205
+
206
+ # ── Update functions ──────────────────────────────────────────────────────────
207
+
208
+ def update_change_history_table(ts_doc, meta, pub_yyyy_mm, old_v, new_v, rev, author, date_str):
209
+ tbl = find_change_history_table(ts_doc)
210
+ ncols = len(tbl.rows[-1].cells)
211
+
212
+ # Detect separator used in existing rows (e.g. '#' in 'SET#115', '-' in 'SET-119')
213
+ # and reformat meeting_id accordingly so it matches the existing style.
214
+ sep = _detect_meeting_separator(tbl)
215
+ meeting_id = meta['meeting_id'] # always 'BODY-NUMBER' from extract_cr_metadata
216
+ if sep != '-' and '-' in meeting_id:
217
+ body, number = meeting_id.split('-', 1)
218
+ meeting_id = f'{body}{sep}{number}'
219
+
220
+ if ncols == 9:
221
+ # Standard ETSI format: date | meeting | uid | cr | rev | cat | title | old_v | new_v
222
+ cell_texts = [
223
+ pub_yyyy_mm, meeting_id, meta['uid'],
224
+ meta['cr_num'], meta['rev'], meta['cat'],
225
+ meta['title'], old_v, new_v,
226
+ ]
227
+ elif ncols == 8:
228
+ # Detect 8-column variant by first column header
229
+ first_header = tbl.rows[0].cells[0].text.strip() if tbl.rows else ''
230
+ if re.search(r'[Dd]ate', first_header):
231
+ # Date | meeting | uid | cr | rev | cat | title | new_v (no old_v)
232
+ cell_texts = [
233
+ pub_yyyy_mm, meeting_id, meta['uid'],
234
+ meta['cr_num'], meta['rev'], meta['cat'],
235
+ meta['title'], new_v,
236
+ ]
237
+ else:
238
+ # meeting | uid | wg_doc | cr | rev | cat | title | new_v (no date, no old_v)
239
+ cell_texts = [
240
+ meeting_id, meta['uid'], '',
241
+ meta['cr_num'], meta['rev'], meta['cat'],
242
+ meta['title'], new_v,
243
+ ]
244
+ else:
245
+ cell_texts = ([pub_yyyy_mm, meeting_id, meta['uid'],
246
+ meta['cr_num'], meta['rev'], meta['cat'],
247
+ meta['title'], old_v, new_v])[:ncols]
248
+
249
+ tracked_insert_table_row(tbl, cell_texts, rev, author, date_str)
250
+ return cell_texts
251
+
252
+
253
+ def update_history_table(ts_doc, new_v, pub_month_year, rev, author, date_str):
254
+ tbl = find_history_table(ts_doc)
255
+ cell_texts = [f'V{new_v}', pub_month_year, 'Publication']
256
+ tracked_insert_table_row(tbl, cell_texts, rev, author, date_str)
257
+ return cell_texts
258
+
259
+
260
+ def update_title_para(ts_doc, old_v, new_v, old_date_str, new_date_str, rev, author, date_str):
261
+ """
262
+ Update first paragraph: V<old_v>β†’V<new_v> and (old_date_str)β†’(new_date_str).
263
+ Both replacements are applied in a single tracked multi-replace pass.
264
+ """
265
+ para = ts_doc.paragraphs[0]
266
+ replacements = [
267
+ (f'V{old_v}', f'V{new_v}'),
268
+ (f'({old_date_str})', f'({new_date_str})'),
269
+ ]
270
+ tracked_modify_para_multi(para, replacements, rev, author, date_str)
271
+
272
+
273
+ # ── Main ──────────────────────────────────────────────────────────────────────
274
+
275
+ def main():
276
+ parser = argparse.ArgumentParser(
277
+ description='Add tracked-change metadata updates to a TS DOCX after CR application.'
278
+ )
279
+ parser.add_argument('ts_docx', help='TS DOCX file to update')
280
+ parser.add_argument('cr_docx', help='CR DOCX file to read metadata from')
281
+ parser.add_argument('--author', default=AUTHOR, help='Tracked change author name')
282
+ parser.add_argument('--output', default=None, help='Output path (default: <ts>_finalized.docx)')
283
+ args = parser.parse_args()
284
+
285
+ ts_path = to_wsl_path(args.ts_docx)
286
+ cr_path = to_wsl_path(args.cr_docx)
287
+
288
+ # Determine output path
289
+ if args.output:
290
+ out_path = to_wsl_path(args.output)
291
+ else:
292
+ p = Path(ts_path)
293
+ out_path = str(p.parent / (p.stem + '_finalized.docx'))
294
+
295
+ print(f'TS: {ts_path}')
296
+ print(f'CR: {cr_path}')
297
+ print(f'Output: {out_path}')
298
+ print()
299
+
300
+ # Open documents
301
+ ts_doc = docx.Document(ts_path)
302
+ cr_doc = docx.Document(cr_path)
303
+
304
+ # Extract metadata
305
+ print('Extracting CR metadata...')
306
+ meta = extract_cr_metadata(cr_path)
307
+ print(f" Meeting ID: {meta['meeting_id']}")
308
+ print(f" UID: {meta['uid']}")
309
+ print(f" CR#: {meta['cr_num']}")
310
+ print(f" Rev: {meta['rev']}")
311
+ print(f" Category: {meta['cat']}")
312
+ print(f" Title: {meta['title']}")
313
+ print(f" Current version: {meta['current_version']}")
314
+ print()
315
+
316
+ # Compute derived values
317
+ pub_ym, pub_month_year = compute_pub_date()
318
+ old_v = meta['current_version']
319
+ new_v = derive_new_version(old_v)
320
+ print(f'Old version: {old_v} β†’ New version: {new_v}')
321
+ print(f'Publication: {pub_month_year} ({pub_ym})')
322
+ print()
323
+
324
+ # Extract old date from first paragraph
325
+ title_text = ts_doc.paragraphs[0].text
326
+ date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
327
+ if not date_match:
328
+ print(f'WARNING: Could not find date pattern (YYYY-MM) in first paragraph:')
329
+ print(f' {title_text!r}')
330
+ old_date_str = ''
331
+ else:
332
+ old_date_str = date_match.group(1)
333
+ print(f'Title paragraph: {title_text!r}')
334
+ print(f'Old date: {old_date_str} β†’ New date: {pub_ym}')
335
+ print()
336
+
337
+ # Set up revision counter and tracked change date
338
+ rev = RevCounter(ts_doc)
339
+ tc_date = DATE # ISO 8601 from docx_helpers
340
+
341
+ # Apply changes
342
+ print('Inserting row in Change History table (Annex V)...')
343
+ ch_cells = update_change_history_table(ts_doc, meta, pub_ym, old_v, new_v, rev, args.author, tc_date)
344
+ print(f' Row: {ch_cells}')
345
+
346
+ print('Inserting row in History table (last page)...')
347
+ h_cells = update_history_table(ts_doc, new_v, pub_month_year, rev, args.author, tc_date)
348
+ print(f' Row: {h_cells}')
349
+
350
+ if old_date_str:
351
+ print('Updating title paragraph...')
352
+ update_title_para(ts_doc, old_v, new_v, old_date_str, pub_ym, rev, args.author, tc_date)
353
+ print(f' V{old_v} β†’ V{new_v}, ({old_date_str}) β†’ ({pub_ym})')
354
+ else:
355
+ print('Skipping title paragraph update (no date found).')
356
+
357
+ # Save
358
+ ts_doc.save(out_path)
359
+ print()
360
+ print(f'Saved: {out_path}')
361
+ print()
362
+ print('Summary of tracked changes:')
363
+ print(f' [Change History] New row: {ch_cells}')
364
+ print(f' [History] New row: {h_cells}')
365
+ if old_date_str:
366
+ print(f' [Title] V{old_v} β†’ V{new_v}, ({old_date_str}) β†’ ({pub_ym})')
367
+
368
+
369
+ if __name__ == '__main__':
370
+ main()
scripts/map_sections.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Diagnostic: print paragraphs of target clauses from a TS DOCX with indices and styles.
4
+
5
+ Usage:
6
+ python3 map_sections.py <ts_path.docx> "11.1.22.3.2" "14.5.6"
7
+ """
8
+
9
+ import sys
10
+ from docx import Document
11
+
12
def main():
    """CLI entry: dump paragraphs of the requested clauses with index and style."""
    if len(sys.argv) < 3:
        print("Usage: map_sections.py <ts.docx> <clause1> [clause2 ...]")
        sys.exit(1)

    ts_path, clauses = sys.argv[1], sys.argv[2:]
    doc = Document(ts_path)
    in_section = None

    for i, para in enumerate(doc.paragraphs):
        text = para.text.strip()
        style = para.style.name
        is_heading = 'Heading' in style or 'heading' in style.lower()

        # A heading paragraph naming one of the target clauses opens a section.
        hit = next((c for c in clauses if c in text), None) if is_heading else None
        if hit is not None:
            in_section = hit
            print(f'\n=== [{i}] SECTION {hit} | style={style!r} ===')
            print(f' [{i}] style={style!r:16s} | "{text}"')
            continue

        if in_section:
            # Any other non-empty heading closes the current section.
            if 'Heading' in style and text:
                print(f' --- section ends at [{i}] style={style!r}: "{text[:60]}"')
                in_section = None
            elif text:
                print(f' [{i}] style={style!r:16s} | "{text[:100]}"')
42
+
43
+ if __name__ == '__main__':
44
+ main()
scripts/orchestrate_cr.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ orchestrate_cr.py β€” Fully automated CR application pipeline.
4
+
5
+ Reads an Excel contribution list, downloads all Accepted CRs and their target
6
+ TSs, parses tracked changes from each CR, applies them to the TS, and
7
+ finalises the document metadata β€” all without any per-CR manual scripting.
8
+
9
+ Usage:
10
+ python3 orchestrate_cr.py <excel_path> [person_name] [--output-dir DIR] [--author NAME]
11
+
12
+ Arguments:
13
+ excel_path Path to .xls or .xlsx contribution list (Windows paths OK)
14
+ person_name Name to match in SubmittedBy column (default: "Ly Thanh PHAN")
15
+
16
+ Options:
17
+ --output-dir Base output folder (default: ~/CR_Processing)
18
+ --author Tracked-change author name (default: "CR Application")
19
+ """
20
+
21
+ import argparse
22
+ import contextlib
23
+ import datetime
24
+ import io
25
+ import re
26
+ import sys
27
+ from pathlib import Path
28
+
29
+ import docx as docx_lib
30
+
31
+ # ── sys.path setup ────────────────────────────────────────────────────────────
32
+ SCRIPT_DIR = Path(__file__).parent
33
+ FETCH_SCRIPTS = SCRIPT_DIR.parent.parent / 'fetch-crs' / 'scripts'
34
+ sys.path.insert(0, str(SCRIPT_DIR))
35
+ sys.path.insert(0, str(FETCH_SCRIPTS))
36
+
37
+ from fetch_crs import parse_excel, download_cr, parse_cr_cover, download_ts, wsl_path
38
+ from cr_parser import parse_cr
39
+ from ts_applicator import apply_manifest
40
+ from finalize_ts import (
41
+ extract_cr_metadata,
42
+ compute_pub_date,
43
+ derive_new_version,
44
+ update_change_history_table,
45
+ update_history_table,
46
+ update_title_para,
47
+ )
48
+ from docx_helpers import RevCounter, AUTHOR as DEFAULT_AUTHOR, DATE as DEFAULT_DATE
49
+
50
+
51
+ # ── Display / logging helpers ─────────────────────────────────────────────────
52
+
53
+ def _section(title):
54
+ bar = '=' * 60
55
+ print(f'\n{bar}')
56
+ print(f' {title}')
57
+ print(bar)
58
+
59
+
60
+ class _TeeWriter:
61
+ """Writes to both real stdout and a StringIO buffer simultaneously."""
62
+ def __init__(self, real, buf):
63
+ self._real = real
64
+ self._buf = buf
65
+
66
+ def write(self, s):
67
+ self._real.write(s)
68
+ self._buf.write(s)
69
+
70
+ def flush(self):
71
+ self._real.flush()
72
+
73
+
74
+ # ── Main ──────────────────────────────────────────────────────────────────────
75
+
76
def main():
    """Run the end-to-end pipeline: parse Excel, download CRs and TSs,
    apply all tracked changes, finalise metadata, and print a report.

    Fix vs. original: Step 4 printed the literal placeholder '(unknown)'
    instead of the downloaded TS filename; it now prints the real name.
    """
    ap = argparse.ArgumentParser(
        description='Fully automated CR application pipeline.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument('excel_path', help='Path to .xls or .xlsx contribution list')
    ap.add_argument(
        'person_name',
        nargs='?',
        default='Ly Thanh PHAN',
        help='Name to match in SubmittedBy column (default: "Ly Thanh PHAN")',
    )
    ap.add_argument(
        '--output-dir',
        default=str(Path.home() / 'CR_Processing'),
        help='Base output directory (default: ~/CR_Processing)',
    )
    ap.add_argument(
        '--author',
        default=DEFAULT_AUTHOR,
        help=f'Tracked change author name (default: "{DEFAULT_AUTHOR}")',
    )
    args = ap.parse_args()

    excel_path = wsl_path(args.excel_path)
    output_dir = Path(wsl_path(args.output_dir)).expanduser()
    cr_dir = output_dir / 'CRs'
    ts_dir = output_dir / 'TS'  # spec subfolders created per-TS below
    cr_dir.mkdir(parents=True, exist_ok=True)
    ts_dir.mkdir(parents=True, exist_ok=True)

    author = args.author
    tc_date = DEFAULT_DATE

    # ── Step 1: Parse Excel ───────────────────────────────────────────────────
    _section('Step 1 — Parsing Excel')
    print(f'Excel: {excel_path}')
    print(f'Person: {args.person_name!r}')

    try:
        cr_list = parse_excel(excel_path, args.person_name)
    except Exception as e:
        sys.exit(f'ERROR parsing Excel: {e}')

    print(f'Found {len(cr_list)} Accepted CR(s):')
    for uid, title in cr_list:
        print(f' {uid}: {title[:80]}')

    if not cr_list:
        print('Nothing to process.')
        return

    # ── Step 2: Download CR DOCXs ─────────────────────────────────────────────
    _section('Step 2 — Downloading CR DOCXs')
    cr_paths = {}  # uid -> Path

    for uid, _ in cr_list:
        print(f' [{uid}] ', end='', flush=True)
        docx_path, note = download_cr(uid, cr_dir)
        if docx_path:
            cr_paths[uid] = docx_path
            print(f'OK ({note}) — {docx_path.name}')
        else:
            print(f'FAILED — {note}')

    # ── Step 3: Parse cover pages → group by target TS ───────────────────────
    _section('Step 3 — Parsing CR cover pages')
    ts_groups = {}  # (spec_number, version) -> [uid, ...]
    uid_cover_failed = []

    for uid in cr_paths:
        spec_number, version = parse_cr_cover(cr_paths[uid])
        if spec_number and version:
            key = (spec_number, version)
            ts_groups.setdefault(key, []).append(uid)
            print(f' [{uid}] -> TS {spec_number} v{version}')
        else:
            uid_cover_failed.append(uid)
            print(f' [{uid}] WARNING: could not parse cover page — skipping')

    if not ts_groups:
        print('\nNo TSs identified. Nothing to apply.')
        return

    # ── Step 4: Download TSs ──────────────────────────────────────────────────
    _section('Step 4 — Downloading TSs')
    ts_paths = {}   # (spec_number, version) -> Path
    spec_dirs = {}  # (spec_number, version) -> Path (per-spec subfolder)

    for (spec_number, version) in ts_groups:
        spec_compact = spec_number.replace(' ', '')
        spec_dir = ts_dir / spec_compact
        spec_dir.mkdir(parents=True, exist_ok=True)
        spec_dirs[(spec_number, version)] = spec_dir

        print(f' [TS {spec_number} v{version}] ', end='', flush=True)
        filename, note = download_ts(spec_number, version, spec_dir)
        if filename:
            ts_paths[(spec_number, version)] = spec_dir / filename
            # BUGFIX: was a hard-coded '(unknown)' placeholder.
            print(f'OK ({note}) — {spec_compact}/{filename}')
        else:
            print(f'FAILED — {note}')

    # ── Steps 5 & 6: Apply CRs + Finalise each TS ────────────────────────────
    _section('Steps 5 & 6 — Applying CRs and Finalising Metadata')
    report = []  # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)

    for (spec_number, version), uids in ts_groups.items():
        ts_key = f'TS {spec_number} v{version}'
        spec_compact = spec_number.replace(' ', '')
        spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
        spec_dir.mkdir(parents=True, exist_ok=True)

        # Derive new version early so filenames are known upfront
        new_v = derive_new_version(version)
        stem = f'ts_{spec_compact}_v{new_v}_was_v{version}'
        ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
        ts_final = spec_dir / f'{stem}.docx'
        log_path = spec_dir / f'{stem}.log'
        errors = []

        print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')

        if (spec_number, version) not in ts_paths:
            msg = 'TS download failed — skipping'
            print(f' SKIP: {msg}')
            report.append((ts_key, 0, 0, len(uids), None, log_path, [msg]))
            continue

        ts_in = ts_paths[(spec_number, version)]

        # All per-TS output is captured to log_buf (tee: stdout + file)
        log_buf = io.StringIO()
        tee = _TeeWriter(sys.stdout, log_buf)

        with contextlib.redirect_stdout(tee):
            log_header = (
                f'Pipeline Log\n'
                f'TS: {spec_number} v{version} -> v{new_v}\n'
                f'CRs: {", ".join(uids)}\n'
                f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
                f'{"=" * 60}\n'
            )
            print(log_header, end='')

            # 5a. Parse all CR manifests and combine
            combined_manifest = []
            participating_uids = []

            for uid in uids:
                if uid not in cr_paths:
                    errors.append(f'[{uid}] CR download had failed — skipped')
                    continue
                print(f' Parsing {uid}... ', end='', flush=True)
                try:
                    changes = parse_cr(cr_paths[uid])
                    combined_manifest.extend(changes)
                    participating_uids.append(uid)
                    print(f'{len(changes)} change(s)')
                except Exception as e:
                    errors.append(f'[{uid}] parse ERROR: {e}')
                    print(f'ERROR: {e}')

            if not combined_manifest:
                print(' No changes parsed — skipping apply step.')
                report.append((ts_key, 0, 0, len(uids), None, log_path,
                               errors + ['No changes parsed']))
                log_path.write_text(log_buf.getvalue(), encoding='utf-8')
                continue

            # 5b. Apply manifest to TS
            print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
            try:
                n_ok, n_skip, log_lines = apply_manifest(
                    ts_in, combined_manifest, ts_applied, author=author, date=tc_date
                )
            except Exception as e:
                errors.append(f'apply_manifest ERROR: {e}')
                print(f' ERROR: {e}')
                report.append((ts_key, 0, 0, len(uids), None, log_path, errors))
                log_path.write_text(log_buf.getvalue(), encoding='utf-8')
                continue

            for line in log_lines:
                print(f' {line}')
            print(f' -> Applied: {n_ok} Skipped: {n_skip}')

            # 6. Finalise metadata (Change History, History, title paragraph)
            print(' Finalising metadata...')
            try:
                ts_doc = docx_lib.Document(str(ts_applied))
                rev = RevCounter(ts_doc)

                pub_ym, pub_month_year = compute_pub_date()
                old_v = version

                # Extract old date string from first paragraph
                title_text = ts_doc.paragraphs[0].text
                date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
                old_date_str = date_match.group(1) if date_match else ''

                print(f' Version: {old_v} -> {new_v}')
                print(f' Publication: {pub_month_year} ({pub_ym})')

                # One Change History row per CR
                for uid in participating_uids:
                    try:
                        meta = extract_cr_metadata(str(cr_paths[uid]))
                        ch_cells = update_change_history_table(
                            ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
                        )
                        print(f' [Change History] {uid}: {ch_cells}')
                    except Exception as e:
                        errors.append(f'[{uid}] Change History ERROR: {e}')
                        print(f' [Change History] {uid}: ERROR — {e}')

                # One History row for the whole TS
                try:
                    h_cells = update_history_table(
                        ts_doc, new_v, pub_month_year, rev, author, tc_date
                    )
                    print(f' [History] {h_cells}')
                except Exception as e:
                    errors.append(f'History table ERROR: {e}')
                    print(f' [History] ERROR — {e}')

                # Title paragraph version + date
                if old_date_str:
                    try:
                        update_title_para(
                            ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
                        )
                        print(f' [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
                    except Exception as e:
                        errors.append(f'Title update ERROR: {e}')
                        print(f' [Title] ERROR — {e}')
                else:
                    print(f' [Title] SKIP — no (YYYY-MM) pattern in: {title_text!r}')

                ts_doc.save(str(ts_final))
                print(f' Saved: {spec_compact}/{ts_final.name}')
                print(f' Log: {spec_compact}/{log_path.name}')
                report.append((ts_key, n_ok, n_skip, len(uids), ts_final, log_path, errors))

            except Exception as e:
                errors.append(f'Finalisation ERROR: {e}')
                print(f' Finalisation ERROR: {e}')
                report.append((ts_key, n_ok, n_skip, len(uids), ts_applied, log_path, errors))

        # Write log file after the tee context exits
        log_path.write_text(log_buf.getvalue(), encoding='utf-8')

    # ── Final Report ──────────────────────────────────────────────────────────
    _section('Final Report')
    n_success = sum(1 for r in report if r[4] is not None and not r[6])
    n_partial = sum(1 for r in report if r[4] is not None and r[6])
    n_failed = sum(1 for r in report if r[4] is None)

    print(f'Person: {args.person_name}')
    print(f'Excel: {excel_path}')
    print(f'CRs found: {len(cr_list)}')
    print(f'TSs updated: {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
    print()

    for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
        if out_path and not errors:
            status = 'OK'
        elif out_path:
            status = 'WARN'
        else:
            status = 'FAIL'
        print(f' [{status}] {ts_key}')
        print(f' CRs: {n_crs} | Body changes applied: {n_ok} | Skipped: {n_skip}')
        if out_path:
            print(f' Output: {out_path.parent.name}/{out_path.name}')
        if log_path and log_path.exists():
            print(f' Log: {log_path.parent.name}/{log_path.name}')
        for err in errors:
            print(f' ! {err}')

    print()
    print(f'Output directory: {output_dir}/')
358
+
359
+
360
+ if __name__ == '__main__':
361
+ main()
scripts/ts_applicator.py ADDED
@@ -0,0 +1,633 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ ts_applicator.py β€” Apply a CR change manifest to a TS DOCX as tracked changes.
4
+
5
+ Reads a JSON manifest produced by cr_parser.py and applies every change
6
+ to the target TS using docx_helpers tracked-change primitives.
7
+
8
+ Usage:
9
+ python3 ts_applicator.py <ts.docx> <manifest.json> [--author NAME] [--output path]
10
+ # or import: from ts_applicator import apply_manifest
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import re
16
+ import sys
17
+ from pathlib import Path
18
+
19
+ import docx
20
+ from docx.oxml import OxmlElement
21
+ from docx.oxml.ns import qn
22
+
23
+ sys.path.insert(0, str(Path(__file__).parent))
24
+ from docx_helpers import (
25
+ RevCounter,
26
+ tracked_modify_para,
27
+ tracked_insert_paras_after,
28
+ AUTHOR as DEFAULT_AUTHOR,
29
+ DATE as DEFAULT_DATE,
30
+ )
31
+
32
+
33
+ # ── Text normalisation ────────────────────────────────────────────────────────
34
+
35
+ def _norm(text):
36
+ """Normalise non-breaking spaces and common Unicode dashes for comparison."""
37
+ return (text
38
+ .replace('\xa0', ' ')
39
+ .replace('\u2013', '-')
40
+ .replace('\u2014', '-')
41
+ .strip())
42
+
43
+
44
+ def _norm_ws(text):
45
+ """
46
+ Strip all whitespace for structural matching.
47
+
48
+ ETSI TS files store structured paragraphs (references, abbreviations,
49
+ headings) with a TAB between the code and the body text, e.g.:
50
+ '[27]\\tGlobalPlatform: ...'
51
+ 'CLT\\tContactLess Tunnelling'
52
+ '8.3\\tRAM implementation over HTTPS'
53
+
54
+ The CR's text extraction concatenates runs directly, losing the tab:
55
+ '[27]GlobalPlatform: ...'
56
+ 'CLTContactLess Tunnelling'
57
+ '8.3RAM implementation over HTTPS'
58
+
59
+ Removing all whitespace from both sides before comparing solves this.
60
+ Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
61
+ """
62
+ base = (text
63
+ .replace('\xa0', '')
64
+ .replace('\u2013', '-')
65
+ .replace('\u2014', '-'))
66
+ return re.sub(r'\s+', '', base)
67
+
68
+
69
+ # ── Document search helpers ───────────────────────────────────────────────────
70
+
71
def _full_para_text(para):
    """All text in the paragraph XML: w:t (normal/inserted) plus w:delText (tracked-deleted runs)."""
    element = para._element
    visible = [t.text or '' for t in element.findall('.//' + qn('w:t'))]
    deleted = [t.text or '' for t in element.findall('.//' + qn('w:delText'))]
    return ''.join(visible) + ''.join(deleted)
76
+
77
+
78
def _find_para(doc, search_text, prefer_not_in_table=False):
    """
    Find the first paragraph containing search_text.

    Four matching levels, in decreasing order of confidence:
      1.0 — exact substring match on the visible text
      0.9 — NBSP/dash-normalised match (_norm)
      0.8 — whitespace-stripped match (_norm_ws), which tolerates the TAB that
            structured paragraphs carry ('[27]\\t...', 'CLT\\t...', '8.3\\t...')
      0.6 — full XML text including w:delText: finds anchors already wrapped in
            a tracked deletion by an earlier apply step
    Each paragraph is classified into at most one pool (first level that hits).
    When prefer_not_in_table is set, a body paragraph beats a table-cell one
    within the same confidence pool.

    Returns (para, confidence) or (None, 0.0).
    """
    norm_target = _norm(search_text)
    ws_target = _norm_ws(search_text)
    pools = {1.0: [], 0.9: [], 0.8: [], 0.6: []}

    for para in doc.paragraphs:
        visible = para.text
        if search_text in visible:
            pools[1.0].append(para)
        elif norm_target and norm_target in _norm(visible):
            pools[0.9].append(para)
        elif ws_target and ws_target in _norm_ws(visible):
            pools[0.8].append(para)
        else:
            # Level 4: the raw XML text also carries tracked-deleted content.
            hidden = _full_para_text(para)
            if search_text in hidden or (ws_target and ws_target in _norm_ws(hidden)):
                pools[0.6].append(para)

    def _inside_table(p):
        return any(anc.tag == qn('w:tc') for anc in p._element.iterancestors())

    for conf in (1.0, 0.9, 0.8, 0.6):
        pool = pools[conf]
        if not pool:
            continue
        if prefer_not_in_table:
            outside = [p for p in pool if not _inside_table(p)]
            if outside:
                return outside[0], conf
        return pool[0], conf

    return None, 0.0
128
+
129
+
130
def _find_table_by_section(doc, section_heading):
    """
    Find the table immediately following a paragraph that contains section_heading.
    Checks both w:t (plain/inserted) and w:delText (tracked-deleted) so the match
    survives even after the heading was wrapped in a tracked deletion.
    Empty paragraphs between the heading and the table are tolerated.
    Returns (table, confidence) or (None, 0.0).
    """
    if not section_heading:
        return None, 0.0
    norm_h = _norm(section_heading)
    ws_h = _norm_ws(section_heading)
    heading_seen = False
    # Walk the document body in XML order so paragraph/table adjacency is real.
    for element in doc.element.body:
        # Strip the namespace prefix to get the local tag name ('p' or 'tbl').
        tag = element.tag.split('}')[-1] if '}' in element.tag else element.tag
        if tag == 'p':
            t_text = ''.join(t.text or '' for t in element.findall('.//' + qn('w:t')))
            d_text = ''.join(t.text or '' for t in element.findall('.//' + qn('w:delText')))
            full = (t_text + d_text).strip()
            if not full:
                continue  # skip empty paras, keep heading_seen state
            if (section_heading in full
                    or norm_h in _norm(full)
                    or ws_h in _norm_ws(full)):
                heading_seen = True
            else:
                heading_seen = False  # non-matching non-empty para resets
        elif tag == 'tbl':
            if heading_seen:
                # Map the raw <w:tbl> element back to its python-docx wrapper.
                for tbl in doc.tables:
                    if tbl._tbl is element:
                        return tbl, 1.0
                # Table element not wrapped by doc.tables (e.g. nested): reset
                # and keep scanning rather than returning a wrong table.
                heading_seen = False
    return None, 0.0
164
+
165
+
166
def _find_table(doc, header_key):
    """
    Find a table whose first-row cell texts start with header_key.

    header_key may name fewer columns than the table has: matching is by
    prefix, cell by cell, using NBSP/dash-normalised containment.
    Returns (table, confidence) or (None, 0.0).
    """
    wanted = [_norm(h) for h in header_key]

    for table in doc.tables:
        if not table.rows:
            continue
        header_cells = [_norm(c.text) for c in table.rows[0].cells]
        if len(wanted) > len(header_cells):
            continue
        if all(key in header_cells[i] for i, key in enumerate(wanted)):
            return table, 1.0

    return None, 0.0
186
+
187
+
188
def _find_row(tbl, anchor_text):
    """
    Locate the first row whose column-0 cell text contains anchor_text.

    Exact matches win immediately (confidence 1.0); otherwise the best
    normalised (0.9) or whitespace-stripped (0.8) candidate seen so far is
    remembered and returned after the scan.
    Returns (row_idx, confidence) or (-1, 0.0).
    """
    anchor_norm = _norm(anchor_text)
    anchor_ws = _norm_ws(anchor_text)
    fallback_idx, fallback_conf = -1, 0.0

    for idx, row in enumerate(tbl.rows):
        first_cell = row.cells[0].text if row.cells else ''
        if anchor_text in first_cell:
            return idx, 1.0
        if anchor_norm and anchor_norm in _norm(first_cell) and fallback_conf < 0.9:
            fallback_idx, fallback_conf = idx, 0.9
        elif anchor_ws and anchor_ws in _norm_ws(first_cell) and fallback_conf < 0.8:
            fallback_idx, fallback_conf = idx, 0.8

    return fallback_idx, fallback_conf
206
+
207
+
208
+ # ── vMerge row insertion ──────────────────────────────────────────────────────
209
+
210
def _build_new_tr(cells_data, rev, author, date):
    """
    Build and return a new tracked-insert <w:tr> element (does NOT insert it).
    cells_data: list of dicts with keys: text, width, vmerge, style.
    """
    def _ins_attr():
        # Fresh revision id per tracked-change element; author/date shared.
        return {qn('w:id'): rev.next(), qn('w:author'): author, qn('w:date'): date}

    def _make_t(text, tag='w:t'):
        t = OxmlElement(tag)
        t.text = text or ''
        # xml:space="preserve" stops Word from trimming leading/trailing
        # spaces or tabs in the run text.
        if text and (text[0] in (' ', '\t') or text[-1] in (' ', '\t')):
            t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
        return t

    def _make_run(text):
        r = OxmlElement('w:r')
        r.append(_make_t(text))
        return r

    new_tr = OxmlElement('w:tr')

    # trPr: tracked row insertion
    trPr = OxmlElement('w:trPr')
    tr_ins = OxmlElement('w:ins')
    for k, v in _ins_attr().items():
        tr_ins.set(k, v)
    trPr.append(tr_ins)
    new_tr.append(trPr)

    for cd in cells_data:
        tc = OxmlElement('w:tc')

        # Cell properties: explicit width (dxa = twentieths of a point) and
        # optional vertical-merge continuation.
        tcPr = OxmlElement('w:tcPr')
        tcW = OxmlElement('w:tcW')
        if cd.get('width'):
            tcW.set(qn('w:w'), str(cd['width']))
            tcW.set(qn('w:type'), 'dxa')
        tcPr.append(tcW)
        if cd.get('vmerge'):
            vm = OxmlElement('w:vMerge')
            tcPr.append(vm)
        tc.append(tcPr)

        # Cell paragraph: style plus a tracked-insert mark on the paragraph
        # mark itself (w:pPr/w:rPr/w:ins) so the whole row reads as inserted.
        p = OxmlElement('w:p')
        pPr = OxmlElement('w:pPr')
        if cd.get('style'):
            pStyle = OxmlElement('w:pStyle')
            pStyle.set(qn('w:val'), cd['style'])
            pPr.append(pStyle)
        rPr_para = OxmlElement('w:rPr')
        pm_ins = OxmlElement('w:ins')
        for k, v in _ins_attr().items():
            pm_ins.set(k, v)
        rPr_para.append(pm_ins)
        pPr.append(rPr_para)
        p.append(pPr)

        # vMerge continuation cells carry no text of their own.
        if cd.get('text') and not cd.get('vmerge'):
            ins_el = OxmlElement('w:ins')
            for k, v in _ins_attr().items():
                ins_el.set(k, v)
            ins_el.append(_make_run(cd['text']))
            p.append(ins_el)

        tc.append(p)
        new_tr.append(tc)

    return new_tr
279
+
280
+
281
def _insert_vmerge_row(tbl, after_row_idx, cells_data, rev, author, date):
    """
    Insert a tracked row directly after tbl.rows[after_row_idx].

    cells_data: list of dicts with keys: text, width, vmerge, style.
    Returns the newly inserted <w:tr> element.
    """
    built = _build_new_tr(cells_data, rev, author, date)
    anchor_tr = tbl.rows[after_row_idx]._tr
    anchor_tr.addnext(built)
    return built
291
+
292
+
293
+ # ── Section replace (direct XML transplant) ───────────────────────────────────
294
+
295
def _apply_section_replace(doc, change, rev, author, date, log):
    """
    Transplant a block of CR elements (del section + ins section) directly into
    the TS, replacing the old heading+table at the matching location.

    This mirrors what Word does on copy-paste: the exact XML from the CR is
    cloned into the TS, with only the tracked-change revision IDs remapped to
    avoid conflicts.

    Returns True when the splice succeeded, False (with a SKIP line appended
    to log) otherwise.
    """
    # python-docx depends on lxml, so this local import is always satisfiable.
    from lxml import etree

    loc = change['location']
    del_heading = loc.get('del_heading', '')
    has_del_table = loc.get('has_del_table', False)
    elements_xml = change.get('elements_xml', [])

    if not elements_xml:
        log.append(' SKIP section_replace: no elements in manifest')
        return False

    # ── Find the TS paragraph that matches the deleted heading ─────────────────
    ts_para_elem = None
    if del_heading:
        for para in doc.paragraphs:
            pt = para.text
            if del_heading in pt or _norm(del_heading) in _norm(pt):
                ts_para_elem = para._element
                break
        if ts_para_elem is None:
            # Fallback: include paragraphs whose XML text (inc. del runs) matches
            for para in doc.paragraphs:
                if del_heading in _full_para_text(para):
                    ts_para_elem = para._element
                    break

    if ts_para_elem is None:
        log.append(f' SKIP section_replace: del_heading {del_heading!r} not found in TS')
        return False

    ts_body = ts_para_elem.getparent()

    # ── Find the table immediately after the heading (if applicable) ───────────
    ts_tbl_elem = None
    if has_del_table:
        found_para = False
        for sib in ts_body:
            if sib is ts_para_elem:
                found_para = True
                continue
            if not found_para:
                continue
            sib_tag = sib.tag.split('}')[-1] if '}' in sib.tag else sib.tag
            if sib_tag == 'p':
                # Allow empty paragraphs between heading and table
                if not (''.join(t.text or '' for t in sib.findall('.//' + qn('w:t')))).strip():
                    continue
                break  # non-empty paragraph before table → no table to remove
            elif sib_tag == 'tbl':
                ts_tbl_elem = sib
                break
            else:
                break

    # ── Clone and remap IDs on the CR elements ─────────────────────────────────
    cloned = []
    for xml_str in elements_xml:
        # etree.fromstring already builds a fresh, independent tree, so the
        # previous extra deepcopy was redundant and has been dropped.
        cloned_elem = etree.fromstring(xml_str)
        # Remap w:id in all tracked-change elements (must be unique per document)
        for el in cloned_elem.iter():
            if el.get(qn('w:id')) is not None:
                el.set(qn('w:id'), rev.next())
        cloned.append(cloned_elem)

    # ── Insert cloned elements before the TS heading paragraph ────────────────
    insert_idx = list(ts_body).index(ts_para_elem)
    for i, elem in enumerate(cloned):
        ts_body.insert(insert_idx + i, elem)

    # ── Remove the now-replaced TS elements ───────────────────────────────────
    ts_body.remove(ts_para_elem)
    if ts_tbl_elem is not None:
        ts_body.remove(ts_tbl_elem)

    log.append(
        f' OK section_replace: {del_heading!r} → {len(elements_xml)} element(s) spliced in'
        f' (removed heading{"+ table" if has_del_table else ""})'
    )
    return True
386
+
387
+
388
+ # ── Per-change-type applicators ───────────────────────────────────────────────
389
+
390
def _apply_text_replace(doc, change, rev, author, date, log):
    """
    Apply a text_replace change: swap `old` for `new` as a tracked modification.

    Two location kinds are supported:
      * table_cell — locate the table (preferring the one after the section
        heading when the header row is empty/ambiguous), then the row/column,
        then the paragraph. With no row anchor, the search widens progressively:
        the anchored column of every table, then every column of every table.
      * body_para — locate the paragraph by the old text itself, falling back
        to the paragraph context captured by the parser.

    Returns True if the change was applied, False (with a SKIP log line) if not.
    """
    loc = change['location']
    old = change['old']
    new = change['new']

    if loc['kind'] == 'table_cell':
        # Confidence value is not needed here, only presence of the table.
        tbl, _ = _find_table(doc, loc['table_header'])
        if tbl is None:
            log.append(f" SKIP text_replace: table not found {loc['table_header'][:2]!r}")
            return False
        col_idx = loc['col_idx']
        row_anchor = loc['row_anchor']

        if row_anchor:
            row_idx, _ = _find_row(tbl, row_anchor)
            if row_idx < 0:
                log.append(f" SKIP text_replace: row anchor not found {row_anchor!r}")
                return False
            row = tbl.rows[row_idx]
            if col_idx >= len(row.cells):
                log.append(f" SKIP text_replace: col_idx {col_idx} out of range")
                return False
            cell = row.cells[col_idx]
            for para in cell.paragraphs:
                if old in para.text:
                    tracked_modify_para(para, old, new, rev, author, date)
                    log.append(f" OK text_replace (table_cell row={row_idx} col={col_idx}): {old!r} → {new!r}")
                    return True
            log.append(f" SKIP text_replace: old text {old!r} not in cell (row={row_idx} col={col_idx})")
            return False
        else:
            # Empty row anchor: scan all rows in col_idx.
            # Prefer the table that follows the section heading (e.g. "Thirty fifth byte:")
            # because all-empty table headers match any table.
            section_heading = loc.get('section_heading', '')
            tbl_by_section, _ = _find_table_by_section(doc, section_heading)
            if tbl_by_section is not None:
                tables_to_try = [tbl_by_section] + [t for t in doc.tables if t is not tbl_by_section]
            else:
                tables_to_try = [tbl] + [t for t in doc.tables if t is not tbl]
            for search_tbl in tables_to_try:
                for r_idx, row in enumerate(search_tbl.rows):
                    if col_idx >= len(row.cells):
                        continue
                    cell = row.cells[col_idx]
                    for para in cell.paragraphs:
                        if old in para.text:
                            tracked_modify_para(para, old, new, rev, author, date)
                            log.append(f" OK text_replace (table_cell scan row={r_idx} col={col_idx}): {old!r} → {new!r}")
                            return True
            # Final fallback: scan ALL columns of ALL tables
            _all_start = tbl_by_section if tbl_by_section is not None else tbl
            for search_tbl in [_all_start] + [t for t in doc.tables if t is not _all_start]:
                for r_idx, row in enumerate(search_tbl.rows):
                    for c_idx, cell in enumerate(row.cells):
                        for para in cell.paragraphs:
                            if old in para.text:
                                tracked_modify_para(para, old, new, rev, author, date)
                                log.append(f" OK text_replace (table_cell any_col row={r_idx} col={c_idx}): {old!r} → {new!r}")
                                return True
            log.append(f" SKIP text_replace: old text {old!r} not found in any table column")
            return False

    elif loc['kind'] == 'body_para':
        ctx = loc.get('para_context', '')
        # Try to find the paragraph by old text first
        para, conf = _find_para(doc, old, prefer_not_in_table=True)
        if para is None:
            # Fall back: find by paragraph context
            para, conf = _find_para(doc, ctx, prefer_not_in_table=True)
        if para is None:
            log.append(f" SKIP text_replace: old text {old!r} not found in TS")
            return False
        if old in para.text:
            tracked_modify_para(para, old, new, rev, author, date)
            log.append(f" OK text_replace (body_para conf={conf:.1f}): {old!r} → {new!r}")
            return True
        log.append(f" SKIP text_replace: old text {old!r} not in resolved paragraph")
        return False

    log.append(f" SKIP text_replace: unknown kind {loc['kind']!r}")
    return False
472
+
473
+
474
def _apply_para_insert(doc, change, rev, author, date, log):
    """
    Insert the change's paragraphs, as tracked insertions, after an anchor
    paragraph located by the manifest's anchor_text.

    Returns True on success (or when there is nothing to insert), False with
    a SKIP log line when the anchor cannot be found.
    """
    anchor = change['location'].get('anchor_text', '')
    new_paras = change.get('paragraphs', [])
    if not new_paras:
        return True

    target, conf = _find_para(doc, anchor)
    if target is None:
        log.append(f" SKIP para_insert: anchor not found {anchor[:60]!r}")
        return False

    payload = [(entry['text'], entry['style'] or 'Normal') for entry in new_paras]
    tracked_insert_paras_after(target, payload, rev, author, date)
    preview = new_paras[0]['text'][:50] if new_paras else ''
    log.append(f" OK para_insert ({len(new_paras)} para(s) after anchor conf={conf:.1f}): {preview!r}...")
    return True
490
+
491
+
492
def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
    """
    Insert a new tracked table row after an anchor row.

    The target table is preferably resolved via its section heading (robust
    when several tables share an all-empty header row), falling back to a
    header-prefix match. `last_inserted` (optional dict) maps
    (table id, anchor row index) → the last <w:tr> inserted there, so that
    consecutive inserts at the same anchor keep their forward order instead of
    each landing directly after the anchor (which would reverse them).

    Returns True if the row was inserted, False (with a SKIP log line) if not.
    """
    loc = change['location']

    # Prefer table located by section heading (handles ambiguous all-empty headers)
    section_heading = loc.get('section_heading', '')
    tbl_by_section, _ = _find_table_by_section(doc, section_heading)
    if tbl_by_section is not None:
        tbl = tbl_by_section
    else:
        # Confidence value is not needed here, only presence of the table.
        tbl, _ = _find_table(doc, loc['table_header'])
        if tbl is None:
            log.append(f" SKIP row_insert: table not found {loc['table_header'][:2]!r}")
            return False

    after_anchor = loc.get('after_row_anchor', '')
    row_idx, _ = _find_row(tbl, after_anchor)
    if row_idx < 0:
        log.append(f" SKIP row_insert: anchor row not found {after_anchor!r}")
        return False

    cells_data = change.get('cells', [])

    # Fix insertion ordering: when multiple rows target the same (tbl, row_idx),
    # each new row should go AFTER the previously inserted one, not after row_idx.
    # last_inserted maps (tbl._tbl id, row_idx) → last w:tr element inserted there.
    key = (id(tbl._tbl), row_idx)
    if last_inserted is not None and key in last_inserted:
        # Insert after the previously inserted row to maintain forward order
        prev_tr = last_inserted[key]
        new_tr = _build_new_tr(cells_data, rev, author, date)
        prev_tr.addnext(new_tr)
        last_inserted[key] = new_tr
    else:
        new_tr = _insert_vmerge_row(tbl, row_idx, cells_data, rev, author, date)
        if last_inserted is not None:
            last_inserted[key] = new_tr

    desc = cells_data[1]['text'] if len(cells_data) > 1 else '?'
    log.append(f" OK row_insert after row[{row_idx}] ({after_anchor!r}): {desc!r}")
    return True
532
+
533
+
534
+ # ── Manifest pre-processing ───────────────────────────────────────────────────
535
+
536
+ def _merge_para_inserts(manifest):
537
+ """
538
+ Merge consecutive para_insert entries that share the same anchor_text.
539
+
540
+ When the CR parser emits multiple para_insert entries for the same anchor
541
+ (because [...] context markers were transparent and kept prev_stable_text
542
+ unchanged), each would call tracked_insert_paras_after independently.
543
+ Since each call starts from the same anchor element and uses addnext(),
544
+ later groups push earlier groups down β€” producing reversed order.
545
+
546
+ Merging them into one entry ensures a single tracked_insert_paras_after
547
+ call that inserts all paragraphs in the correct forward order.
548
+ """
549
+ result = []
550
+ for change in manifest:
551
+ if (change.get('type') == 'para_insert'
552
+ and result
553
+ and result[-1].get('type') == 'para_insert'
554
+ and result[-1]['location']['anchor_text'] == change['location']['anchor_text']):
555
+ result[-1]['paragraphs'].extend(change['paragraphs'])
556
+ else:
557
+ merged = dict(change)
558
+ if change.get('type') == 'para_insert':
559
+ merged['paragraphs'] = list(change['paragraphs'])
560
+ result.append(merged)
561
+ return result
562
+
563
+
564
+ # ── Main apply function ───────────────────────────────────────────────────────
565
+
566
def apply_manifest(ts_path, manifest, out_path, author=DEFAULT_AUTHOR, date=DEFAULT_DATE):
    """
    Apply every change in *manifest* to the document at *ts_path*, saving the
    result to *out_path*.

    Returns (n_ok, n_skipped, log_lines).
    """
    doc = docx.Document(str(ts_path))
    rev = RevCounter(doc)
    log = []
    counts = {True: 0, False: 0}

    manifest = _merge_para_inserts(manifest)

    # Remembers the last inserted <w:tr> per (tbl_id, anchor_row_idx) so that
    # multiple row_inserts targeting the same anchor keep their forward order.
    last_inserted = {}

    simple_handlers = {
        'section_replace': _apply_section_replace,
        'text_replace': _apply_text_replace,
        'para_insert': _apply_para_insert,
    }

    for change in manifest:
        ctype = change.get('type')
        if ctype == 'row_insert':
            ok = _apply_row_insert(doc, change, rev, author, date, log,
                                   last_inserted=last_inserted)
        elif ctype in simple_handlers:
            ok = simple_handlers[ctype](doc, change, rev, author, date, log)
        else:
            log.append(f" SKIP unknown change type: {ctype!r}")
            ok = False
        counts[bool(ok)] += 1

    doc.save(str(out_path))
    return counts[True], counts[False], log
605
+
606
+
607
+ # ── CLI ───────────────────────────────────────────────────────────────────────
608
+
609
def main():
    """CLI entry: parse arguments, apply the manifest, print the apply log."""
    parser = argparse.ArgumentParser(description='Apply CR manifest to TS DOCX as tracked changes.')
    parser.add_argument('ts_docx', help='Target TS DOCX file')
    parser.add_argument('manifest', help='JSON manifest from cr_parser.py')
    parser.add_argument('--author', default=DEFAULT_AUTHOR, help='Tracked change author')
    parser.add_argument('--output', default=None, help='Output path (default: <ts>_applied.docx)')
    args = parser.parse_args()

    ts_path = Path(args.ts_docx)
    if args.output:
        out_path = Path(args.output)
    else:
        out_path = ts_path.parent / (ts_path.stem + '_applied.docx')

    with open(args.manifest, encoding='utf-8') as fh:
        manifest = json.load(fh)

    print(f'Applying {len(manifest)} change(s) from manifest to {ts_path.name}...')
    n_ok, n_skip, log = apply_manifest(ts_path, manifest, out_path, author=args.author)

    for entry in log:
        print(entry)
    print(f'\nResult: {n_ok} applied, {n_skip} skipped')
    print(f'Output: {out_path}')
630
+
631
+
632
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()