Spaces:
Running
Running
init
Browse files- README.md +9 -6
- app.py +459 -0
- requirements.txt +6 -0
- scripts/__pycache__/cr_parser.cpython-310.pyc +0 -0
- scripts/__pycache__/docx_helpers.cpython-310.pyc +0 -0
- scripts/__pycache__/fetch_crs.cpython-310.pyc +0 -0
- scripts/__pycache__/finalize_ts.cpython-310.pyc +0 -0
- scripts/__pycache__/ts_applicator.cpython-310.pyc +0 -0
- scripts/cr_parser.py +490 -0
- scripts/docx_helpers.py +494 -0
- scripts/fetch_crs.py +487 -0
- scripts/finalize_ts.py +370 -0
- scripts/map_sections.py +44 -0
- scripts/orchestrate_cr.py +361 -0
- scripts/ts_applicator.py +633 -0
README.md
CHANGED
|
@@ -1,10 +1,13 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
|
|
|
|
|
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: CR Application Tool
|
| 3 |
+
emoji: π
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: 1.35.0
|
| 8 |
+
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
+
Automated 3GPP/ETSI CR application tool.
|
| 13 |
+
Upload an Excel contribution list → preview accepted CRs → apply all changes → download ZIP.
|
app.py
ADDED
|
@@ -0,0 +1,459 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
CR Application Tool β Streamlit frontend.
|
| 4 |
+
|
| 5 |
+
Three-step UI:
|
| 6 |
+
1. UPLOAD β upload Excel contribution list
|
| 7 |
+
2. PREVIEW β review accepted CRs
|
| 8 |
+
3. RUNNING β pipeline subprocess with live log
|
| 9 |
+
4. DONE/ERROR β download ZIP of results
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import io
|
| 13 |
+
import json
|
| 14 |
+
import os
|
| 15 |
+
import subprocess
|
| 16 |
+
import sys
|
| 17 |
+
import threading
|
| 18 |
+
import time
|
| 19 |
+
import uuid
|
| 20 |
+
import zipfile
|
| 21 |
+
from datetime import datetime
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
|
| 24 |
+
import streamlit as st
|
| 25 |
+
|
| 26 |
+
# ββ Scripts dir (same folder as app.py / scripts/) βββββββββββββββββββββββββββ
|
| 27 |
+
SCRIPTS_DIR = Path(__file__).parent / "scripts"
|
| 28 |
+
sys.path.insert(0, str(SCRIPTS_DIR))
|
| 29 |
+
|
| 30 |
+
# ββ Session persistence βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 31 |
+
|
| 32 |
+
def _get_session_base() -> Path:
|
| 33 |
+
"""Use /data/cr_sessions if writable (HF persistent storage), else /tmp."""
|
| 34 |
+
candidate = Path("/data/cr_sessions")
|
| 35 |
+
try:
|
| 36 |
+
candidate.mkdir(parents=True, exist_ok=True)
|
| 37 |
+
probe = candidate / ".write_test"
|
| 38 |
+
probe.write_text("x")
|
| 39 |
+
probe.unlink()
|
| 40 |
+
return candidate
|
| 41 |
+
except OSError:
|
| 42 |
+
fallback = Path("/tmp/cr_sessions")
|
| 43 |
+
fallback.mkdir(parents=True, exist_ok=True)
|
| 44 |
+
return fallback
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
SESSION_BASE = _get_session_base()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def session_dir(sid: str) -> Path:
    """Return (creating if needed) the per-session folder under SESSION_BASE."""
    path = SESSION_BASE / sid
    path.mkdir(parents=True, exist_ok=True)
    return path
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _state_path(sid: str) -> Path:
    """Location of the session's persisted JSON state file."""
    return session_dir(sid) / "state.json"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def load_state(sid: str) -> dict | None:
    """Read the persisted session state; None when absent or unreadable."""
    path = _state_path(sid)
    if not path.exists():
        return None
    try:
        return json.loads(path.read_text())
    except Exception:
        # Best effort: a truncated or corrupt state file counts as missing.
        return None
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def save_state(sid: str, state: dict) -> None:
    """Persist session state as pretty JSON (non-JSON values coerced via str)."""
    payload = json.dumps(state, indent=2, default=str)
    _state_path(sid).write_text(payload)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def new_state(sid: str) -> dict:
    """Fresh session-state dict, positioned at the UPLOAD step."""
    blank = {
        "session_id": sid,
        "status": "upload",
        "excel_filename": None,
        "person_name": "Ly Thanh PHAN",
        "cr_list": [],
        "pid": None,
        "output_dir": None,
        "log_path": None,
        "started_at": None,
        "completed_at": None,
        "return_code": None,
    }
    return blank
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# ββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 91 |
+
|
| 92 |
+
def _rc_path(sid: str) -> Path:
    """File holding the pipeline subprocess's exit code, once it finishes."""
    return session_dir(sid) / "returncode"
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _run_and_save_rc(proc: subprocess.Popen, rc_path: Path) -> None:
|
| 97 |
+
"""Background thread: wait for process, write return code to disk."""
|
| 98 |
+
proc.wait()
|
| 99 |
+
rc_path.write_text(str(proc.returncode))
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def read_return_code(sid: str) -> int | None:
    """Exit code persisted by _run_and_save_rc, or None if absent/unparsable."""
    rc_file = _rc_path(sid)
    if not rc_file.exists():
        return None
    try:
        return int(rc_file.read_text().strip())
    except ValueError:
        return None
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def is_process_alive(pid: int) -> bool:
    """Best-effort liveness check using signal 0 (no signal is delivered).

    Returns True if a process with *pid* exists. PermissionError means the
    process exists but belongs to another user, so it must count as alive —
    the original code treated that case as dead.
    """
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # EPERM: process exists, we just lack the right to signal it.
        return True
    return True
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def tail_log(log_path: str, n: int = 100) -> str:
    """Last *n* lines of the log file joined as one string.

    Returns a placeholder message when the file does not exist yet
    (the subprocess may not have started writing).
    """
    log_file = Path(log_path)
    if not log_file.exists():
        return "(log not yet availableβ¦)"
    all_lines = log_file.read_text(errors="replace").splitlines()
    return "\n".join(all_lines[-n:])
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def parse_log_results(log_path: str) -> list[dict]:
    """Extract per-TS [OK]/[WARN]/[FAIL] entries from the log's Final Report.

    Lines before the first "Final Report" marker are ignored. Each matching
    line yields {"Status": tag, "TS": text-after-the-tag}.
    """
    log_file = Path(log_path)
    if not log_file.exists():
        return []
    report_started = False
    rows: list[dict] = []
    for line in log_file.read_text(errors="replace").splitlines():
        if "Final Report" in line:
            report_started = True
        if not report_started:
            continue
        for status in ("OK", "WARN", "FAIL"):
            marker = f"[{status}]"
            if marker in line:
                ts_name = line.split(marker, 1)[-1].strip()
                rows.append({"Status": status, "TS": ts_name})
                break
    return rows
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def peek_submitted_by(excel_path: Path, max_names: int = 20) -> list[str]:
    """Return unique non-empty SubmittedBy values from the Excel (best-effort).

    Supports legacy .xls (via xlrd) and .xlsx (via openpyxl). Any failure —
    missing file, missing sheet/column, import error — yields an empty list.
    """
    header_aliases = ("submittedby", "submitted by")
    collected: set[str] = set()
    try:
        suffix = excel_path.suffix.lower()
        if suffix == ".xls":
            import xlrd
            book = xlrd.open_workbook(str(excel_path))
            try:
                sheet = book.sheet_by_name("Contributions")
            except xlrd.XLRDError:
                sheet = book.sheet_by_index(0)
            header_row = [str(sheet.cell_value(0, c)).strip() for c in range(sheet.ncols)]
            col = next(
                (i for i, h in enumerate(header_row) if h.lower() in header_aliases),
                None,
            )
            if col is not None:
                for r in range(1, sheet.nrows):
                    value = str(sheet.cell_value(r, col)).strip()
                    if value:
                        collected.add(value)
        elif suffix == ".xlsx":
            import openpyxl
            book = openpyxl.load_workbook(str(excel_path), read_only=True, data_only=True)
            sheet = book["Contributions"] if "Contributions" in book.sheetnames else book.active
            row_iter = iter(sheet.iter_rows(values_only=True))
            header_row = [str(c).strip() if c is not None else "" for c in next(row_iter, [])]
            col = next(
                (i for i, h in enumerate(header_row) if h.lower() in header_aliases),
                None,
            )
            if col is not None:
                for row in row_iter:
                    value = str(row[col]).strip() if row[col] is not None else ""
                    # openpyxl yields None for empty cells; str() would make "None".
                    if value and value != "None":
                        collected.add(value)
        return sorted(collected)[:max_names]
    except Exception:
        return []
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def make_zip(output_dir: Path) -> bytes:
    """Zip every file under *output_dir* into an in-memory archive.

    Archive paths are relative to output_dir's parent, so entries keep the
    "output/..." prefix when unzipped.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        for entry in output_dir.rglob("*"):
            if not entry.is_file():
                continue
            archive.write(entry, entry.relative_to(output_dir.parent))
    return buffer.getvalue()
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# ββ Page config βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 202 |
+
# ββ Page config βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Must run before any other Streamlit rendering call in the script.
st.set_page_config(
    page_title="CR Application Tool",
    page_icon="π",
    layout="centered",
)
st.title("π CR Application Tool")
st.caption("Upload an ETSI/3GPP Excel contribution list β preview accepted CRs β apply all β download ZIP.")
|
| 209 |
+
|
| 210 |
+
# ββ Session init ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 211 |
+
def _activate_new_session() -> None:
    """Create a fresh session id/state and record it in Streamlit state + URL.

    Extracted because the original inlined this identical three-line block
    in three separate branches.
    """
    fresh = str(uuid.uuid4())
    st.session_state.sid = fresh
    st.session_state.state = new_state(fresh)
    st.query_params["sid"] = fresh


# ββ Session init ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
params = st.query_params

if "sid" not in st.session_state:
    # Prefer resuming the session named in the URL, if its state exists on
    # disk; otherwise start a brand-new session and publish its id in the URL.
    if "sid" in params and (existing := load_state(params["sid"])) is not None:
        st.session_state.sid = params["sid"]
        st.session_state.state = existing
    else:
        _activate_new_session()

sid: str = st.session_state.sid
state: dict = st.session_state.state
|
| 233 |
+
|
| 234 |
+
# ββ Sidebar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 235 |
+
# ββ Sidebar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Shows the (truncated) session id and lets the user resume a past session
# by pasting its full id.
with st.sidebar:
    st.header("Session")
    st.caption(f"ID: `{sid[:8]}β¦`")
    st.divider()
    st.subheader("Resume a session")
    resume_sid = st.text_input("Paste a session ID")
    if st.button("Resume") and resume_sid.strip():
        existing = load_state(resume_sid.strip())
        if existing:
            # Point this browser tab at the pasted session and reload the UI.
            st.session_state.sid = resume_sid.strip()
            st.session_state.state = existing
            st.query_params["sid"] = resume_sid.strip()
            st.rerun()
        else:
            st.error("Session not found.")

# ββ State machine βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# "upload" -> "preview" -> "running" -> "done"/"error"; drives which branch
# below renders on this rerun.
status: str = state["status"]
|
| 253 |
+
|
| 254 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 255 |
+
# UPLOAD
|
| 256 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 257 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# UPLOAD: collect the Excel contribution list and the contributor name.
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
if status == "upload":
    st.subheader("Step 1 β Upload contribution list")

    uploaded = st.file_uploader(
        "Excel contribution list (.xlsx or .xls)",
        type=["xlsx", "xls"],
    )
    person_name = st.text_input(
        "Contributor name (must match SubmittedBy column)",
        value=state.get("person_name", "Ly Thanh PHAN"),
    )

    if uploaded and st.button("Parse CR list β", type="primary"):
        # Persist the upload into the session folder so later steps (and
        # resumed sessions) can re-read it.
        excel_path = session_dir(sid) / uploaded.name
        excel_path.write_bytes(uploaded.getbuffer())

        with st.spinner("Parsing Excelβ¦"):
            try:
                # Imported lazily from scripts/ (added to sys.path at startup).
                from fetch_crs import parse_excel
                cr_list = parse_excel(str(excel_path), person_name)
                state["status"] = "preview"
                state["excel_filename"] = uploaded.name
                state["person_name"] = person_name
                # Rows become plain lists so the state stays JSON-serializable.
                state["cr_list"] = [list(row) for row in cr_list]
                save_state(sid, state)
                st.rerun()
            except Exception as exc:
                st.error(f"Failed to parse Excel: {exc}")
|
| 285 |
+
|
| 286 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 287 |
+
# PREVIEW
|
| 288 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 289 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# PREVIEW: show the accepted CRs, then launch the pipeline subprocess.
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
elif status == "preview":
    cr_list = state["cr_list"]
    st.subheader(f"Step 2 β {len(cr_list)} Accepted CR(s) found")

    if cr_list:
        import pandas as pd
        df = pd.DataFrame(cr_list, columns=["UID", "Title"])
        st.dataframe(df, use_container_width=True)
    else:
        st.warning(
            f"No Accepted CRs found for **{state['person_name']}** in this file."
        )
        # Diagnostic: show what names are in the SubmittedBy column
        excel_path = session_dir(sid) / state["excel_filename"]
        found_names = peek_submitted_by(excel_path)
        if found_names:
            st.info(
                "**Names found in SubmittedBy column** β copy the exact one into the field above and re-upload:\n\n"
                + "\n".join(f"- `{n}`" for n in found_names)
            )

    col1, col2 = st.columns(2)
    with col1:
        if st.button("β Back"):
            # Return to the upload step and drop the parsed list.
            state["status"] = "upload"
            state["cr_list"] = []
            save_state(sid, state)
            st.rerun()
    with col2:
        if cr_list and st.button("βΆ Start Pipeline", type="primary"):
            excel_path = session_dir(sid) / state["excel_filename"]
            output_dir = session_dir(sid) / "output"
            output_dir.mkdir(parents=True, exist_ok=True)
            log_path = session_dir(sid) / "pipeline.log"
            rc_path = _rc_path(sid)

            # Run the orchestrator with the same interpreter as this app.
            cmd = [
                sys.executable,
                str(SCRIPTS_DIR / "orchestrate_cr.py"),
                str(excel_path),
                state["person_name"],
                "--output-dir", str(output_dir),
            ]

            # The child inherits the open log fd, so the parent can close its
            # own handle immediately after spawning.
            log_file = open(str(log_path), "w")
            proc = subprocess.Popen(
                cmd,
                stdout=log_file,
                stderr=subprocess.STDOUT,
                env=os.environ.copy(),
            )
            log_file.close()

            # Background thread writes returncode file when process finishes
            threading.Thread(
                target=_run_and_save_rc,
                args=(proc, rc_path),
                daemon=True,
            ).start()

            # Keep the Popen handle for fast in-process polling; resumed
            # sessions fall back to the returncode file / PID check.
            st.session_state.proc = proc

            state["status"] = "running"
            state["pid"] = proc.pid
            state["output_dir"] = str(output_dir)
            state["log_path"] = str(log_path)
            state["started_at"] = datetime.now().isoformat()
            save_state(sid, state)
            st.rerun()
|
| 358 |
+
|
| 359 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 360 |
+
# RUNNING
|
| 361 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 362 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# RUNNING: poll the subprocess, refresh the live log every 2 seconds.
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
elif status == "running":
    pid = state["pid"]
    log_path = state["log_path"]

    # Determine whether process is still alive
    proc = st.session_state.get("proc")
    alive = False
    if proc is not None:
        # Same Streamlit process that launched it: poll the handle directly.
        alive = proc.poll() is None
    else:
        # Session reloaded β check returncode file, then PID
        rc = read_return_code(sid)
        if rc is None:
            alive = is_process_alive(pid)

    if alive:
        st.subheader("β³ Pipeline runningβ¦")
        st.info(f"PID {pid} β started {state.get('started_at', '')[:19]}")
        log_text = tail_log(log_path, 100)
        st.text_area("Live log (last 100 lines)", value=log_text, height=400)
        # Poll loop: sleep then rerun to refresh the log view.
        time.sleep(2)
        st.rerun()
    else:
        # Process finished β determine return code
        rc = read_return_code(sid)
        if rc is None and proc is not None:
            rc = proc.returncode
        state["return_code"] = rc
        state["completed_at"] = datetime.now().isoformat()
        # rc None (code never recorded) is treated as an error.
        state["status"] = "done" if rc == 0 else "error"
        save_state(sid, state)
        st.rerun()
|
| 394 |
+
|
| 395 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 396 |
+
# DONE / ERROR
|
| 397 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 398 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# DONE / ERROR: show the outcome, per-TS results, ZIP download, full log.
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
elif status in ("done", "error"):
    log_path = state.get("log_path", "")
    output_dir = Path(state.get("output_dir", ""))
    rc = state.get("return_code")

    if status == "done":
        st.success("β Pipeline completed successfully!")
    else:
        st.error(f"β Pipeline finished with errors (return code: {rc})")

    # Per-TS results table
    results = parse_log_results(log_path)
    if results:
        st.subheader("Results per TS")
        import pandas as pd

        df = pd.DataFrame(results)

        def _color_status(val):
            # Traffic-light CSS per status cell; empty string for unknown tags.
            return {
                "OK": "background-color: #d4edda; color: #155724",
                "WARN": "background-color: #fff3cd; color: #856404",
                "FAIL": "background-color: #f8d7da; color: #721c24",
            }.get(val, "")

        st.dataframe(
            df.style.map(_color_status, subset=["Status"]),
            use_container_width=True,
        )

    # Download ZIP
    if output_dir.exists() and any(output_dir.rglob("*")):
        st.subheader("Download results")
        zip_bytes = make_zip(output_dir)
        st.download_button(
            label="β¬ Download results ZIP",
            data=zip_bytes,
            file_name=f"cr_results_{sid[:8]}.zip",
            mime="application/zip",
            type="primary",
        )
    else:
        st.warning("Output directory is empty β nothing to download.")

    # Full log
    with st.expander("Full pipeline log"):
        if log_path and Path(log_path).exists():
            st.text(Path(log_path).read_text(errors="replace"))
        else:
            st.text("Log not found.")

    # Start new session
    st.divider()
    if st.button("Start new session"):
        new_sid = str(uuid.uuid4())
        st.session_state.sid = new_sid
        st.session_state.state = new_state(new_sid)
        # Drop the stale Popen handle from the previous run, if any.
        if "proc" in st.session_state:
            del st.session_state.proc
        st.query_params["sid"] = new_sid
        save_state(new_sid, st.session_state.state)
        st.rerun()
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit==1.35.0
|
| 2 |
+
python-docx==1.1.2
|
| 3 |
+
openpyxl==3.1.5
|
| 4 |
+
xlrd==2.0.1
|
| 5 |
+
lxml==5.2.2
|
| 6 |
+
requests==2.32.3
|
scripts/__pycache__/cr_parser.cpython-310.pyc
ADDED
|
Binary file (11.9 kB). View file
|
|
|
scripts/__pycache__/docx_helpers.cpython-310.pyc
ADDED
|
Binary file (13.1 kB). View file
|
|
|
scripts/__pycache__/fetch_crs.cpython-310.pyc
ADDED
|
Binary file (12.3 kB). View file
|
|
|
scripts/__pycache__/finalize_ts.cpython-310.pyc
ADDED
|
Binary file (9.04 kB). View file
|
|
|
scripts/__pycache__/ts_applicator.cpython-310.pyc
ADDED
|
Binary file (18.1 kB). View file
|
|
|
scripts/cr_parser.py
ADDED
|
@@ -0,0 +1,490 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
cr_parser.py β Parse a CR DOCX's tracked changes into a JSON manifest.
|
| 4 |
+
|
| 5 |
+
Each entry in the manifest is one of:
|
| 6 |
+
{"type": "text_replace", "location": {...}, "old": "...", "new": "..."}
|
| 7 |
+
{"type": "para_insert", "location": {...}, "paragraphs": [...]}
|
| 8 |
+
{"type": "row_insert", "location": {...}, "cells": [...]}
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python3 cr_parser.py <cr.docx> [--output manifest.json]
|
| 12 |
+
# or import: from cr_parser import parse_cr
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import argparse
|
| 16 |
+
import json
|
| 17 |
+
import re
|
| 18 |
+
import sys
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
import docx
|
| 22 |
+
from docx.oxml.ns import qn
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ββ Low-level text helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
|
| 27 |
+
def _del_text(elem):
    """Concatenate all w:delText descendants."""
    fragments = (node.text or '' for node in elem.findall('.//' + qn('w:delText')))
    return ''.join(fragments)
|
| 30 |
+
|
| 31 |
+
def _ins_text(elem):
    """Concatenate all w:t descendants (inside w:ins)."""
    fragments = (node.text or '' for node in elem.findall('.//' + qn('w:t')))
    return ''.join(fragments)
|
| 34 |
+
|
| 35 |
+
def _para_new_text(p_elem):
    """Text of a paragraph after accepting tracked changes (ins included, del excluded)."""
    fragments = (node.text or '' for node in p_elem.findall('.//' + qn('w:t')))
    return ''.join(fragments)
|
| 38 |
+
|
| 39 |
+
def _para_orig_text(p_elem):
    """Text of a paragraph as it exists in the TS (del included, ins excluded)."""
    chunks = []
    ins_tag = qn('w:ins')
    for node in p_elem.iter():
        if not node.text:
            continue
        if node.tag == qn('w:delText'):
            chunks.append(node.text)
        elif node.tag == qn('w:t'):
            # Skip inserted text: any w:t living under a w:ins ancestor.
            inside_ins = any(anc.tag == ins_tag for anc in node.iterancestors())
            if not inside_ins:
                chunks.append(node.text)
    return ''.join(chunks)
|
| 50 |
+
|
| 51 |
+
def _style_val(p_elem):
    """Paragraph style id (w:pStyle val), or None when no style is set."""
    props = p_elem.find(qn('w:pPr'))
    style = props.find(qn('w:pStyle')) if props is not None else None
    return style.get(qn('w:val')) if style is not None else None
|
| 59 |
+
|
| 60 |
+
def _is_rpr_ins(ins_elem):
    """True if w:ins is inside w:rPr β a formatting change, not a content insertion."""
    parent = ins_elem.getparent()
    return parent is not None and parent.tag == qn('w:rPr')
|
| 64 |
+
|
| 65 |
+
def _is_inserted_para(p_elem):
    """True if this paragraph's paragraph-mark is tracked as inserted (whole new para)."""
    props = p_elem.find(qn('w:pPr'))
    run_props = props.find(qn('w:rPr')) if props is not None else None
    return run_props is not None and run_props.find(qn('w:ins')) is not None
|
| 74 |
+
|
| 75 |
+
def _is_deleted_para(p_elem):
    """True if this paragraph's paragraph-mark is tracked as deleted (whole para deleted)."""
    props = p_elem.find(qn('w:pPr'))
    run_props = props.find(qn('w:rPr')) if props is not None else None
    return run_props is not None and run_props.find(qn('w:del')) is not None
|
| 84 |
+
|
| 85 |
+
def _is_fully_deleted_tbl(tbl_elem):
    """True if every row in the table is tracked as a row-level deletion."""
    rows = tbl_elem.findall(qn('w:tr'))
    if not rows:
        return False

    def _row_deleted(tr):
        # Row-level deletion is marked by w:del inside the row properties.
        props = tr.find(qn('w:trPr'))
        return props is not None and props.find(qn('w:del')) is not None

    return all(_row_deleted(tr) for tr in rows)
|
| 95 |
+
|
| 96 |
+
def _is_fully_inserted_tbl(tbl_elem):
    """True if every row in the table is tracked as a row-level insertion."""
    rows = tbl_elem.findall(qn('w:tr'))
    if not rows:
        return False

    def _row_inserted(tr):
        # Row-level insertion is marked by w:ins inside the row properties.
        props = tr.find(qn('w:trPr'))
        return props is not None and props.find(qn('w:ins')) is not None

    return all(_row_inserted(tr) for tr in rows)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# ββ Table helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 109 |
+
|
| 110 |
+
def _table_header(tbl_elem):
    """First row cell texts β used as table identifier."""
    header_row = tbl_elem.find(qn('w:tr'))
    if header_row is None:
        return []
    texts = []
    for cell in header_row.findall(qn('w:tc')):
        para = cell.find('.//' + qn('w:p'))
        texts.append('' if para is None else _para_new_text(para).strip())
    return texts
|
| 120 |
+
|
| 121 |
+
def _row_col0(tr_elem):
    """Col-0 text of a table row β used as row anchor."""
    first_cell = tr_elem.find(qn('w:tc'))
    if first_cell is None:
        return ''
    para = first_cell.find('.//' + qn('w:p'))
    return '' if para is None else _para_new_text(para).strip()
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# ββ Inline del+ins extraction (from a single paragraph) ββββββββββββββββββββββ
|
| 131 |
+
|
| 132 |
+
def _extract_inline_replacements(p_elem):
    """
    Return list of (old_text, new_text) pairs from del+ins sibling pairs.
    Handles: del-then-ins, ins-then-del, multi-fragment consecutive dels.
    Filters: whitespace-only dels with no adjacent ins, empty dels, rPr ins.
    """
    del_tag, ins_tag = qn('w:del'), qn('w:ins')
    siblings = list(p_elem)
    consumed: set = set()
    replacements = []

    for idx, node in enumerate(siblings):
        if idx in consumed or node.tag != del_tag:
            continue

        removed = _del_text(node)
        if not removed:
            # Paragraph-mark / line-break deletion carries no text β ignore.
            continue

        # Fold directly following w:del siblings into one deletion.
        cursor = idx + 1
        while cursor < len(siblings) and siblings[cursor].tag == del_tag:
            removed += _del_text(siblings[cursor])
            consumed.add(cursor)
            cursor += 1

        following = siblings[cursor] if cursor < len(siblings) else None
        preceding = siblings[idx - 1] if idx > 0 else None

        # Pair the deletion with an adjacent content insertion, preferring
        # the one after the merged del run (del-then-ins), else the one
        # before it (ins-then-del). rPr-level w:ins are formatting only.
        inserted = None
        if following is not None and following.tag == ins_tag and not _is_rpr_ins(following):
            inserted = _ins_text(following)
            consumed.add(cursor)
        elif preceding is not None and preceding.tag == ins_tag and not _is_rpr_ins(preceding):
            inserted = _ins_text(preceding)

        if inserted is None:
            if not removed.strip():
                continue  # whitespace artefact with no counterpart
            # Pure deletion (no replacement) β record with empty new text.
            replacements.append((removed, ''))
        else:
            replacements.append((removed, inserted))

    return replacements
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# ββ Table change extraction βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 187 |
+
|
| 188 |
+
def _parse_table(tbl_elem, changes, section_heading=''):
    """Extract tracked changes from a table, appending change dicts to *changes*.

    Emits two change types:
      row_insert   — a row whose <w:trPr> carries a <w:ins> mark (whole new row),
                     anchored after the nearest preceding non-inserted row.
      text_replace — inline del+ins pairs found inside individual cells.

    Parameters
    ----------
    tbl_elem : lxml element for <w:tbl>
    changes : list to append change dicts to (mutated in place)
    section_heading : text of the nearest preceding stable heading, stored in
        each change's location for later re-anchoring in the TS.
    """
    header = _table_header(tbl_elem)
    header_key = header[:3]  # first 3 columns are enough to match the table in the TS
    rows = tbl_elem.findall(qn('w:tr'))

    for tr_idx, tr in enumerate(rows):
        trPr = tr.find(qn('w:trPr'))

        # ── Tracked row insertion ─────────────────────────────────────────
        if trPr is not None and trPr.find(qn('w:ins')) is not None:
            # Anchor on the nearest preceding row that is NOT itself inserted,
            # identified by its first-column text.
            after_anchor = ''
            for prev_idx in range(tr_idx - 1, -1, -1):
                prev_tr = rows[prev_idx]
                prev_trPr = prev_tr.find(qn('w:trPr'))
                if prev_trPr is None or prev_trPr.find(qn('w:ins')) is None:
                    after_anchor = _row_col0(prev_tr)
                    break

            cells = []
            for tc in tr.findall(qn('w:tc')):
                tcPr = tc.find(qn('w:tcPr'))

                # Cell width in twips (None when absent or unparsable).
                width = None
                if tcPr is not None:
                    tcW = tcPr.find(qn('w:tcW'))
                    if tcW is not None:
                        try:
                            width = int(tcW.get(qn('w:w'), 0))
                        except (ValueError, TypeError):
                            width = None

                # vMerge with no w:val attribute means "continuation" of a
                # vertically merged cell above.
                is_vmerge = False
                if tcPr is not None:
                    vm = tcPr.find(qn('w:vMerge'))
                    if vm is not None and vm.get(qn('w:val')) is None:
                        is_vmerge = True

                # Text — prefer ins-marked text, fall back to all text.
                cell_ins_text = _ins_text(tc)
                p = tc.find('.//' + qn('w:p'))
                # BUG FIX: `if p` used lxml element truthiness, which is False
                # for an element with no children, so empty <w:p> cells were
                # wrongly treated as missing. Test identity explicitly.
                cell_text = cell_ins_text if cell_ins_text else (_para_new_text(p) if p is not None else '')
                style = _style_val(p) if p is not None else None

                cells.append({
                    'text': cell_text.strip(),
                    'width': width,
                    'vmerge': is_vmerge,
                    'style': style,
                })

            changes.append({
                'type': 'row_insert',
                'location': {
                    'kind': 'table_row',
                    'table_header': header_key,
                    'after_row_anchor': after_anchor,
                    'section_heading': section_heading,
                },
                'cells': cells,
            })
            continue

        # ── Cell-level text_replace ───────────────────────────────────────
        row_anchor = _row_col0(tr)
        tcs = tr.findall(qn('w:tc'))
        for col_idx, tc in enumerate(tcs):
            for p in tc.findall('.//' + qn('w:p')):
                for old_text, new_text in _extract_inline_replacements(p):
                    if not old_text:
                        continue
                    changes.append({
                        'type': 'text_replace',
                        'location': {
                            'kind': 'table_cell',
                            'table_header': header_key,
                            'row_anchor': row_anchor,
                            'col_idx': col_idx,
                            'section_heading': section_heading,
                        },
                        'old': old_text,
                        'new': new_text,
                    })
| 273 |
+
|
| 274 |
+
|
| 275 |
+
# ββ Body paragraph extraction βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 276 |
+
|
| 277 |
+
def _parse_body(body, changes):
    """
    Walk direct children of w:body, emitting changes.

    Change types emitted:
    section_replace — a contiguous block of fully-deleted elements (para and/or
                      table, tracked at the paragraph-mark / row level) followed
                      immediately by a contiguous block of fully-inserted elements.
                      The raw XML of ALL those CR elements is stored verbatim so
                      the applicator can transplant them directly into the TS —
                      exactly what Word does on a copy-paste.
    text_replace    — an inline del+ins pair inside an otherwise-stable paragraph.
    para_insert     — one or more wholly-new paragraphs with no corresponding
                      deletion (rare; kept for backward compatibility).

    Implemented as a small state machine over the body's children:
    sec_state transitions stable → del → sep → ins → (flush) → stable.
    """
    from lxml import etree

    # Text of the most recent non-empty stable paragraph; used as the anchor
    # for section_replace / para_insert locations.
    prev_stable_text = ''

    # ── Section-replace accumulator ───────────────────────────────────────────
    sec_del = []          # fully-deleted elements (CR del block)
    sec_sep = []          # empty/separator paragraphs between del and ins blocks
    sec_ins = []          # fully-inserted elements (CR ins block)
    sec_state = 'stable'  # 'stable' | 'del' | 'sep' | 'ins'
    sec_anchor = ''       # stable text immediately before the del block began

    def flush_section():
        # Emit one section_replace for the accumulated del/sep/ins blocks,
        # then reset the accumulator. No-op (just resets) when nothing gathered.
        nonlocal sec_state, sec_anchor
        if not sec_del and not sec_ins:
            sec_del.clear(); sec_sep.clear(); sec_ins.clear()
            sec_state = 'stable'
            return
        # The del_heading is the text content of the first deleted paragraph
        # (skips deleted tables — only 'p' tags are considered).
        del_heading = ''
        for e in sec_del:
            tag = e.tag.split('}')[-1] if '}' in e.tag else e.tag
            if tag == 'p':
                t = _del_text(e).strip() or _para_orig_text(e).strip()
                if t:
                    del_heading = t
                    break
        # Serialize all elements for the manifest (del + sep + ins), in
        # document order, so the applicator can transplant them verbatim.
        all_elems = sec_del + sec_sep + sec_ins
        elements_xml = [etree.tostring(e, encoding='unicode') for e in all_elems]
        has_del_table = any(
            (e.tag.split('}')[-1] if '}' in e.tag else e.tag) == 'tbl'
            for e in sec_del
        )
        changes.append({
            'type': 'section_replace',
            'location': {
                'kind': 'body',
                'del_heading': del_heading,
                'has_del_table': has_del_table,
                'anchor_text': sec_anchor,
            },
            'elements_xml': elements_xml,
        })
        sec_del.clear(); sec_sep.clear(); sec_ins.clear()
        sec_state = 'stable'

    # ── Para-insert accumulator (for standalone new paragraphs) ───────────────
    insert_group = []

    def flush_group():
        # Emit one para_insert for the accumulated standalone-inserted
        # paragraphs (dropping paragraphs with neither text nor style).
        if not insert_group:
            return
        paras = [
            {'text': _para_new_text(p).strip(), 'style': _style_val(p)}
            for p in insert_group
        ]
        paras = [p for p in paras if p['text'] or p['style']]
        if paras:
            changes.append({
                'type': 'para_insert',
                'location': {
                    'kind': 'body',
                    'anchor_text': prev_stable_text,
                },
                'paragraphs': paras,
            })
        insert_group.clear()

    for elem in body:
        tag = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag

        if tag == 'p':
            is_del = _is_deleted_para(elem)
            is_ins = _is_inserted_para(elem)
            is_empty = not _para_orig_text(elem).strip() and not _para_new_text(elem).strip()

            if is_del:
                # Start or continue the del block
                if sec_state == 'ins':
                    flush_section()  # ins before del = two separate section_replaces
                if sec_state == 'stable':
                    flush_group()
                    sec_anchor = prev_stable_text
                sec_state = 'del'
                sec_del.append(elem)

            elif is_ins:
                if sec_state in ('del', 'sep'):
                    # ins block follows a del block → part of section_replace
                    sec_state = 'ins'
                    sec_ins.append(elem)
                elif sec_state == 'ins':
                    sec_ins.append(elem)
                else:
                    # Standalone ins paragraph (no preceding del block)
                    flush_group()  # (should already be empty)
                    insert_group.append(elem)

            elif is_empty:
                if sec_state == 'del':
                    # Separator between del and ins blocks
                    sec_state = 'sep'
                    sec_sep.append(elem)
                elif sec_state in ('sep', 'ins'):
                    # Empty paragraph inside/after the ins block travels with it.
                    sec_ins.append(elem)
                else:
                    # Empty para in stable region → ignore for anchoring
                    pass

            else:
                # Stable (or inline-changed) paragraph: close any open
                # accumulators, then mine it for inline del+ins pairs.
                flush_section()
                flush_group()

                for old_text, new_text in _extract_inline_replacements(elem):
                    if not old_text:
                        continue
                    changes.append({
                        'type': 'text_replace',
                        'location': {
                            'kind': 'body_para',
                            'para_context': _para_orig_text(elem).strip(),
                        },
                        'old': old_text,
                        'new': new_text,
                    })

                # Remember this text as the next anchor, unless it is an
                # ellipsis placeholder like "[...]".
                orig = _para_orig_text(elem).strip()
                if orig and not re.fullmatch(r'\[\.[\s\.]*\]', orig):
                    prev_stable_text = orig

        elif tag == 'tbl':
            if _is_fully_deleted_tbl(elem):
                if sec_state == 'ins':
                    flush_section()
                if sec_state == 'stable':
                    flush_group()
                    sec_anchor = prev_stable_text
                sec_state = 'del'
                sec_del.append(elem)

            elif _is_fully_inserted_tbl(elem):
                if sec_state in ('del', 'sep', 'ins'):
                    sec_state = 'ins'
                    sec_ins.append(elem)
                else:
                    # Standalone fully-inserted table (no del block) → treat as section_replace
                    flush_group()
                    sec_anchor = prev_stable_text
                    sec_state = 'ins'
                    sec_ins.append(elem)

            else:
                # Table with inline cell changes
                flush_section()
                flush_group()
                _parse_table(elem, changes, section_heading=prev_stable_text)

    # Close any accumulator still open at the end of the body.
    flush_section()
    flush_group()
| 452 |
+
|
| 453 |
+
|
| 454 |
+
# ββ Public API ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 455 |
+
|
| 456 |
+
def parse_cr(cr_path, output_json=None):
    """
    Parse all tracked changes in a CR DOCX.
    Returns list of change dicts. Optionally saves to JSON.
    """
    document = docx.Document(str(cr_path))
    changes = []
    _parse_body(document.element.body, changes)

    if output_json:
        payload = json.dumps(changes, indent=2, ensure_ascii=False)
        Path(output_json).write_text(payload, encoding='utf-8')
    return changes
| 471 |
+
|
| 472 |
+
|
| 473 |
+
# ββ CLI βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 474 |
+
|
| 475 |
+
def main():
    """CLI entry point: parse a CR DOCX and emit its change manifest."""
    parser = argparse.ArgumentParser(
        description='Parse CR DOCX tracked changes into JSON manifest.')
    parser.add_argument('cr_docx', help='CR DOCX file path')
    parser.add_argument('--output', default=None,
                        help='Output JSON path (default: print to stdout)')
    args = parser.parse_args()

    changes = parse_cr(args.cr_docx, output_json=args.output)

    # With --output, confirm the write; otherwise dump the manifest to stdout.
    if args.output:
        print(f'Wrote {len(changes)} change(s) β {args.output}')
    else:
        print(json.dumps(changes, indent=2, ensure_ascii=False))
| 487 |
+
|
| 488 |
+
|
| 489 |
+
# Script entry point: `python cr_parser.py <cr.docx> [--output manifest.json]`.
if __name__ == '__main__':
    main()
scripts/docx_helpers.py
ADDED
|
@@ -0,0 +1,494 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Reusable helpers for applying CR changes to TS DOCX files.
|
| 3 |
+
Supports both direct editing AND tracked changes (review mode).
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import copy
|
| 7 |
+
import difflib
|
| 8 |
+
import re
|
| 9 |
+
|
| 10 |
+
from docx.oxml.ns import qn
|
| 11 |
+
from docx.oxml import OxmlElement
|
| 12 |
+
|
| 13 |
+
AUTHOR = "CR Application"
|
| 14 |
+
DATE = "2026-03-24T00:00:00Z"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# ββ Revision ID counter βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 18 |
+
|
| 19 |
+
def _get_max_id(doc):
|
| 20 |
+
max_id = 0
|
| 21 |
+
for el in doc.element.body.iter():
|
| 22 |
+
for key, val in el.attrib.items():
|
| 23 |
+
if key.endswith('}id'):
|
| 24 |
+
try:
|
| 25 |
+
max_id = max(max_id, int(val))
|
| 26 |
+
except ValueError:
|
| 27 |
+
pass
|
| 28 |
+
return max_id
|
| 29 |
+
|
| 30 |
+
class RevCounter:
    """Generates unique revision IDs that don't clash with existing ones."""

    def __init__(self, doc):
        # Start one past the highest w:id already present in the body.
        self._n = _get_max_id(doc) + 1

    def next(self):
        """Return the next id as a string and advance the counter."""
        current = self._n
        self._n = current + 1
        return str(current)
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 42 |
+
|
| 43 |
+
def _make_t(text, tag='w:t'):
    """Build a <w:t> (or <w:delText>) element carrying *text*.

    When the text starts or ends with a space/tab, xml:space="preserve" is
    set so Word does not strip the significant edge whitespace.
    """
    element = OxmlElement(tag)
    element.text = text or ''
    needs_preserve = bool(text) and (text[0] in ' \t' or text[-1] in ' \t')
    if needs_preserve:
        element.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
    return element
| 49 |
+
|
| 50 |
+
def _make_run(text):
    """Wrap *text* in a plain (unformatted) <w:r> run element."""
    run = OxmlElement('w:r')
    run.append(_make_t(text))
    return run
| 54 |
+
|
| 55 |
+
def _make_para_el(text, style_val):
    """Build a <w:p> element with paragraph style *style_val* and one run of *text*."""
    para = OxmlElement('w:p')

    # Paragraph properties carry the style reference.
    props = OxmlElement('w:pPr')
    style_ref = OxmlElement('w:pStyle')
    style_ref.set(qn('w:val'), style_val)
    props.append(style_ref)

    para.append(props)
    para.append(_make_run(text))
    return para
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ββ Section mapping βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
+
|
| 68 |
+
def map_sections(doc, clause_numbers):
    """
    Print and return paragraphs belonging to the given clause numbers.

    A section starts at a heading paragraph whose text contains the clause
    number and ends at the next heading paragraph with text (any heading
    style). Returns dict: {clause: [(index, para), ...]}
    """
    results = {c: [] for c in clause_numbers}
    in_section = None

    for i, para in enumerate(doc.paragraphs):
        text = para.text.strip()
        style = para.style.name
        # Case-insensitive heading test, used consistently below.
        is_heading = 'heading' in style.lower()

        matched = False
        for clause in clause_numbers:
            if clause in text and is_heading:
                in_section = clause
                print(f'\n=== [{i}] SECTION {clause} | style={style!r} ===')
                print(f' [{i}] "{text}"')
                results[clause].append((i, para))
                matched = True
                break

        if not matched and in_section:
            # BUG FIX: the end-of-section check previously required
            # 'Heading' (capital H) while the start check was
            # case-insensitive, so lowercase heading styles never
            # terminated a section and following content leaked in.
            if is_heading and text:
                print(f' --- end at [{i}] ({style})')
                in_section = None
            elif text:
                print(f' [{i}] style={style!r:16s} | "{text[:90]}"')
                results[in_section].append((i, para))

    return results
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def get_bullet_style_val(doc, fallback='B1'):
    """Return the first 3GPP bullet style id ('B1', 'B2', ...) used in *doc*,
    or *fallback* when no paragraph uses one."""
    for para in doc.paragraphs:
        pPr = para._element.find(qn('w:pPr'))
        if pPr is None:
            continue
        pStyle = pPr.find(qn('w:pStyle'))
        if pStyle is None:
            continue
        val = pStyle.get(qn('w:val'), '')
        # Bullet styles look like 'B' followed by digits.
        if val.startswith('B') and val[1:].isdigit():
            return val
    return fallback
| 111 |
+
|
| 112 |
+
def get_style_val(para):
    """Return the paragraph's explicit style id, or 'Normal' when no
    <w:pStyle> element is present."""
    pPr = para._element.find(qn('w:pPr'))
    if pPr is None:
        return 'Normal'
    pStyle = pPr.find(qn('w:pStyle'))
    if pStyle is None:
        return 'Normal'
    return pStyle.get(qn('w:val'))
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 122 |
+
# DIRECT EDIT MODE (no track changes)
|
| 123 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 124 |
+
|
| 125 |
+
def delete_para(para):
    """Remove a paragraph from the document entirely (no tracking)."""
    element = para._element
    parent = element.getparent()
    parent.remove(element)
| 129 |
+
|
| 130 |
+
def insert_para_after(ref_para, text, style_val='Normal'):
    """Insert one paragraph directly after *ref_para*. Returns the new element."""
    para_el = _make_para_el(text, style_val)
    ref_para._element.addnext(para_el)
    return para_el
| 135 |
+
|
| 136 |
+
def insert_paras_after(ref_para, items, style_val='Normal'):
    """
    Insert multiple paragraphs in order after ref_para using a moving pointer.
    items: list of str, or list of (text, style_val) tuples.
    Returns the last inserted element.
    """
    anchor = ref_para._element
    for item in items:
        if isinstance(item, tuple):
            text, sv = item
        else:
            text, sv = item, style_val
        para_el = _make_para_el(text, sv)
        anchor.addnext(para_el)
        anchor = para_el  # advance so order is preserved
    return anchor
| 149 |
+
|
| 150 |
+
def modify_para_text(para, old_text, new_text):
    """Replace old_text with new_text in a paragraph (collapses all runs).

    Raises ValueError when old_text does not occur in the paragraph text.
    Returns the updated full paragraph text.
    """
    current = para.text
    if old_text not in current:
        raise ValueError(f"Not found: {old_text!r}\nIn: {current!r}")
    replaced = current.replace(old_text, new_text)

    # Drop every existing run, then re-emit the whole text as one plain run.
    p_el = para._element
    for run in p_el.findall(qn('w:r')):
        p_el.remove(run)
    p_el.append(_make_run(replaced))
    return replaced
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 164 |
+
# TRACKED CHANGE MODE (review / redline mode)
|
| 165 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 166 |
+
|
| 167 |
+
def _ins_attr(rev, author, date):
    """Attribute dict for a <w:ins> mark: fresh revision id, author, date."""
    return {qn('w:id'): rev.next(), qn('w:author'): author, qn('w:date'): date}
| 169 |
+
|
| 170 |
+
def _del_attr(rev, author, date):
    """Attribute dict for a <w:del> mark: fresh revision id, author, date."""
    return {qn('w:id'): rev.next(), qn('w:author'): author, qn('w:date'): date}
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def tracked_insert_para_after(ref_para_or_el, text, style_val, rev,
                              author=AUTHOR, date=DATE):
    """
    Insert a new paragraph after ref_para_or_el with tracked insertion marks.
    Word will show it as an insertion in review mode.
    Returns the new XML element (use as next ref for chained inserts).
    """
    def stamped_ins():
        # A <w:ins> element carrying a fresh revision id + author + date.
        mark = OxmlElement('w:ins')
        for key, val in _ins_attr(rev, author, date).items():
            mark.set(key, val)
        return mark

    new_p = OxmlElement('w:p')

    # Paragraph properties: mark the paragraph mark itself as inserted.
    pPr = OxmlElement('w:pPr')
    pStyle = OxmlElement('w:pStyle')
    pStyle.set(qn('w:val'), style_val)
    pPr.append(pStyle)
    rPr = OxmlElement('w:rPr')
    rPr.append(stamped_ins())
    pPr.append(rPr)
    new_p.append(pPr)

    # Content run wrapped in its own <w:ins>.
    content_ins = stamped_ins()
    content_ins.append(_make_run(text))
    new_p.append(content_ins)

    # Accept either a python-docx Paragraph or a raw lxml element.
    ref_el = getattr(ref_para_or_el, '_element', ref_para_or_el)
    ref_el.addnext(new_p)
    return new_p
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def tracked_insert_paras_after(ref_para, items, rev, author=AUTHOR, date=DATE):
    """
    Insert multiple paragraphs in order with tracked insertion marks.
    items: list of str, or list of (text, style_val) tuples.
    Uses a moving pointer so document order is preserved.
    Returns the last inserted element.
    """
    anchor = ref_para._element
    for item in items:
        if isinstance(item, tuple):
            text, style_val = item
        else:
            text, style_val = item, 'Normal'
        anchor = tracked_insert_para_after(anchor, text, style_val, rev, author, date)
    return anchor
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def tracked_delete_para(para, rev, author=AUTHOR, date=DATE):
    """
    Mark a paragraph as deleted using tracked change marks.
    The paragraph stays in the document but Word shows it as struck-through red.

    Two steps: (1) mark the paragraph mark itself as deleted via pPr/rPr/w:del,
    (2) wrap every run in <w:del> and convert its <w:t> to <w:delText>.
    NOTE(review): only runs that are direct children of the paragraph are
    wrapped — runs inside hyperlinks/fields would presumably be missed; confirm
    against the documents this is used on.
    """
    p_el = para._element

    # Mark the paragraph mark as deleted (in pPr > rPr), creating the
    # property containers if they do not exist yet.
    pPr = p_el.find(qn('w:pPr'))
    if pPr is None:
        pPr = OxmlElement('w:pPr')
        p_el.insert(0, pPr)
    rPr = pPr.find(qn('w:rPr'))
    if rPr is None:
        rPr = OxmlElement('w:rPr')
        pPr.append(rPr)
    del_mark = OxmlElement('w:del')
    for k, v in _del_attr(rev, author, date).items():
        del_mark.set(k, v)
    rPr.append(del_mark)

    # Wrap every run in <w:del> and change <w:t> to <w:delText>
    runs = list(p_el.findall(qn('w:r')))
    for r in runs:
        # Record the run's position BEFORE detaching it, so the <w:del>
        # wrapper can be re-inserted at exactly the same index.
        idx = list(p_el).index(r)
        for t_el in r.findall(qn('w:t')):
            del_t = _make_t(t_el.text, 'w:delText')
            r.remove(t_el)
            r.append(del_t)
        del_wrap = OxmlElement('w:del')
        for k, v in _del_attr(rev, author, date).items():
            del_wrap.set(k, v)
        p_el.remove(r)
        del_wrap.append(r)
        p_el.insert(idx, del_wrap)
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def tracked_modify_para(para, old_text, new_text, rev, author=AUTHOR, date=DATE):
    """
    Replace old_text with new_text using tracked del+ins marks.
    Splits the paragraph into: [before][<w:del>old</w:del>][<w:ins>new</w:ins>][after]
    Word shows the old text struck through and new text underlined.

    Raises ValueError when old_text does not occur in the paragraph.
    Only the first occurrence is replaced; existing run formatting is collapsed.
    """
    current = para.text
    if old_text not in current:
        raise ValueError(f"Not found: {old_text!r}\nIn: {current!r}")

    head, _, tail = current.partition(old_text)
    p_el = para._element

    # Collapse: drop all existing runs and rebuild the content from text.
    for run in p_el.findall(qn('w:r')):
        p_el.remove(run)

    if head:
        p_el.append(_make_run(head))

    # Old text, marked as a tracked deletion.
    del_el = OxmlElement('w:del')
    for key, val in _del_attr(rev, author, date).items():
        del_el.set(key, val)
    deleted_run = OxmlElement('w:r')
    deleted_run.append(_make_t(old_text, 'w:delText'))
    del_el.append(deleted_run)
    p_el.append(del_el)

    # New text, marked as a tracked insertion.
    ins_el = OxmlElement('w:ins')
    for key, val in _ins_attr(rev, author, date).items():
        ins_el.set(key, val)
    ins_el.append(_make_run(new_text))
    p_el.append(ins_el)

    if tail:
        p_el.append(_make_run(tail))
| 300 |
+
|
| 301 |
+
|
| 302 |
+
def _char_diff(old, new):
|
| 303 |
+
"""
|
| 304 |
+
Return a list of (op, text) tuples for a minimal character-level diff.
|
| 305 |
+
op is one of 'keep', 'del', 'ins'.
|
| 306 |
+
|
| 307 |
+
Strategy: first tokenize into digit-runs, letter-runs, and single separator
|
| 308 |
+
characters so that separators like '-' or '.' are kept intact as their own
|
| 309 |
+
tokens; then match tokens with SequenceMatcher; finally apply char-level diff
|
| 310 |
+
within each replaced token pair for maximum granularity.
|
| 311 |
+
|
| 312 |
+
Examples:
|
| 313 |
+
('V18.2.0', 'V18.3.0') β
|
| 314 |
+
[('keep','V18.'), ('del','2'), ('ins','3'), ('keep','.0')]
|
| 315 |
+
('(2024-11)', '(2026-04)') β
|
| 316 |
+
[('keep','(202'), ('del','4'), ('ins','6'), ('keep','-'),
|
| 317 |
+
('del','11'), ('ins','04'), ('keep',')')]
|
| 318 |
+
"""
|
| 319 |
+
old_tokens = re.findall(r'\d+|[A-Za-z]+|.', old)
|
| 320 |
+
new_tokens = re.findall(r'\d+|[A-Za-z]+|.', new)
|
| 321 |
+
|
| 322 |
+
ops = []
|
| 323 |
+
matcher = difflib.SequenceMatcher(None, old_tokens, new_tokens, autojunk=False)
|
| 324 |
+
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
|
| 325 |
+
old_span = ''.join(old_tokens[i1:i2])
|
| 326 |
+
new_span = ''.join(new_tokens[j1:j2])
|
| 327 |
+
if tag == 'equal':
|
| 328 |
+
ops.append(('keep', old_span))
|
| 329 |
+
elif tag == 'replace':
|
| 330 |
+
# Within each replaced token span, apply char-level diff for finer granularity
|
| 331 |
+
cmatcher = difflib.SequenceMatcher(None, old_span, new_span, autojunk=False)
|
| 332 |
+
for ctag, ci1, ci2, cj1, cj2 in cmatcher.get_opcodes():
|
| 333 |
+
if ctag == 'equal':
|
| 334 |
+
ops.append(('keep', old_span[ci1:ci2]))
|
| 335 |
+
elif ctag == 'replace':
|
| 336 |
+
ops.append(('del', old_span[ci1:ci2]))
|
| 337 |
+
ops.append(('ins', new_span[cj1:cj2]))
|
| 338 |
+
elif ctag == 'delete':
|
| 339 |
+
ops.append(('del', old_span[ci1:ci2]))
|
| 340 |
+
elif ctag == 'insert':
|
| 341 |
+
ops.append(('ins', new_span[cj1:cj2]))
|
| 342 |
+
elif tag == 'delete':
|
| 343 |
+
ops.append(('del', old_span))
|
| 344 |
+
elif tag == 'insert':
|
| 345 |
+
ops.append(('ins', new_span))
|
| 346 |
+
return ops
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def tracked_modify_para_multi(para, replacements, rev, author=AUTHOR, date=DATE):
    """
    Apply multiple tracked del+ins replacements in a single paragraph pass.
    replacements: list of (old_text, new_text) tuples, applied in order of appearance.
    Each replacement uses character-level diff so only the minimally changed characters
    are marked as del/ins, with common characters kept as plain runs in between.
    Use this instead of calling tracked_modify_para twice (which would corrupt the XML).

    NOTE(review): validation checks each old_text against the full original
    text, but the application loop searches only the text remaining AFTER the
    previous replacement — an old_text occurring before an earlier match is
    silently skipped (the `continue` below); confirm this is intended.
    """
    full = para.text
    # Validate all targets up front so nothing is half-applied on failure.
    for old_text, _ in replacements:
        if old_text not in full:
            raise ValueError(f"Not found: {old_text!r}\nIn: {full!r}")

    p_el = para._element

    # Remove all existing runs (collapses any prior run formatting).
    for r in p_el.findall(qn('w:r')):
        p_el.remove(r)

    # Walk through the full text, emitting plain runs and char-level del+ins ops
    remaining = full
    for old_text, new_text in replacements:
        idx = remaining.find(old_text)
        if idx == -1:
            continue
        before = remaining[:idx]
        remaining = remaining[idx + len(old_text):]

        if before:
            p_el.append(_make_run(before))

        # Emit the minimal char-level diff for this replacement.
        for op, text in _char_diff(old_text, new_text):
            if op == 'keep':
                p_el.append(_make_run(text))
            elif op == 'del':
                del_el = OxmlElement('w:del')
                for k, v in _del_attr(rev, author, date).items():
                    del_el.set(k, v)
                r_del = OxmlElement('w:r')
                r_del.append(_make_t(text, 'w:delText'))
                del_el.append(r_del)
                p_el.append(del_el)
            elif op == 'ins':
                ins_el = OxmlElement('w:ins')
                for k, v in _ins_attr(rev, author, date).items():
                    ins_el.set(k, v)
                ins_el.append(_make_run(text))
                p_el.append(ins_el)

    # Emit any trailing text
    if remaining:
        p_el.append(_make_run(remaining))
| 401 |
+
|
| 402 |
+
|
| 403 |
+
def tracked_insert_table_row(tbl, cell_texts, rev, author=AUTHOR, date=DATE):
    """
    Insert a new row immediately after the last non-empty row in tbl, as a
    tracked insertion (Word "Track Changes" markup).

    Empty pre-allocated rows at the table bottom are skipped so the new
    content appears directly under the previous entry.

    The new row is deep-copied from the last content row so that ALL
    formatting (cell widths, borders, shading, paragraph style, run
    font/size) is inherited -- exactly as clicking "Insert Row Below"
    does in Word.

    Args:
        tbl: python-docx Table object to extend.
        cell_texts: list of strings, one per column; columns beyond
            len(cell_texts) are left empty.
        rev: revision id recorded on every w:ins element.
        author: tracked-change author name recorded in the XML.
        date: timestamp recorded in the XML.
    """
    tbl_el = tbl._tbl
    all_trs = tbl_el.findall(qn('w:tr'))

    # Find the last row that contains at least one non-empty <w:t> node.
    # This skips pre-allocated blank rows at the table bottom.
    last_content_tr = all_trs[-1]
    for tr in reversed(all_trs):
        if any(t.text and t.text.strip() for t in tr.findall('.//' + qn('w:t'))):
            last_content_tr = tr
            break

    # Deep-copy the last content row -- inherits all cell/paragraph/run formatting.
    new_tr = copy.deepcopy(last_content_tr)

    # Mark the row itself as a tracked insertion in <w:trPr>.
    trPr = new_tr.find(qn('w:trPr'))
    if trPr is None:
        trPr = OxmlElement('w:trPr')
        # trPr must be the first child of the w:tr element.
        new_tr.insert(0, trPr)
    # Drop any w:ins copied over from the template row before adding ours.
    for child in list(trPr):
        if child.tag == qn('w:ins'):
            trPr.remove(child)
    tr_ins = OxmlElement('w:ins')
    for k, v in _ins_attr(rev, author, date).items():
        tr_ins.set(k, v)
    trPr.append(tr_ins)

    # For each cell: extract the existing run's rPr, clear text content, insert new text.
    cells_in_new_tr = new_tr.findall(qn('w:tc'))
    for i, tc in enumerate(cells_in_new_tr):
        p = tc.find('.//' + qn('w:p'))
        if p is None:
            # Cell with no paragraph at all -- nothing to write into.
            continue

        # Capture the first run's rPr (font size, bold, etc.) before clearing.
        first_run_rpr = None
        for r in list(p.iter(qn('w:r'))):
            rpr = r.find(qn('w:rPr'))
            if rpr is not None:
                first_run_rpr = copy.deepcopy(rpr)
                break

        # Remove all non-pPr children (runs, ins, del, hyperlinks, etc.)
        for child in list(p):
            if child.tag != qn('w:pPr'):
                p.remove(child)

        # Ensure pPr exists with a paragraph-mark ins tracking element.
        pPr = p.find(qn('w:pPr'))
        if pPr is None:
            pPr = OxmlElement('w:pPr')
            p.insert(0, pPr)
        rPr = pPr.find(qn('w:rPr'))
        if rPr is None:
            rPr = OxmlElement('w:rPr')
            pPr.append(rPr)
        # Replace any inherited paragraph-mark w:ins with our own revision info.
        for child in list(rPr):
            if child.tag == qn('w:ins'):
                rPr.remove(child)
        p_ins_mark = OxmlElement('w:ins')
        for k, v in _ins_attr(rev, author, date).items():
            p_ins_mark.set(k, v)
        rPr.append(p_ins_mark)

        # Build new run, re-using the inherited rPr so font size / style matches.
        r_new = OxmlElement('w:r')
        if first_run_rpr is not None:
            r_new.append(first_run_rpr)
        text = cell_texts[i] if i < len(cell_texts) else ''
        r_new.append(_make_t(text))

        # Wrap the run in a tracked-insertion element.
        ins_el = OxmlElement('w:ins')
        for k, v in _ins_attr(rev, author, date).items():
            ins_el.set(k, v)
        ins_el.append(r_new)
        p.append(ins_el)

    last_content_tr.addnext(new_tr)
|
scripts/fetch_crs.py
ADDED
|
@@ -0,0 +1,487 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
fetch_crs.py β Download CRs and TSs from a 3GPP/ETSI Excel contribution list.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python3 fetch_crs.py <excel_path> <person_name> [--output-dir DIR]
|
| 7 |
+
|
| 8 |
+
Steps:
|
| 9 |
+
1. Parse Excel, filter Accepted CRs by person name
|
| 10 |
+
2. Download CR DOCXs via docfinder /find/tdoc/download
|
| 11 |
+
3. Parse CR cover pages to extract target TS spec + version
|
| 12 |
+
4. Download TS DOCXs via docfinder /find/docx
|
| 13 |
+
5. Print summary report
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import argparse
|
| 17 |
+
import os
|
| 18 |
+
import re
|
| 19 |
+
import sys
|
| 20 |
+
import time
|
| 21 |
+
import zipfile
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
|
| 24 |
+
import requests
|
| 25 |
+
|
| 26 |
+
BASE_URL = "https://organizedprogrammers-docfinder.hf.space"
|
| 27 |
+
_proxy = os.environ.get("http_proxy") or None
|
| 28 |
+
PROXIES = {"http": _proxy, "https": os.environ.get("https_proxy") or None}
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
# Path helpers
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
def wsl_path(p: str) -> str:
    """Translate a Windows-style path (e.g. C:\\dir) into its WSL mount
    equivalent (/mnt/c/dir); any other path is returned unchanged."""
    stripped = p.strip()
    looks_windows = (
        len(stripped) >= 2 and stripped[0].isalpha() and stripped[1] == ":"
    )
    if not looks_windows:
        return stripped
    letter = stripped[0].lower()
    tail = stripped[2:].replace("\\", "/")
    return f"/mnt/{letter}{tail}"
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ---------------------------------------------------------------------------
|
| 46 |
+
# Step 1 β Parse Excel
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
|
| 49 |
+
def parse_excel(excel_path: str, person_name: str):
    """
    Return list of (uid, title) for Accepted CRs matching person_name.
    Handles both .xls and .xlsx.
    """
    path = Path(wsl_path(excel_path))
    suffix = path.suffix.lower()

    # Dispatch on extension instead of an if/elif chain.
    handler = {".xls": _parse_xls, ".xlsx": _parse_xlsx}.get(suffix)
    if handler is None:
        raise ValueError(f"Unsupported file extension: {suffix!r}. Expected .xls or .xlsx")
    return handler(path, person_name)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _name_pattern(name: str) -> re.Pattern:
|
| 66 |
+
return re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _parse_xls(path: Path, person_name: str):
    """
    Parse a legacy .xls contribution list.

    Returns [(uid, title), ...] for rows with Type == "CR",
    Status == "Accepted" and person_name appearing (whole-word,
    case-insensitive) in the SubmittedBy column.

    Raises ValueError when a required column is absent.
    """
    try:
        import xlrd
    except ImportError:
        sys.exit("ERROR: xlrd is not installed. Run: pip install xlrd")

    wb = xlrd.open_workbook(str(path))
    # Try "Contributions" sheet first, fall back to first sheet
    try:
        ws = wb.sheet_by_name("Contributions")
    except xlrd.XLRDError:
        ws = wb.sheet_by_index(0)

    # Row 0 is headers; row 1 is an empty duplicate -- skip it
    headers = [str(ws.cell_value(0, c)).strip() for c in range(ws.ncols)]
    col = {h: i for i, h in enumerate(headers)}

    # BUG FIX: the previous `col.get("Uid") or col.get("UID") ...` chain
    # treated a column at index 0 as missing (0 is falsy). Use membership
    # tests so index 0 is handled correctly.
    def first_col(*names):
        return next((col[n] for n in names if n in col), None)

    uid_col = first_col("Uid", "UID", "uid")
    type_col = first_col("Type", "type")
    status_col = first_col("Status", "status")
    by_col = first_col("SubmittedBy", "Submitted By", "submittedby")
    title_col = first_col("Title", "title")

    for name, c in [("Uid", uid_col), ("Type", type_col),
                    ("Status", status_col), ("SubmittedBy", by_col)]:
        if c is None:
            raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")

    pattern = _name_pattern(person_name)
    results = []

    for r in range(2, ws.nrows):  # skip header + empty duplicate row
        uid = str(ws.cell_value(r, uid_col)).strip()
        doc_type = str(ws.cell_value(r, type_col)).strip()
        status = str(ws.cell_value(r, status_col)).strip()
        submitted_by = str(ws.cell_value(r, by_col)).strip()
        title = str(ws.cell_value(r, title_col)).strip() if title_col is not None else ""

        if doc_type != "CR":
            continue
        if status != "Accepted":
            continue
        if not pattern.search(submitted_by):
            continue

        results.append((uid, title))

    return results
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _parse_xlsx(path: Path, person_name: str):
    """
    Parse a modern .xlsx contribution list.

    Returns [(uid, title), ...] for rows with Type == "CR",
    Status == "Accepted" and person_name appearing (whole-word,
    case-insensitive) in the SubmittedBy column.

    Raises ValueError when a required column is absent.
    """
    try:
        import openpyxl
    except ImportError:
        sys.exit("ERROR: openpyxl is not installed. Run: pip install openpyxl")

    wb = openpyxl.load_workbook(str(path), read_only=True, data_only=True)
    ws = wb["Contributions"] if "Contributions" in wb.sheetnames else wb.active

    rows = iter(ws.iter_rows(values_only=True))

    # Row 0: headers
    header_row = next(rows)
    headers = [str(h).strip() if h is not None else "" for h in header_row]
    col = {h: i for i, h in enumerate(headers)}

    # Row 1: empty duplicate -- skip
    next(rows, None)

    # BUG FIX: the previous `col.get("Uid") or col.get("UID") ...` chain
    # treated a column at index 0 as missing (0 is falsy). Use membership
    # tests so index 0 is handled correctly.
    def first_col(*names):
        return next((col[n] for n in names if n in col), None)

    uid_col = first_col("Uid", "UID", "uid")
    type_col = first_col("Type", "type")
    status_col = first_col("Status", "status")
    by_col = first_col("SubmittedBy", "Submitted By", "submittedby")
    title_col = first_col("Title", "title")

    for name, c in [("Uid", uid_col), ("Type", type_col),
                    ("Status", status_col), ("SubmittedBy", by_col)]:
        if c is None:
            raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")

    pattern = _name_pattern(person_name)
    results = []

    for row in rows:
        def cell(c):
            # Rows may be shorter than the header row in read_only mode.
            v = row[c] if c < len(row) else None
            return str(v).strip() if v is not None else ""

        uid = cell(uid_col)
        doc_type = cell(type_col)
        status = cell(status_col)
        submitted_by = cell(by_col)
        title = cell(title_col) if title_col is not None else ""

        if not uid:
            continue
        if doc_type != "CR":
            continue
        if status != "Accepted":
            continue
        if not pattern.search(submitted_by):
            continue

        results.append((uid, title))

    return results
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# ---------------------------------------------------------------------------
|
| 178 |
+
# Step 2 β Download CR DOCXs
|
| 179 |
+
# ---------------------------------------------------------------------------
|
| 180 |
+
|
| 181 |
+
def download_cr(uid: str, cr_dir: Path):
    """
    Fetch the CR document for *uid* into *cr_dir* via the docfinder
    /find/tdoc/download endpoint.

    Returns:
        (docx_path, note) on success -- docx_path is the file to parse
        and note a short human-readable status for the summary.
        (None, error_msg) on failure.
    """
    dest = cr_dir / f"{uid}.docx"
    if dest.exists():
        return dest, "already existed"

    try:
        resp = requests.post(
            f"{BASE_URL}/find/tdoc/download",
            json={"doc_id": uid},
            proxies=PROXIES,
            timeout=60,
        )
    except requests.RequestException as e:
        return None, f"network error: {e}"

    if not resp.ok:
        return None, f"HTTP {resp.status_code}"

    payload = resp.content
    if not payload:
        return None, "empty response"

    dest.write_bytes(payload)

    # Some UIDs come back as a ZIP archive that wraps the real DOCX;
    # detect the ZIP magic number and unwrap the first .docx entry.
    if payload.startswith(b"PK\x03\x04"):
        try:
            with zipfile.ZipFile(dest) as zf:
                inner_docx = [n for n in zf.namelist() if n.endswith(".docx")]
                if inner_docx:
                    extracted_path = cr_dir / f"{uid}_extracted.docx"
                    with zf.open(inner_docx[0]) as src, open(extracted_path, "wb") as dst:
                        dst.write(src.read())
                    return extracted_path, "extracted from ZIP"
        except zipfile.BadZipFile:
            # Magic bytes lied (a plain DOCX is itself PK-framed) -- keep raw file.
            pass

    return dest, "downloaded"
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
# ---------------------------------------------------------------------------
|
| 232 |
+
# Step 3 β Parse CR Cover Pages
|
| 233 |
+
# ---------------------------------------------------------------------------
|
| 234 |
+
|
| 235 |
+
SPEC_PATTERN = re.compile(r"^\d{3}\s\d{3}$")
|
| 236 |
+
VERSION_PATTERN = re.compile(r"^\d+\.\d+\.\d+$")
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def parse_cr_cover(docx_path: Path):
    """
    Scan the CR cover table (tables[0]) for the target spec number and
    its current version.

    Returns:
        (spec_number, version) e.g. ("102 221", "18.3.0"), or
        (None, None) when either value cannot be located.
    """
    try:
        from docx import Document
    except ImportError:
        sys.exit("ERROR: python-docx is not installed. Run: pip install python-docx")

    try:
        doc = Document(str(docx_path))
    except Exception:
        return None, None

    if not doc.tables:
        return None, None

    cover = doc.tables[0]

    # Flatten every non-empty cell, in reading order.
    texts = []
    for row in cover.rows:
        for cell in row.cells:
            stripped = cell.text.strip()
            if stripped:
                texts.append(stripped)

    spec_number = None
    version = None

    for i, text in enumerate(texts):
        # First cell of form "NNN NNN" is the spec number.
        if spec_number is None and SPEC_PATTERN.match(text):
            spec_number = text

        nxt = texts[i + 1] if i + 1 < len(texts) else None

        # Version sits in the cell right after the "Current version:" label.
        if text == "Current version:" and nxt is not None and VERSION_PATTERN.match(nxt):
            version = nxt

        # Tolerate the label without a trailing colon as well.
        if text in ("Current version:", "Current version") and version is None:
            if nxt is not None and VERSION_PATTERN.match(nxt):
                version = nxt

    return spec_number, version
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
# ---------------------------------------------------------------------------
|
| 292 |
+
# Step 4 β Download TS DOCXs
|
| 293 |
+
# ---------------------------------------------------------------------------
|
| 294 |
+
|
| 295 |
+
def _is_html(resp: requests.Response) -> bool:
    """True when the response body looks like an HTML page rather than a
    DOCX binary (e.g. the HF Space cold-start splash screen)."""
    if "text/html" in resp.headers.get("content-type", ""):
        return True
    head = resp.content[:5].lower()
    return head in (b"<!doc", b"<html")
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
def download_ts(spec_number: str, version: str, ts_dir: Path,
                max_retries: int = 3, retry_delay: int = 10):
    """
    Download the TS DOCX for spec_number (e.g. "102 221") and version
    (e.g. "18.3.0") into ts_dir as ts_<spec>_v<version>.docx.

    Retries up to max_retries times when the HF Space returns an HTML
    loading page instead of the DOCX binary (happens on cold-start /
    brief restarts), sleeping retry_delay seconds between attempts.

    Returns:
        (filename, note) on success, or (None, error_msg) on failure.
    """
    spec_no_space = spec_number.replace(" ", "")
    filename = f"ts_{spec_no_space}_v{version}.docx"
    dest = ts_dir / filename

    # Cached from a previous run -- nothing to download.
    if dest.exists():
        return filename, "already existed"

    last_error = "no attempts made"
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.post(
                f"{BASE_URL}/find/docx",
                json={"doc_id": spec_number, "version": version},
                proxies=PROXIES,
                timeout=120,
            )
        except requests.RequestException as e:
            # Hard network failure: do not retry, the Space is unreachable.
            return None, f"network error: {e}"

        if not resp.ok:
            return None, f"HTTP {resp.status_code}"

        content = resp.content
        if not content:
            return None, "empty response"

        # Detect HTML splash page (HF Space cold-start) -- retry after a delay
        if _is_html(resp):
            last_error = f"got HTML instead of DOCX (attempt {attempt}/{max_retries})"
            if attempt < max_retries:
                print(f"\n [retry in {retry_delay}s β HF Space loadingβ¦]", flush=True)
                time.sleep(retry_delay)
                continue
            return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r}) after {max_retries} attempts"

        # Good binary response
        dest.write_bytes(content)

        # DOCX is a ZIP container; anything without the PK magic is garbage.
        if content[:2] != b"PK":
            dest.unlink()
            return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r})"

        # Verify the TS contains the expected spec number in its first paragraph
        try:
            import docx as _docx
            _doc = _docx.Document(dest)
            first_para = _doc.paragraphs[0].text if _doc.paragraphs else ''
            if spec_no_space not in first_para.replace(' ', ''):
                # API returned some other spec -- discard it and report.
                dest.unlink()
                return None, f"wrong TS returned by API: got {first_para[:80]!r} (expected spec {spec_no_space})"
        except Exception:
            pass  # Trust the ZIP check above

        note = "downloaded" if attempt == 1 else f"downloaded (after {attempt} attempts)"
        return filename, note

    # All attempts consumed by the HTML-retry path above.
    return None, last_error
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
# ---------------------------------------------------------------------------
|
| 373 |
+
# Main
|
| 374 |
+
# ---------------------------------------------------------------------------
|
| 375 |
+
|
| 376 |
+
def main():
    """CLI entry point: parse args, filter the Excel contribution list,
    download the matching CRs, parse their cover pages, download the
    target TSs, and print a summary report."""
    parser = argparse.ArgumentParser(
        description="Download CRs and TSs from a 3GPP/ETSI Excel contribution list."
    )
    parser.add_argument("excel_path", help="Path to .xls or .xlsx contribution list")
    parser.add_argument("person_name", help="Name to search for in SubmittedBy column")
    parser.add_argument(
        "--output-dir",
        default=str(Path.home() / "CR_Processing"),
        help="Base output directory (default: ~/CR_Processing)",
    )
    args = parser.parse_args()

    excel_path = wsl_path(args.excel_path)
    person_name = args.person_name
    output_dir = Path(wsl_path(args.output_dir)).expanduser()

    cr_dir = output_dir / "CRs"
    ts_dir = output_dir / "TS"
    cr_dir.mkdir(parents=True, exist_ok=True)
    ts_dir.mkdir(parents=True, exist_ok=True)

    # --- Step 1: Parse Excel ---
    print(f"Parsing Excel: {excel_path}")
    print(f"Filtering for: {person_name!r} | Type=CR | Status=Accepted\n")

    try:
        cr_list = parse_excel(excel_path, person_name)
    except Exception as e:
        sys.exit(f"ERROR parsing Excel: {e}")

    print(f"Found {len(cr_list)} matching CR(s).\n")

    if not cr_list:
        print("Nothing to download.")
        return

    # --- Step 2: Download CR DOCXs ---
    print("Downloading CRs...")
    cr_results = []  # list of (uid, docx_path_or_None, note)

    for uid, title in cr_list:
        print(f" [{uid}] ", end="", flush=True)
        docx_path, note = download_cr(uid, cr_dir)
        cr_results.append((uid, docx_path, note))
        if docx_path:
            print(f"OK ({note}) β {docx_path.name}")
        else:
            print(f"FAILED β {note}")

    print()

    # --- Step 3: Parse cover pages ---
    print("Parsing CR cover pages...")
    ts_targets = {}  # (spec_number, version) -> list of uids

    for uid, docx_path, note in cr_results:
        if docx_path is None:
            continue
        spec_number, version = parse_cr_cover(docx_path)
        if spec_number and version:
            key = (spec_number, version)
            ts_targets.setdefault(key, []).append(uid)
            print(f" [{uid}] β TS {spec_number} v{version}")
        else:
            print(f" [{uid}] WARNING: could not parse cover page (spec/version not found)")

    print()

    # --- Step 4: Download TSs ---
    print("Downloading TSs...")
    ts_results = []  # list of (spec_number, version, filename_or_None, note)

    for (spec_number, version), uids in ts_targets.items():
        print(f" [TS {spec_number} v{version}] ", end="", flush=True)
        filename, note = download_ts(spec_number, version, ts_dir)
        ts_results.append((spec_number, version, filename, note))
        if filename:
            # BUG FIX: previously printed the literal "(unknown)" here
            # instead of the downloaded TS filename.
            print(f"OK ({note}) β {filename}")
        else:
            print(f"FAILED β {note}")

    print()

    # --- Step 5: Summary ---
    print("=" * 50)
    print("=== fetch-crs summary ===")
    print(f"Person: {person_name}")
    print(f"Excel:  {excel_path}")
    print(f"CRs found: {len(cr_list)} (Accepted, Type=CR)")
    print()
    print("CRs downloaded:")
    for uid, docx_path, note in cr_results:
        if docx_path:
            print(f"  β {docx_path.name} [{note}]")
        else:
            print(f"  β {uid} β {note}")

    print()
    print("TSs downloaded:")
    for spec_number, version, filename, note in ts_results:
        if filename:
            # BUG FIX: previously printed the literal "(unknown)" here
            # instead of the downloaded TS filename.
            print(f"  β {filename} [{note}]")
        else:
            print(f"  β ts_{spec_number.replace(' ', '')} v{version} β {note}")

    print()
    print(f"Output: {output_dir}/")
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
if __name__ == "__main__":
|
| 487 |
+
main()
|
scripts/finalize_ts.py
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
finalize_ts.py β Add tracked-change metadata updates to a TS DOCX after CR application.
|
| 4 |
+
|
| 5 |
+
Three edits are made (all as tracked changes):
|
| 6 |
+
1. New row in the Change History table (second-to-last table, Annex V)
|
| 7 |
+
2. New row in the History table (last table, last page)
|
| 8 |
+
3. Version + date update in the first paragraph (title)
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python3 finalize_ts.py <ts_docx> <cr_docx> [--author "Name"] [--output <path>]
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import argparse
|
| 15 |
+
import re
|
| 16 |
+
import sys
|
| 17 |
+
from datetime import date, timedelta
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
|
| 20 |
+
import docx
|
| 21 |
+
|
| 22 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 23 |
+
from docx_helpers import (
|
| 24 |
+
RevCounter,
|
| 25 |
+
tracked_insert_table_row,
|
| 26 |
+
tracked_modify_para_multi,
|
| 27 |
+
AUTHOR,
|
| 28 |
+
DATE,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ββ Path helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
+
|
| 34 |
+
def to_wsl_path(p: str) -> str:
    """Convert a Windows path (e.g. C:\\dir\\file) to its WSL equivalent
    (/mnt/c/dir/file); other paths are returned unchanged.

    Generalized to accept any drive letter -- the previous version only
    recognized C: and D:, which silently broke for files on other drives.
    This also makes the helper consistent with fetch_crs.wsl_path.
    """
    if len(p) >= 3 and p[0].isalpha() and p[1] == ':' and p[2] == '\\':
        drive = p[0].lower()
        rest = p[2:].replace('\\', '/')
        return f'/mnt/{drive}{rest}'
    return p
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# ββ Date / version helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 44 |
+
|
| 45 |
+
def compute_pub_date():
    """
    Return (yyyy-mm, "Month YYYY") for the publication month.

    5-day rule: when five or fewer days remain before the first of next
    month, publish under next month; otherwise under the current month.
    """
    today = date.today()
    next_month_start = (today.replace(day=1) + timedelta(days=32)).replace(day=1)
    if (next_month_start - today).days <= 5:
        target = next_month_start
    else:
        target = today.replace(day=1)
    return target.strftime('%Y-%m'), target.strftime('%B %Y')
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def derive_new_version(v: str) -> str:
    """Bump X.Y.Z to X.(Y+1).0 -- minor increment, patch reset."""
    parts = v.split('.')
    bumped = [parts[0], str(int(parts[1]) + 1), '0'] + parts[3:]
    return '.'.join(bumped)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ββ CR metadata extraction ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
+
|
| 68 |
+
def extract_cr_metadata(cr_docx_path: str) -> dict:
    """
    Open the CR DOCX and read metadata from tables[0] (cover page table).
    Returns dict with keys:
        meeting_id, uid, cr_num, rev, cat, title, current_version

    meeting_id is normalised to 'BODY-NUMBER' (e.g. 'SET-121') when both
    parts parse; otherwise the raw meeting cell text is kept.  Any field
    that cannot be located stays an empty string.

    Raises:
        ValueError: if the CR document contains no tables at all.
    """
    doc = docx.Document(cr_docx_path)
    if not doc.tables:
        raise ValueError('CR has no tables β cannot extract metadata')

    tbl = doc.tables[0]

    # Collect all cell texts for scanning (row-major order; merged cells in
    # python-docx are repeated per grid slot, which is harmless here).
    cells = []
    for row in tbl.rows:
        for cell in row.cells:
            cells.append(cell.text.strip())

    # Defaults: every field is an empty string until found.
    meta = {
        'meeting_id': '',
        'uid': '',
        'cr_num': '',
        'rev': '',
        'cat': '',
        'title': '',
        'current_version': '',
    }

    # --- Meeting ID ---
    # Find cell containing "Meeting #" and parse e.g. "ETSI TC SET Meeting #121, Edinburgh..."
    meeting_text = ''
    for c in cells:
        if 'Meeting #' in c or 'Meeting#' in c:
            meeting_text = c
            break

    if meeting_text:
        # Body: word before "Meeting" (e.g. "SET")
        body_match = re.search(r'(\w+)\s+Meeting\s*#', meeting_text)
        body = body_match.group(1) if body_match else ''
        # Number: digits after "#"
        num_match = re.search(r'Meeting\s*#\s*(\d+)', meeting_text)
        number = num_match.group(1) if num_match else ''
        # Fall back to the raw cell text when either part is missing.
        meta['meeting_id'] = f'{body}-{number}' if body and number else meeting_text

    # --- UID ---
    # Pattern like SET(26)000019r1 or similar; first match anywhere wins.
    uid_pat = re.compile(r'[A-Z]+\(\d+\)\d+\S*')
    for c in cells:
        m = uid_pat.search(c)
        if m:
            meta['uid'] = m.group(0)
            break

    # --- Label-value scanning ---
    # Scan pairs: if a cell matches a label, the next non-empty cell is the value
    label_map = {
        'CR': 'cr_num',
        'Rev': 'rev',
        'Curr. vers': 'current_version',
        'Current version': 'current_version',
        'Cat': 'cat',
        'Category': 'cat',
    }
    title_next = False
    for i, c in enumerate(cells):
        stripped = c.strip().rstrip(':')

        # Title may span its own cell or be labelled; the first non-empty
        # cell after a 'Title' label is taken as the title value.
        if stripped.lower() in ('title', 'title of change'):
            title_next = True
            continue
        if title_next:
            if c.strip():
                meta['title'] = c.strip()
                title_next = False
            continue

        # NOTE(review): startswith() means a label like 'CR' also matches
        # cells starting with 'CR...'; later matches overwrite earlier ones.
        for label, key in label_map.items():
            if stripped == label or stripped.startswith(label):
                # Value is in the next non-empty cell (look ahead at most 3).
                for j in range(i + 1, min(i + 4, len(cells))):
                    val = cells[j].strip()
                    if val:
                        meta[key] = val
                        break
                break

    return meta
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# ββ Meeting ID format detection βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 160 |
+
|
| 161 |
+
def _detect_meeting_separator(tbl):
|
| 162 |
+
"""
|
| 163 |
+
Scan the meeting column (col index 1) of the Change History table bottom-up.
|
| 164 |
+
Find the last non-empty cell and detect the separator between body letters and
|
| 165 |
+
number, e.g. '#' in 'SET#115' or '-' in 'SET-119'.
|
| 166 |
+
Returns the detected separator character, defaulting to '#'.
|
| 167 |
+
"""
|
| 168 |
+
for row in reversed(tbl.rows):
|
| 169 |
+
cells = row.cells
|
| 170 |
+
if len(cells) > 1:
|
| 171 |
+
text = cells[1].text.strip()
|
| 172 |
+
if text:
|
| 173 |
+
m = re.search(r'[A-Za-z]([^A-Za-z0-9])\d', text)
|
| 174 |
+
if m:
|
| 175 |
+
return m.group(1)
|
| 176 |
+
return '#'
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
# ββ TS table locators βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 180 |
+
|
| 181 |
+
def find_change_history_table(ts_doc):
    """Locate the Change History table (second-to-last table of the TS).

    Validates that its final row has 8 or 9 columns.

    Raises:
        ValueError: if the document has fewer than two tables, or the
            candidate table's column count is neither 8 nor 9.
    """
    all_tables = ts_doc.tables
    if len(all_tables) < 2:
        raise ValueError('TS has fewer than 2 tables')
    candidate = all_tables[-2]
    width = len(candidate.rows[-1].cells)
    if width in (8, 9):
        return candidate
    raise ValueError(
        f'Change History table has {width} columns, expected 8 or 9'
    )
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def find_history_table(ts_doc):
    """Locate the History table (the very last table of the TS).

    Raises:
        ValueError: unless the final row has exactly 3 columns.
    """
    candidate = ts_doc.tables[-1]
    width = len(candidate.rows[-1].cells)
    if width != 3:
        raise ValueError(
            f'History table has {width} columns, expected 3'
        )
    return candidate
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
# ββ Update functions ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 207 |
+
|
| 208 |
+
def update_change_history_table(ts_doc, meta, pub_yyyy_mm, old_v, new_v, rev, author, date_str):
    """Insert one tracked-change row into the Change History table.

    Args:
        ts_doc: open python-docx Document of the TS.
        meta: dict produced by extract_cr_metadata (meeting_id, uid, cr_num,
            rev, cat, title, ...).
        pub_yyyy_mm: publication date string 'YYYY-MM'.
        old_v / new_v: previous and new spec version strings.
        rev: revision counter passed through to tracked_insert_table_row.
        author / date_str: tracked-change attribution.

    Returns:
        The list of cell texts inserted; its layout depends on the table's
        column count (9-column ETSI standard, two 8-column variants, or a
        truncated fallback).
    """
    tbl = find_change_history_table(ts_doc)
    ncols = len(tbl.rows[-1].cells)

    # Detect separator used in existing rows (e.g. '#' in 'SET#115', '-' in 'SET-119')
    # and reformat meeting_id accordingly so it matches the existing style.
    sep = _detect_meeting_separator(tbl)
    meeting_id = meta['meeting_id']  # always 'BODY-NUMBER' from extract_cr_metadata
    if sep != '-' and '-' in meeting_id:
        body, number = meeting_id.split('-', 1)
        meeting_id = f'{body}{sep}{number}'

    if ncols == 9:
        # Standard ETSI format: date | meeting | uid | cr | rev | cat | title | old_v | new_v
        cell_texts = [
            pub_yyyy_mm, meeting_id, meta['uid'],
            meta['cr_num'], meta['rev'], meta['cat'],
            meta['title'], old_v, new_v,
        ]
    elif ncols == 8:
        # Detect which 8-column variant this is by the first column header.
        first_header = tbl.rows[0].cells[0].text.strip() if tbl.rows else ''
        if re.search(r'[Dd]ate', first_header):
            # Date | meeting | uid | cr | rev | cat | title | new_v (no old_v)
            cell_texts = [
                pub_yyyy_mm, meeting_id, meta['uid'],
                meta['cr_num'], meta['rev'], meta['cat'],
                meta['title'], new_v,
            ]
        else:
            # meeting | uid | wg_doc | cr | rev | cat | title | new_v (no date, no old_v)
            cell_texts = [
                meeting_id, meta['uid'], '',
                meta['cr_num'], meta['rev'], meta['cat'],
                meta['title'], new_v,
            ]
    else:
        # Defensive fallback: truncate the 9-column row to fit.
        # NOTE(review): effectively unreachable, since find_change_history_table
        # already rejects tables that are not 8 or 9 columns wide.
        cell_texts = ([pub_yyyy_mm, meeting_id, meta['uid'],
                       meta['cr_num'], meta['rev'], meta['cat'],
                       meta['title'], old_v, new_v])[:ncols]

    tracked_insert_table_row(tbl, cell_texts, rev, author, date_str)
    return cell_texts
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
def update_history_table(ts_doc, new_v, pub_month_year, rev, author, date_str):
    """Append a tracked 'Publication' row to the History table (last page).

    Returns the list of cell texts inserted: version, month/year, kind.
    """
    history = find_history_table(ts_doc)
    row_values = [f'V{new_v}', pub_month_year, 'Publication']
    tracked_insert_table_row(history, row_values, rev, author, date_str)
    return row_values
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def update_title_para(ts_doc, old_v, new_v, old_date_str, new_date_str, rev, author, date_str):
    """Rewrite the cover-page title paragraph with tracked changes.

    Replaces V<old_v> with V<new_v> and (<old_date_str>) with
    (<new_date_str>) in the document's first paragraph, applying both
    substitutions in a single tracked multi-replace pass.
    """
    title_para = ts_doc.paragraphs[0]
    substitutions = [
        (f'V{old_v}', f'V{new_v}'),
        (f'({old_date_str})', f'({new_date_str})'),
    ]
    tracked_modify_para_multi(title_para, substitutions, rev, author, date_str)
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
# ββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 274 |
+
|
| 275 |
+
def main():
    """CLI entry point for finalize_ts.

    Reads CR metadata from the CR DOCX, then adds tracked-change metadata
    updates to the TS DOCX — a Change History row, a History row, and the
    title-paragraph version/date bump — and saves the result.
    """
    parser = argparse.ArgumentParser(
        description='Add tracked-change metadata updates to a TS DOCX after CR application.'
    )
    parser.add_argument('ts_docx', help='TS DOCX file to update')
    parser.add_argument('cr_docx', help='CR DOCX file to read metadata from')
    parser.add_argument('--author', default=AUTHOR, help='Tracked change author name')
    parser.add_argument('--output', default=None, help='Output path (default: <ts>_finalized.docx)')
    args = parser.parse_args()

    # Accept Windows paths from the command line and map them into WSL.
    ts_path = to_wsl_path(args.ts_docx)
    cr_path = to_wsl_path(args.cr_docx)

    # Determine output path
    if args.output:
        out_path = to_wsl_path(args.output)
    else:
        p = Path(ts_path)
        out_path = str(p.parent / (p.stem + '_finalized.docx'))

    print(f'TS: {ts_path}')
    print(f'CR: {cr_path}')
    print(f'Output: {out_path}')
    print()

    # Open documents
    ts_doc = docx.Document(ts_path)
    # NOTE(review): cr_doc is opened but never used here; extract_cr_metadata
    # re-opens the CR itself. Kept as an early validity check of the file.
    cr_doc = docx.Document(cr_path)

    # Extract metadata
    print('Extracting CR metadata...')
    meta = extract_cr_metadata(cr_path)
    print(f" Meeting ID: {meta['meeting_id']}")
    print(f" UID: {meta['uid']}")
    print(f" CR#: {meta['cr_num']}")
    print(f" Rev: {meta['rev']}")
    print(f" Category: {meta['cat']}")
    print(f" Title: {meta['title']}")
    print(f" Current version: {meta['current_version']}")
    print()

    # Compute derived values (publication month via 5-day rule, bumped version)
    pub_ym, pub_month_year = compute_pub_date()
    old_v = meta['current_version']
    new_v = derive_new_version(old_v)
    print(f'Old version: {old_v} β New version: {new_v}')
    print(f'Publication: {pub_month_year} ({pub_ym})')
    print()

    # Extract old date from first paragraph; title update is skipped if absent.
    title_text = ts_doc.paragraphs[0].text
    date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
    if not date_match:
        print(f'WARNING: Could not find date pattern (YYYY-MM) in first paragraph:')
        print(f' {title_text!r}')
        old_date_str = ''
    else:
        old_date_str = date_match.group(1)
        print(f'Title paragraph: {title_text!r}')
        print(f'Old date: {old_date_str} β New date: {pub_ym}')
    print()

    # Set up revision counter and tracked change date
    rev = RevCounter(ts_doc)
    tc_date = DATE  # ISO 8601 from docx_helpers

    # Apply changes
    print('Inserting row in Change History table (Annex V)...')
    ch_cells = update_change_history_table(ts_doc, meta, pub_ym, old_v, new_v, rev, args.author, tc_date)
    print(f' Row: {ch_cells}')

    print('Inserting row in History table (last page)...')
    h_cells = update_history_table(ts_doc, new_v, pub_month_year, rev, args.author, tc_date)
    print(f' Row: {h_cells}')

    if old_date_str:
        print('Updating title paragraph...')
        update_title_para(ts_doc, old_v, new_v, old_date_str, pub_ym, rev, args.author, tc_date)
        print(f' V{old_v} β V{new_v}, ({old_date_str}) β ({pub_ym})')
    else:
        print('Skipping title paragraph update (no date found).')

    # Save
    ts_doc.save(out_path)
    print()
    print(f'Saved: {out_path}')
    print()
    print('Summary of tracked changes:')
    print(f' [Change History] New row: {ch_cells}')
    print(f' [History] New row: {h_cells}')
    if old_date_str:
        print(f' [Title] V{old_v} β V{new_v}, ({old_date_str}) β ({pub_ym})')
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
if __name__ == '__main__':
|
| 370 |
+
main()
|
scripts/map_sections.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Diagnostic: print paragraphs of target clauses from a TS DOCX with indices and styles.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python3 map_sections.py <ts_path.docx> "11.1.22.3.2" "14.5.6"
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import sys
|
| 10 |
+
from docx import Document
|
| 11 |
+
|
| 12 |
+
def main():
    """Print the paragraphs of the requested clauses of a TS DOCX, with
    paragraph indices and style names, to help locate where CR changes land.

    Fix: the end-of-section check previously used a case-sensitive
    ``'Heading' in style`` test while the section-start check was
    case-insensitive, so sections that should end at a lower-cased heading
    style kept printing; both checks are now case-insensitive.
    """
    if len(sys.argv) < 3:
        print("Usage: map_sections.py <ts.docx> <clause1> [clause2 ...]")
        sys.exit(1)

    ts_path = sys.argv[1]
    clauses = sys.argv[2:]

    doc = Document(ts_path)
    in_section = None  # clause currently being dumped, or None

    for i, para in enumerate(doc.paragraphs):
        text = para.text.strip()
        style = para.style.name

        matched = False
        for clause in clauses:
            # Case-insensitive heading match ('Heading 3', 'heading 3', ...).
            if clause in text and 'heading' in style.lower():
                in_section = clause
                print(f'\n=== [{i}] SECTION {clause} | style={style!r} ===')
                print(f' [{i}] style={style!r:16s} | "{text}"')
                matched = True
                break

        if not matched and in_section:
            # Any other non-empty heading terminates the current section.
            if 'heading' in style.lower() and text:
                print(f' --- section ends at [{i}] style={style!r}: "{text[:60]}"')
                in_section = None
            elif text:
                print(f' [{i}] style={style!r:16s} | "{text[:100]}"')
|
| 42 |
+
|
| 43 |
+
if __name__ == '__main__':
|
| 44 |
+
main()
|
scripts/orchestrate_cr.py
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
orchestrate_cr.py β Fully automated CR application pipeline.
|
| 4 |
+
|
| 5 |
+
Reads an Excel contribution list, downloads all Accepted CRs and their target
|
| 6 |
+
TSs, parses tracked changes from each CR, applies them to the TS, and
|
| 7 |
+
finalises the document metadata β all without any per-CR manual scripting.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python3 orchestrate_cr.py <excel_path> [person_name] [--output-dir DIR] [--author NAME]
|
| 11 |
+
|
| 12 |
+
Arguments:
|
| 13 |
+
excel_path Path to .xls or .xlsx contribution list (Windows paths OK)
|
| 14 |
+
person_name Name to match in SubmittedBy column (default: "Ly Thanh PHAN")
|
| 15 |
+
|
| 16 |
+
Options:
|
| 17 |
+
--output-dir Base output folder (default: ~/CR_Processing)
|
| 18 |
+
--author Tracked-change author name (default: "CR Application")
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import argparse
|
| 22 |
+
import contextlib
|
| 23 |
+
import datetime
|
| 24 |
+
import io
|
| 25 |
+
import re
|
| 26 |
+
import sys
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
|
| 29 |
+
import docx as docx_lib
|
| 30 |
+
|
| 31 |
+
# ββ sys.path setup ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 32 |
+
SCRIPT_DIR = Path(__file__).parent
|
| 33 |
+
FETCH_SCRIPTS = SCRIPT_DIR.parent.parent / 'fetch-crs' / 'scripts'
|
| 34 |
+
sys.path.insert(0, str(SCRIPT_DIR))
|
| 35 |
+
sys.path.insert(0, str(FETCH_SCRIPTS))
|
| 36 |
+
|
| 37 |
+
from fetch_crs import parse_excel, download_cr, parse_cr_cover, download_ts, wsl_path
|
| 38 |
+
from cr_parser import parse_cr
|
| 39 |
+
from ts_applicator import apply_manifest
|
| 40 |
+
from finalize_ts import (
|
| 41 |
+
extract_cr_metadata,
|
| 42 |
+
compute_pub_date,
|
| 43 |
+
derive_new_version,
|
| 44 |
+
update_change_history_table,
|
| 45 |
+
update_history_table,
|
| 46 |
+
update_title_para,
|
| 47 |
+
)
|
| 48 |
+
from docx_helpers import RevCounter, AUTHOR as DEFAULT_AUTHOR, DATE as DEFAULT_DATE
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# ββ Display / logging helpers βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 52 |
+
|
| 53 |
+
def _section(title):
|
| 54 |
+
bar = '=' * 60
|
| 55 |
+
print(f'\n{bar}')
|
| 56 |
+
print(f' {title}')
|
| 57 |
+
print(bar)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class _TeeWriter:
|
| 61 |
+
"""Writes to both real stdout and a StringIO buffer simultaneously."""
|
| 62 |
+
def __init__(self, real, buf):
|
| 63 |
+
self._real = real
|
| 64 |
+
self._buf = buf
|
| 65 |
+
|
| 66 |
+
def write(self, s):
|
| 67 |
+
self._real.write(s)
|
| 68 |
+
self._buf.write(s)
|
| 69 |
+
|
| 70 |
+
def flush(self):
|
| 71 |
+
self._real.flush()
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# ββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 75 |
+
|
| 76 |
+
def main():
    """Orchestrate the full CR-application pipeline.

    Steps: (1) parse the Excel contribution list for the given person's
    Accepted CRs, (2) download each CR DOCX, (3) parse each CR cover page to
    group CRs by target TS/version, (4) download each TS, (5) parse and apply
    all tracked changes per TS, (6) finalise the TS metadata (Change History
    rows, History row, title paragraph), then print a summary report.
    Per-TS console output is tee'd to a .log file next to the output DOCX.
    """
    ap = argparse.ArgumentParser(
        description='Fully automated CR application pipeline.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument('excel_path', help='Path to .xls or .xlsx contribution list')
    ap.add_argument(
        'person_name',
        nargs='?',
        default='Ly Thanh PHAN',
        help='Name to match in SubmittedBy column (default: "Ly Thanh PHAN")',
    )
    ap.add_argument(
        '--output-dir',
        default=str(Path.home() / 'CR_Processing'),
        help='Base output directory (default: ~/CR_Processing)',
    )
    ap.add_argument(
        '--author',
        default=DEFAULT_AUTHOR,
        help=f'Tracked change author name (default: "{DEFAULT_AUTHOR}")',
    )
    args = ap.parse_args()

    excel_path = wsl_path(args.excel_path)
    output_dir = Path(wsl_path(args.output_dir)).expanduser()
    cr_dir = output_dir / 'CRs'
    ts_dir = output_dir / 'TS'  # spec subfolders created per-TS below
    cr_dir.mkdir(parents=True, exist_ok=True)
    ts_dir.mkdir(parents=True, exist_ok=True)

    author = args.author
    tc_date = DEFAULT_DATE

    # Step 1: Parse Excel contribution list
    _section('Step 1 β Parsing Excel')
    print(f'Excel: {excel_path}')
    print(f'Person: {args.person_name!r}')

    try:
        cr_list = parse_excel(excel_path, args.person_name)
    except Exception as e:
        sys.exit(f'ERROR parsing Excel: {e}')

    print(f'Found {len(cr_list)} Accepted CR(s):')
    for uid, title in cr_list:
        print(f' {uid}: {title[:80]}')

    if not cr_list:
        print('Nothing to process.')
        return

    # Step 2: Download each CR DOCX; failures are reported but non-fatal.
    _section('Step 2 β Downloading CR DOCXs')
    cr_paths = {}  # uid -> Path

    for uid, _ in cr_list:
        print(f' [{uid}] ', end='', flush=True)
        docx_path, note = download_cr(uid, cr_dir)
        if docx_path:
            cr_paths[uid] = docx_path
            print(f'OK ({note}) β {docx_path.name}')
        else:
            print(f'FAILED β {note}')

    # Step 3: Parse cover pages β group CRs by (spec_number, version)
    _section('Step 3 β Parsing CR cover pages')
    ts_groups = {}  # (spec_number, version) -> [uid, ...]
    uid_cover_failed = []

    for uid in cr_paths:
        spec_number, version = parse_cr_cover(cr_paths[uid])
        if spec_number and version:
            key = (spec_number, version)
            ts_groups.setdefault(key, []).append(uid)
            print(f' [{uid}] -> TS {spec_number} v{version}')
        else:
            uid_cover_failed.append(uid)
            print(f' [{uid}] WARNING: could not parse cover page β skipping')

    if not ts_groups:
        print('\nNo TSs identified. Nothing to apply.')
        return

    # Step 4: Download each target TS into a per-spec subfolder
    _section('Step 4 β Downloading TSs')
    ts_paths = {}  # (spec_number, version) -> Path
    spec_dirs = {}  # (spec_number, version) -> Path (per-spec subfolder)

    for (spec_number, version) in ts_groups:
        spec_compact = spec_number.replace(' ', '')
        spec_dir = ts_dir / spec_compact
        spec_dir.mkdir(parents=True, exist_ok=True)
        spec_dirs[(spec_number, version)] = spec_dir

        print(f' [TS {spec_number} v{version}] ', end='', flush=True)
        filename, note = download_ts(spec_number, version, spec_dir)
        if filename:
            ts_paths[(spec_number, version)] = spec_dir / filename
            print(f'OK ({note}) β {spec_compact}/(unknown)')
        else:
            print(f'FAILED β {note}')

    # Steps 5 & 6: Apply all CRs to each TS, then finalise its metadata
    _section('Steps 5 & 6 β Applying CRs and Finalising Metadata')
    report = []  # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)

    for (spec_number, version), uids in ts_groups.items():
        ts_key = f'TS {spec_number} v{version}'
        spec_compact = spec_number.replace(' ', '')
        spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
        spec_dir.mkdir(parents=True, exist_ok=True)

        # Derive new version early so filenames are known upfront
        new_v = derive_new_version(version)
        stem = f'ts_{spec_compact}_v{new_v}_was_v{version}'
        ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
        ts_final = spec_dir / f'{stem}.docx'
        log_path = spec_dir / f'{stem}.log'
        errors = []

        print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')

        if (spec_number, version) not in ts_paths:
            msg = 'TS download failed β skipping'
            print(f' SKIP: {msg}')
            report.append((ts_key, 0, 0, len(uids), None, log_path, [msg]))
            continue

        ts_in = ts_paths[(spec_number, version)]

        # All per-TS output is captured to log_buf (tee: stdout + file)
        log_buf = io.StringIO()
        tee = _TeeWriter(sys.stdout, log_buf)

        with contextlib.redirect_stdout(tee):
            log_header = (
                f'Pipeline Log\n'
                f'TS: {spec_number} v{version} -> v{new_v}\n'
                f'CRs: {", ".join(uids)}\n'
                f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
                f'{"=" * 60}\n'
            )
            print(log_header, end='')

            # 5a. Parse all CR manifests and combine
            combined_manifest = []
            participating_uids = []

            for uid in uids:
                if uid not in cr_paths:
                    errors.append(f'[{uid}] CR download had failed β skipped')
                    continue
                print(f' Parsing {uid}... ', end='', flush=True)
                try:
                    changes = parse_cr(cr_paths[uid])
                    combined_manifest.extend(changes)
                    participating_uids.append(uid)
                    print(f'{len(changes)} change(s)')
                except Exception as e:
                    errors.append(f'[{uid}] parse ERROR: {e}')
                    print(f'ERROR: {e}')

            if not combined_manifest:
                print(' No changes parsed β skipping apply step.')
                report.append((ts_key, 0, 0, len(uids), None, log_path,
                               errors + ['No changes parsed']))
                log_path.write_text(log_buf.getvalue(), encoding='utf-8')
                continue

            # 5b. Apply manifest to TS
            print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
            try:
                n_ok, n_skip, log_lines = apply_manifest(
                    ts_in, combined_manifest, ts_applied, author=author, date=tc_date
                )
            except Exception as e:
                errors.append(f'apply_manifest ERROR: {e}')
                print(f' ERROR: {e}')
                report.append((ts_key, 0, 0, len(uids), None, log_path, errors))
                log_path.write_text(log_buf.getvalue(), encoding='utf-8')
                continue

            for line in log_lines:
                print(f' {line}')
            print(f' -> Applied: {n_ok} Skipped: {n_skip}')

            # 6. Finalise metadata (Change History, History, title paragraph)
            print(' Finalising metadata...')
            try:
                ts_doc = docx_lib.Document(str(ts_applied))
                rev = RevCounter(ts_doc)

                pub_ym, pub_month_year = compute_pub_date()
                old_v = version

                # Extract old date string from first paragraph
                title_text = ts_doc.paragraphs[0].text
                date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
                old_date_str = date_match.group(1) if date_match else ''

                print(f' Version: {old_v} -> {new_v}')
                print(f' Publication: {pub_month_year} ({pub_ym})')

                # One Change History row per CR
                for uid in participating_uids:
                    try:
                        meta = extract_cr_metadata(str(cr_paths[uid]))
                        ch_cells = update_change_history_table(
                            ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
                        )
                        print(f' [Change History] {uid}: {ch_cells}')
                    except Exception as e:
                        errors.append(f'[{uid}] Change History ERROR: {e}')
                        print(f' [Change History] {uid}: ERROR β {e}')

                # One History row for the whole TS
                try:
                    h_cells = update_history_table(
                        ts_doc, new_v, pub_month_year, rev, author, tc_date
                    )
                    print(f' [History] {h_cells}')
                except Exception as e:
                    errors.append(f'History table ERROR: {e}')
                    print(f' [History] ERROR β {e}')

                # Title paragraph version + date
                if old_date_str:
                    try:
                        update_title_para(
                            ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
                        )
                        print(f' [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
                    except Exception as e:
                        errors.append(f'Title update ERROR: {e}')
                        print(f' [Title] ERROR β {e}')
                else:
                    print(f' [Title] SKIP β no (YYYY-MM) pattern in: {title_text!r}')

                ts_doc.save(str(ts_final))
                print(f' Saved: {spec_compact}/{ts_final.name}')
                print(f' Log: {spec_compact}/{log_path.name}')
                report.append((ts_key, n_ok, n_skip, len(uids), ts_final, log_path, errors))

            except Exception as e:
                errors.append(f'Finalisation ERROR: {e}')
                print(f' Finalisation ERROR: {e}')
                report.append((ts_key, n_ok, n_skip, len(uids), ts_applied, log_path, errors))

        # Write log file after the tee context exits
        log_path.write_text(log_buf.getvalue(), encoding='utf-8')

    # Final Report: classify each TS as fully OK / with warnings / failed
    _section('Final Report')
    n_success = sum(1 for r in report if r[4] is not None and not r[6])
    n_partial = sum(1 for r in report if r[4] is not None and r[6])
    n_failed = sum(1 for r in report if r[4] is None)

    print(f'Person: {args.person_name}')
    print(f'Excel: {excel_path}')
    print(f'CRs found: {len(cr_list)}')
    print(f'TSs updated: {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
    print()

    for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
        if out_path and not errors:
            status = 'OK'
        elif out_path:
            status = 'WARN'
        else:
            status = 'FAIL'
        print(f' [{status}] {ts_key}')
        print(f' CRs: {n_crs} | Body changes applied: {n_ok} | Skipped: {n_skip}')
        if out_path:
            print(f' Output: {out_path.parent.name}/{out_path.name}')
        if log_path and log_path.exists():
            print(f' Log: {log_path.parent.name}/{log_path.name}')
        for err in errors:
            print(f' ! {err}')

    print()
    print(f'Output directory: {output_dir}/')
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
if __name__ == '__main__':
|
| 361 |
+
main()
|
scripts/ts_applicator.py
ADDED
|
@@ -0,0 +1,633 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
ts_applicator.py β Apply a CR change manifest to a TS DOCX as tracked changes.
|
| 4 |
+
|
| 5 |
+
Reads a JSON manifest produced by cr_parser.py and applies every change
|
| 6 |
+
to the target TS using docx_helpers tracked-change primitives.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python3 ts_applicator.py <ts.docx> <manifest.json> [--author NAME] [--output path]
|
| 10 |
+
# or import: from ts_applicator import apply_manifest
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import json
|
| 15 |
+
import re
|
| 16 |
+
import sys
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
import docx
|
| 20 |
+
from docx.oxml import OxmlElement
|
| 21 |
+
from docx.oxml.ns import qn
|
| 22 |
+
|
| 23 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 24 |
+
from docx_helpers import (
|
| 25 |
+
RevCounter,
|
| 26 |
+
tracked_modify_para,
|
| 27 |
+
tracked_insert_paras_after,
|
| 28 |
+
AUTHOR as DEFAULT_AUTHOR,
|
| 29 |
+
DATE as DEFAULT_DATE,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ββ Text normalisation ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 34 |
+
|
| 35 |
+
def _norm(text):
|
| 36 |
+
"""Normalise non-breaking spaces and common Unicode dashes for comparison."""
|
| 37 |
+
return (text
|
| 38 |
+
.replace('\xa0', ' ')
|
| 39 |
+
.replace('\u2013', '-')
|
| 40 |
+
.replace('\u2014', '-')
|
| 41 |
+
.strip())
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _norm_ws(text):
|
| 45 |
+
"""
|
| 46 |
+
Strip all whitespace for structural matching.
|
| 47 |
+
|
| 48 |
+
ETSI TS files store structured paragraphs (references, abbreviations,
|
| 49 |
+
headings) with a TAB between the code and the body text, e.g.:
|
| 50 |
+
'[27]\\tGlobalPlatform: ...'
|
| 51 |
+
'CLT\\tContactLess Tunnelling'
|
| 52 |
+
'8.3\\tRAM implementation over HTTPS'
|
| 53 |
+
|
| 54 |
+
The CR's text extraction concatenates runs directly, losing the tab:
|
| 55 |
+
'[27]GlobalPlatform: ...'
|
| 56 |
+
'CLTContactLess Tunnelling'
|
| 57 |
+
'8.3RAM implementation over HTTPS'
|
| 58 |
+
|
| 59 |
+
Removing all whitespace from both sides before comparing solves this.
|
| 60 |
+
Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
|
| 61 |
+
"""
|
| 62 |
+
base = (text
|
| 63 |
+
.replace('\xa0', '')
|
| 64 |
+
.replace('\u2013', '-')
|
| 65 |
+
.replace('\u2014', '-'))
|
| 66 |
+
return re.sub(r'\s+', '', base)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# ββ Document search helpers βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 70 |
+
|
| 71 |
+
def _full_para_text(para):
    """All text content including w:t (normal/inserted) and w:delText (deleted runs)."""
    el = para._element
    pieces = [t.text or '' for t in el.findall('.//' + qn('w:t'))]
    pieces.extend(t.text or '' for t in el.findall('.//' + qn('w:delText')))
    return ''.join(pieces)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _find_para(doc, search_text, prefer_not_in_table=False):
    """
    Find the first paragraph containing search_text.
    Four levels of matching, in order of confidence:
      1.0 — exact substring match
      0.9 — NBSP/dash-normalised match (_norm)
      0.8 — whitespace-stripped match (_norm_ws) handles tab vs nothing in
            structured paragraphs (refs '[27]\\t...', abbrevs 'CLT\\t...', headings '8.3\\t...')
      0.6 — full XML text (including w:del content): handles anchors that were
            previously deleted by tracked_modify_para in an earlier apply step
    Returns (para, confidence) or (None, 0.0).
    """
    norm_search = _norm(search_text)
    ws_search = _norm_ws(search_text)
    # Each paragraph lands in at most one pool (elif chain), ordered by confidence.
    candidates_exact = []
    candidates_norm = []
    candidates_ws = []
    candidates_del = []

    for para in doc.paragraphs:
        pt = para.text
        if search_text in pt:
            candidates_exact.append(para)
        elif norm_search and norm_search in _norm(pt):
            candidates_norm.append(para)
        elif ws_search and ws_search in _norm_ws(pt):
            candidates_ws.append(para)
        else:
            # Level 4: check full XML text (catches deleted-but-still-present paragraphs)
            full_pt = _full_para_text(para)
            if search_text in full_pt:
                candidates_del.append(para)
            elif ws_search and ws_search in _norm_ws(full_pt):
                candidates_del.append(para)

    def _in_table(para):
        # True when any XML ancestor is a table cell (w:tc).
        p = para._element
        return any(a.tag == qn('w:tc') for a in p.iterancestors())

    # Highest-confidence non-empty pool wins; within it, optionally prefer
    # paragraphs that are NOT inside a table.
    for pool, conf in [(candidates_exact, 1.0), (candidates_norm, 0.9),
                       (candidates_ws, 0.8), (candidates_del, 0.6)]:
        if not pool:
            continue
        if prefer_not_in_table:
            body_only = [p for p in pool if not _in_table(p)]
            if body_only:
                return body_only[0], conf
        return pool[0], conf

    return None, 0.0
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _find_table_by_section(doc, section_heading):
    """
    Find the table immediately following a paragraph that contains section_heading.
    Checks both w:t (plain/inserted) and w:delText (tracked-deleted) so the match
    survives even after the heading was wrapped in a tracked deletion.
    Empty paragraphs between the heading and the table are tolerated.
    Returns (table, confidence) or (None, 0.0).
    """
    if not section_heading:
        return None, 0.0
    norm_h = _norm(section_heading)
    ws_h = _norm_ws(section_heading)
    heading_seen = False
    # Walk the document body in order; a <w:tbl> encountered while
    # heading_seen is still True is the table directly below the heading.
    for element in doc.element.body:
        tag = element.tag.split('}')[-1] if '}' in element.tag else element.tag
        if tag == 'p':
            t_text = ''.join(t.text or '' for t in element.findall('.//' + qn('w:t')))
            d_text = ''.join(t.text or '' for t in element.findall('.//' + qn('w:delText')))
            full = (t_text + d_text).strip()
            if not full:
                continue  # skip empty paras, keep heading_seen state
            if (section_heading in full
                    or norm_h in _norm(full)
                    or ws_h in _norm_ws(full)):
                heading_seen = True
            else:
                heading_seen = False  # non-matching non-empty para resets
        elif tag == 'tbl':
            if heading_seen:
                # Map the raw <w:tbl> element back to its python-docx Table wrapper.
                for tbl in doc.tables:
                    if tbl._tbl is element:
                        return tbl, 1.0
            heading_seen = False
    return None, 0.0
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _find_table(doc, header_key):
    """
    Find a table whose first row cell texts start with header_key.
    Returns (table, confidence) or (None, 0.0).

    NOTE(review): an empty header_key (or all-empty header cells) makes the
    all(...) check vacuously true, so the FIRST table in the document matches.
    Callers that may hit this case (see _apply_text_replace / _apply_row_insert)
    try _find_table_by_section first for that reason.
    """
    norm_key = [_norm(h) for h in header_key]

    for tbl in doc.tables:
        if not tbl.rows:
            continue
        first_row_texts = [_norm(c.text) for c in tbl.rows[0].cells]
        # Match by prefix (header_key may have fewer columns)
        match = all(
            i < len(first_row_texts) and norm_key[i] in first_row_texts[i]
            for i in range(len(norm_key))
        )
        if match:
            return tbl, 1.0

    return None, 0.0
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def _find_row(tbl, anchor_text):
    """
    Find first row in tbl where col-0 cell text contains anchor_text.
    Returns (row_idx, confidence) or (-1, 0.0).
    Three confidence levels: 1.0 exact, 0.9 norm, 0.8 whitespace-stripped.
    """
    anchor_norm = _norm(anchor_text)
    anchor_ws = _norm_ws(anchor_text)
    fallback_idx, fallback_conf = -1, 0.0
    for row_no, row in enumerate(tbl.rows):
        first_cell = row.cells[0].text if row.cells else ''
        if anchor_text in first_cell:
            return row_no, 1.0  # exact hit wins immediately
        if anchor_norm and anchor_norm in _norm(first_cell) and fallback_conf < 0.9:
            fallback_idx, fallback_conf = row_no, 0.9
        elif anchor_ws and anchor_ws in _norm_ws(first_cell) and fallback_conf < 0.8:
            fallback_idx, fallback_conf = row_no, 0.8
    return fallback_idx, fallback_conf
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
# ββ vMerge row insertion ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 209 |
+
|
| 210 |
+
def _build_new_tr(cells_data, rev, author, date):
    """
    Build and return a new tracked-insert <w:tr> element (does NOT insert it).
    cells_data: list of dicts with keys: text, width, vmerge, style.
    """
    def _ins_attr():
        # Fresh revision id for each tracked-change element; ids must be
        # unique across the whole document (hence rev.next() every call).
        return {qn('w:id'): rev.next(), qn('w:author'): author, qn('w:date'): date}

    def _make_t(text, tag='w:t'):
        t = OxmlElement(tag)
        t.text = text or ''
        # Word drops leading/trailing spaces in w:t unless xml:space="preserve".
        if text and (text[0] in (' ', '\t') or text[-1] in (' ', '\t')):
            t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
        return t

    def _make_run(text):
        r = OxmlElement('w:r')
        r.append(_make_t(text))
        return r

    new_tr = OxmlElement('w:tr')

    # trPr: tracked row insertion
    trPr = OxmlElement('w:trPr')
    tr_ins = OxmlElement('w:ins')
    for k, v in _ins_attr().items():
        tr_ins.set(k, v)
    trPr.append(tr_ins)
    new_tr.append(trPr)

    for cd in cells_data:
        tc = OxmlElement('w:tc')

        # Cell properties: explicit width (dxa units) and optional vertical merge.
        tcPr = OxmlElement('w:tcPr')
        tcW = OxmlElement('w:tcW')
        if cd.get('width'):
            tcW.set(qn('w:w'), str(cd['width']))
            tcW.set(qn('w:type'), 'dxa')
        tcPr.append(tcW)
        if cd.get('vmerge'):
            # Continuation cell of a vertical merge (w:vMerge with no val).
            vm = OxmlElement('w:vMerge')
            tcPr.append(vm)
        tc.append(tcPr)

        # One paragraph per cell; its paragraph mark is itself marked inserted
        # (w:ins inside pPr/rPr) so Word tracks the new row's paragraph marks.
        p = OxmlElement('w:p')
        pPr = OxmlElement('w:pPr')
        if cd.get('style'):
            pStyle = OxmlElement('w:pStyle')
            pStyle.set(qn('w:val'), cd['style'])
            pPr.append(pStyle)
        rPr_para = OxmlElement('w:rPr')
        pm_ins = OxmlElement('w:ins')
        for k, v in _ins_attr().items():
            pm_ins.set(k, v)
        rPr_para.append(pm_ins)
        pPr.append(rPr_para)
        p.append(pPr)

        # Cell text is wrapped in a tracked-insert run; vMerge continuation
        # cells stay empty (their content lives in the cell they merge into).
        if cd.get('text') and not cd.get('vmerge'):
            ins_el = OxmlElement('w:ins')
            for k, v in _ins_attr().items():
                ins_el.set(k, v)
            ins_el.append(_make_run(cd['text']))
            p.append(ins_el)

        tc.append(p)
        new_tr.append(tc)

    return new_tr
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def _insert_vmerge_row(tbl, after_row_idx, cells_data, rev, author, date):
    """
    Insert a tracked row after row[after_row_idx].
    cells_data: list of dicts with keys: text, width, vmerge, style.
    Returns the inserted <w:tr> element.
    """
    anchor_tr = tbl.rows[after_row_idx]._tr
    built_tr = _build_new_tr(cells_data, rev, author, date)
    anchor_tr.addnext(built_tr)
    return built_tr
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
# ββ Section replace (direct XML transplant) βββββββββββββββββββββββββββββββββββ
|
| 294 |
+
|
| 295 |
+
def _apply_section_replace(doc, change, rev, author, date, log):
    """
    Transplant a block of CR elements (del section + ins section) directly into
    the TS, replacing the old heading+table at the matching location.

    This mirrors what Word does on copy-paste: the exact XML from the CR is
    cloned into the TS, with only the tracked-change revision IDs remapped to
    avoid conflicts.

    Appends a human-readable OK/SKIP line to `log`; returns True on success.
    """
    from lxml import etree

    loc = change['location']
    del_heading = loc.get('del_heading', '')
    has_del_table = loc.get('has_del_table', False)
    elements_xml = change.get('elements_xml', [])

    if not elements_xml:
        log.append(' SKIP section_replace: no elements in manifest')
        return False

    # ── Find the TS paragraph that matches the deleted heading ──────────────
    ts_para_elem = None
    if del_heading:
        for para in doc.paragraphs:
            pt = para.text
            if del_heading in pt or _norm(del_heading) in _norm(pt):
                ts_para_elem = para._element
                break
        if ts_para_elem is None:
            # Fallback: include paragraphs whose XML text (inc. del runs) matches
            for para in doc.paragraphs:
                if del_heading in _full_para_text(para):
                    ts_para_elem = para._element
                    break

    if ts_para_elem is None:
        log.append(f' SKIP section_replace: del_heading {del_heading!r} not found in TS')
        return False

    ts_body = ts_para_elem.getparent()

    # ── Find the table immediately after the heading (if applicable) ────────
    ts_tbl_elem = None
    if has_del_table:
        found_para = False
        for sib in ts_body:
            if sib is ts_para_elem:
                found_para = True
                continue
            if not found_para:
                continue
            sib_tag = sib.tag.split('}')[-1] if '}' in sib.tag else sib.tag
            if sib_tag == 'p':
                # Allow empty paragraphs between heading and table
                if not (''.join(t.text or '' for t in sib.findall('.//' + qn('w:t')))).strip():
                    continue
                break  # non-empty paragraph before table → no table to remove
            elif sib_tag == 'tbl':
                ts_tbl_elem = sib
                break
            else:
                break

    # ── Clone and remap IDs on the CR elements ──────────────────────────────
    cloned = []
    for xml_str in elements_xml:
        # etree.fromstring already builds a fresh tree per call, so the
        # previous deepcopy of its result was pure overhead and is removed.
        cloned_elem = etree.fromstring(xml_str)
        # Remap w:id in all tracked-change elements (must be unique per document)
        for el in cloned_elem.iter():
            if el.get(qn('w:id')) is not None:
                el.set(qn('w:id'), rev.next())
        cloned.append(cloned_elem)

    # ── Insert cloned elements before the TS heading paragraph ──────────────
    insert_idx = list(ts_body).index(ts_para_elem)
    for i, elem in enumerate(cloned):
        ts_body.insert(insert_idx + i, elem)

    # ── Remove the now-replaced TS elements ─────────────────────────────────
    ts_body.remove(ts_para_elem)
    if ts_tbl_elem is not None:
        ts_body.remove(ts_tbl_elem)

    log.append(
        f' OK section_replace: {del_heading!r} → {len(elements_xml)} element(s) spliced in'
        f' (removed heading{"+ table" if has_del_table else ""})'
    )
    return True
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
# ββ Per-change-type applicators βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 389 |
+
|
| 390 |
+
def _apply_text_replace(doc, change, rev, author, date, log):
    """
    Apply one 'text_replace' change as a tracked modification.

    Supports two location kinds:
      * 'table_cell' — old text lives in a table cell (with a row anchor, or
        scanned across rows/columns when the anchor is empty)
      * 'body_para'  — old text lives in a body paragraph
    Returns True when applied, False (with a SKIP log line) otherwise.
    """
    loc = change['location']
    old = change['old']
    new = change['new']

    if loc['kind'] == 'table_cell':
        tbl, t_conf = _find_table(doc, loc['table_header'])
        if tbl is None:
            log.append(f" SKIP text_replace: table not found {loc['table_header'][:2]!r}")
            return False
        col_idx = loc['col_idx']
        row_anchor = loc['row_anchor']

        if row_anchor:
            # Anchored path: resolve the row by its col-0 text, then replace
            # inside the (row, col) cell only.
            row_idx, r_conf = _find_row(tbl, row_anchor)
            if row_idx < 0:
                log.append(f" SKIP text_replace: row anchor not found {row_anchor!r}")
                return False
            row = tbl.rows[row_idx]
            if col_idx >= len(row.cells):
                log.append(f" SKIP text_replace: col_idx {col_idx} out of range")
                return False
            cell = row.cells[col_idx]
            for para in cell.paragraphs:
                if old in para.text:
                    tracked_modify_para(para, old, new, rev, author, date)
                    log.append(f" OK text_replace (table_cell row={row_idx} col={col_idx}): {old!r} → {new!r}")
                    return True
            log.append(f" SKIP text_replace: old text {old!r} not in cell (row={row_idx} col={col_idx})")
            return False
        else:
            # Empty row anchor: scan all rows in col_idx.
            # Prefer the table that follows the section heading (e.g. "Thirty fifth byte:")
            # because all-empty table headers match any table.
            section_heading = loc.get('section_heading', '')
            tbl_by_section, _ = _find_table_by_section(doc, section_heading)
            if tbl_by_section is not None:
                tables_to_try = [tbl_by_section] + [t for t in doc.tables if t is not tbl_by_section]
            else:
                tables_to_try = [tbl] + [t for t in doc.tables if t is not tbl]
            for search_tbl in tables_to_try:
                for r_idx, row in enumerate(search_tbl.rows):
                    if col_idx >= len(row.cells):
                        continue
                    cell = row.cells[col_idx]
                    for para in cell.paragraphs:
                        if old in para.text:
                            tracked_modify_para(para, old, new, rev, author, date)
                            log.append(f" OK text_replace (table_cell scan row={r_idx} col={col_idx}): {old!r} → {new!r}")
                            return True
            # Final fallback: scan ALL columns of ALL tables
            _all_start = tbl_by_section if tbl_by_section is not None else tbl
            for search_tbl in [_all_start] + [t for t in doc.tables if t is not _all_start]:
                for r_idx, row in enumerate(search_tbl.rows):
                    for c_idx, cell in enumerate(row.cells):
                        for para in cell.paragraphs:
                            if old in para.text:
                                tracked_modify_para(para, old, new, rev, author, date)
                                log.append(f" OK text_replace (table_cell any_col row={r_idx} col={c_idx}): {old!r} → {new!r}")
                                return True
            log.append(f" SKIP text_replace: old text {old!r} not found in any table column")
            return False

    elif loc['kind'] == 'body_para':
        ctx = loc.get('para_context', '')
        # Try to find the paragraph by old text first
        para, conf = _find_para(doc, old, prefer_not_in_table=True)
        if para is None:
            # Fall back: find by paragraph context
            para, conf = _find_para(doc, ctx, prefer_not_in_table=True)
        if para is None:
            log.append(f" SKIP text_replace: old text {old!r} not found in TS")
            return False
        # Even a context-resolved paragraph must actually contain the exact
        # old text for the tracked replacement to be safe.
        if old in para.text:
            tracked_modify_para(para, old, new, rev, author, date)
            log.append(f" OK text_replace (body_para conf={conf:.1f}): {old!r} → {new!r}")
            return True
        log.append(f" SKIP text_replace: old text {old!r} not in resolved paragraph")
        return False

    log.append(f" SKIP text_replace: unknown kind {loc['kind']!r}")
    return False
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
def _apply_para_insert(doc, change, rev, author, date, log):
    """Insert the change's paragraphs after an anchor paragraph as tracked insertions."""
    paras_data = change.get('paragraphs', [])
    anchor_text = change['location'].get('anchor_text', '')
    if not paras_data:
        return True  # nothing to insert counts as success

    anchor_para, conf = _find_para(doc, anchor_text)
    if anchor_para is None:
        log.append(f" SKIP para_insert: anchor not found {anchor_text[:60]!r}")
        return False

    items = [(entry['text'], entry['style'] or 'Normal') for entry in paras_data]
    tracked_insert_paras_after(anchor_para, items, rev, author, date)
    first_text = paras_data[0]['text'][:50] if paras_data else ''
    log.append(f" OK para_insert ({len(paras_data)} para(s) after anchor conf={conf:.1f}): {first_text!r}...")
    return True
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
    """
    Insert one tracked table row described by change['cells'] after an anchor row.

    last_inserted is shared mutable state across calls (see apply_manifest):
    it maps (id(tbl._tbl), anchor_row_idx) → the last <w:tr> inserted there,
    so successive rows targeting the same anchor keep forward order.
    Returns True on success, False (with a SKIP log line) otherwise.
    """
    loc = change['location']

    # Prefer table located by section heading (handles ambiguous all-empty headers)
    section_heading = loc.get('section_heading', '')
    tbl_by_section, _ = _find_table_by_section(doc, section_heading)
    if tbl_by_section is not None:
        tbl = tbl_by_section
    else:
        tbl, t_conf = _find_table(doc, loc['table_header'])
        if tbl is None:
            log.append(f" SKIP row_insert: table not found {loc['table_header'][:2]!r}")
            return False

    after_anchor = loc.get('after_row_anchor', '')
    row_idx, r_conf = _find_row(tbl, after_anchor)
    if row_idx < 0:
        log.append(f" SKIP row_insert: anchor row not found {after_anchor!r}")
        return False

    cells_data = change.get('cells', [])

    # Fix insertion ordering: when multiple rows target the same (tbl, row_idx),
    # each new row should go AFTER the previously inserted one, not after row_idx.
    # last_inserted maps (tbl._tbl id, row_idx) → last w:tr element inserted there.
    key = (id(tbl._tbl), row_idx)
    if last_inserted is not None and key in last_inserted:
        # Insert after the previously inserted row to maintain forward order
        prev_tr = last_inserted[key]
        new_tr = _build_new_tr(cells_data, rev, author, date)
        prev_tr.addnext(new_tr)
        last_inserted[key] = new_tr
    else:
        new_tr = _insert_vmerge_row(tbl, row_idx, cells_data, rev, author, date)
        if last_inserted is not None:
            last_inserted[key] = new_tr

    # Column 1 is assumed to hold the row's descriptive text — TODO confirm
    # against the cells layout emitted by cr_parser.
    desc = cells_data[1]['text'] if len(cells_data) > 1 else '?'
    log.append(f" OK row_insert after row[{row_idx}] ({after_anchor!r}): {desc!r}")
    return True
|
| 532 |
+
|
| 533 |
+
|
| 534 |
+
# ββ Manifest pre-processing βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 535 |
+
|
| 536 |
+
def _merge_para_inserts(manifest):
|
| 537 |
+
"""
|
| 538 |
+
Merge consecutive para_insert entries that share the same anchor_text.
|
| 539 |
+
|
| 540 |
+
When the CR parser emits multiple para_insert entries for the same anchor
|
| 541 |
+
(because [...] context markers were transparent and kept prev_stable_text
|
| 542 |
+
unchanged), each would call tracked_insert_paras_after independently.
|
| 543 |
+
Since each call starts from the same anchor element and uses addnext(),
|
| 544 |
+
later groups push earlier groups down β producing reversed order.
|
| 545 |
+
|
| 546 |
+
Merging them into one entry ensures a single tracked_insert_paras_after
|
| 547 |
+
call that inserts all paragraphs in the correct forward order.
|
| 548 |
+
"""
|
| 549 |
+
result = []
|
| 550 |
+
for change in manifest:
|
| 551 |
+
if (change.get('type') == 'para_insert'
|
| 552 |
+
and result
|
| 553 |
+
and result[-1].get('type') == 'para_insert'
|
| 554 |
+
and result[-1]['location']['anchor_text'] == change['location']['anchor_text']):
|
| 555 |
+
result[-1]['paragraphs'].extend(change['paragraphs'])
|
| 556 |
+
else:
|
| 557 |
+
merged = dict(change)
|
| 558 |
+
if change.get('type') == 'para_insert':
|
| 559 |
+
merged['paragraphs'] = list(change['paragraphs'])
|
| 560 |
+
result.append(merged)
|
| 561 |
+
return result
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
# ββ Main apply function βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 565 |
+
|
| 566 |
+
def apply_manifest(ts_path, manifest, out_path, author=DEFAULT_AUTHOR, date=DEFAULT_DATE):
    """
    Apply all changes in manifest to ts_path, save to out_path.
    Returns (n_ok, n_skipped, log_lines).
    """
    doc = docx.Document(str(ts_path))
    # Single counter so every tracked-change revision id in the output is unique.
    rev = RevCounter(doc)
    log = []
    n_ok = 0
    n_skip = 0

    # Pre-process: collapse consecutive para_inserts that share one anchor
    # so they are inserted in forward order (see _merge_para_inserts).
    manifest = _merge_para_inserts(manifest)

    # Track last inserted <w:tr> per (tbl_id, anchor_row_idx) to maintain
    # forward insertion order when multiple row_inserts target the same anchor.
    last_inserted = {}

    # Dispatch each change to its type-specific applicator; unknown types
    # are logged and counted as skipped rather than raising.
    for change in manifest:
        ctype = change.get('type')
        ok = False

        if ctype == 'section_replace':
            ok = _apply_section_replace(doc, change, rev, author, date, log)
        elif ctype == 'text_replace':
            ok = _apply_text_replace(doc, change, rev, author, date, log)
        elif ctype == 'para_insert':
            ok = _apply_para_insert(doc, change, rev, author, date, log)
        elif ctype == 'row_insert':
            ok = _apply_row_insert(doc, change, rev, author, date, log, last_inserted=last_inserted)
        else:
            log.append(f" SKIP unknown change type: {ctype!r}")

        if ok:
            n_ok += 1
        else:
            n_skip += 1

    doc.save(str(out_path))
    return n_ok, n_skip, log
|
| 605 |
+
|
| 606 |
+
|
| 607 |
+
# ββ CLI βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 608 |
+
|
| 609 |
+
def main():
    """CLI entry point: load a JSON manifest and apply it to a TS DOCX."""
    parser = argparse.ArgumentParser(description='Apply CR manifest to TS DOCX as tracked changes.')
    parser.add_argument('ts_docx', help='Target TS DOCX file')
    parser.add_argument('manifest', help='JSON manifest from cr_parser.py')
    parser.add_argument('--author', default=DEFAULT_AUTHOR, help='Tracked change author')
    parser.add_argument('--output', default=None, help='Output path (default: <ts>_applied.docx)')
    args = parser.parse_args()

    ts_path = Path(args.ts_docx)
    if args.output:
        out_path = Path(args.output)
    else:
        out_path = ts_path.parent / (ts_path.stem + '_applied.docx')

    with open(args.manifest, encoding='utf-8') as fh:
        manifest = json.load(fh)

    print(f'Applying {len(manifest)} change(s) from manifest to {ts_path.name}...')
    n_ok, n_skip, log = apply_manifest(ts_path, manifest, out_path, author=args.author)

    for entry in log:
        print(entry)
    print(f'\nResult: {n_ok} applied, {n_skip} skipped')
    print(f'Output: {out_path}')
|
| 630 |
+
|
| 631 |
+
|
| 632 |
+
# Script entry point.
if __name__ == '__main__':
    main()
|