Spaces:
Running
Running
init
Browse files- README.md +9 -6
- app.py +459 -0
- requirements.txt +6 -0
- scripts/__pycache__/cr_parser.cpython-310.pyc +0 -0
- scripts/__pycache__/docx_helpers.cpython-310.pyc +0 -0
- scripts/__pycache__/fetch_crs.cpython-310.pyc +0 -0
- scripts/__pycache__/finalize_ts.cpython-310.pyc +0 -0
- scripts/__pycache__/ts_applicator.cpython-310.pyc +0 -0
- scripts/cr_parser.py +490 -0
- scripts/docx_helpers.py +494 -0
- scripts/fetch_crs.py +487 -0
- scripts/finalize_ts.py +370 -0
- scripts/map_sections.py +44 -0
- scripts/orchestrate_cr.py +361 -0
- scripts/ts_applicator.py +633 -0
README.md
CHANGED
|
@@ -1,10 +1,13 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
|
|
|
|
|
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: CR Application Tool
|
| 3 |
+
emoji: π
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: 1.35.0
|
| 8 |
+
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
+
Automated 3GPP/ETSI CR application tool.
|
| 13 |
+
Upload an Excel contribution list → preview accepted CRs → apply all changes → download ZIP.
|
app.py
ADDED
|
@@ -0,0 +1,459 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
CR Application Tool β Streamlit frontend.
|
| 4 |
+
|
| 5 |
+
Three-step UI:
|
| 6 |
+
1. UPLOAD β upload Excel contribution list
|
| 7 |
+
2. PREVIEW β review accepted CRs
|
| 8 |
+
3. RUNNING β pipeline subprocess with live log
|
| 9 |
+
4. DONE/ERROR β download ZIP of results
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import io
|
| 13 |
+
import json
|
| 14 |
+
import os
|
| 15 |
+
import subprocess
|
| 16 |
+
import sys
|
| 17 |
+
import threading
|
| 18 |
+
import time
|
| 19 |
+
import uuid
|
| 20 |
+
import zipfile
|
| 21 |
+
from datetime import datetime
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
|
| 24 |
+
import streamlit as st
|
| 25 |
+
|
| 26 |
+
# ββ Scripts dir (same folder as app.py / scripts/) βββββββββββββββββββββββββββ
|
| 27 |
+
SCRIPTS_DIR = Path(__file__).parent / "scripts"
|
| 28 |
+
sys.path.insert(0, str(SCRIPTS_DIR))
|
| 29 |
+
|
| 30 |
+
# ββ Session persistence βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 31 |
+
|
| 32 |
+
def _get_session_base() -> Path:
|
| 33 |
+
"""Use /data/cr_sessions if writable (HF persistent storage), else /tmp."""
|
| 34 |
+
candidate = Path("/data/cr_sessions")
|
| 35 |
+
try:
|
| 36 |
+
candidate.mkdir(parents=True, exist_ok=True)
|
| 37 |
+
probe = candidate / ".write_test"
|
| 38 |
+
probe.write_text("x")
|
| 39 |
+
probe.unlink()
|
| 40 |
+
return candidate
|
| 41 |
+
except OSError:
|
| 42 |
+
fallback = Path("/tmp/cr_sessions")
|
| 43 |
+
fallback.mkdir(parents=True, exist_ok=True)
|
| 44 |
+
return fallback
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
SESSION_BASE = _get_session_base()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def session_dir(sid: str) -> Path:
    """Return (creating if needed) the per-session folder under SESSION_BASE."""
    path = SESSION_BASE / sid
    path.mkdir(parents=True, exist_ok=True)
    return path
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _state_path(sid: str) -> Path:
    """Location of the session's persisted JSON state file."""
    return session_dir(sid) / "state.json"
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def load_state(sid: str) -> dict | None:
    """Read the persisted session state; None when absent or unreadable."""
    path = _state_path(sid)
    if not path.exists():
        return None
    try:
        return json.loads(path.read_text())
    except Exception:
        # Best effort: a truncated or corrupt state file counts as missing.
        return None
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def save_state(sid: str, state: dict) -> None:
    """Persist session state as pretty JSON (non-JSON values coerced via str)."""
    payload = json.dumps(state, indent=2, default=str)
    _state_path(sid).write_text(payload)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def new_state(sid: str) -> dict:
    """Fresh session-state dict, positioned at the UPLOAD step."""
    blank = {
        "session_id": sid,
        "status": "upload",
        "excel_filename": None,
        "person_name": "Ly Thanh PHAN",
        "cr_list": [],
        "pid": None,
        "output_dir": None,
        "log_path": None,
        "started_at": None,
        "completed_at": None,
        "return_code": None,
    }
    return blank
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# ββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 91 |
+
|
| 92 |
+
def _rc_path(sid: str) -> Path:
    """File holding the pipeline subprocess's exit code, once it finishes."""
    return session_dir(sid) / "returncode"
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _run_and_save_rc(proc: subprocess.Popen, rc_path: Path) -> None:
|
| 97 |
+
"""Background thread: wait for process, write return code to disk."""
|
| 98 |
+
proc.wait()
|
| 99 |
+
rc_path.write_text(str(proc.returncode))
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def read_return_code(sid: str) -> int | None:
    """Exit code persisted by _run_and_save_rc, or None if absent/unparsable."""
    rc_file = _rc_path(sid)
    if not rc_file.exists():
        return None
    try:
        return int(rc_file.read_text().strip())
    except ValueError:
        return None
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def is_process_alive(pid: int) -> bool:
    """Best-effort liveness check using signal 0 (no signal is delivered).

    Returns True if a process with *pid* exists. PermissionError means the
    process exists but belongs to another user, so it must count as alive —
    the original code treated that case as dead.
    """
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # EPERM: process exists, we just lack the right to signal it.
        return True
    return True
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def tail_log(log_path: str, n: int = 100) -> str:
    """Last *n* lines of the log file joined as one string.

    Returns a placeholder message when the file does not exist yet
    (the subprocess may not have started writing).
    """
    log_file = Path(log_path)
    if not log_file.exists():
        return "(log not yet availableβ¦)"
    all_lines = log_file.read_text(errors="replace").splitlines()
    return "\n".join(all_lines[-n:])
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def parse_log_results(log_path: str) -> list[dict]:
    """Extract per-TS [OK]/[WARN]/[FAIL] entries from the log's Final Report.

    Lines before the first "Final Report" marker are ignored. Each matching
    line yields {"Status": tag, "TS": text-after-the-tag}.
    """
    log_file = Path(log_path)
    if not log_file.exists():
        return []
    report_started = False
    rows: list[dict] = []
    for line in log_file.read_text(errors="replace").splitlines():
        if "Final Report" in line:
            report_started = True
        if not report_started:
            continue
        for status in ("OK", "WARN", "FAIL"):
            marker = f"[{status}]"
            if marker in line:
                ts_name = line.split(marker, 1)[-1].strip()
                rows.append({"Status": status, "TS": ts_name})
                break
    return rows
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def peek_submitted_by(excel_path: Path, max_names: int = 20) -> list[str]:
    """Return unique non-empty SubmittedBy values from the Excel (best-effort).

    Supports legacy .xls (via xlrd) and .xlsx (via openpyxl). Any failure —
    missing file, missing sheet/column, import error — yields an empty list.
    """
    header_aliases = ("submittedby", "submitted by")
    collected: set[str] = set()
    try:
        suffix = excel_path.suffix.lower()
        if suffix == ".xls":
            import xlrd
            book = xlrd.open_workbook(str(excel_path))
            try:
                sheet = book.sheet_by_name("Contributions")
            except xlrd.XLRDError:
                sheet = book.sheet_by_index(0)
            header_row = [str(sheet.cell_value(0, c)).strip() for c in range(sheet.ncols)]
            col = next(
                (i for i, h in enumerate(header_row) if h.lower() in header_aliases),
                None,
            )
            if col is not None:
                for r in range(1, sheet.nrows):
                    value = str(sheet.cell_value(r, col)).strip()
                    if value:
                        collected.add(value)
        elif suffix == ".xlsx":
            import openpyxl
            book = openpyxl.load_workbook(str(excel_path), read_only=True, data_only=True)
            sheet = book["Contributions"] if "Contributions" in book.sheetnames else book.active
            row_iter = iter(sheet.iter_rows(values_only=True))
            header_row = [str(c).strip() if c is not None else "" for c in next(row_iter, [])]
            col = next(
                (i for i, h in enumerate(header_row) if h.lower() in header_aliases),
                None,
            )
            if col is not None:
                for row in row_iter:
                    value = str(row[col]).strip() if row[col] is not None else ""
                    # openpyxl yields None for empty cells; str() would make "None".
                    if value and value != "None":
                        collected.add(value)
        return sorted(collected)[:max_names]
    except Exception:
        return []
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def make_zip(output_dir: Path) -> bytes:
    """Zip every file under *output_dir* into an in-memory archive.

    Archive paths are relative to output_dir's parent, so entries keep the
    "output/..." prefix when unzipped.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        for entry in output_dir.rglob("*"):
            if not entry.is_file():
                continue
            archive.write(entry, entry.relative_to(output_dir.parent))
    return buffer.getvalue()
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# ββ Page config βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 202 |
+
# ββ Page config βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Must run before any other Streamlit rendering call in the script.
st.set_page_config(
    page_title="CR Application Tool",
    page_icon="π",
    layout="centered",
)
st.title("π CR Application Tool")
st.caption("Upload an ETSI/3GPP Excel contribution list β preview accepted CRs β apply all β download ZIP.")
|
| 209 |
+
|
| 210 |
+
# ββ Session init ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 211 |
+
def _activate_new_session() -> None:
    """Create a fresh session id/state and record it in Streamlit state + URL.

    Extracted because the original inlined this identical three-line block
    in three separate branches.
    """
    fresh = str(uuid.uuid4())
    st.session_state.sid = fresh
    st.session_state.state = new_state(fresh)
    st.query_params["sid"] = fresh


# ββ Session init ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
params = st.query_params

if "sid" not in st.session_state:
    # Prefer resuming the session named in the URL, if its state exists on
    # disk; otherwise start a brand-new session and publish its id in the URL.
    if "sid" in params and (existing := load_state(params["sid"])) is not None:
        st.session_state.sid = params["sid"]
        st.session_state.state = existing
    else:
        _activate_new_session()

sid: str = st.session_state.sid
state: dict = st.session_state.state
|
| 233 |
+
|
| 234 |
+
# ββ Sidebar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 235 |
+
# ββ Sidebar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# Shows the (truncated) session id and lets the user resume a past session
# by pasting its full id.
with st.sidebar:
    st.header("Session")
    st.caption(f"ID: `{sid[:8]}β¦`")
    st.divider()
    st.subheader("Resume a session")
    resume_sid = st.text_input("Paste a session ID")
    if st.button("Resume") and resume_sid.strip():
        existing = load_state(resume_sid.strip())
        if existing:
            # Point this browser tab at the pasted session and reload the UI.
            st.session_state.sid = resume_sid.strip()
            st.session_state.state = existing
            st.query_params["sid"] = resume_sid.strip()
            st.rerun()
        else:
            st.error("Session not found.")

# ββ State machine βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# "upload" -> "preview" -> "running" -> "done"/"error"; drives which branch
# below renders on this rerun.
status: str = state["status"]
|
| 253 |
+
|
| 254 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 255 |
+
# UPLOAD
|
| 256 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 257 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# UPLOAD: collect the Excel contribution list and the contributor name.
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
if status == "upload":
    st.subheader("Step 1 β Upload contribution list")

    uploaded = st.file_uploader(
        "Excel contribution list (.xlsx or .xls)",
        type=["xlsx", "xls"],
    )
    person_name = st.text_input(
        "Contributor name (must match SubmittedBy column)",
        value=state.get("person_name", "Ly Thanh PHAN"),
    )

    if uploaded and st.button("Parse CR list β", type="primary"):
        # Persist the upload into the session folder so later steps (and
        # resumed sessions) can re-read it.
        excel_path = session_dir(sid) / uploaded.name
        excel_path.write_bytes(uploaded.getbuffer())

        with st.spinner("Parsing Excelβ¦"):
            try:
                # Imported lazily from scripts/ (added to sys.path at startup).
                from fetch_crs import parse_excel
                cr_list = parse_excel(str(excel_path), person_name)
                state["status"] = "preview"
                state["excel_filename"] = uploaded.name
                state["person_name"] = person_name
                # Rows become plain lists so the state stays JSON-serializable.
                state["cr_list"] = [list(row) for row in cr_list]
                save_state(sid, state)
                st.rerun()
            except Exception as exc:
                st.error(f"Failed to parse Excel: {exc}")
|
| 285 |
+
|
| 286 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 287 |
+
# PREVIEW
|
| 288 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 289 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# PREVIEW: show the accepted CRs, then launch the pipeline subprocess.
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
elif status == "preview":
    cr_list = state["cr_list"]
    st.subheader(f"Step 2 β {len(cr_list)} Accepted CR(s) found")

    if cr_list:
        import pandas as pd
        df = pd.DataFrame(cr_list, columns=["UID", "Title"])
        st.dataframe(df, use_container_width=True)
    else:
        st.warning(
            f"No Accepted CRs found for **{state['person_name']}** in this file."
        )
        # Diagnostic: show what names are in the SubmittedBy column
        excel_path = session_dir(sid) / state["excel_filename"]
        found_names = peek_submitted_by(excel_path)
        if found_names:
            st.info(
                "**Names found in SubmittedBy column** β copy the exact one into the field above and re-upload:\n\n"
                + "\n".join(f"- `{n}`" for n in found_names)
            )

    col1, col2 = st.columns(2)
    with col1:
        if st.button("β Back"):
            # Return to the upload step and drop the parsed list.
            state["status"] = "upload"
            state["cr_list"] = []
            save_state(sid, state)
            st.rerun()
    with col2:
        if cr_list and st.button("βΆ Start Pipeline", type="primary"):
            excel_path = session_dir(sid) / state["excel_filename"]
            output_dir = session_dir(sid) / "output"
            output_dir.mkdir(parents=True, exist_ok=True)
            log_path = session_dir(sid) / "pipeline.log"
            rc_path = _rc_path(sid)

            # Run the orchestrator with the same interpreter as this app.
            cmd = [
                sys.executable,
                str(SCRIPTS_DIR / "orchestrate_cr.py"),
                str(excel_path),
                state["person_name"],
                "--output-dir", str(output_dir),
            ]

            # The child inherits the open log fd, so the parent can close its
            # own handle immediately after spawning.
            log_file = open(str(log_path), "w")
            proc = subprocess.Popen(
                cmd,
                stdout=log_file,
                stderr=subprocess.STDOUT,
                env=os.environ.copy(),
            )
            log_file.close()

            # Background thread writes returncode file when process finishes
            threading.Thread(
                target=_run_and_save_rc,
                args=(proc, rc_path),
                daemon=True,
            ).start()

            # Keep the Popen handle for fast in-process polling; resumed
            # sessions fall back to the returncode file / PID check.
            st.session_state.proc = proc

            state["status"] = "running"
            state["pid"] = proc.pid
            state["output_dir"] = str(output_dir)
            state["log_path"] = str(log_path)
            state["started_at"] = datetime.now().isoformat()
            save_state(sid, state)
            st.rerun()
|
| 358 |
+
|
| 359 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 360 |
+
# RUNNING
|
| 361 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 362 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# RUNNING: poll the subprocess, refresh the live log every 2 seconds.
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
elif status == "running":
    pid = state["pid"]
    log_path = state["log_path"]

    # Determine whether process is still alive
    proc = st.session_state.get("proc")
    alive = False
    if proc is not None:
        # Same Streamlit process that launched it: poll the handle directly.
        alive = proc.poll() is None
    else:
        # Session reloaded β check returncode file, then PID
        rc = read_return_code(sid)
        if rc is None:
            alive = is_process_alive(pid)

    if alive:
        st.subheader("β³ Pipeline runningβ¦")
        st.info(f"PID {pid} β started {state.get('started_at', '')[:19]}")
        log_text = tail_log(log_path, 100)
        st.text_area("Live log (last 100 lines)", value=log_text, height=400)
        # Poll loop: sleep then rerun to refresh the log view.
        time.sleep(2)
        st.rerun()
    else:
        # Process finished β determine return code
        rc = read_return_code(sid)
        if rc is None and proc is not None:
            rc = proc.returncode
        state["return_code"] = rc
        state["completed_at"] = datetime.now().isoformat()
        # rc None (code never recorded) is treated as an error.
        state["status"] = "done" if rc == 0 else "error"
        save_state(sid, state)
        st.rerun()
|
| 394 |
+
|
| 395 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 396 |
+
# DONE / ERROR
|
| 397 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 398 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
# DONE / ERROR: show the outcome, per-TS results, ZIP download, full log.
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
elif status in ("done", "error"):
    log_path = state.get("log_path", "")
    output_dir = Path(state.get("output_dir", ""))
    rc = state.get("return_code")

    if status == "done":
        st.success("β Pipeline completed successfully!")
    else:
        st.error(f"β Pipeline finished with errors (return code: {rc})")

    # Per-TS results table
    results = parse_log_results(log_path)
    if results:
        st.subheader("Results per TS")
        import pandas as pd

        df = pd.DataFrame(results)

        def _color_status(val):
            # Traffic-light CSS per status cell; empty string for unknown tags.
            return {
                "OK": "background-color: #d4edda; color: #155724",
                "WARN": "background-color: #fff3cd; color: #856404",
                "FAIL": "background-color: #f8d7da; color: #721c24",
            }.get(val, "")

        st.dataframe(
            df.style.map(_color_status, subset=["Status"]),
            use_container_width=True,
        )

    # Download ZIP
    if output_dir.exists() and any(output_dir.rglob("*")):
        st.subheader("Download results")
        zip_bytes = make_zip(output_dir)
        st.download_button(
            label="β¬ Download results ZIP",
            data=zip_bytes,
            file_name=f"cr_results_{sid[:8]}.zip",
            mime="application/zip",
            type="primary",
        )
    else:
        st.warning("Output directory is empty β nothing to download.")

    # Full log
    with st.expander("Full pipeline log"):
        if log_path and Path(log_path).exists():
            st.text(Path(log_path).read_text(errors="replace"))
        else:
            st.text("Log not found.")

    # Start new session
    st.divider()
    if st.button("Start new session"):
        new_sid = str(uuid.uuid4())
        st.session_state.sid = new_sid
        st.session_state.state = new_state(new_sid)
        # Drop the stale Popen handle from the previous run, if any.
        if "proc" in st.session_state:
            del st.session_state.proc
        st.query_params["sid"] = new_sid
        save_state(new_sid, st.session_state.state)
        st.rerun()
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit==1.35.0
|
| 2 |
+
python-docx==1.1.2
|
| 3 |
+
openpyxl==3.1.5
|
| 4 |
+
xlrd==2.0.1
|
| 5 |
+
lxml==5.2.2
|
| 6 |
+
requests==2.32.3
|
scripts/__pycache__/cr_parser.cpython-310.pyc
ADDED
|
Binary file (11.9 kB). View file
|
|
|
scripts/__pycache__/docx_helpers.cpython-310.pyc
ADDED
|
Binary file (13.1 kB). View file
|
|
|
scripts/__pycache__/fetch_crs.cpython-310.pyc
ADDED
|
Binary file (12.3 kB). View file
|
|
|
scripts/__pycache__/finalize_ts.cpython-310.pyc
ADDED
|
Binary file (9.04 kB). View file
|
|
|
scripts/__pycache__/ts_applicator.cpython-310.pyc
ADDED
|
Binary file (18.1 kB). View file
|
|
|
scripts/cr_parser.py
ADDED
|
@@ -0,0 +1,490 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
cr_parser.py β Parse a CR DOCX's tracked changes into a JSON manifest.
|
| 4 |
+
|
| 5 |
+
Each entry in the manifest is one of:
|
| 6 |
+
{"type": "text_replace", "location": {...}, "old": "...", "new": "..."}
|
| 7 |
+
{"type": "para_insert", "location": {...}, "paragraphs": [...]}
|
| 8 |
+
{"type": "row_insert", "location": {...}, "cells": [...]}
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python3 cr_parser.py <cr.docx> [--output manifest.json]
|
| 12 |
+
# or import: from cr_parser import parse_cr
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import argparse
|
| 16 |
+
import json
|
| 17 |
+
import re
|
| 18 |
+
import sys
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
|
| 21 |
+
import docx
|
| 22 |
+
from docx.oxml.ns import qn
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ββ Low-level text helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
|
| 27 |
+
def _del_text(elem):
    """Concatenate all w:delText descendants."""
    fragments = (node.text or '' for node in elem.findall('.//' + qn('w:delText')))
    return ''.join(fragments)
|
| 30 |
+
|
| 31 |
+
def _ins_text(elem):
    """Concatenate all w:t descendants (inside w:ins)."""
    fragments = (node.text or '' for node in elem.findall('.//' + qn('w:t')))
    return ''.join(fragments)
|
| 34 |
+
|
| 35 |
+
def _para_new_text(p_elem):
    """Text of a paragraph after accepting tracked changes (ins included, del excluded)."""
    fragments = (node.text or '' for node in p_elem.findall('.//' + qn('w:t')))
    return ''.join(fragments)
|
| 38 |
+
|
| 39 |
+
def _para_orig_text(p_elem):
    """Text of a paragraph as it exists in the TS (del included, ins excluded)."""
    chunks = []
    ins_tag = qn('w:ins')
    for node in p_elem.iter():
        if not node.text:
            continue
        if node.tag == qn('w:delText'):
            chunks.append(node.text)
        elif node.tag == qn('w:t'):
            # Skip inserted text: any w:t living under a w:ins ancestor.
            inside_ins = any(anc.tag == ins_tag for anc in node.iterancestors())
            if not inside_ins:
                chunks.append(node.text)
    return ''.join(chunks)
|
| 50 |
+
|
| 51 |
+
def _style_val(p_elem):
    """Paragraph style id (w:pStyle val), or None when no style is set."""
    props = p_elem.find(qn('w:pPr'))
    style = props.find(qn('w:pStyle')) if props is not None else None
    return style.get(qn('w:val')) if style is not None else None
|
| 59 |
+
|
| 60 |
+
def _is_rpr_ins(ins_elem):
    """True if w:ins is inside w:rPr β a formatting change, not a content insertion."""
    parent = ins_elem.getparent()
    return parent is not None and parent.tag == qn('w:rPr')
|
| 64 |
+
|
| 65 |
+
def _is_inserted_para(p_elem):
    """True if this paragraph's paragraph-mark is tracked as inserted (whole new para)."""
    props = p_elem.find(qn('w:pPr'))
    run_props = props.find(qn('w:rPr')) if props is not None else None
    return run_props is not None and run_props.find(qn('w:ins')) is not None
|
| 74 |
+
|
| 75 |
+
def _is_deleted_para(p_elem):
    """True if this paragraph's paragraph-mark is tracked as deleted (whole para deleted)."""
    props = p_elem.find(qn('w:pPr'))
    run_props = props.find(qn('w:rPr')) if props is not None else None
    return run_props is not None and run_props.find(qn('w:del')) is not None
|
| 84 |
+
|
| 85 |
+
def _is_fully_deleted_tbl(tbl_elem):
    """True if every row in the table is tracked as a row-level deletion."""
    rows = tbl_elem.findall(qn('w:tr'))
    if not rows:
        return False

    def _row_deleted(tr):
        # Row-level deletion is marked by w:del inside the row properties.
        props = tr.find(qn('w:trPr'))
        return props is not None and props.find(qn('w:del')) is not None

    return all(_row_deleted(tr) for tr in rows)
|
| 95 |
+
|
| 96 |
+
def _is_fully_inserted_tbl(tbl_elem):
    """True if every row in the table is tracked as a row-level insertion."""
    rows = tbl_elem.findall(qn('w:tr'))
    if not rows:
        return False

    def _row_inserted(tr):
        # Row-level insertion is marked by w:ins inside the row properties.
        props = tr.find(qn('w:trPr'))
        return props is not None and props.find(qn('w:ins')) is not None

    return all(_row_inserted(tr) for tr in rows)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# ββ Table helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 109 |
+
|
| 110 |
+
def _table_header(tbl_elem):
    """First row cell texts β used as table identifier."""
    header_row = tbl_elem.find(qn('w:tr'))
    if header_row is None:
        return []
    texts = []
    for cell in header_row.findall(qn('w:tc')):
        para = cell.find('.//' + qn('w:p'))
        texts.append('' if para is None else _para_new_text(para).strip())
    return texts
|
| 120 |
+
|
| 121 |
+
def _row_col0(tr_elem):
    """Col-0 text of a table row β used as row anchor."""
    first_cell = tr_elem.find(qn('w:tc'))
    if first_cell is None:
        return ''
    para = first_cell.find('.//' + qn('w:p'))
    return '' if para is None else _para_new_text(para).strip()
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# ββ Inline del+ins extraction (from a single paragraph) ββββββββββββββββββββββ
|
| 131 |
+
|
| 132 |
+
def _extract_inline_replacements(p_elem):
    """
    Return list of (old_text, new_text) pairs from del+ins sibling pairs.
    Handles: del-then-ins, ins-then-del, multi-fragment consecutive dels.
    Filters: whitespace-only dels with no adjacent ins, empty dels, rPr ins.
    """
    del_tag, ins_tag = qn('w:del'), qn('w:ins')
    siblings = list(p_elem)
    consumed: set = set()
    replacements = []

    for idx, node in enumerate(siblings):
        if idx in consumed or node.tag != del_tag:
            continue

        removed = _del_text(node)
        if not removed:
            # Paragraph-mark / line-break deletion carries no text β ignore.
            continue

        # Fold directly following w:del siblings into one deletion.
        cursor = idx + 1
        while cursor < len(siblings) and siblings[cursor].tag == del_tag:
            removed += _del_text(siblings[cursor])
            consumed.add(cursor)
            cursor += 1

        following = siblings[cursor] if cursor < len(siblings) else None
        preceding = siblings[idx - 1] if idx > 0 else None

        # Pair the deletion with an adjacent content insertion, preferring
        # the one after the merged del run (del-then-ins), else the one
        # before it (ins-then-del). rPr-level w:ins are formatting only.
        inserted = None
        if following is not None and following.tag == ins_tag and not _is_rpr_ins(following):
            inserted = _ins_text(following)
            consumed.add(cursor)
        elif preceding is not None and preceding.tag == ins_tag and not _is_rpr_ins(preceding):
            inserted = _ins_text(preceding)

        if inserted is None:
            if not removed.strip():
                continue  # whitespace artefact with no counterpart
            # Pure deletion (no replacement) β record with empty new text.
            replacements.append((removed, ''))
        else:
            replacements.append((removed, inserted))

    return replacements
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# ββ Table change extraction βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 187 |
+
|
| 188 |
+
def _parse_table(tbl_elem, changes, section_heading=''):
    """Extract tracked changes from a table, appending change dicts to *changes*.

    Emits two change types:
      row_insert   — a row whose <w:trPr> carries a <w:ins> mark (whole new row),
                     anchored after the nearest preceding non-inserted row.
      text_replace — inline del+ins pairs found inside individual cells.

    Parameters
    ----------
    tbl_elem : lxml element for <w:tbl>
    changes : list to append change dicts to (mutated in place)
    section_heading : text of the nearest preceding stable heading, stored in
        each change's location for later re-anchoring in the TS.
    """
    header = _table_header(tbl_elem)
    header_key = header[:3]  # first 3 columns are enough to match the table in the TS
    rows = tbl_elem.findall(qn('w:tr'))

    for tr_idx, tr in enumerate(rows):
        trPr = tr.find(qn('w:trPr'))

        # ── Tracked row insertion ─────────────────────────────────────────
        if trPr is not None and trPr.find(qn('w:ins')) is not None:
            # Anchor on the nearest preceding row that is NOT itself inserted,
            # identified by its first-column text.
            after_anchor = ''
            for prev_idx in range(tr_idx - 1, -1, -1):
                prev_tr = rows[prev_idx]
                prev_trPr = prev_tr.find(qn('w:trPr'))
                if prev_trPr is None or prev_trPr.find(qn('w:ins')) is None:
                    after_anchor = _row_col0(prev_tr)
                    break

            cells = []
            for tc in tr.findall(qn('w:tc')):
                tcPr = tc.find(qn('w:tcPr'))

                # Cell width in twips (None when absent or unparsable).
                width = None
                if tcPr is not None:
                    tcW = tcPr.find(qn('w:tcW'))
                    if tcW is not None:
                        try:
                            width = int(tcW.get(qn('w:w'), 0))
                        except (ValueError, TypeError):
                            width = None

                # vMerge with no w:val attribute means "continuation" of a
                # vertically merged cell above.
                is_vmerge = False
                if tcPr is not None:
                    vm = tcPr.find(qn('w:vMerge'))
                    if vm is not None and vm.get(qn('w:val')) is None:
                        is_vmerge = True

                # Text — prefer ins-marked text, fall back to all text.
                cell_ins_text = _ins_text(tc)
                p = tc.find('.//' + qn('w:p'))
                # BUG FIX: `if p` used lxml element truthiness, which is False
                # for an element with no children, so empty <w:p> cells were
                # wrongly treated as missing. Test identity explicitly.
                cell_text = cell_ins_text if cell_ins_text else (_para_new_text(p) if p is not None else '')
                style = _style_val(p) if p is not None else None

                cells.append({
                    'text': cell_text.strip(),
                    'width': width,
                    'vmerge': is_vmerge,
                    'style': style,
                })

            changes.append({
                'type': 'row_insert',
                'location': {
                    'kind': 'table_row',
                    'table_header': header_key,
                    'after_row_anchor': after_anchor,
                    'section_heading': section_heading,
                },
                'cells': cells,
            })
            continue

        # ── Cell-level text_replace ───────────────────────────────────────
        row_anchor = _row_col0(tr)
        tcs = tr.findall(qn('w:tc'))
        for col_idx, tc in enumerate(tcs):
            for p in tc.findall('.//' + qn('w:p')):
                for old_text, new_text in _extract_inline_replacements(p):
                    if not old_text:
                        continue
                    changes.append({
                        'type': 'text_replace',
                        'location': {
                            'kind': 'table_cell',
                            'table_header': header_key,
                            'row_anchor': row_anchor,
                            'col_idx': col_idx,
                            'section_heading': section_heading,
                        },
                        'old': old_text,
                        'new': new_text,
                    })
| 273 |
+
|
| 274 |
+
|
| 275 |
+
# ββ Body paragraph extraction βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 276 |
+
|
| 277 |
+
def _parse_body(body, changes):
    """
    Walk direct children of w:body, emitting changes.

    Change types emitted:
    section_replace — a contiguous block of fully-deleted elements (para and/or
                      table, tracked at the paragraph-mark / row level) followed
                      immediately by a contiguous block of fully-inserted elements.
                      The raw XML of ALL those CR elements is stored verbatim so
                      the applicator can transplant them directly into the TS —
                      exactly what Word does on a copy-paste.
    text_replace    — an inline del+ins pair inside an otherwise-stable paragraph.
    para_insert     — one or more wholly-new paragraphs with no corresponding
                      deletion (rare; kept for backward compatibility).

    Implemented as a small state machine over the body's children:
    sec_state transitions stable → del → sep → ins → (flush) → stable.
    """
    from lxml import etree

    # Text of the most recent non-empty stable paragraph; used as the anchor
    # for section_replace / para_insert locations.
    prev_stable_text = ''

    # ── Section-replace accumulator ───────────────────────────────────────────
    sec_del = []          # fully-deleted elements (CR del block)
    sec_sep = []          # empty/separator paragraphs between del and ins blocks
    sec_ins = []          # fully-inserted elements (CR ins block)
    sec_state = 'stable'  # 'stable' | 'del' | 'sep' | 'ins'
    sec_anchor = ''       # stable text immediately before the del block began

    def flush_section():
        # Emit one section_replace for the accumulated del/sep/ins blocks,
        # then reset the accumulator. No-op (just resets) when nothing gathered.
        nonlocal sec_state, sec_anchor
        if not sec_del and not sec_ins:
            sec_del.clear(); sec_sep.clear(); sec_ins.clear()
            sec_state = 'stable'
            return
        # The del_heading is the text content of the first deleted paragraph
        # (skips deleted tables — only 'p' tags are considered).
        del_heading = ''
        for e in sec_del:
            tag = e.tag.split('}')[-1] if '}' in e.tag else e.tag
            if tag == 'p':
                t = _del_text(e).strip() or _para_orig_text(e).strip()
                if t:
                    del_heading = t
                    break
        # Serialize all elements for the manifest (del + sep + ins), in
        # document order, so the applicator can transplant them verbatim.
        all_elems = sec_del + sec_sep + sec_ins
        elements_xml = [etree.tostring(e, encoding='unicode') for e in all_elems]
        has_del_table = any(
            (e.tag.split('}')[-1] if '}' in e.tag else e.tag) == 'tbl'
            for e in sec_del
        )
        changes.append({
            'type': 'section_replace',
            'location': {
                'kind': 'body',
                'del_heading': del_heading,
                'has_del_table': has_del_table,
                'anchor_text': sec_anchor,
            },
            'elements_xml': elements_xml,
        })
        sec_del.clear(); sec_sep.clear(); sec_ins.clear()
        sec_state = 'stable'

    # ── Para-insert accumulator (for standalone new paragraphs) ───────────────
    insert_group = []

    def flush_group():
        # Emit one para_insert for the accumulated standalone-inserted
        # paragraphs (dropping paragraphs with neither text nor style).
        if not insert_group:
            return
        paras = [
            {'text': _para_new_text(p).strip(), 'style': _style_val(p)}
            for p in insert_group
        ]
        paras = [p for p in paras if p['text'] or p['style']]
        if paras:
            changes.append({
                'type': 'para_insert',
                'location': {
                    'kind': 'body',
                    'anchor_text': prev_stable_text,
                },
                'paragraphs': paras,
            })
        insert_group.clear()

    for elem in body:
        tag = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag

        if tag == 'p':
            is_del = _is_deleted_para(elem)
            is_ins = _is_inserted_para(elem)
            is_empty = not _para_orig_text(elem).strip() and not _para_new_text(elem).strip()

            if is_del:
                # Start or continue the del block
                if sec_state == 'ins':
                    flush_section()  # ins before del = two separate section_replaces
                if sec_state == 'stable':
                    flush_group()
                    sec_anchor = prev_stable_text
                sec_state = 'del'
                sec_del.append(elem)

            elif is_ins:
                if sec_state in ('del', 'sep'):
                    # ins block follows a del block → part of section_replace
                    sec_state = 'ins'
                    sec_ins.append(elem)
                elif sec_state == 'ins':
                    sec_ins.append(elem)
                else:
                    # Standalone ins paragraph (no preceding del block)
                    flush_group()  # (should already be empty)
                    insert_group.append(elem)

            elif is_empty:
                if sec_state == 'del':
                    # Separator between del and ins blocks
                    sec_state = 'sep'
                    sec_sep.append(elem)
                elif sec_state in ('sep', 'ins'):
                    # Empty paragraph inside/after the ins block travels with it.
                    sec_ins.append(elem)
                else:
                    # Empty para in stable region → ignore for anchoring
                    pass

            else:
                # Stable (or inline-changed) paragraph: close any open
                # accumulators, then mine it for inline del+ins pairs.
                flush_section()
                flush_group()

                for old_text, new_text in _extract_inline_replacements(elem):
                    if not old_text:
                        continue
                    changes.append({
                        'type': 'text_replace',
                        'location': {
                            'kind': 'body_para',
                            'para_context': _para_orig_text(elem).strip(),
                        },
                        'old': old_text,
                        'new': new_text,
                    })

                # Remember this text as the next anchor, unless it is an
                # ellipsis placeholder like "[...]".
                orig = _para_orig_text(elem).strip()
                if orig and not re.fullmatch(r'\[\.[\s\.]*\]', orig):
                    prev_stable_text = orig

        elif tag == 'tbl':
            if _is_fully_deleted_tbl(elem):
                if sec_state == 'ins':
                    flush_section()
                if sec_state == 'stable':
                    flush_group()
                    sec_anchor = prev_stable_text
                sec_state = 'del'
                sec_del.append(elem)

            elif _is_fully_inserted_tbl(elem):
                if sec_state in ('del', 'sep', 'ins'):
                    sec_state = 'ins'
                    sec_ins.append(elem)
                else:
                    # Standalone fully-inserted table (no del block) → treat as section_replace
                    flush_group()
                    sec_anchor = prev_stable_text
                    sec_state = 'ins'
                    sec_ins.append(elem)

            else:
                # Table with inline cell changes
                flush_section()
                flush_group()
                _parse_table(elem, changes, section_heading=prev_stable_text)

    # Close any accumulator still open at the end of the body.
    flush_section()
    flush_group()
| 452 |
+
|
| 453 |
+
|
| 454 |
+
# ββ Public API ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 455 |
+
|
| 456 |
+
def parse_cr(cr_path, output_json=None):
    """
    Parse all tracked changes in a CR DOCX.
    Returns list of change dicts. Optionally saves to JSON.
    """
    document = docx.Document(str(cr_path))
    changes = []
    _parse_body(document.element.body, changes)

    if output_json:
        payload = json.dumps(changes, indent=2, ensure_ascii=False)
        Path(output_json).write_text(payload, encoding='utf-8')
    return changes
| 471 |
+
|
| 472 |
+
|
| 473 |
+
# ββ CLI βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 474 |
+
|
| 475 |
+
def main():
    """CLI entry point: parse a CR DOCX and emit its change manifest."""
    parser = argparse.ArgumentParser(
        description='Parse CR DOCX tracked changes into JSON manifest.')
    parser.add_argument('cr_docx', help='CR DOCX file path')
    parser.add_argument('--output', default=None,
                        help='Output JSON path (default: print to stdout)')
    args = parser.parse_args()

    changes = parse_cr(args.cr_docx, output_json=args.output)

    # With --output, confirm the write; otherwise dump the manifest to stdout.
    if args.output:
        print(f'Wrote {len(changes)} change(s) β {args.output}')
    else:
        print(json.dumps(changes, indent=2, ensure_ascii=False))
| 487 |
+
|
| 488 |
+
|
| 489 |
+
# Script entry point: `python cr_parser.py <cr.docx> [--output manifest.json]`.
if __name__ == '__main__':
    main()
scripts/docx_helpers.py
ADDED
|
@@ -0,0 +1,494 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Reusable helpers for applying CR changes to TS DOCX files.
|
| 3 |
+
Supports both direct editing AND tracked changes (review mode).
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import copy
|
| 7 |
+
import difflib
|
| 8 |
+
import re
|
| 9 |
+
|
| 10 |
+
from docx.oxml.ns import qn
|
| 11 |
+
from docx.oxml import OxmlElement
|
| 12 |
+
|
| 13 |
+
AUTHOR = "CR Application"
|
| 14 |
+
DATE = "2026-03-24T00:00:00Z"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# ββ Revision ID counter βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 18 |
+
|
| 19 |
+
def _get_max_id(doc):
|
| 20 |
+
max_id = 0
|
| 21 |
+
for el in doc.element.body.iter():
|
| 22 |
+
for key, val in el.attrib.items():
|
| 23 |
+
if key.endswith('}id'):
|
| 24 |
+
try:
|
| 25 |
+
max_id = max(max_id, int(val))
|
| 26 |
+
except ValueError:
|
| 27 |
+
pass
|
| 28 |
+
return max_id
|
| 29 |
+
|
| 30 |
+
class RevCounter:
    """Generates unique revision IDs that don't clash with existing ones."""

    def __init__(self, doc):
        # Start one past the highest w:id already present in the body.
        self._n = _get_max_id(doc) + 1

    def next(self):
        """Return the next id as a string and advance the counter."""
        current = self._n
        self._n = current + 1
        return str(current)
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 42 |
+
|
| 43 |
+
def _make_t(text, tag='w:t'):
    """Build a <w:t> (or <w:delText>) element carrying *text*.

    When the text starts or ends with a space/tab, xml:space="preserve" is
    set so Word does not strip the significant edge whitespace.
    """
    element = OxmlElement(tag)
    element.text = text or ''
    needs_preserve = bool(text) and (text[0] in ' \t' or text[-1] in ' \t')
    if needs_preserve:
        element.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
    return element
| 49 |
+
|
| 50 |
+
def _make_run(text):
    """Wrap *text* in a plain (unformatted) <w:r> run element."""
    run = OxmlElement('w:r')
    run.append(_make_t(text))
    return run
| 54 |
+
|
| 55 |
+
def _make_para_el(text, style_val):
    """Build a <w:p> element with paragraph style *style_val* and one run of *text*."""
    para = OxmlElement('w:p')

    # Paragraph properties carry the style reference.
    props = OxmlElement('w:pPr')
    style_ref = OxmlElement('w:pStyle')
    style_ref.set(qn('w:val'), style_val)
    props.append(style_ref)

    para.append(props)
    para.append(_make_run(text))
    return para
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ββ Section mapping βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
+
|
| 68 |
+
def map_sections(doc, clause_numbers):
    """
    Print and return paragraphs belonging to the given clause numbers.

    A section starts at a heading paragraph whose text contains the clause
    number and ends at the next heading paragraph with text (any heading
    style). Returns dict: {clause: [(index, para), ...]}
    """
    results = {c: [] for c in clause_numbers}
    in_section = None

    for i, para in enumerate(doc.paragraphs):
        text = para.text.strip()
        style = para.style.name
        # Case-insensitive heading test, used consistently below.
        is_heading = 'heading' in style.lower()

        matched = False
        for clause in clause_numbers:
            if clause in text and is_heading:
                in_section = clause
                print(f'\n=== [{i}] SECTION {clause} | style={style!r} ===')
                print(f' [{i}] "{text}"')
                results[clause].append((i, para))
                matched = True
                break

        if not matched and in_section:
            # BUG FIX: the end-of-section check previously required
            # 'Heading' (capital H) while the start check was
            # case-insensitive, so lowercase heading styles never
            # terminated a section and following content leaked in.
            if is_heading and text:
                print(f' --- end at [{i}] ({style})')
                in_section = None
            elif text:
                print(f' [{i}] style={style!r:16s} | "{text[:90]}"')
                results[in_section].append((i, para))

    return results
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def get_bullet_style_val(doc, fallback='B1'):
    """Return the first 3GPP bullet style id ('B1', 'B2', ...) used in *doc*,
    or *fallback* when no paragraph uses one."""
    for para in doc.paragraphs:
        pPr = para._element.find(qn('w:pPr'))
        if pPr is None:
            continue
        pStyle = pPr.find(qn('w:pStyle'))
        if pStyle is None:
            continue
        val = pStyle.get(qn('w:val'), '')
        # Bullet styles look like 'B' followed by digits.
        if val.startswith('B') and val[1:].isdigit():
            return val
    return fallback
| 111 |
+
|
| 112 |
+
def get_style_val(para):
    """Return the paragraph's explicit style id, or 'Normal' when no
    <w:pStyle> element is present."""
    pPr = para._element.find(qn('w:pPr'))
    if pPr is None:
        return 'Normal'
    pStyle = pPr.find(qn('w:pStyle'))
    if pStyle is None:
        return 'Normal'
    return pStyle.get(qn('w:val'))
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 122 |
+
# DIRECT EDIT MODE (no track changes)
|
| 123 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 124 |
+
|
| 125 |
+
def delete_para(para):
    """Remove a paragraph from the document entirely (no tracking)."""
    element = para._element
    parent = element.getparent()
    parent.remove(element)
| 129 |
+
|
| 130 |
+
def insert_para_after(ref_para, text, style_val='Normal'):
    """Insert one paragraph directly after *ref_para*. Returns the new element."""
    para_el = _make_para_el(text, style_val)
    ref_para._element.addnext(para_el)
    return para_el
| 135 |
+
|
| 136 |
+
def insert_paras_after(ref_para, items, style_val='Normal'):
    """
    Insert multiple paragraphs in order after ref_para using a moving pointer.
    items: list of str, or list of (text, style_val) tuples.
    Returns the last inserted element.
    """
    anchor = ref_para._element
    for item in items:
        if isinstance(item, tuple):
            text, sv = item
        else:
            text, sv = item, style_val
        para_el = _make_para_el(text, sv)
        anchor.addnext(para_el)
        anchor = para_el  # advance so order is preserved
    return anchor
| 149 |
+
|
| 150 |
+
def modify_para_text(para, old_text, new_text):
    """Replace old_text with new_text in a paragraph (collapses all runs).

    Raises ValueError when old_text does not occur in the paragraph text.
    Returns the updated full paragraph text.
    """
    current = para.text
    if old_text not in current:
        raise ValueError(f"Not found: {old_text!r}\nIn: {current!r}")
    replaced = current.replace(old_text, new_text)

    # Drop every existing run, then re-emit the whole text as one plain run.
    p_el = para._element
    for run in p_el.findall(qn('w:r')):
        p_el.remove(run)
    p_el.append(_make_run(replaced))
    return replaced
| 161 |
+
|
| 162 |
+
|
| 163 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 164 |
+
# TRACKED CHANGE MODE (review / redline mode)
|
| 165 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 166 |
+
|
| 167 |
+
def _ins_attr(rev, author, date):
    """Attribute dict for a <w:ins> mark: fresh revision id, author, date."""
    return {qn('w:id'): rev.next(), qn('w:author'): author, qn('w:date'): date}
| 169 |
+
|
| 170 |
+
def _del_attr(rev, author, date):
    """Attribute dict for a <w:del> mark: fresh revision id, author, date."""
    return {qn('w:id'): rev.next(), qn('w:author'): author, qn('w:date'): date}
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def tracked_insert_para_after(ref_para_or_el, text, style_val, rev,
                              author=AUTHOR, date=DATE):
    """
    Insert a new paragraph after ref_para_or_el with tracked insertion marks.
    Word will show it as an insertion in review mode.
    Returns the new XML element (use as next ref for chained inserts).
    """
    def stamped_ins():
        # A <w:ins> element carrying a fresh revision id + author + date.
        mark = OxmlElement('w:ins')
        for key, val in _ins_attr(rev, author, date).items():
            mark.set(key, val)
        return mark

    new_p = OxmlElement('w:p')

    # Paragraph properties: mark the paragraph mark itself as inserted.
    pPr = OxmlElement('w:pPr')
    pStyle = OxmlElement('w:pStyle')
    pStyle.set(qn('w:val'), style_val)
    pPr.append(pStyle)
    rPr = OxmlElement('w:rPr')
    rPr.append(stamped_ins())
    pPr.append(rPr)
    new_p.append(pPr)

    # Content run wrapped in its own <w:ins>.
    content_ins = stamped_ins()
    content_ins.append(_make_run(text))
    new_p.append(content_ins)

    # Accept either a python-docx Paragraph or a raw lxml element.
    ref_el = getattr(ref_para_or_el, '_element', ref_para_or_el)
    ref_el.addnext(new_p)
    return new_p
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def tracked_insert_paras_after(ref_para, items, rev, author=AUTHOR, date=DATE):
    """
    Insert multiple paragraphs in order with tracked insertion marks.
    items: list of str, or list of (text, style_val) tuples.
    Uses a moving pointer so document order is preserved.
    Returns the last inserted element.
    """
    anchor = ref_para._element
    for item in items:
        if isinstance(item, tuple):
            text, style_val = item
        else:
            text, style_val = item, 'Normal'
        anchor = tracked_insert_para_after(anchor, text, style_val, rev, author, date)
    return anchor
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def tracked_delete_para(para, rev, author=AUTHOR, date=DATE):
    """
    Mark a paragraph as deleted using tracked change marks.
    The paragraph stays in the document but Word shows it as struck-through red.

    Two steps: (1) mark the paragraph mark itself as deleted via pPr/rPr/w:del,
    (2) wrap every run in <w:del> and convert its <w:t> to <w:delText>.
    NOTE(review): only runs that are direct children of the paragraph are
    wrapped — runs inside hyperlinks/fields would presumably be missed; confirm
    against the documents this is used on.
    """
    p_el = para._element

    # Mark the paragraph mark as deleted (in pPr > rPr), creating the
    # property containers if they do not exist yet.
    pPr = p_el.find(qn('w:pPr'))
    if pPr is None:
        pPr = OxmlElement('w:pPr')
        p_el.insert(0, pPr)
    rPr = pPr.find(qn('w:rPr'))
    if rPr is None:
        rPr = OxmlElement('w:rPr')
        pPr.append(rPr)
    del_mark = OxmlElement('w:del')
    for k, v in _del_attr(rev, author, date).items():
        del_mark.set(k, v)
    rPr.append(del_mark)

    # Wrap every run in <w:del> and change <w:t> to <w:delText>
    runs = list(p_el.findall(qn('w:r')))
    for r in runs:
        # Record the run's position BEFORE detaching it, so the <w:del>
        # wrapper can be re-inserted at exactly the same index.
        idx = list(p_el).index(r)
        for t_el in r.findall(qn('w:t')):
            del_t = _make_t(t_el.text, 'w:delText')
            r.remove(t_el)
            r.append(del_t)
        del_wrap = OxmlElement('w:del')
        for k, v in _del_attr(rev, author, date).items():
            del_wrap.set(k, v)
        p_el.remove(r)
        del_wrap.append(r)
        p_el.insert(idx, del_wrap)
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def tracked_modify_para(para, old_text, new_text, rev, author=AUTHOR, date=DATE):
    """
    Replace old_text with new_text using tracked del+ins marks.
    Splits the paragraph into: [before][<w:del>old</w:del>][<w:ins>new</w:ins>][after]
    Word shows the old text struck through and new text underlined.

    Raises ValueError when old_text does not occur in the paragraph.
    Only the first occurrence is replaced; existing run formatting is collapsed.
    """
    current = para.text
    if old_text not in current:
        raise ValueError(f"Not found: {old_text!r}\nIn: {current!r}")

    head, _, tail = current.partition(old_text)
    p_el = para._element

    # Collapse: drop all existing runs and rebuild the content from text.
    for run in p_el.findall(qn('w:r')):
        p_el.remove(run)

    if head:
        p_el.append(_make_run(head))

    # Old text, marked as a tracked deletion.
    del_el = OxmlElement('w:del')
    for key, val in _del_attr(rev, author, date).items():
        del_el.set(key, val)
    deleted_run = OxmlElement('w:r')
    deleted_run.append(_make_t(old_text, 'w:delText'))
    del_el.append(deleted_run)
    p_el.append(del_el)

    # New text, marked as a tracked insertion.
    ins_el = OxmlElement('w:ins')
    for key, val in _ins_attr(rev, author, date).items():
        ins_el.set(key, val)
    ins_el.append(_make_run(new_text))
    p_el.append(ins_el)

    if tail:
        p_el.append(_make_run(tail))
| 300 |
+
|
| 301 |
+
|
| 302 |
+
def _char_diff(old, new):
|
| 303 |
+
"""
|
| 304 |
+
Return a list of (op, text) tuples for a minimal character-level diff.
|
| 305 |
+
op is one of 'keep', 'del', 'ins'.
|
| 306 |
+
|
| 307 |
+
Strategy: first tokenize into digit-runs, letter-runs, and single separator
|
| 308 |
+
characters so that separators like '-' or '.' are kept intact as their own
|
| 309 |
+
tokens; then match tokens with SequenceMatcher; finally apply char-level diff
|
| 310 |
+
within each replaced token pair for maximum granularity.
|
| 311 |
+
|
| 312 |
+
Examples:
|
| 313 |
+
('V18.2.0', 'V18.3.0') β
|
| 314 |
+
[('keep','V18.'), ('del','2'), ('ins','3'), ('keep','.0')]
|
| 315 |
+
('(2024-11)', '(2026-04)') β
|
| 316 |
+
[('keep','(202'), ('del','4'), ('ins','6'), ('keep','-'),
|
| 317 |
+
('del','11'), ('ins','04'), ('keep',')')]
|
| 318 |
+
"""
|
| 319 |
+
old_tokens = re.findall(r'\d+|[A-Za-z]+|.', old)
|
| 320 |
+
new_tokens = re.findall(r'\d+|[A-Za-z]+|.', new)
|
| 321 |
+
|
| 322 |
+
ops = []
|
| 323 |
+
matcher = difflib.SequenceMatcher(None, old_tokens, new_tokens, autojunk=False)
|
| 324 |
+
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
|
| 325 |
+
old_span = ''.join(old_tokens[i1:i2])
|
| 326 |
+
new_span = ''.join(new_tokens[j1:j2])
|
| 327 |
+
if tag == 'equal':
|
| 328 |
+
ops.append(('keep', old_span))
|
| 329 |
+
elif tag == 'replace':
|
| 330 |
+
# Within each replaced token span, apply char-level diff for finer granularity
|
| 331 |
+
cmatcher = difflib.SequenceMatcher(None, old_span, new_span, autojunk=False)
|
| 332 |
+
for ctag, ci1, ci2, cj1, cj2 in cmatcher.get_opcodes():
|
| 333 |
+
if ctag == 'equal':
|
| 334 |
+
ops.append(('keep', old_span[ci1:ci2]))
|
| 335 |
+
elif ctag == 'replace':
|
| 336 |
+
ops.append(('del', old_span[ci1:ci2]))
|
| 337 |
+
ops.append(('ins', new_span[cj1:cj2]))
|
| 338 |
+
elif ctag == 'delete':
|
| 339 |
+
ops.append(('del', old_span[ci1:ci2]))
|
| 340 |
+
elif ctag == 'insert':
|
| 341 |
+
ops.append(('ins', new_span[cj1:cj2]))
|
| 342 |
+
elif tag == 'delete':
|
| 343 |
+
ops.append(('del', old_span))
|
| 344 |
+
elif tag == 'insert':
|
| 345 |
+
ops.append(('ins', new_span))
|
| 346 |
+
return ops
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
def tracked_modify_para_multi(para, replacements, rev, author=AUTHOR, date=DATE):
    """
    Apply multiple tracked del+ins replacements in a single paragraph pass.
    replacements: list of (old_text, new_text) tuples, applied in order of appearance.
    Each replacement uses character-level diff so only the minimally changed characters
    are marked as del/ins, with common characters kept as plain runs in between.
    Use this instead of calling tracked_modify_para twice (which would corrupt the XML).

    NOTE(review): validation checks each old_text against the full original
    text, but the application loop searches only the text remaining AFTER the
    previous replacement — an old_text occurring before an earlier match is
    silently skipped (the `continue` below); confirm this is intended.
    """
    full = para.text
    # Validate all targets up front so nothing is half-applied on failure.
    for old_text, _ in replacements:
        if old_text not in full:
            raise ValueError(f"Not found: {old_text!r}\nIn: {full!r}")

    p_el = para._element

    # Remove all existing runs (collapses any prior run formatting).
    for r in p_el.findall(qn('w:r')):
        p_el.remove(r)

    # Walk through the full text, emitting plain runs and char-level del+ins ops
    remaining = full
    for old_text, new_text in replacements:
        idx = remaining.find(old_text)
        if idx == -1:
            continue
        before = remaining[:idx]
        remaining = remaining[idx + len(old_text):]

        if before:
            p_el.append(_make_run(before))

        # Emit the minimal char-level diff for this replacement.
        for op, text in _char_diff(old_text, new_text):
            if op == 'keep':
                p_el.append(_make_run(text))
            elif op == 'del':
                del_el = OxmlElement('w:del')
                for k, v in _del_attr(rev, author, date).items():
                    del_el.set(k, v)
                r_del = OxmlElement('w:r')
                r_del.append(_make_t(text, 'w:delText'))
                del_el.append(r_del)
                p_el.append(del_el)
            elif op == 'ins':
                ins_el = OxmlElement('w:ins')
                for k, v in _ins_attr(rev, author, date).items():
                    ins_el.set(k, v)
                ins_el.append(_make_run(text))
                p_el.append(ins_el)

    # Emit any trailing text
    if remaining:
        p_el.append(_make_run(remaining))
| 401 |
+
|
| 402 |
+
|
| 403 |
+
def tracked_insert_table_row(tbl, cell_texts, rev, author=AUTHOR, date=DATE):
    """
    Insert a new row immediately after the last non-empty row in tbl, as a
    tracked insertion (Word "Track Changes" markup).

    Empty pre-allocated rows at the table bottom are skipped so the new
    content appears directly under the previous entry.

    The new row is deep-copied from the last content row so that ALL
    formatting (cell widths, borders, shading, paragraph style, run
    font/size) is inherited -- exactly as clicking "Insert Row Below"
    does in Word.

    Args:
        tbl: python-docx Table object to extend.
        cell_texts: list of strings, one per column; columns beyond
            len(cell_texts) are left empty.
        rev: revision id recorded on every w:ins element.
        author: tracked-change author name recorded in the XML.
        date: timestamp recorded in the XML.
    """
    tbl_el = tbl._tbl
    all_trs = tbl_el.findall(qn('w:tr'))

    # Find the last row that contains at least one non-empty <w:t> node.
    # This skips pre-allocated blank rows at the table bottom.
    last_content_tr = all_trs[-1]
    for tr in reversed(all_trs):
        if any(t.text and t.text.strip() for t in tr.findall('.//' + qn('w:t'))):
            last_content_tr = tr
            break

    # Deep-copy the last content row -- inherits all cell/paragraph/run formatting.
    new_tr = copy.deepcopy(last_content_tr)

    # Mark the row itself as a tracked insertion in <w:trPr>.
    trPr = new_tr.find(qn('w:trPr'))
    if trPr is None:
        trPr = OxmlElement('w:trPr')
        # trPr must be the first child of the w:tr element.
        new_tr.insert(0, trPr)
    # Drop any w:ins copied over from the template row before adding ours.
    for child in list(trPr):
        if child.tag == qn('w:ins'):
            trPr.remove(child)
    tr_ins = OxmlElement('w:ins')
    for k, v in _ins_attr(rev, author, date).items():
        tr_ins.set(k, v)
    trPr.append(tr_ins)

    # For each cell: extract the existing run's rPr, clear text content, insert new text.
    cells_in_new_tr = new_tr.findall(qn('w:tc'))
    for i, tc in enumerate(cells_in_new_tr):
        p = tc.find('.//' + qn('w:p'))
        if p is None:
            # Cell with no paragraph at all -- nothing to write into.
            continue

        # Capture the first run's rPr (font size, bold, etc.) before clearing.
        first_run_rpr = None
        for r in list(p.iter(qn('w:r'))):
            rpr = r.find(qn('w:rPr'))
            if rpr is not None:
                first_run_rpr = copy.deepcopy(rpr)
                break

        # Remove all non-pPr children (runs, ins, del, hyperlinks, etc.)
        for child in list(p):
            if child.tag != qn('w:pPr'):
                p.remove(child)

        # Ensure pPr exists with a paragraph-mark ins tracking element.
        pPr = p.find(qn('w:pPr'))
        if pPr is None:
            pPr = OxmlElement('w:pPr')
            p.insert(0, pPr)
        rPr = pPr.find(qn('w:rPr'))
        if rPr is None:
            rPr = OxmlElement('w:rPr')
            pPr.append(rPr)
        # Replace any inherited paragraph-mark w:ins with our own revision info.
        for child in list(rPr):
            if child.tag == qn('w:ins'):
                rPr.remove(child)
        p_ins_mark = OxmlElement('w:ins')
        for k, v in _ins_attr(rev, author, date).items():
            p_ins_mark.set(k, v)
        rPr.append(p_ins_mark)

        # Build new run, re-using the inherited rPr so font size / style matches.
        r_new = OxmlElement('w:r')
        if first_run_rpr is not None:
            r_new.append(first_run_rpr)
        text = cell_texts[i] if i < len(cell_texts) else ''
        r_new.append(_make_t(text))

        # Wrap the run in a tracked-insertion element.
        ins_el = OxmlElement('w:ins')
        for k, v in _ins_attr(rev, author, date).items():
            ins_el.set(k, v)
        ins_el.append(r_new)
        p.append(ins_el)

    last_content_tr.addnext(new_tr)
|
scripts/fetch_crs.py
ADDED
|
@@ -0,0 +1,487 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
fetch_crs.py β Download CRs and TSs from a 3GPP/ETSI Excel contribution list.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python3 fetch_crs.py <excel_path> <person_name> [--output-dir DIR]
|
| 7 |
+
|
| 8 |
+
Steps:
|
| 9 |
+
1. Parse Excel, filter Accepted CRs by person name
|
| 10 |
+
2. Download CR DOCXs via docfinder /find/tdoc/download
|
| 11 |
+
3. Parse CR cover pages to extract target TS spec + version
|
| 12 |
+
4. Download TS DOCXs via docfinder /find/docx
|
| 13 |
+
5. Print summary report
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import argparse
|
| 17 |
+
import os
|
| 18 |
+
import re
|
| 19 |
+
import sys
|
| 20 |
+
import time
|
| 21 |
+
import zipfile
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
|
| 24 |
+
import requests
|
| 25 |
+
|
| 26 |
+
BASE_URL = "https://organizedprogrammers-docfinder.hf.space"
|
| 27 |
+
_proxy = os.environ.get("http_proxy") or None
|
| 28 |
+
PROXIES = {"http": _proxy, "https": os.environ.get("https_proxy") or None}
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
# Path helpers
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
def wsl_path(p: str) -> str:
    """Translate a Windows-style path (e.g. C:\\dir) into its WSL mount
    equivalent (/mnt/c/dir); any other path is returned unchanged."""
    stripped = p.strip()
    looks_windows = (
        len(stripped) >= 2 and stripped[0].isalpha() and stripped[1] == ":"
    )
    if not looks_windows:
        return stripped
    letter = stripped[0].lower()
    tail = stripped[2:].replace("\\", "/")
    return f"/mnt/{letter}{tail}"
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ---------------------------------------------------------------------------
|
| 46 |
+
# Step 1 β Parse Excel
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
|
| 49 |
+
def parse_excel(excel_path: str, person_name: str):
    """
    Return list of (uid, title) for Accepted CRs matching person_name.
    Handles both .xls and .xlsx.
    """
    path = Path(wsl_path(excel_path))
    suffix = path.suffix.lower()

    # Dispatch on extension instead of an if/elif chain.
    handler = {".xls": _parse_xls, ".xlsx": _parse_xlsx}.get(suffix)
    if handler is None:
        raise ValueError(f"Unsupported file extension: {suffix!r}. Expected .xls or .xlsx")
    return handler(path, person_name)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _name_pattern(name: str) -> re.Pattern:
|
| 66 |
+
return re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _parse_xls(path: Path, person_name: str):
    """
    Parse a legacy .xls contribution list.

    Returns [(uid, title), ...] for rows with Type == "CR",
    Status == "Accepted" and person_name appearing (whole-word,
    case-insensitive) in the SubmittedBy column.

    Raises ValueError when a required column is absent.
    """
    try:
        import xlrd
    except ImportError:
        sys.exit("ERROR: xlrd is not installed. Run: pip install xlrd")

    wb = xlrd.open_workbook(str(path))
    # Try "Contributions" sheet first, fall back to first sheet
    try:
        ws = wb.sheet_by_name("Contributions")
    except xlrd.XLRDError:
        ws = wb.sheet_by_index(0)

    # Row 0 is headers; row 1 is an empty duplicate -- skip it
    headers = [str(ws.cell_value(0, c)).strip() for c in range(ws.ncols)]
    col = {h: i for i, h in enumerate(headers)}

    # BUG FIX: the previous `col.get("Uid") or col.get("UID") ...` chain
    # treated a column at index 0 as missing (0 is falsy). Use membership
    # tests so index 0 is handled correctly.
    def first_col(*names):
        return next((col[n] for n in names if n in col), None)

    uid_col = first_col("Uid", "UID", "uid")
    type_col = first_col("Type", "type")
    status_col = first_col("Status", "status")
    by_col = first_col("SubmittedBy", "Submitted By", "submittedby")
    title_col = first_col("Title", "title")

    for name, c in [("Uid", uid_col), ("Type", type_col),
                    ("Status", status_col), ("SubmittedBy", by_col)]:
        if c is None:
            raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")

    pattern = _name_pattern(person_name)
    results = []

    for r in range(2, ws.nrows):  # skip header + empty duplicate row
        uid = str(ws.cell_value(r, uid_col)).strip()
        doc_type = str(ws.cell_value(r, type_col)).strip()
        status = str(ws.cell_value(r, status_col)).strip()
        submitted_by = str(ws.cell_value(r, by_col)).strip()
        title = str(ws.cell_value(r, title_col)).strip() if title_col is not None else ""

        if doc_type != "CR":
            continue
        if status != "Accepted":
            continue
        if not pattern.search(submitted_by):
            continue

        results.append((uid, title))

    return results
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _parse_xlsx(path: Path, person_name: str):
    """
    Parse a modern .xlsx contribution list.

    Returns [(uid, title), ...] for rows with Type == "CR",
    Status == "Accepted" and person_name appearing (whole-word,
    case-insensitive) in the SubmittedBy column.

    Raises ValueError when a required column is absent.
    """
    try:
        import openpyxl
    except ImportError:
        sys.exit("ERROR: openpyxl is not installed. Run: pip install openpyxl")

    wb = openpyxl.load_workbook(str(path), read_only=True, data_only=True)
    ws = wb["Contributions"] if "Contributions" in wb.sheetnames else wb.active

    rows = iter(ws.iter_rows(values_only=True))

    # Row 0: headers
    header_row = next(rows)
    headers = [str(h).strip() if h is not None else "" for h in header_row]
    col = {h: i for i, h in enumerate(headers)}

    # Row 1: empty duplicate -- skip
    next(rows, None)

    # BUG FIX: the previous `col.get("Uid") or col.get("UID") ...` chain
    # treated a column at index 0 as missing (0 is falsy). Use membership
    # tests so index 0 is handled correctly.
    def first_col(*names):
        return next((col[n] for n in names if n in col), None)

    uid_col = first_col("Uid", "UID", "uid")
    type_col = first_col("Type", "type")
    status_col = first_col("Status", "status")
    by_col = first_col("SubmittedBy", "Submitted By", "submittedby")
    title_col = first_col("Title", "title")

    for name, c in [("Uid", uid_col), ("Type", type_col),
                    ("Status", status_col), ("SubmittedBy", by_col)]:
        if c is None:
            raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")

    pattern = _name_pattern(person_name)
    results = []

    for row in rows:
        def cell(c):
            # Rows may be shorter than the header row in read_only mode.
            v = row[c] if c < len(row) else None
            return str(v).strip() if v is not None else ""

        uid = cell(uid_col)
        doc_type = cell(type_col)
        status = cell(status_col)
        submitted_by = cell(by_col)
        title = cell(title_col) if title_col is not None else ""

        if not uid:
            continue
        if doc_type != "CR":
            continue
        if status != "Accepted":
            continue
        if not pattern.search(submitted_by):
            continue

        results.append((uid, title))

    return results
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# ---------------------------------------------------------------------------
|
| 178 |
+
# Step 2 β Download CR DOCXs
|
| 179 |
+
# ---------------------------------------------------------------------------
|
| 180 |
+
|
| 181 |
+
def download_cr(uid: str, cr_dir: Path):
    """
    Fetch the CR document for *uid* into *cr_dir* via the docfinder
    /find/tdoc/download endpoint.

    Returns:
        (docx_path, note) on success -- docx_path is the file to parse
        and note a short human-readable status for the summary.
        (None, error_msg) on failure.
    """
    dest = cr_dir / f"{uid}.docx"
    if dest.exists():
        return dest, "already existed"

    try:
        resp = requests.post(
            f"{BASE_URL}/find/tdoc/download",
            json={"doc_id": uid},
            proxies=PROXIES,
            timeout=60,
        )
    except requests.RequestException as e:
        return None, f"network error: {e}"

    if not resp.ok:
        return None, f"HTTP {resp.status_code}"

    payload = resp.content
    if not payload:
        return None, "empty response"

    dest.write_bytes(payload)

    # Some UIDs come back as a ZIP archive that wraps the real DOCX;
    # detect the ZIP magic number and unwrap the first .docx entry.
    if payload.startswith(b"PK\x03\x04"):
        try:
            with zipfile.ZipFile(dest) as zf:
                inner_docx = [n for n in zf.namelist() if n.endswith(".docx")]
                if inner_docx:
                    extracted_path = cr_dir / f"{uid}_extracted.docx"
                    with zf.open(inner_docx[0]) as src, open(extracted_path, "wb") as dst:
                        dst.write(src.read())
                    return extracted_path, "extracted from ZIP"
        except zipfile.BadZipFile:
            # Magic bytes lied (a plain DOCX is itself PK-framed) -- keep raw file.
            pass

    return dest, "downloaded"
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
# ---------------------------------------------------------------------------
|
| 232 |
+
# Step 3 β Parse CR Cover Pages
|
| 233 |
+
# ---------------------------------------------------------------------------
|
| 234 |
+
|
| 235 |
+
SPEC_PATTERN = re.compile(r"^\d{3}\s\d{3}$")
|
| 236 |
+
VERSION_PATTERN = re.compile(r"^\d+\.\d+\.\d+$")
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def parse_cr_cover(docx_path: Path):
    """
    Scan the CR cover table (tables[0]) for the target spec number and
    its current version.

    Returns:
        (spec_number, version) e.g. ("102 221", "18.3.0"), or
        (None, None) when either value cannot be located.
    """
    try:
        from docx import Document
    except ImportError:
        sys.exit("ERROR: python-docx is not installed. Run: pip install python-docx")

    try:
        doc = Document(str(docx_path))
    except Exception:
        return None, None

    if not doc.tables:
        return None, None

    cover = doc.tables[0]

    # Flatten every non-empty cell, in reading order.
    texts = []
    for row in cover.rows:
        for cell in row.cells:
            stripped = cell.text.strip()
            if stripped:
                texts.append(stripped)

    spec_number = None
    version = None

    for i, text in enumerate(texts):
        # First cell of form "NNN NNN" is the spec number.
        if spec_number is None and SPEC_PATTERN.match(text):
            spec_number = text

        nxt = texts[i + 1] if i + 1 < len(texts) else None

        # Version sits in the cell right after the "Current version:" label.
        if text == "Current version:" and nxt is not None and VERSION_PATTERN.match(nxt):
            version = nxt

        # Tolerate the label without a trailing colon as well.
        if text in ("Current version:", "Current version") and version is None:
            if nxt is not None and VERSION_PATTERN.match(nxt):
                version = nxt

    return spec_number, version
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
# ---------------------------------------------------------------------------
|
| 292 |
+
# Step 4 β Download TS DOCXs
|
| 293 |
+
# ---------------------------------------------------------------------------
|
| 294 |
+
|
| 295 |
+
def _is_html(resp: requests.Response) -> bool:
    """True when the response body looks like an HTML page rather than a
    DOCX binary (e.g. the HF Space cold-start splash screen)."""
    if "text/html" in resp.headers.get("content-type", ""):
        return True
    head = resp.content[:5].lower()
    return head in (b"<!doc", b"<html")
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
def download_ts(spec_number: str, version: str, ts_dir: Path,
                max_retries: int = 3, retry_delay: int = 10):
    """
    Download the TS DOCX for spec_number (e.g. "102 221") and version
    (e.g. "18.3.0") into ts_dir as ts_<spec>_v<version>.docx.

    Retries up to max_retries times when the HF Space returns an HTML
    loading page instead of the DOCX binary (happens on cold-start /
    brief restarts), sleeping retry_delay seconds between attempts.

    Returns:
        (filename, note) on success, or (None, error_msg) on failure.
    """
    spec_no_space = spec_number.replace(" ", "")
    filename = f"ts_{spec_no_space}_v{version}.docx"
    dest = ts_dir / filename

    # Cached from a previous run -- nothing to download.
    if dest.exists():
        return filename, "already existed"

    last_error = "no attempts made"
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.post(
                f"{BASE_URL}/find/docx",
                json={"doc_id": spec_number, "version": version},
                proxies=PROXIES,
                timeout=120,
            )
        except requests.RequestException as e:
            # Hard network failure: do not retry, the Space is unreachable.
            return None, f"network error: {e}"

        if not resp.ok:
            return None, f"HTTP {resp.status_code}"

        content = resp.content
        if not content:
            return None, "empty response"

        # Detect HTML splash page (HF Space cold-start) -- retry after a delay
        if _is_html(resp):
            last_error = f"got HTML instead of DOCX (attempt {attempt}/{max_retries})"
            if attempt < max_retries:
                print(f"\n [retry in {retry_delay}s β HF Space loadingβ¦]", flush=True)
                time.sleep(retry_delay)
                continue
            return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r}) after {max_retries} attempts"

        # Good binary response
        dest.write_bytes(content)

        # DOCX is a ZIP container; anything without the PK magic is garbage.
        if content[:2] != b"PK":
            dest.unlink()
            return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r})"

        # Verify the TS contains the expected spec number in its first paragraph
        try:
            import docx as _docx
            _doc = _docx.Document(dest)
            first_para = _doc.paragraphs[0].text if _doc.paragraphs else ''
            if spec_no_space not in first_para.replace(' ', ''):
                # API returned some other spec -- discard it and report.
                dest.unlink()
                return None, f"wrong TS returned by API: got {first_para[:80]!r} (expected spec {spec_no_space})"
        except Exception:
            pass  # Trust the ZIP check above

        note = "downloaded" if attempt == 1 else f"downloaded (after {attempt} attempts)"
        return filename, note

    # All attempts consumed by the HTML-retry path above.
    return None, last_error
|
| 370 |
+
|
| 371 |
+
|
| 372 |
+
# ---------------------------------------------------------------------------
|
| 373 |
+
# Main
|
| 374 |
+
# ---------------------------------------------------------------------------
|
| 375 |
+
|
| 376 |
+
def main():
    """CLI entry point: parse args, filter the Excel contribution list,
    download the matching CRs, parse their cover pages, download the
    target TSs, and print a summary report."""
    parser = argparse.ArgumentParser(
        description="Download CRs and TSs from a 3GPP/ETSI Excel contribution list."
    )
    parser.add_argument("excel_path", help="Path to .xls or .xlsx contribution list")
    parser.add_argument("person_name", help="Name to search for in SubmittedBy column")
    parser.add_argument(
        "--output-dir",
        default=str(Path.home() / "CR_Processing"),
        help="Base output directory (default: ~/CR_Processing)",
    )
    args = parser.parse_args()

    excel_path = wsl_path(args.excel_path)
    person_name = args.person_name
    output_dir = Path(wsl_path(args.output_dir)).expanduser()

    cr_dir = output_dir / "CRs"
    ts_dir = output_dir / "TS"
    cr_dir.mkdir(parents=True, exist_ok=True)
    ts_dir.mkdir(parents=True, exist_ok=True)

    # --- Step 1: Parse Excel ---
    print(f"Parsing Excel: {excel_path}")
    print(f"Filtering for: {person_name!r} | Type=CR | Status=Accepted\n")

    try:
        cr_list = parse_excel(excel_path, person_name)
    except Exception as e:
        sys.exit(f"ERROR parsing Excel: {e}")

    print(f"Found {len(cr_list)} matching CR(s).\n")

    if not cr_list:
        print("Nothing to download.")
        return

    # --- Step 2: Download CR DOCXs ---
    print("Downloading CRs...")
    cr_results = []  # list of (uid, docx_path_or_None, note)

    for uid, title in cr_list:
        print(f" [{uid}] ", end="", flush=True)
        docx_path, note = download_cr(uid, cr_dir)
        cr_results.append((uid, docx_path, note))
        if docx_path:
            print(f"OK ({note}) β {docx_path.name}")
        else:
            print(f"FAILED β {note}")

    print()

    # --- Step 3: Parse cover pages ---
    print("Parsing CR cover pages...")
    ts_targets = {}  # (spec_number, version) -> list of uids

    for uid, docx_path, note in cr_results:
        if docx_path is None:
            continue
        spec_number, version = parse_cr_cover(docx_path)
        if spec_number and version:
            key = (spec_number, version)
            ts_targets.setdefault(key, []).append(uid)
            print(f" [{uid}] β TS {spec_number} v{version}")
        else:
            print(f" [{uid}] WARNING: could not parse cover page (spec/version not found)")

    print()

    # --- Step 4: Download TSs ---
    print("Downloading TSs...")
    ts_results = []  # list of (spec_number, version, filename_or_None, note)

    for (spec_number, version), uids in ts_targets.items():
        print(f" [TS {spec_number} v{version}] ", end="", flush=True)
        filename, note = download_ts(spec_number, version, ts_dir)
        ts_results.append((spec_number, version, filename, note))
        if filename:
            # BUG FIX: previously printed the literal "(unknown)" here
            # instead of the downloaded TS filename.
            print(f"OK ({note}) β {filename}")
        else:
            print(f"FAILED β {note}")

    print()

    # --- Step 5: Summary ---
    print("=" * 50)
    print("=== fetch-crs summary ===")
    print(f"Person: {person_name}")
    print(f"Excel:  {excel_path}")
    print(f"CRs found: {len(cr_list)} (Accepted, Type=CR)")
    print()
    print("CRs downloaded:")
    for uid, docx_path, note in cr_results:
        if docx_path:
            print(f"  β {docx_path.name} [{note}]")
        else:
            print(f"  β {uid} β {note}")

    print()
    print("TSs downloaded:")
    for spec_number, version, filename, note in ts_results:
        if filename:
            # BUG FIX: previously printed the literal "(unknown)" here
            # instead of the downloaded TS filename.
            print(f"  β {filename} [{note}]")
        else:
            print(f"  β ts_{spec_number.replace(' ', '')} v{version} β {note}")

    print()
    print(f"Output: {output_dir}/")
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
if __name__ == "__main__":
|
| 487 |
+
main()
|
scripts/finalize_ts.py
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
finalize_ts.py β Add tracked-change metadata updates to a TS DOCX after CR application.
|
| 4 |
+
|
| 5 |
+
Three edits are made (all as tracked changes):
|
| 6 |
+
1. New row in the Change History table (second-to-last table, Annex V)
|
| 7 |
+
2. New row in the History table (last table, last page)
|
| 8 |
+
3. Version + date update in the first paragraph (title)
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
python3 finalize_ts.py <ts_docx> <cr_docx> [--author "Name"] [--output <path>]
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import argparse
|
| 15 |
+
import re
|
| 16 |
+
import sys
|
| 17 |
+
from datetime import date, timedelta
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
|
| 20 |
+
import docx
|
| 21 |
+
|
| 22 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 23 |
+
from docx_helpers import (
|
| 24 |
+
RevCounter,
|
| 25 |
+
tracked_insert_table_row,
|
| 26 |
+
tracked_modify_para_multi,
|
| 27 |
+
AUTHOR,
|
| 28 |
+
DATE,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ββ Path helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
+
|
| 34 |
+
def to_wsl_path(p: str) -> str:
    """Convert a Windows path (e.g. C:\\dir\\file) to its WSL equivalent
    (/mnt/c/dir/file); other paths are returned unchanged.

    Generalized to accept any drive letter -- the previous version only
    recognized C: and D:, which silently broke for files on other drives.
    This also makes the helper consistent with fetch_crs.wsl_path.
    """
    if len(p) >= 3 and p[0].isalpha() and p[1] == ':' and p[2] == '\\':
        drive = p[0].lower()
        rest = p[2:].replace('\\', '/')
        return f'/mnt/{drive}{rest}'
    return p
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# ββ Date / version helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 44 |
+
|
| 45 |
+
def compute_pub_date():
    """
    Return (yyyy-mm, "Month YYYY") for the publication month.

    5-day rule: when five or fewer days remain before the first of next
    month, publish under next month; otherwise under the current month.
    """
    today = date.today()
    next_month_start = (today.replace(day=1) + timedelta(days=32)).replace(day=1)
    if (next_month_start - today).days <= 5:
        target = next_month_start
    else:
        target = today.replace(day=1)
    return target.strftime('%Y-%m'), target.strftime('%B %Y')
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def derive_new_version(v: str) -> str:
    """Bump X.Y.Z to X.(Y+1).0 -- minor increment, patch reset."""
    parts = v.split('.')
    bumped = [parts[0], str(int(parts[1]) + 1), '0'] + parts[3:]
    return '.'.join(bumped)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# ββ CR metadata extraction ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 67 |
+
|
| 68 |
+
def extract_cr_metadata(cr_docx_path: str) -> dict:
    """
    Open the CR DOCX and read metadata from tables[0] (cover page table).
    Returns dict with keys:
        meeting_id, uid, cr_num, rev, cat, title, current_version

    meeting_id is normalised to 'BODY-NUMBER' (e.g. 'SET-121') when both
    parts parse; otherwise the raw meeting cell text is kept.  Any field
    that cannot be located stays an empty string.

    Raises:
        ValueError: if the CR document contains no tables at all.
    """
    doc = docx.Document(cr_docx_path)
    if not doc.tables:
        raise ValueError('CR has no tables β cannot extract metadata')

    tbl = doc.tables[0]

    # Collect all cell texts for scanning (row-major order; merged cells in
    # python-docx are repeated per grid slot, which is harmless here).
    cells = []
    for row in tbl.rows:
        for cell in row.cells:
            cells.append(cell.text.strip())

    # Defaults: every field is an empty string until found.
    meta = {
        'meeting_id': '',
        'uid': '',
        'cr_num': '',
        'rev': '',
        'cat': '',
        'title': '',
        'current_version': '',
    }

    # --- Meeting ID ---
    # Find cell containing "Meeting #" and parse e.g. "ETSI TC SET Meeting #121, Edinburgh..."
    meeting_text = ''
    for c in cells:
        if 'Meeting #' in c or 'Meeting#' in c:
            meeting_text = c
            break

    if meeting_text:
        # Body: word before "Meeting" (e.g. "SET")
        body_match = re.search(r'(\w+)\s+Meeting\s*#', meeting_text)
        body = body_match.group(1) if body_match else ''
        # Number: digits after "#"
        num_match = re.search(r'Meeting\s*#\s*(\d+)', meeting_text)
        number = num_match.group(1) if num_match else ''
        # Fall back to the raw cell text when either part is missing.
        meta['meeting_id'] = f'{body}-{number}' if body and number else meeting_text

    # --- UID ---
    # Pattern like SET(26)000019r1 or similar; first match anywhere wins.
    uid_pat = re.compile(r'[A-Z]+\(\d+\)\d+\S*')
    for c in cells:
        m = uid_pat.search(c)
        if m:
            meta['uid'] = m.group(0)
            break

    # --- Label-value scanning ---
    # Scan pairs: if a cell matches a label, the next non-empty cell is the value
    label_map = {
        'CR': 'cr_num',
        'Rev': 'rev',
        'Curr. vers': 'current_version',
        'Current version': 'current_version',
        'Cat': 'cat',
        'Category': 'cat',
    }
    title_next = False
    for i, c in enumerate(cells):
        stripped = c.strip().rstrip(':')

        # Title may span its own cell or be labelled; the first non-empty
        # cell after a 'Title' label is taken as the title value.
        if stripped.lower() in ('title', 'title of change'):
            title_next = True
            continue
        if title_next:
            if c.strip():
                meta['title'] = c.strip()
                title_next = False
            continue

        # NOTE(review): startswith() means a label like 'CR' also matches
        # cells starting with 'CR...'; later matches overwrite earlier ones.
        for label, key in label_map.items():
            if stripped == label or stripped.startswith(label):
                # Value is in the next non-empty cell (look ahead at most 3).
                for j in range(i + 1, min(i + 4, len(cells))):
                    val = cells[j].strip()
                    if val:
                        meta[key] = val
                        break
                break

    return meta
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# ββ Meeting ID format detection βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 160 |
+
|
| 161 |
+
def _detect_meeting_separator(tbl):
|
| 162 |
+
"""
|
| 163 |
+
Scan the meeting column (col index 1) of the Change History table bottom-up.
|
| 164 |
+
Find the last non-empty cell and detect the separator between body letters and
|
| 165 |
+
number, e.g. '#' in 'SET#115' or '-' in 'SET-119'.
|
| 166 |
+
Returns the detected separator character, defaulting to '#'.
|
| 167 |
+
"""
|
| 168 |
+
for row in reversed(tbl.rows):
|
| 169 |
+
cells = row.cells
|
| 170 |
+
if len(cells) > 1:
|
| 171 |
+
text = cells[1].text.strip()
|
| 172 |
+
if text:
|
| 173 |
+
m = re.search(r'[A-Za-z]([^A-Za-z0-9])\d', text)
|
| 174 |
+
if m:
|
| 175 |
+
return m.group(1)
|
| 176 |
+
return '#'
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
# ββ TS table locators βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 180 |
+
|
| 181 |
+
def find_change_history_table(ts_doc):
    """Locate the Change History table (second-to-last table of the TS).

    Validates that its final row has 8 or 9 columns.

    Raises:
        ValueError: if the document has fewer than two tables, or the
            candidate table's column count is neither 8 nor 9.
    """
    all_tables = ts_doc.tables
    if len(all_tables) < 2:
        raise ValueError('TS has fewer than 2 tables')
    candidate = all_tables[-2]
    width = len(candidate.rows[-1].cells)
    if width in (8, 9):
        return candidate
    raise ValueError(
        f'Change History table has {width} columns, expected 8 or 9'
    )
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def find_history_table(ts_doc):
    """Locate the History table (the very last table of the TS).

    Raises:
        ValueError: unless the final row has exactly 3 columns.
    """
    candidate = ts_doc.tables[-1]
    width = len(candidate.rows[-1].cells)
    if width != 3:
        raise ValueError(
            f'History table has {width} columns, expected 3'
        )
    return candidate
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
# ββ Update functions ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 207 |
+
|
| 208 |
+
def update_change_history_table(ts_doc, meta, pub_yyyy_mm, old_v, new_v, rev, author, date_str):
    """Insert one tracked-change row into the Change History table.

    Args:
        ts_doc: open python-docx Document of the TS.
        meta: dict produced by extract_cr_metadata (meeting_id, uid, cr_num,
            rev, cat, title, ...).
        pub_yyyy_mm: publication date string 'YYYY-MM'.
        old_v / new_v: previous and new spec version strings.
        rev: revision counter passed through to tracked_insert_table_row.
        author / date_str: tracked-change attribution.

    Returns:
        The list of cell texts inserted; its layout depends on the table's
        column count (9-column ETSI standard, two 8-column variants, or a
        truncated fallback).
    """
    tbl = find_change_history_table(ts_doc)
    ncols = len(tbl.rows[-1].cells)

    # Detect separator used in existing rows (e.g. '#' in 'SET#115', '-' in 'SET-119')
    # and reformat meeting_id accordingly so it matches the existing style.
    sep = _detect_meeting_separator(tbl)
    meeting_id = meta['meeting_id']  # always 'BODY-NUMBER' from extract_cr_metadata
    if sep != '-' and '-' in meeting_id:
        body, number = meeting_id.split('-', 1)
        meeting_id = f'{body}{sep}{number}'

    if ncols == 9:
        # Standard ETSI format: date | meeting | uid | cr | rev | cat | title | old_v | new_v
        cell_texts = [
            pub_yyyy_mm, meeting_id, meta['uid'],
            meta['cr_num'], meta['rev'], meta['cat'],
            meta['title'], old_v, new_v,
        ]
    elif ncols == 8:
        # Detect which 8-column variant this is by the first column header.
        first_header = tbl.rows[0].cells[0].text.strip() if tbl.rows else ''
        if re.search(r'[Dd]ate', first_header):
            # Date | meeting | uid | cr | rev | cat | title | new_v (no old_v)
            cell_texts = [
                pub_yyyy_mm, meeting_id, meta['uid'],
                meta['cr_num'], meta['rev'], meta['cat'],
                meta['title'], new_v,
            ]
        else:
            # meeting | uid | wg_doc | cr | rev | cat | title | new_v (no date, no old_v)
            cell_texts = [
                meeting_id, meta['uid'], '',
                meta['cr_num'], meta['rev'], meta['cat'],
                meta['title'], new_v,
            ]
    else:
        # Defensive fallback: truncate the 9-column row to fit.
        # NOTE(review): effectively unreachable, since find_change_history_table
        # already rejects tables that are not 8 or 9 columns wide.
        cell_texts = ([pub_yyyy_mm, meeting_id, meta['uid'],
                       meta['cr_num'], meta['rev'], meta['cat'],
                       meta['title'], old_v, new_v])[:ncols]

    tracked_insert_table_row(tbl, cell_texts, rev, author, date_str)
    return cell_texts
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
def update_history_table(ts_doc, new_v, pub_month_year, rev, author, date_str):
    """Append a tracked 'Publication' row to the History table (last page).

    Returns the list of cell texts inserted: version, month/year, kind.
    """
    history = find_history_table(ts_doc)
    row_values = [f'V{new_v}', pub_month_year, 'Publication']
    tracked_insert_table_row(history, row_values, rev, author, date_str)
    return row_values
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def update_title_para(ts_doc, old_v, new_v, old_date_str, new_date_str, rev, author, date_str):
    """Rewrite the cover-page title paragraph with tracked changes.

    Replaces V<old_v> with V<new_v> and (<old_date_str>) with
    (<new_date_str>) in the document's first paragraph, applying both
    substitutions in a single tracked multi-replace pass.
    """
    title_para = ts_doc.paragraphs[0]
    substitutions = [
        (f'V{old_v}', f'V{new_v}'),
        (f'({old_date_str})', f'({new_date_str})'),
    ]
    tracked_modify_para_multi(title_para, substitutions, rev, author, date_str)
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
# ββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 274 |
+
|
| 275 |
+
def main():
    """CLI entry point for finalize_ts.

    Reads CR metadata from the CR DOCX, then adds tracked-change metadata
    updates to the TS DOCX — a Change History row, a History row, and the
    title-paragraph version/date bump — and saves the result.
    """
    parser = argparse.ArgumentParser(
        description='Add tracked-change metadata updates to a TS DOCX after CR application.'
    )
    parser.add_argument('ts_docx', help='TS DOCX file to update')
    parser.add_argument('cr_docx', help='CR DOCX file to read metadata from')
    parser.add_argument('--author', default=AUTHOR, help='Tracked change author name')
    parser.add_argument('--output', default=None, help='Output path (default: <ts>_finalized.docx)')
    args = parser.parse_args()

    # Accept Windows paths from the command line and map them into WSL.
    ts_path = to_wsl_path(args.ts_docx)
    cr_path = to_wsl_path(args.cr_docx)

    # Determine output path
    if args.output:
        out_path = to_wsl_path(args.output)
    else:
        p = Path(ts_path)
        out_path = str(p.parent / (p.stem + '_finalized.docx'))

    print(f'TS: {ts_path}')
    print(f'CR: {cr_path}')
    print(f'Output: {out_path}')
    print()

    # Open documents
    ts_doc = docx.Document(ts_path)
    # NOTE(review): cr_doc is opened but never used here; extract_cr_metadata
    # re-opens the CR itself. Kept as an early validity check of the file.
    cr_doc = docx.Document(cr_path)

    # Extract metadata
    print('Extracting CR metadata...')
    meta = extract_cr_metadata(cr_path)
    print(f" Meeting ID: {meta['meeting_id']}")
    print(f" UID: {meta['uid']}")
    print(f" CR#: {meta['cr_num']}")
    print(f" Rev: {meta['rev']}")
    print(f" Category: {meta['cat']}")
    print(f" Title: {meta['title']}")
    print(f" Current version: {meta['current_version']}")
    print()

    # Compute derived values (publication month via 5-day rule, bumped version)
    pub_ym, pub_month_year = compute_pub_date()
    old_v = meta['current_version']
    new_v = derive_new_version(old_v)
    print(f'Old version: {old_v} β New version: {new_v}')
    print(f'Publication: {pub_month_year} ({pub_ym})')
    print()

    # Extract old date from first paragraph; title update is skipped if absent.
    title_text = ts_doc.paragraphs[0].text
    date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
    if not date_match:
        print(f'WARNING: Could not find date pattern (YYYY-MM) in first paragraph:')
        print(f' {title_text!r}')
        old_date_str = ''
    else:
        old_date_str = date_match.group(1)
        print(f'Title paragraph: {title_text!r}')
        print(f'Old date: {old_date_str} β New date: {pub_ym}')
    print()

    # Set up revision counter and tracked change date
    rev = RevCounter(ts_doc)
    tc_date = DATE  # ISO 8601 from docx_helpers

    # Apply changes
    print('Inserting row in Change History table (Annex V)...')
    ch_cells = update_change_history_table(ts_doc, meta, pub_ym, old_v, new_v, rev, args.author, tc_date)
    print(f' Row: {ch_cells}')

    print('Inserting row in History table (last page)...')
    h_cells = update_history_table(ts_doc, new_v, pub_month_year, rev, args.author, tc_date)
    print(f' Row: {h_cells}')

    if old_date_str:
        print('Updating title paragraph...')
        update_title_para(ts_doc, old_v, new_v, old_date_str, pub_ym, rev, args.author, tc_date)
        print(f' V{old_v} β V{new_v}, ({old_date_str}) β ({pub_ym})')
    else:
        print('Skipping title paragraph update (no date found).')

    # Save
    ts_doc.save(out_path)
    print()
    print(f'Saved: {out_path}')
    print()
    print('Summary of tracked changes:')
    print(f' [Change History] New row: {ch_cells}')
    print(f' [History] New row: {h_cells}')
    if old_date_str:
        print(f' [Title] V{old_v} β V{new_v}, ({old_date_str}) β ({pub_ym})')
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
if __name__ == '__main__':
|
| 370 |
+
main()
|
scripts/map_sections.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Diagnostic: print paragraphs of target clauses from a TS DOCX with indices and styles.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python3 map_sections.py <ts_path.docx> "11.1.22.3.2" "14.5.6"
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import sys
|
| 10 |
+
from docx import Document
|
| 11 |
+
|
| 12 |
+
def main():
    """Print the paragraphs of the requested clauses of a TS DOCX, with
    paragraph indices and style names, to help locate where CR changes land.

    Fix: the end-of-section check previously used a case-sensitive
    ``'Heading' in style`` test while the section-start check was
    case-insensitive, so sections that should end at a lower-cased heading
    style kept printing; both checks are now case-insensitive.
    """
    if len(sys.argv) < 3:
        print("Usage: map_sections.py <ts.docx> <clause1> [clause2 ...]")
        sys.exit(1)

    ts_path = sys.argv[1]
    clauses = sys.argv[2:]

    doc = Document(ts_path)
    in_section = None  # clause currently being dumped, or None

    for i, para in enumerate(doc.paragraphs):
        text = para.text.strip()
        style = para.style.name

        matched = False
        for clause in clauses:
            # Case-insensitive heading match ('Heading 3', 'heading 3', ...).
            if clause in text and 'heading' in style.lower():
                in_section = clause
                print(f'\n=== [{i}] SECTION {clause} | style={style!r} ===')
                print(f' [{i}] style={style!r:16s} | "{text}"')
                matched = True
                break

        if not matched and in_section:
            # Any other non-empty heading terminates the current section.
            if 'heading' in style.lower() and text:
                print(f' --- section ends at [{i}] style={style!r}: "{text[:60]}"')
                in_section = None
            elif text:
                print(f' [{i}] style={style!r:16s} | "{text[:100]}"')
|
| 42 |
+
|
| 43 |
+
if __name__ == '__main__':
|
| 44 |
+
main()
|
scripts/orchestrate_cr.py
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
orchestrate_cr.py β Fully automated CR application pipeline.
|
| 4 |
+
|
| 5 |
+
Reads an Excel contribution list, downloads all Accepted CRs and their target
|
| 6 |
+
TSs, parses tracked changes from each CR, applies them to the TS, and
|
| 7 |
+
finalises the document metadata β all without any per-CR manual scripting.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python3 orchestrate_cr.py <excel_path> [person_name] [--output-dir DIR] [--author NAME]
|
| 11 |
+
|
| 12 |
+
Arguments:
|
| 13 |
+
excel_path Path to .xls or .xlsx contribution list (Windows paths OK)
|
| 14 |
+
person_name Name to match in SubmittedBy column (default: "Ly Thanh PHAN")
|
| 15 |
+
|
| 16 |
+
Options:
|
| 17 |
+
--output-dir Base output folder (default: ~/CR_Processing)
|
| 18 |
+
--author Tracked-change author name (default: "CR Application")
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import argparse
|
| 22 |
+
import contextlib
|
| 23 |
+
import datetime
|
| 24 |
+
import io
|
| 25 |
+
import re
|
| 26 |
+
import sys
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
|
| 29 |
+
import docx as docx_lib
|
| 30 |
+
|
| 31 |
+
# ββ sys.path setup ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 32 |
+
SCRIPT_DIR = Path(__file__).parent
|
| 33 |
+
FETCH_SCRIPTS = SCRIPT_DIR.parent.parent / 'fetch-crs' / 'scripts'
|
| 34 |
+
sys.path.insert(0, str(SCRIPT_DIR))
|
| 35 |
+
sys.path.insert(0, str(FETCH_SCRIPTS))
|
| 36 |
+
|
| 37 |
+
from fetch_crs import parse_excel, download_cr, parse_cr_cover, download_ts, wsl_path
|
| 38 |
+
from cr_parser import parse_cr
|
| 39 |
+
from ts_applicator import apply_manifest
|
| 40 |
+
from finalize_ts import (
|
| 41 |
+
extract_cr_metadata,
|
| 42 |
+
compute_pub_date,
|
| 43 |
+
derive_new_version,
|
| 44 |
+
update_change_history_table,
|
| 45 |
+
update_history_table,
|
| 46 |
+
update_title_para,
|
| 47 |
+
)
|
| 48 |
+
from docx_helpers import RevCounter, AUTHOR as DEFAULT_AUTHOR, DATE as DEFAULT_DATE
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# ββ Display / logging helpers βββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 52 |
+
|
| 53 |
+
def _section(title):
|
| 54 |
+
bar = '=' * 60
|
| 55 |
+
print(f'\n{bar}')
|
| 56 |
+
print(f' {title}')
|
| 57 |
+
print(bar)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class _TeeWriter:
|
| 61 |
+
"""Writes to both real stdout and a StringIO buffer simultaneously."""
|
| 62 |
+
def __init__(self, real, buf):
|
| 63 |
+
self._real = real
|
| 64 |
+
self._buf = buf
|
| 65 |
+
|
| 66 |
+
def write(self, s):
|
| 67 |
+
self._real.write(s)
|
| 68 |
+
self._buf.write(s)
|
| 69 |
+
|
| 70 |
+
def flush(self):
|
| 71 |
+
self._real.flush()
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# ββ Main ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 75 |
+
|
| 76 |
+
def main():
    """Orchestrate the full CR-application pipeline.

    Steps: (1) parse the Excel contribution list for the given person's
    Accepted CRs, (2) download each CR DOCX, (3) parse each CR cover page to
    group CRs by target TS/version, (4) download each TS, (5) parse and apply
    all tracked changes per TS, (6) finalise the TS metadata (Change History
    rows, History row, title paragraph), then print a summary report.
    Per-TS console output is tee'd to a .log file next to the output DOCX.
    """
    ap = argparse.ArgumentParser(
        description='Fully automated CR application pipeline.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument('excel_path', help='Path to .xls or .xlsx contribution list')
    ap.add_argument(
        'person_name',
        nargs='?',
        default='Ly Thanh PHAN',
        help='Name to match in SubmittedBy column (default: "Ly Thanh PHAN")',
    )
    ap.add_argument(
        '--output-dir',
        default=str(Path.home() / 'CR_Processing'),
        help='Base output directory (default: ~/CR_Processing)',
    )
    ap.add_argument(
        '--author',
        default=DEFAULT_AUTHOR,
        help=f'Tracked change author name (default: "{DEFAULT_AUTHOR}")',
    )
    args = ap.parse_args()

    excel_path = wsl_path(args.excel_path)
    output_dir = Path(wsl_path(args.output_dir)).expanduser()
    cr_dir = output_dir / 'CRs'
    ts_dir = output_dir / 'TS'  # spec subfolders created per-TS below
    cr_dir.mkdir(parents=True, exist_ok=True)
    ts_dir.mkdir(parents=True, exist_ok=True)

    author = args.author
    tc_date = DEFAULT_DATE

    # Step 1: Parse Excel contribution list
    _section('Step 1 β Parsing Excel')
    print(f'Excel: {excel_path}')
    print(f'Person: {args.person_name!r}')

    try:
        cr_list = parse_excel(excel_path, args.person_name)
    except Exception as e:
        sys.exit(f'ERROR parsing Excel: {e}')

    print(f'Found {len(cr_list)} Accepted CR(s):')
    for uid, title in cr_list:
        print(f' {uid}: {title[:80]}')

    if not cr_list:
        print('Nothing to process.')
        return

    # Step 2: Download each CR DOCX; failures are reported but non-fatal.
    _section('Step 2 β Downloading CR DOCXs')
    cr_paths = {}  # uid -> Path

    for uid, _ in cr_list:
        print(f' [{uid}] ', end='', flush=True)
        docx_path, note = download_cr(uid, cr_dir)
        if docx_path:
            cr_paths[uid] = docx_path
            print(f'OK ({note}) β {docx_path.name}')
        else:
            print(f'FAILED β {note}')

    # Step 3: Parse cover pages β group CRs by (spec_number, version)
    _section('Step 3 β Parsing CR cover pages')
    ts_groups = {}  # (spec_number, version) -> [uid, ...]
    uid_cover_failed = []

    for uid in cr_paths:
        spec_number, version = parse_cr_cover(cr_paths[uid])
        if spec_number and version:
            key = (spec_number, version)
            ts_groups.setdefault(key, []).append(uid)
            print(f' [{uid}] -> TS {spec_number} v{version}')
        else:
            uid_cover_failed.append(uid)
            print(f' [{uid}] WARNING: could not parse cover page β skipping')

    if not ts_groups:
        print('\nNo TSs identified. Nothing to apply.')
        return

    # Step 4: Download each target TS into a per-spec subfolder
    _section('Step 4 β Downloading TSs')
    ts_paths = {}  # (spec_number, version) -> Path
    spec_dirs = {}  # (spec_number, version) -> Path (per-spec subfolder)

    for (spec_number, version) in ts_groups:
        spec_compact = spec_number.replace(' ', '')
        spec_dir = ts_dir / spec_compact
        spec_dir.mkdir(parents=True, exist_ok=True)
        spec_dirs[(spec_number, version)] = spec_dir

        print(f' [TS {spec_number} v{version}] ', end='', flush=True)
        filename, note = download_ts(spec_number, version, spec_dir)
        if filename:
            ts_paths[(spec_number, version)] = spec_dir / filename
            print(f'OK ({note}) β {spec_compact}/(unknown)')
        else:
            print(f'FAILED β {note}')

    # Steps 5 & 6: Apply all CRs to each TS, then finalise its metadata
    _section('Steps 5 & 6 β Applying CRs and Finalising Metadata')
    report = []  # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)

    for (spec_number, version), uids in ts_groups.items():
        ts_key = f'TS {spec_number} v{version}'
        spec_compact = spec_number.replace(' ', '')
        spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
        spec_dir.mkdir(parents=True, exist_ok=True)

        # Derive new version early so filenames are known upfront
        new_v = derive_new_version(version)
        stem = f'ts_{spec_compact}_v{new_v}_was_v{version}'
        ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
        ts_final = spec_dir / f'{stem}.docx'
        log_path = spec_dir / f'{stem}.log'
        errors = []

        print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')

        if (spec_number, version) not in ts_paths:
            msg = 'TS download failed β skipping'
            print(f' SKIP: {msg}')
            report.append((ts_key, 0, 0, len(uids), None, log_path, [msg]))
            continue

        ts_in = ts_paths[(spec_number, version)]

        # All per-TS output is captured to log_buf (tee: stdout + file)
        log_buf = io.StringIO()
        tee = _TeeWriter(sys.stdout, log_buf)

        with contextlib.redirect_stdout(tee):
            log_header = (
                f'Pipeline Log\n'
                f'TS: {spec_number} v{version} -> v{new_v}\n'
                f'CRs: {", ".join(uids)}\n'
                f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
                f'{"=" * 60}\n'
            )
            print(log_header, end='')

            # 5a. Parse all CR manifests and combine
            combined_manifest = []
            participating_uids = []

            for uid in uids:
                if uid not in cr_paths:
                    errors.append(f'[{uid}] CR download had failed β skipped')
                    continue
                print(f' Parsing {uid}... ', end='', flush=True)
                try:
                    changes = parse_cr(cr_paths[uid])
                    combined_manifest.extend(changes)
                    participating_uids.append(uid)
                    print(f'{len(changes)} change(s)')
                except Exception as e:
                    errors.append(f'[{uid}] parse ERROR: {e}')
                    print(f'ERROR: {e}')

            if not combined_manifest:
                print(' No changes parsed β skipping apply step.')
                report.append((ts_key, 0, 0, len(uids), None, log_path,
                               errors + ['No changes parsed']))
                log_path.write_text(log_buf.getvalue(), encoding='utf-8')
                continue

            # 5b. Apply manifest to TS
            print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
            try:
                n_ok, n_skip, log_lines = apply_manifest(
                    ts_in, combined_manifest, ts_applied, author=author, date=tc_date
                )
            except Exception as e:
                errors.append(f'apply_manifest ERROR: {e}')
                print(f' ERROR: {e}')
                report.append((ts_key, 0, 0, len(uids), None, log_path, errors))
                log_path.write_text(log_buf.getvalue(), encoding='utf-8')
                continue

            for line in log_lines:
                print(f' {line}')
            print(f' -> Applied: {n_ok} Skipped: {n_skip}')

            # 6. Finalise metadata (Change History, History, title paragraph)
            print(' Finalising metadata...')
            try:
                ts_doc = docx_lib.Document(str(ts_applied))
                rev = RevCounter(ts_doc)

                pub_ym, pub_month_year = compute_pub_date()
                old_v = version

                # Extract old date string from first paragraph
                title_text = ts_doc.paragraphs[0].text
                date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
                old_date_str = date_match.group(1) if date_match else ''

                print(f' Version: {old_v} -> {new_v}')
                print(f' Publication: {pub_month_year} ({pub_ym})')

                # One Change History row per CR
                for uid in participating_uids:
                    try:
                        meta = extract_cr_metadata(str(cr_paths[uid]))
                        ch_cells = update_change_history_table(
                            ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
                        )
                        print(f' [Change History] {uid}: {ch_cells}')
                    except Exception as e:
                        errors.append(f'[{uid}] Change History ERROR: {e}')
                        print(f' [Change History] {uid}: ERROR β {e}')

                # One History row for the whole TS
                try:
                    h_cells = update_history_table(
                        ts_doc, new_v, pub_month_year, rev, author, tc_date
                    )
                    print(f' [History] {h_cells}')
                except Exception as e:
                    errors.append(f'History table ERROR: {e}')
                    print(f' [History] ERROR β {e}')

                # Title paragraph version + date
                if old_date_str:
                    try:
                        update_title_para(
                            ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
                        )
                        print(f' [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
                    except Exception as e:
                        errors.append(f'Title update ERROR: {e}')
                        print(f' [Title] ERROR β {e}')
                else:
                    print(f' [Title] SKIP β no (YYYY-MM) pattern in: {title_text!r}')

                ts_doc.save(str(ts_final))
                print(f' Saved: {spec_compact}/{ts_final.name}')
                print(f' Log: {spec_compact}/{log_path.name}')
                report.append((ts_key, n_ok, n_skip, len(uids), ts_final, log_path, errors))

            except Exception as e:
                errors.append(f'Finalisation ERROR: {e}')
                print(f' Finalisation ERROR: {e}')
                report.append((ts_key, n_ok, n_skip, len(uids), ts_applied, log_path, errors))

        # Write log file after the tee context exits
        log_path.write_text(log_buf.getvalue(), encoding='utf-8')

    # Final Report: classify each TS as fully OK / with warnings / failed
    _section('Final Report')
    n_success = sum(1 for r in report if r[4] is not None and not r[6])
    n_partial = sum(1 for r in report if r[4] is not None and r[6])
    n_failed = sum(1 for r in report if r[4] is None)

    print(f'Person: {args.person_name}')
    print(f'Excel: {excel_path}')
    print(f'CRs found: {len(cr_list)}')
    print(f'TSs updated: {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
    print()

    for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
        if out_path and not errors:
            status = 'OK'
        elif out_path:
            status = 'WARN'
        else:
            status = 'FAIL'
        print(f' [{status}] {ts_key}')
        print(f' CRs: {n_crs} | Body changes applied: {n_ok} | Skipped: {n_skip}')
        if out_path:
            print(f' Output: {out_path.parent.name}/{out_path.name}')
        if log_path and log_path.exists():
            print(f' Log: {log_path.parent.name}/{log_path.name}')
        for err in errors:
            print(f' ! {err}')

    print()
    print(f'Output directory: {output_dir}/')
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
if __name__ == '__main__':
|
| 361 |
+
main()
|
scripts/ts_applicator.py
ADDED
|
@@ -0,0 +1,633 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
ts_applicator.py β Apply a CR change manifest to a TS DOCX as tracked changes.
|
| 4 |
+
|
| 5 |
+
Reads a JSON manifest produced by cr_parser.py and applies every change
|
| 6 |
+
to the target TS using docx_helpers tracked-change primitives.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python3 ts_applicator.py <ts.docx> <manifest.json> [--author NAME] [--output path]
|
| 10 |
+
# or import: from ts_applicator import apply_manifest
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import argparse
|
| 14 |
+
import json
|
| 15 |
+
import re
|
| 16 |
+
import sys
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
import docx
|
| 20 |
+
from docx.oxml import OxmlElement
|
| 21 |
+
from docx.oxml.ns import qn
|
| 22 |
+
|
| 23 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 24 |
+
from docx_helpers import (
|
| 25 |
+
RevCounter,
|
| 26 |
+
tracked_modify_para,
|
| 27 |
+
tracked_insert_paras_after,
|
| 28 |
+
AUTHOR as DEFAULT_AUTHOR,
|
| 29 |
+
DATE as DEFAULT_DATE,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ββ Text normalisation ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 34 |
+
|
| 35 |
+
def _norm(text):
|
| 36 |
+
"""Normalise non-breaking spaces and common Unicode dashes for comparison."""
|
| 37 |
+
return (text
|
| 38 |
+
.replace('\xa0', ' ')
|
| 39 |
+
.replace('\u2013', '-')
|
| 40 |
+
.replace('\u2014', '-')
|
| 41 |
+
.strip())
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _norm_ws(text):
|
| 45 |
+
"""
|
| 46 |
+
Strip all whitespace for structural matching.
|
| 47 |
+
|
| 48 |
+
ETSI TS files store structured paragraphs (references, abbreviations,
|
| 49 |
+
headings) with a TAB between the code and the body text, e.g.:
|
| 50 |
+
'[27]\\tGlobalPlatform: ...'
|
| 51 |
+
'CLT\\tContactLess Tunnelling'
|
| 52 |
+
'8.3\\tRAM implementation over HTTPS'
|
| 53 |
+
|
| 54 |
+
The CR's text extraction concatenates runs directly, losing the tab:
|
| 55 |
+
'[27]GlobalPlatform: ...'
|
| 56 |
+
'CLTContactLess Tunnelling'
|
| 57 |
+
'8.3RAM implementation over HTTPS'
|
| 58 |
+
|
| 59 |
+
Removing all whitespace from both sides before comparing solves this.
|
| 60 |
+
Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
|
| 61 |
+
"""
|
| 62 |
+
base = (text
|
| 63 |
+
.replace('\xa0', '')
|
| 64 |
+
.replace('\u2013', '-')
|
| 65 |
+
.replace('\u2014', '-'))
|
| 66 |
+
return re.sub(r'\s+', '', base)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# ββ Document search helpers βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 70 |
+
|
| 71 |
+
def _full_para_text(para):
    """All text content including w:t (normal/inserted) and w:delText (deleted runs)."""
    el = para._element
    pieces = [t.text or '' for t in el.findall('.//' + qn('w:t'))]
    pieces.extend(t.text or '' for t in el.findall('.//' + qn('w:delText')))
    return ''.join(pieces)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _find_para(doc, search_text, prefer_not_in_table=False):
    """
    Find the first paragraph containing search_text.
    Four levels of matching, in order of confidence:
      1.0 — exact substring match
      0.9 — NBSP/dash-normalised match (_norm)
      0.8 — whitespace-stripped match (_norm_ws) handles tab vs nothing in
            structured paragraphs (refs '[27]\\t...', abbrevs 'CLT\\t...', headings '8.3\\t...')
      0.6 — full XML text (including w:del content): handles anchors that were
            previously deleted by tracked_modify_para in an earlier apply step
    Returns (para, confidence) or (None, 0.0).
    """
    norm_search = _norm(search_text)
    ws_search = _norm_ws(search_text)
    # Each paragraph lands in at most one pool (elif chain), ordered by confidence.
    candidates_exact = []
    candidates_norm = []
    candidates_ws = []
    candidates_del = []

    for para in doc.paragraphs:
        pt = para.text
        if search_text in pt:
            candidates_exact.append(para)
        elif norm_search and norm_search in _norm(pt):
            candidates_norm.append(para)
        elif ws_search and ws_search in _norm_ws(pt):
            candidates_ws.append(para)
        else:
            # Level 4: check full XML text (catches deleted-but-still-present paragraphs)
            full_pt = _full_para_text(para)
            if search_text in full_pt:
                candidates_del.append(para)
            elif ws_search and ws_search in _norm_ws(full_pt):
                candidates_del.append(para)

    def _in_table(para):
        # True when any XML ancestor is a table cell (w:tc).
        p = para._element
        return any(a.tag == qn('w:tc') for a in p.iterancestors())

    # Highest-confidence non-empty pool wins; within it, optionally prefer
    # paragraphs that are NOT inside a table.
    for pool, conf in [(candidates_exact, 1.0), (candidates_norm, 0.9),
                       (candidates_ws, 0.8), (candidates_del, 0.6)]:
        if not pool:
            continue
        if prefer_not_in_table:
            body_only = [p for p in pool if not _in_table(p)]
            if body_only:
                return body_only[0], conf
        return pool[0], conf

    return None, 0.0
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _find_table_by_section(doc, section_heading):
    """
    Find the table immediately following a paragraph that contains section_heading.
    Checks both w:t (plain/inserted) and w:delText (tracked-deleted) so the match
    survives even after the heading was wrapped in a tracked deletion.
    Empty paragraphs between the heading and the table are tolerated.
    Returns (table, confidence) or (None, 0.0).
    """
    if not section_heading:
        return None, 0.0
    norm_h = _norm(section_heading)
    ws_h = _norm_ws(section_heading)
    heading_seen = False
    # Walk the document body in order; a <w:tbl> encountered while
    # heading_seen is still True is the table directly below the heading.
    for element in doc.element.body:
        tag = element.tag.split('}')[-1] if '}' in element.tag else element.tag
        if tag == 'p':
            t_text = ''.join(t.text or '' for t in element.findall('.//' + qn('w:t')))
            d_text = ''.join(t.text or '' for t in element.findall('.//' + qn('w:delText')))
            full = (t_text + d_text).strip()
            if not full:
                continue  # skip empty paras, keep heading_seen state
            if (section_heading in full
                    or norm_h in _norm(full)
                    or ws_h in _norm_ws(full)):
                heading_seen = True
            else:
                heading_seen = False  # non-matching non-empty para resets
        elif tag == 'tbl':
            if heading_seen:
                # Map the raw <w:tbl> element back to its python-docx Table wrapper.
                for tbl in doc.tables:
                    if tbl._tbl is element:
                        return tbl, 1.0
            heading_seen = False
    return None, 0.0
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def _find_table(doc, header_key):
    """
    Find a table whose first row cell texts start with header_key.
    Returns (table, confidence) or (None, 0.0).

    NOTE(review): an empty header_key (or all-empty header cells) makes the
    all(...) check vacuously true, so the FIRST table in the document matches.
    Callers that may hit this case (see _apply_text_replace / _apply_row_insert)
    try _find_table_by_section first for that reason.
    """
    norm_key = [_norm(h) for h in header_key]

    for tbl in doc.tables:
        if not tbl.rows:
            continue
        first_row_texts = [_norm(c.text) for c in tbl.rows[0].cells]
        # Match by prefix (header_key may have fewer columns)
        match = all(
            i < len(first_row_texts) and norm_key[i] in first_row_texts[i]
            for i in range(len(norm_key))
        )
        if match:
            return tbl, 1.0

    return None, 0.0
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def _find_row(tbl, anchor_text):
    """
    Find first row in tbl where col-0 cell text contains anchor_text.
    Returns (row_idx, confidence) or (-1, 0.0).
    Three confidence levels: 1.0 exact, 0.9 norm, 0.8 whitespace-stripped.
    """
    anchor_norm = _norm(anchor_text)
    anchor_ws = _norm_ws(anchor_text)
    fallback_idx, fallback_conf = -1, 0.0
    for row_no, row in enumerate(tbl.rows):
        first_cell = row.cells[0].text if row.cells else ''
        if anchor_text in first_cell:
            return row_no, 1.0  # exact hit wins immediately
        if anchor_norm and anchor_norm in _norm(first_cell) and fallback_conf < 0.9:
            fallback_idx, fallback_conf = row_no, 0.9
        elif anchor_ws and anchor_ws in _norm_ws(first_cell) and fallback_conf < 0.8:
            fallback_idx, fallback_conf = row_no, 0.8
    return fallback_idx, fallback_conf
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
# ββ vMerge row insertion ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 209 |
+
|
| 210 |
+
def _build_new_tr(cells_data, rev, author, date):
    """
    Build and return a new tracked-insert <w:tr> element (does NOT insert it).
    cells_data: list of dicts with keys: text, width, vmerge, style.
    """
    def _ins_attr():
        # Fresh revision id for each tracked-change element; ids must be
        # unique across the whole document (hence rev.next() every call).
        return {qn('w:id'): rev.next(), qn('w:author'): author, qn('w:date'): date}

    def _make_t(text, tag='w:t'):
        t = OxmlElement(tag)
        t.text = text or ''
        # Word drops leading/trailing spaces in w:t unless xml:space="preserve".
        if text and (text[0] in (' ', '\t') or text[-1] in (' ', '\t')):
            t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
        return t

    def _make_run(text):
        r = OxmlElement('w:r')
        r.append(_make_t(text))
        return r

    new_tr = OxmlElement('w:tr')

    # trPr: tracked row insertion
    trPr = OxmlElement('w:trPr')
    tr_ins = OxmlElement('w:ins')
    for k, v in _ins_attr().items():
        tr_ins.set(k, v)
    trPr.append(tr_ins)
    new_tr.append(trPr)

    for cd in cells_data:
        tc = OxmlElement('w:tc')

        # Cell properties: explicit width (dxa units) and optional vertical merge.
        tcPr = OxmlElement('w:tcPr')
        tcW = OxmlElement('w:tcW')
        if cd.get('width'):
            tcW.set(qn('w:w'), str(cd['width']))
            tcW.set(qn('w:type'), 'dxa')
        tcPr.append(tcW)
        if cd.get('vmerge'):
            # Continuation cell of a vertical merge (w:vMerge with no val).
            vm = OxmlElement('w:vMerge')
            tcPr.append(vm)
        tc.append(tcPr)

        # One paragraph per cell; its paragraph mark is itself marked inserted
        # (w:ins inside pPr/rPr) so Word tracks the new row's paragraph marks.
        p = OxmlElement('w:p')
        pPr = OxmlElement('w:pPr')
        if cd.get('style'):
            pStyle = OxmlElement('w:pStyle')
            pStyle.set(qn('w:val'), cd['style'])
            pPr.append(pStyle)
        rPr_para = OxmlElement('w:rPr')
        pm_ins = OxmlElement('w:ins')
        for k, v in _ins_attr().items():
            pm_ins.set(k, v)
        rPr_para.append(pm_ins)
        pPr.append(rPr_para)
        p.append(pPr)

        # Cell text is wrapped in a tracked-insert run; vMerge continuation
        # cells stay empty (their content lives in the cell they merge into).
        if cd.get('text') and not cd.get('vmerge'):
            ins_el = OxmlElement('w:ins')
            for k, v in _ins_attr().items():
                ins_el.set(k, v)
            ins_el.append(_make_run(cd['text']))
            p.append(ins_el)

        tc.append(p)
        new_tr.append(tc)

    return new_tr
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def _insert_vmerge_row(tbl, after_row_idx, cells_data, rev, author, date):
    """
    Insert a tracked row after row[after_row_idx].
    cells_data: list of dicts with keys: text, width, vmerge, style.
    Returns the inserted <w:tr> element.
    """
    anchor_tr = tbl.rows[after_row_idx]._tr
    built_tr = _build_new_tr(cells_data, rev, author, date)
    anchor_tr.addnext(built_tr)
    return built_tr
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
# ββ Section replace (direct XML transplant) βββββββββββββββββββββββββββββββββββ
|
| 294 |
+
|
| 295 |
+
def _apply_section_replace(doc, change, rev, author, date, log):
    """
    Transplant a block of CR elements (del section + ins section) directly into
    the TS, replacing the old heading+table at the matching location.

    This mirrors what Word does on copy-paste: the exact XML from the CR is
    cloned into the TS, with only the tracked-change revision IDs remapped to
    avoid conflicts.

    Appends a human-readable OK/SKIP line to `log`; returns True on success.
    """
    from lxml import etree

    loc = change['location']
    del_heading = loc.get('del_heading', '')
    has_del_table = loc.get('has_del_table', False)
    elements_xml = change.get('elements_xml', [])

    if not elements_xml:
        log.append(' SKIP section_replace: no elements in manifest')
        return False

    # ── Find the TS paragraph that matches the deleted heading ──────────────
    ts_para_elem = None
    if del_heading:
        for para in doc.paragraphs:
            pt = para.text
            if del_heading in pt or _norm(del_heading) in _norm(pt):
                ts_para_elem = para._element
                break
        if ts_para_elem is None:
            # Fallback: include paragraphs whose XML text (inc. del runs) matches
            for para in doc.paragraphs:
                if del_heading in _full_para_text(para):
                    ts_para_elem = para._element
                    break

    if ts_para_elem is None:
        log.append(f' SKIP section_replace: del_heading {del_heading!r} not found in TS')
        return False

    ts_body = ts_para_elem.getparent()

    # ── Find the table immediately after the heading (if applicable) ────────
    ts_tbl_elem = None
    if has_del_table:
        found_para = False
        for sib in ts_body:
            if sib is ts_para_elem:
                found_para = True
                continue
            if not found_para:
                continue
            sib_tag = sib.tag.split('}')[-1] if '}' in sib.tag else sib.tag
            if sib_tag == 'p':
                # Allow empty paragraphs between heading and table
                if not (''.join(t.text or '' for t in sib.findall('.//' + qn('w:t')))).strip():
                    continue
                break  # non-empty paragraph before table → no table to remove
            elif sib_tag == 'tbl':
                ts_tbl_elem = sib
                break
            else:
                break

    # ── Clone and remap IDs on the CR elements ──────────────────────────────
    cloned = []
    for xml_str in elements_xml:
        # etree.fromstring already builds a fresh tree per call, so the
        # previous deepcopy of its result was pure overhead and is removed.
        cloned_elem = etree.fromstring(xml_str)
        # Remap w:id in all tracked-change elements (must be unique per document)
        for el in cloned_elem.iter():
            if el.get(qn('w:id')) is not None:
                el.set(qn('w:id'), rev.next())
        cloned.append(cloned_elem)

    # ── Insert cloned elements before the TS heading paragraph ──────────────
    insert_idx = list(ts_body).index(ts_para_elem)
    for i, elem in enumerate(cloned):
        ts_body.insert(insert_idx + i, elem)

    # ── Remove the now-replaced TS elements ─────────────────────────────────
    ts_body.remove(ts_para_elem)
    if ts_tbl_elem is not None:
        ts_body.remove(ts_tbl_elem)

    log.append(
        f' OK section_replace: {del_heading!r} → {len(elements_xml)} element(s) spliced in'
        f' (removed heading{"+ table" if has_del_table else ""})'
    )
    return True
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
# ββ Per-change-type applicators βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 389 |
+
|
| 390 |
+
def _apply_text_replace(doc, change, rev, author, date, log):
    """
    Apply one 'text_replace' change as a tracked modification.

    Supports two location kinds:
      * 'table_cell' — old text lives in a table cell (with a row anchor, or
        scanned across rows/columns when the anchor is empty)
      * 'body_para'  — old text lives in a body paragraph
    Returns True when applied, False (with a SKIP log line) otherwise.
    """
    loc = change['location']
    old = change['old']
    new = change['new']

    if loc['kind'] == 'table_cell':
        tbl, t_conf = _find_table(doc, loc['table_header'])
        if tbl is None:
            log.append(f" SKIP text_replace: table not found {loc['table_header'][:2]!r}")
            return False
        col_idx = loc['col_idx']
        row_anchor = loc['row_anchor']

        if row_anchor:
            # Anchored path: resolve the row by its col-0 text, then replace
            # inside the (row, col) cell only.
            row_idx, r_conf = _find_row(tbl, row_anchor)
            if row_idx < 0:
                log.append(f" SKIP text_replace: row anchor not found {row_anchor!r}")
                return False
            row = tbl.rows[row_idx]
            if col_idx >= len(row.cells):
                log.append(f" SKIP text_replace: col_idx {col_idx} out of range")
                return False
            cell = row.cells[col_idx]
            for para in cell.paragraphs:
                if old in para.text:
                    tracked_modify_para(para, old, new, rev, author, date)
                    log.append(f" OK text_replace (table_cell row={row_idx} col={col_idx}): {old!r} → {new!r}")
                    return True
            log.append(f" SKIP text_replace: old text {old!r} not in cell (row={row_idx} col={col_idx})")
            return False
        else:
            # Empty row anchor: scan all rows in col_idx.
            # Prefer the table that follows the section heading (e.g. "Thirty fifth byte:")
            # because all-empty table headers match any table.
            section_heading = loc.get('section_heading', '')
            tbl_by_section, _ = _find_table_by_section(doc, section_heading)
            if tbl_by_section is not None:
                tables_to_try = [tbl_by_section] + [t for t in doc.tables if t is not tbl_by_section]
            else:
                tables_to_try = [tbl] + [t for t in doc.tables if t is not tbl]
            for search_tbl in tables_to_try:
                for r_idx, row in enumerate(search_tbl.rows):
                    if col_idx >= len(row.cells):
                        continue
                    cell = row.cells[col_idx]
                    for para in cell.paragraphs:
                        if old in para.text:
                            tracked_modify_para(para, old, new, rev, author, date)
                            log.append(f" OK text_replace (table_cell scan row={r_idx} col={col_idx}): {old!r} → {new!r}")
                            return True
            # Final fallback: scan ALL columns of ALL tables
            _all_start = tbl_by_section if tbl_by_section is not None else tbl
            for search_tbl in [_all_start] + [t for t in doc.tables if t is not _all_start]:
                for r_idx, row in enumerate(search_tbl.rows):
                    for c_idx, cell in enumerate(row.cells):
                        for para in cell.paragraphs:
                            if old in para.text:
                                tracked_modify_para(para, old, new, rev, author, date)
                                log.append(f" OK text_replace (table_cell any_col row={r_idx} col={c_idx}): {old!r} → {new!r}")
                                return True
            log.append(f" SKIP text_replace: old text {old!r} not found in any table column")
            return False

    elif loc['kind'] == 'body_para':
        ctx = loc.get('para_context', '')
        # Try to find the paragraph by old text first
        para, conf = _find_para(doc, old, prefer_not_in_table=True)
        if para is None:
            # Fall back: find by paragraph context
            para, conf = _find_para(doc, ctx, prefer_not_in_table=True)
        if para is None:
            log.append(f" SKIP text_replace: old text {old!r} not found in TS")
            return False
        # Even a context-resolved paragraph must actually contain the exact
        # old text for the tracked replacement to be safe.
        if old in para.text:
            tracked_modify_para(para, old, new, rev, author, date)
            log.append(f" OK text_replace (body_para conf={conf:.1f}): {old!r} → {new!r}")
            return True
        log.append(f" SKIP text_replace: old text {old!r} not in resolved paragraph")
        return False

    log.append(f" SKIP text_replace: unknown kind {loc['kind']!r}")
    return False
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
def _apply_para_insert(doc, change, rev, author, date, log):
    """Insert the change's paragraphs after an anchor paragraph as tracked insertions."""
    paras_data = change.get('paragraphs', [])
    anchor_text = change['location'].get('anchor_text', '')
    if not paras_data:
        return True  # nothing to insert counts as success

    anchor_para, conf = _find_para(doc, anchor_text)
    if anchor_para is None:
        log.append(f" SKIP para_insert: anchor not found {anchor_text[:60]!r}")
        return False

    items = [(entry['text'], entry['style'] or 'Normal') for entry in paras_data]
    tracked_insert_paras_after(anchor_para, items, rev, author, date)
    first_text = paras_data[0]['text'][:50] if paras_data else ''
    log.append(f" OK para_insert ({len(paras_data)} para(s) after anchor conf={conf:.1f}): {first_text!r}...")
    return True
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
    """
    Insert one tracked table row described by change['cells'] after an anchor row.

    last_inserted is shared mutable state across calls (see apply_manifest):
    it maps (id(tbl._tbl), anchor_row_idx) → the last <w:tr> inserted there,
    so successive rows targeting the same anchor keep forward order.
    Returns True on success, False (with a SKIP log line) otherwise.
    """
    loc = change['location']

    # Prefer table located by section heading (handles ambiguous all-empty headers)
    section_heading = loc.get('section_heading', '')
    tbl_by_section, _ = _find_table_by_section(doc, section_heading)
    if tbl_by_section is not None:
        tbl = tbl_by_section
    else:
        tbl, t_conf = _find_table(doc, loc['table_header'])
        if tbl is None:
            log.append(f" SKIP row_insert: table not found {loc['table_header'][:2]!r}")
            return False

    after_anchor = loc.get('after_row_anchor', '')
    row_idx, r_conf = _find_row(tbl, after_anchor)
    if row_idx < 0:
        log.append(f" SKIP row_insert: anchor row not found {after_anchor!r}")
        return False

    cells_data = change.get('cells', [])

    # Fix insertion ordering: when multiple rows target the same (tbl, row_idx),
    # each new row should go AFTER the previously inserted one, not after row_idx.
    # last_inserted maps (tbl._tbl id, row_idx) → last w:tr element inserted there.
    key = (id(tbl._tbl), row_idx)
    if last_inserted is not None and key in last_inserted:
        # Insert after the previously inserted row to maintain forward order
        prev_tr = last_inserted[key]
        new_tr = _build_new_tr(cells_data, rev, author, date)
        prev_tr.addnext(new_tr)
        last_inserted[key] = new_tr
    else:
        new_tr = _insert_vmerge_row(tbl, row_idx, cells_data, rev, author, date)
        if last_inserted is not None:
            last_inserted[key] = new_tr

    # Column 1 is assumed to hold the row's descriptive text — TODO confirm
    # against the cells layout emitted by cr_parser.
    desc = cells_data[1]['text'] if len(cells_data) > 1 else '?'
    log.append(f" OK row_insert after row[{row_idx}] ({after_anchor!r}): {desc!r}")
    return True
|
| 532 |
+
|
| 533 |
+
|
| 534 |
+
# ββ Manifest pre-processing βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 535 |
+
|
| 536 |
+
def _merge_para_inserts(manifest):
|
| 537 |
+
"""
|
| 538 |
+
Merge consecutive para_insert entries that share the same anchor_text.
|
| 539 |
+
|
| 540 |
+
When the CR parser emits multiple para_insert entries for the same anchor
|
| 541 |
+
(because [...] context markers were transparent and kept prev_stable_text
|
| 542 |
+
unchanged), each would call tracked_insert_paras_after independently.
|
| 543 |
+
Since each call starts from the same anchor element and uses addnext(),
|
| 544 |
+
later groups push earlier groups down β producing reversed order.
|
| 545 |
+
|
| 546 |
+
Merging them into one entry ensures a single tracked_insert_paras_after
|
| 547 |
+
call that inserts all paragraphs in the correct forward order.
|
| 548 |
+
"""
|
| 549 |
+
result = []
|
| 550 |
+
for change in manifest:
|
| 551 |
+
if (change.get('type') == 'para_insert'
|
| 552 |
+
and result
|
| 553 |
+
and result[-1].get('type') == 'para_insert'
|
| 554 |
+
and result[-1]['location']['anchor_text'] == change['location']['anchor_text']):
|
| 555 |
+
result[-1]['paragraphs'].extend(change['paragraphs'])
|
| 556 |
+
else:
|
| 557 |
+
merged = dict(change)
|
| 558 |
+
if change.get('type') == 'para_insert':
|
| 559 |
+
merged['paragraphs'] = list(change['paragraphs'])
|
| 560 |
+
result.append(merged)
|
| 561 |
+
return result
|
| 562 |
+
|
| 563 |
+
|
| 564 |
+
# ββ Main apply function βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 565 |
+
|
| 566 |
+
def apply_manifest(ts_path, manifest, out_path, author=DEFAULT_AUTHOR, date=DEFAULT_DATE):
    """
    Apply all changes in manifest to ts_path, save to out_path.
    Returns (n_ok, n_skipped, log_lines).
    """
    doc = docx.Document(str(ts_path))
    # Single counter so every tracked-change revision id in the output is unique.
    rev = RevCounter(doc)
    log = []
    n_ok = 0
    n_skip = 0

    # Pre-process: collapse consecutive para_inserts that share one anchor
    # so they are inserted in forward order (see _merge_para_inserts).
    manifest = _merge_para_inserts(manifest)

    # Track last inserted <w:tr> per (tbl_id, anchor_row_idx) to maintain
    # forward insertion order when multiple row_inserts target the same anchor.
    last_inserted = {}

    # Dispatch each change to its type-specific applicator; unknown types
    # are logged and counted as skipped rather than raising.
    for change in manifest:
        ctype = change.get('type')
        ok = False

        if ctype == 'section_replace':
            ok = _apply_section_replace(doc, change, rev, author, date, log)
        elif ctype == 'text_replace':
            ok = _apply_text_replace(doc, change, rev, author, date, log)
        elif ctype == 'para_insert':
            ok = _apply_para_insert(doc, change, rev, author, date, log)
        elif ctype == 'row_insert':
            ok = _apply_row_insert(doc, change, rev, author, date, log, last_inserted=last_inserted)
        else:
            log.append(f" SKIP unknown change type: {ctype!r}")

        if ok:
            n_ok += 1
        else:
            n_skip += 1

    doc.save(str(out_path))
    return n_ok, n_skip, log
|
| 605 |
+
|
| 606 |
+
|
| 607 |
+
# ββ CLI βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 608 |
+
|
| 609 |
+
def main():
    """CLI entry point: load a JSON manifest and apply it to a TS DOCX."""
    parser = argparse.ArgumentParser(description='Apply CR manifest to TS DOCX as tracked changes.')
    parser.add_argument('ts_docx', help='Target TS DOCX file')
    parser.add_argument('manifest', help='JSON manifest from cr_parser.py')
    parser.add_argument('--author', default=DEFAULT_AUTHOR, help='Tracked change author')
    parser.add_argument('--output', default=None, help='Output path (default: <ts>_applied.docx)')
    args = parser.parse_args()

    ts_path = Path(args.ts_docx)
    if args.output:
        out_path = Path(args.output)
    else:
        out_path = ts_path.parent / (ts_path.stem + '_applied.docx')

    with open(args.manifest, encoding='utf-8') as fh:
        manifest = json.load(fh)

    print(f'Applying {len(manifest)} change(s) from manifest to {ts_path.name}...')
    n_ok, n_skip, log = apply_manifest(ts_path, manifest, out_path, author=args.author)

    for entry in log:
        print(entry)
    print(f'\nResult: {n_ok} applied, {n_skip} skipped')
    print(f'Output: {out_path}')
|
| 630 |
+
|
| 631 |
+
|
| 632 |
+
# Script entry point.
if __name__ == '__main__':
    main()
|