Spaces:
Running
Running
modify UI, independent downloads away from docfinder, warnings, retry and manual upload
Browse files- app.py +211 -18
- scripts/etsi_client.py +495 -0
- scripts/fetch_crs.py +48 -80
- scripts/finalize_ts.py +25 -11
- scripts/orchestrate_cr.py +273 -11
- scripts/ts_applicator.py +131 -22
app.py
CHANGED
|
@@ -23,6 +23,30 @@ from pathlib import Path
|
|
| 23 |
|
| 24 |
import streamlit as st
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# ββ Scripts dir (same folder as app.py / scripts/) βββββββββββββββββββββββββββ
|
| 27 |
SCRIPTS_DIR = Path(__file__).parent / "scripts"
|
| 28 |
sys.path.insert(0, str(SCRIPTS_DIR))
|
|
@@ -74,7 +98,7 @@ def save_state(sid: str, state: dict) -> None:
|
|
| 74 |
def new_state(sid: str) -> dict:
|
| 75 |
return {
|
| 76 |
"session_id": sid,
|
| 77 |
-
"status": "
|
| 78 |
"excel_filename": None,
|
| 79 |
"person_name": "Ly Thanh PHAN",
|
| 80 |
"cr_list": [],
|
|
@@ -126,21 +150,34 @@ def tail_log(log_path: str, n: int = 100) -> str:
|
|
| 126 |
|
| 127 |
|
| 128 |
def parse_log_results(log_path: str) -> list[dict]:
|
| 129 |
-
"""Extract per-TS result lines from the Final Report
|
| 130 |
p = Path(log_path)
|
| 131 |
if not p.exists():
|
| 132 |
return []
|
| 133 |
lines = p.read_text(errors="replace").splitlines()
|
| 134 |
results, in_report = [], False
|
|
|
|
| 135 |
for line in lines:
|
| 136 |
-
if "Final Report" in line:
|
| 137 |
in_report = True
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
return results
|
| 145 |
|
| 146 |
|
|
@@ -231,6 +268,11 @@ if "sid" not in st.session_state:
|
|
| 231 |
sid: str = st.session_state.sid
|
| 232 |
state: dict = st.session_state.state
|
| 233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
# ββ Sidebar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 235 |
with st.sidebar:
|
| 236 |
st.header("Session")
|
|
@@ -251,10 +293,37 @@ with st.sidebar:
|
|
| 251 |
# ββ State machine βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 252 |
status: str = state["status"]
|
| 253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 255 |
# UPLOAD
|
| 256 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 257 |
-
|
| 258 |
st.subheader("Step 1 β Upload contribution list")
|
| 259 |
|
| 260 |
uploaded = st.file_uploader(
|
|
@@ -330,12 +399,16 @@ elif status == "preview":
|
|
| 330 |
"--output-dir", str(output_dir),
|
| 331 |
]
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
log_file = open(str(log_path), "w")
|
| 334 |
proc = subprocess.Popen(
|
| 335 |
cmd,
|
| 336 |
stdout=log_file,
|
| 337 |
stderr=subprocess.STDOUT,
|
| 338 |
-
env=
|
| 339 |
)
|
| 340 |
log_file.close()
|
| 341 |
|
|
@@ -405,25 +478,46 @@ elif status in ("done", "error"):
|
|
| 405 |
else:
|
| 406 |
st.error(f"β Pipeline finished with errors (return code: {rc})")
|
| 407 |
|
| 408 |
-
# Per-TS results table
|
| 409 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
if results:
|
| 411 |
st.subheader("Results per TS")
|
| 412 |
import pandas as pd
|
| 413 |
|
| 414 |
-
|
|
|
|
|
|
|
| 415 |
|
| 416 |
def _color_status(val):
|
| 417 |
return {
|
| 418 |
"OK": "background-color: #d4edda; color: #155724",
|
| 419 |
"WARN": "background-color: #fff3cd; color: #856404",
|
| 420 |
"FAIL": "background-color: #f8d7da; color: #721c24",
|
|
|
|
| 421 |
}.get(val, "")
|
| 422 |
|
| 423 |
-
|
| 424 |
-
df.
|
| 425 |
-
|
| 426 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
|
| 428 |
# Download ZIP
|
| 429 |
if output_dir.exists() and any(output_dir.rglob("*")):
|
|
@@ -446,6 +540,105 @@ elif status in ("done", "error"):
|
|
| 446 |
else:
|
| 447 |
st.text("Log not found.")
|
| 448 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
# Start new session
|
| 450 |
st.divider()
|
| 451 |
if st.button("Start new session"):
|
|
|
|
| 23 |
|
| 24 |
import streamlit as st
|
| 25 |
|
| 26 |
+
# ── EOL credential verification ───────────────────────────────────────────────
|
| 27 |
+
|
| 28 |
+
def verify_eol_credentials(username: str, password: str) -> bool:
    """Check an ETSI EOL username/password pair against the portal login endpoint.

    The portal landing page is fetched first so the session carries the
    portal's cookies, then the credentials are POSTed as JSON to
    ``LoginEOL.ashx``.

    Args:
        username: EOL account name.
        password: EOL account password.

    Returns:
        True when the endpoint answers with a non-error HTTP status and a body
        other than the literal ``"Failed"``; False otherwise.

    Raises:
        requests.RequestException: on network failure or timeout (not caught
            here, so the caller decides how to report it).
    """
    import json as _json

    import requests as _req
    import urllib3

    # TLS verification is disabled for the portal calls below, so silence the
    # per-request InsecureRequestWarning that would otherwise flood the log.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # The context manager closes the pooled connections once the check is done.
    with _req.Session() as session:
        session.get(
            "https://portal.etsi.org/LoginRedirection.aspx",
            verify=False,
            timeout=10,
        )
        resp = session.post(
            "https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
            data=_json.dumps({"username": username, "password": password}),
            headers={"Content-Type": "application/json; charset=UTF-8"},
            verify=False,
            allow_redirects=False,
            timeout=10,
        )

    # A 4xx/5xx answer (server error, proxy block page, ...) never contains the
    # literal "Failed" body, so it must not be mistaken for a successful login.
    return resp.ok and resp.text.strip() != "Failed"
|
| 48 |
+
|
| 49 |
+
|
| 50 |
# ── Scripts dir (same folder as app.py / scripts/) ───────────────────────────
|
| 51 |
SCRIPTS_DIR = Path(__file__).parent / "scripts"
|
| 52 |
sys.path.insert(0, str(SCRIPTS_DIR))
|
|
|
|
| 98 |
def new_state(sid: str) -> dict:
|
| 99 |
return {
|
| 100 |
"session_id": sid,
|
| 101 |
+
"status": "login",
|
| 102 |
"excel_filename": None,
|
| 103 |
"person_name": "Ly Thanh PHAN",
|
| 104 |
"cr_list": [],
|
|
|
|
| 150 |
|
| 151 |
|
| 152 |
def parse_log_results(log_path: str) -> list[dict]:
    """Extract per-TS result lines and their warnings from a pipeline log.

    Only lines after a "Final Report" or "Retry Summary" header are examined.
    A line containing ``[OK]``, ``[WARN]``, ``[FAIL]`` or ``[SKIP]`` starts a
    new result whose "TS" value is the text following the tag.  Subsequent
    lines starting with ``"! "`` (after stripping) are attached to the most
    recent result as warnings.

    Args:
        log_path: Path of the log file to parse.

    Returns:
        A list of ``{"Status": str, "TS": str, "warnings": list[str]}`` dicts
        in log order; empty when the path is not a regular file or no report
        section is present.
    """
    log_file = Path(log_path)
    # is_file() also rejects directories, which read_text() cannot open.
    if not log_file.is_file():
        return []

    status_tags = ("OK", "WARN", "FAIL", "SKIP")
    results: list[dict] = []
    current = None
    in_report = False

    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default encoding; undecodable bytes are replaced rather than raising.
    text = log_file.read_text(encoding="utf-8", errors="replace")
    for line in text.splitlines():
        if "Final Report" in line or "Retry Summary" in line:
            in_report = True
            continue
        if not in_report:
            continue

        tag = next((t for t in status_tags if f"[{t}]" in line), None)
        if tag is not None:
            ts_name = line.split(f"[{tag}]", 1)[-1].strip()
            current = {"Status": tag, "TS": ts_name, "warnings": []}
            # Registered immediately; warnings found later mutate this dict.
            results.append(current)
            continue

        stripped = line.strip()
        if current is not None and stripped.startswith("! "):
            current["warnings"].append(stripped[2:])

    return results
|
| 182 |
|
| 183 |
|
|
|
|
| 268 |
sid: str = st.session_state.sid
|
| 269 |
state: dict = st.session_state.state
|
| 270 |
|
| 271 |
+
# Credential guard: if credentials are not in memory (e.g. page refresh after login),
|
| 272 |
+
# force re-login regardless of the persisted status.
|
| 273 |
+
if state.get("status") not in ("login",) and "eol_user" not in st.session_state:
|
| 274 |
+
state["status"] = "login"
|
| 275 |
+
|
| 276 |
# ββ Sidebar βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 277 |
with st.sidebar:
|
| 278 |
st.header("Session")
|
|
|
|
| 293 |
# ββ State machine βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 294 |
status: str = state["status"]
|
| 295 |
|
| 296 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 297 |
+
# LOGIN
|
| 298 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 299 |
+
if status == "login":
|
| 300 |
+
st.subheader("Connect with your ETSI EOL account")
|
| 301 |
+
st.info(
|
| 302 |
+
"Your credentials are used only for this session and are never stored on disk.",
|
| 303 |
+
icon="π",
|
| 304 |
+
)
|
| 305 |
+
username = st.text_input("EOL Username")
|
| 306 |
+
password = st.text_input("EOL Password", type="password")
|
| 307 |
+
|
| 308 |
+
if st.button("Connect", type="primary"):
|
| 309 |
+
if not username or not password:
|
| 310 |
+
st.error("Please enter both username and password.")
|
| 311 |
+
else:
|
| 312 |
+
with st.spinner("Verifying credentialsβ¦"):
|
| 313 |
+
ok = verify_eol_credentials(username, password)
|
| 314 |
+
if ok:
|
| 315 |
+
st.session_state.eol_user = username
|
| 316 |
+
st.session_state.eol_password = password
|
| 317 |
+
state["status"] = "upload"
|
| 318 |
+
save_state(sid, state)
|
| 319 |
+
st.rerun()
|
| 320 |
+
else:
|
| 321 |
+
st.error("Login failed β check your EOL username and password.")
|
| 322 |
+
|
| 323 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 324 |
# UPLOAD
|
| 325 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 326 |
+
elif status == "upload":
|
| 327 |
st.subheader("Step 1 β Upload contribution list")
|
| 328 |
|
| 329 |
uploaded = st.file_uploader(
|
|
|
|
| 399 |
"--output-dir", str(output_dir),
|
| 400 |
]
|
| 401 |
|
| 402 |
+
env = os.environ.copy()
|
| 403 |
+
env["EOL_USER"] = st.session_state.eol_user
|
| 404 |
+
env["EOL_PASSWORD"] = st.session_state.eol_password
|
| 405 |
+
|
| 406 |
log_file = open(str(log_path), "w")
|
| 407 |
proc = subprocess.Popen(
|
| 408 |
cmd,
|
| 409 |
stdout=log_file,
|
| 410 |
stderr=subprocess.STDOUT,
|
| 411 |
+
env=env,
|
| 412 |
)
|
| 413 |
log_file.close()
|
| 414 |
|
|
|
|
| 478 |
else:
|
| 479 |
st.error(f"β Pipeline finished with errors (return code: {rc})")
|
| 480 |
|
| 481 |
+
# Per-TS results table β merge all pipeline logs so retry results don't
|
| 482 |
+
# replace original ones; later logs (pipeline_retry.log) supersede earlier
|
| 483 |
+
# ones (pipeline.log) for the same TS key.
|
| 484 |
+
_merged: dict[str, dict] = {}
|
| 485 |
+
for _lf in sorted(session_dir(sid).glob("pipeline*.log")):
|
| 486 |
+
for _r in parse_log_results(str(_lf)):
|
| 487 |
+
_merged[_r["TS"]] = _r
|
| 488 |
+
results = list(_merged.values())
|
| 489 |
if results:
|
| 490 |
st.subheader("Results per TS")
|
| 491 |
import pandas as pd
|
| 492 |
|
| 493 |
+
n_warn = sum(1 for r in results if r["warnings"])
|
| 494 |
+
warn_label = f"Warnings ({n_warn})" if n_warn else "Warnings"
|
| 495 |
+
tab_summary, tab_warnings = st.tabs(["Summary", warn_label])
|
| 496 |
|
| 497 |
def _color_status(val):
|
| 498 |
return {
|
| 499 |
"OK": "background-color: #d4edda; color: #155724",
|
| 500 |
"WARN": "background-color: #fff3cd; color: #856404",
|
| 501 |
"FAIL": "background-color: #f8d7da; color: #721c24",
|
| 502 |
+
"SKIP": "background-color: #e2e3e5; color: #383d41",
|
| 503 |
}.get(val, "")
|
| 504 |
|
| 505 |
+
with tab_summary:
|
| 506 |
+
df = pd.DataFrame([{"Status": r["Status"], "TS": r["TS"]} for r in results])
|
| 507 |
+
st.dataframe(
|
| 508 |
+
df.style.map(_color_status, subset=["Status"]),
|
| 509 |
+
use_container_width=True,
|
| 510 |
+
)
|
| 511 |
+
|
| 512 |
+
with tab_warnings:
|
| 513 |
+
warned = [r for r in results if r["warnings"]]
|
| 514 |
+
if warned:
|
| 515 |
+
for r in warned:
|
| 516 |
+
with st.expander(f"β οΈ {r['TS']} β {len(r['warnings'])} warning(s)"):
|
| 517 |
+
for w in r["warnings"]:
|
| 518 |
+
st.text(w)
|
| 519 |
+
else:
|
| 520 |
+
st.success("No warnings.")
|
| 521 |
|
| 522 |
# Download ZIP
|
| 523 |
if output_dir.exists() and any(output_dir.rglob("*")):
|
|
|
|
| 540 |
else:
|
| 541 |
st.text("Log not found.")
|
| 542 |
|
| 543 |
+
# ββ TS Recovery βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 544 |
+
failed_ts_path = output_dir / "failed_ts.json"
|
| 545 |
+
if failed_ts_path.exists():
|
| 546 |
+
failed_ts_entries = json.loads(failed_ts_path.read_text())
|
| 547 |
+
if failed_ts_entries:
|
| 548 |
+
st.divider()
|
| 549 |
+
st.subheader("β οΈ Recover failed TS downloads")
|
| 550 |
+
st.info(
|
| 551 |
+
f"{len(failed_ts_entries)} TS(s) could not be downloaded. "
|
| 552 |
+
"Retry or upload each one manually, then apply the CRs."
|
| 553 |
+
)
|
| 554 |
+
|
| 555 |
+
for entry in failed_ts_entries:
|
| 556 |
+
spec_key = f"{entry['spec_number']} v{entry['version']}"
|
| 557 |
+
dest_path = Path(entry["spec_dir"]) / entry["expected_filename"]
|
| 558 |
+
ready = dest_path.exists()
|
| 559 |
+
|
| 560 |
+
label = f"{'β
' if ready else 'β'} TS {spec_key} β CRs: {', '.join(entry['cr_uids'])}"
|
| 561 |
+
with st.expander(label, expanded=not ready):
|
| 562 |
+
col1, col2 = st.columns(2)
|
| 563 |
+
|
| 564 |
+
with col1:
|
| 565 |
+
if st.button("π Retry download",
|
| 566 |
+
key=f"retry_{entry['spec_compact']}_{entry['version']}"):
|
| 567 |
+
from fetch_crs import download_ts as _dl_ts
|
| 568 |
+
with st.spinner(f"Downloading TS {spec_key}β¦"):
|
| 569 |
+
fn, note = _dl_ts(
|
| 570 |
+
entry["spec_number"], entry["version"],
|
| 571 |
+
Path(entry["spec_dir"]),
|
| 572 |
+
st.session_state.eol_user,
|
| 573 |
+
st.session_state.eol_password,
|
| 574 |
+
)
|
| 575 |
+
if fn:
|
| 576 |
+
st.success(f"Downloaded: {fn}")
|
| 577 |
+
st.rerun()
|
| 578 |
+
else:
|
| 579 |
+
st.error(f"Failed: {note}")
|
| 580 |
+
|
| 581 |
+
with col2:
|
| 582 |
+
uploaded_ts = st.file_uploader(
|
| 583 |
+
f"Or upload `{entry['expected_filename']}`",
|
| 584 |
+
type=["docx"],
|
| 585 |
+
key=f"upload_{entry['spec_compact']}_{entry['version']}",
|
| 586 |
+
)
|
| 587 |
+
if uploaded_ts is not None:
|
| 588 |
+
Path(entry["spec_dir"]).mkdir(parents=True, exist_ok=True)
|
| 589 |
+
dest_path.write_bytes(uploaded_ts.read())
|
| 590 |
+
st.success("Saved β")
|
| 591 |
+
st.rerun()
|
| 592 |
+
|
| 593 |
+
# Global apply button β enabled when β₯1 TS is now on disk
|
| 594 |
+
ready_entries = [
|
| 595 |
+
e for e in failed_ts_entries
|
| 596 |
+
if (Path(e["spec_dir"]) / e["expected_filename"]).exists()
|
| 597 |
+
]
|
| 598 |
+
remaining = len(failed_ts_entries) - len(ready_entries)
|
| 599 |
+
|
| 600 |
+
if ready_entries:
|
| 601 |
+
if remaining:
|
| 602 |
+
st.warning(f"{len(ready_entries)} ready, {remaining} will be skipped.")
|
| 603 |
+
else:
|
| 604 |
+
st.success(f"All {len(ready_entries)} TS(s) ready.")
|
| 605 |
+
|
| 606 |
+
if st.button("βΆ Apply CRs to recovered TSs", type="primary"):
|
| 607 |
+
retry_log = str(session_dir(sid) / "pipeline_retry.log")
|
| 608 |
+
_rc_path(sid).unlink(missing_ok=True) # clear old returncode
|
| 609 |
+
|
| 610 |
+
cmd = [
|
| 611 |
+
sys.executable,
|
| 612 |
+
str(SCRIPTS_DIR / "orchestrate_cr.py"),
|
| 613 |
+
"--output-dir", state["output_dir"],
|
| 614 |
+
"--retry-mode",
|
| 615 |
+
]
|
| 616 |
+
env = os.environ.copy()
|
| 617 |
+
env["EOL_USER"] = st.session_state.eol_user
|
| 618 |
+
env["EOL_PASSWORD"] = st.session_state.eol_password
|
| 619 |
+
|
| 620 |
+
log_file = open(retry_log, "w")
|
| 621 |
+
proc = subprocess.Popen(
|
| 622 |
+
cmd, stdout=log_file, stderr=subprocess.STDOUT, env=env
|
| 623 |
+
)
|
| 624 |
+
log_file.close()
|
| 625 |
+
|
| 626 |
+
threading.Thread(
|
| 627 |
+
target=_run_and_save_rc,
|
| 628 |
+
args=(proc, _rc_path(sid)),
|
| 629 |
+
daemon=True,
|
| 630 |
+
).start()
|
| 631 |
+
st.session_state.proc = proc
|
| 632 |
+
|
| 633 |
+
state["status"] = "running"
|
| 634 |
+
state["pid"] = proc.pid
|
| 635 |
+
state["log_path"] = retry_log
|
| 636 |
+
state["started_at"] = datetime.now().isoformat()
|
| 637 |
+
save_state(sid, state)
|
| 638 |
+
st.rerun()
|
| 639 |
+
else:
|
| 640 |
+
st.warning("No TSs available yet β retry download or upload DOCX files above.")
|
| 641 |
+
|
| 642 |
# Start new session
|
| 643 |
st.divider()
|
| 644 |
if st.button("Start new session"):
|
scripts/etsi_client.py
ADDED
|
@@ -0,0 +1,495 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
etsi_client.py β ETSI document download helpers for ApplyCRs.
|
| 3 |
+
|
| 4 |
+
Provides:
|
| 5 |
+
ETSIDocFinder β CR TDoc downloads via docbox.etsi.org
|
| 6 |
+
ETSISpecFinder β TS DOCX downloads via portal.etsi.org WKI chain
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
import re
|
| 12 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 13 |
+
from urllib.parse import urljoin
|
| 14 |
+
|
| 15 |
+
import requests
|
| 16 |
+
import urllib3
|
| 17 |
+
from bs4 import BeautifulSoup
|
| 18 |
+
|
| 19 |
+
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _get_proxies() -> dict:
    """Return a requests-compatible proxies dict built from the environment.

    ``$http_proxy`` / ``$HTTP_PROXY`` supplies the HTTP proxy.  The HTTPS proxy
    comes from ``$https_proxy`` / ``$HTTPS_PROXY`` when set and otherwise falls
    back to the HTTP proxy, so an environment that only defines ``http_proxy``
    gets the same mapping for both schemes.

    Returns:
        A dict with an ``"http"`` and/or ``"https"`` entry, or ``{}`` when no
        proxy variable is set.
    """
    env = os.environ
    http_proxy = env.get("http_proxy") or env.get("HTTP_PROXY") or ""
    https_proxy = env.get("https_proxy") or env.get("HTTPS_PROXY") or http_proxy

    proxies = {}
    if http_proxy:
        proxies["http"] = http_proxy
    if https_proxy:
        proxies["https"] = https_proxy
    return proxies
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class ETSIDocFinder:
    """Find and download contribution documents (TDocs) on docbox.etsi.org.

    A ``requests.Session`` logged in to the ETSI portal is created on
    construction through :meth:`connect` and reused by every lookup and
    download method.
    """

    HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/136.0.0.0 Safari/537.36"
        )
    }

    def __init__(self, eol_user: str, eol_password: str):
        """Store the EOL credentials and open a portal session.

        NOTE: a rejected login does not raise here; :meth:`connect` reports it
        through its ``"error"`` flag and the (unauthenticated) session is kept.
        """
        self.eol_user = eol_user
        self.eol_password = eol_password
        self.main_ftp_url = "https://docbox.etsi.org/SET"
        req_data = self.connect()
        self.session = req_data["session"]

    def connect(self):
        """Log in to the ETSI portal with the stored credentials.

        Returns:
            A dict with keys ``"error"`` (bool), ``"session"`` (the
            ``requests.Session`` that was used) and ``"message"`` (str).
            ``self.session`` is replaced only when the login succeeds.
        """
        session = requests.Session()
        session.headers.update(self.HEADERS)
        session.proxies.update(_get_proxies())

        # Seed DNN session cookies — docbox requires the portal session to be
        # initialised with domain=docbox.etsi.org so the .DOTNETNUKE cookie
        # is scoped to .etsi.org and accepted by docbox.etsi.org as well.
        login_redir_url = (
            "https://portal.etsi.org/LoginRedirection.aspx"
            "?domain=docbox.etsi.org&ReturnUrl=/"
        )
        session.get(login_redir_url, verify=False, timeout=15)

        req = session.post(
            "https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
            data=json.dumps({"username": self.eol_user, "password": self.eol_password}),
            headers={
                "Content-Type": "application/json; charset=UTF-8",
                "Referer": login_redir_url,
            },
            verify=False,
            allow_redirects=False,
            timeout=15,
        )
        # strip() so a reply padded with whitespace/newline is still recognised
        # as a failed login.
        if req.text.strip() == "Failed":
            return {
                "error": True,
                "session": session,
                "message": "Login failed! Check your credentials",
            }
        self.session = session
        return {"error": False, "session": session, "message": "Login successful"}

    def download_document(self, url: str) -> bytes:
        """Download a docbox file using the authenticated session.

        If the session has expired the portal redirects to LoginRedirection —
        this is detected and the client re-authenticates once before retrying.

        Returns:
            The raw body of the final response.
        """
        resp = self.session.get(url, verify=False, timeout=30, allow_redirects=True)
        if resp.url and "LoginRedirection" in resp.url:
            self.connect()
            resp = self.session.get(url, verify=False, timeout=30, allow_redirects=True)
        return resp.content

    def get_workgroup(self, doc: str):
        """Derive the docbox group folder and year folder from a document id.

        The prefix selects the group ("SETREQ"/"SCPREQ" -> "SET-WG-R",
        "SETTEC"/"SCPTEC" -> "SET-WG-T", any other "SET"/"SCP" -> "SET") and the
        first parenthesised group, prefixed with "20", gives the year folder.

        Returns:
            ``(main_tsg, workgroup, doc)``, or ``(None, None, None)`` when the
            prefix is unknown or the id carries no parenthesised group.
        """
        # More specific prefixes first: "SETREQ" also starts with "SET".
        if doc.startswith(("SETREQ", "SCPREQ")):
            main_tsg = "SET-WG-R"
        elif doc.startswith(("SETTEC", "SCPTEC")):
            main_tsg = "SET-WG-T"
        elif doc.startswith(("SET", "SCP")):
            main_tsg = "SET"
        else:
            return None, None, None

        year_match = re.search(r"\(([^)]+)\)", doc)
        if year_match is None:
            # No "(yy)" part: treat like an unknown document instead of
            # failing with AttributeError on the missing match.
            return None, None, None
        return main_tsg, "20" + year_match.group(1), doc

    def find_workgroup_url(self, main_tsg, workgroup):
        """Return the URL of the contributions folder matching *workgroup*.

        The ``05-CONTRIBUTIONS`` listing of *main_tsg* is scanned for the first
        row whose link text contains *workgroup*; when none matches, the URL is
        built from *workgroup* directly.
        """
        url = f"{self.main_ftp_url}/{main_tsg}/05-CONTRIBUTIONS"
        response = self.session.get(url, verify=False, timeout=15)
        if "LoginRedirection" in response.url:
            # Session expired: log in again and repeat the request once.
            self.connect()
            response = self.session.get(url, verify=False, timeout=15)
        soup = BeautifulSoup(response.text, "html.parser")
        for row in soup.find_all("tr"):
            link = row.find("a")
            if link and workgroup in link.get_text():
                return f"{self.main_ftp_url}/{main_tsg}/05-CONTRIBUTIONS/{link.get_text()}"
        return f"{self.main_ftp_url}/{main_tsg}/05-CONTRIBUTIONS/{workgroup}"

    def get_docs_from_url(self, url):
        """Return the link texts found in the table cells of the page at *url*.

        Best-effort: any failure is printed and an empty list is returned.
        """
        try:
            response = self.session.get(url, verify=False, timeout=15)
            soup = BeautifulSoup(response.text, "html.parser")
            return [item.get_text() for item in soup.select("tr td a")]
        except Exception as e:  # deliberate best-effort listing
            print(f"Error accessing {url}: {e}")
            return []

    def search_document(self, doc_id: str):
        """Search docbox for *doc_id* and return the URL of the matching file.

        Entries of the year folder are matched case-insensitively against the
        document id; entries without a dot in their name are treated as
        sub-folders and searched one level deep.

        Returns:
            The URL of the last match found, or the string
            ``"Document <doc_id> not found"`` when nothing matches.
        """
        main_tsg, workgroup, doc = self.get_workgroup(doc_id)
        urls = []
        if main_tsg:
            # Listing entries are lowercased for comparison, so the id must be
            # lowercased too or upper-case ids can never match that branch.
            needle = doc.lower()
            wg_url = self.find_workgroup_url(main_tsg, workgroup)
            if wg_url:
                for entry in self.get_docs_from_url(wg_url):
                    if needle in entry.lower():
                        urls.append(f"{wg_url}/{entry}")
                    elif "." not in entry.rstrip("/"):
                        sub_url = f"{wg_url}/{entry}"
                        urls.extend(
                            f"{sub_url}/{name}"
                            for name in self.get_docs_from_url(sub_url)
                            if needle in name.lower()
                        )
        return urls[-1] if urls else f"Document {doc_id} not found"
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class ETSISpecFinder:
|
| 160 |
+
def __init__(self, eol_user: str, eol_password: str):
|
| 161 |
+
self.eol_user = eol_user
|
| 162 |
+
self.eol_password = eol_password
|
| 163 |
+
self.main_url = "https://www.etsi.org/deliver/etsi_ts"
|
| 164 |
+
self.second_url = "https://www.etsi.org/deliver/etsi_tr"
|
| 165 |
+
self.headers = {
|
| 166 |
+
"User-Agent": (
|
| 167 |
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
| 168 |
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
| 169 |
+
"Chrome/136.0.0.0 Safari/537.36"
|
| 170 |
+
)
|
| 171 |
+
}
|
| 172 |
+
|
| 173 |
+
def get_spec_path(self, doc_id: str):
    """Build the ``<range>/<folder>`` path fragment for a spec number.

    The part before the first hyphen (spaces removed) is the base number; the
    range is the enclosing block of 100 (e.g. ``102200_102299``).  Every
    hyphen-separated part after it is left-padded to two digits and appended
    to the base number to form the folder name, so multi-part ids such as
    ``"300 392-3-5"`` are accepted instead of raising on unpacking.

    Args:
        doc_id: Spec number such as ``"102 221"`` or ``"103 666-1"``.

    Returns:
        The path fragment, e.g. ``"103600_103699/10366601"``.

    Raises:
        ValueError: if the base number is not an integer.
    """
    position, *parts = doc_id.split("-")
    position = position.replace(" ", "")
    # Empty parts (trailing hyphen) contribute nothing to the folder name.
    spec_folder = position + "".join(part.rjust(2, "0") for part in parts if part)
    range_start = int(position) - int(position) % 100
    return f"{range_start}_{range_start + 99}/{spec_folder}"
|
| 188 |
+
|
| 189 |
+
def get_docs_from_url(self, url):
    """Return the anchor texts of the page at *url*, minus the first anchor.

    Best-effort: any failure is printed and an empty list is returned.
    """
    try:
        page = requests.get(url, verify=False, timeout=15, proxies=_get_proxies())
        anchors = BeautifulSoup(page.text, "html.parser").find_all("a")
        # The first anchor of the listing is dropped, as in the original code.
        return [anchor.get_text() for anchor in anchors][1:]
    except Exception as e:
        print(f"Error accessing {url}: {e}")
        return []
|
| 200 |
+
|
| 201 |
+
def _normalise_version(self, version: str) -> str:
|
| 202 |
+
"""Normalise a user-supplied version string to ETSI zero-padded format.
|
| 203 |
+
'17.6.0' -> '17.06.00' (the '_60' release suffix is ignored during matching)
|
| 204 |
+
Already-normalised strings like '17.06.00' are returned unchanged."""
|
| 205 |
+
parts = version.strip("/").split(".")
|
| 206 |
+
if len(parts) == 3:
|
| 207 |
+
try:
|
| 208 |
+
return f"{int(parts[0]):02d}.{int(parts[1]):02d}.{int(parts[2]):02d}"
|
| 209 |
+
except ValueError:
|
| 210 |
+
pass
|
| 211 |
+
return version.strip("/")
|
| 212 |
+
|
| 213 |
+
def _pick_release(self, releases: list, version: str = None) -> str:
|
| 214 |
+
"""Return the release folder matching version, or the latest if not found/specified."""
|
| 215 |
+
if version:
|
| 216 |
+
target = self._normalise_version(version)
|
| 217 |
+
for r in releases:
|
| 218 |
+
folder = r.strip("/").split("_")[0]
|
| 219 |
+
if folder == target:
|
| 220 |
+
return r
|
| 221 |
+
return releases[-1]
|
| 222 |
+
|
| 223 |
+
def search_document(self, doc_id: str, version: str = None):
    """Return the URL of the first PDF found for a spec, or a not-found message.

    The TS tree is tried first, then the TR tree.  In each, the release folder
    is chosen by ``_pick_release`` and the first file ending in ``.pdf`` wins.
    """
    spec_path = self.get_spec_path(doc_id)
    base_urls = [f"{self.main_url}/{spec_path}/", f"{self.second_url}/{spec_path}/"]

    # Both candidate URLs are printed up front, before any listing is fetched.
    for base in base_urls:
        print(base)

    for base in base_urls:
        releases = self.get_docs_from_url(base)
        if not releases:
            continue
        release = self._pick_release(releases, version)
        for name in self.get_docs_from_url(base + release):
            if name.endswith(".pdf"):
                return base + release + "/" + name

    return f"Specification {doc_id} not found"
|
| 247 |
+
|
| 248 |
+
def _get_wki_id_candidates(self, doc_id: str, version: str = None) -> tuple:
|
| 249 |
+
"""Return (candidates, version_str) for a spec version (best match first)."""
|
| 250 |
+
if version:
|
| 251 |
+
version_str = version
|
| 252 |
+
else:
|
| 253 |
+
pdf_url = self.search_document(doc_id)
|
| 254 |
+
if "not found" in pdf_url.lower():
|
| 255 |
+
return [], ""
|
| 256 |
+
parts = pdf_url.rstrip("/").split("/")
|
| 257 |
+
version_folder = parts[-2] # e.g. "18.04.00_60"
|
| 258 |
+
v_parts = version_folder.split("_")[0].split(".") # ["18", "04", "00"]
|
| 259 |
+
try:
|
| 260 |
+
version_str = f"{int(v_parts[0])}.{int(v_parts[1])}.{int(v_parts[2])}"
|
| 261 |
+
except (ValueError, IndexError):
|
| 262 |
+
return [], ""
|
| 263 |
+
|
| 264 |
+
def fetch_candidates():
|
| 265 |
+
spec_num = doc_id.split("-")[0].replace(" ", "")
|
| 266 |
+
import datetime
|
| 267 |
+
today = datetime.date.today().isoformat()
|
| 268 |
+
|
| 269 |
+
base_params = {
|
| 270 |
+
"option": "com_standardssearch",
|
| 271 |
+
"view": "data",
|
| 272 |
+
"format": "json",
|
| 273 |
+
"page": "1",
|
| 274 |
+
"title": "1",
|
| 275 |
+
"etsiNumber": "1",
|
| 276 |
+
"content": "1",
|
| 277 |
+
"version": "0",
|
| 278 |
+
"onApproval": "1",
|
| 279 |
+
"published": "1",
|
| 280 |
+
"withdrawn": "1",
|
| 281 |
+
"historical": "1",
|
| 282 |
+
"isCurrent": "1",
|
| 283 |
+
"superseded": "1",
|
| 284 |
+
"startDate": "1988-01-15",
|
| 285 |
+
"endDate": today,
|
| 286 |
+
"harmonized": "0",
|
| 287 |
+
"keyword": "",
|
| 288 |
+
"TB": "",
|
| 289 |
+
"stdType": "",
|
| 290 |
+
"frequency": "",
|
| 291 |
+
"mandate": "",
|
| 292 |
+
"collection": "",
|
| 293 |
+
"sort": "1",
|
| 294 |
+
}
|
| 295 |
+
|
| 296 |
+
# ETSI UI sends capital-V version; try both to be safe
|
| 297 |
+
queries = [
|
| 298 |
+
f"{doc_id} V{version_str}", # e.g. "104 005 V1.2.1" (UI format)
|
| 299 |
+
f"{doc_id} v{version_str}", # e.g. "104 005 v1.2.1"
|
| 300 |
+
doc_id, # e.g. "104 005" (wider net)
|
| 301 |
+
]
|
| 302 |
+
seen = {}
|
| 303 |
+
for query in queries:
|
| 304 |
+
params = {**base_params, "search": query}
|
| 305 |
+
try:
|
| 306 |
+
resp = requests.get(
|
| 307 |
+
"https://www.etsi.org/",
|
| 308 |
+
params=params,
|
| 309 |
+
headers=self.headers,
|
| 310 |
+
verify=False,
|
| 311 |
+
timeout=15,
|
| 312 |
+
proxies=_get_proxies(),
|
| 313 |
+
)
|
| 314 |
+
data = resp.json()
|
| 315 |
+
if data and isinstance(data, list):
|
| 316 |
+
hits = [
|
| 317 |
+
str(item["wki_id"])
|
| 318 |
+
for item in data
|
| 319 |
+
if "wki_id" in item and spec_num in json.dumps(item)
|
| 320 |
+
]
|
| 321 |
+
for h in hits:
|
| 322 |
+
seen[h] = None
|
| 323 |
+
if hits:
|
| 324 |
+
print(f" wki_id search query={query!r} β {len(hits)} hit(s)")
|
| 325 |
+
break
|
| 326 |
+
except Exception as e:
|
| 327 |
+
print(f"Error getting wki_id for {doc_id} (query={query!r}): {e}")
|
| 328 |
+
return list(seen.keys())
|
| 329 |
+
|
| 330 |
+
candidates = list(dict.fromkeys(fetch_candidates()))
|
| 331 |
+
return candidates, version_str
|
| 332 |
+
|
| 333 |
+
def _authenticate_eol(self) -> requests.Session:
    """Create a requests.Session authenticated to the ETSI EOL portal.

    The method first GETs the portal's login-redirection page with a fresh
    session and then POSTs the stored credentials (``self.eol_user`` /
    ``self.eol_password``) as a JSON body to ``LoginEOL.ashx``.  Cookies set
    by either response stay on the returned session.

    Returns:
        The session used for the login exchange.  The caller owns it and is
        responsible for closing it.

    Raises:
        RuntimeError: if the login endpoint answers with the literal body
            ``Failed``.
        Exception: any error raised by the underlying HTTP calls is
            propagated unchanged.
    """
    session = requests.Session()
    try:
        session.headers.update({"User-Agent": self.headers["User-Agent"]})
        session.proxies.update(_get_proxies())

        login_redir_url = (
            "https://portal.etsi.org/LoginRedirection.aspx"
            "?domain=docbox.etsi.org&ReturnUrl=/"
        )
        # Initial visit to the redirection page; its URL is reused below as
        # the Referer of the login POST.
        # NOTE(review): TLS verification is disabled (verify=False) on every
        # request here -- confirm this is intentional.
        session.get(login_redir_url, verify=False, timeout=15)

        login_resp = session.post(
            "https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
            data=json.dumps(
                {"username": self.eol_user, "password": self.eol_password}
            ),
            headers={
                "Content-Type": "application/json; charset=UTF-8",
                "Referer": login_redir_url,
            },
            verify=False,
            allow_redirects=False,
            timeout=15,
        )
        if login_resp.text.strip() == "Failed":
            raise RuntimeError(
                "ETSI EOL login failed — check EOL_USER / EOL_PASSWORD"
            )
    except Exception:
        # Do not leak the connection pool when authentication does not succeed.
        session.close()
        raise
    return session
|
| 361 |
+
|
| 362 |
+
def search_document_docx(self, doc_id: str, version: str = None) -> str:
    """Download an ETSI spec as DOCX and return the local file path.

    Candidate ``wki_id`` values are obtained from
    ``self._get_wki_id_candidates``; each one is then tried (up to four in
    parallel) by walking the portal redirect chain until a page listing
    ``.docx`` links is reached.  The first candidate that yields a file wins.

    Args:
        doc_id: Specification identifier, e.g. ``"104 005"``.
        version: Requested version, forwarded to the candidate lookup.

    Returns:
        On success, the path of the file written under ``/tmp``.
        Otherwise a message string: ``"Specification <doc_id> not found"``
        when there are no candidates, or ``"Specification <doc_id>: all N
        wki_id candidate(s) rejected"`` when every candidate was rejected.

    Raises:
        Exception: whatever ``self._authenticate_eol`` raises; also, when no
            candidate succeeded and at least one of them failed with an
            exception, the last such exception is re-raised.
    """
    candidates, version_str = self._get_wki_id_candidates(doc_id, version)
    if not candidates:
        return f"Specification {doc_id} not found"

    # "1.2.1" -> "010201"; an unusable version string yields no tag at all.
    try:
        version_tag = "".join(f"{int(p):02d}" for p in version_str.split("."))
    except (ValueError, AttributeError):
        version_tag = ""

    auth_session = self._authenticate_eol()

    def try_wki(wki_id):
        """Try one wki_id; return the saved file path, or None if rejected."""
        print(f"Trying wki_id={wki_id} for {doc_id}")
        # Each worker gets its own session (closed on exit) seeded with the
        # cookies of the authenticated one.
        with requests.Session() as session:
            session.headers.update({"User-Agent": self.headers["User-Agent"]})
            session.proxies.update(_get_proxies())
            session.cookies.update(auth_session.cookies)

            # Step 1 (original author's note): LogonRedirection.asp registers
            # the download intent server-side, generates a one-time
            # profile_id, then 302s to NTaccount.asp.  With
            # allow_redirects=True the final response IS the NTaccount.asp
            # page.  Do NOT call NTaccount.asp again: a second call
            # invalidates the first profile_id and the server rejects the new
            # one with "Your identifier is wrong".
            r_logon = session.get(
                f"https://portal.etsi.org/webapp/workprogram/LogonRedirection.asp"
                f"?wki_id={wki_id}",
                verify=False,
                timeout=15,
                allow_redirects=True,
            )
            meta_match = re.search(r"URL=([^\"'\s>]+)", r_logon.text)
            if not meta_match:
                print(
                    f" wki_id={wki_id}: authentication failed "
                    f"(no URL= in NTaccount.asp), trying next"
                )
                return None

            meta_url = urljoin(r_logon.url, meta_match.group(1))

            r2 = session.get(
                meta_url, allow_redirects=False, verify=False, timeout=15
            )
            if r2.status_code != 302:
                print(
                    f" wki_id={wki_id}: unexpected status {r2.status_code}, trying next"
                )
                return None

            location2 = r2.headers.get("Location", "")
            if "processerror" in location2.lower():
                print(f" wki_id={wki_id}: portal rejected ({location2}), trying next")
                return None

            copy_url = urljoin("https://portal.etsi.org/", location2)
            r3 = session.get(
                copy_url, allow_redirects=False, verify=False, timeout=15
            )

            # One more hop is followed manually when the portal redirects again.
            if r3.status_code == 302:
                location3 = r3.headers.get("Location", "")
                final_url = urljoin("https://portal.etsi.org/webapp/ewp/", location3)
                r4 = session.get(final_url, verify=False, timeout=15)
            else:
                r4 = r3

            docx_urls = re.findall(
                r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE
            )
            if not docx_urls:
                print(f" wki_id={wki_id}: DOCX not found in page, trying next")
                return None

            # Keep only links whose file name contains the spec number.
            spec_num = doc_id.split("-")[0].replace(" ", "")
            matching_urls = [u for u in docx_urls if spec_num in u.split("/")[-1]]
            if not matching_urls:
                print(
                    f" wki_id={wki_id}: DOCX spec mismatch "
                    f"(expected {spec_num}), trying next"
                )
                return None

            if version_tag:
                version_candidates = [
                    version_tag,                    # "010201"
                    f"v{version_tag}",              # "v010201"
                    version_str.replace(".", ""),   # "121"
                    version_str,                    # "1.2.1"
                    version_str.replace(".", "_"),  # "1_2_1"
                ]
                versioned_urls = []
                # First spelling of the version found in a file name wins.
                for tag in version_candidates:
                    versioned_urls = [
                        u for u in matching_urls if tag in u.split("/")[-1]
                    ]
                    if versioned_urls:
                        break

                if not versioned_urls:
                    found_names = [u.split("/")[-1] for u in matching_urls]
                    print(
                        f" wki_id={wki_id}: version tag not in filenames {found_names}, "
                        f"using first spec-matching DOCX as fallback"
                    )
                    versioned_urls = matching_urls

                matching_urls = versioned_urls

            # Resolve relative hrefs against the page they were found on;
            # absolute URLs pass through urljoin unchanged.
            docx_url = urljoin(r4.url, matching_urls[0])
            dl = session.get(
                docx_url,
                headers={"Referer": r4.url},
                verify=False,
                timeout=60,
            )
            # Do not store an HTTP error page under a .docx name.
            if not dl.ok:
                print(
                    f" wki_id={wki_id}: download failed "
                    f"(HTTP {dl.status_code}), trying next"
                )
                return None

            filename = docx_url.split("/")[-1]
            tmp_path = f"/tmp/{filename}"
            with open(tmp_path, "wb") as f:
                f.write(dl.content)

            print(f" wki_id={wki_id}: success")
            return tmp_path

    last_error = None
    executor = ThreadPoolExecutor(max_workers=min(len(candidates), 4))
    try:
        futures = {executor.submit(try_wki, wki_id): wki_id for wki_id in candidates}
        for future in as_completed(futures):
            try:
                result = future.result()
            except Exception as e:
                # One failing candidate must not abort the others.
                last_error = e
                print(f" wki_id={futures[future]}: error ({e}), trying next")
                continue
            if result is not None:
                # Stop candidates that have not started yet.
                for pending in futures:
                    pending.cancel()
                return result
    finally:
        executor.shutdown(wait=False)
        auth_session.close()

    # Nothing succeeded: surface a real failure rather than calling it a rejection.
    if last_error is not None:
        raise last_error
    return f"Specification {doc_id}: all {len(candidates)} wki_id candidate(s) rejected"
|
scripts/fetch_crs.py
CHANGED
|
@@ -7,9 +7,9 @@ Usage:
|
|
| 7 |
|
| 8 |
Steps:
|
| 9 |
1. Parse Excel, filter Accepted CRs by person name
|
| 10 |
-
2. Download CR DOCXs via
|
| 11 |
3. Parse CR cover pages to extract target TS spec + version
|
| 12 |
-
4. Download TS DOCXs via
|
| 13 |
5. Print summary report
|
| 14 |
"""
|
| 15 |
|
|
@@ -17,15 +17,10 @@ import argparse
|
|
| 17 |
import os
|
| 18 |
import re
|
| 19 |
import sys
|
| 20 |
-
import time
|
| 21 |
import zipfile
|
| 22 |
from pathlib import Path
|
| 23 |
|
| 24 |
-
import
|
| 25 |
-
|
| 26 |
-
BASE_URL = "https://organizedprogrammers-docfinder.hf.space"
|
| 27 |
-
#_proxy = os.environ.get("http_proxy") or None
|
| 28 |
-
#PROXIES = {"http": _proxy, "https": os.environ.get("https_proxy") or None}
|
| 29 |
|
| 30 |
|
| 31 |
# ---------------------------------------------------------------------------
|
|
@@ -178,7 +173,7 @@ def _parse_xlsx(path: Path, person_name: str):
|
|
| 178 |
# Step 2 — Download CR DOCXs
|
| 179 |
# ---------------------------------------------------------------------------
|
| 180 |
|
| 181 |
-
def download_cr(uid: str, cr_dir: Path):
|
| 182 |
"""
|
| 183 |
Download CR DOCX for the given UID.
|
| 184 |
|
|
@@ -193,19 +188,14 @@ def download_cr(uid: str, cr_dir: Path):
|
|
| 193 |
return dest, "already existed"
|
| 194 |
|
| 195 |
try:
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
if not resp.ok:
|
| 206 |
-
return None, f"HTTP {resp.status_code}"
|
| 207 |
-
|
| 208 |
-
content = resp.content
|
| 209 |
if not content:
|
| 210 |
return None, "empty response"
|
| 211 |
|
|
@@ -296,22 +286,11 @@ def parse_cr_cover(docx_path: Path):
|
|
| 296 |
# Step 4 β Download TS DOCXs
|
| 297 |
# ---------------------------------------------------------------------------
|
| 298 |
|
| 299 |
-
def _is_html(resp: requests.Response) -> bool:
|
| 300 |
-
"""Return True if the response body is an HTML page (e.g. HF Space loading page)."""
|
| 301 |
-
ct = resp.headers.get("content-type", "")
|
| 302 |
-
if "text/html" in ct:
|
| 303 |
-
return True
|
| 304 |
-
return resp.content[:5].lower() in (b"<!doc", b"<html")
|
| 305 |
-
|
| 306 |
-
|
| 307 |
def download_ts(spec_number: str, version: str, ts_dir: Path,
|
| 308 |
-
|
| 309 |
"""
|
| 310 |
Download TS DOCX for spec_number (e.g. "102 221") and version (e.g. "18.3.0").
|
| 311 |
|
| 312 |
-
Retries up to max_retries times when the HF Space returns an HTML loading page
|
| 313 |
-
instead of the DOCX binary (happens on cold-start / brief restarts).
|
| 314 |
-
|
| 315 |
Returns (filename, note) or (None, error_msg).
|
| 316 |
"""
|
| 317 |
spec_no_space = spec_number.replace(" ", "")
|
|
@@ -321,56 +300,40 @@ def download_ts(spec_number: str, version: str, ts_dir: Path,
|
|
| 321 |
if dest.exists():
|
| 322 |
return filename, "already existed"
|
| 323 |
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
json={"doc_id": spec_number, "version": version},
|
| 330 |
-
#proxies=PROXIES,
|
| 331 |
-
timeout=120,
|
| 332 |
-
)
|
| 333 |
-
except requests.RequestException as e:
|
| 334 |
-
return None, f"network error: {e}"
|
| 335 |
|
| 336 |
-
|
| 337 |
-
|
| 338 |
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
|
| 343 |
-
|
| 344 |
-
if _is_html(resp):
|
| 345 |
-
last_error = f"got HTML instead of DOCX (attempt {attempt}/{max_retries})"
|
| 346 |
-
if attempt < max_retries:
|
| 347 |
-
print(f"\n [retry in {retry_delay}s β HF Space loadingβ¦]", flush=True)
|
| 348 |
-
time.sleep(retry_delay)
|
| 349 |
-
continue
|
| 350 |
-
return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r}) after {max_retries} attempts"
|
| 351 |
|
| 352 |
-
|
| 353 |
-
dest.
|
|
|
|
| 354 |
|
| 355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
dest.unlink()
|
| 357 |
-
return None,
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
first_para = _doc.paragraphs[0].text if _doc.paragraphs else ''
|
| 364 |
-
if spec_no_space not in first_para.replace(' ', ''):
|
| 365 |
-
dest.unlink()
|
| 366 |
-
return None, f"wrong TS returned by API: got {first_para[:80]!r} (expected spec {spec_no_space})"
|
| 367 |
-
except Exception:
|
| 368 |
-
pass # Trust the ZIP check above
|
| 369 |
-
|
| 370 |
-
note = "downloaded" if attempt == 1 else f"downloaded (after {attempt} attempts)"
|
| 371 |
-
return filename, note
|
| 372 |
|
| 373 |
-
return
|
| 374 |
|
| 375 |
|
| 376 |
# ---------------------------------------------------------------------------
|
|
@@ -394,6 +357,11 @@ def main():
|
|
| 394 |
person_name = args.person_name
|
| 395 |
output_dir = Path(wsl_path(args.output_dir)).expanduser()
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
cr_dir = output_dir / "CRs"
|
| 398 |
ts_dir = output_dir / "TS"
|
| 399 |
cr_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -419,8 +387,8 @@ def main():
|
|
| 419 |
cr_results = [] # list of (uid, docx_path_or_None, note)
|
| 420 |
|
| 421 |
for uid, title in cr_list:
|
| 422 |
-
print(f" [{uid}] ", end="", flush=True)
|
| 423 |
-
docx_path, note = download_cr(uid, cr_dir)
|
| 424 |
cr_results.append((uid, docx_path, note))
|
| 425 |
if docx_path:
|
| 426 |
print(f"OK ({note}) β {docx_path.name}")
|
|
@@ -452,7 +420,7 @@ def main():
|
|
| 452 |
|
| 453 |
for (spec_number, version), uids in ts_targets.items():
|
| 454 |
print(f" [TS {spec_number} v{version}] ", end="", flush=True)
|
| 455 |
-
filename, note = download_ts(spec_number, version, ts_dir)
|
| 456 |
ts_results.append((spec_number, version, filename, note))
|
| 457 |
if filename:
|
| 458 |
print(f"OK ({note}) β {filename}")
|
|
|
|
| 7 |
|
| 8 |
Steps:
|
| 9 |
1. Parse Excel, filter Accepted CRs by person name
|
| 10 |
+
2. Download CR DOCXs via ETSI docbox
|
| 11 |
3. Parse CR cover pages to extract target TS spec + version
|
| 12 |
+
4. Download TS DOCXs via ETSI portal WKI chain
|
| 13 |
5. Print summary report
|
| 14 |
"""
|
| 15 |
|
|
|
|
| 17 |
import os
|
| 18 |
import re
|
| 19 |
import sys
|
|
|
|
| 20 |
import zipfile
|
| 21 |
from pathlib import Path
|
| 22 |
|
| 23 |
+
from etsi_client import ETSIDocFinder, ETSISpecFinder
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
# ---------------------------------------------------------------------------
|
|
|
|
| 173 |
# Step 2 — Download CR DOCXs
|
| 174 |
# ---------------------------------------------------------------------------
|
| 175 |
|
| 176 |
+
def download_cr(uid: str, cr_dir: Path, eol_user: str, eol_password: str):
|
| 177 |
"""
|
| 178 |
Download CR DOCX for the given UID.
|
| 179 |
|
|
|
|
| 188 |
return dest, "already existed"
|
| 189 |
|
| 190 |
try:
|
| 191 |
+
finder = ETSIDocFinder(eol_user, eol_password)
|
| 192 |
+
url = finder.search_document(uid)
|
| 193 |
+
if isinstance(url, str) and "not found" in url.lower():
|
| 194 |
+
return None, f"document not found: {uid}"
|
| 195 |
+
content = finder.download_document(url)
|
| 196 |
+
except Exception as e:
|
| 197 |
+
return None, f"download error: {e}"
|
| 198 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
if not content:
|
| 200 |
return None, "empty response"
|
| 201 |
|
|
|
|
| 286 |
# Step 4 — Download TS DOCXs
|
| 287 |
# ---------------------------------------------------------------------------
|
| 288 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
def download_ts(spec_number: str, version: str, ts_dir: Path,
|
| 290 |
+
eol_user: str = "", eol_password: str = ""):
|
| 291 |
"""
|
| 292 |
Download TS DOCX for spec_number (e.g. "102 221") and version (e.g. "18.3.0").
|
| 293 |
|
|
|
|
|
|
|
|
|
|
| 294 |
Returns (filename, note) or (None, error_msg).
|
| 295 |
"""
|
| 296 |
spec_no_space = spec_number.replace(" ", "")
|
|
|
|
| 300 |
if dest.exists():
|
| 301 |
return filename, "already existed"
|
| 302 |
|
| 303 |
+
try:
|
| 304 |
+
finder = ETSISpecFinder(eol_user, eol_password)
|
| 305 |
+
tmp_path = finder.search_document_docx(spec_number, version)
|
| 306 |
+
except Exception as e:
|
| 307 |
+
return None, f"download error: {e}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
|
| 309 |
+
if "not found" in str(tmp_path).lower() or "rejected" in str(tmp_path).lower():
|
| 310 |
+
return None, tmp_path
|
| 311 |
|
| 312 |
+
content = Path(tmp_path).read_bytes()
|
| 313 |
+
if not content:
|
| 314 |
+
return None, "empty response"
|
| 315 |
|
| 316 |
+
dest.write_bytes(content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
|
| 318 |
+
if content[:2] != b"PK":
|
| 319 |
+
dest.unlink()
|
| 320 |
+
return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r})"
|
| 321 |
|
| 322 |
+
# Verify the TS contains the expected spec number in its first paragraph
|
| 323 |
+
try:
|
| 324 |
+
import docx as _docx
|
| 325 |
+
_doc = _docx.Document(dest)
|
| 326 |
+
first_para = _doc.paragraphs[0].text if _doc.paragraphs else ""
|
| 327 |
+
if spec_no_space not in first_para.replace(" ", ""):
|
| 328 |
dest.unlink()
|
| 329 |
+
return None, (
|
| 330 |
+
f"wrong TS returned: got {first_para[:80]!r} "
|
| 331 |
+
f"(expected spec {spec_no_space})"
|
| 332 |
+
)
|
| 333 |
+
except Exception:
|
| 334 |
+
pass # Trust the ZIP check above
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
|
| 336 |
+
return filename, "downloaded"
|
| 337 |
|
| 338 |
|
| 339 |
# ---------------------------------------------------------------------------
|
|
|
|
| 357 |
person_name = args.person_name
|
| 358 |
output_dir = Path(wsl_path(args.output_dir)).expanduser()
|
| 359 |
|
| 360 |
+
eol_user = os.environ.get("EOL_USER", "")
|
| 361 |
+
eol_password = os.environ.get("EOL_PASSWORD", "")
|
| 362 |
+
if not eol_user or not eol_password:
|
| 363 |
+
sys.exit("ERROR: EOL_USER and EOL_PASSWORD must be set")
|
| 364 |
+
|
| 365 |
cr_dir = output_dir / "CRs"
|
| 366 |
ts_dir = output_dir / "TS"
|
| 367 |
cr_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 387 |
cr_results = [] # list of (uid, docx_path_or_None, note)
|
| 388 |
|
| 389 |
for uid, title in cr_list:
|
| 390 |
+
#print(f" [{uid}] ", end="", flush=True)
|
| 391 |
+
docx_path, note = download_cr(uid, cr_dir, eol_user, eol_password)
|
| 392 |
cr_results.append((uid, docx_path, note))
|
| 393 |
if docx_path:
|
| 394 |
print(f"OK ({note}) β {docx_path.name}")
|
|
|
|
| 420 |
|
| 421 |
for (spec_number, version), uids in ts_targets.items():
|
| 422 |
print(f" [TS {spec_number} v{version}] ", end="", flush=True)
|
| 423 |
+
filename, note = download_ts(spec_number, version, ts_dir, eol_user, eol_password)
|
| 424 |
ts_results.append((spec_number, version, filename, note))
|
| 425 |
if filename:
|
| 426 |
print(f"OK ({note}) β {filename}")
|
scripts/finalize_ts.py
CHANGED
|
@@ -178,18 +178,32 @@ def _detect_meeting_separator(tbl):
|
|
| 178 |
|
| 179 |
# ── TS table locators ─────────────────────────────────────────────────────────
|
| 180 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
def find_change_history_table(ts_doc):
|
| 182 |
-
"""
|
| 183 |
-
tables
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
)
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
|
| 195 |
def find_history_table(ts_doc):
|
|
|
|
| 178 |
|
| 179 |
# ── TS table locators ─────────────────────────────────────────────────────────
|
| 180 |
|
| 181 |
+
class NoChangeHistoryTable(Exception):
    """Raised when the document contains no recognisable Change History table.

    A dedicated exception type lets callers tell this structural absence
    apart from unexpected errors.
    """
|
| 184 |
+
|
| 185 |
+
|
| 186 |
def find_change_history_table(ts_doc):
    """
    Scan all tables backward from the end looking for a Change History table.

    A match requires both:
    - 8 or 9 columns in the last row (standard ETSI Change History layout)
    - At least one of the keywords 'cr', 'date', 'meeting', 'rev' in the header row

    Tables without any rows are skipped.

    Returns the first matching table found when scanning from the end.

    Raises NoChangeHistoryTable (not ValueError) when none is found so callers
    can distinguish a structural absence from an unexpected error.
    """
    for tbl in reversed(ts_doc.tables):
        rows = tbl.rows
        # Guard first: indexing rows[-1] on an empty table would raise IndexError.
        if not rows:
            continue
        if len(rows[-1].cells) not in (8, 9):
            continue
        header_text = ' '.join(c.text.strip() for c in rows[0].cells).lower()
        # NOTE: plain substring test, so 'cr' also matches words such as
        # 'description'; kept as-is to preserve which tables are selected.
        if any(kw in header_text for kw in ('cr', 'date', 'meeting', 'rev')):
            return tbl
    raise NoChangeHistoryTable(
        'No Change History table found in this document '
        '(no table with 8 or 9 columns and CR/Date/Meeting/Rev headers)'
    )
|
| 207 |
|
| 208 |
|
| 209 |
def find_history_table(ts_doc):
|
scripts/orchestrate_cr.py
CHANGED
|
@@ -22,8 +22,11 @@ import argparse
|
|
| 22 |
import contextlib
|
| 23 |
import datetime
|
| 24 |
import io
|
|
|
|
|
|
|
| 25 |
import re
|
| 26 |
import sys
|
|
|
|
| 27 |
from pathlib import Path
|
| 28 |
|
| 29 |
import docx as docx_lib
|
|
@@ -44,6 +47,7 @@ from finalize_ts import (
|
|
| 44 |
update_change_history_table,
|
| 45 |
update_history_table,
|
| 46 |
update_title_para,
|
|
|
|
| 47 |
)
|
| 48 |
from docx_helpers import RevCounter, AUTHOR as DEFAULT_AUTHOR, DATE as DEFAULT_DATE
|
| 49 |
|
|
@@ -78,7 +82,12 @@ def main():
|
|
| 78 |
description='Fully automated CR application pipeline.',
|
| 79 |
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 80 |
)
|
| 81 |
-
ap.add_argument(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
ap.add_argument(
|
| 83 |
'person_name',
|
| 84 |
nargs='?',
|
|
@@ -95,9 +104,21 @@ def main():
|
|
| 95 |
default=DEFAULT_AUTHOR,
|
| 96 |
help=f'Tracked change author name (default: "{DEFAULT_AUTHOR}")',
|
| 97 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
args = ap.parse_args()
|
| 99 |
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
output_dir = Path(wsl_path(args.output_dir)).expanduser()
|
| 102 |
cr_dir = output_dir / 'CRs'
|
| 103 |
ts_dir = output_dir / 'TS' # spec subfolders created per-TS below
|
|
@@ -107,6 +128,212 @@ def main():
|
|
| 107 |
author = args.author
|
| 108 |
tc_date = DEFAULT_DATE
|
| 109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
# ββ Step 1: Parse Excel βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 111 |
_section('Step 1 β Parsing Excel')
|
| 112 |
print(f'Excel: {excel_path}')
|
|
@@ -117,9 +344,7 @@ def main():
|
|
| 117 |
except Exception as e:
|
| 118 |
sys.exit(f'ERROR parsing Excel: {e}')
|
| 119 |
|
| 120 |
-
print(f'Found {len(cr_list)} Accepted CR(s)
|
| 121 |
-
for uid, title in cr_list:
|
| 122 |
-
print(f' {uid}: {title[:80]}')
|
| 123 |
|
| 124 |
if not cr_list:
|
| 125 |
print('Nothing to process.')
|
|
@@ -130,13 +355,16 @@ def main():
|
|
| 130 |
cr_paths = {} # uid -> Path
|
| 131 |
|
| 132 |
for uid, _ in cr_list:
|
| 133 |
-
|
| 134 |
-
docx_path, note = download_cr(uid, cr_dir)
|
| 135 |
if docx_path:
|
| 136 |
cr_paths[uid] = docx_path
|
| 137 |
-
print(f'OK ({note}) β {docx_path.name}')
|
| 138 |
-
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
# ββ Step 3: Parse cover pages β group by target TS βββββββββββββββββββββββ
|
| 142 |
_section('Step 3 β Parsing CR cover pages')
|
|
@@ -169,13 +397,41 @@ def main():
|
|
| 169 |
spec_dirs[(spec_number, version)] = spec_dir
|
| 170 |
|
| 171 |
print(f' [TS {spec_number} v{version}] ', end='', flush=True)
|
| 172 |
-
filename, note =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
if filename:
|
| 174 |
ts_paths[(spec_number, version)] = spec_dir / filename
|
| 175 |
print(f'OK ({note}) β {spec_compact}/{filename}')
|
| 176 |
else:
|
| 177 |
print(f'FAILED β {note}')
|
| 178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
# ββ Steps 5 & 6: Apply CRs + Finalise each TS ββββββββββββββββββββββββββββ
|
| 180 |
_section('Steps 5 & 6 β Applying CRs and Finalising Metadata')
|
| 181 |
report = [] # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)
|
|
@@ -258,6 +514,10 @@ def main():
|
|
| 258 |
|
| 259 |
for line in log_lines:
|
| 260 |
print(f' {line}')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
print(f' -> Applied: {n_ok} Skipped: {n_skip}')
|
| 262 |
|
| 263 |
# 6. Finalise metadata (Change History, History, title paragraph)
|
|
@@ -285,6 +545,8 @@ def main():
|
|
| 285 |
ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
|
| 286 |
)
|
| 287 |
print(f' [Change History] {uid}: {ch_cells}')
|
|
|
|
|
|
|
| 288 |
except Exception as e:
|
| 289 |
errors.append(f'[{uid}] Change History ERROR: {e}')
|
| 290 |
print(f' [Change History] {uid}: ERROR β {e}')
|
|
|
|
| 22 |
import contextlib
|
| 23 |
import datetime
|
| 24 |
import io
|
| 25 |
+
import json
|
| 26 |
+
import os
|
| 27 |
import re
|
| 28 |
import sys
|
| 29 |
+
import time
|
| 30 |
from pathlib import Path
|
| 31 |
|
| 32 |
import docx as docx_lib
|
|
|
|
| 47 |
update_change_history_table,
|
| 48 |
update_history_table,
|
| 49 |
update_title_para,
|
| 50 |
+
NoChangeHistoryTable,
|
| 51 |
)
|
| 52 |
from docx_helpers import RevCounter, AUTHOR as DEFAULT_AUTHOR, DATE as DEFAULT_DATE
|
| 53 |
|
|
|
|
| 82 |
description='Fully automated CR application pipeline.',
|
| 83 |
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 84 |
)
|
| 85 |
+
ap.add_argument(
|
| 86 |
+
'excel_path',
|
| 87 |
+
nargs='?',
|
| 88 |
+
default=None,
|
| 89 |
+
help='Path to .xls or .xlsx contribution list (not required in --retry-mode)',
|
| 90 |
+
)
|
| 91 |
ap.add_argument(
|
| 92 |
'person_name',
|
| 93 |
nargs='?',
|
|
|
|
| 104 |
default=DEFAULT_AUTHOR,
|
| 105 |
help=f'Tracked change author name (default: "{DEFAULT_AUTHOR}")',
|
| 106 |
)
|
| 107 |
+
ap.add_argument(
|
| 108 |
+
'--retry-mode',
|
| 109 |
+
action='store_true',
|
| 110 |
+
help='Skip steps 1-4; apply CRs to TSs listed in failed_ts.json that now have their DOCX on disk',
|
| 111 |
+
)
|
| 112 |
args = ap.parse_args()
|
| 113 |
|
| 114 |
+
if not args.retry_mode and not args.excel_path:
|
| 115 |
+
ap.error('excel_path is required when not in --retry-mode')
|
| 116 |
+
|
| 117 |
+
eol_user = os.environ.get("EOL_USER", "")
|
| 118 |
+
eol_password = os.environ.get("EOL_PASSWORD", "")
|
| 119 |
+
if not eol_user or not eol_password:
|
| 120 |
+
sys.exit("ERROR: EOL_USER and EOL_PASSWORD must be set")
|
| 121 |
+
|
| 122 |
output_dir = Path(wsl_path(args.output_dir)).expanduser()
|
| 123 |
cr_dir = output_dir / 'CRs'
|
| 124 |
ts_dir = output_dir / 'TS' # spec subfolders created per-TS below
|
|
|
|
| 128 |
author = args.author
|
| 129 |
tc_date = DEFAULT_DATE
|
| 130 |
|
| 131 |
+
# ββ Retry mode β skip steps 1-4, reconstruct state from failed_ts.json βββ
|
| 132 |
+
if args.retry_mode:
|
| 133 |
+
failed_ts_path = output_dir / 'failed_ts.json'
|
| 134 |
+
if not failed_ts_path.exists():
|
| 135 |
+
sys.exit('ERROR: failed_ts.json not found in output directory')
|
| 136 |
+
failed_ts_entries = json.loads(failed_ts_path.read_text())
|
| 137 |
+
if not failed_ts_entries:
|
| 138 |
+
print('No failed TSs in failed_ts.json β nothing to retry.')
|
| 139 |
+
return
|
| 140 |
+
|
| 141 |
+
_section('Retry mode β Steps 5 & 6 only')
|
| 142 |
+
print(f'Retrying {len(failed_ts_entries)} TS(s) from failed_ts.json')
|
| 143 |
+
|
| 144 |
+
ts_groups = {}
|
| 145 |
+
spec_dirs = {}
|
| 146 |
+
ts_paths = {}
|
| 147 |
+
cr_paths = {}
|
| 148 |
+
|
| 149 |
+
for entry in failed_ts_entries:
|
| 150 |
+
spec_number = entry['spec_number']
|
| 151 |
+
version = entry['version']
|
| 152 |
+
key = (spec_number, version)
|
| 153 |
+
ts_groups[key] = entry['cr_uids']
|
| 154 |
+
spec_dir = Path(entry['spec_dir'])
|
| 155 |
+
spec_dirs[key] = spec_dir
|
| 156 |
+
expected = spec_dir / entry['expected_filename']
|
| 157 |
+
if expected.exists():
|
| 158 |
+
ts_paths[key] = expected
|
| 159 |
+
print(f' [TS {spec_number} v{version}] DOCX found β will apply')
|
| 160 |
+
else:
|
| 161 |
+
print(f' [TS {spec_number} v{version}] DOCX missing β skipping')
|
| 162 |
+
# Reconstruct cr_paths for each UID
|
| 163 |
+
cr_entry_dir = Path(entry['cr_dir'])
|
| 164 |
+
for uid in entry['cr_uids']:
|
| 165 |
+
extracted = cr_entry_dir / f'{uid}_extracted.docx'
|
| 166 |
+
plain = cr_entry_dir / f'{uid}.docx'
|
| 167 |
+
if extracted.exists():
|
| 168 |
+
cr_paths[uid] = extracted
|
| 169 |
+
elif plain.exists():
|
| 170 |
+
cr_paths[uid] = plain
|
| 171 |
+
|
| 172 |
+
# ββ Steps 5 & 6 (retry mode falls through to shared loop below) ββββββ
|
| 173 |
+
report = []
|
| 174 |
+
|
| 175 |
+
for (spec_number, version), uids in ts_groups.items():
|
| 176 |
+
ts_key = f'TS {spec_number} v{version}'
|
| 177 |
+
spec_compact = spec_number.replace(' ', '')
|
| 178 |
+
spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
|
| 179 |
+
spec_dir.mkdir(parents=True, exist_ok=True)
|
| 180 |
+
|
| 181 |
+
new_v = derive_new_version(version)
|
| 182 |
+
stem = f'ts_{spec_compact}_v{new_v}_was_v{version}'
|
| 183 |
+
ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
|
| 184 |
+
ts_final = spec_dir / f'{stem}.docx'
|
| 185 |
+
log_path = spec_dir / f'{stem}.log'
|
| 186 |
+
errors = []
|
| 187 |
+
|
| 188 |
+
print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')
|
| 189 |
+
|
| 190 |
+
if (spec_number, version) not in ts_paths:
|
| 191 |
+
msg = 'TS DOCX not on disk β skipping'
|
| 192 |
+
print(f' SKIP: {msg}')
|
| 193 |
+
report.append((ts_key, 0, 0, len(uids), None, log_path, [msg]))
|
| 194 |
+
continue
|
| 195 |
+
|
| 196 |
+
ts_in = ts_paths[(spec_number, version)]
|
| 197 |
+
|
| 198 |
+
log_buf = io.StringIO()
|
| 199 |
+
tee = _TeeWriter(sys.stdout, log_buf)
|
| 200 |
+
|
| 201 |
+
with contextlib.redirect_stdout(tee):
|
| 202 |
+
log_header = (
|
| 203 |
+
f'Pipeline Log (retry)\n'
|
| 204 |
+
f'TS: {spec_number} v{version} -> v{new_v}\n'
|
| 205 |
+
f'CRs: {", ".join(uids)}\n'
|
| 206 |
+
f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
|
| 207 |
+
f'{"=" * 60}\n'
|
| 208 |
+
)
|
| 209 |
+
print(log_header, end='')
|
| 210 |
+
|
| 211 |
+
combined_manifest = []
|
| 212 |
+
participating_uids = []
|
| 213 |
+
|
| 214 |
+
for uid in uids:
|
| 215 |
+
if uid not in cr_paths:
|
| 216 |
+
errors.append(f'[{uid}] CR DOCX not found β skipped')
|
| 217 |
+
continue
|
| 218 |
+
print(f' Parsing {uid}... ', end='', flush=True)
|
| 219 |
+
try:
|
| 220 |
+
changes = parse_cr(cr_paths[uid])
|
| 221 |
+
combined_manifest.extend(changes)
|
| 222 |
+
participating_uids.append(uid)
|
| 223 |
+
print(f'{len(changes)} change(s)')
|
| 224 |
+
except Exception as e:
|
| 225 |
+
errors.append(f'[{uid}] parse ERROR: {e}')
|
| 226 |
+
print(f'ERROR: {e}')
|
| 227 |
+
|
| 228 |
+
if not combined_manifest:
|
| 229 |
+
print(' No changes parsed β skipping apply step.')
|
| 230 |
+
report.append((ts_key, 0, 0, len(uids), None, log_path,
|
| 231 |
+
errors + ['No changes parsed']))
|
| 232 |
+
log_path.write_text(log_buf.getvalue(), encoding='utf-8')
|
| 233 |
+
continue
|
| 234 |
+
|
| 235 |
+
print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
|
| 236 |
+
try:
|
| 237 |
+
n_ok, n_skip, log_lines = apply_manifest(
|
| 238 |
+
ts_in, combined_manifest, ts_applied, author=author, date=tc_date
|
| 239 |
+
)
|
| 240 |
+
except Exception as e:
|
| 241 |
+
errors.append(f'apply_manifest ERROR: {e}')
|
| 242 |
+
print(f' ERROR: {e}')
|
| 243 |
+
report.append((ts_key, 0, 0, len(uids), None, log_path, errors))
|
| 244 |
+
log_path.write_text(log_buf.getvalue(), encoding='utf-8')
|
| 245 |
+
continue
|
| 246 |
+
|
| 247 |
+
for line in log_lines:
|
| 248 |
+
print(f' {line}')
|
| 249 |
+
# Bubble every un-applied change into the warnings list
|
| 250 |
+
for line in log_lines:
|
| 251 |
+
if line.strip().startswith('ERROR'):
|
| 252 |
+
errors.append(line.strip())
|
| 253 |
+
print(f' -> Applied: {n_ok} Skipped: {n_skip}')
|
| 254 |
+
|
| 255 |
+
print(' Finalising metadata...')
|
| 256 |
+
try:
|
| 257 |
+
ts_doc = docx_lib.Document(str(ts_applied))
|
| 258 |
+
rev = RevCounter(ts_doc)
|
| 259 |
+
|
| 260 |
+
pub_ym, pub_month_year = compute_pub_date()
|
| 261 |
+
old_v = version
|
| 262 |
+
|
| 263 |
+
title_text = ts_doc.paragraphs[0].text
|
| 264 |
+
date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
|
| 265 |
+
old_date_str = date_match.group(1) if date_match else ''
|
| 266 |
+
|
| 267 |
+
print(f' Version: {old_v} -> {new_v}')
|
| 268 |
+
print(f' Publication: {pub_month_year} ({pub_ym})')
|
| 269 |
+
|
| 270 |
+
for uid in participating_uids:
|
| 271 |
+
try:
|
| 272 |
+
meta = extract_cr_metadata(str(cr_paths[uid]))
|
| 273 |
+
ch_cells = update_change_history_table(
|
| 274 |
+
ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
|
| 275 |
+
)
|
| 276 |
+
print(f' [Change History] {uid}: {ch_cells}')
|
| 277 |
+
except NoChangeHistoryTable:
|
| 278 |
+
print(f' [Change History] {uid}: NOT PRESENT β this document has no Change History table (History table only)')
|
| 279 |
+
except Exception as e:
|
| 280 |
+
errors.append(f'[{uid}] Change History ERROR: {e}')
|
| 281 |
+
print(f' [Change History] {uid}: ERROR β {e}')
|
| 282 |
+
|
| 283 |
+
try:
|
| 284 |
+
h_cells = update_history_table(
|
| 285 |
+
ts_doc, new_v, pub_month_year, rev, author, tc_date
|
| 286 |
+
)
|
| 287 |
+
print(f' [History] {h_cells}')
|
| 288 |
+
except Exception as e:
|
| 289 |
+
errors.append(f'History table ERROR: {e}')
|
| 290 |
+
print(f' [History] ERROR β {e}')
|
| 291 |
+
|
| 292 |
+
if old_date_str:
|
| 293 |
+
try:
|
| 294 |
+
update_title_para(
|
| 295 |
+
ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
|
| 296 |
+
)
|
| 297 |
+
print(f' [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
|
| 298 |
+
except Exception as e:
|
| 299 |
+
errors.append(f'Title update ERROR: {e}')
|
| 300 |
+
print(f' [Title] ERROR β {e}')
|
| 301 |
+
else:
|
| 302 |
+
print(f' [Title] SKIP β no (YYYY-MM) pattern in: {title_text!r}')
|
| 303 |
+
|
| 304 |
+
ts_doc.save(str(ts_final))
|
| 305 |
+
print(f' Saved: {spec_compact}/{ts_final.name}')
|
| 306 |
+
print(f' Log: {spec_compact}/{log_path.name}')
|
| 307 |
+
report.append((ts_key, n_ok, n_skip, len(uids), ts_final, log_path, errors))
|
| 308 |
+
|
| 309 |
+
except Exception as e:
|
| 310 |
+
errors.append(f'Finalisation ERROR: {e}')
|
| 311 |
+
print(f' Finalisation ERROR: {e}')
|
| 312 |
+
report.append((ts_key, n_ok, n_skip, len(uids), ts_applied, log_path, errors))
|
| 313 |
+
|
| 314 |
+
log_path.write_text(log_buf.getvalue(), encoding='utf-8')
|
| 315 |
+
|
| 316 |
+
# Update failed_ts.json β remove entries that are now resolved
|
| 317 |
+
still_failed = [
|
| 318 |
+
e for e in failed_ts_entries
|
| 319 |
+
if not (Path(e['spec_dir']) / e['expected_filename']).exists()
|
| 320 |
+
]
|
| 321 |
+
failed_ts_path.write_text(json.dumps(still_failed, indent=2))
|
| 322 |
+
|
| 323 |
+
_section('Retry Summary')
|
| 324 |
+
n_success = sum(1 for r in report if r[4] is not None and not r[6])
|
| 325 |
+
n_partial = sum(1 for r in report if r[4] is not None and r[6])
|
| 326 |
+
n_failed = sum(1 for r in report if r[4] is None)
|
| 327 |
+
print(f'TSs processed: {n_success} fully OK, {n_partial} with warnings, {n_failed} skipped/failed')
|
| 328 |
+
for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
|
| 329 |
+
status_tag = 'OK' if out_path and not errors else ('WARN' if out_path else 'SKIP')
|
| 330 |
+
print(f' [{status_tag}] {ts_key}')
|
| 331 |
+
for err in errors:
|
| 332 |
+
print(f' ! {err}')
|
| 333 |
+
return
|
| 334 |
+
|
| 335 |
+
excel_path = wsl_path(args.excel_path)
|
| 336 |
+
|
| 337 |
# ββ Step 1: Parse Excel βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 338 |
_section('Step 1 β Parsing Excel')
|
| 339 |
print(f'Excel: {excel_path}')
|
|
|
|
| 344 |
except Exception as e:
|
| 345 |
sys.exit(f'ERROR parsing Excel: {e}')
|
| 346 |
|
| 347 |
+
print(f'Found {len(cr_list)} Accepted CR(s)')
|
|
|
|
|
|
|
| 348 |
|
| 349 |
if not cr_list:
|
| 350 |
print('Nothing to process.')
|
|
|
|
| 355 |
cr_paths = {} # uid -> Path
|
| 356 |
|
| 357 |
for uid, _ in cr_list:
|
| 358 |
+
docx_path, note = download_cr(uid, cr_dir, eol_user, eol_password)
|
|
|
|
| 359 |
if docx_path:
|
| 360 |
cr_paths[uid] = docx_path
|
| 361 |
+
print(f' [{uid}] OK ({note}) β {docx_path.name}')
|
| 362 |
+
|
| 363 |
+
n_cr_failed = len(cr_list) - len(cr_paths)
|
| 364 |
+
if n_cr_failed:
|
| 365 |
+
print(f' {len(cr_paths)}/{len(cr_list)} downloaded ({n_cr_failed} failed β details in warnings)')
|
| 366 |
+
else:
|
| 367 |
+
print(f' All {len(cr_list)} CR(s) downloaded successfully')
|
| 368 |
|
| 369 |
# ββ Step 3: Parse cover pages β group by target TS βββββββββββββββββββββββ
|
| 370 |
_section('Step 3 β Parsing CR cover pages')
|
|
|
|
| 397 |
spec_dirs[(spec_number, version)] = spec_dir
|
| 398 |
|
| 399 |
print(f' [TS {spec_number} v{version}] ', end='', flush=True)
|
| 400 |
+
filename, note = None, "not attempted"
|
| 401 |
+
for attempt in range(1, 4):
|
| 402 |
+
filename, note = download_ts(spec_number, version, spec_dir, eol_user, eol_password)
|
| 403 |
+
if filename:
|
| 404 |
+
break
|
| 405 |
+
if attempt < 3:
|
| 406 |
+
print(f'\n [attempt {attempt}/3 failed β retrying in 5s: {note}]', flush=True)
|
| 407 |
+
print(f' [TS {spec_number} v{version}] ', end='', flush=True)
|
| 408 |
+
time.sleep(5)
|
| 409 |
+
else:
|
| 410 |
+
print(f'\n [all 3 attempts failed]', flush=True)
|
| 411 |
if filename:
|
| 412 |
ts_paths[(spec_number, version)] = spec_dir / filename
|
| 413 |
print(f'OK ({note}) β {spec_compact}/{filename}')
|
| 414 |
else:
|
| 415 |
print(f'FAILED β {note}')
|
| 416 |
|
| 417 |
+
# Write failed_ts.json (even when empty so app.py can detect "no failures")
|
| 418 |
+
failed_ts_entries = [
|
| 419 |
+
{
|
| 420 |
+
"spec_number": spec_number,
|
| 421 |
+
"version": version,
|
| 422 |
+
"spec_compact": spec_number.replace(' ', ''),
|
| 423 |
+
"spec_dir": str(spec_dirs[(spec_number, version)]),
|
| 424 |
+
"expected_filename": f"ts_{spec_number.replace(' ', '')}_v{version}.docx",
|
| 425 |
+
"cr_uids": ts_groups[(spec_number, version)],
|
| 426 |
+
"cr_dir": str(cr_dir),
|
| 427 |
+
}
|
| 428 |
+
for (spec_number, version) in ts_groups
|
| 429 |
+
if (spec_number, version) not in ts_paths
|
| 430 |
+
]
|
| 431 |
+
(output_dir / "failed_ts.json").write_text(
|
| 432 |
+
json.dumps(failed_ts_entries, indent=2)
|
| 433 |
+
)
|
| 434 |
+
|
| 435 |
# ββ Steps 5 & 6: Apply CRs + Finalise each TS ββββββββββββββββββββββββββββ
|
| 436 |
_section('Steps 5 & 6 β Applying CRs and Finalising Metadata')
|
| 437 |
report = [] # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)
|
|
|
|
| 514 |
|
| 515 |
for line in log_lines:
|
| 516 |
print(f' {line}')
|
| 517 |
+
# Bubble every un-applied change into the warnings list
|
| 518 |
+
for line in log_lines:
|
| 519 |
+
if line.strip().startswith('ERROR'):
|
| 520 |
+
errors.append(line.strip())
|
| 521 |
print(f' -> Applied: {n_ok} Skipped: {n_skip}')
|
| 522 |
|
| 523 |
# 6. Finalise metadata (Change History, History, title paragraph)
|
|
|
|
| 545 |
ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
|
| 546 |
)
|
| 547 |
print(f' [Change History] {uid}: {ch_cells}')
|
| 548 |
+
except NoChangeHistoryTable:
|
| 549 |
+
print(f' [Change History] {uid}: NOT PRESENT β this document has no Change History table (History table only)')
|
| 550 |
except Exception as e:
|
| 551 |
errors.append(f'[{uid}] Change History ERROR: {e}')
|
| 552 |
print(f' [Change History] {uid}: ERROR β {e}')
|
scripts/ts_applicator.py
CHANGED
|
@@ -33,11 +33,22 @@ from docx_helpers import (
|
|
| 33 |
# ββ Text normalisation ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 34 |
|
| 35 |
def _norm(text):
|
| 36 |
-
"""Normalise
|
| 37 |
return (text
|
| 38 |
-
.replace('\xa0',
|
| 39 |
-
.replace('\
|
| 40 |
-
.replace('\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
.strip())
|
| 42 |
|
| 43 |
|
|
@@ -60,12 +71,53 @@ def _norm_ws(text):
|
|
| 60 |
Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
|
| 61 |
"""
|
| 62 |
base = (text
|
| 63 |
-
.replace('\xa0',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
.replace('\u2013', '-')
|
| 65 |
-
.replace('\u2014', '-')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
return re.sub(r'\s+', '', base)
|
| 67 |
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
# ββ Document search helpers βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 70 |
|
| 71 |
def _full_para_text(para):
|
|
@@ -189,11 +241,22 @@ def _find_row(tbl, anchor_text):
|
|
| 189 |
"""
|
| 190 |
Find first row in tbl where col-0 cell text contains anchor_text.
|
| 191 |
Returns (row_idx, confidence) or (-1, 0.0).
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
"""
|
| 194 |
-
norm_anchor
|
| 195 |
-
ws_anchor
|
|
|
|
| 196 |
best = (-1, 0.0)
|
|
|
|
| 197 |
for idx, row in enumerate(tbl.rows):
|
| 198 |
cell0 = row.cells[0].text if row.cells else ''
|
| 199 |
if anchor_text in cell0:
|
|
@@ -202,7 +265,42 @@ def _find_row(tbl, anchor_text):
|
|
| 202 |
best = (idx, 0.9)
|
| 203 |
elif ws_anchor and ws_anchor in _norm_ws(cell0) and best[1] < 0.8:
|
| 204 |
best = (idx, 0.8)
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
|
| 208 |
# ββ vMerge row insertion ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -329,7 +427,7 @@ def _apply_section_replace(doc, change, rev, author, date, log):
|
|
| 329 |
break
|
| 330 |
|
| 331 |
if ts_para_elem is None:
|
| 332 |
-
log.append(f'
|
| 333 |
return False
|
| 334 |
|
| 335 |
ts_body = ts_para_elem.getparent()
|
|
@@ -395,7 +493,7 @@ def _apply_text_replace(doc, change, rev, author, date, log):
|
|
| 395 |
if loc['kind'] == 'table_cell':
|
| 396 |
tbl, t_conf = _find_table(doc, loc['table_header'])
|
| 397 |
if tbl is None:
|
| 398 |
-
log.append(f"
|
| 399 |
return False
|
| 400 |
col_idx = loc['col_idx']
|
| 401 |
row_anchor = loc['row_anchor']
|
|
@@ -403,11 +501,22 @@ def _apply_text_replace(doc, change, rev, author, date, log):
|
|
| 403 |
if row_anchor:
|
| 404 |
row_idx, r_conf = _find_row(tbl, row_anchor)
|
| 405 |
if row_idx < 0:
|
| 406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
return False
|
| 408 |
row = tbl.rows[row_idx]
|
| 409 |
if col_idx >= len(row.cells):
|
| 410 |
-
log.append(f"
|
| 411 |
return False
|
| 412 |
cell = row.cells[col_idx]
|
| 413 |
for para in cell.paragraphs:
|
|
@@ -415,7 +524,7 @@ def _apply_text_replace(doc, change, rev, author, date, log):
|
|
| 415 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 416 |
log.append(f" OK text_replace (table_cell row={row_idx} col={col_idx}): {old!r} β {new!r}")
|
| 417 |
return True
|
| 418 |
-
log.append(f"
|
| 419 |
return False
|
| 420 |
else:
|
| 421 |
# Empty row anchor: scan all rows in col_idx.
|
|
@@ -447,7 +556,7 @@ def _apply_text_replace(doc, change, rev, author, date, log):
|
|
| 447 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 448 |
log.append(f" OK text_replace (table_cell any_col row={r_idx} col={c_idx}): {old!r} β {new!r}")
|
| 449 |
return True
|
| 450 |
-
log.append(f"
|
| 451 |
return False
|
| 452 |
|
| 453 |
elif loc['kind'] == 'body_para':
|
|
@@ -458,16 +567,16 @@ def _apply_text_replace(doc, change, rev, author, date, log):
|
|
| 458 |
# Fall back: find by paragraph context
|
| 459 |
para, conf = _find_para(doc, ctx, prefer_not_in_table=True)
|
| 460 |
if para is None:
|
| 461 |
-
log.append(f"
|
| 462 |
return False
|
| 463 |
if old in para.text:
|
| 464 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 465 |
log.append(f" OK text_replace (body_para conf={conf:.1f}): {old!r} β {new!r}")
|
| 466 |
return True
|
| 467 |
-
log.append(f"
|
| 468 |
return False
|
| 469 |
|
| 470 |
-
log.append(f"
|
| 471 |
return False
|
| 472 |
|
| 473 |
|
|
@@ -479,7 +588,7 @@ def _apply_para_insert(doc, change, rev, author, date, log):
|
|
| 479 |
|
| 480 |
anchor_para, conf = _find_para(doc, anchor_text)
|
| 481 |
if anchor_para is None:
|
| 482 |
-
log.append(f"
|
| 483 |
return False
|
| 484 |
|
| 485 |
items = [(p['text'], p['style'] or 'Normal') for p in paras_data]
|
|
@@ -500,13 +609,13 @@ def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
|
|
| 500 |
else:
|
| 501 |
tbl, t_conf = _find_table(doc, loc['table_header'])
|
| 502 |
if tbl is None:
|
| 503 |
-
log.append(f"
|
| 504 |
return False
|
| 505 |
|
| 506 |
after_anchor = loc.get('after_row_anchor', '')
|
| 507 |
row_idx, r_conf = _find_row(tbl, after_anchor)
|
| 508 |
if row_idx < 0:
|
| 509 |
-
log.append(f"
|
| 510 |
return False
|
| 511 |
|
| 512 |
cells_data = change.get('cells', [])
|
|
|
|
| 33 |
# ── Text normalisation ────────────────────────────────────────────────────────
|
| 34 |
|
| 35 |
def _norm(text):
|
| 36 |
+
"""Normalise common Unicode invisible/whitespace/punctuation variants for comparison."""
|
| 37 |
return (text
|
| 38 |
+
.replace('\xa0', ' ') # non-breaking space
|
| 39 |
+
.replace('\u202f', ' ') # narrow no-break space
|
| 40 |
+
.replace('\u2007', ' ') # figure space
|
| 41 |
+
.replace('\u2060', '') # word joiner (invisible)
|
| 42 |
+
.replace('\u200b', '') # zero-width space
|
| 43 |
+
.replace('\u00ad', '') # soft hyphen (invisible)
|
| 44 |
+
.replace('\u2011', '-') # non-breaking hyphen
|
| 45 |
+
.replace('\u2013', '-') # en dash
|
| 46 |
+
.replace('\u2014', '-') # em dash
|
| 47 |
+
.replace('\u2212', '-') # minus sign
|
| 48 |
+
.replace('\u2018', "'") # left single quote
|
| 49 |
+
.replace('\u2019', "'") # right single quote
|
| 50 |
+
.replace('\u201c', '"') # left double quote
|
| 51 |
+
.replace('\u201d', '"') # right double quote
|
| 52 |
.strip())
|
| 53 |
|
| 54 |
|
|
|
|
| 71 |
Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
|
| 72 |
"""
|
| 73 |
base = (text
|
| 74 |
+
.replace('\xa0', '')
|
| 75 |
+
.replace('\u202f', '')
|
| 76 |
+
.replace('\u2007', '')
|
| 77 |
+
.replace('\u2060', '')
|
| 78 |
+
.replace('\u200b', '')
|
| 79 |
+
.replace('\u00ad', '')
|
| 80 |
+
.replace('\u2011', '-')
|
| 81 |
.replace('\u2013', '-')
|
| 82 |
+
.replace('\u2014', '-')
|
| 83 |
+
.replace('\u2212', '-')
|
| 84 |
+
.replace('\u2018', "'")
|
| 85 |
+
.replace('\u2019', "'")
|
| 86 |
+
.replace('\u201c', '"')
|
| 87 |
+
.replace('\u201d', '"'))
|
| 88 |
return re.sub(r'\s+', '', base)
|
| 89 |
|
| 90 |
|
| 91 |
+
def _norm_alnum(text):
|
| 92 |
+
"""Keep only lowercase alphanumeric characters β last-resort matching.
|
| 93 |
+
|
| 94 |
+
Strips all punctuation, spaces, and Unicode variants so that only the
|
| 95 |
+
raw word/number content is compared. Used as a confidence-0.6 fallback
|
| 96 |
+
in _find_row when even whitespace-stripped matching fails (e.g. different
|
| 97 |
+
bracket styles, quote variants, or punctuation differences between the CR
|
| 98 |
+
and the TS).
|
| 99 |
+
"""
|
| 100 |
+
return re.sub(r'[^a-z0-9]', '', text.lower())
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def _clean_prefix(text: str) -> str:
|
| 104 |
+
"""Return the longest leading substring that contains only standard printable
|
| 105 |
+
ASCII characters (ord 32β126).
|
| 106 |
+
|
| 107 |
+
Non-breaking spaces, curly quotes, and other Unicode characters embedded
|
| 108 |
+
mid-text (e.g. between spec number components like 'TS\xa0102\xa0226')
|
| 109 |
+
make the full anchor unmatchable. The clean prefix β the part before the
|
| 110 |
+
first such character β is still reliable and specific enough to locate the
|
| 111 |
+
correct row.
|
| 112 |
+
"""
|
| 113 |
+
end = 0
|
| 114 |
+
for ch in text:
|
| 115 |
+
if ord(ch) < 32 or ord(ch) > 126:
|
| 116 |
+
break
|
| 117 |
+
end += 1
|
| 118 |
+
return text[:end].strip()
|
| 119 |
+
|
| 120 |
+
|
| 121 |
# ── Document search helpers ───────────────────────────────────────────────────
|
| 122 |
|
| 123 |
def _full_para_text(para):
|
|
|
|
| 241 |
"""
|
| 242 |
Find first row in tbl where col-0 cell text contains anchor_text.
|
| 243 |
Returns (row_idx, confidence) or (-1, 0.0).
|
| 244 |
+
|
| 245 |
+
Matching levels, in order of confidence:
|
| 246 |
+
1.0 β exact substring match
|
| 247 |
+
0.9 β Unicode-normalised match (_norm: xa0, dashes, quotes, β¦)
|
| 248 |
+
0.8 β whitespace-stripped match (_norm_ws: also removes tabs/newlines)
|
| 249 |
+
0.6 β alphanumeric-only match (_norm_alnum: strips all non a-z0-9)
|
| 250 |
+
0.55 β clean-prefix unique match: extract the leading ASCII-only part of
|
| 251 |
+
the anchor and find the single row that contains it.
|
| 252 |
+
0.5 β clean-prefix + token-overlap: when multiple rows share the prefix,
|
| 253 |
+
pick the one whose col-0 tokens overlap most with the anchor tokens.
|
| 254 |
"""
|
| 255 |
+
norm_anchor = _norm(anchor_text)
|
| 256 |
+
ws_anchor = _norm_ws(anchor_text)
|
| 257 |
+
alnum_anchor = _norm_alnum(anchor_text)
|
| 258 |
best = (-1, 0.0)
|
| 259 |
+
|
| 260 |
for idx, row in enumerate(tbl.rows):
|
| 261 |
cell0 = row.cells[0].text if row.cells else ''
|
| 262 |
if anchor_text in cell0:
|
|
|
|
| 265 |
best = (idx, 0.9)
|
| 266 |
elif ws_anchor and ws_anchor in _norm_ws(cell0) and best[1] < 0.8:
|
| 267 |
best = (idx, 0.8)
|
| 268 |
+
elif alnum_anchor and alnum_anchor in _norm_alnum(cell0) and best[1] < 0.6:
|
| 269 |
+
best = (idx, 0.6)
|
| 270 |
+
|
| 271 |
+
if best[0] >= 0:
|
| 272 |
+
return best
|
| 273 |
+
|
| 274 |
+
# ββ Prefix-based partial match βββββββββββββββββββββββββββββββββββββββββββββ
|
| 275 |
+
# The anchor may have Unicode chars embedded mid-text that prevent all string
|
| 276 |
+
# comparisons above from matching, even after normalisation (e.g. when the CR
|
| 277 |
+
# extracts '\xa0' between spec-number parts but the TS has different encoding).
|
| 278 |
+
# Strategy: use only the clean ASCII prefix of the anchor as the search key.
|
| 279 |
+
# If that prefix is found in exactly one row β we've uniquely identified it.
|
| 280 |
+
# If it appears in several rows β pick the one whose full token set overlaps
|
| 281 |
+
# most with the anchor's tokens (the user's described disambiguation rule).
|
| 282 |
+
prefix = _clean_prefix(anchor_text)
|
| 283 |
+
if prefix and len(prefix) > 8:
|
| 284 |
+
prefix_low = prefix.lower()
|
| 285 |
+
hits = [
|
| 286 |
+
idx for idx, row in enumerate(tbl.rows)
|
| 287 |
+
if row.cells and prefix_low in row.cells[0].text.lower()
|
| 288 |
+
]
|
| 289 |
+
if len(hits) == 1:
|
| 290 |
+
return hits[0], 0.55
|
| 291 |
+
elif len(hits) > 1:
|
| 292 |
+
anchor_tokens = set(re.findall(r'[a-z0-9]+', anchor_text.lower()))
|
| 293 |
+
best_score, best_idx = -1, -1
|
| 294 |
+
for hit_idx in hits:
|
| 295 |
+
cell_tokens = set(re.findall(r'[a-z0-9]+',
|
| 296 |
+
tbl.rows[hit_idx].cells[0].text.lower()))
|
| 297 |
+
score = len(anchor_tokens & cell_tokens)
|
| 298 |
+
if score > best_score:
|
| 299 |
+
best_score, best_idx = score, hit_idx
|
| 300 |
+
if best_idx >= 0:
|
| 301 |
+
return best_idx, 0.5
|
| 302 |
+
|
| 303 |
+
return (-1, 0.0)
|
| 304 |
|
| 305 |
|
| 306 |
# ── vMerge row insertion ──────────────────────────────────────────────────────
|
|
|
|
| 427 |
break
|
| 428 |
|
| 429 |
if ts_para_elem is None:
|
| 430 |
+
log.append(f' ERROR section_replace: del_heading {del_heading!r} not found in TS')
|
| 431 |
return False
|
| 432 |
|
| 433 |
ts_body = ts_para_elem.getparent()
|
|
|
|
| 493 |
if loc['kind'] == 'table_cell':
|
| 494 |
tbl, t_conf = _find_table(doc, loc['table_header'])
|
| 495 |
if tbl is None:
|
| 496 |
+
log.append(f" ERROR text_replace: table not found {loc['table_header'][:2]!r}")
|
| 497 |
return False
|
| 498 |
col_idx = loc['col_idx']
|
| 499 |
row_anchor = loc['row_anchor']
|
|
|
|
| 501 |
if row_anchor:
|
| 502 |
row_idx, r_conf = _find_row(tbl, row_anchor)
|
| 503 |
if row_idx < 0:
|
| 504 |
+
# Primary table doesn't contain this row anchor β the CR may be
|
| 505 |
+
# targeting a different table than the one _find_table resolved.
|
| 506 |
+
# Try every other table in the document before giving up.
|
| 507 |
+
for alt_tbl in doc.tables:
|
| 508 |
+
if alt_tbl is tbl:
|
| 509 |
+
continue
|
| 510 |
+
row_idx, r_conf = _find_row(alt_tbl, row_anchor)
|
| 511 |
+
if row_idx >= 0:
|
| 512 |
+
tbl = alt_tbl
|
| 513 |
+
break
|
| 514 |
+
if row_idx < 0:
|
| 515 |
+
log.append(f" ERROR text_replace: row anchor not found {row_anchor!r}")
|
| 516 |
return False
|
| 517 |
row = tbl.rows[row_idx]
|
| 518 |
if col_idx >= len(row.cells):
|
| 519 |
+
log.append(f" ERROR text_replace: col_idx {col_idx} out of range")
|
| 520 |
return False
|
| 521 |
cell = row.cells[col_idx]
|
| 522 |
for para in cell.paragraphs:
|
|
|
|
| 524 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 525 |
log.append(f" OK text_replace (table_cell row={row_idx} col={col_idx}): {old!r} β {new!r}")
|
| 526 |
return True
|
| 527 |
+
log.append(f" ERROR text_replace: old text {old!r} not in cell (row={row_idx} col={col_idx})")
|
| 528 |
return False
|
| 529 |
else:
|
| 530 |
# Empty row anchor: scan all rows in col_idx.
|
|
|
|
| 556 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 557 |
log.append(f" OK text_replace (table_cell any_col row={r_idx} col={c_idx}): {old!r} β {new!r}")
|
| 558 |
return True
|
| 559 |
+
log.append(f" ERROR text_replace: old text {old!r} not found in any table column")
|
| 560 |
return False
|
| 561 |
|
| 562 |
elif loc['kind'] == 'body_para':
|
|
|
|
| 567 |
# Fall back: find by paragraph context
|
| 568 |
para, conf = _find_para(doc, ctx, prefer_not_in_table=True)
|
| 569 |
if para is None:
|
| 570 |
+
log.append(f" ERROR text_replace: old text {old!r} not found in TS")
|
| 571 |
return False
|
| 572 |
if old in para.text:
|
| 573 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 574 |
log.append(f" OK text_replace (body_para conf={conf:.1f}): {old!r} β {new!r}")
|
| 575 |
return True
|
| 576 |
+
log.append(f" ERROR text_replace: old text {old!r} not in resolved paragraph")
|
| 577 |
return False
|
| 578 |
|
| 579 |
+
log.append(f" ERROR text_replace: unknown kind {loc['kind']!r}")
|
| 580 |
return False
|
| 581 |
|
| 582 |
|
|
|
|
| 588 |
|
| 589 |
anchor_para, conf = _find_para(doc, anchor_text)
|
| 590 |
if anchor_para is None:
|
| 591 |
+
log.append(f" ERROR para_insert: anchor not found {anchor_text[:60]!r}")
|
| 592 |
return False
|
| 593 |
|
| 594 |
items = [(p['text'], p['style'] or 'Normal') for p in paras_data]
|
|
|
|
| 609 |
else:
|
| 610 |
tbl, t_conf = _find_table(doc, loc['table_header'])
|
| 611 |
if tbl is None:
|
| 612 |
+
log.append(f" ERROR row_insert: table not found {loc['table_header'][:2]!r}")
|
| 613 |
return False
|
| 614 |
|
| 615 |
after_anchor = loc.get('after_row_anchor', '')
|
| 616 |
row_idx, r_conf = _find_row(tbl, after_anchor)
|
| 617 |
if row_idx < 0:
|
| 618 |
+
log.append(f" ERROR row_insert: anchor row not found {after_anchor!r}")
|
| 619 |
return False
|
| 620 |
|
| 621 |
cells_data = change.get('cells', [])
|