heymenn committed on
Commit
f8638ca
·
1 Parent(s): f646c65

modify UI, independent downloads away from docfinder, warnings, retry and manual upload

Browse files
app.py CHANGED
@@ -23,6 +23,30 @@ from pathlib import Path
23
 
24
  import streamlit as st
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  # ── Scripts dir (same folder as app.py / scripts/) ───────────────────────────
27
  SCRIPTS_DIR = Path(__file__).parent / "scripts"
28
  sys.path.insert(0, str(SCRIPTS_DIR))
@@ -74,7 +98,7 @@ def save_state(sid: str, state: dict) -> None:
74
  def new_state(sid: str) -> dict:
75
  return {
76
  "session_id": sid,
77
- "status": "upload",
78
  "excel_filename": None,
79
  "person_name": "Ly Thanh PHAN",
80
  "cr_list": [],
@@ -126,21 +150,34 @@ def tail_log(log_path: str, n: int = 100) -> str:
126
 
127
 
128
  def parse_log_results(log_path: str) -> list[dict]:
129
- """Extract per-TS result lines from the Final Report section."""
130
  p = Path(log_path)
131
  if not p.exists():
132
  return []
133
  lines = p.read_text(errors="replace").splitlines()
134
  results, in_report = [], False
 
135
  for line in lines:
136
- if "Final Report" in line:
137
  in_report = True
138
- if in_report:
139
- for tag in ("OK", "WARN", "FAIL"):
140
- if f"[{tag}]" in line:
141
- ts_name = line.split(f"[{tag}]", 1)[-1].strip()
142
- results.append({"Status": tag, "TS": ts_name})
143
- break
 
 
 
 
 
 
 
 
 
 
 
 
144
  return results
145
 
146
 
@@ -231,6 +268,11 @@ if "sid" not in st.session_state:
231
  sid: str = st.session_state.sid
232
  state: dict = st.session_state.state
233
 
 
 
 
 
 
234
  # ── Sidebar ───────────────────────────────────────────────────────────────────
235
  with st.sidebar:
236
  st.header("Session")
@@ -251,10 +293,37 @@ with st.sidebar:
251
  # ── State machine ─────────────────────────────────────────────────────────────
252
  status: str = state["status"]
253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  # ════════════════════════════════════════════════════════════════════════════
255
  # UPLOAD
256
  # ════════════════════════════════════════════════════════════════════════════
257
- if status == "upload":
258
  st.subheader("Step 1 β€” Upload contribution list")
259
 
260
  uploaded = st.file_uploader(
@@ -330,12 +399,16 @@ elif status == "preview":
330
  "--output-dir", str(output_dir),
331
  ]
332
 
 
 
 
 
333
  log_file = open(str(log_path), "w")
334
  proc = subprocess.Popen(
335
  cmd,
336
  stdout=log_file,
337
  stderr=subprocess.STDOUT,
338
- env=os.environ.copy(),
339
  )
340
  log_file.close()
341
 
@@ -405,25 +478,46 @@ elif status in ("done", "error"):
405
  else:
406
  st.error(f"❌ Pipeline finished with errors (return code: {rc})")
407
 
408
- # Per-TS results table
409
- results = parse_log_results(log_path)
 
 
 
 
 
 
410
  if results:
411
  st.subheader("Results per TS")
412
  import pandas as pd
413
 
414
- df = pd.DataFrame(results)
 
 
415
 
416
  def _color_status(val):
417
  return {
418
  "OK": "background-color: #d4edda; color: #155724",
419
  "WARN": "background-color: #fff3cd; color: #856404",
420
  "FAIL": "background-color: #f8d7da; color: #721c24",
 
421
  }.get(val, "")
422
 
423
- st.dataframe(
424
- df.style.map(_color_status, subset=["Status"]),
425
- use_container_width=True,
426
- )
 
 
 
 
 
 
 
 
 
 
 
 
427
 
428
  # Download ZIP
429
  if output_dir.exists() and any(output_dir.rglob("*")):
@@ -446,6 +540,105 @@ elif status in ("done", "error"):
446
  else:
447
  st.text("Log not found.")
448
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  # Start new session
450
  st.divider()
451
  if st.button("Start new session"):
 
23
 
24
  import streamlit as st
25
 
26
+ # ── EOL credential verification ───────────────────────────────────────────────
27
+
28
def verify_eol_credentials(username: str, password: str) -> bool:
    """Check an ETSI EOL username/password pair against portal.etsi.org.

    A GET on ``LoginRedirection.aspx`` is issued first, then the credentials
    are POSTed as JSON to ``LoginEOL.ashx`` on the same session.

    Args:
        username: EOL account name.
        password: EOL account password.

    Returns:
        ``False`` when the login endpoint answers with the literal body
        ``"Failed"`` (surrounding whitespace ignored), ``True`` otherwise.

    Raises:
        requests.RequestException: network errors and timeouts are not
            caught here and propagate to the caller.
    """
    import json as _json
    import urllib3
    # Both requests below use verify=False; silence the warning that causes.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    import requests as _req

    # NOTE(review): verify=False sends the credentials without TLS certificate
    # validation. Presumably required by the deployment environment — confirm.
    # The session is used as a context manager so its connection pool is
    # released after every login attempt instead of leaking.
    with _req.Session() as session:
        session.get(
            "https://portal.etsi.org/LoginRedirection.aspx",
            verify=False,
            timeout=10,
        )
        resp = session.post(
            "https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
            data=_json.dumps({"username": username, "password": password}),
            headers={"Content-Type": "application/json; charset=UTF-8"},
            verify=False,
            allow_redirects=False,
            timeout=10,
        )
        # NOTE(review): any body other than "Failed" (including an HTTP error
        # page) is treated as a successful login — confirm this is intended.
        return resp.text.strip() != "Failed"
48
+
49
+
50
  # ── Scripts dir (same folder as app.py / scripts/) ───────────────────────────
51
  SCRIPTS_DIR = Path(__file__).parent / "scripts"
52
  sys.path.insert(0, str(SCRIPTS_DIR))
 
98
  def new_state(sid: str) -> dict:
99
  return {
100
  "session_id": sid,
101
+ "status": "login",
102
  "excel_filename": None,
103
  "person_name": "Ly Thanh PHAN",
104
  "cr_list": [],
 
150
 
151
 
152
def parse_log_results(log_path: str) -> list[dict]:
    """Collect per-TS results and their warnings from a pipeline log.

    Lines are ignored until one containing "Final Report" or "Retry Summary"
    is seen. After that, each line containing ``[OK]``, ``[WARN]``, ``[FAIL]``
    or ``[SKIP]`` opens a new entry whose ``TS`` is the text following the
    tag; subsequent lines that start with ``"! "`` (after stripping) are
    attached to the open entry as warnings.

    Returns a list of ``{"Status", "TS", "warnings"}`` dicts, or ``[]`` when
    the log file does not exist.
    """
    log_file = Path(log_path)
    if not log_file.exists():
        return []

    tags = ("OK", "WARN", "FAIL", "SKIP")
    entries: list[dict] = []
    entry = None
    report_started = False

    for raw in log_file.read_text(errors="replace").splitlines():
        # Section headers switch parsing on and are never parsed themselves.
        if "Final Report" in raw or "Retry Summary" in raw:
            report_started = True
            continue
        if not report_started:
            continue

        # First tag (in priority order) present anywhere on the line.
        tag = next((t for t in tags if f"[{t}]" in raw), None)
        if tag is not None:
            if entry is not None:
                entries.append(entry)
            entry = {
                "Status": tag,
                "TS": raw.split(f"[{tag}]", 1)[-1].strip(),
                "warnings": [],
            }
            continue

        if entry is None:
            continue
        text = raw.strip()
        if text.startswith("! "):
            entry["warnings"].append(text[2:])

    if entry is not None:
        entries.append(entry)
    return entries
182
 
183
 
 
268
  sid: str = st.session_state.sid
269
  state: dict = st.session_state.state
270
 
271
+ # Credential guard: if credentials are not in memory (e.g. page refresh after login),
272
+ # force re-login regardless of the persisted status.
273
+ if state.get("status") not in ("login",) and "eol_user" not in st.session_state:
274
+ state["status"] = "login"
275
+
276
  # ── Sidebar ───────────────────────────────────────────────────────────────────
277
  with st.sidebar:
278
  st.header("Session")
 
293
  # ── State machine ─────────────────────────────────────────────────────────────
294
  status: str = state["status"]
295
 
296
+ # ════════════════════════════════════════════════════════════════════════════
297
+ # LOGIN
298
+ # ════════════════════════════════════════════════════════════════════════════
299
+ if status == "login":
300
+ st.subheader("Connect with your ETSI EOL account")
301
+ st.info(
302
+ "Your credentials are used only for this session and are never stored on disk.",
303
+ icon="πŸ”’",
304
+ )
305
+ username = st.text_input("EOL Username")
306
+ password = st.text_input("EOL Password", type="password")
307
+
308
+ if st.button("Connect", type="primary"):
309
+ if not username or not password:
310
+ st.error("Please enter both username and password.")
311
+ else:
312
+ with st.spinner("Verifying credentials…"):
313
+ ok = verify_eol_credentials(username, password)
314
+ if ok:
315
+ st.session_state.eol_user = username
316
+ st.session_state.eol_password = password
317
+ state["status"] = "upload"
318
+ save_state(sid, state)
319
+ st.rerun()
320
+ else:
321
+ st.error("Login failed β€” check your EOL username and password.")
322
+
323
  # ════════════════════════════════════════════════════════════════════════════
324
  # UPLOAD
325
  # ════════════════════════════════════════════════════════════════════════════
326
+ elif status == "upload":
327
  st.subheader("Step 1 β€” Upload contribution list")
328
 
329
  uploaded = st.file_uploader(
 
399
  "--output-dir", str(output_dir),
400
  ]
401
 
402
+ env = os.environ.copy()
403
+ env["EOL_USER"] = st.session_state.eol_user
404
+ env["EOL_PASSWORD"] = st.session_state.eol_password
405
+
406
  log_file = open(str(log_path), "w")
407
  proc = subprocess.Popen(
408
  cmd,
409
  stdout=log_file,
410
  stderr=subprocess.STDOUT,
411
+ env=env,
412
  )
413
  log_file.close()
414
 
 
478
  else:
479
  st.error(f"❌ Pipeline finished with errors (return code: {rc})")
480
 
481
+ # Per-TS results table β€” merge all pipeline logs so retry results don't
482
+ # replace original ones; later logs (pipeline_retry.log) supersede earlier
483
+ # ones (pipeline.log) for the same TS key.
484
+ _merged: dict[str, dict] = {}
485
+ for _lf in sorted(session_dir(sid).glob("pipeline*.log")):
486
+ for _r in parse_log_results(str(_lf)):
487
+ _merged[_r["TS"]] = _r
488
+ results = list(_merged.values())
489
  if results:
490
  st.subheader("Results per TS")
491
  import pandas as pd
492
 
493
+ n_warn = sum(1 for r in results if r["warnings"])
494
+ warn_label = f"Warnings ({n_warn})" if n_warn else "Warnings"
495
+ tab_summary, tab_warnings = st.tabs(["Summary", warn_label])
496
 
497
  def _color_status(val):
498
  return {
499
  "OK": "background-color: #d4edda; color: #155724",
500
  "WARN": "background-color: #fff3cd; color: #856404",
501
  "FAIL": "background-color: #f8d7da; color: #721c24",
502
+ "SKIP": "background-color: #e2e3e5; color: #383d41",
503
  }.get(val, "")
504
 
505
+ with tab_summary:
506
+ df = pd.DataFrame([{"Status": r["Status"], "TS": r["TS"]} for r in results])
507
+ st.dataframe(
508
+ df.style.map(_color_status, subset=["Status"]),
509
+ use_container_width=True,
510
+ )
511
+
512
+ with tab_warnings:
513
+ warned = [r for r in results if r["warnings"]]
514
+ if warned:
515
+ for r in warned:
516
+ with st.expander(f"⚠️ {r['TS']} β€” {len(r['warnings'])} warning(s)"):
517
+ for w in r["warnings"]:
518
+ st.text(w)
519
+ else:
520
+ st.success("No warnings.")
521
 
522
  # Download ZIP
523
  if output_dir.exists() and any(output_dir.rglob("*")):
 
540
  else:
541
  st.text("Log not found.")
542
 
543
+ # ── TS Recovery ───────────────────────────────────────────────────────────
544
+ failed_ts_path = output_dir / "failed_ts.json"
545
+ if failed_ts_path.exists():
546
+ failed_ts_entries = json.loads(failed_ts_path.read_text())
547
+ if failed_ts_entries:
548
+ st.divider()
549
+ st.subheader("⚠️ Recover failed TS downloads")
550
+ st.info(
551
+ f"{len(failed_ts_entries)} TS(s) could not be downloaded. "
552
+ "Retry or upload each one manually, then apply the CRs."
553
+ )
554
+
555
+ for entry in failed_ts_entries:
556
+ spec_key = f"{entry['spec_number']} v{entry['version']}"
557
+ dest_path = Path(entry["spec_dir"]) / entry["expected_filename"]
558
+ ready = dest_path.exists()
559
+
560
+ label = f"{'βœ…' if ready else '❌'} TS {spec_key} β€” CRs: {', '.join(entry['cr_uids'])}"
561
+ with st.expander(label, expanded=not ready):
562
+ col1, col2 = st.columns(2)
563
+
564
+ with col1:
565
+ if st.button("πŸ”„ Retry download",
566
+ key=f"retry_{entry['spec_compact']}_{entry['version']}"):
567
+ from fetch_crs import download_ts as _dl_ts
568
+ with st.spinner(f"Downloading TS {spec_key}…"):
569
+ fn, note = _dl_ts(
570
+ entry["spec_number"], entry["version"],
571
+ Path(entry["spec_dir"]),
572
+ st.session_state.eol_user,
573
+ st.session_state.eol_password,
574
+ )
575
+ if fn:
576
+ st.success(f"Downloaded: {fn}")
577
+ st.rerun()
578
+ else:
579
+ st.error(f"Failed: {note}")
580
+
581
+ with col2:
582
+ uploaded_ts = st.file_uploader(
583
+ f"Or upload `{entry['expected_filename']}`",
584
+ type=["docx"],
585
+ key=f"upload_{entry['spec_compact']}_{entry['version']}",
586
+ )
587
+ if uploaded_ts is not None:
588
+ Path(entry["spec_dir"]).mkdir(parents=True, exist_ok=True)
589
+ dest_path.write_bytes(uploaded_ts.read())
590
+ st.success("Saved βœ“")
591
+ st.rerun()
592
+
593
+ # Global apply button β€” enabled when β‰₯1 TS is now on disk
594
+ ready_entries = [
595
+ e for e in failed_ts_entries
596
+ if (Path(e["spec_dir"]) / e["expected_filename"]).exists()
597
+ ]
598
+ remaining = len(failed_ts_entries) - len(ready_entries)
599
+
600
+ if ready_entries:
601
+ if remaining:
602
+ st.warning(f"{len(ready_entries)} ready, {remaining} will be skipped.")
603
+ else:
604
+ st.success(f"All {len(ready_entries)} TS(s) ready.")
605
+
606
+ if st.button("β–Ά Apply CRs to recovered TSs", type="primary"):
607
+ retry_log = str(session_dir(sid) / "pipeline_retry.log")
608
+ _rc_path(sid).unlink(missing_ok=True) # clear old returncode
609
+
610
+ cmd = [
611
+ sys.executable,
612
+ str(SCRIPTS_DIR / "orchestrate_cr.py"),
613
+ "--output-dir", state["output_dir"],
614
+ "--retry-mode",
615
+ ]
616
+ env = os.environ.copy()
617
+ env["EOL_USER"] = st.session_state.eol_user
618
+ env["EOL_PASSWORD"] = st.session_state.eol_password
619
+
620
+ log_file = open(retry_log, "w")
621
+ proc = subprocess.Popen(
622
+ cmd, stdout=log_file, stderr=subprocess.STDOUT, env=env
623
+ )
624
+ log_file.close()
625
+
626
+ threading.Thread(
627
+ target=_run_and_save_rc,
628
+ args=(proc, _rc_path(sid)),
629
+ daemon=True,
630
+ ).start()
631
+ st.session_state.proc = proc
632
+
633
+ state["status"] = "running"
634
+ state["pid"] = proc.pid
635
+ state["log_path"] = retry_log
636
+ state["started_at"] = datetime.now().isoformat()
637
+ save_state(sid, state)
638
+ st.rerun()
639
+ else:
640
+ st.warning("No TSs available yet β€” retry download or upload DOCX files above.")
641
+
642
  # Start new session
643
  st.divider()
644
  if st.button("Start new session"):
scripts/etsi_client.py ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ etsi_client.py — ETSI document download helpers for ApplyCRs.
3
+
4
+ Provides:
5
+ ETSIDocFinder β€” CR TDoc downloads via docbox.etsi.org
6
+ ETSISpecFinder β€” TS DOCX downloads via portal.etsi.org WKI chain
7
+ """
8
+
9
+ import json
10
+ import os
11
+ import re
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
13
+ from urllib.parse import urljoin
14
+
15
+ import requests
16
+ import urllib3
17
+ from bs4 import BeautifulSoup
18
+
19
+ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
20
+
21
+
22
+ def _get_proxies() -> dict:
23
+ """Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
24
+ proxy = os.environ.get("http_proxy") or os.environ.get("HTTP_PROXY") or ""
25
+ if not proxy:
26
+ return {}
27
+ return {"http": proxy, "https": proxy}
28
+
29
+
30
class ETSIDocFinder:
    """Find and download CR TDocs on docbox.etsi.org with an EOL login."""

    HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/136.0.0.0 Safari/537.36"
        )
    }

    def __init__(self, eol_user: str, eol_password: str):
        """Store the credentials and open a session via :meth:`connect`.

        The session returned by ``connect()`` is kept even when the login
        was reported as failed; no exception is raised here in that case.
        """
        self.eol_user = eol_user
        self.eol_password = eol_password
        self.main_ftp_url = "https://docbox.etsi.org/SET"
        req_data = self.connect()
        self.session = req_data["session"]

    def connect(self):
        """Log in to the ETSI portal and return a status dict.

        Returns:
            ``{"error": bool, "session": requests.Session, "message": str}``.
            On success ``self.session`` is also replaced by the new session.
        """
        session = requests.Session()
        session.headers.update(self.HEADERS)
        session.proxies.update(_get_proxies())

        # Seed DNN session cookies — docbox requires the portal session to be
        # initialised with domain=docbox.etsi.org so the .DOTNETNUKE cookie
        # is scoped to .etsi.org and accepted by docbox.etsi.org as well.
        login_redir_url = (
            "https://portal.etsi.org/LoginRedirection.aspx"
            "?domain=docbox.etsi.org&ReturnUrl=/"
        )
        session.get(login_redir_url, verify=False, timeout=15)

        req = session.post(
            "https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
            data=json.dumps({"username": self.eol_user, "password": self.eol_password}),
            headers={
                "Content-Type": "application/json; charset=UTF-8",
                "Referer": login_redir_url,
            },
            verify=False,
            allow_redirects=False,
            timeout=15,
        )
        # Strip before comparing so trailing whitespace/newlines in the body
        # do not turn a failed login into an apparent success.
        if req.text.strip() == "Failed":
            return {
                "error": True,
                "session": session,
                "message": "Login failed! Check your credentials",
            }
        self.session = session
        return {"error": False, "session": session, "message": "Login successful"}

    def download_document(self, url: str) -> bytes:
        """Download a docbox file using the authenticated session.

        If the session has expired the portal redirects to LoginRedirection —
        we detect this and re-authenticate before retrying once.
        """
        resp = self.session.get(url, verify=False, timeout=30, allow_redirects=True)
        if resp.url and "LoginRedirection" in resp.url:
            self.connect()
            resp = self.session.get(url, verify=False, timeout=30, allow_redirects=True)
        return resp.content

    def get_workgroup(self, doc: str):
        """Derive the docbox folder and year folder from a TDoc id.

        Returns:
            ``(main_tsg, workgroup, doc)`` where ``workgroup`` is ``"20"``
            followed by the text inside the first parentheses of ``doc``.
            ``(None, None, None)`` is returned when the prefix is not
            recognised or the id contains no parenthesised group.
        """
        # startswith() accepts a tuple; the most specific prefixes are tested
        # first because "SETREQ"/"SETTEC" also start with "SET".
        if doc.startswith(("SETREQ", "SCPREQ")):
            main_tsg = "SET-WG-R"
        elif doc.startswith(("SETTEC", "SCPTEC")):
            main_tsg = "SET-WG-T"
        elif doc.startswith(("SET", "SCP")):
            main_tsg = "SET"
        else:
            return None, None, None

        year_match = re.search(r"\(([^)]+)\)", doc)
        if year_match is None:
            # No "(yy)" group: report "unknown" instead of raising
            # AttributeError on a None match.
            return None, None, None
        workgroup = "20" + year_match.group(1)
        return main_tsg, workgroup, doc

    def find_workgroup_url(self, main_tsg, workgroup):
        """Return the URL of the contributions folder matching ``workgroup``.

        The ``05-CONTRIBUTIONS`` listing is scanned for a link whose text
        contains ``workgroup``; if none is found the URL is built from
        ``workgroup`` directly.
        """
        url = f"{self.main_ftp_url}/{main_tsg}/05-CONTRIBUTIONS"
        response = self.session.get(url, verify=False, timeout=15)
        if "LoginRedirection" in response.url:
            # Session expired: log in again and retry once.
            self.connect()
            response = self.session.get(url, verify=False, timeout=15)
        soup = BeautifulSoup(response.text, "html.parser")
        for item in soup.find_all("tr"):
            link = item.find("a")
            if link and workgroup in link.get_text():
                return (
                    f"{self.main_ftp_url}/{main_tsg}/05-CONTRIBUTIONS/{link.get_text()}"
                )
        return f"{self.main_ftp_url}/{main_tsg}/05-CONTRIBUTIONS/{workgroup}"

    def get_docs_from_url(self, url):
        """Return the link texts found in the table at ``url`` ([] on error)."""
        try:
            response = self.session.get(url, verify=False, timeout=15)
            soup = BeautifulSoup(response.text, "html.parser")
            return [item.get_text() for item in soup.select("tr td a")]
        except Exception as e:
            # Best effort: a listing that cannot be read is treated as empty.
            print(f"Error accessing {url}: {e}")
            return []

    def search_document(self, doc_id: str):
        """Return the URL of the TDoc ``doc_id``, or a "not found" message.

        Entries of the workgroup folder are matched case-insensitively
        against ``doc_id``; entries without a dot are treated as sub-folders
        and searched one level deep. When several URLs match, the last one
        found is returned.
        """
        not_found = f"Document {doc_id} not found"
        main_tsg, workgroup, _ = self.get_workgroup(doc_id)
        if not main_tsg:
            return not_found

        wg_url = self.find_workgroup_url(main_tsg, workgroup)
        if not wg_url:
            return not_found

        # Lower-case both sides: the previous check compared the raw id with
        # a lower-cased entry and so never matched ids containing capitals.
        needle = doc_id.lower()
        urls = []
        for entry in self.get_docs_from_url(wg_url):
            if needle in entry.lower():
                urls.append(f"{wg_url}/{entry}")
            elif "." not in entry.rstrip("/"):
                sub_url = f"{wg_url}/{entry}"
                urls.extend(
                    f"{sub_url}/{name}"
                    for name in self.get_docs_from_url(sub_url)
                    if needle in name.lower()
                )
        return urls[-1] if urls else not_found
157
+
158
+
159
class ETSISpecFinder:
    """Locate ETSI TS/TR deliverables and download their DOCX via the portal."""

    def __init__(self, eol_user: str, eol_password: str):
        """Store the EOL credentials and the base URLs/headers used later."""
        self.eol_user = eol_user
        self.eol_password = eol_password
        self.main_url = "https://www.etsi.org/deliver/etsi_ts"
        self.second_url = "https://www.etsi.org/deliver/etsi_tr"
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/136.0.0.0 Safari/537.36"
            )
        }

    def get_spec_path(self, doc_id: str):
        """Return the ``<range>/<folder>`` path fragment for a spec number.

        ``"102 221"`` -> ``"102200_102299/102221"``; each ``-part`` suffix is
        appended to the folder name, single-character parts being prefixed
        with ``"0"`` (``"102 230-2"`` -> ``".../10223002"``). Ids with more
        than one part (``"300 392-3-1"``) are handled the same way instead of
        raising ``ValueError`` on unpacking.

        Raises:
            ValueError: if the number before the first ``-`` is not an integer.
        """
        position, *parts = doc_id.split("-")
        position = position.replace(" ", "")
        suffix = "".join("0" + p if len(p) == 1 else p for p in parts)
        range_start = int(position) - (int(position) % 100)
        return f"{range_start}_{range_start + 99}/{position}{suffix}"

    def get_docs_from_url(self, url):
        """Return the text of every link at ``url`` except the first one.

        Returns ``[]`` when the request or parsing fails.
        """
        try:
            response = requests.get(
                url, verify=False, timeout=15, proxies=_get_proxies()
            )
            soup = BeautifulSoup(response.text, "html.parser")
            # NOTE(review): the first link is dropped — presumably the
            # parent-directory entry of the listing; confirm.
            docs = [item.get_text() for item in soup.find_all("a")][1:]
            return docs
        except Exception as e:
            print(f"Error accessing {url}: {e}")
            return []

    def _normalise_version(self, version: str) -> str:
        """Normalise a user-supplied version string to ETSI zero-padded format.

        '17.6.0' -> '17.06.00' (the '_60' release suffix is ignored during
        matching). Already-normalised strings like '17.06.00' are returned
        unchanged, as is anything that is not three integer components.
        """
        parts = version.strip("/").split(".")
        if len(parts) == 3:
            try:
                return f"{int(parts[0]):02d}.{int(parts[1]):02d}.{int(parts[2]):02d}"
            except ValueError:
                pass
        return version.strip("/")

    def _pick_release(self, releases: list, version: str = None) -> str:
        """Return the release folder matching version, or the last one listed.

        ``releases`` must be non-empty (callers check this before calling).
        """
        if version:
            target = self._normalise_version(version)
            for r in releases:
                folder = r.strip("/").split("_")[0]
                if folder == target:
                    return r
        return releases[-1]

    def search_document(self, doc_id: str, version: str = None):
        """Return the URL of the spec's PDF, or a "not found" message.

        The TS tree is searched first, then the TR tree; within the chosen
        release folder the first ``.pdf`` entry is returned.
        """
        original = doc_id
        url = f"{self.main_url}/{self.get_spec_path(original)}/"
        url2 = f"{self.second_url}/{self.get_spec_path(original)}/"
        print(url)
        print(url2)

        for base in (url, url2):
            releases = self.get_docs_from_url(base)
            if not releases:
                continue
            release = self._pick_release(releases, version)
            for f in self.get_docs_from_url(base + release):
                if f.endswith(".pdf"):
                    return base + release + "/" + f

        return f"Specification {doc_id} not found"

    def _get_wki_id_candidates(self, doc_id: str, version: str = None) -> tuple:
        """Return (candidates, version_str) for a spec version (best match first).

        When ``version`` is not given it is derived from the release folder
        of the PDF found by :meth:`search_document`. ``([], "")`` is returned
        when no version can be determined.
        """
        if version:
            version_str = version
        else:
            pdf_url = self.search_document(doc_id)
            if "not found" in pdf_url.lower():
                return [], ""
            parts = pdf_url.rstrip("/").split("/")
            version_folder = parts[-2]  # e.g. "18.04.00_60"
            v_parts = version_folder.split("_")[0].split(".")  # ["18", "04", "00"]
            try:
                version_str = f"{int(v_parts[0])}.{int(v_parts[1])}.{int(v_parts[2])}"
            except (ValueError, IndexError):
                return [], ""

        import datetime

        spec_num = doc_id.split("-")[0].replace(" ", "")
        today = datetime.date.today().isoformat()

        base_params = {
            "option": "com_standardssearch",
            "view": "data",
            "format": "json",
            "page": "1",
            "title": "1",
            "etsiNumber": "1",
            "content": "1",
            "version": "0",
            "onApproval": "1",
            "published": "1",
            "withdrawn": "1",
            "historical": "1",
            "isCurrent": "1",
            "superseded": "1",
            "startDate": "1988-01-15",
            "endDate": today,
            "harmonized": "0",
            "keyword": "",
            "TB": "",
            "stdType": "",
            "frequency": "",
            "mandate": "",
            "collection": "",
            "sort": "1",
        }

        # ETSI UI sends capital-V version; try both to be safe
        queries = [
            f"{doc_id} V{version_str}",  # e.g. "104 005 V1.2.1" (UI format)
            f"{doc_id} v{version_str}",  # e.g. "104 005 v1.2.1"
            doc_id,                      # e.g. "104 005" (wider net)
        ]
        # dict keys keep insertion order and drop duplicates in one step.
        seen = {}
        for query in queries:
            params = {**base_params, "search": query}
            try:
                resp = requests.get(
                    "https://www.etsi.org/",
                    params=params,
                    headers=self.headers,
                    verify=False,
                    timeout=15,
                    proxies=_get_proxies(),
                )
                data = resp.json()
                if data and isinstance(data, list):
                    hits = [
                        str(item["wki_id"])
                        for item in data
                        if "wki_id" in item and spec_num in json.dumps(item)
                    ]
                    for h in hits:
                        seen[h] = None
                    if hits:
                        # Stop at the first (most specific) query with hits.
                        print(f" wki_id search query={query!r} → {len(hits)} hit(s)")
                        break
            except Exception as e:
                print(f"Error getting wki_id for {doc_id} (query={query!r}): {e}")

        return list(seen), version_str

    def _authenticate_eol(self) -> "requests.Session":
        """Create a requests.Session authenticated to the ETSI EOL portal.

        Raises:
            RuntimeError: if the login endpoint answers "Failed".
        """
        session = requests.Session()
        session.headers.update({"User-Agent": self.headers["User-Agent"]})
        session.proxies.update(_get_proxies())

        login_redir_url = (
            "https://portal.etsi.org/LoginRedirection.aspx"
            "?domain=docbox.etsi.org&ReturnUrl=/"
        )
        session.get(login_redir_url, verify=False, timeout=15)

        login_resp = session.post(
            "https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
            data=json.dumps({"username": self.eol_user, "password": self.eol_password}),
            headers={
                "Content-Type": "application/json; charset=UTF-8",
                "Referer": login_redir_url,
            },
            verify=False,
            allow_redirects=False,
            timeout=15,
        )
        if login_resp.text.strip() == "Failed":
            raise RuntimeError(
                "ETSI EOL login failed — check EOL_USER / EOL_PASSWORD"
            )
        return session

    def search_document_docx(self, doc_id: str, version: str = None) -> str:
        """Download an ETSI spec as DOCX and return the local file path.

        Every wki_id candidate is tried (up to four in parallel); the first
        one that yields a DOCX wins. On failure a human-readable message
        string ("... not found" / "... rejected") is returned instead of a
        path.

        Raises:
            RuntimeError: if the EOL login fails.
        """
        candidates, version_str = self._get_wki_id_candidates(doc_id, version)
        if not candidates:
            return f"Specification {doc_id} not found"

        try:
            version_tag = "".join(f"{int(p):02d}" for p in version_str.split("."))
        except (ValueError, AttributeError):
            version_tag = ""

        auth_session = self._authenticate_eol()

        def try_wki(wki_id):
            """Run the portal download chain for one wki_id; None on failure."""
            print(f"Trying wki_id={wki_id} for {doc_id}")
            # Each worker gets its own session seeded with the login cookies.
            session = requests.Session()
            session.headers.update({"User-Agent": self.headers["User-Agent"]})
            session.proxies.update(_get_proxies())
            session.cookies.update(auth_session.cookies)

            # Step 1: LogonRedirection.asp registers the download intent server-side,
            # generates a one-time profile_id, then 302s to NTaccount.asp.
            # allow_redirects=True means the final response IS the NTaccount.asp page.
            # Do NOT call NTaccount.asp again — a second call invalidates profile_id A
            # and the server rejects the new profile_id B with "Your identifier is wrong".
            r_logon = session.get(
                f"https://portal.etsi.org/webapp/workprogram/LogonRedirection.asp"
                f"?wki_id={wki_id}",
                verify=False,
                timeout=15,
                allow_redirects=True,
            )
            meta_match = re.search(r"URL=([^\"'\s>]+)", r_logon.text)
            if not meta_match:
                print(
                    f" wki_id={wki_id}: authentication failed "
                    f"(no URL= in NTaccount.asp), trying next"
                )
                return None

            meta_url = urljoin(r_logon.url, meta_match.group(1))

            r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
            if r2.status_code != 302:
                print(
                    f" wki_id={wki_id}: unexpected status {r2.status_code}, trying next"
                )
                return None

            location2 = r2.headers.get("Location", "")
            if "processerror" in location2.lower():
                print(f" wki_id={wki_id}: portal rejected ({location2}), trying next")
                return None

            copy_url = urljoin("https://portal.etsi.org/", location2)
            r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)

            if r3.status_code == 302:
                location3 = r3.headers.get("Location", "")
                final_url = urljoin("https://portal.etsi.org/webapp/ewp/", location3)
                r4 = session.get(final_url, verify=False, timeout=15)
            else:
                r4 = r3

            docx_urls = re.findall(
                r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE
            )
            if not docx_urls:
                print(f" wki_id={wki_id}: DOCX not found in page, trying next")
                return None

            spec_num = doc_id.split("-")[0].replace(" ", "")
            matching_urls = [u for u in docx_urls if spec_num in u.split("/")[-1]]
            if not matching_urls:
                print(
                    f" wki_id={wki_id}: DOCX spec mismatch "
                    f"(expected {spec_num}), trying next"
                )
                return None

            if version_tag:
                version_candidates = [
                    version_tag,                    # "010201"
                    f"v{version_tag}",              # "v010201"
                    version_str.replace(".", ""),   # "121"
                    version_str,                    # "1.2.1"
                    version_str.replace(".", "_"),  # "1_2_1"
                ]
                versioned_urls = []
                for tag in version_candidates:
                    versioned_urls = [
                        u for u in matching_urls if tag in u.split("/")[-1]
                    ]
                    if versioned_urls:
                        break

                if not versioned_urls:
                    found_names = [u.split("/")[-1] for u in matching_urls]
                    print(
                        f" wki_id={wki_id}: version tag not in filenames {found_names}, "
                        f"using first spec-matching DOCX as fallback"
                    )
                    versioned_urls = matching_urls

                matching_urls = versioned_urls

            # Resolve against the page URL so a relative href still works;
            # absolute hrefs are returned unchanged by urljoin.
            docx_url = urljoin(r4.url, matching_urls[0])
            dl = session.get(
                docx_url,
                headers={"Referer": r4.url},
                verify=False,
                timeout=60,
            )
            # A DOCX is a ZIP container and starts with "PK". Reject error
            # pages and login redirects rather than saving them as .docx.
            if not dl.ok or not dl.content.startswith(b"PK"):
                print(
                    f" wki_id={wki_id}: download did not return a DOCX "
                    f"(HTTP {dl.status_code}), trying next"
                )
                return None

            filename = docx_url.split("/")[-1]
            tmp_path = f"/tmp/{filename}"
            with open(tmp_path, "wb") as f:
                f.write(dl.content)

            print(f" wki_id={wki_id}: success")
            return tmp_path

        executor = ThreadPoolExecutor(max_workers=min(len(candidates), 4))
        try:
            futures = {executor.submit(try_wki, wki_id): wki_id for wki_id in candidates}
            for future in as_completed(futures):
                result = future.result()
                if result is not None:
                    # First success wins: cancel whatever has not started yet.
                    for f in futures:
                        f.cancel()
                    return result
        finally:
            # Do not block on workers that are still running.
            executor.shutdown(wait=False)

        return f"Specification {doc_id}: all {len(candidates)} wki_id candidate(s) rejected"
scripts/fetch_crs.py CHANGED
@@ -7,9 +7,9 @@ Usage:
7
 
8
  Steps:
9
  1. Parse Excel, filter Accepted CRs by person name
10
- 2. Download CR DOCXs via docfinder /find/tdoc/download
11
  3. Parse CR cover pages to extract target TS spec + version
12
- 4. Download TS DOCXs via docfinder /find/docx
13
  5. Print summary report
14
  """
15
 
@@ -17,15 +17,10 @@ import argparse
17
  import os
18
  import re
19
  import sys
20
- import time
21
  import zipfile
22
  from pathlib import Path
23
 
24
- import requests
25
-
26
- BASE_URL = "https://organizedprogrammers-docfinder.hf.space"
27
- #_proxy = os.environ.get("http_proxy") or None
28
- #PROXIES = {"http": _proxy, "https": os.environ.get("https_proxy") or None}
29
 
30
 
31
  # ---------------------------------------------------------------------------
@@ -178,7 +173,7 @@ def _parse_xlsx(path: Path, person_name: str):
178
  # Step 2 — Download CR DOCXs
179
  # ---------------------------------------------------------------------------
180
 
181
- def download_cr(uid: str, cr_dir: Path):
182
  """
183
  Download CR DOCX for the given UID.
184
 
@@ -193,19 +188,14 @@ def download_cr(uid: str, cr_dir: Path):
193
  return dest, "already existed"
194
 
195
  try:
196
- resp = requests.post(
197
- f"{BASE_URL}/find/tdoc/download",
198
- json={"doc_id": uid},
199
- #proxies=PROXIES,
200
- timeout=60,
201
- )
202
- except requests.RequestException as e:
203
- return None, f"network error: {e}"
204
-
205
- if not resp.ok:
206
- return None, f"HTTP {resp.status_code}"
207
-
208
- content = resp.content
209
  if not content:
210
  return None, "empty response"
211
 
@@ -296,22 +286,11 @@ def parse_cr_cover(docx_path: Path):
296
  # Step 4 — Download TS DOCXs
297
  # ---------------------------------------------------------------------------
298
 
299
- def _is_html(resp: requests.Response) -> bool:
300
- """Return True if the response body is an HTML page (e.g. HF Space loading page)."""
301
- ct = resp.headers.get("content-type", "")
302
- if "text/html" in ct:
303
- return True
304
- return resp.content[:5].lower() in (b"<!doc", b"<html")
305
-
306
-
307
  def download_ts(spec_number: str, version: str, ts_dir: Path,
308
- max_retries: int = 3, retry_delay: int = 10):
309
  """
310
  Download TS DOCX for spec_number (e.g. "102 221") and version (e.g. "18.3.0").
311
 
312
- Retries up to max_retries times when the HF Space returns an HTML loading page
313
- instead of the DOCX binary (happens on cold-start / brief restarts).
314
-
315
  Returns (filename, note) or (None, error_msg).
316
  """
317
  spec_no_space = spec_number.replace(" ", "")
@@ -321,56 +300,40 @@ def download_ts(spec_number: str, version: str, ts_dir: Path,
321
  if dest.exists():
322
  return filename, "already existed"
323
 
324
- last_error = "no attempts made"
325
- for attempt in range(1, max_retries + 1):
326
- try:
327
- resp = requests.post(
328
- f"{BASE_URL}/find/docx",
329
- json={"doc_id": spec_number, "version": version},
330
- #proxies=PROXIES,
331
- timeout=120,
332
- )
333
- except requests.RequestException as e:
334
- return None, f"network error: {e}"
335
 
336
- if not resp.ok:
337
- return None, f"HTTP {resp.status_code} β€” {resp.text[:200]}"
338
 
339
- content = resp.content
340
- if not content:
341
- return None, "empty response"
342
 
343
- # Detect HTML splash page (HF Space cold-start) β€” retry after a delay
344
- if _is_html(resp):
345
- last_error = f"got HTML instead of DOCX (attempt {attempt}/{max_retries})"
346
- if attempt < max_retries:
347
- print(f"\n [retry in {retry_delay}s β€” HF Space loading…]", flush=True)
348
- time.sleep(retry_delay)
349
- continue
350
- return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r}) after {max_retries} attempts"
351
 
352
- # Good binary response
353
- dest.write_bytes(content)
 
354
 
355
- if content[:2] != b"PK":
 
 
 
 
 
356
  dest.unlink()
357
- return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r})"
358
-
359
- # Verify the TS contains the expected spec number in its first paragraph
360
- try:
361
- import docx as _docx
362
- _doc = _docx.Document(dest)
363
- first_para = _doc.paragraphs[0].text if _doc.paragraphs else ''
364
- if spec_no_space not in first_para.replace(' ', ''):
365
- dest.unlink()
366
- return None, f"wrong TS returned by API: got {first_para[:80]!r} (expected spec {spec_no_space})"
367
- except Exception:
368
- pass # Trust the ZIP check above
369
-
370
- note = "downloaded" if attempt == 1 else f"downloaded (after {attempt} attempts)"
371
- return filename, note
372
 
373
- return None, last_error
374
 
375
 
376
  # ---------------------------------------------------------------------------
@@ -394,6 +357,11 @@ def main():
394
  person_name = args.person_name
395
  output_dir = Path(wsl_path(args.output_dir)).expanduser()
396
 
 
 
 
 
 
397
  cr_dir = output_dir / "CRs"
398
  ts_dir = output_dir / "TS"
399
  cr_dir.mkdir(parents=True, exist_ok=True)
@@ -419,8 +387,8 @@ def main():
419
  cr_results = [] # list of (uid, docx_path_or_None, note)
420
 
421
  for uid, title in cr_list:
422
- print(f" [{uid}] ", end="", flush=True)
423
- docx_path, note = download_cr(uid, cr_dir)
424
  cr_results.append((uid, docx_path, note))
425
  if docx_path:
426
  print(f"OK ({note}) β€” {docx_path.name}")
@@ -452,7 +420,7 @@ def main():
452
 
453
  for (spec_number, version), uids in ts_targets.items():
454
  print(f" [TS {spec_number} v{version}] ", end="", flush=True)
455
- filename, note = download_ts(spec_number, version, ts_dir)
456
  ts_results.append((spec_number, version, filename, note))
457
  if filename:
458
  print(f"OK ({note}) β€” {filename}")
 
7
 
8
  Steps:
9
  1. Parse Excel, filter Accepted CRs by person name
10
+ 2. Download CR DOCXs via ETSI docbox
11
  3. Parse CR cover pages to extract target TS spec + version
12
+ 4. Download TS DOCXs via ETSI portal WKI chain
13
  5. Print summary report
14
  """
15
 
 
17
  import os
18
  import re
19
  import sys
 
20
  import zipfile
21
  from pathlib import Path
22
 
23
+ from etsi_client import ETSIDocFinder, ETSISpecFinder
 
 
 
 
24
 
25
 
26
  # ---------------------------------------------------------------------------
 
173
  # Step 2 — Download CR DOCXs
174
  # ---------------------------------------------------------------------------
175
 
176
+ def download_cr(uid: str, cr_dir: Path, eol_user: str, eol_password: str):
177
  """
178
  Download CR DOCX for the given UID.
179
 
 
188
  return dest, "already existed"
189
 
190
  try:
191
+ finder = ETSIDocFinder(eol_user, eol_password)
192
+ url = finder.search_document(uid)
193
+ if isinstance(url, str) and "not found" in url.lower():
194
+ return None, f"document not found: {uid}"
195
+ content = finder.download_document(url)
196
+ except Exception as e:
197
+ return None, f"download error: {e}"
198
+
 
 
 
 
 
199
  if not content:
200
  return None, "empty response"
201
 
 
286
  # Step 4 — Download TS DOCXs
287
  # ---------------------------------------------------------------------------
288
 
 
 
 
 
 
 
 
 
289
  def download_ts(spec_number: str, version: str, ts_dir: Path,
290
+ eol_user: str = "", eol_password: str = ""):
291
  """
292
  Download TS DOCX for spec_number (e.g. "102 221") and version (e.g. "18.3.0").
293
 
 
 
 
294
  Returns (filename, note) or (None, error_msg).
295
  """
296
  spec_no_space = spec_number.replace(" ", "")
 
300
  if dest.exists():
301
  return filename, "already existed"
302
 
303
+ try:
304
+ finder = ETSISpecFinder(eol_user, eol_password)
305
+ tmp_path = finder.search_document_docx(spec_number, version)
306
+ except Exception as e:
307
+ return None, f"download error: {e}"
 
 
 
 
 
 
308
 
309
+ if "not found" in str(tmp_path).lower() or "rejected" in str(tmp_path).lower():
310
+ return None, tmp_path
311
 
312
+ content = Path(tmp_path).read_bytes()
313
+ if not content:
314
+ return None, "empty response"
315
 
316
+ dest.write_bytes(content)
 
 
 
 
 
 
 
317
 
318
+ if content[:2] != b"PK":
319
+ dest.unlink()
320
+ return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r})"
321
 
322
+ # Verify the TS contains the expected spec number in its first paragraph
323
+ try:
324
+ import docx as _docx
325
+ _doc = _docx.Document(dest)
326
+ first_para = _doc.paragraphs[0].text if _doc.paragraphs else ""
327
+ if spec_no_space not in first_para.replace(" ", ""):
328
  dest.unlink()
329
+ return None, (
330
+ f"wrong TS returned: got {first_para[:80]!r} "
331
+ f"(expected spec {spec_no_space})"
332
+ )
333
+ except Exception:
334
+ pass # Trust the ZIP check above
 
 
 
 
 
 
 
 
 
335
 
336
+ return filename, "downloaded"
337
 
338
 
339
  # ---------------------------------------------------------------------------
 
357
  person_name = args.person_name
358
  output_dir = Path(wsl_path(args.output_dir)).expanduser()
359
 
360
+ eol_user = os.environ.get("EOL_USER", "")
361
+ eol_password = os.environ.get("EOL_PASSWORD", "")
362
+ if not eol_user or not eol_password:
363
+ sys.exit("ERROR: EOL_USER and EOL_PASSWORD must be set")
364
+
365
  cr_dir = output_dir / "CRs"
366
  ts_dir = output_dir / "TS"
367
  cr_dir.mkdir(parents=True, exist_ok=True)
 
387
  cr_results = [] # list of (uid, docx_path_or_None, note)
388
 
389
  for uid, title in cr_list:
390
+ #print(f" [{uid}] ", end="", flush=True)
391
+ docx_path, note = download_cr(uid, cr_dir, eol_user, eol_password)
392
  cr_results.append((uid, docx_path, note))
393
  if docx_path:
394
  print(f"OK ({note}) β€” {docx_path.name}")
 
420
 
421
  for (spec_number, version), uids in ts_targets.items():
422
  print(f" [TS {spec_number} v{version}] ", end="", flush=True)
423
+ filename, note = download_ts(spec_number, version, ts_dir, eol_user, eol_password)
424
  ts_results.append((spec_number, version, filename, note))
425
  if filename:
426
  print(f"OK ({note}) β€” {filename}")
scripts/finalize_ts.py CHANGED
@@ -178,18 +178,32 @@ def _detect_meeting_separator(tbl):
178
 
179
  # ── TS table locators ─────────────────────────────────────────────────────────
180
 
 
 
 
 
 
181
  def find_change_history_table(ts_doc):
182
- """Return ts_doc.tables[-2] (Change History / Annex V). Accepts 8 or 9 columns."""
183
- tables = ts_doc.tables
184
- if len(tables) < 2:
185
- raise ValueError('TS has fewer than 2 tables')
186
- tbl = tables[-2]
187
- ncols = len(tbl.rows[-1].cells)
188
- if ncols not in (8, 9):
189
- raise ValueError(
190
- f'Change History table has {ncols} columns, expected 8 or 9'
191
- )
192
- return tbl
 
 
 
 
 
 
 
 
 
193
 
194
 
195
  def find_history_table(ts_doc):
 
178
 
179
  # ── TS table locators ─────────────────────────────────────────────────────────
180
 
181
class NoChangeHistoryTable(Exception):
    """Raised when the document contains no recognisable Change History table."""


def find_change_history_table(ts_doc):
    """
    Return the Change History table of ``ts_doc``.

    The tables in ``ts_doc.tables`` are scanned backward from the end and the
    first one satisfying both conditions is returned:
      - its last row has 8 or 9 cells
      - the joined, lower-cased text of its first (header) row contains at
        least one of the keywords 'cr', 'date', 'meeting', 'rev'
        (plain substring match)

    Tables without any rows are skipped.

    Raises NoChangeHistoryTable (not ValueError) when no table matches, so
    callers can distinguish a structural absence from an unexpected error.
    """
    header_keywords = ('cr', 'date', 'meeting', 'rev')

    for tbl in reversed(ts_doc.tables):
        rows = tbl.rows
        # Check for rows *before* indexing: rows[-1] on a row-less table
        # would raise IndexError and abort the whole scan.
        if not rows:
            continue
        if len(rows[-1].cells) not in (8, 9):
            continue
        header_text = ' '.join(c.text.strip() for c in rows[0].cells).lower()
        if any(kw in header_text for kw in header_keywords):
            return tbl

    raise NoChangeHistoryTable(
        'No Change History table found in this document '
        '(no table with 8 or 9 columns and CR/Date/Meeting/Rev headers)'
    )
207
 
208
 
209
  def find_history_table(ts_doc):
scripts/orchestrate_cr.py CHANGED
@@ -22,8 +22,11 @@ import argparse
22
  import contextlib
23
  import datetime
24
  import io
 
 
25
  import re
26
  import sys
 
27
  from pathlib import Path
28
 
29
  import docx as docx_lib
@@ -44,6 +47,7 @@ from finalize_ts import (
44
  update_change_history_table,
45
  update_history_table,
46
  update_title_para,
 
47
  )
48
  from docx_helpers import RevCounter, AUTHOR as DEFAULT_AUTHOR, DATE as DEFAULT_DATE
49
 
@@ -78,7 +82,12 @@ def main():
78
  description='Fully automated CR application pipeline.',
79
  formatter_class=argparse.RawDescriptionHelpFormatter,
80
  )
81
- ap.add_argument('excel_path', help='Path to .xls or .xlsx contribution list')
 
 
 
 
 
82
  ap.add_argument(
83
  'person_name',
84
  nargs='?',
@@ -95,9 +104,21 @@ def main():
95
  default=DEFAULT_AUTHOR,
96
  help=f'Tracked change author name (default: "{DEFAULT_AUTHOR}")',
97
  )
 
 
 
 
 
98
  args = ap.parse_args()
99
 
100
- excel_path = wsl_path(args.excel_path)
 
 
 
 
 
 
 
101
  output_dir = Path(wsl_path(args.output_dir)).expanduser()
102
  cr_dir = output_dir / 'CRs'
103
  ts_dir = output_dir / 'TS' # spec subfolders created per-TS below
@@ -107,6 +128,212 @@ def main():
107
  author = args.author
108
  tc_date = DEFAULT_DATE
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  # ── Step 1: Parse Excel ───────────────────────────────────────────────────
111
  _section('Step 1 β€” Parsing Excel')
112
  print(f'Excel: {excel_path}')
@@ -117,9 +344,7 @@ def main():
117
  except Exception as e:
118
  sys.exit(f'ERROR parsing Excel: {e}')
119
 
120
- print(f'Found {len(cr_list)} Accepted CR(s):')
121
- for uid, title in cr_list:
122
- print(f' {uid}: {title[:80]}')
123
 
124
  if not cr_list:
125
  print('Nothing to process.')
@@ -130,13 +355,16 @@ def main():
130
  cr_paths = {} # uid -> Path
131
 
132
  for uid, _ in cr_list:
133
- print(f' [{uid}] ', end='', flush=True)
134
- docx_path, note = download_cr(uid, cr_dir)
135
  if docx_path:
136
  cr_paths[uid] = docx_path
137
- print(f'OK ({note}) β€” {docx_path.name}')
138
- else:
139
- print(f'FAILED β€” {note}')
 
 
 
 
140
 
141
  # ── Step 3: Parse cover pages β†’ group by target TS ───────────────────────
142
  _section('Step 3 β€” Parsing CR cover pages')
@@ -169,13 +397,41 @@ def main():
169
  spec_dirs[(spec_number, version)] = spec_dir
170
 
171
  print(f' [TS {spec_number} v{version}] ', end='', flush=True)
172
- filename, note = download_ts(spec_number, version, spec_dir)
 
 
 
 
 
 
 
 
 
 
173
  if filename:
174
  ts_paths[(spec_number, version)] = spec_dir / filename
175
  print(f'OK ({note}) β€” {spec_compact}/{filename}')
176
  else:
177
  print(f'FAILED β€” {note}')
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  # ── Steps 5 & 6: Apply CRs + Finalise each TS ────────────────────────────
180
  _section('Steps 5 & 6 β€” Applying CRs and Finalising Metadata')
181
  report = [] # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)
@@ -258,6 +514,10 @@ def main():
258
 
259
  for line in log_lines:
260
  print(f' {line}')
 
 
 
 
261
  print(f' -> Applied: {n_ok} Skipped: {n_skip}')
262
 
263
  # 6. Finalise metadata (Change History, History, title paragraph)
@@ -285,6 +545,8 @@ def main():
285
  ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
286
  )
287
  print(f' [Change History] {uid}: {ch_cells}')
 
 
288
  except Exception as e:
289
  errors.append(f'[{uid}] Change History ERROR: {e}')
290
  print(f' [Change History] {uid}: ERROR β€” {e}')
 
22
  import contextlib
23
  import datetime
24
  import io
25
+ import json
26
+ import os
27
  import re
28
  import sys
29
+ import time
30
  from pathlib import Path
31
 
32
  import docx as docx_lib
 
47
  update_change_history_table,
48
  update_history_table,
49
  update_title_para,
50
+ NoChangeHistoryTable,
51
  )
52
  from docx_helpers import RevCounter, AUTHOR as DEFAULT_AUTHOR, DATE as DEFAULT_DATE
53
 
 
82
  description='Fully automated CR application pipeline.',
83
  formatter_class=argparse.RawDescriptionHelpFormatter,
84
  )
85
+ ap.add_argument(
86
+ 'excel_path',
87
+ nargs='?',
88
+ default=None,
89
+ help='Path to .xls or .xlsx contribution list (not required in --retry-mode)',
90
+ )
91
  ap.add_argument(
92
  'person_name',
93
  nargs='?',
 
104
  default=DEFAULT_AUTHOR,
105
  help=f'Tracked change author name (default: "{DEFAULT_AUTHOR}")',
106
  )
107
+ ap.add_argument(
108
+ '--retry-mode',
109
+ action='store_true',
110
+ help='Skip steps 1-4; apply CRs to TSs listed in failed_ts.json that now have their DOCX on disk',
111
+ )
112
  args = ap.parse_args()
113
 
114
+ if not args.retry_mode and not args.excel_path:
115
+ ap.error('excel_path is required when not in --retry-mode')
116
+
117
+ eol_user = os.environ.get("EOL_USER", "")
118
+ eol_password = os.environ.get("EOL_PASSWORD", "")
119
+ if not eol_user or not eol_password:
120
+ sys.exit("ERROR: EOL_USER and EOL_PASSWORD must be set")
121
+
122
  output_dir = Path(wsl_path(args.output_dir)).expanduser()
123
  cr_dir = output_dir / 'CRs'
124
  ts_dir = output_dir / 'TS' # spec subfolders created per-TS below
 
128
  author = args.author
129
  tc_date = DEFAULT_DATE
130
 
131
+ # ── Retry mode β€” skip steps 1-4, reconstruct state from failed_ts.json ───
132
+ if args.retry_mode:
133
+ failed_ts_path = output_dir / 'failed_ts.json'
134
+ if not failed_ts_path.exists():
135
+ sys.exit('ERROR: failed_ts.json not found in output directory')
136
+ failed_ts_entries = json.loads(failed_ts_path.read_text())
137
+ if not failed_ts_entries:
138
+ print('No failed TSs in failed_ts.json β€” nothing to retry.')
139
+ return
140
+
141
+ _section('Retry mode β€” Steps 5 & 6 only')
142
+ print(f'Retrying {len(failed_ts_entries)} TS(s) from failed_ts.json')
143
+
144
+ ts_groups = {}
145
+ spec_dirs = {}
146
+ ts_paths = {}
147
+ cr_paths = {}
148
+
149
+ for entry in failed_ts_entries:
150
+ spec_number = entry['spec_number']
151
+ version = entry['version']
152
+ key = (spec_number, version)
153
+ ts_groups[key] = entry['cr_uids']
154
+ spec_dir = Path(entry['spec_dir'])
155
+ spec_dirs[key] = spec_dir
156
+ expected = spec_dir / entry['expected_filename']
157
+ if expected.exists():
158
+ ts_paths[key] = expected
159
+ print(f' [TS {spec_number} v{version}] DOCX found β€” will apply')
160
+ else:
161
+ print(f' [TS {spec_number} v{version}] DOCX missing β€” skipping')
162
+ # Reconstruct cr_paths for each UID
163
+ cr_entry_dir = Path(entry['cr_dir'])
164
+ for uid in entry['cr_uids']:
165
+ extracted = cr_entry_dir / f'{uid}_extracted.docx'
166
+ plain = cr_entry_dir / f'{uid}.docx'
167
+ if extracted.exists():
168
+ cr_paths[uid] = extracted
169
+ elif plain.exists():
170
+ cr_paths[uid] = plain
171
+
172
+ # ── Steps 5 & 6 (retry mode falls through to shared loop below) ──────
173
+ report = []
174
+
175
+ for (spec_number, version), uids in ts_groups.items():
176
+ ts_key = f'TS {spec_number} v{version}'
177
+ spec_compact = spec_number.replace(' ', '')
178
+ spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
179
+ spec_dir.mkdir(parents=True, exist_ok=True)
180
+
181
+ new_v = derive_new_version(version)
182
+ stem = f'ts_{spec_compact}_v{new_v}_was_v{version}'
183
+ ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
184
+ ts_final = spec_dir / f'{stem}.docx'
185
+ log_path = spec_dir / f'{stem}.log'
186
+ errors = []
187
+
188
+ print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')
189
+
190
+ if (spec_number, version) not in ts_paths:
191
+ msg = 'TS DOCX not on disk β€” skipping'
192
+ print(f' SKIP: {msg}')
193
+ report.append((ts_key, 0, 0, len(uids), None, log_path, [msg]))
194
+ continue
195
+
196
+ ts_in = ts_paths[(spec_number, version)]
197
+
198
+ log_buf = io.StringIO()
199
+ tee = _TeeWriter(sys.stdout, log_buf)
200
+
201
+ with contextlib.redirect_stdout(tee):
202
+ log_header = (
203
+ f'Pipeline Log (retry)\n'
204
+ f'TS: {spec_number} v{version} -> v{new_v}\n'
205
+ f'CRs: {", ".join(uids)}\n'
206
+ f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
207
+ f'{"=" * 60}\n'
208
+ )
209
+ print(log_header, end='')
210
+
211
+ combined_manifest = []
212
+ participating_uids = []
213
+
214
+ for uid in uids:
215
+ if uid not in cr_paths:
216
+ errors.append(f'[{uid}] CR DOCX not found β€” skipped')
217
+ continue
218
+ print(f' Parsing {uid}... ', end='', flush=True)
219
+ try:
220
+ changes = parse_cr(cr_paths[uid])
221
+ combined_manifest.extend(changes)
222
+ participating_uids.append(uid)
223
+ print(f'{len(changes)} change(s)')
224
+ except Exception as e:
225
+ errors.append(f'[{uid}] parse ERROR: {e}')
226
+ print(f'ERROR: {e}')
227
+
228
+ if not combined_manifest:
229
+ print(' No changes parsed β€” skipping apply step.')
230
+ report.append((ts_key, 0, 0, len(uids), None, log_path,
231
+ errors + ['No changes parsed']))
232
+ log_path.write_text(log_buf.getvalue(), encoding='utf-8')
233
+ continue
234
+
235
+ print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
236
+ try:
237
+ n_ok, n_skip, log_lines = apply_manifest(
238
+ ts_in, combined_manifest, ts_applied, author=author, date=tc_date
239
+ )
240
+ except Exception as e:
241
+ errors.append(f'apply_manifest ERROR: {e}')
242
+ print(f' ERROR: {e}')
243
+ report.append((ts_key, 0, 0, len(uids), None, log_path, errors))
244
+ log_path.write_text(log_buf.getvalue(), encoding='utf-8')
245
+ continue
246
+
247
+ for line in log_lines:
248
+ print(f' {line}')
249
+ # Bubble every un-applied change into the warnings list
250
+ for line in log_lines:
251
+ if line.strip().startswith('ERROR'):
252
+ errors.append(line.strip())
253
+ print(f' -> Applied: {n_ok} Skipped: {n_skip}')
254
+
255
+ print(' Finalising metadata...')
256
+ try:
257
+ ts_doc = docx_lib.Document(str(ts_applied))
258
+ rev = RevCounter(ts_doc)
259
+
260
+ pub_ym, pub_month_year = compute_pub_date()
261
+ old_v = version
262
+
263
+ title_text = ts_doc.paragraphs[0].text
264
+ date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
265
+ old_date_str = date_match.group(1) if date_match else ''
266
+
267
+ print(f' Version: {old_v} -> {new_v}')
268
+ print(f' Publication: {pub_month_year} ({pub_ym})')
269
+
270
+ for uid in participating_uids:
271
+ try:
272
+ meta = extract_cr_metadata(str(cr_paths[uid]))
273
+ ch_cells = update_change_history_table(
274
+ ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
275
+ )
276
+ print(f' [Change History] {uid}: {ch_cells}')
277
+ except NoChangeHistoryTable:
278
+ print(f' [Change History] {uid}: NOT PRESENT β€” this document has no Change History table (History table only)')
279
+ except Exception as e:
280
+ errors.append(f'[{uid}] Change History ERROR: {e}')
281
+ print(f' [Change History] {uid}: ERROR β€” {e}')
282
+
283
+ try:
284
+ h_cells = update_history_table(
285
+ ts_doc, new_v, pub_month_year, rev, author, tc_date
286
+ )
287
+ print(f' [History] {h_cells}')
288
+ except Exception as e:
289
+ errors.append(f'History table ERROR: {e}')
290
+ print(f' [History] ERROR β€” {e}')
291
+
292
+ if old_date_str:
293
+ try:
294
+ update_title_para(
295
+ ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
296
+ )
297
+ print(f' [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
298
+ except Exception as e:
299
+ errors.append(f'Title update ERROR: {e}')
300
+ print(f' [Title] ERROR β€” {e}')
301
+ else:
302
+ print(f' [Title] SKIP β€” no (YYYY-MM) pattern in: {title_text!r}')
303
+
304
+ ts_doc.save(str(ts_final))
305
+ print(f' Saved: {spec_compact}/{ts_final.name}')
306
+ print(f' Log: {spec_compact}/{log_path.name}')
307
+ report.append((ts_key, n_ok, n_skip, len(uids), ts_final, log_path, errors))
308
+
309
+ except Exception as e:
310
+ errors.append(f'Finalisation ERROR: {e}')
311
+ print(f' Finalisation ERROR: {e}')
312
+ report.append((ts_key, n_ok, n_skip, len(uids), ts_applied, log_path, errors))
313
+
314
+ log_path.write_text(log_buf.getvalue(), encoding='utf-8')
315
+
316
+ # Update failed_ts.json β€” remove entries that are now resolved
317
+ still_failed = [
318
+ e for e in failed_ts_entries
319
+ if not (Path(e['spec_dir']) / e['expected_filename']).exists()
320
+ ]
321
+ failed_ts_path.write_text(json.dumps(still_failed, indent=2))
322
+
323
+ _section('Retry Summary')
324
+ n_success = sum(1 for r in report if r[4] is not None and not r[6])
325
+ n_partial = sum(1 for r in report if r[4] is not None and r[6])
326
+ n_failed = sum(1 for r in report if r[4] is None)
327
+ print(f'TSs processed: {n_success} fully OK, {n_partial} with warnings, {n_failed} skipped/failed')
328
+ for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
329
+ status_tag = 'OK' if out_path and not errors else ('WARN' if out_path else 'SKIP')
330
+ print(f' [{status_tag}] {ts_key}')
331
+ for err in errors:
332
+ print(f' ! {err}')
333
+ return
334
+
335
+ excel_path = wsl_path(args.excel_path)
336
+
337
  # ── Step 1: Parse Excel ───────────────────────────────────────────────────
338
  _section('Step 1 β€” Parsing Excel')
339
  print(f'Excel: {excel_path}')
 
344
  except Exception as e:
345
  sys.exit(f'ERROR parsing Excel: {e}')
346
 
347
+ print(f'Found {len(cr_list)} Accepted CR(s)')
 
 
348
 
349
  if not cr_list:
350
  print('Nothing to process.')
 
355
  cr_paths = {} # uid -> Path
356
 
357
  for uid, _ in cr_list:
358
+ docx_path, note = download_cr(uid, cr_dir, eol_user, eol_password)
 
359
  if docx_path:
360
  cr_paths[uid] = docx_path
361
+ print(f' [{uid}] OK ({note}) β€” {docx_path.name}')
362
+
363
+ n_cr_failed = len(cr_list) - len(cr_paths)
364
+ if n_cr_failed:
365
+ print(f' {len(cr_paths)}/{len(cr_list)} downloaded ({n_cr_failed} failed β€” details in warnings)')
366
+ else:
367
+ print(f' All {len(cr_list)} CR(s) downloaded successfully')
368
 
369
  # ── Step 3: Parse cover pages β†’ group by target TS ───────────────────────
370
  _section('Step 3 β€” Parsing CR cover pages')
 
397
  spec_dirs[(spec_number, version)] = spec_dir
398
 
399
  print(f' [TS {spec_number} v{version}] ', end='', flush=True)
400
+ filename, note = None, "not attempted"
401
+ for attempt in range(1, 4):
402
+ filename, note = download_ts(spec_number, version, spec_dir, eol_user, eol_password)
403
+ if filename:
404
+ break
405
+ if attempt < 3:
406
+ print(f'\n [attempt {attempt}/3 failed β€” retrying in 5s: {note}]', flush=True)
407
+ print(f' [TS {spec_number} v{version}] ', end='', flush=True)
408
+ time.sleep(5)
409
+ else:
410
+ print(f'\n [all 3 attempts failed]', flush=True)
411
  if filename:
412
  ts_paths[(spec_number, version)] = spec_dir / filename
413
  print(f'OK ({note}) β€” {spec_compact}/{filename}')
414
  else:
415
  print(f'FAILED β€” {note}')
416
 
417
+ # Write failed_ts.json (even when empty so app.py can detect "no failures")
418
+ failed_ts_entries = [
419
+ {
420
+ "spec_number": spec_number,
421
+ "version": version,
422
+ "spec_compact": spec_number.replace(' ', ''),
423
+ "spec_dir": str(spec_dirs[(spec_number, version)]),
424
+ "expected_filename": f"ts_{spec_number.replace(' ', '')}_v{version}.docx",
425
+ "cr_uids": ts_groups[(spec_number, version)],
426
+ "cr_dir": str(cr_dir),
427
+ }
428
+ for (spec_number, version) in ts_groups
429
+ if (spec_number, version) not in ts_paths
430
+ ]
431
+ (output_dir / "failed_ts.json").write_text(
432
+ json.dumps(failed_ts_entries, indent=2)
433
+ )
434
+
435
  # ── Steps 5 & 6: Apply CRs + Finalise each TS ────────────────────────────
436
  _section('Steps 5 & 6 β€” Applying CRs and Finalising Metadata')
437
  report = [] # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)
 
514
 
515
  for line in log_lines:
516
  print(f' {line}')
517
+ # Bubble every un-applied change into the warnings list
518
+ for line in log_lines:
519
+ if line.strip().startswith('ERROR'):
520
+ errors.append(line.strip())
521
  print(f' -> Applied: {n_ok} Skipped: {n_skip}')
522
 
523
  # 6. Finalise metadata (Change History, History, title paragraph)
 
545
  ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
546
  )
547
  print(f' [Change History] {uid}: {ch_cells}')
548
+ except NoChangeHistoryTable:
549
+ print(f' [Change History] {uid}: NOT PRESENT β€” this document has no Change History table (History table only)')
550
  except Exception as e:
551
  errors.append(f'[{uid}] Change History ERROR: {e}')
552
  print(f' [Change History] {uid}: ERROR β€” {e}')
scripts/ts_applicator.py CHANGED
@@ -33,11 +33,22 @@ from docx_helpers import (
33
  # ── Text normalisation ────────────────────────────────────────────────────────
34
 
35
  def _norm(text):
36
- """Normalise non-breaking spaces and common Unicode dashes for comparison."""
37
  return (text
38
- .replace('\xa0', ' ')
39
- .replace('\u2013', '-')
40
- .replace('\u2014', '-')
 
 
 
 
 
 
 
 
 
 
 
41
  .strip())
42
 
43
 
@@ -60,12 +71,53 @@ def _norm_ws(text):
60
  Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
61
  """
62
  base = (text
63
- .replace('\xa0', '')
 
 
 
 
 
 
64
  .replace('\u2013', '-')
65
- .replace('\u2014', '-'))
 
 
 
 
 
66
  return re.sub(r'\s+', '', base)
67
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  # ── Document search helpers ───────────────────────────────────────────────────
70
 
71
  def _full_para_text(para):
@@ -189,11 +241,22 @@ def _find_row(tbl, anchor_text):
189
  """
190
  Find first row in tbl where col-0 cell text contains anchor_text.
191
  Returns (row_idx, confidence) or (-1, 0.0).
192
- Three confidence levels: 1.0 exact, 0.9 norm, 0.8 whitespace-stripped.
 
 
 
 
 
 
 
 
 
193
  """
194
- norm_anchor = _norm(anchor_text)
195
- ws_anchor = _norm_ws(anchor_text)
 
196
  best = (-1, 0.0)
 
197
  for idx, row in enumerate(tbl.rows):
198
  cell0 = row.cells[0].text if row.cells else ''
199
  if anchor_text in cell0:
@@ -202,7 +265,42 @@ def _find_row(tbl, anchor_text):
202
  best = (idx, 0.9)
203
  elif ws_anchor and ws_anchor in _norm_ws(cell0) and best[1] < 0.8:
204
  best = (idx, 0.8)
205
- return best
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
 
208
  # ── vMerge row insertion ──────────────────────────────────────────────────────
@@ -329,7 +427,7 @@ def _apply_section_replace(doc, change, rev, author, date, log):
329
  break
330
 
331
  if ts_para_elem is None:
332
- log.append(f' SKIP section_replace: del_heading {del_heading!r} not found in TS')
333
  return False
334
 
335
  ts_body = ts_para_elem.getparent()
@@ -395,7 +493,7 @@ def _apply_text_replace(doc, change, rev, author, date, log):
395
  if loc['kind'] == 'table_cell':
396
  tbl, t_conf = _find_table(doc, loc['table_header'])
397
  if tbl is None:
398
- log.append(f" SKIP text_replace: table not found {loc['table_header'][:2]!r}")
399
  return False
400
  col_idx = loc['col_idx']
401
  row_anchor = loc['row_anchor']
@@ -403,11 +501,22 @@ def _apply_text_replace(doc, change, rev, author, date, log):
403
  if row_anchor:
404
  row_idx, r_conf = _find_row(tbl, row_anchor)
405
  if row_idx < 0:
406
- log.append(f" SKIP text_replace: row anchor not found {row_anchor!r}")
 
 
 
 
 
 
 
 
 
 
 
407
  return False
408
  row = tbl.rows[row_idx]
409
  if col_idx >= len(row.cells):
410
- log.append(f" SKIP text_replace: col_idx {col_idx} out of range")
411
  return False
412
  cell = row.cells[col_idx]
413
  for para in cell.paragraphs:
@@ -415,7 +524,7 @@ def _apply_text_replace(doc, change, rev, author, date, log):
415
  tracked_modify_para(para, old, new, rev, author, date)
416
  log.append(f" OK text_replace (table_cell row={row_idx} col={col_idx}): {old!r} β†’ {new!r}")
417
  return True
418
- log.append(f" SKIP text_replace: old text {old!r} not in cell (row={row_idx} col={col_idx})")
419
  return False
420
  else:
421
  # Empty row anchor: scan all rows in col_idx.
@@ -447,7 +556,7 @@ def _apply_text_replace(doc, change, rev, author, date, log):
447
  tracked_modify_para(para, old, new, rev, author, date)
448
  log.append(f" OK text_replace (table_cell any_col row={r_idx} col={c_idx}): {old!r} β†’ {new!r}")
449
  return True
450
- log.append(f" SKIP text_replace: old text {old!r} not found in any table column")
451
  return False
452
 
453
  elif loc['kind'] == 'body_para':
@@ -458,16 +567,16 @@ def _apply_text_replace(doc, change, rev, author, date, log):
458
  # Fall back: find by paragraph context
459
  para, conf = _find_para(doc, ctx, prefer_not_in_table=True)
460
  if para is None:
461
- log.append(f" SKIP text_replace: old text {old!r} not found in TS")
462
  return False
463
  if old in para.text:
464
  tracked_modify_para(para, old, new, rev, author, date)
465
  log.append(f" OK text_replace (body_para conf={conf:.1f}): {old!r} β†’ {new!r}")
466
  return True
467
- log.append(f" SKIP text_replace: old text {old!r} not in resolved paragraph")
468
  return False
469
 
470
- log.append(f" SKIP text_replace: unknown kind {loc['kind']!r}")
471
  return False
472
 
473
 
@@ -479,7 +588,7 @@ def _apply_para_insert(doc, change, rev, author, date, log):
479
 
480
  anchor_para, conf = _find_para(doc, anchor_text)
481
  if anchor_para is None:
482
- log.append(f" SKIP para_insert: anchor not found {anchor_text[:60]!r}")
483
  return False
484
 
485
  items = [(p['text'], p['style'] or 'Normal') for p in paras_data]
@@ -500,13 +609,13 @@ def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
500
  else:
501
  tbl, t_conf = _find_table(doc, loc['table_header'])
502
  if tbl is None:
503
- log.append(f" SKIP row_insert: table not found {loc['table_header'][:2]!r}")
504
  return False
505
 
506
  after_anchor = loc.get('after_row_anchor', '')
507
  row_idx, r_conf = _find_row(tbl, after_anchor)
508
  if row_idx < 0:
509
- log.append(f" SKIP row_insert: anchor row not found {after_anchor!r}")
510
  return False
511
 
512
  cells_data = change.get('cells', [])
 
33
  # ── Text normalisation ────────────────────────────────────────────────────────
34
 
35
  def _norm(text):
36
+ """Normalise common Unicode invisible/whitespace/punctuation variants for comparison."""
37
  return (text
38
+ .replace('\xa0', ' ') # non-breaking space
39
+ .replace('\u202f', ' ') # narrow no-break space
40
+ .replace('\u2007', ' ') # figure space
41
+ .replace('\u2060', '') # word joiner (invisible)
42
+ .replace('\u200b', '') # zero-width space
43
+ .replace('\u00ad', '') # soft hyphen (invisible)
44
+ .replace('\u2011', '-') # non-breaking hyphen
45
+ .replace('\u2013', '-') # en dash
46
+ .replace('\u2014', '-') # em dash
47
+ .replace('\u2212', '-') # minus sign
48
+ .replace('\u2018', "'") # left single quote
49
+ .replace('\u2019', "'") # right single quote
50
+ .replace('\u201c', '"') # left double quote
51
+ .replace('\u201d', '"') # right double quote
52
  .strip())
53
 
54
 
 
71
  Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
72
  """
73
  base = (text
74
+ .replace('\xa0', '')
75
+ .replace('\u202f', '')
76
+ .replace('\u2007', '')
77
+ .replace('\u2060', '')
78
+ .replace('\u200b', '')
79
+ .replace('\u00ad', '')
80
+ .replace('\u2011', '-')
81
  .replace('\u2013', '-')
82
+ .replace('\u2014', '-')
83
+ .replace('\u2212', '-')
84
+ .replace('\u2018', "'")
85
+ .replace('\u2019', "'")
86
+ .replace('\u201c', '"')
87
+ .replace('\u201d', '"'))
88
  return re.sub(r'\s+', '', base)
89
 
90
 
91
+ def _norm_alnum(text):
92
+ """Keep only lowercase alphanumeric characters β€” last-resort matching.
93
+
94
+ Strips all punctuation, spaces, and Unicode variants so that only the
95
+ raw word/number content is compared. Used as a confidence-0.6 fallback
96
+ in _find_row when even whitespace-stripped matching fails (e.g. different
97
+ bracket styles, quote variants, or punctuation differences between the CR
98
+ and the TS).
99
+ """
100
+ return re.sub(r'[^a-z0-9]', '', text.lower())
101
+
102
+
103
+ def _clean_prefix(text: str) -> str:
104
+ """Return the longest leading substring that contains only standard printable
105
+ ASCII characters (ord 32–126).
106
+
107
+ Non-breaking spaces, curly quotes, and other Unicode characters embedded
108
+ mid-text (e.g. between spec number components like 'TS\xa0102\xa0226')
109
+ make the full anchor unmatchable. The clean prefix β€” the part before the
110
+ first such character β€” is still reliable and specific enough to locate the
111
+ correct row.
112
+ """
113
+ end = 0
114
+ for ch in text:
115
+ if ord(ch) < 32 or ord(ch) > 126:
116
+ break
117
+ end += 1
118
+ return text[:end].strip()
119
+
120
+
121
  # ── Document search helpers ───────────────────────────────────────────────────
122
 
123
  def _full_para_text(para):
 
241
  """
242
  Find first row in tbl where col-0 cell text contains anchor_text.
243
  Returns (row_idx, confidence) or (-1, 0.0).
244
+
245
+ Matching levels, in order of confidence:
246
+ 1.0 — exact substring match
247
+ 0.9 — Unicode-normalised match (_norm: xa0, dashes, quotes, …)
248
+ 0.8 — whitespace-stripped match (_norm_ws: also removes tabs/newlines)
249
+ 0.6 — alphanumeric-only match (_norm_alnum: strips all non a-z0-9)
250
+ 0.55 — clean-prefix unique match: extract the leading ASCII-only part of
251
+ the anchor and find the single row that contains it.
252
+ 0.5 — clean-prefix + token-overlap: when multiple rows share the prefix,
253
+ pick the one whose col-0 tokens overlap most with the anchor tokens.
254
  """
255
+ norm_anchor = _norm(anchor_text)
256
+ ws_anchor = _norm_ws(anchor_text)
257
+ alnum_anchor = _norm_alnum(anchor_text)
258
  best = (-1, 0.0)
259
+
260
  for idx, row in enumerate(tbl.rows):
261
  cell0 = row.cells[0].text if row.cells else ''
262
  if anchor_text in cell0:
 
265
  best = (idx, 0.9)
266
  elif ws_anchor and ws_anchor in _norm_ws(cell0) and best[1] < 0.8:
267
  best = (idx, 0.8)
268
+ elif alnum_anchor and alnum_anchor in _norm_alnum(cell0) and best[1] < 0.6:
269
+ best = (idx, 0.6)
270
+
271
+ if best[0] >= 0:
272
+ return best
273
+
274
+ # ── Prefix-based partial match ─────────────────────────────────────────────
275
+ # The anchor may have Unicode chars embedded mid-text that prevent all string
276
+ # comparisons above from matching, even after normalisation (e.g. when the CR
277
+ # extracts '\xa0' between spec-number parts but the TS has different encoding).
278
+ # Strategy: use only the clean ASCII prefix of the anchor as the search key.
279
+ # If that prefix is found in exactly one row → we've uniquely identified it.
280
+ # If it appears in several rows → pick the one whose full token set overlaps
281
+ # most with the anchor's tokens (the user's described disambiguation rule).
282
+ prefix = _clean_prefix(anchor_text)
283
+ if prefix and len(prefix) > 8:
284
+ prefix_low = prefix.lower()
285
+ hits = [
286
+ idx for idx, row in enumerate(tbl.rows)
287
+ if row.cells and prefix_low in row.cells[0].text.lower()
288
+ ]
289
+ if len(hits) == 1:
290
+ return hits[0], 0.55
291
+ elif len(hits) > 1:
292
+ anchor_tokens = set(re.findall(r'[a-z0-9]+', anchor_text.lower()))
293
+ best_score, best_idx = -1, -1
294
+ for hit_idx in hits:
295
+ cell_tokens = set(re.findall(r'[a-z0-9]+',
296
+ tbl.rows[hit_idx].cells[0].text.lower()))
297
+ score = len(anchor_tokens & cell_tokens)
298
+ if score > best_score:
299
+ best_score, best_idx = score, hit_idx
300
+ if best_idx >= 0:
301
+ return best_idx, 0.5
302
+
303
+ return (-1, 0.0)
304
 
305
 
306
  # ── vMerge row insertion ──────────────────────────────────────────────────────
 
427
  break
428
 
429
  if ts_para_elem is None:
430
+ log.append(f' ERROR section_replace: del_heading {del_heading!r} not found in TS')
431
  return False
432
 
433
  ts_body = ts_para_elem.getparent()
 
493
  if loc['kind'] == 'table_cell':
494
  tbl, t_conf = _find_table(doc, loc['table_header'])
495
  if tbl is None:
496
+ log.append(f" ERROR text_replace: table not found {loc['table_header'][:2]!r}")
497
  return False
498
  col_idx = loc['col_idx']
499
  row_anchor = loc['row_anchor']
 
501
  if row_anchor:
502
  row_idx, r_conf = _find_row(tbl, row_anchor)
503
  if row_idx < 0:
504
+ # Primary table doesn't contain this row anchor β€” the CR may be
505
+ # targeting a different table than the one _find_table resolved.
506
+ # Try every other table in the document before giving up.
507
+ for alt_tbl in doc.tables:
508
+ if alt_tbl is tbl:
509
+ continue
510
+ row_idx, r_conf = _find_row(alt_tbl, row_anchor)
511
+ if row_idx >= 0:
512
+ tbl = alt_tbl
513
+ break
514
+ if row_idx < 0:
515
+ log.append(f" ERROR text_replace: row anchor not found {row_anchor!r}")
516
  return False
517
  row = tbl.rows[row_idx]
518
  if col_idx >= len(row.cells):
519
+ log.append(f" ERROR text_replace: col_idx {col_idx} out of range")
520
  return False
521
  cell = row.cells[col_idx]
522
  for para in cell.paragraphs:
 
524
  tracked_modify_para(para, old, new, rev, author, date)
525
  log.append(f" OK text_replace (table_cell row={row_idx} col={col_idx}): {old!r} β†’ {new!r}")
526
  return True
527
+ log.append(f" ERROR text_replace: old text {old!r} not in cell (row={row_idx} col={col_idx})")
528
  return False
529
  else:
530
  # Empty row anchor: scan all rows in col_idx.
 
556
  tracked_modify_para(para, old, new, rev, author, date)
557
  log.append(f" OK text_replace (table_cell any_col row={r_idx} col={c_idx}): {old!r} β†’ {new!r}")
558
  return True
559
+ log.append(f" ERROR text_replace: old text {old!r} not found in any table column")
560
  return False
561
 
562
  elif loc['kind'] == 'body_para':
 
567
  # Fall back: find by paragraph context
568
  para, conf = _find_para(doc, ctx, prefer_not_in_table=True)
569
  if para is None:
570
+ log.append(f" ERROR text_replace: old text {old!r} not found in TS")
571
  return False
572
  if old in para.text:
573
  tracked_modify_para(para, old, new, rev, author, date)
574
  log.append(f" OK text_replace (body_para conf={conf:.1f}): {old!r} β†’ {new!r}")
575
  return True
576
+ log.append(f" ERROR text_replace: old text {old!r} not in resolved paragraph")
577
  return False
578
 
579
+ log.append(f" ERROR text_replace: unknown kind {loc['kind']!r}")
580
  return False
581
 
582
 
 
588
 
589
  anchor_para, conf = _find_para(doc, anchor_text)
590
  if anchor_para is None:
591
+ log.append(f" ERROR para_insert: anchor not found {anchor_text[:60]!r}")
592
  return False
593
 
594
  items = [(p['text'], p['style'] or 'Normal') for p in paras_data]
 
609
  else:
610
  tbl, t_conf = _find_table(doc, loc['table_header'])
611
  if tbl is None:
612
+ log.append(f" ERROR row_insert: table not found {loc['table_header'][:2]!r}")
613
  return False
614
 
615
  after_anchor = loc.get('after_row_anchor', '')
616
  row_idx, r_conf = _find_row(tbl, after_anchor)
617
  if row_idx < 0:
618
+ log.append(f" ERROR row_insert: anchor row not found {after_anchor!r}")
619
  return False
620
 
621
  cells_data = change.get('cells', [])