Spaces:
Running
Running
show more info to catch errors on etsi login page
Browse files- classes.py +67 -54
classes.py
CHANGED
|
@@ -4,6 +4,7 @@ import re
|
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
import os
|
| 6 |
import json
|
|
|
|
| 7 |
|
| 8 |
def _get_proxies() -> dict:
|
| 9 |
"""Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
|
|
@@ -200,24 +201,24 @@ class ETSISpecFinder:
|
|
| 200 |
|
| 201 |
return f"Specification {doc_id} not found"
|
| 202 |
|
| 203 |
-
def
|
| 204 |
-
"""Return
|
| 205 |
if version:
|
| 206 |
version_str = version
|
| 207 |
else:
|
| 208 |
# Derive version from the FTP PDF URL
|
| 209 |
pdf_url = self.search_document(doc_id)
|
| 210 |
if "not found" in pdf_url.lower():
|
| 211 |
-
return
|
| 212 |
-
# URL path: .../18.04.00_60/ts_...p.pdf → folder is parts[-2]
|
| 213 |
parts = pdf_url.rstrip("/").split("/")
|
| 214 |
version_folder = parts[-2] # e.g. "18.04.00_60"
|
| 215 |
v_parts = version_folder.split("_")[0].split(".") # ["18", "04", "00"]
|
| 216 |
try:
|
| 217 |
version_str = f"{int(v_parts[0])}.{int(v_parts[1])}.{int(v_parts[2])}"
|
| 218 |
except (ValueError, IndexError):
|
| 219 |
-
return
|
| 220 |
|
|
|
|
| 221 |
for spec_type in ["TS", "TR"]:
|
| 222 |
params = {
|
| 223 |
"option": "com_standardssearch",
|
|
@@ -234,10 +235,10 @@ class ETSISpecFinder:
|
|
| 234 |
proxies=_get_proxies())
|
| 235 |
data = resp.json()
|
| 236 |
if data and isinstance(data, list):
|
| 237 |
-
|
| 238 |
except Exception as e:
|
| 239 |
print(f"Error getting wki_id for {doc_id}: {e}")
|
| 240 |
-
return
|
| 241 |
|
| 242 |
def _authenticate_eol(self, wki_id: str) -> requests.Session:
|
| 243 |
"""Create a requests.Session authenticated to the ETSI EOL portal."""
|
|
@@ -268,52 +269,64 @@ class ETSISpecFinder:
|
|
| 268 |
|
| 269 |
def search_document_docx(self, doc_id: str, version: str = None) -> str:
|
| 270 |
"""Download an ETSI spec as DOCX and return the local file path."""
|
| 271 |
-
|
| 272 |
-
if not
|
| 273 |
return f"Specification {doc_id} not found"
|
| 274 |
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
import os
|
| 6 |
import json
|
| 7 |
+
from urllib.parse import urljoin
|
| 8 |
|
| 9 |
def _get_proxies() -> dict:
|
| 10 |
"""Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
|
|
|
|
| 201 |
|
| 202 |
return f"Specification {doc_id} not found"
|
| 203 |
|
| 204 |
+
def _get_wki_id_candidates(self, doc_id: str, version: str = None) -> list:
|
| 205 |
+
"""Return a list of candidate wki_ids for a spec version (best match first)."""
|
| 206 |
if version:
|
| 207 |
version_str = version
|
| 208 |
else:
|
| 209 |
# Derive version from the FTP PDF URL
|
| 210 |
pdf_url = self.search_document(doc_id)
|
| 211 |
if "not found" in pdf_url.lower():
|
| 212 |
+
return []
|
|
|
|
| 213 |
parts = pdf_url.rstrip("/").split("/")
|
| 214 |
version_folder = parts[-2] # e.g. "18.04.00_60"
|
| 215 |
v_parts = version_folder.split("_")[0].split(".") # ["18", "04", "00"]
|
| 216 |
try:
|
| 217 |
version_str = f"{int(v_parts[0])}.{int(v_parts[1])}.{int(v_parts[2])}"
|
| 218 |
except (ValueError, IndexError):
|
| 219 |
+
return []
|
| 220 |
|
| 221 |
+
candidates = []
|
| 222 |
for spec_type in ["TS", "TR"]:
|
| 223 |
params = {
|
| 224 |
"option": "com_standardssearch",
|
|
|
|
| 235 |
proxies=_get_proxies())
|
| 236 |
data = resp.json()
|
| 237 |
if data and isinstance(data, list):
|
| 238 |
+
candidates.extend(str(item["wki_id"]) for item in data if "wki_id" in item)
|
| 239 |
except Exception as e:
|
| 240 |
print(f"Error getting wki_id for {doc_id}: {e}")
|
| 241 |
+
return candidates
|
| 242 |
|
| 243 |
def _authenticate_eol(self, wki_id: str) -> requests.Session:
|
| 244 |
"""Create a requests.Session authenticated to the ETSI EOL portal."""
|
|
|
|
| 269 |
|
| 270 |
def search_document_docx(self, doc_id: str, version: str = None) -> str:
    """Download an ETSI spec as DOCX and return the local file path.

    Tries every candidate wki_id for (doc_id, version) in turn, walking the
    ETSI EOL portal's redirect chain (NTaccount.asp meta-refresh →
    CheckIdentifier 302 → copy_file page) until a page containing a .docx
    link is reached, then downloads it to /tmp.

    Args:
        doc_id: ETSI document identifier (e.g. "TS 123 501").
        version: Optional explicit version string; when None the version is
            derived from the public FTP PDF URL by the candidate lookup.

    Returns:
        The local path of the downloaded DOCX on success, otherwise a
        human-readable error string ("... not found" / "... rejected ...").
    """
    candidates = self._get_wki_id_candidates(doc_id, version)
    if not candidates:
        return f"Specification {doc_id} not found"

    for wki_id in candidates:
        print(f"Trying wki_id={wki_id} for {doc_id}")
        session = self._authenticate_eol(wki_id)

        # NTaccount.asp → parse profile_id from meta-refresh.
        # NOTE(review): verify=False disables TLS validation portal-wide —
        # confirm it is actually required for portal.etsi.org.
        r = session.get(
            f"https://portal.etsi.org/webapp/protect/NTaccount.asp?Wki_Id={wki_id}",
            verify=False, timeout=15,
        )
        meta_match = re.search(r'URL=([^"\'\s>]+)', r.text)
        if not meta_match:
            print(f" wki_id={wki_id}: authentication failed, trying next")
            continue

        meta_url = meta_match.group(1)
        if not meta_url.startswith("http"):
            meta_url = f"https://portal.etsi.org/webapp/protect/{meta_url}"

        # CheckIdentifier → 302 to copy_file
        r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
        if r2.status_code != 302:
            print(f" wki_id={wki_id}: unexpected status {r2.status_code}, trying next")
            continue

        location2 = r2.headers.get("Location", "")
        if "processError" in location2 or "processErrors" in location2:
            print(f" wki_id={wki_id}: portal rejected ({location2}), trying next")
            continue

        # copy_file (may have a second redirect)
        copy_url = urljoin("https://portal.etsi.org/", location2)
        r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)

        if r3.status_code == 302:
            location3 = r3.headers.get("Location", "")
            final_url = urljoin("https://portal.etsi.org/webapp/ewp/", location3)
            r4 = session.get(final_url, verify=False, timeout=15)
        else:
            r4 = r3

        # Extract DOCX link from the landing page
        docx_urls = re.findall(r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE)
        if not docx_urls:
            print(f" wki_id={wki_id}: DOCX not found in page, trying next")
            continue

        # FIX: hrefs scraped from the page may be relative — resolve them
        # against the page URL before fetching.
        docx_url = urljoin(r4.url, docx_urls[0])
        dl = session.get(docx_url, headers={"Referer": r4.url}, verify=False, timeout=60)
        # FIX: don't save an error page as if it were the spec — fall
        # through to the next candidate on a failed download.
        if dl.status_code != 200:
            print(f" wki_id={wki_id}: DOCX download failed ({dl.status_code}), trying next")
            continue

        filename = docx_url.split("/")[-1]
        # FIX: was f"/tmp/(unknown)" — a leaked placeholder that ignored the
        # derived filename and clobbered the same path on every call.
        tmp_path = f"/tmp/{filename}"
        with open(tmp_path, "wb") as f:
            f.write(dl.content)

        print(f" wki_id={wki_id}: success")
        return tmp_path

    return f"Specification {doc_id}: all {len(candidates)} wki_id candidate(s) rejected by ETSI portal"
|