heymenn committed on
Commit
801e72f
·
1 Parent(s): 8f93b05

show more info to catch errors on etsi login page

Browse files
Files changed (1) hide show
  1. classes.py +67 -54
classes.py CHANGED
@@ -4,6 +4,7 @@ import re
4
  from bs4 import BeautifulSoup
5
  import os
6
  import json
 
7
 
8
  def _get_proxies() -> dict:
9
  """Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
@@ -200,24 +201,24 @@ class ETSISpecFinder:
200
 
201
  return f"Specification {doc_id} not found"
202
 
203
- def _get_wki_id(self, doc_id: str, version: str = None) -> str:
204
- """Return the ETSI portal wki_id for a spec version, or None if not found."""
205
  if version:
206
  version_str = version
207
  else:
208
  # Derive version from the FTP PDF URL
209
  pdf_url = self.search_document(doc_id)
210
  if "not found" in pdf_url.lower():
211
- return None
212
- # URL path: .../18.04.00_60/ts_...p.pdf → folder is parts[-2]
213
  parts = pdf_url.rstrip("/").split("/")
214
  version_folder = parts[-2] # e.g. "18.04.00_60"
215
  v_parts = version_folder.split("_")[0].split(".") # ["18", "04", "00"]
216
  try:
217
  version_str = f"{int(v_parts[0])}.{int(v_parts[1])}.{int(v_parts[2])}"
218
  except (ValueError, IndexError):
219
- return None
220
 
 
221
  for spec_type in ["TS", "TR"]:
222
  params = {
223
  "option": "com_standardssearch",
@@ -234,10 +235,10 @@ class ETSISpecFinder:
234
  proxies=_get_proxies())
235
  data = resp.json()
236
  if data and isinstance(data, list):
237
- return str(data[0]["wki_id"])
238
  except Exception as e:
239
  print(f"Error getting wki_id for {doc_id}: {e}")
240
- return None
241
 
242
  def _authenticate_eol(self, wki_id: str) -> requests.Session:
243
  """Create a requests.Session authenticated to the ETSI EOL portal."""
@@ -268,52 +269,64 @@ class ETSISpecFinder:
268
 
269
  def search_document_docx(self, doc_id: str, version: str = None) -> str:
270
  """Download an ETSI spec as DOCX and return the local file path."""
271
- wki_id = self._get_wki_id(doc_id, version)
272
- if not wki_id:
273
  return f"Specification {doc_id} not found"
274
 
275
- session = self._authenticate_eol(wki_id)
276
-
277
- # NTaccount.asp → parse profile_id from meta-refresh
278
- r = session.get(
279
- f"https://portal.etsi.org/webapp/protect/NTaccount.asp?Wki_Id={wki_id}",
280
- verify=False, timeout=15,
281
- )
282
- meta_match = re.search(r'URL=([^"\'\s>]+)', r.text)
283
- if not meta_match:
284
- return f"Specification {doc_id}: authentication failed"
285
-
286
- meta_url = meta_match.group(1)
287
- if not meta_url.startswith("http"):
288
- meta_url = f"https://portal.etsi.org/webapp/protect/{meta_url}"
289
-
290
- # CheckIdentifier → 302 to copy_file
291
- r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
292
- if r2.status_code != 302:
293
- return f"Specification {doc_id}: download chain failed"
294
-
295
- # copy_file (may have a second redirect)
296
- copy_url = "https://portal.etsi.org" + r2.headers["Location"]
297
- r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)
298
-
299
- if r3.status_code == 302:
300
- final_url = "https://portal.etsi.org/webapp/ewp/" + r3.headers["Location"]
301
- r4 = session.get(final_url, verify=False, timeout=15)
302
- else:
303
- r4 = r3
304
-
305
- # Extract DOCX link
306
- docx_urls = re.findall(r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE)
307
- if not docx_urls:
308
- return f"Specification {doc_id}: DOCX not available"
309
-
310
- docx_url = docx_urls[0]
311
-
312
- # Download
313
- dl = session.get(docx_url, headers={"Referer": r4.url}, verify=False, timeout=60)
314
- filename = docx_url.split("/")[-1]
315
- tmp_path = f"/tmp/{filename}"
316
- with open(tmp_path, "wb") as f:
317
- f.write(dl.content)
318
-
319
- return tmp_path
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from bs4 import BeautifulSoup
5
  import os
6
  import json
7
+ from urllib.parse import urljoin
8
 
9
  def _get_proxies() -> dict:
10
  """Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
 
201
 
202
  return f"Specification {doc_id} not found"
203
 
204
+ def _get_wki_id_candidates(self, doc_id: str, version: str = None) -> list:
205
+ """Return a list of candidate wki_ids for a spec version (best match first)."""
206
  if version:
207
  version_str = version
208
  else:
209
  # Derive version from the FTP PDF URL
210
  pdf_url = self.search_document(doc_id)
211
  if "not found" in pdf_url.lower():
212
+ return []
 
213
  parts = pdf_url.rstrip("/").split("/")
214
  version_folder = parts[-2] # e.g. "18.04.00_60"
215
  v_parts = version_folder.split("_")[0].split(".") # ["18", "04", "00"]
216
  try:
217
  version_str = f"{int(v_parts[0])}.{int(v_parts[1])}.{int(v_parts[2])}"
218
  except (ValueError, IndexError):
219
+ return []
220
 
221
+ candidates = []
222
  for spec_type in ["TS", "TR"]:
223
  params = {
224
  "option": "com_standardssearch",
 
235
  proxies=_get_proxies())
236
  data = resp.json()
237
  if data and isinstance(data, list):
238
+ candidates.extend(str(item["wki_id"]) for item in data if "wki_id" in item)
239
  except Exception as e:
240
  print(f"Error getting wki_id for {doc_id}: {e}")
241
+ return candidates
242
 
243
  def _authenticate_eol(self, wki_id: str) -> requests.Session:
244
  """Create a requests.Session authenticated to the ETSI EOL portal."""
 
269
 
270
  def search_document_docx(self, doc_id: str, version: str = None) -> str:
271
  """Download an ETSI spec as DOCX and return the local file path."""
272
+ candidates = self._get_wki_id_candidates(doc_id, version)
273
+ if not candidates:
274
  return f"Specification {doc_id} not found"
275
 
276
+ for wki_id in candidates:
277
+ print(f"Trying wki_id={wki_id} for {doc_id}")
278
+ session = self._authenticate_eol(wki_id)
279
+
280
+ # NTaccount.asp → parse profile_id from meta-refresh
281
+ r = session.get(
282
+ f"https://portal.etsi.org/webapp/protect/NTaccount.asp?Wki_Id={wki_id}",
283
+ verify=False, timeout=15,
284
+ )
285
+ meta_match = re.search(r'URL=([^"\'\s>]+)', r.text)
286
+ if not meta_match:
287
+ print(f" wki_id={wki_id}: authentication failed, trying next")
288
+ continue
289
+
290
+ meta_url = meta_match.group(1)
291
+ if not meta_url.startswith("http"):
292
+ meta_url = f"https://portal.etsi.org/webapp/protect/{meta_url}"
293
+
294
+ # CheckIdentifier 302 to copy_file
295
+ r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
296
+ if r2.status_code != 302:
297
+ print(f" wki_id={wki_id}: unexpected status {r2.status_code}, trying next")
298
+ continue
299
+
300
+ location2 = r2.headers.get("Location", "")
301
+ if "processError" in location2 or "processErrors" in location2:
302
+ print(f" wki_id={wki_id}: portal rejected ({location2}), trying next")
303
+ continue
304
+
305
+ # copy_file (may have a second redirect)
306
+ copy_url = urljoin("https://portal.etsi.org/", location2)
307
+ r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)
308
+
309
+ if r3.status_code == 302:
310
+ location3 = r3.headers.get("Location", "")
311
+ final_url = urljoin("https://portal.etsi.org/webapp/ewp/", location3)
312
+ r4 = session.get(final_url, verify=False, timeout=15)
313
+ else:
314
+ r4 = r3
315
+
316
+ # Extract DOCX link
317
+ docx_urls = re.findall(r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE)
318
+ if not docx_urls:
319
+ print(f" wki_id={wki_id}: DOCX not found in page, trying next")
320
+ continue
321
+
322
+ docx_url = docx_urls[0]
323
+ dl = session.get(docx_url, headers={"Referer": r4.url}, verify=False, timeout=60)
324
+ filename = docx_url.split("/")[-1]
325
+ tmp_path = f"/tmp/{filename}"
326
+ with open(tmp_path, "wb") as f:
327
+ f.write(dl.content)
328
+
329
+ print(f" wki_id={wki_id}: success")
330
+ return tmp_path
331
+
332
+ return f"Specification {doc_id}: all {len(candidates)} wki_id candidate(s) rejected by ETSI portal"