Spaces:

OrganizedProgrammers
/

DocFinder

Running

App Files Community

heymenn committed on 18 days ago

Commit

801e72f

1 Parent(s): 8f93b05

show more info to catch errors on etsi login page

Browse files

Files changed (1) hide show

classes.py +67 -54

classes.py CHANGED Viewed

@@ -4,6 +4,7 @@ import re
 from bs4 import BeautifulSoup
 import os
 import json
 def _get_proxies() -> dict:
     """Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
@@ -200,24 +201,24 @@ class ETSISpecFinder:
         return f"Specification {doc_id} not found"
-    def _get_wki_id(self, doc_id: str, version: str = None) -> str:
-        """Return the ETSI portal wki_id for a spec version, or None if not found."""
         if version:
             version_str = version
         else:
             # Derive version from the FTP PDF URL
             pdf_url = self.search_document(doc_id)
             if "not found" in pdf_url.lower():
-                return None
-            # URL path: .../18.04.00_60/ts_...p.pdf  →  folder is parts[-2]
             parts = pdf_url.rstrip("/").split("/")
             version_folder = parts[-2]          # e.g. "18.04.00_60"
             v_parts = version_folder.split("_")[0].split(".")  # ["18", "04", "00"]
             try:
                 version_str = f"{int(v_parts[0])}.{int(v_parts[1])}.{int(v_parts[2])}"
             except (ValueError, IndexError):
-                return None
         for spec_type in ["TS", "TR"]:
             params = {
                 "option": "com_standardssearch",
@@ -234,10 +235,10 @@ class ETSISpecFinder:
                                     proxies=_get_proxies())
                 data = resp.json()
                 if data and isinstance(data, list):
-                    return str(data[0]["wki_id"])
             except Exception as e:
                 print(f"Error getting wki_id for {doc_id}: {e}")
-        return None
     def _authenticate_eol(self, wki_id: str) -> requests.Session:
         """Create a requests.Session authenticated to the ETSI EOL portal."""
@@ -268,52 +269,64 @@ class ETSISpecFinder:
     def search_document_docx(self, doc_id: str, version: str = None) -> str:
         """Download an ETSI spec as DOCX and return the local file path."""
-        wki_id = self._get_wki_id(doc_id, version)
-        if not wki_id:
             return f"Specification {doc_id} not found"
-        session = self._authenticate_eol(wki_id)
-        # NTaccount.asp → parse profile_id from meta-refresh
-        r = session.get(
-            f"https://portal.etsi.org/webapp/protect/NTaccount.asp?Wki_Id={wki_id}",
-            verify=False, timeout=15,
-        )
-        meta_match = re.search(r'URL=([^"\'\s>]+)', r.text)
-        if not meta_match:
-            return f"Specification {doc_id}: authentication failed"
-        meta_url = meta_match.group(1)
-        if not meta_url.startswith("http"):
-            meta_url = f"https://portal.etsi.org/webapp/protect/{meta_url}"
-        # CheckIdentifier → 302 to copy_file
-        r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
-        if r2.status_code != 302:
-            return f"Specification {doc_id}: download chain failed"
-        # copy_file (may have a second redirect)
-        copy_url = "https://portal.etsi.org" + r2.headers["Location"]
-        r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)
-        if r3.status_code == 302:
-            final_url = "https://portal.etsi.org/webapp/ewp/" + r3.headers["Location"]
-            r4 = session.get(final_url, verify=False, timeout=15)
-        else:
-            r4 = r3
-        # Extract DOCX link
-        docx_urls = re.findall(r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE)
-        if not docx_urls:
-            return f"Specification {doc_id}: DOCX not available"
-        docx_url = docx_urls[0]
-        # Download
-        dl = session.get(docx_url, headers={"Referer": r4.url}, verify=False, timeout=60)
-        filename = docx_url.split("/")[-1]
-        tmp_path = f"/tmp/{filename}"
-        with open(tmp_path, "wb") as f:
-            f.write(dl.content)
-        return tmp_path

 from bs4 import BeautifulSoup
 import os
 import json
+from urllib.parse import urljoin
 def _get_proxies() -> dict:
     """Return a requests-compatible proxies dict from $http_proxy / $HTTP_PROXY."""
         return f"Specification {doc_id} not found"
+    def _get_wki_id_candidates(self, doc_id: str, version: str = None) -> list:
+        """Return a list of candidate wki_ids for a spec version (best match first)."""
         if version:
             version_str = version
         else:
             # Derive version from the FTP PDF URL
             pdf_url = self.search_document(doc_id)
             if "not found" in pdf_url.lower():
+                return []
             parts = pdf_url.rstrip("/").split("/")
             version_folder = parts[-2]          # e.g. "18.04.00_60"
             v_parts = version_folder.split("_")[0].split(".")  # ["18", "04", "00"]
             try:
                 version_str = f"{int(v_parts[0])}.{int(v_parts[1])}.{int(v_parts[2])}"
             except (ValueError, IndexError):
+                return []
+        candidates = []
         for spec_type in ["TS", "TR"]:
             params = {
                 "option": "com_standardssearch",
                                     proxies=_get_proxies())
                 data = resp.json()
                 if data and isinstance(data, list):
+                    candidates.extend(str(item["wki_id"]) for item in data if "wki_id" in item)
             except Exception as e:
                 print(f"Error getting wki_id for {doc_id}: {e}")
+        return candidates
     def _authenticate_eol(self, wki_id: str) -> requests.Session:
         """Create a requests.Session authenticated to the ETSI EOL portal."""
     def search_document_docx(self, doc_id: str, version: str = None) -> str:
         """Download an ETSI spec as DOCX and return the local file path."""
+        candidates = self._get_wki_id_candidates(doc_id, version)
+        if not candidates:
             return f"Specification {doc_id} not found"
+        for wki_id in candidates:
+            print(f"Trying wki_id={wki_id} for {doc_id}")
+            session = self._authenticate_eol(wki_id)
+            # NTaccount.asp → parse profile_id from meta-refresh
+            r = session.get(
+                f"https://portal.etsi.org/webapp/protect/NTaccount.asp?Wki_Id={wki_id}",
+                verify=False, timeout=15,
+            )
+            meta_match = re.search(r'URL=([^"\'\s>]+)', r.text)
+            if not meta_match:
+                print(f"  wki_id={wki_id}: authentication failed, trying next")
+                continue
+            meta_url = meta_match.group(1)
+            if not meta_url.startswith("http"):
+                meta_url = f"https://portal.etsi.org/webapp/protect/{meta_url}"
+            # CheckIdentifier → 302 to copy_file
+            r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
+            if r2.status_code != 302:
+                print(f"  wki_id={wki_id}: unexpected status {r2.status_code}, trying next")
+                continue
+            location2 = r2.headers.get("Location", "")
+            if "processError" in location2 or "processErrors" in location2:
+                print(f"  wki_id={wki_id}: portal rejected ({location2}), trying next")
+                continue
+            # copy_file (may have a second redirect)
+            copy_url = urljoin("https://portal.etsi.org/", location2)
+            r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)
+            if r3.status_code == 302:
+                location3 = r3.headers.get("Location", "")
+                final_url = urljoin("https://portal.etsi.org/webapp/ewp/", location3)
+                r4 = session.get(final_url, verify=False, timeout=15)
+            else:
+                r4 = r3
+            # Extract DOCX link
+            docx_urls = re.findall(r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE)
+            if not docx_urls:
+                print(f"  wki_id={wki_id}: DOCX not found in page, trying next")
+                continue
+            docx_url = docx_urls[0]
+            dl = session.get(docx_url, headers={"Referer": r4.url}, verify=False, timeout=60)
+            filename = docx_url.split("/")[-1]
+            tmp_path = f"/tmp/{filename}"
+            with open(tmp_path, "wb") as f:
+                f.write(dl.content)
+            print(f"  wki_id={wki_id}: success")
+            return tmp_path
+        return f"Specification {doc_id}: all {len(candidates)} wki_id candidate(s) rejected by ETSI portal"