Spaces:

andrehoffmann80
/

DOI

Sleeping

App Files Files Community

andrehoffmann80 committed 7 days ago

Commit

3f687cd

verified ·

1 Parent(s): 092f5ec

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +76 -34

src/streamlit_app.py CHANGED Viewed

@@ -64,18 +64,27 @@ def get_text(node, xpath, ns):
     return clean_text(elem.text) if elem is not None and elem.text else ""
-def build_dora_mods_url(repo_code: str, object_or_url: str) -> str:
     """
-    Erzeugt MODS-URL aus einer DORA-ID wie 'wsl:41900' via OAI-PMH.
-    Nutzt ausschließlich die öffentliche Schnittstelle (www.dora.lib4ri.ch).
     """
     if object_or_url.startswith("http://") or object_or_url.startswith("https://"):
-        return object_or_url
-    # Wandle z.B. "wsl:41891" in "wsl_41891" um für den OAI Identifier
-    oai_id = object_or_url.replace(":", "_")
-    return f"https://www.dora.lib4ri.ch/{repo_code}/oai2?verb=GetRecord&identifier=oai:dora:{oai_id}&metadataPrefix=mods"
 def build_persistent_url(repo_code: str, object_id: str) -> str:
@@ -88,27 +97,54 @@ def build_persistent_url(repo_code: str, object_id: str) -> str:
     return f"{public_base}/{repo_code}/item/{object_id}"
-def fetch_mods_xml(mods_url: str) -> etree._Element:
-    """Lädt eine MODS-Datei oder OAI-PMH von einer URL und gibt das MODS Root-Element zurück."""
-    resp = ROBUST_SESSION.get(mods_url, headers=HEADERS, timeout=60)
-    resp.raise_for_status()
-    # Use recover=True to handle malformed XML (e.g. unescaped HTML in notes)
-    parser = etree.XMLParser(recover=True, remove_blank_text=True)
-    root = etree.fromstring(resp.content, parser=parser)
-    # Falls es sich um eine OAI-PMH Antwort handelt, extrahiere den <mods:mods> Knoten
-    if "OAI-PMH" in root.tag:
-        ns = {
-            "oai": "http://www.openarchives.org/OAI/2.0/",
-            "mods": "http://www.loc.gov/mods/v3"
-        }
-        mods_node = root.find(".//mods:mods", namespaces=ns)
-        if mods_node is not None:
-            return mods_node
-        else:
-            raise ValueError(f"Kein MODS-Element in der OAI-PMH-Antwort gefunden: {mods_url}")
-    return root
 def parse_book_mods(book_root: etree._Element, repo_base_url: str) -> dict:
@@ -317,7 +353,7 @@ def build_doi_batch_xml(
     head = etree.SubElement(doi_batch, "head")
     etree.SubElement(head, "doi_batch_id").text = depositor_meta["doi_batch_id"]
-    ts = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
     etree.SubElement(head, "timestamp").text = ts
     depositor = etree.SubElement(head, "depositor")
@@ -576,6 +612,13 @@ def main():
     with col_config:
         st.markdown("#### Verbindung & Typ")
         repo_list = list(REPO_CONFIG.keys())
         repo_code = st.selectbox(
             "Repository-Code",
@@ -585,7 +628,7 @@ def main():
         )
         repo_config = REPO_CONFIG[repo_code]
-        repo_base_url = f"https://www.dora.lib4ri.ch/{repo_code}"
         pub_type = st.radio(
             "Publikationstyp",
@@ -616,9 +659,9 @@ def main():
         st.write("") # Spacer
         if st.button("Metadaten laden", type="primary"):
             try:
-                mods_url = build_dora_mods_url(repo_code, book_id_or_url)
-                st.info(f"Lade MODS von: {mods_url}")
-                book_root = fetch_mods_xml(mods_url)
                 meta = parse_book_mods(book_root, repo_base_url)
                 # --- Attempt to extract report number from MODS ---
@@ -854,9 +897,8 @@ def main():
                 line = line.strip()
                 if not line:
                     continue
-                mods_url = build_dora_mods_url(repo_code, line)
-                st.write(f"Lade Kapitel-MODS von: {mods_url}")
-                mods_root = fetch_mods_xml(mods_url)
                 ci, page_no = mods_to_content_item(mods_root, repo_base_url)
                 chapter_items.append((ci, page_no))

     return clean_text(elem.text) if elem is not None and elem.text else ""
def build_dora_mods_url(repo_code: str, object_or_url: str, host: str = "www.dora.lib4ri.ch") -> list[str]:
    """Build the list of candidate MODS URLs for a DORA object.

    Args:
        repo_code: Repository path segment used in the URL, e.g. ``"wsl"``.
        object_or_url: Either a DORA object ID such as ``"wsl:41900"`` or a
            complete ``http(s)://`` URL.
        host: Host name the generated URLs point to.

    Returns:
        A one-element list holding the given URL (surrounding whitespace
        removed) when a full URL was passed. Otherwise two URLs, in the order
        they should be tried: the direct MODS datastream, then the OAI-PMH
        ``GetRecord`` request.
    """
    # Imported locally so the function does not depend on a module-level import.
    from urllib.parse import quote

    # Strip before the scheme check: a pasted value with leading whitespace
    # must still be recognised as a URL instead of being treated as an ID.
    candidate = object_or_url.strip()
    if candidate.lower().startswith(("http://", "https://")):
        return [candidate]

    # The ID is placed in a URL path, so it is percent-encoded (':' -> '%3A').
    encoded_id = quote(candidate)
    # The OAI identifier uses '_' where the object ID uses ':'.
    oai_id = candidate.replace(":", "_")

    # Variant 1: direct datastream (original author's note: often more robust
    # against IP blocking).
    direct_url = f"https://{host}/{repo_code}/islandora/object/{encoded_id}/datastream/MODS/view"
    # Variant 2: OAI-PMH
    oai_url = f"https://{host}/{repo_code}/oai2?verb=GetRecord&identifier=oai:dora:{oai_id}&metadataPrefix=mods"
    return [direct_url, oai_url]
 def build_persistent_url(repo_code: str, object_id: str) -> str:
     return f"{public_base}/{repo_code}/item/{object_id}"
def fetch_mods_xml(mods_urls: list[str]) -> etree._Element:
    """Try the candidate URLs in order and return the first MODS element found.

    Every URL is requested up to twice: first with the module-level
    ``HEADERS``, then with no extra headers in case the server rejects the
    first variant.

    Args:
        mods_urls: Candidate URLs, tried in the given order.

    Returns:
        The ``<mods:mods>`` element taken from an OAI-PMH response, or the
        document root itself when its tag contains "mods".

    Raises:
        ValueError: If no attempt produced a MODS element. Before raising, a
            per-attempt failure report is written to the Streamlit page.
    """
    ns = {
        "oai": "http://www.openarchives.org/OAI/2.0/",
        "mods": "http://www.loc.gov/mods/v3",
    }
    errors: list[str] = []

    for url in mods_urls:
        # Attempt 1: with the browser-like headers.
        # Attempt 2: without special headers (in case the server is picky).
        for use_headers in (True, False):
            current_headers = HEADERS if use_headers else {}
            header_label = "Browser-Header" if use_headers else "Standard-Header"
            try:
                resp = ROBUST_SESSION.get(url, headers=current_headers, timeout=30)
                if resp.status_code != 200:
                    errors.append(f"FAILED ({header_label}): {url} -> HTTP {resp.status_code}")
                    continue

                # recover=True lets the parser cope with malformed XML.
                parser = etree.XMLParser(recover=True, remove_blank_text=True)
                root = etree.fromstring(resp.content, parser=parser)

                # A recovering parser can yield no root at all for content it
                # cannot salvage; report that explicitly instead of failing
                # on ``root.tag`` below.
                if root is None:
                    errors.append(f"EMPTY: {url} (Antwort enthielt kein auswertbares XML)")
                    continue

                # For an OAI-PMH response, extract the nested <mods:mods> node.
                if "OAI-PMH" in root.tag:
                    mods_node = root.find(".//mods:mods", namespaces=ns)
                    if mods_node is not None:
                        return mods_node
                    errors.append(f"EMPTY: {url} (OAI-PMH Antwort enthielt kein <mods:mods>)")
                    continue

                # Otherwise the root is already the MODS element (or a
                # MODS-like node).
                if "mods" in root.tag.lower():
                    return root
                errors.append(f"UNRECOGNIZED: {url} (Wurzel-Element '{root.tag}' ist kein MODS)")
            except Exception as e:
                # Deliberately broad: any network or parsing failure should
                # fall through to the next attempt rather than abort the loop.
                errors.append(f"ERROR ({header_label}): {url} -> {e}")
                continue

    # Reaching this point means every attempt failed.
    st.error("### Ernte-Bericht (Alle Versuche fehlgeschlagen)")
    for err in errors:
        st.write(f"- {err}")
    raise ValueError("Metadaten konnten von keiner der verfügbaren URLs geladen werden.")
 def parse_book_mods(book_root: etree._Element, repo_base_url: str) -> dict:
     head = etree.SubElement(doi_batch, "head")
     etree.SubElement(head, "doi_batch_id").text = depositor_meta["doi_batch_id"]
+    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
     etree.SubElement(head, "timestamp").text = ts
     depositor = etree.SubElement(head, "depositor")
     with col_config:
         st.markdown("#### Verbindung & Typ")
+        dora_host = st.selectbox(
+            "DORA Host",
+            options=["www.dora.lib4ri.ch", "admin.dora.lib4ri.ch"],
+            index=0,
+            help="Nutze 'www' für externen Zugriff (Hugging Face) und 'admin' für institutsweiten Zugriff."
+        )
         repo_list = list(REPO_CONFIG.keys())
         repo_code = st.selectbox(
             "Repository-Code",
         )
         repo_config = REPO_CONFIG[repo_code]
+        repo_base_url = f"https://{dora_host}/{repo_code}"
         pub_type = st.radio(
             "Publikationstyp",
         st.write("") # Spacer
         if st.button("Metadaten laden", type="primary"):
             try:
+                mods_urls = build_dora_mods_url(repo_code, book_id_or_url, host=dora_host)
+                st.info(f"Suche MODS-Metadaten auf: {dora_host}...")
+                book_root = fetch_mods_xml(mods_urls)
                 meta = parse_book_mods(book_root, repo_base_url)
                 # --- Attempt to extract report number from MODS ---
                 line = line.strip()
                 if not line:
                     continue
+                mods_urls = build_dora_mods_url(repo_code, line, host=dora_host)
+                mods_root = fetch_mods_xml(mods_urls)
                 ci, page_no = mods_to_content_item(mods_root, repo_base_url)
                 chapter_items.append((ci, page_no))