Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +76 -34
src/streamlit_app.py
CHANGED
|
@@ -64,18 +64,27 @@ def get_text(node, xpath, ns):
|
|
| 64 |
return clean_text(elem.text) if elem is not None and elem.text else ""
|
| 65 |
|
| 66 |
|
| 67 |
-
def build_dora_mods_url(repo_code: str, object_or_url: str) -> str:
|
| 68 |
"""
|
| 69 |
-
Erzeugt
|
| 70 |
-
|
| 71 |
"""
|
| 72 |
if object_or_url.startswith("http://") or object_or_url.startswith("https://"):
|
| 73 |
-
return object_or_url
|
| 74 |
|
| 75 |
-
#
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
|
| 81 |
def build_persistent_url(repo_code: str, object_id: str) -> str:
|
|
@@ -88,27 +97,54 @@ def build_persistent_url(repo_code: str, object_id: str) -> str:
|
|
| 88 |
return f"{public_base}/{repo_code}/item/{object_id}"
|
| 89 |
|
| 90 |
|
| 91 |
-
def fetch_mods_xml(
|
| 92 |
-
"""
|
| 93 |
-
|
| 94 |
-
resp.raise_for_status()
|
| 95 |
-
# Use recover=True to handle malformed XML (e.g. unescaped HTML in notes)
|
| 96 |
-
parser = etree.XMLParser(recover=True, remove_blank_text=True)
|
| 97 |
-
root = etree.fromstring(resp.content, parser=parser)
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
mods_node = root.find(".//mods:mods", namespaces=ns)
|
| 106 |
-
if mods_node is not None:
|
| 107 |
-
return mods_node
|
| 108 |
-
else:
|
| 109 |
-
raise ValueError(f"Kein MODS-Element in der OAI-PMH-Antwort gefunden: {mods_url}")
|
| 110 |
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
|
| 114 |
def parse_book_mods(book_root: etree._Element, repo_base_url: str) -> dict:
|
|
@@ -317,7 +353,7 @@ def build_doi_batch_xml(
|
|
| 317 |
head = etree.SubElement(doi_batch, "head")
|
| 318 |
etree.SubElement(head, "doi_batch_id").text = depositor_meta["doi_batch_id"]
|
| 319 |
|
| 320 |
-
ts = datetime.datetime.
|
| 321 |
etree.SubElement(head, "timestamp").text = ts
|
| 322 |
|
| 323 |
depositor = etree.SubElement(head, "depositor")
|
|
@@ -576,6 +612,13 @@ def main():
|
|
| 576 |
with col_config:
|
| 577 |
st.markdown("#### Verbindung & Typ")
|
| 578 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 579 |
repo_list = list(REPO_CONFIG.keys())
|
| 580 |
repo_code = st.selectbox(
|
| 581 |
"Repository-Code",
|
|
@@ -585,7 +628,7 @@ def main():
|
|
| 585 |
)
|
| 586 |
|
| 587 |
repo_config = REPO_CONFIG[repo_code]
|
| 588 |
-
repo_base_url = f"https://
|
| 589 |
|
| 590 |
pub_type = st.radio(
|
| 591 |
"Publikationstyp",
|
|
@@ -616,9 +659,9 @@ def main():
|
|
| 616 |
st.write("") # Spacer
|
| 617 |
if st.button("Metadaten laden", type="primary"):
|
| 618 |
try:
|
| 619 |
-
|
| 620 |
-
st.info(f"
|
| 621 |
-
book_root = fetch_mods_xml(
|
| 622 |
meta = parse_book_mods(book_root, repo_base_url)
|
| 623 |
|
| 624 |
# --- Attempt to extract report number from MODS ---
|
|
@@ -854,9 +897,8 @@ def main():
|
|
| 854 |
line = line.strip()
|
| 855 |
if not line:
|
| 856 |
continue
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
mods_root = fetch_mods_xml(mods_url)
|
| 860 |
ci, page_no = mods_to_content_item(mods_root, repo_base_url)
|
| 861 |
chapter_items.append((ci, page_no))
|
| 862 |
|
|
|
|
| 64 |
return clean_text(elem.text) if elem is not None and elem.text else ""
|
| 65 |
|
| 66 |
|
| 67 |
+
def build_dora_mods_url(repo_code: str, object_or_url: str, host: str = "www.dora.lib4ri.ch") -> list[str]:
    """Build the candidate MODS URLs for a DORA object.

    Args:
        repo_code: Repository path segment inserted after the host
            (e.g. ``"wsl"``).
        object_or_url: Either a DORA object ID such as ``'wsl:41900'`` or a
            complete ``http(s)://`` URL. Surrounding whitespace is ignored.
        host: Host name used for the generated URLs.

    Returns:
        A list of URLs to try in order. A complete URL is passed through as a
        single-element list; for an object ID the list holds the direct
        datastream URL followed by the OAI-PMH ``GetRecord`` URL.
    """
    # Strip first so that a pasted URL with leading/trailing whitespace is
    # still recognised as a URL instead of being treated as an object ID.
    clean_id = object_or_url.strip()
    if clean_id.startswith(("http://", "https://")):
        return [clean_id]

    # The ID is used as a path segment, so it must be percent-encoded
    # (e.g. ':' becomes '%3A').
    encoded_id = quote(clean_id)
    # The OAI identifier uses '_' in place of ':'. It is percent-encoded as
    # well so that characters such as '&' or '=' in the input cannot alter
    # the query string.
    oai_id = quote(clean_id.replace(":", "_"), safe="")

    # Variant 1: direct datastream.
    direct_url = f"https://{host}/{repo_code}/islandora/object/{encoded_id}/datastream/MODS/view"

    # Variant 2: OAI-PMH GetRecord request.
    oai_url = f"https://{host}/{repo_code}/oai2?verb=GetRecord&identifier=oai:dora:{oai_id}&metadataPrefix=mods"

    return [direct_url, oai_url]
|
| 88 |
|
| 89 |
|
| 90 |
def build_persistent_url(repo_code: str, object_id: str) -> str:
|
|
|
|
| 97 |
return f"{public_base}/{repo_code}/item/{object_id}"
|
| 98 |
|
| 99 |
|
| 100 |
+
def fetch_mods_xml(mods_urls: list[str]) -> etree._Element:
    """Try a list of URLs until one of them yields a MODS record.

    Each URL is requested up to twice: first with the module-level HEADERS,
    then without extra headers. The response is parsed leniently and the
    first ``<mods:mods>`` element found (the root itself, or a descendant
    such as the record inside an OAI-PMH envelope) is returned.

    Args:
        mods_urls: Candidate URLs, tried in the given order.

    Returns:
        The ``mods`` element of the first response that contains one.

    Raises:
        ValueError: If no attempt produced a MODS element. Before raising,
            the per-attempt failures are written to the Streamlit page.
    """
    ns = {
        "oai": "http://www.openarchives.org/OAI/2.0/",
        "mods": "http://www.loc.gov/mods/v3",
    }
    errors = []

    for url in mods_urls:
        # Attempt 1: with browser headers.
        # Attempt 2: without special headers (in case the server is picky).
        for use_headers in (True, False):
            current_headers = HEADERS if use_headers else {}
            header_label = "Browser-Header" if use_headers else "Standard-Header"

            # The broad catch is deliberate: this is a best-effort loop and
            # any failure of one attempt must only be recorded, not abort
            # the remaining attempts.
            try:
                resp = ROBUST_SESSION.get(url, headers=current_headers, timeout=30)
                if resp.status_code != 200:
                    errors.append(f"FAILED ({header_label}): {url} -> HTTP {resp.status_code}")
                    continue

                # Use recover=True to handle malformed XML.
                parser = etree.XMLParser(recover=True, remove_blank_text=True)
                root = etree.fromstring(resp.content, parser=parser)

                # A recovering parser can hand back None when nothing usable
                # was found; report that explicitly instead of failing on
                # root.tag below.
                if root is None:
                    errors.append(f"EMPTY: {url} (Antwort enthielt kein auswertbares XML)")
                    continue

                # Compare the local element name only. A substring test on
                # the full tag would match every element in the MODS
                # namespace, because the namespace URI itself contains "mods".
                local_name = root.tag.rsplit("}", 1)[-1]
                if local_name == "mods":
                    return root

                # OAI-PMH envelopes (and wrappers such as modsCollection)
                # carry the record as a descendant.
                mods_node = root.find(".//mods:mods", namespaces=ns)
                if mods_node is not None:
                    return mods_node

                if local_name == "OAI-PMH":
                    errors.append(f"EMPTY: {url} (OAI-PMH Antwort enthielt kein <mods:mods>)")
                else:
                    errors.append(f"UNRECOGNIZED: {url} (Wurzel-Element '{root.tag}' ist kein MODS)")
            except Exception as e:
                errors.append(f"ERROR ({header_label}): {url} -> {str(e)}")
                continue

    # Reaching this point means every attempt failed.
    st.error("### Ernte-Bericht (Alle Versuche fehlgeschlagen)")
    for err in errors:
        st.write(f"- {err}")

    raise ValueError("Metadaten konnten von keiner der verfügbaren URLs geladen werden.")
|
| 148 |
|
| 149 |
|
| 150 |
def parse_book_mods(book_root: etree._Element, repo_base_url: str) -> dict:
|
|
|
|
| 353 |
head = etree.SubElement(doi_batch, "head")
|
| 354 |
etree.SubElement(head, "doi_batch_id").text = depositor_meta["doi_batch_id"]
|
| 355 |
|
| 356 |
+
ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
|
| 357 |
etree.SubElement(head, "timestamp").text = ts
|
| 358 |
|
| 359 |
depositor = etree.SubElement(head, "depositor")
|
|
|
|
| 612 |
with col_config:
|
| 613 |
st.markdown("#### Verbindung & Typ")
|
| 614 |
|
| 615 |
+
dora_host = st.selectbox(
|
| 616 |
+
"DORA Host",
|
| 617 |
+
options=["www.dora.lib4ri.ch", "admin.dora.lib4ri.ch"],
|
| 618 |
+
index=0,
|
| 619 |
+
help="Nutze 'www' für externen Zugriff (Hugging Face) und 'admin' für institutsweiten Zugriff."
|
| 620 |
+
)
|
| 621 |
+
|
| 622 |
repo_list = list(REPO_CONFIG.keys())
|
| 623 |
repo_code = st.selectbox(
|
| 624 |
"Repository-Code",
|
|
|
|
| 628 |
)
|
| 629 |
|
| 630 |
repo_config = REPO_CONFIG[repo_code]
|
| 631 |
+
repo_base_url = f"https://{dora_host}/{repo_code}"
|
| 632 |
|
| 633 |
pub_type = st.radio(
|
| 634 |
"Publikationstyp",
|
|
|
|
| 659 |
st.write("") # Spacer
|
| 660 |
if st.button("Metadaten laden", type="primary"):
|
| 661 |
try:
|
| 662 |
+
mods_urls = build_dora_mods_url(repo_code, book_id_or_url, host=dora_host)
|
| 663 |
+
st.info(f"Suche MODS-Metadaten auf: {dora_host}...")
|
| 664 |
+
book_root = fetch_mods_xml(mods_urls)
|
| 665 |
meta = parse_book_mods(book_root, repo_base_url)
|
| 666 |
|
| 667 |
# --- Attempt to extract report number from MODS ---
|
|
|
|
| 897 |
line = line.strip()
|
| 898 |
if not line:
|
| 899 |
continue
|
| 900 |
+
mods_urls = build_dora_mods_url(repo_code, line, host=dora_host)
|
| 901 |
+
mods_root = fetch_mods_xml(mods_urls)
|
|
|
|
| 902 |
ci, page_no = mods_to_content_item(mods_root, repo_base_url)
|
| 903 |
chapter_items.append((ci, page_no))
|
| 904 |
|