andrehoffmann80 committed on
Commit
3f687cd
·
verified ·
1 Parent(s): 092f5ec

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +76 -34
src/streamlit_app.py CHANGED
@@ -64,18 +64,27 @@ def get_text(node, xpath, ns):
64
  return clean_text(elem.text) if elem is not None and elem.text else ""
65
 
66
 
67
def build_dora_mods_url(repo_code: str, object_or_url: str) -> str:
    """
    Build the OAI-PMH MODS URL for a DORA ID such as 'wsl:41900'.

    Only the public endpoint (www.dora.lib4ri.ch) is used. A value that
    already starts with "http://" or "https://" is returned unchanged.
    """
    already_a_url = object_or_url.startswith(("http://", "https://"))
    if already_a_url:
        return object_or_url

    # The OAI identifier uses an underscore where the DORA ID has a colon,
    # e.g. "wsl:41891" -> "wsl_41891".
    identifier = "_".join(object_or_url.split(":"))

    return f"https://www.dora.lib4ri.ch/{repo_code}/oai2?verb=GetRecord&identifier=oai:dora:{identifier}&metadataPrefix=mods"
 
 
 
 
 
 
79
 
80
 
81
  def build_persistent_url(repo_code: str, object_id: str) -> str:
@@ -88,27 +97,54 @@ def build_persistent_url(repo_code: str, object_id: str) -> str:
88
  return f"{public_base}/{repo_code}/item/{object_id}"
89
 
90
 
91
def fetch_mods_xml(mods_url: str) -> etree._Element:
    """
    Download a MODS file or an OAI-PMH response and return the MODS root element.

    Raises whatever ``raise_for_status`` raises on an HTTP error status, and
    ``ValueError`` when an OAI-PMH response contains no ``mods:mods`` element.
    """
    response = ROBUST_SESSION.get(mods_url, headers=HEADERS, timeout=60)
    response.raise_for_status()

    # recover=True tolerates malformed XML (e.g. unescaped HTML inside notes).
    lenient_parser = etree.XMLParser(recover=True, remove_blank_text=True)
    document_root = etree.fromstring(response.content, parser=lenient_parser)

    # Anything that is not an OAI-PMH envelope is returned as-is.
    if "OAI-PMH" not in document_root.tag:
        return document_root

    # OAI-PMH envelope: pull out the embedded <mods:mods> node.
    namespaces = {
        "oai": "http://www.openarchives.org/OAI/2.0/",
        "mods": "http://www.loc.gov/mods/v3",
    }
    mods_element = document_root.find(".//mods:mods", namespaces=namespaces)
    if mods_element is None:
        raise ValueError(f"Kein MODS-Element in der OAI-PMH-Antwort gefunden: {mods_url}")
    return mods_element
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
 
114
  def parse_book_mods(book_root: etree._Element, repo_base_url: str) -> dict:
@@ -317,7 +353,7 @@ def build_doi_batch_xml(
317
  head = etree.SubElement(doi_batch, "head")
318
  etree.SubElement(head, "doi_batch_id").text = depositor_meta["doi_batch_id"]
319
 
320
- ts = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
321
  etree.SubElement(head, "timestamp").text = ts
322
 
323
  depositor = etree.SubElement(head, "depositor")
@@ -576,6 +612,13 @@ def main():
576
  with col_config:
577
  st.markdown("#### Verbindung & Typ")
578
 
 
 
 
 
 
 
 
579
  repo_list = list(REPO_CONFIG.keys())
580
  repo_code = st.selectbox(
581
  "Repository-Code",
@@ -585,7 +628,7 @@ def main():
585
  )
586
 
587
  repo_config = REPO_CONFIG[repo_code]
588
- repo_base_url = f"https://www.dora.lib4ri.ch/{repo_code}"
589
 
590
  pub_type = st.radio(
591
  "Publikationstyp",
@@ -616,9 +659,9 @@ def main():
616
  st.write("") # Spacer
617
  if st.button("Metadaten laden", type="primary"):
618
  try:
619
- mods_url = build_dora_mods_url(repo_code, book_id_or_url)
620
- st.info(f"Lade MODS von: {mods_url}")
621
- book_root = fetch_mods_xml(mods_url)
622
  meta = parse_book_mods(book_root, repo_base_url)
623
 
624
  # --- Attempt to extract report number from MODS ---
@@ -854,9 +897,8 @@ def main():
854
  line = line.strip()
855
  if not line:
856
  continue
857
- mods_url = build_dora_mods_url(repo_code, line)
858
- st.write(f"Lade Kapitel-MODS von: {mods_url}")
859
- mods_root = fetch_mods_xml(mods_url)
860
  ci, page_no = mods_to_content_item(mods_root, repo_base_url)
861
  chapter_items.append((ci, page_no))
862
 
 
64
  return clean_text(elem.text) if elem is not None and elem.text else ""
65
 
66
 
67
def build_dora_mods_url(repo_code: str, object_or_url: str, host: str = "www.dora.lib4ri.ch") -> list[str]:
    """
    Build the list of candidate MODS URLs for a DORA ID such as 'wsl:41900'.

    Args:
        repo_code: Repository code used as the first path segment.
        object_or_url: Either a DORA object ID or a complete http(s) URL.
            Surrounding whitespace is ignored in both cases.
        host: Host name the URLs are built against.

    Returns:
        A one-element list holding the (stripped) URL when a URL was passed;
        otherwise ``[direct_datastream_url, oai_pmh_url]`` in that order.
    """
    # Imported locally so the function does not rely on a module-level import
    # that is not visible in this section of the file.
    from urllib.parse import quote

    # Strip before the URL check: a pasted URL with leading whitespace would
    # otherwise be mistaken for an object ID and turned into nonsense URLs.
    clean_id = object_or_url.strip()
    if clean_id.startswith(("http://", "https://")):
        return [clean_id]

    # The ID must be percent-encoded for use in a path (e.g. ":" -> "%3A").
    encoded_id = quote(clean_id)
    # The OAI identifier uses "_" where the DORA ID has ":".
    oai_id = clean_id.replace(":", "_")

    # Variant 1: direct datastream (per the original author, often more
    # robust against IP blocking).
    direct_url = f"https://{host}/{repo_code}/islandora/object/{encoded_id}/datastream/MODS/view"

    # Variant 2: OAI-PMH.
    oai_url = f"https://{host}/{repo_code}/oai2?verb=GetRecord&identifier=oai:dora:{oai_id}&metadataPrefix=mods"

    return [direct_url, oai_url]
88
 
89
 
90
  def build_persistent_url(repo_code: str, object_id: str) -> str:
 
97
  return f"{public_base}/{repo_code}/item/{object_id}"
98
 
99
 
100
def fetch_mods_xml(mods_urls: list[str]) -> etree._Element:
    """
    Try the candidate URLs in order until one yields a MODS XML element.

    Each URL is requested up to twice: first with the module-level
    ``HEADERS``, then with an empty header dict in case the server rejects
    the first variant.

    Args:
        mods_urls: Candidate URLs, tried in order. A single URL string is
            also accepted and treated as a one-element list, matching the
            earlier single-URL signature of this function.

    Returns:
        The parsed root element when its tag contains "mods", or the first
        ``mods:mods`` descendant when the response is an OAI-PMH envelope.

    Raises:
        ValueError: If no attempt produced a MODS element. The individual
            failures are written to the Streamlit page before raising.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(mods_urls, str):
        mods_urls = [mods_urls]

    ns = {
        "oai": "http://www.openarchives.org/OAI/2.0/",
        "mods": "http://www.loc.gov/mods/v3",
    }
    errors = []

    for url in mods_urls:
        # Attempt 1: with browser-like headers.
        # Attempt 2: without special headers (in case the server is picky).
        for use_headers in (True, False):
            current_headers = HEADERS if use_headers else {}
            header_label = "Browser-Header" if use_headers else "Standard-Header"

            try:
                resp = ROBUST_SESSION.get(url, headers=current_headers, timeout=30)
                if resp.status_code != 200:
                    errors.append(f"FAILED ({header_label}): {url} -> HTTP {resp.status_code}")
                    continue

                # recover=True tolerates malformed XML.
                parser = etree.XMLParser(recover=True, remove_blank_text=True)
                root = etree.fromstring(resp.content, parser=parser)

                # NOTE(review): a recovering parser may hand back None for
                # unusable content; report that explicitly instead of via an
                # incidental AttributeError message.
                if root is None:
                    errors.append(f"ERROR ({header_label}): {url} -> Antwort enthielt kein parsebares XML")
                    continue

                if "OAI-PMH" in root.tag:
                    # OAI-PMH envelope: extract the embedded <mods:mods> node.
                    mods_node = root.find(".//mods:mods", namespaces=ns)
                    if mods_node is not None:
                        return mods_node
                    errors.append(f"EMPTY: {url} (OAI-PMH Antwort enthielt kein <mods:mods>)")
                elif "mods" in root.tag.lower():
                    # Already the MODS element (or a MODS-like node).
                    return root
                else:
                    errors.append(f"UNRECOGNIZED: {url} (Wurzel-Element '{root.tag}' ist kein MODS)")
            except Exception as e:
                # Deliberately broad: any network or parse failure should
                # only move on to the next attempt, not abort the harvest.
                errors.append(f"ERROR ({header_label}): {url} -> {str(e)}")

    # Reaching this point means every attempt failed.
    st.error("### Ernte-Bericht (Alle Versuche fehlgeschlagen)")
    for err in errors:
        st.write(f"- {err}")

    raise ValueError("Metadaten konnten von keiner der verfügbaren URLs geladen werden.")
148
 
149
 
150
  def parse_book_mods(book_root: etree._Element, repo_base_url: str) -> dict:
 
353
  head = etree.SubElement(doi_batch, "head")
354
  etree.SubElement(head, "doi_batch_id").text = depositor_meta["doi_batch_id"]
355
 
356
+ ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
357
  etree.SubElement(head, "timestamp").text = ts
358
 
359
  depositor = etree.SubElement(head, "depositor")
 
612
  with col_config:
613
  st.markdown("#### Verbindung & Typ")
614
 
615
+ dora_host = st.selectbox(
616
+ "DORA Host",
617
+ options=["www.dora.lib4ri.ch", "admin.dora.lib4ri.ch"],
618
+ index=0,
619
+ help="Nutze 'www' für externen Zugriff (Hugging Face) und 'admin' für institutsweiten Zugriff."
620
+ )
621
+
622
  repo_list = list(REPO_CONFIG.keys())
623
  repo_code = st.selectbox(
624
  "Repository-Code",
 
628
  )
629
 
630
  repo_config = REPO_CONFIG[repo_code]
631
+ repo_base_url = f"https://{dora_host}/{repo_code}"
632
 
633
  pub_type = st.radio(
634
  "Publikationstyp",
 
659
  st.write("") # Spacer
660
  if st.button("Metadaten laden", type="primary"):
661
  try:
662
+ mods_urls = build_dora_mods_url(repo_code, book_id_or_url, host=dora_host)
663
+ st.info(f"Suche MODS-Metadaten auf: {dora_host}...")
664
+ book_root = fetch_mods_xml(mods_urls)
665
  meta = parse_book_mods(book_root, repo_base_url)
666
 
667
  # --- Attempt to extract report number from MODS ---
 
897
  line = line.strip()
898
  if not line:
899
  continue
900
+ mods_urls = build_dora_mods_url(repo_code, line, host=dora_host)
901
+ mods_root = fetch_mods_xml(mods_urls)
 
902
  ci, page_no = mods_to_content_item(mods_root, repo_base_url)
903
  chapter_items.append((ci, page_no))
904