| """ |
| BƯỚC 1: LOAD DOCUMENTS |
| ----------------------- |
| Debug-full version |
| |
| - Lädt Prüfungsordnung (PDF) seitenweise. |
| - Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML, |
| und zerlegt es in einzelne Absätze (Document pro <p>). |
| """ |
|
|
| from huggingface_hub import hf_hub_download, list_repo_files |
| from langchain_community.document_loaders import PyPDFLoader |
| from langchain_core.documents import Document |
| from bs4 import BeautifulSoup |
|
|
# Hugging Face dataset repository that stores both source documents.
DATASET = "Nguyen5/docs"
# Examination regulations (Prüfungsordnung) as a PDF; loaded page by page.
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
# NRW Higher Education Act as HTML; split into one Document per <p> below.
HTML_FILE = "Hochschulgesetz_NRW.html"
|
|
def _load_hg_paragraph_documents(
    html_path: str,
    *,
    source: str = "Hochschulgesetz NRW (HTML)",
    filename: str = HTML_FILE,
) -> list[Document]:
    """Parse the Hochschulgesetz HTML file into per-paragraph Documents.

    One LangChain ``Document`` is created for every non-empty ``<p>``
    element, with:
      - ``page_content``: the paragraph text (whitespace-normalized)
      - ``metadata``:
          * ``source``       -- label of the source corpus
          * ``filename``     -- original file name in the dataset
          * ``paragraph_id`` -- the ``id`` attribute of the <p> tag
            (e.g. ``'hg_abs_12'``), only when present

    Parameters
    ----------
    html_path : str
        Local filesystem path to the downloaded HTML file.
    source, filename : str, keyword-only
        Metadata labels; the defaults reproduce the original hard-coded
        values, so existing callers are unaffected.

    Returns
    -------
    list[Document]
        One Document per non-empty paragraph, in document order.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, "html.parser")
    docs: list[Document] = []

    for p in soup.find_all("p"):
        # Join text fragments with a single space and strip edges;
        # skip paragraphs that are empty after normalization.
        text = p.get_text(" ", strip=True)
        if not text:
            continue

        metadata = {
            "source": source,
            "filename": filename,
        }

        pid = p.get("id")
        if pid:
            metadata["paragraph_id"] = pid

        docs.append(Document(page_content=text, metadata=metadata))

    print(f"Loaded {len(docs)} paragraph Documents from HG-HTML.\n")
    return docs
|
|
def load_documents() -> list[Document]:
    """Download and load all source documents for the pipeline.

    Two stages, each best-effort with errors printed rather than raised:
      1. Download the Prüfungsordnung PDF from the HF dataset and load it
         page by page with PyPDFLoader (one Document per page).
      2. Download the Hochschulgesetz HTML and split it into one Document
         per <p> element via ``_load_hg_paragraph_documents``.

    Returns:
        list[Document]: possibly partial results — an empty list if the
        PDF stage fails, or only the PDF pages if the HTML stage fails.
    """
    print("=== START: load_documents() ===\n")

    # Diagnostic: show what files the dataset actually contains.
    # NOTE(review): this call is NOT inside a try block — a network or
    # auth failure here raises instead of returning []; confirm intended.
    print(">>> Checking dataset file list from HuggingFace...")
    files = list_repo_files(DATASET, repo_type="dataset")
    print("Files in dataset:", files, "\n")

    docs = []

    # --- Stage 1: Prüfungsordnung PDF -------------------------------
    print(">>> Step 1: Download PDF from HuggingFace...")
    try:
        pdf_path = hf_hub_download(
            repo_id=DATASET,
            filename=PDF_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
    except Exception as e:
        # Without the PDF there is nothing useful to return.
        print("ERROR downloading PDF:", e)
        return []

    print(">>> Step 1.1: Loading PDF pages...")
    try:
        pdf_docs = PyPDFLoader(pdf_path).load()
        print(f"Loaded {len(pdf_docs)} PDF pages.\n")
    except Exception as e:
        print("ERROR loading PDF:", e)
        return []

    # Tag every page so retrieved chunks can be attributed to the PDF.
    for d in pdf_docs:
        d.metadata["source"] = "Prüfungsordnung (PDF)"
        d.metadata["filename"] = PDF_FILE

    docs.extend(pdf_docs)

    # --- Stage 2: Hochschulgesetz HTML ------------------------------
    print(">>> Step 2: Download HTML from HuggingFace...")
    try:
        html_path = hf_hub_download(
            repo_id=DATASET,
            filename=HTML_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded HTML to local cache:\n{html_path}\n")
    except Exception as e:
        # Partial success: keep the PDF pages loaded so far.
        print("ERROR downloading HTML:", e)
        return docs

    print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
    try:
        html_docs = _load_hg_paragraph_documents(html_path)
    except Exception as e:
        # Same partial-success policy as the download step above.
        print("ERROR loading / parsing HTML:", e)
        return docs

    docs.extend(html_docs)

    print("=== DONE: load_documents() ===\n")
    return docs
|
|
if __name__ == "__main__":
    # Manual smoke test: run the loader and show a sample of the metadata.
    print("\n=== Running load_documents.py directly ===\n")
    docs = load_documents()
    print(f"\n>>> TOTAL documents loaded: {len(docs)}")

    # Idiomatic truthiness check instead of `if len(docs):`.
    if docs:
        print("\nExample metadata from 1st document:")
        print(docs[0].metadata)
|
|
|
|