| """ |
| BƯỚC 1: LOAD DOCUMENTS |
| ----------------------- |
| Debug-full version |
| |
| - Lädt Prüfungsordnung (PDF) seitenweise. |
| - Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML, |
| und zerlegt es in einzelne Absätze (Document pro <p>). |
| """ |
|
|
| from huggingface_hub import hf_hub_download, list_repo_files |
| from langchain_community.document_loaders import PyPDFLoader |
| from langchain_core.documents import Document |
| from bs4 import BeautifulSoup |
|
|
# Hugging Face dataset repository that stores both source documents.
DATASET = "Nguyen5/docs"
# Examination regulations (Prüfungsordnung) as a PDF; loaded page by page.
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
# NRW Higher Education Act as HTML; split into one Document per <p> below.
HTML_FILE = "Hochschulgesetz_NRW.html"
|
|
def _load_hg_paragraph_documents(
    html_path: str,
    *,
    source: str = "Hochschulgesetz NRW (HTML)",
    filename: str = HTML_FILE,
) -> list[Document]:
    """Parse the Hochschulgesetz HTML file into per-paragraph Documents.

    One LangChain ``Document`` is created for every non-empty ``<p>``
    element, with:
      - ``page_content``: the paragraph text (whitespace-normalized)
      - ``metadata``:
          * ``source``       -- label of the source corpus
          * ``filename``     -- original file name in the dataset
          * ``paragraph_id`` -- the ``id`` attribute of the <p> tag
            (e.g. ``'hg_abs_12'``), only when present

    Parameters
    ----------
    html_path : str
        Local filesystem path to the downloaded HTML file.
    source, filename : str, keyword-only
        Metadata labels; the defaults reproduce the original hard-coded
        values, so existing callers are unaffected.

    Returns
    -------
    list[Document]
        One Document per non-empty paragraph, in document order.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, "html.parser")
    docs: list[Document] = []

    for p in soup.find_all("p"):
        # Join text fragments with a single space and strip edges;
        # skip paragraphs that are empty after normalization.
        text = p.get_text(" ", strip=True)
        if not text:
            continue

        metadata = {
            "source": source,
            "filename": filename,
        }

        pid = p.get("id")
        if pid:
            metadata["paragraph_id"] = pid

        docs.append(Document(page_content=text, metadata=metadata))

    print(f"Loaded {len(docs)} paragraph Documents from HG-HTML.\n")
    return docs
|
|
def load_documents() -> list[Document]:
    """Download and load all source documents for the pipeline.

    Two stages, each best-effort with errors printed rather than raised:
      1. Download the Prüfungsordnung PDF from the HF dataset and load it
         page by page with PyPDFLoader (one Document per page).
      2. Download the Hochschulgesetz HTML and split it into one Document
         per <p> element via ``_load_hg_paragraph_documents``.

    Returns:
        list[Document]: possibly partial results — an empty list if the
        PDF stage fails, or only the PDF pages if the HTML stage fails.
    """
    print("=== START: load_documents() ===\n")

    # Diagnostic: show what files the dataset actually contains.
    # NOTE(review): this call is NOT inside a try block — a network or
    # auth failure here raises instead of returning []; confirm intended.
    print(">>> Checking dataset file list from HuggingFace...")
    files = list_repo_files(DATASET, repo_type="dataset")
    print("Files in dataset:", files, "\n")

    docs = []

    # --- Stage 1: Prüfungsordnung PDF -------------------------------
    print(">>> Step 1: Download PDF from HuggingFace...")
    try:
        pdf_path = hf_hub_download(
            repo_id=DATASET,
            filename=PDF_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
    except Exception as e:
        # Without the PDF there is nothing useful to return.
        print("ERROR downloading PDF:", e)
        return []

    print(">>> Step 1.1: Loading PDF pages...")
    try:
        pdf_docs = PyPDFLoader(pdf_path).load()
        print(f"Loaded {len(pdf_docs)} PDF pages.\n")
    except Exception as e:
        print("ERROR loading PDF:", e)
        return []

    # Tag every page so retrieved chunks can be attributed to the PDF.
    for d in pdf_docs:
        d.metadata["source"] = "Prüfungsordnung (PDF)"
        d.metadata["filename"] = PDF_FILE

    docs.extend(pdf_docs)

    # --- Stage 2: Hochschulgesetz HTML ------------------------------
    print(">>> Step 2: Download HTML from HuggingFace...")
    try:
        html_path = hf_hub_download(
            repo_id=DATASET,
            filename=HTML_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded HTML to local cache:\n{html_path}\n")
    except Exception as e:
        # Partial success: keep the PDF pages loaded so far.
        print("ERROR downloading HTML:", e)
        return docs

    print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
    try:
        html_docs = _load_hg_paragraph_documents(html_path)
    except Exception as e:
        # Same partial-success policy as the download step above.
        print("ERROR loading / parsing HTML:", e)
        return docs

    docs.extend(html_docs)

    print("=== DONE: load_documents() ===\n")
    return docs
|
|
if __name__ == "__main__":
    # Manual smoke test: run the loader and show a sample of the metadata.
    print("\n=== Running load_documents.py directly ===\n")
    docs = load_documents()
    print(f"\n>>> TOTAL documents loaded: {len(docs)}")

    # Idiomatic truthiness check instead of `if len(docs):`.
    if docs:
        print("\nExample metadata from 1st document:")
        print(docs[0].metadata)
|
|
|
|