# chatbot1/upload_weblink_to_supabase.py
# Author: Nguyen5 — commit 6548bf5
"""Scrape the NRW Higher Education Act (Hochschulgesetz NRW) from recht.nrw.de
and upload its paragraphs into a Supabase table."""
import os
import requests
from bs4 import BeautifulSoup
from supabase import create_client
from dotenv import load_dotenv

# Load SUPABASE_URL / SUPABASE_SERVICE_ROLE from a local .env file.
load_dotenv()
SUPABASE_URL = os.environ["SUPABASE_URL"]  # KeyError if not configured — fail fast
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)

# Official URL — do not use the "Druckversion" (print version).
LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
def extract_paragraphs():
    """Download the current Hochschulgesetz NRW from recht.nrw.de
    (br_text_anzeigen) and split it into paragraphs (§).

    Returns:
        list[dict]: one dict per paragraph with keys
            - abs_id: "para_1", "para_2", ...
            - title: heading text, e.g. "§ 1 ..."
            - content: full body text of the paragraph
            - order_index: running 1-based number
    """
    print(">>> Lade offizielles Hochschulgesetz NRW von recht.nrw.de …")
    resp = requests.get(LAW_URL, timeout=30)
    resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
    soup = BeautifulSoup(resp.text, "html.parser")

    # Paragraph headings typically appear in <p>, <b> or <strong> tags
    # whose text starts with "§". (The previous h2/h3-based extraction
    # and its commented-out remnants have been removed as dead code.)
    candidates = soup.find_all(["p", "b", "strong"])

    paragraphs = []
    order = 1
    for tag in candidates:
        title = tag.get_text(" ", strip=True)
        if not title.startswith("§"):
            continue
        # Collect the text of all following siblings until the next
        # §-heading tag starts a new paragraph.
        content_parts = []
        sibling = tag.find_next_sibling()
        while sibling is not None and not (
            sibling.name in ("p", "b", "strong")
            and sibling.get_text(" ", strip=True).startswith("§")
        ):
            text = sibling.get_text(" ", strip=True)
            if text:
                content_parts.append(text)
            sibling = sibling.find_next_sibling()
        paragraphs.append(
            {
                "abs_id": f"para_{order}",
                "title": title,
                "content": "\n".join(content_parts).strip(),
                "order_index": order,
            }
        )
        order += 1
    print(f"✔ {len(paragraphs)} Paragraphen extrahiert.")
    return paragraphs
def upload_to_supabase():
    """Re-scrape the law and fully replace the contents of the
    hg_nrw table in Supabase, uploading in batches of 100 rows."""
    paras = extract_paragraphs()

    print(">>> Leere Tabelle hg_nrw …")
    # delete().neq("abs_id", "") matches every row -> wipes the table.
    supabase.table("hg_nrw").delete().neq("abs_id", "").execute()

    print(">>> Upload nach Supabase …")
    BATCH = 100  # keep individual request payloads small
    for i in range(0, len(paras), BATCH):
        batch = paras[i:i + BATCH]
        # BUG FIX: original f-string "{i}{i+len(batch)-1}" lacked a
        # separator, printing e.g. "099" instead of "0-99".
        print(f" - Upload batch {i}-{i + len(batch) - 1}")
        supabase.table("hg_nrw").upsert(batch).execute()
    print("✔ DONE uploading complete NRW law.")
# Script entry point: scrape the law from recht.nrw.de and push it to Supabase.
if __name__ == "__main__":
    upload_to_supabase()