import os

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from supabase import create_client

load_dotenv()

SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]

supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)

# Official URL – do not use the Druckversion (print view)
LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"


def extract_paragraphs():
    """
    Loads the current version of the Hochschulgesetz NRW from recht.nrw.de
    (br_text_anzeigen) and extracts its paragraphs (§).

    Returns a list of dicts with:
      - abs_id: para_1, para_2, ...
      - title: "§ 1 ...", "§ 2 ..."
      - content: full text of the paragraph
      - order_index: running number
    """
    print(">>> Loading the official Hochschulgesetz NRW from recht.nrw.de …")

    resp = requests.get(LAW_URL, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Paragraph headings on this page typically appear in <p>, <b> or
    # <strong> tags.
    candidates = soup.find_all(["p", "b", "strong"])
    paragraphs = []
    order = 1
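    # Strategy: each <p>/<b>/<strong> whose text starts with "§" is treated
    # as a paragraph heading; everything between it and the next such
    # heading becomes that paragraph's content. This assumes headings and
    # body text sit next to each other as siblings in the page markup.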
    for tag in candidates:
        text = tag.get_text(" ", strip=True)
        if not text.startswith("§"):
            continue

        title = text
        content_parts = []
        sibling = tag.find_next_sibling()
        # Collect sibling text until the next "§ …" heading.
        while sibling and not (
            sibling.name in ["p", "b", "strong"]
            and sibling.get_text(" ", strip=True).startswith("§")
        ):
            txt = sibling.get_text(" ", strip=True)
            if txt:
                content_parts.append(txt)
            sibling = sibling.find_next_sibling()

        full_content = "\n".join(content_parts).strip()
        abs_id = f"para_{order}"
        paragraphs.append(
            {
                "abs_id": abs_id,
                "title": title,
                "content": full_content,
                "order_index": order,
            }
        )
        order += 1

    print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
    return paragraphs
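

# The target table hg_nrw is assumed to look roughly like the sketch below.
# Only the column names come from this script; the types and the primary
# key (which upsert() would use to resolve conflicts) are guesses:
#
#   create table hg_nrw (
#     abs_id      text primary key,
#     title       text,
#     content     text,
#     order_index integer
#   );
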
def upload_to_supabase():
    paras = extract_paragraphs()

    print(">>> Clearing table hg_nrw …")
    # The .neq("abs_id", "") filter matches every row, so this empties the
    # whole table without issuing an unfiltered delete.
    supabase.table("hg_nrw").delete().neq("abs_id", "").execute()

    print(">>> Uploading to Supabase …")
    BATCH = 100
    for i in range(0, len(paras), BATCH):
        batch = paras[i : i + BATCH]
        print(f"  - Uploading batch {i} – {i + len(batch) - 1}")
        supabase.table("hg_nrw").upsert(batch).execute()

    print("✔ DONE uploading the complete NRW law.")
if __name__ == "__main__":
    upload_to_supabase()
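
# A quick post-upload sanity check could look like this (sketch; it reuses
# the client above and supabase-py's count="exact" option on select()):
#
#   rows = supabase.table("hg_nrw").select("abs_id", count="exact").execute()
#   print(f"Rows in hg_nrw: {rows.count}")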