| import os |
| import requests |
| from bs4 import BeautifulSoup |
| from supabase import create_client |
| from dotenv import load_dotenv |
|
|
# Load environment variables from a local .env file before reading them below.
load_dotenv()


# Supabase credentials; os.environ[...] raises KeyError at import time if missing,
# which is intentional — the script cannot do anything useful without them.
SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]


# Module-level Supabase client, shared by upload_to_supabase().
# NOTE(review): this is the service-role key — keep it server-side only.
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)


# Official consolidated text of the NRW Higher Education Act (Hochschulgesetz)
# served by recht.nrw.de's br_text_anzeigen endpoint.
LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
|
|
def extract_paragraphs():
    """
    Fetch the current version of the NRW Higher Education Act
    (Hochschulgesetz) from recht.nrw.de (br_text_anzeigen) and split it
    into its sections ("Paragraphen").

    Returns a list of dicts:
        - abs_id: "para_1", "para_2", ...
        - title: heading text, e.g. "§ 1 ..."
        - content: full text of the section
        - order_index: running 1-based number

    Raises requests.HTTPError on a non-2xx response.
    """
    print(">>> Lade offizielles Hochschulgesetz NRW von recht.nrw.de …")

    resp = requests.get(LAW_URL, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Section headings appear as <p>, <b> or <strong> elements whose
    # text starts with "§". (A previous unused lookup of <h2>/<h3>
    # headers was removed — it was dead code.)
    candidates = soup.find_all(["p", "b", "strong"])

    paragraphs = []
    order = 1

    for tag in candidates:
        text = tag.get_text(" ", strip=True)
        if not text.startswith("§"):
            continue

        # Guard against double extraction: a heading rendered as
        # <p><b>§ 1 …</b></p> matches both the <p> and the nested <b>.
        # Skip the nested tag when its parent <p> already qualifies.
        if tag.name in ("b", "strong"):
            parent_p = tag.find_parent("p")
            if parent_p is not None and parent_p.get_text(
                " ", strip=True
            ).startswith("§"):
                continue

        title = text
        content_parts = []
        sibling = tag.find_next_sibling()

        # Collect sibling text until the next "§ …" heading starts.
        while sibling and not (
            (sibling.name in ["p", "b", "strong"])
            and sibling.get_text(" ", strip=True).startswith("§")
        ):
            txt = sibling.get_text(" ", strip=True)
            if txt:
                content_parts.append(txt)
            sibling = sibling.find_next_sibling()

        full_content = "\n".join(content_parts).strip()

        paragraphs.append(
            {
                "abs_id": f"para_{order}",
                "title": title,
                "content": full_content,
                "order_index": order,
            }
        )
        order += 1

    print(f"✔ {len(paragraphs)} Paragraphen extrahiert.")
    return paragraphs
|
|
def upload_to_supabase():
    """
    Scrape the law via extract_paragraphs(), wipe the `hg_nrw` table,
    and re-upload every section in batches of 100 rows.
    """
    sections = extract_paragraphs()

    print(">>> Leere Tabelle hg_nrw …")
    # `neq("abs_id", "")` matches every row — PostgREST requires some
    # filter on delete, so this is the "delete all" idiom.
    supabase.table("hg_nrw").delete().neq("abs_id", "").execute()

    print(">>> Upload nach Supabase …")
    batch_size = 100
    for start in range(0, len(sections), batch_size):
        chunk = sections[start:start + batch_size]
        print(f" - Upload batch {start} – {start + len(chunk) - 1}")
        supabase.table("hg_nrw").upsert(chunk).execute()

    print("✔ DONE uploading complete NRW law.")
|
|
if __name__ == "__main__":
    # Script entry point: scrape the law text and refresh the Supabase table.
    upload_to_supabase()
|
|