import os

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from supabase import create_client

load_dotenv()

SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]

supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)

# Official URL – do not use the Druckversion (print view)
LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"


def extract_paragraphs():
    """
    Loads the current version of the Hochschulgesetz NRW from recht.nrw.de
    (br_text_anzeigen) and extracts its sections.

    Result: a list of dicts with
      - abs_id: para_1, para_2, ...
      - title: "§ 1 ...", "§ 2 ..."
      - content: full text of the section
      - order_index: running number
    """
    print(">>> Loading the official Hochschulgesetz NRW from recht.nrw.de …")

    resp = requests.get(LAW_URL, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Section headings usually appear in <p>, <b> or <strong> tags.
    candidates = soup.find_all(["p", "b", "strong"])

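    # Note: an earlier version of this script looked for <h2>/<h3> headings
    # and was abandoned; on this page the "§ …" headings sit in ordinary
    # paragraph or bold tags, so the startswith("§") filter below is what
    # separates headings from body text. If recht.nrw.de changes its markup,
    # this heuristic is the first thing to revisit.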

    paragraphs = []
    order = 1

    for tag in candidates:
        text = tag.get_text(" ", strip=True)
        if not text.startswith("§"):
            continue

        title = text

        # Collect everything from this heading up to the next "§ …" heading.
        content_parts = []
        sibling = tag.find_next_sibling()
        while sibling and not (
            sibling.name in ["p", "b", "strong"]
            and sibling.get_text(" ", strip=True).startswith("§")
        ):
            txt = sibling.get_text(" ", strip=True)
            if txt:
                content_parts.append(txt)
            sibling = sibling.find_next_sibling()

        full_content = "\n".join(content_parts).strip()
        abs_id = f"para_{order}"
        paragraphs.append(
            {
                "abs_id": abs_id,
                "title": title,
                "content": full_content,
                "order_index": order,
            }
        )
        order += 1

    print(f"✔ Extracted {len(paragraphs)} sections (§).")
    return paragraphs


def upload_to_supabase():
    paras = extract_paragraphs()

    print(">>> Clearing table hg_nrw …")
    # Delete all rows: neq on the empty string matches every non-empty abs_id.
    supabase.table("hg_nrw").delete().neq("abs_id", "").execute()

    print(">>> Uploading to Supabase …")
    BATCH = 100
    for i in range(0, len(paras), BATCH):
        batch = paras[i:i + BATCH]
        print(f"  - Uploading batch {i} – {i + len(batch) - 1}")
        supabase.table("hg_nrw").upsert(batch).execute()

    print("✔ DONE uploading the complete NRW law.")


if __name__ == "__main__":
    upload_to_supabase()
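
# Minimal sketch of the hg_nrw table this script expects. Only the column
# names are taken from the payload built above; the types and the primary
# key are assumptions, not the project's actual migration. upsert() needs a
# unique constraint to resolve conflicts, hence the guessed primary key:
#
#   create table if not exists hg_nrw (
#       abs_id      text primary key,
#       title       text,
#       content     text,
#       order_index integer
#   );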