# chatbot1/upload_weblink_to_supabase.py
# Author: Nguyen5 — commit 6548bf5
"""Scrape the NRW Higher Education Act (Hochschulgesetz NRW) from recht.nrw.de
and upload its paragraphs into a Supabase table."""
import os
import requests
from bs4 import BeautifulSoup
from supabase import create_client
from dotenv import load_dotenv

# Load SUPABASE_URL / SUPABASE_SERVICE_ROLE from a local .env file.
load_dotenv()
SUPABASE_URL = os.environ["SUPABASE_URL"]  # KeyError if not configured — fail fast
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)

# Official URL — do not use the "Druckversion" (print version).
LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
def extract_paragraphs():
    """Download the current Hochschulgesetz NRW from recht.nrw.de
    (br_text_anzeigen) and split it into paragraphs (§).

    Returns:
        list[dict]: one dict per paragraph with keys
            - abs_id: "para_1", "para_2", ...
            - title: heading text, e.g. "§ 1 ..."
            - content: full body text of the paragraph
            - order_index: running 1-based number
    """
    print(">>> Lade offizielles Hochschulgesetz NRW von recht.nrw.de …")
    resp = requests.get(LAW_URL, timeout=30)
    resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
    soup = BeautifulSoup(resp.text, "html.parser")

    # Paragraph headings typically appear in <p>, <b> or <strong> tags
    # whose text starts with "§". (The previous h2/h3-based extraction
    # and its commented-out remnants have been removed as dead code.)
    candidates = soup.find_all(["p", "b", "strong"])

    paragraphs = []
    order = 1
    for tag in candidates:
        title = tag.get_text(" ", strip=True)
        if not title.startswith("§"):
            continue
        # Collect the text of all following siblings until the next
        # §-heading tag starts a new paragraph.
        content_parts = []
        sibling = tag.find_next_sibling()
        while sibling is not None and not (
            sibling.name in ("p", "b", "strong")
            and sibling.get_text(" ", strip=True).startswith("§")
        ):
            text = sibling.get_text(" ", strip=True)
            if text:
                content_parts.append(text)
            sibling = sibling.find_next_sibling()
        paragraphs.append(
            {
                "abs_id": f"para_{order}",
                "title": title,
                "content": "\n".join(content_parts).strip(),
                "order_index": order,
            }
        )
        order += 1
    print(f"✔ {len(paragraphs)} Paragraphen extrahiert.")
    return paragraphs
def upload_to_supabase():
    """Re-scrape the law and fully replace the contents of the
    hg_nrw table in Supabase, uploading in batches of 100 rows."""
    paras = extract_paragraphs()

    print(">>> Leere Tabelle hg_nrw …")
    # delete().neq("abs_id", "") matches every row -> wipes the table.
    supabase.table("hg_nrw").delete().neq("abs_id", "").execute()

    print(">>> Upload nach Supabase …")
    BATCH = 100  # keep individual request payloads small
    for i in range(0, len(paras), BATCH):
        batch = paras[i:i + BATCH]
        # BUG FIX: original f-string "{i}{i+len(batch)-1}" lacked a
        # separator, printing e.g. "099" instead of "0-99".
        print(f" - Upload batch {i}-{i + len(batch) - 1}")
        supabase.table("hg_nrw").upsert(batch).execute()
    print("✔ DONE uploading complete NRW law.")
# Script entry point: scrape the law from recht.nrw.de and push it to Supabase.
if __name__ == "__main__":
    upload_to_supabase()