import os

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from supabase import create_client

load_dotenv()

SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]

supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)

# Official URL – do not use the Druckversion (print view)
LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"


def extract_paragraphs():
    """
    Loads the current version of the Hochschulgesetz NRW from recht.nrw.de
    (br_text_anzeigen) and extracts its paragraphs (§).

    Returns a list of dicts with:
      - abs_id: para_1, para_2, ...
      - title: "§ 1 ...", "§ 2 ..."
      - content: full text of the paragraph
      - order_index: running number
    """
    print(">>> Loading the official Hochschulgesetz NRW from recht.nrw.de …")

    resp = requests.get(LAW_URL, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Paragraph headings on this page typically appear in <p>, <b> or
    # <strong> tags.
    candidates = soup.find_all(["p", "b", "strong"])
    paragraphs = []
    order = 1
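    # Strategy: each <p>/<b>/<strong> whose text starts with "§" is treated
    # as a paragraph heading; everything between it and the next such
    # heading becomes that paragraph's content. This assumes headings and
    # body text sit next to each other as siblings in the page markup.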
    for tag in candidates:
        text = tag.get_text(" ", strip=True)
        if not text.startswith("§"):
            continue

        title = text
        content_parts = []
        sibling = tag.find_next_sibling()
        # Collect sibling text until the next "§ …" heading.
        while sibling and not (
            sibling.name in ["p", "b", "strong"]
            and sibling.get_text(" ", strip=True).startswith("§")
        ):
            txt = sibling.get_text(" ", strip=True)
            if txt:
                content_parts.append(txt)
            sibling = sibling.find_next_sibling()

        full_content = "\n".join(content_parts).strip()
        abs_id = f"para_{order}"
        paragraphs.append(
            {
                "abs_id": abs_id,
                "title": title,
                "content": full_content,
                "order_index": order,
            }
        )
        order += 1

    print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
    return paragraphs
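

# The target table hg_nrw is assumed to look roughly like the sketch below.
# Only the column names come from this script; the types and the primary
# key (which upsert() would use to resolve conflicts) are guesses:
#
#   create table hg_nrw (
#     abs_id      text primary key,
#     title       text,
#     content     text,
#     order_index integer
#   );
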
def upload_to_supabase():
    paras = extract_paragraphs()

    print(">>> Clearing table hg_nrw …")
    # The .neq("abs_id", "") filter matches every row, so this empties the
    # whole table without issuing an unfiltered delete.
    supabase.table("hg_nrw").delete().neq("abs_id", "").execute()

    print(">>> Uploading to Supabase …")
    BATCH = 100
    for i in range(0, len(paras), BATCH):
        batch = paras[i : i + BATCH]
        print(f"  - Uploading batch {i} – {i + len(batch) - 1}")
        supabase.table("hg_nrw").upsert(batch).execute()

    print("✔ DONE uploading the complete NRW law.")
if __name__ == "__main__":
    upload_to_supabase()
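
# A quick post-upload sanity check could look like this (sketch; it reuses
# the client above and supabase-py's count="exact" option on select()):
#
#   rows = supabase.table("hg_nrw").select("abs_id", count="exact").execute()
#   print(f"Rows in hg_nrw: {rows.count}")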