File size: 8,486 Bytes
21626e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589d46e
 
 
 
 
 
 
 
 
21626e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
589d46e
21626e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
"""Ingest CBSA D-Memoranda (HTML) into structured, section-level chunks.

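D-Memoranda are CBSA's administrative guidance on how it applies the Customs Act
and related law. They are persuasive, not binding -- every chunk is tagged
doc_type="memorandum" so the rest of CanLex can keep them distinct from statute.

Most memos are parsed from their HTML page, one chunk per <h2> section. A memo
whose HTML page is only a stub linking to a PDF is ingested from the PDF text
instead, split into roughly equal-sized parts.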
"""
import io
import json
import re
import sys
import time
import urllib.request
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from pypdf import PdfReader

from .config import RAW_DIR, PROCESSED_DIR

# Index page listing the D-memos; memo_urls() scrapes the per-memo links from it.
INDEX_URL = "https://www.cbsa-asfc.gc.ca/publications/dm-md/d1-d23-eng.html"
# Local cache directory for downloaded pages (PDFs go in its "pdf" subdirectory).
DMEMO_DIR = RAW_DIR / "dmemos"
# JSON file that ingest() writes the collected chunks to.
OUT_FILE = PROCESSED_DIR / "dmemos.json"

# <h2> headings that are page boilerplate rather than memo content.
# Compared against the lower-cased heading text.
_SKIP_HEADINGS = {"contact us", "related links"}
# Recognises hrefs that point at an individual memo page
# (".../dm-md/d<N>/d<digits-and-dashes>-eng.html").
_MEMO_HREF = re.compile(r"/dm-md/d\d+/d[\d-]+-eng\.html")
# Captures the memo number (e.g. "d1-1-1") from the last path segment of a memo URL.
_URL_NUMBER = re.compile(r"/(d\d+-[\d-]+)-eng\.html")


def _norm(text):
    """Collapse every whitespace run in *text* to one space and trim the ends.

    Falsy input (``None``, ``""``) yields an empty string.
    """
    if not text:
        return ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


def _fetch(url, dest, force=False):
    """Return the bytes at *url*, using *dest* as an on-disk cache.

    Args:
        url: Address to download.
        dest: ``pathlib.Path`` of the cache file. If it already exists and
            *force* is false, its contents are returned without any network
            access.
        force: When true, download again even if *dest* exists.

    Returns:
        The response body (or cached file contents) as ``bytes``.

    Raises:
        Any error raised by ``urllib.request.urlopen`` or by writing the
        cache file propagates to the caller.
    """
    if dest.exists() and not force:
        return dest.read_bytes()
    req = urllib.request.Request(url, headers={"User-Agent": "CanLex/0.1"})
    with urllib.request.urlopen(req, timeout=60) as resp:
        data = resp.read()
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it into place, so an interrupted
    # run can never leave a truncated file that later runs would treat as a
    # valid cache hit.
    tmp = dest.with_name(dest.name + ".part")
    tmp.write_bytes(data)
    tmp.replace(dest)
    time.sleep(0.5)  # be polite to the CBSA server
    return data


def memo_urls(force=False):
    """Return the absolute URL of every D-memo linked from the CBSA index page.

    The index page is fetched through the on-disk cache (``force`` bypasses
    it). URLs keep the order of their first appearance and duplicates are
    dropped.
    """
    index_html = _fetch(INDEX_URL, DMEMO_DIR / "_index.html", force=force)
    anchors = BeautifulSoup(index_html, "html.parser").find_all("a", href=True)
    absolute = (
        urljoin(INDEX_URL, anchor["href"])
        for anchor in anchors
        if _MEMO_HREF.search(anchor["href"])
    )
    # dict.fromkeys keeps first-seen order while discarding repeats.
    return list(dict.fromkeys(absolute))


def _render_section(h2):
    """Render the content that follows *h2* as plain text, one block per line.

    Walks the heading's following siblings and stops at the next ``<h2>`` or
    at the element whose id is ``wb-dtmd``. Direct ``<li>`` children of
    ``<ul>``/``<ol>`` become ``- item`` lines; any other element contributes
    its normalised text. Empty results are skipped. Assumes enclosing
    ``<section>`` wrappers were already unwrapped so the content is a sibling
    of the heading.
    """
    rendered = []
    for node in h2.find_next_siblings():
        at_boundary = node.name == "h2" or node.get("id") == "wb-dtmd"
        if at_boundary:
            break
        if node.name not in ("ul", "ol"):
            paragraph = _norm(node.get_text(" ", strip=True))
            if paragraph:
                rendered.append(paragraph)
            continue
        # Lists: one bullet per top-level <li>; nested lists are flattened
        # into their parent item's text.
        entries = (
            _norm(li.get_text(" ", strip=True))
            for li in node.find_all("li", recursive=False)
        )
        rendered.extend(f"- {item}" for item in entries if item)
    return "\n".join(rendered)


def parse_memo(html, url):
    """Parse one D-memo HTML page into one chunk per <h2> content section.

    Args:
        html: Markup of the memo page, in any form BeautifulSoup accepts.
        url: Address the page came from. The memo number is read from it
            (falling back to the whole URL when it does not match) and it is
            stored on every chunk as ``source_url``.

    Returns:
        A list of chunk dicts in document order: one per ``<h2>`` whose
        heading is not in ``_SKIP_HEADINGS`` and whose rendered body is
        non-empty. Empty when the page has no ``<main>`` element or no usable
        section.
    """
    soup = BeautifulSoup(html, "html.parser")
    main_el = soup.find("main")
    if main_el is None:
        return []
    for section in main_el.find_all("section"):
        section.unwrap()  # flatten so each <h2> and its content become siblings

    match = _URL_NUMBER.search(url)
    number = match.group(1).upper() if match else url
    h1 = main_el.find("h1")
    topic = ""
    if h1 is not None:
        # Pages vary: most carry the memo title in <h1><small>, others as plain
        # "Memorandum DNN-N-N: Title" h1 text. Use the <small> if present, else
        # the h1 text, and strip any leading memo-number prefix either way.
        small = h1.find("small")
        source = small if small is not None else h1
        raw = source.get_text(" ", strip=True)
        # The trailing (?!\d) keeps the pattern from backtracking into the memo
        # number itself: without it a heading with no separator, such as
        # "Memorandum D1-2-3", matched as "Memorandum D1-2" + "-" and produced
        # a bogus topic of "3". The strip() removes whitespace the guard may
        # leave in front of a title that legitimately starts with a digit.
        topic = re.sub(r"^Memorandum\s+D[\w-]+\s*[:–-]\s*(?!\d)", "",
                       _norm(raw), flags=re.I).strip()
    dm = main_el.find("time", attrs={"property": "dateModified"})
    # Prefer the datetime attribute; fall back to the element's visible text.
    date = _norm(dm.get("datetime") or dm.get_text()) if dm else ""

    chunks = []
    for h2 in main_el.find_all("h2"):
        heading = _norm(h2.get_text(" ", strip=True))
        if not heading or heading.lower() in _SKIP_HEADINGS:
            continue
        body = _render_section(h2)
        if not body:
            continue
        chunks.append({
            # Numbered by emitted chunk, so skipped headings leave no gaps.
            "id": f"dmemo-{number}-{len(chunks) + 1}",
            "doc_type": "memorandum",
            "act_code": "D-Memo",
            "act_short": "D-Memo",
            "act_name": "CBSA D-Memoranda",
            "section": number,
            "marginal_note": heading,
            "part": topic,
            "division": "",
            "heading": "",
            "text": body,
            "history": "",
            "last_amended": date,
            "current_to": date,
            "citation": f"Memorandum {number}",
            "source_url": url,
        })
    return chunks


def _pdf_clean(text):
    """Tidy whitespace in text extracted from a PDF.

    Applied in order: runs of spaces/tabs become one space, indentation after
    a newline is removed, three or more consecutive newlines shrink to two,
    and the result is stripped.
    """
    rewrites = (
        (r"[ \t]+", " "),
        (r"\n[ \t]+", "\n"),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text.strip()


def _pdf_text(pdf_bytes):
    """Extract and clean the text of a PDF given as raw bytes.

    Args:
        pdf_bytes: The complete PDF file contents.

    Returns:
        The cleaned text of all pages joined with newlines, or ``""`` when
        the PDF cannot be read. Extraction is best-effort: failures never
        raise, but the reason is reported on stderr.
    """
    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        # extract_text() may yield nothing for a page; treat that as empty.
        pages = [(page.extract_text() or "") for page in reader.pages]
        return _pdf_clean("\n".join(pages))
    except Exception as exc:
        # Stay best-effort (the caller treats "" as "no content"), but do not
        # discard the reason: a corrupt or encrypted PDF would otherwise be
        # indistinguishable from an empty one.
        print(f"  warning: PDF text extraction failed: {type(exc).__name__}: {exc}",
              file=sys.stderr)
        return ""


def _split(text, target=3000):
    """Break *text* into pieces of roughly *target* characters, cutting only at newlines.

    Text no longer than *target* is returned as a single piece. A single line
    longer than *target* is never cut, so a piece can exceed the target.
    """
    if len(text) <= target:
        return [text]

    pieces = []
    pending = []
    used = 0  # characters in `pending`, counting one newline per line
    for row in text.split("\n"):
        overflow = used + len(row) > target
        if pending and overflow:
            pieces.append("\n".join(pending))
            pending = []
            used = 0
        pending.append(row)
        used += len(row) + 1
    if pending:
        pieces.append("\n".join(pending))
    return pieces


def parse_pdf_memo(html, url):
    """Fallback for memos whose HTML page is only a stub linking to a PDF.

    Takes the first link inside ``<main>`` whose href ends in ``.pdf``,
    fetches it (cached under ``DMEMO_DIR / "pdf"``), extracts its text and
    splits that into parts.

    Args:
        html: Markup of the memo's HTML page, in any form BeautifulSoup accepts.
        url: Address of the HTML page. It resolves a relative PDF link,
            supplies the memo number, and is stored as ``source_url``.

    Returns:
        A list of chunk dicts, one per text part. Empty when the page has no
        ``<main>``, no PDF link, or the PDF yields no text.

    Raises:
        Errors from downloading the PDF propagate to the caller.
    """
    soup = BeautifulSoup(html, "html.parser")
    main_el = soup.find("main")
    if main_el is None:
        return []
    pdf_href = next((a["href"] for a in main_el.find_all("a", href=True)
                     if a["href"].lower().endswith(".pdf")), None)
    if not pdf_href:
        return []
    pdf_url = urljoin(url, pdf_href)

    match = _URL_NUMBER.search(url)
    number = match.group(1).upper() if match else url
    h1 = main_el.find("h1")
    topic = ""
    if h1 is not None:
        # Same title rules as parse_memo: prefer the <small> inside the <h1>
        # (it holds the title alone); otherwise use the full <h1> text.
        small = h1.find("small")
        source = small if small is not None else h1
        raw = source.get_text(" ", strip=True)
        # The trailing (?!\d) keeps the pattern from backtracking into the memo
        # number itself: without it "Memorandum D1-2-3 Title" matched as
        # "Memorandum D1-2" + "-" and produced a bogus topic of "3 Title".
        # The strip() removes whitespace the guard may leave in front of a
        # title that legitimately starts with a digit.
        topic = re.sub(r"^Memorandum\s+D[\w-]+\s*[:–-]\s*(?!\d)", "",
                       _norm(raw), flags=re.I).strip()
    dm = main_el.find("time", attrs={"property": "dateModified"})
    # Prefer the datetime attribute; fall back to the element's visible text.
    date = _norm(dm.get("datetime") or dm.get_text()) if dm else ""

    pdf_bytes = _fetch(pdf_url, DMEMO_DIR / "pdf" / pdf_url.rsplit("/", 1)[-1])
    text = _pdf_text(pdf_bytes)
    if not text:
        return []

    parts = _split(text)
    base_label = topic or number
    chunks = []
    for i, part in enumerate(parts, 1):
        # Only number the parts when the PDF was actually split.
        label = f"{base_label} (part {i})" if len(parts) > 1 else base_label
        chunks.append({
            "id": f"dmemo-{number}-pdf{i}",
            "doc_type": "memorandum",
            "act_code": "D-Memo",
            "act_short": "D-Memo",
            "act_name": "CBSA D-Memoranda",
            "section": number,
            "marginal_note": label,
            "part": topic,
            "division": "",
            "heading": "",
            "text": part,
            "history": "",
            "last_amended": date,
            "current_to": date,
            "citation": f"Memorandum {number}",
            "source_url": url,
        })
    return chunks


def ingest(force=False, limit=None):
    """Download and parse every D-memo, then write all chunks to ``OUT_FILE``.

    Args:
        force: Re-download the index and memo pages even if cached.
        limit: If truthy, only process the first ``limit`` memo URLs.

    Each page is parsed as HTML first, with the PDF fallback tried when that
    yields nothing. A memo that raises or produces no chunks is recorded as a
    failure rather than aborting the run; up to 15 failures are printed at the
    end.
    """
    urls = memo_urls(force=force)
    urls = urls[:limit] if limit else urls
    print(f"Ingesting {len(urls)} D-Memoranda...")

    all_chunks = []
    failures = []
    for i, url in enumerate(urls, start=1):
        cache_file = DMEMO_DIR / url.rsplit("/", 1)[-1]
        try:
            html = _fetch(url, cache_file, force=force)
            chunks = parse_memo(html, url)
            if not chunks:
                chunks = parse_pdf_memo(html, url)
            if not chunks:
                failures.append((url, "no content parsed"))
            else:
                all_chunks.extend(chunks)
        except Exception as exc:
            failures.append((url, f"{type(exc).__name__}: {exc}"))
        if not i % 50:
            print(f"  {i}/{len(urls)} ...")

    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(all_chunks, ensure_ascii=False, indent=2)
    OUT_FILE.write_text(payload, encoding="utf-8")
    print(f"  {len(all_chunks)} section-chunks from {len(urls) - len(failures)} memos "
          f"-> {OUT_FILE.name}")
    if not failures:
        return
    print(f"  {len(failures)} memos with no content / errors:")
    for url, why in failures[:15]:
        print(f"    - {url.rsplit('/', 1)[-1]}: {why}")


def main():
    """Command-line entry point.

    Recognises ``--force`` (re-download cached pages) and ``--limit=N``
    (process only the first N memos; the first such argument wins).
    """
    argv = sys.argv
    limit = None
    for arg in argv[1:]:
        if arg.startswith("--limit="):
            limit = int(arg.split("=", 1)[1])
            break
    ingest(force="--force" in argv, limit=limit)


if __name__ == "__main__":
    main()