adeshboudh16 Claude Sonnet 4.6 committed on
Commit
6ce5a5f
·
1 Parent(s): ef16f0c

fix: correct ingest.py to iterate registry values and call ingest_document

Browse files

- DOCUMENT_REGISTRY is dict[str, DocumentSpec] — iterate .values() not keys
- Replace await pipeline.run(doc) (a non-existent method) with pipeline.ingest_document(...),
unpacking all DocumentSpec fields with the correct kwarg names
- Replace doc.doc_name with doc.name to match DocumentSpec field name

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. scripts/ingest.py +17 -8
scripts/ingest.py CHANGED
@@ -23,7 +23,7 @@ log = structlog.get_logger(__name__)
23
 
24
 
25
  async def ingest_all(jurisdiction_filter: str | None = None, dry_run: bool = False):
26
- docs = DOCUMENT_REGISTRY
27
  if jurisdiction_filter:
28
  docs = [d for d in docs if d.jurisdiction.value == jurisdiction_filter.upper()]
29
  if not docs:
@@ -34,7 +34,7 @@ async def ingest_all(jurisdiction_filter: str | None = None, dry_run: bool = Fal
34
 
35
  if dry_run:
36
  for doc in docs:
37
- print(f" [{doc.jurisdiction.value}] {doc.doc_name}")
38
  return
39
 
40
  pipeline = IngestionPipeline()
@@ -42,18 +42,27 @@ async def ingest_all(jurisdiction_filter: str | None = None, dry_run: bool = Fal
42
 
43
  for i, doc in enumerate(docs, 1):
44
  log.info("ingesting_document", index=i, total=len(docs),
45
- jurisdiction=doc.jurisdiction.value, doc_name=doc.doc_name)
46
  t0 = time.perf_counter()
47
  try:
48
- await pipeline.run(doc)
 
 
 
 
 
 
 
 
 
49
  elapsed = time.perf_counter() - t0
50
- log.info("ingestion_complete", doc_name=doc.doc_name, elapsed_s=round(elapsed, 1))
51
- results["success"].append(doc.doc_name)
52
  except Exception as e:
53
  elapsed = time.perf_counter() - t0
54
- log.error("ingestion_failed", doc_name=doc.doc_name,
55
  error=str(e), elapsed_s=round(elapsed, 1))
56
- results["failed"].append((doc.doc_name, str(e)))
57
 
58
  log.info("seeding_graph_edges")
59
  try:
 
23
 
24
 
25
  async def ingest_all(jurisdiction_filter: str | None = None, dry_run: bool = False):
26
+ docs = list(DOCUMENT_REGISTRY.values())
27
  if jurisdiction_filter:
28
  docs = [d for d in docs if d.jurisdiction.value == jurisdiction_filter.upper()]
29
  if not docs:
 
34
 
35
  if dry_run:
36
  for doc in docs:
37
+ print(f" [{doc.jurisdiction.value}] {doc.name}")
38
  return
39
 
40
  pipeline = IngestionPipeline()
 
42
 
43
  for i, doc in enumerate(docs, 1):
44
  log.info("ingesting_document", index=i, total=len(docs),
45
+ jurisdiction=doc.jurisdiction.value, doc_name=doc.name)
46
  t0 = time.perf_counter()
47
  try:
48
+ pipeline.ingest_document(
49
+ source_url=doc.url,
50
+ doc_name=doc.name,
51
+ jurisdiction=doc.jurisdiction,
52
+ doc_type=doc.doc_type,
53
+ effective_date=doc.effective_date,
54
+ dest_subdir=doc.dest_subdir,
55
+ filename=doc.filename,
56
+ max_pages=doc.max_pages,
57
+ )
58
  elapsed = time.perf_counter() - t0
59
+ log.info("ingestion_complete", doc_name=doc.name, elapsed_s=round(elapsed, 1))
60
+ results["success"].append(doc.name)
61
  except Exception as e:
62
  elapsed = time.perf_counter() - t0
63
+ log.error("ingestion_failed", doc_name=doc.name,
64
  error=str(e), elapsed_s=round(elapsed, 1))
65
+ results["failed"].append((doc.name, str(e)))
66
 
67
  log.info("seeding_graph_edges")
68
  try: