rafmacalaba committed on
Commit
4a9a5a6
ยท
1 Parent(s): a2c885c

feat: add UNHCR corpus to registry and prepare_unhcr.py script

Browse files

- corpora.json now includes UNHCR entry
- prepare_unhcr.py converts raw_mentions.json to doc_N format
- Generates unhcr_pdf_links.json from raw data

Files changed (1) hide show
  1. prepare_unhcr.py +207 -0
prepare_unhcr.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ prepare_unhcr.py
4
+
5
+ Converts UNHCR raw_mentions.json files into the doc_N/_direct_judged.jsonl format
6
+ and generates unhcr_pdf_links.json.
7
+
8
+ Structure expected:
9
+ annotation_data/unhcr_extractions/<dir_name>/raw/raw_mentions.json
10
+
11
+ Output:
12
+ annotation_data/unhcr_extractions/doc_N/raw/doc_N_direct_judged.jsonl
13
+ annotation_data/unhcr_data/unhcr_pdf_links.json
14
+
15
+ Usage:
16
+ python3 prepare_unhcr.py # Dry run
17
+ python3 prepare_unhcr.py --execute # Actually restructure files
18
+ python3 prepare_unhcr.py --execute --upload # Restructure + upload to HF
19
+ """
20
+
21
+ import argparse
22
+ import json
23
+ import os
24
+ import shutil
25
+ import sys
26
+ from pathlib import Path
27
+
28
# Root holding per-document extraction dirs (original named dirs and doc_N dirs).
UNHCR_DIR = Path(__file__).parent / "annotation_data" / "unhcr_extractions"
# Output location for the generated PDF-link index.
LINKS_DIR = Path(__file__).parent / "annotation_data" / "unhcr_data"
LINKS_FILE = LINKS_DIR / "unhcr_pdf_links.json"
31
+
32
+
33
def scan_raw_dirs():
    """Collect source directories that still contain a raw_mentions.json.

    Directories already converted to the doc_N layout are skipped, and an
    empty list is returned (with a warning) when the corpus root is absent.
    """
    if not UNHCR_DIR.exists():
        print("❌ unhcr_extractions directory not found")
        return []

    # Named (not yet converted) subdirectories, in deterministic order.
    candidates = (
        entry
        for entry in sorted(UNHCR_DIR.iterdir())
        if entry.is_dir() and not entry.name.startswith("doc_")
    )
    return [d for d in candidates if (d / "raw" / "raw_mentions.json").exists()]
50
+
51
+
52
def extract_pdf_url(pages_data):
    """Return the first non-empty ``document.source`` URL in *pages_data*.

    Falls back to None when no page carries a source link.
    """
    return next(
        (
            url
            for page in pages_data
            if (url := page.get("document", {}).get("source"))
        ),
        None,
    )
59
+
60
+
61
def has_datasets(pages_data):
    """Return True if any page carries at least one dataset mention.

    The original guard (``page.get("datasets") and len(...) > 0``) was
    redundant — a non-empty list is already truthy — so ``any()`` expresses
    the same check directly and short-circuits on the first hit.
    """
    return any(page.get("datasets") for page in pages_data)
67
+
68
+
69
def convert_directory(src_dir, doc_index, execute=False):
    """Convert a named extraction directory into the doc_N layout.

    - Reads ``src_dir/raw/raw_mentions.json`` (a list of page objects).
    - When *execute* is True, writes
      ``doc_N/raw/doc_N_direct_judged.jsonl`` under UNHCR_DIR.
    - Returns the link-index entry dict for this document.

    Parameters:
        src_dir: Path to the original named directory.
        doc_index: 1-based index used to build the ``doc_N`` name.
        execute: when False (dry run) nothing is written to disk.
    """
    raw_file = src_dir / "raw" / "raw_mentions.json"
    pages_data = json.loads(raw_file.read_text())

    pdf_url = extract_pdf_url(pages_data)
    has_ds = has_datasets(pages_data)

    target_dir = UNHCR_DIR / f"doc_{doc_index}" / "raw"
    target_file = target_dir / f"doc_{doc_index}_direct_judged.jsonl"

    link_entry = {
        "index": doc_index,
        "original_name": src_dir.name,
        "direct_pdf_url": pdf_url,
        "landing_page_url": pdf_url,  # no separate landing page in raw data
        "status": "success",
        "has_revalidation": True,
        "has_datasets": has_ds,
        "num_pages": len(pages_data),
    }

    if execute:
        target_dir.mkdir(parents=True, exist_ok=True)
        # BUG FIX: the original dumped the whole list as one pretty-printed
        # JSON array, which is not valid JSON Lines despite the .jsonl
        # suffix. Emit one compact JSON object per line instead.
        lines = (json.dumps(page, ensure_ascii=False) for page in pages_data)
        target_file.write_text("\n".join(lines) + "\n")

    return link_entry
101
+
102
+
103
def main():
    """CLI entry point: scan, convert, summarize, and optionally upload.

    Default mode is a dry run; ``--execute`` writes the doc_N files and the
    links index, and ``--upload`` (with --execute) pushes the result to HF.
    """
    parser = argparse.ArgumentParser(description="Prepare UNHCR corpus data")
    parser.add_argument("--execute", action="store_true", help="Actually create files (default: dry run)")
    parser.add_argument("--upload", action="store_true", help="Upload to HF after conversion")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of docs to process (for testing)")
    args = parser.parse_args()

    dirs = scan_raw_dirs()
    print(f"📂 Found {len(dirs)} UNHCR documents with raw_mentions.json")

    if not dirs:
        return

    # BUG FIX: `if args.limit:` treated an explicit `--limit 0` as "no
    # limit"; compare against None so 0 is honoured.
    if args.limit is not None:
        dirs = dirs[: args.limit]
        print(f"⚠️ Limited to {args.limit} docs")

    links = []
    docs_with_datasets = 0

    for position, src_dir in enumerate(dirs, start=1):
        entry = convert_directory(src_dir, position, execute=args.execute)
        links.append(entry)

        if entry["has_datasets"]:
            docs_with_datasets += 1

        # Progress heartbeat for large corpora.
        if position % 100 == 0:
            print(f"  Processed {position}/{len(dirs)}...")

    print("\n📊 Summary:")
    print(f"  Total docs: {len(links)}")
    print(f"  Docs with datasets: {docs_with_datasets}")
    print(f"  Docs without: {len(links) - docs_with_datasets}")

    if args.execute:
        # Persist the per-document link index next to the extractions.
        LINKS_DIR.mkdir(parents=True, exist_ok=True)
        LINKS_FILE.write_text(json.dumps(links, indent=2))
        print(f"\n💾 Saved {LINKS_FILE}")
        print(f"💾 Created {len(links)} doc_N directories in {UNHCR_DIR}")

        # Clean up original dirs (optional — keep for now)
        print("\n⚠️ Original named directories preserved. Remove manually if desired.")

        if args.upload:
            upload_to_hf(links)
    else:
        print(f"\n[DRY RUN] Would create {len(links)} doc_N dirs and unhcr_pdf_links.json")
        print("[DRY RUN] Run with --execute to create files")
        # Show a sample entry so the dry run is inspectable.
        if links:
            print("\nSample link entry:")
            print(json.dumps(links[0], indent=2))
158
+
159
+
160
def upload_to_hf(links):
    """Upload the links file and converted doc files to the HF dataset repo.

    Token resolution order: ``.env`` (HF_TOKEN=... line) next to this script
    first, then the HF_TOKEN environment variable. Prints a message and
    returns early when no token is available or huggingface_hub is missing.
    """
    # FIX: the original wrapped the whole function body in a try/except
    # ImportError; keep the try minimal so only the import is guarded and
    # real failures from the upload calls are not at risk of being masked.
    try:
        from huggingface_hub import HfApi
    except ImportError:
        print("❌ huggingface_hub required: uv pip install huggingface_hub")
        return

    token = None
    env_path = Path(__file__).parent / ".env"
    if env_path.exists():
        for line in env_path.read_text().splitlines():
            if line.startswith("HF_TOKEN="):
                token = line.split("=", 1)[1].strip()

    if not token:
        token = os.environ.get("HF_TOKEN")
    if not token:
        print("❌ No HF_TOKEN found")
        return

    api = HfApi(token=token)
    repo_id = "ai4data/annotation_data"

    # Upload the generated links index.
    api.upload_file(
        path_or_fileobj=str(LINKS_FILE),
        path_in_repo="annotation_data/unhcr_data/unhcr_pdf_links.json",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Add UNHCR PDF links",
    )
    print("✅ Uploaded unhcr_pdf_links.json")

    # Upload only the converted doc_N judged files, not the raw inputs.
    api.upload_folder(
        folder_path=str(UNHCR_DIR),
        path_in_repo="annotation_data/unhcr_extractions",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Add UNHCR extraction data",
        allow_patterns=["doc_*/raw/*_direct_judged.jsonl"],
    )
    print("✅ Uploaded UNHCR extraction files")
204
+
205
+
206
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()