rafmacalaba committed on
Commit
da957b0
·
1 Parent(s): a1705f4

feat: human validation, recursive highlighting, data mentions rename

Browse files

- Add human validation UI (✅/❌) for model-extracted data mentions
- Validation metadata stored inside dataset_name alongside judge_ fields
- Recursive highlighting works inside bold/italic/links
- Rename 'Datasets' to 'Data Mentions' throughout
- Switch input from _raw.json to _direct_judged.jsonl
- Document API reads local files in dev (highlights appear after save)
- Add /api/validate (PUT for validation, DELETE by index)
- Add /api/whoami for HF username auto-fetch
- Floating chevron panel toggle with badge count
- Rename annotation_tag to annotator

.env.example ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Hugging Face token for accessing datasets
2
+ HF_TOKEN=your_hf_token_here
3
+
4
+ # Optional: Override the default HF dataset repo
5
+ # HF_DATASET_REPO=rafmacalaba/wbg_annotation_data
6
+
7
+ # Optional: Number of documents to scan on initial load (default: 5)
8
+ # MAX_DOCS_TO_SCAN=5
Dockerfile CHANGED
@@ -6,9 +6,9 @@ FROM base AS deps
6
  RUN apk add --no-cache libc6-compat
7
  WORKDIR /app
8
 
9
- # Copy package.json and install
10
- COPY package.json ./
11
- RUN npm install
12
 
13
  # Rebuild the source code only when needed
14
  FROM base AS builder
 
6
  RUN apk add --no-cache libc6-compat
7
  WORKDIR /app
8
 
9
+ # Copy package files and install deterministically
10
+ COPY package.json package-lock.json ./
11
+ RUN npm ci
12
 
13
  # Rebuild the source code only when needed
14
  FROM base AS builder
app/api/annotate/route.js CHANGED
@@ -1,28 +1,80 @@
1
- import { saveAnnotation } from '../../../utils/storage.js';
 
2
 
3
  export async function POST(request) {
4
  try {
5
  const body = await request.json();
6
- const { document_index, page_number, selected_text, annotation_note } = body;
 
 
 
 
 
7
 
8
- if (!document_index || page_number === undefined || !selected_text) {
9
- return new Response(JSON.stringify({ error: "Missing required fields" }), { status: 400 });
 
 
 
 
 
 
 
 
10
  }
11
 
12
- const annotation = {
13
- document_index,
14
- page_number,
15
- selected_text,
16
- annotation_note: annotation_note || "",
17
- timestamp: new Date().toISOString()
18
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- await saveAnnotation(annotation);
 
 
 
21
 
22
- return new Response(JSON.stringify({ success: true, annotation }), { status: 200, headers: { 'Content-Type': 'application/json' } });
 
 
 
 
 
23
 
 
 
 
 
 
24
  } catch (error) {
25
- console.error("Annotation save error:", error);
26
- return new Response(JSON.stringify({ error: "Failed to save annotation" }), { status: 500 });
27
  }
28
  }
 
1
+ import { saveAnnotation, deleteAnnotation, updateAnnotation } from '../../../utils/storage.js';
2
+ import { NextResponse } from 'next/server';
3
 
4
  export async function POST(request) {
5
  try {
6
  const body = await request.json();
7
+ const {
8
+ dataset_name,
9
+ dataset_tag,
10
+ document_index,
11
+ page_number,
12
+ } = body;
13
 
14
+ if (
15
+ document_index === undefined || document_index === null ||
16
+ page_number === undefined ||
17
+ !dataset_name?.text ||
18
+ !dataset_tag
19
+ ) {
20
+ return NextResponse.json(
21
+ { error: 'Missing required fields: dataset_name.text, dataset_tag, document_index, page_number' },
22
+ { status: 400 }
23
+ );
24
  }
25
 
26
+ await saveAnnotation(body);
27
+ return NextResponse.json({ success: true });
28
+ } catch (error) {
29
+ console.error('Annotation error:', error);
30
+ return NextResponse.json({ error: 'Failed to save annotation: ' + error.message }, { status: 500 });
31
+ }
32
+ }
33
+
34
+ export async function DELETE(request) {
35
+ try {
36
+ const { searchParams } = new URL(request.url);
37
+ const timestamp = searchParams.get('timestamp');
38
+ const docIndex = parseInt(searchParams.get('doc'), 10);
39
+ const pageNumber = parseInt(searchParams.get('page'), 10);
40
+
41
+ if (!timestamp || isNaN(docIndex) || isNaN(pageNumber)) {
42
+ return NextResponse.json(
43
+ { error: 'Missing timestamp, doc, or page parameter' },
44
+ { status: 400 }
45
+ );
46
+ }
47
+
48
+ const deleted = await deleteAnnotation(timestamp, docIndex, pageNumber);
49
+ if (deleted) {
50
+ return NextResponse.json({ success: true });
51
+ }
52
+ return NextResponse.json({ error: 'Annotation not found' }, { status: 404 });
53
+ } catch (error) {
54
+ console.error('Delete annotation error:', error);
55
+ return NextResponse.json({ error: 'Failed to delete annotation: ' + error.message }, { status: 500 });
56
+ }
57
+ }
58
 
59
+ export async function PUT(request) {
60
+ try {
61
+ const body = await request.json();
62
+ const { timestamp, document_index, page_number, updates } = body;
63
 
64
+ if (!timestamp || document_index === undefined || page_number === undefined || !updates) {
65
+ return NextResponse.json(
66
+ { error: 'Missing timestamp, document_index, page_number, or updates' },
67
+ { status: 400 }
68
+ );
69
+ }
70
 
71
+ const updated = await updateAnnotation(timestamp, document_index, page_number, updates);
72
+ if (updated) {
73
+ return NextResponse.json({ success: true, annotation: updated });
74
+ }
75
+ return NextResponse.json({ error: 'Annotation not found' }, { status: 404 });
76
  } catch (error) {
77
+ console.error('Update annotation error:', error);
78
+ return NextResponse.json({ error: 'Failed to update annotation: ' + error.message }, { status: 500 });
79
  }
80
  }
app/api/annotations/route.js ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { getAnnotations } from '../../../utils/storage.js';
2
+
3
+ export async function GET(request) {
4
+ try {
5
+ const { searchParams } = new URL(request.url);
6
+ const docIndex = searchParams.get('document_index');
7
+
8
+ const filter = docIndex !== null ? parseInt(docIndex, 10) : null;
9
+ const annotations = await getAnnotations(filter);
10
+
11
+ return new Response(JSON.stringify(annotations), {
12
+ status: 200,
13
+ headers: { 'Content-Type': 'application/json' }
14
+ });
15
+ } catch (error) {
16
+ console.error("Error fetching annotations:", error);
17
+ return new Response(
18
+ JSON.stringify({ error: "Failed to fetch annotations" }),
19
+ { status: 500, headers: { 'Content-Type': 'application/json' } }
20
+ );
21
+ }
22
+ }
app/api/document/route.js CHANGED
@@ -1,37 +1,88 @@
 
 
 
 
 
 
 
 
1
  export async function GET(request) {
2
  const { searchParams } = new URL(request.url);
3
  const index = searchParams.get('index');
4
  const page = searchParams.get('page');
5
 
6
- if (!index || !page) {
7
- return new Response(JSON.stringify({ error: "Missing index or page parameter" }), { status: 400 });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  }
9
 
10
  try {
11
- const HF_DATASET_ID = "rafmacalaba/wbg_annotation_data";
12
- const docUrl = `https://huggingface.co/datasets/${HF_DATASET_ID}/raw/main/annotation_data/wbg_extractions/doc_${index}/raw/doc_${index}_raw.json`;
13
 
14
- const res = await fetch(docUrl, {
15
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
16
- });
 
 
 
17
 
18
- if (!res.ok) {
19
- return new Response(JSON.stringify({ error: `doc_${index}_raw.json not found on HF Datasets` }), { status: res.status });
20
- }
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- const pagesData = await res.json();
 
 
 
 
 
 
 
 
23
 
24
- // Find the specific page
25
- const pageData = pagesData.find(p => p.document.pages[0] === parseInt(page));
26
 
27
  if (!pageData) {
28
- return new Response(JSON.stringify({ error: `Page ${page} not found in doc ${index}` }), { status: 404 });
 
 
 
29
  }
30
 
31
- return new Response(JSON.stringify(pageData), { status: 200, headers: { 'Content-Type': 'application/json' } });
32
-
 
 
33
  } catch (error) {
34
  console.error(error);
35
- return new Response(JSON.stringify({ error: "Failed to fetch document page from HF" }), { status: 500 });
 
 
 
36
  }
37
  }
 
1
+ import { HF_DATASET_BASE_URL } from '../../../utils/config.js';
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+
5
+ const isHFSpace = () => {
6
+ return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
7
+ };
8
+
9
  export async function GET(request) {
10
  const { searchParams } = new URL(request.url);
11
  const index = searchParams.get('index');
12
  const page = searchParams.get('page');
13
 
14
+ // Validate required params
15
+ if (index === null || page === null) {
16
+ return new Response(
17
+ JSON.stringify({ error: "Missing index or page parameter" }),
18
+ { status: 400, headers: { 'Content-Type': 'application/json' } }
19
+ );
20
+ }
21
+
22
+ // Validate numeric values
23
+ const indexNum = parseInt(index, 10);
24
+ const pageNum = parseInt(page, 10);
25
+
26
+ if (isNaN(indexNum) || isNaN(pageNum) || indexNum < 0 || pageNum < 0) {
27
+ return new Response(
28
+ JSON.stringify({ error: "index and page must be non-negative integers" }),
29
+ { status: 400, headers: { 'Content-Type': 'application/json' } }
30
+ );
31
  }
32
 
33
  try {
34
+ let pagesData;
 
35
 
36
+ if (isHFSpace()) {
37
+ // Production: fetch from HuggingFace
38
+ const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${indexNum}/raw/doc_${indexNum}_direct_judged.jsonl`;
39
+ const res = await fetch(docUrl, {
40
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
41
+ });
42
 
43
+ if (!res.ok) {
44
+ return new Response(
45
+ JSON.stringify({ error: `doc_${indexNum}_direct_judged.jsonl not found on HF Datasets` }),
46
+ { status: res.status, headers: { 'Content-Type': 'application/json' } }
47
+ );
48
+ }
49
+ pagesData = await res.json();
50
+ } else {
51
+ // Local dev: read from local file (reflects saved annotations immediately)
52
+ const filePath = path.join(
53
+ process.cwd(),
54
+ 'annotation_data', 'wbg_extractions',
55
+ `doc_${indexNum}`, 'raw', `doc_${indexNum}_direct_judged.jsonl`
56
+ );
57
 
58
+ if (!fs.existsSync(filePath)) {
59
+ return new Response(
60
+ JSON.stringify({ error: `doc_${indexNum}_direct_judged.jsonl not found locally` }),
61
+ { status: 404, headers: { 'Content-Type': 'application/json' } }
62
+ );
63
+ }
64
+ const raw = fs.readFileSync(filePath, 'utf-8');
65
+ pagesData = JSON.parse(raw);
66
+ }
67
 
68
+ const pageData = pagesData.find(p => p.document?.pages?.[0] === pageNum);
 
69
 
70
  if (!pageData) {
71
+ return new Response(
72
+ JSON.stringify({ error: `Page ${pageNum} not found in doc ${indexNum}` }),
73
+ { status: 404, headers: { 'Content-Type': 'application/json' } }
74
+ );
75
  }
76
 
77
+ return new Response(JSON.stringify(pageData), {
78
+ status: 200,
79
+ headers: { 'Content-Type': 'application/json' }
80
+ });
81
  } catch (error) {
82
  console.error(error);
83
+ return new Response(
84
+ JSON.stringify({ error: "Failed to fetch document page" }),
85
+ { status: 500, headers: { 'Content-Type': 'application/json' } }
86
+ );
87
  }
88
  }
app/api/documents/route.js CHANGED
@@ -1,67 +1,58 @@
 
 
1
  export async function GET() {
2
  try {
3
- const HF_DATASET_ID = "rafmacalaba/wbg_annotation_data";
4
-
5
- // Fetch the index file from HF Datasets raw URL
6
- const linksUrl = `https://huggingface.co/datasets/${HF_DATASET_ID}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
7
  const linksRes = await fetch(linksUrl, {
8
  headers: {
9
  'Authorization': `Bearer ${process.env.HF_TOKEN}`
10
  },
11
- next: { revalidate: 3600 } // Cache for an hour to not spam HF
12
  });
13
 
14
  if (!linksRes.ok) {
15
  console.error("Failed to fetch links JSON", await linksRes.text());
16
- return new Response(JSON.stringify({ error: "Missing wbg_pdf_links.json on HF Datasets" }), { status: 404 });
 
 
 
17
  }
18
 
19
  const links = await linksRes.json();
20
- const documents = [];
21
-
22
- // Because scanning 1220 external HTTP JSON files concurrently on every page load would be extremely slow
23
- // and easily hit rate limits, we will implement a simplified approach.
24
- // For a production app, we would pre-compute this list and upload it as a manifest.
25
- // For this MVP, we will only scan the first 5 "success" links to populate the dropdown quickly,
26
- // assuming those are the priority documents to annotate.
27
 
28
- const maxDocsToScan = 5;
29
- let scanCount = 0;
30
 
31
- for (const link of links) {
32
- if (link.status === 'success') {
33
- scanCount++;
 
 
 
 
34
 
35
- try {
36
- const docUrl = `https://huggingface.co/datasets/${HF_DATASET_ID}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_raw.json`;
37
- const docRes = await fetch(docUrl, {
38
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
39
- });
40
 
41
- if (docRes.ok) {
42
- const pagesData = await docRes.json();
 
 
43
 
44
- // Find pages with non-empty datasets
45
- const annotatablePages = pagesData
46
- .filter(page => page.datasets && page.datasets.length > 0)
47
- .map(page => page.document.pages[0]);
48
 
49
- if (annotatablePages.length > 0) {
50
- documents.push({
51
- index: link.index,
52
- pdf_url: link.direct_pdf_url,
53
- landing_page: link.landing_page_url,
54
- annotatable_pages: annotatablePages
55
- });
56
- }
57
- }
58
- } catch (e) {
59
- console.error(`Failed to scan doc ${link.index} from HF`, e);
60
- }
61
 
62
- if (scanCount >= maxDocsToScan) break;
63
- }
64
- }
65
 
66
  return new Response(JSON.stringify(documents), {
67
  status: 200,
@@ -72,6 +63,9 @@ export async function GET() {
72
  });
73
  } catch (error) {
74
  console.error(error);
75
- return new Response(JSON.stringify({ error: "Failed to fetch documents from HF" }), { status: 500 });
 
 
 
76
  }
77
  }
 
1
+ import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
2
+
3
  export async function GET() {
4
  try {
5
+ // Fetch the index file from HF Datasets
6
+ const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
 
 
7
  const linksRes = await fetch(linksUrl, {
8
  headers: {
9
  'Authorization': `Bearer ${process.env.HF_TOKEN}`
10
  },
11
+ next: { revalidate: 3600 }
12
  });
13
 
14
  if (!linksRes.ok) {
15
  console.error("Failed to fetch links JSON", await linksRes.text());
16
+ return new Response(
17
+ JSON.stringify({ error: "Missing wbg_pdf_links.json on HF Datasets" }),
18
+ { status: 404, headers: { 'Content-Type': 'application/json' } }
19
+ );
20
  }
21
 
22
  const links = await linksRes.json();
 
 
 
 
 
 
 
23
 
24
+ // Filter to successful links and take the first N
25
+ const successLinks = links.filter(l => l.status === 'success').slice(0, MAX_DOCS_TO_SCAN);
26
 
27
+ // Parallel fetch much faster than sequential scanning
28
+ const results = await Promise.allSettled(
29
+ successLinks.map(async (link) => {
30
+ const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_direct_judged.jsonl`;
31
+ const docRes = await fetch(docUrl, {
32
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
33
+ });
34
 
35
+ if (!docRes.ok) return null;
 
 
 
 
36
 
37
+ const pagesData = await docRes.json();
38
+ const annotatablePages = pagesData
39
+ .filter(page => page.datasets && page.datasets.length > 0)
40
+ .map(page => page.document.pages[0]);
41
 
42
+ if (annotatablePages.length === 0) return null;
 
 
 
43
 
44
+ return {
45
+ index: link.index,
46
+ pdf_url: link.direct_pdf_url,
47
+ landing_page: link.landing_page_url,
48
+ annotatable_pages: annotatablePages
49
+ };
50
+ })
51
+ );
 
 
 
 
52
 
53
+ const documents = results
54
+ .filter(r => r.status === 'fulfilled' && r.value !== null)
55
+ .map(r => r.value);
56
 
57
  return new Response(JSON.stringify(documents), {
58
  status: 200,
 
63
  });
64
  } catch (error) {
65
  console.error(error);
66
+ return new Response(
67
+ JSON.stringify({ error: "Failed to fetch documents from HF" }),
68
+ { status: 500, headers: { 'Content-Type': 'application/json' } }
69
+ );
70
  }
71
  }
app/api/health/route.js ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ export async function GET() {
2
+ return new Response(
3
+ JSON.stringify({
4
+ status: 'ok',
5
+ timestamp: new Date().toISOString(),
6
+ environment: process.env.NODE_ENV || 'unknown'
7
+ }),
8
+ { status: 200, headers: { 'Content-Type': 'application/json' } }
9
+ );
10
+ }
app/api/validate/route.js ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { NextResponse } from 'next/server';
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+ import { commit } from '@huggingface/hub';
5
+ import { HF_DATASET_ID, HF_DATASET_BASE_URL } from '../../../utils/config.js';
6
+
7
+ const isHFSpace = () => process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
8
+
9
+ function getDocFilePath(docIndex) {
10
+ return path.join(
11
+ process.cwd(),
12
+ 'annotation_data', 'wbg_extractions',
13
+ `doc_${docIndex}`, 'raw', `doc_${docIndex}_direct_judged.jsonl`
14
+ );
15
+ }
16
+
17
+ function getDocRepoPath(docIndex) {
18
+ return `annotation_data/wbg_extractions/doc_${docIndex}/raw/doc_${docIndex}_direct_judged.jsonl`;
19
+ }
20
+
21
+ /**
22
+ * PUT /api/validate
23
+ * Updates a specific dataset entry within a page by its array index.
24
+ * Body: { document_index, page_number, dataset_index, updates }
25
+ */
26
+ export async function PUT(request) {
27
+ try {
28
+ const { document_index, page_number, dataset_index, updates } = await request.json();
29
+
30
+ if (document_index == null || page_number == null || dataset_index == null || !updates) {
31
+ return NextResponse.json(
32
+ { error: 'Missing document_index, page_number, dataset_index, or updates' },
33
+ { status: 400 }
34
+ );
35
+ }
36
+
37
+ let pagesData;
38
+
39
+ if (isHFSpace()) {
40
+ const url = `${HF_DATASET_BASE_URL}/raw/main/${getDocRepoPath(document_index)}`;
41
+ const res = await fetch(url, {
42
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
43
+ });
44
+ if (!res.ok) {
45
+ return NextResponse.json({ error: 'Document not found on HF' }, { status: 404 });
46
+ }
47
+ pagesData = await res.json();
48
+ } else {
49
+ const filePath = getDocFilePath(document_index);
50
+ if (!fs.existsSync(filePath)) {
51
+ return NextResponse.json({ error: 'Document not found locally' }, { status: 404 });
52
+ }
53
+ pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
54
+ }
55
+
56
+ // Find the page
57
+ const pageIdx = pagesData.findIndex(p => p.document?.pages?.[0] === page_number);
58
+ if (pageIdx === -1) {
59
+ return NextResponse.json({ error: `Page ${page_number} not found` }, { status: 404 });
60
+ }
61
+
62
+ const datasets = pagesData[pageIdx].datasets || [];
63
+ if (dataset_index < 0 || dataset_index >= datasets.length) {
64
+ return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
65
+ }
66
+
67
+ // Merge updates into dataset_name (same level as judge_ fields)
68
+ const currentEntry = pagesData[pageIdx].datasets[dataset_index];
69
+ pagesData[pageIdx].datasets[dataset_index] = {
70
+ ...currentEntry,
71
+ dataset_name: {
72
+ ...currentEntry.dataset_name,
73
+ ...updates,
74
+ },
75
+ };
76
+
77
+ // Save back
78
+ if (isHFSpace()) {
79
+ const token = process.env.HF_TOKEN;
80
+ const repoPath = getDocRepoPath(document_index);
81
+ const content = JSON.stringify(pagesData, null, 2);
82
+ await commit({
83
+ repo: { type: 'dataset', name: HF_DATASET_ID },
84
+ credentials: { accessToken: token },
85
+ title: `Validate dataset in doc_${document_index} page ${page_number}`,
86
+ operations: [{
87
+ operation: 'addOrUpdate',
88
+ path: repoPath,
89
+ content: new Blob([content], { type: 'application/json' }),
90
+ }],
91
+ });
92
+ } else {
93
+ const filePath = getDocFilePath(document_index);
94
+ fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
95
+ }
96
+
97
+ return NextResponse.json({
98
+ success: true,
99
+ dataset: pagesData[pageIdx].datasets[dataset_index],
100
+ });
101
+ } catch (error) {
102
+ console.error('Validate error:', error);
103
+ return NextResponse.json({ error: 'Failed to validate: ' + error.message }, { status: 500 });
104
+ }
105
+ }
106
+
107
+ /**
108
+ * DELETE /api/validate?doc=X&page=Y&idx=Z
109
+ * Removes a dataset entry by its array index.
110
+ */
111
+ export async function DELETE(request) {
112
+ try {
113
+ const { searchParams } = new URL(request.url);
114
+ const document_index = parseInt(searchParams.get('doc'), 10);
115
+ const page_number = parseInt(searchParams.get('page'), 10);
116
+ const dataset_index = parseInt(searchParams.get('idx'), 10);
117
+
118
+ if (isNaN(document_index) || isNaN(page_number) || isNaN(dataset_index)) {
119
+ return NextResponse.json(
120
+ { error: 'Missing doc, page, or idx parameter' },
121
+ { status: 400 }
122
+ );
123
+ }
124
+
125
+ let pagesData;
126
+
127
+ if (isHFSpace()) {
128
+ const url = `${HF_DATASET_BASE_URL}/raw/main/${getDocRepoPath(document_index)}`;
129
+ const res = await fetch(url, {
130
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
131
+ });
132
+ if (!res.ok) {
133
+ return NextResponse.json({ error: 'Document not found on HF' }, { status: 404 });
134
+ }
135
+ pagesData = await res.json();
136
+ } else {
137
+ const filePath = getDocFilePath(document_index);
138
+ if (!fs.existsSync(filePath)) {
139
+ return NextResponse.json({ error: 'Document not found locally' }, { status: 404 });
140
+ }
141
+ pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
142
+ }
143
+
144
+ const pageIdx = pagesData.findIndex(p => p.document?.pages?.[0] === page_number);
145
+ if (pageIdx === -1) {
146
+ return NextResponse.json({ error: `Page ${page_number} not found` }, { status: 404 });
147
+ }
148
+
149
+ const datasets = pagesData[pageIdx].datasets || [];
150
+ if (dataset_index < 0 || dataset_index >= datasets.length) {
151
+ return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
152
+ }
153
+
154
+ // Remove the entry
155
+ pagesData[pageIdx].datasets.splice(dataset_index, 1);
156
+
157
+ // Save back
158
+ if (isHFSpace()) {
159
+ const token = process.env.HF_TOKEN;
160
+ const repoPath = getDocRepoPath(document_index);
161
+ const content = JSON.stringify(pagesData, null, 2);
162
+ await commit({
163
+ repo: { type: 'dataset', name: HF_DATASET_ID },
164
+ credentials: { accessToken: token },
165
+ title: `Delete dataset from doc_${document_index} page ${page_number}`,
166
+ operations: [{
167
+ operation: 'addOrUpdate',
168
+ path: repoPath,
169
+ content: new Blob([content], { type: 'application/json' }),
170
+ }],
171
+ });
172
+ } else {
173
+ const filePath = getDocFilePath(document_index);
174
+ fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
175
+ }
176
+
177
+ return NextResponse.json({ success: true });
178
+ } catch (error) {
179
+ console.error('Delete error:', error);
180
+ return NextResponse.json({ error: 'Failed to delete: ' + error.message }, { status: 500 });
181
+ }
182
+ }
app/api/whoami/route.js ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { NextResponse } from 'next/server';
2
+
3
+ /**
4
+ * Returns the HF username for the current token.
5
+ * Used to auto-fill the annotator name in production.
6
+ */
7
+ export async function GET() {
8
+ const token = process.env.HF_TOKEN;
9
+ if (!token) {
10
+ return NextResponse.json({ username: null }, { status: 200 });
11
+ }
12
+
13
+ try {
14
+ const res = await fetch('https://huggingface.co/api/whoami-v2', {
15
+ headers: { 'Authorization': `Bearer ${token}` },
16
+ });
17
+
18
+ if (!res.ok) {
19
+ return NextResponse.json({ username: null }, { status: 200 });
20
+ }
21
+
22
+ const data = await res.json();
23
+ return NextResponse.json({ username: data.name || null });
24
+ } catch (error) {
25
+ console.error('HF whoami error:', error);
26
+ return NextResponse.json({ username: null }, { status: 200 });
27
+ }
28
+ }
app/components/AnnotationModal.js ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client";
2
+
3
+ import { useState, useEffect, useRef } from 'react';
4
+
5
+ const DATASET_TAGS = [
6
+ { value: 'named', label: 'Named Dataset', description: 'A specific, named dataset (e.g. "2022 national census")' },
7
+ { value: 'descriptive', label: 'Descriptive', description: 'A described but not formally named dataset' },
8
+ { value: 'vague', label: 'Vague', description: 'An unclear or ambiguous data reference' },
9
+ ];
10
+
11
+ export default function AnnotationModal({
12
+ isOpen,
13
+ selectedText,
14
+ annotatorName,
15
+ onAnnotatorChange,
16
+ onSubmit,
17
+ onClose,
18
+ }) {
19
+ const [datasetTag, setDatasetTag] = useState('named');
20
+ const [saving, setSaving] = useState(false);
21
+ const noteRef = useRef(null);
22
+
23
+ useEffect(() => {
24
+ if (isOpen && noteRef.current) {
25
+ noteRef.current.focus();
26
+ }
27
+ }, [isOpen]);
28
+
29
+ useEffect(() => {
30
+ const handleEsc = (e) => { if (e.key === 'Escape') onClose(); };
31
+ if (isOpen) window.addEventListener('keydown', handleEsc);
32
+ return () => window.removeEventListener('keydown', handleEsc);
33
+ }, [isOpen, onClose]);
34
+
35
+ if (!isOpen) return null;
36
+
37
+ const handleSubmit = async () => {
38
+ setSaving(true);
39
+ await onSubmit({ dataset_tag: datasetTag });
40
+ setSaving(false);
41
+ setDatasetTag('named');
42
+ };
43
+
44
+ return (
45
+ <div className="modal-overlay" onClick={(e) => { if (e.target === e.currentTarget) onClose(); }}>
46
+ <div className="modal-content">
47
+ <div className="modal-header">
48
+ <h3>Add Annotation</h3>
49
+ <button className="modal-close" onClick={onClose}>&times;</button>
50
+ </div>
51
+
52
+ <div className="modal-body">
53
+ {/* Selected text preview */}
54
+ <div className="form-group">
55
+ <label>Selected Text</label>
56
+ <div className="selected-text-preview">"{selectedText}"</div>
57
+ </div>
58
+
59
+ {/* Dataset tag dropdown */}
60
+ <div className="form-group">
61
+ <label htmlFor="dataset-tag">Dataset Tag</label>
62
+ <select
63
+ id="dataset-tag"
64
+ className="form-select"
65
+ value={datasetTag}
66
+ onChange={(e) => setDatasetTag(e.target.value)}
67
+ >
68
+ {DATASET_TAGS.map(tag => (
69
+ <option key={tag.value} value={tag.value}>
70
+ {tag.label}
71
+ </option>
72
+ ))}
73
+ </select>
74
+ <p className="form-help">
75
+ {DATASET_TAGS.find(t => t.value === datasetTag)?.description}
76
+ </p>
77
+ </div>
78
+
79
+ {/* Annotator name */}
80
+ <div className="form-group">
81
+ <label htmlFor="annotator-name">Your Name</label>
82
+ <input
83
+ id="annotator-name"
84
+ type="text"
85
+ className="form-input"
86
+ placeholder="Enter your name"
87
+ value={annotatorName}
88
+ onChange={(e) => onAnnotatorChange(e.target.value)}
89
+ />
90
+ </div>
91
+ </div>
92
+
93
+ <div className="modal-footer">
94
+ <button className="btn btn-secondary" onClick={onClose} disabled={saving}>
95
+ Cancel
96
+ </button>
97
+ <button className="btn btn-primary" onClick={handleSubmit} disabled={saving}>
98
+ {saving ? 'Saving...' : 'Save Annotation'}
99
+ </button>
100
+ </div>
101
+ </div>
102
+ </div>
103
+ );
104
+ }
app/components/AnnotationPanel.js ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client";
2
+
3
+ import { useState } from 'react';
4
+
5
+ const TAG_STYLES = {
6
+ named: { color: '#10b981', bg: '#10b98120', label: 'Named' },
7
+ descriptive: { color: '#f59e0b', bg: '#f59e0b20', label: 'Descriptive' },
8
+ vague: { color: '#a78bfa', bg: '#a78bfa20', label: 'Vague' },
9
+ 'non-dataset': { color: '#64748b', bg: '#64748b20', label: 'Non-Dataset' },
10
+ };
11
+
12
+ const TAG_OPTIONS = ['named', 'descriptive', 'vague', 'non-dataset'];
13
+
14
+ export default function AnnotationPanel({
15
+ isOpen,
16
+ onClose,
17
+ datasets, // ALL datasets on current page (model + human)
18
+ annotatorName, // current user's name
19
+ onValidate, // (datasetIdx, updates) => void
20
+ onDelete,
21
+ }) {
22
+ const [validatingIdx, setValidatingIdx] = useState(null);
23
+ const [validationNotes, setValidationNotes] = useState('');
24
+ const [editingTagIdx, setEditingTagIdx] = useState(null);
25
+ const [editTag, setEditTag] = useState('');
26
+ const [confirmDelete, setConfirmDelete] = useState(null);
27
+
28
+ const startValidation = (idx, prefillNotes = '') => {
29
+ setValidatingIdx(idx);
30
+ setValidationNotes(prefillNotes);
31
+ };
32
+
33
+ const submitValidation = (ds, idx, verdict) => {
34
+ onValidate(idx, {
35
+ human_validated: true,
36
+ human_verdict: verdict,
37
+ human_notes: validationNotes.trim() || null,
38
+ annotator: annotatorName || 'user',
39
+ validated_at: new Date().toISOString(),
40
+ });
41
+ setValidatingIdx(null);
42
+ setValidationNotes('');
43
+ };
44
+
45
+ const startEditTag = (idx, currentTag) => {
46
+ setEditingTagIdx(idx);
47
+ setEditTag(currentTag);
48
+ };
49
+
50
+ const saveEditTag = (ds, idx) => {
51
+ onValidate(idx, { dataset_tag: editTag });
52
+ setEditingTagIdx(null);
53
+ setEditTag('');
54
+ };
55
+
56
+ const handleDelete = (ds, idx) => {
57
+ if (confirmDelete === idx) {
58
+ onDelete(ds, idx);
59
+ setConfirmDelete(null);
60
+ } else {
61
+ setConfirmDelete(idx);
62
+ setTimeout(() => setConfirmDelete(prev => prev === idx ? null : prev), 3000);
63
+ }
64
+ };
65
+
66
+ return (
67
+ <>
68
+ {isOpen && <div className="panel-backdrop" onClick={onClose} />}
69
+
70
+ <div className={`annotation-panel ${isOpen ? 'open' : ''}`}>
71
+ <div className="panel-header">
72
+ <h3>Data Mentions</h3>
73
+ <span className="panel-count">{datasets.length}</span>
74
+ <button className="panel-close" onClick={onClose}>&times;</button>
75
+ </div>
76
+
77
+ <div className="panel-body">
78
+ {datasets.length === 0 ? (
79
+ <div className="panel-empty">
80
+ <p>No datasets detected on this page.</p>
81
+ </div>
82
+ ) : (
83
+ datasets.map((ds, i) => {
84
+ const text = ds.dataset_name?.text || '';
85
+ const tag = ds.dataset_tag || 'named';
86
+ const style = TAG_STYLES[tag] || TAG_STYLES.named;
87
+ const isHuman = !!ds.annotator;
88
+ const isValidated = ds.dataset_name?.human_validated;
89
+ const humanVerdict = ds.dataset_name?.human_verdict;
90
+ const humanNotes = ds.dataset_name?.human_notes;
91
+ const validatedBy = ds.dataset_name?.annotator;
92
+ const judgeVerdict = ds.dataset_name?.judge_verdict;
93
+ const judgeTag = ds.dataset_name?.judge_tag;
94
+ const isValidating = validatingIdx === i;
95
+ const isEditingTag = editingTagIdx === i;
96
+
97
+ return (
98
+ <div
99
+ key={`${text}-${ds.dataset_name?.start}-${i}`}
100
+ className={`panel-annotation-card ${isValidated ? (humanVerdict ? 'validated-correct' : 'validated-wrong') : ''}`}
101
+ >
102
+ {/* Top row: tag + source */}
103
+ <div className="panel-card-top">
104
+ {isEditingTag ? (
105
+ <div className="inline-edit">
106
+ <select
107
+ className="form-select-small"
108
+ value={editTag}
109
+ onChange={(e) => setEditTag(e.target.value)}
110
+ >
111
+ {TAG_OPTIONS.map(t => (
112
+ <option key={t} value={t}>
113
+ {TAG_STYLES[t]?.label || t}
114
+ </option>
115
+ ))}
116
+ </select>
117
+ <button className="btn-panel save" onClick={() => saveEditTag(ds, i)}>✓</button>
118
+ <button className="btn-panel" onClick={() => setEditingTagIdx(null)}>✕</button>
119
+ </div>
120
+ ) : (
121
+ <span
122
+ className="annotation-tag-badge clickable"
123
+ style={{ color: style.color, backgroundColor: style.bg }}
124
+ onClick={() => startEditTag(i, tag)}
125
+ title="Click to change tag"
126
+ >
127
+ {style.label}
128
+ </span>
129
+ )}
130
+
131
+ <span className="panel-card-source">
132
+ {isHuman ? `👤 ${ds.annotator}` : '🤖 model'}
133
+ </span>
134
+ </div>
135
+
136
+ {/* Dataset text */}
137
+ <p className="panel-card-text">"{text}"</p>
138
+
139
+ {/* Judge info (for model extractions) */}
140
+ {judgeTag && (
141
+ <div className="panel-card-judge">
142
+ <span className={`judge-verdict ${judgeVerdict ? 'correct' : 'wrong'}`}>
143
+ Judge: {judgeVerdict ? '✓' : '✕'}
144
+ </span>
145
+ <span className="judge-tag">{judgeTag}</span>
146
+ </div>
147
+ )}
148
+
149
+ {/* Position info */}
150
+ {ds.dataset_name?.start != null && (
151
+ <span className="panel-card-position">
152
+ chars {ds.dataset_name.start}–{ds.dataset_name.end}
153
+ </span>
154
+ )}
155
+
156
+ {/* Existing validation status */}
157
+ {isValidated && (
158
+ <div className={`validation-status ${humanVerdict ? 'correct' : 'wrong'}`}>
159
+ {humanVerdict ? '✅ Validated correct' : '❌ Marked incorrect'}
160
+ <span className="validation-by"> by {validatedBy}</span>
161
+ {humanNotes && (
162
+ <p className="validation-notes">Note: {humanNotes}</p>
163
+ )}
164
+ </div>
165
+ )}
166
+
167
+ {/* Validation UI */}
168
+ {isValidating ? (
169
+ <div className="validation-form">
170
+ <textarea
171
+ className="validation-notes-input"
172
+ placeholder="Optional notes..."
173
+ value={validationNotes}
174
+ onChange={(e) => setValidationNotes(e.target.value)}
175
+ rows={2}
176
+ />
177
+ <div className="validation-buttons">
178
+ <button
179
+ className="btn-panel correct"
180
+ onClick={() => submitValidation(ds, i, true)}
181
+ >
182
+ ✅ Correct
183
+ </button>
184
+ <button
185
+ className="btn-panel wrong"
186
+ onClick={() => submitValidation(ds, i, false)}
187
+ >
188
+ ❌ Wrong
189
+ </button>
190
+ <button
191
+ className="btn-panel"
192
+ onClick={() => setValidatingIdx(null)}
193
+ >
194
+ Cancel
195
+ </button>
196
+ </div>
197
+ </div>
198
+ ) : (
199
+ <div className="panel-card-actions">
200
+ <button
201
+ className="btn-panel validate"
202
+ onClick={() => startValidation(i, humanNotes || '')}
203
+ >
204
+ {isValidated ? '🔄 Re-validate' : '🏷️ Validate'}
205
+ </button>
206
+ {isHuman && (
207
+ <button
208
+ className={`btn-panel delete ${confirmDelete === i ? 'confirming' : ''}`}
209
+ onClick={() => handleDelete(ds, i)}
210
+ >
211
+ {confirmDelete === i ? '⚠ Confirm?' : '🗑 Delete'}
212
+ </button>
213
+ )}
214
+ </div>
215
+ )}
216
+ </div>
217
+ );
218
+ })
219
+ )}
220
+ </div>
221
+ </div>
222
+ </>
223
+ );
224
+ }
app/components/AnnotationsList.js ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client";
2
+
3
+ const TAG_STYLES = {
4
+ named: { color: '#10b981', bg: '#10b98120' },
5
+ descriptive: { color: '#f59e0b', bg: '#f59e0b20' },
6
+ vague: { color: '#a78bfa', bg: '#a78bfa20' },
7
+ 'non-dataset': { color: '#64748b', bg: '#64748b20' },
8
+ };
9
+
10
+ export default function AnnotationsList({ annotations }) {
11
+ if (!annotations || annotations.length === 0) return null;
12
+
13
+ return (
14
+ <div className="annotations-list">
15
+ <h3>Annotations ({annotations.length})</h3>
16
+ <ul>
17
+ {annotations.map((a, i) => {
18
+ const text = a.dataset_name?.text || a.selected_text || '';
19
+ const tag = a.dataset_tag || 'named';
20
+ const author = a.annotation_tag || a.annotator_name || 'user';
21
+ const style = TAG_STYLES[tag] || TAG_STYLES.named;
22
+
23
+ return (
24
+ <li key={`${text}-${i}`}>
25
+ <div className="annotation-meta">
26
+ <span className="annotation-location">
27
+ Doc {a.document_index} / Pg {a.page_number}
28
+ </span>
29
+ <span
30
+ className="annotation-tag-badge"
31
+ style={{ color: style.color, backgroundColor: style.bg }}
32
+ >
33
+ {tag}
34
+ </span>
35
+ </div>
36
+ <p className="annotation-text">
37
+ <strong>Dataset:</strong> "{text}"
38
+ </p>
39
+ <small>
40
+ by {author} · {a.timestamp ? new Date(a.timestamp).toLocaleString() : ''}
41
+ </small>
42
+ </li>
43
+ );
44
+ })}
45
+ </ul>
46
+ </div>
47
+ );
48
+ }
app/components/DocumentSelector.js ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client";
2
+
3
+ export default function DocumentSelector({
4
+ documents,
5
+ selectedDocIndex,
6
+ onDocChange,
7
+ }) {
8
+ return (
9
+ <div className="navigation-controls">
10
+ <div className="select-group">
11
+ <label htmlFor="doc-select">Document</label>
12
+ <select
13
+ id="doc-select"
14
+ value={selectedDocIndex ?? ''}
15
+ onChange={(e) => {
16
+ const docIdx = parseInt(e.target.value, 10);
17
+ onDocChange(docIdx);
18
+ }}
19
+ >
20
+ {documents.map(doc => (
21
+ <option key={doc.index} value={doc.index}>
22
+ Document {doc.index} ({doc.annotatable_pages.length} pages)
23
+ </option>
24
+ ))}
25
+ </select>
26
+ </div>
27
+ </div>
28
+ );
29
+ }
app/components/MarkdownAnnotator.js ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client";
2
+
3
+ import ReactMarkdown from 'react-markdown';
4
+ import remarkGfm from 'remark-gfm';
5
+ import React from 'react';
6
+
7
// Color mapping for dataset tags
const TAG_COLORS = {
  named: { bg: '#10b98130', border: '#10b981', text: '#34d399', label: 'Named Dataset' },
  descriptive: { bg: '#f59e0b30', border: '#f59e0b', text: '#fbbf24', label: 'Descriptive' },
  vague: { bg: '#a78bfa30', border: '#a78bfa', text: '#c4b5fd', label: 'Vague' },
  'non-dataset': { bg: '#64748b30', border: '#64748b', text: '#94a3b8', label: 'Non-Dataset' },
};

// Characters that must be replaced before text is placed into HTML content
// or inside a double-quoted attribute value.
const HTML_ESCAPES = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#39;',
};

/** Escapes a value so it is inert when inserted into an HTML string. */
function escapeHtml(value) {
  return String(value).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);
}

/**
 * Highlights all dataset mentions within a piece of plain text.
 *
 * The result is an HTML string intended for `dangerouslySetInnerHTML`, so every
 * piece of page text (inside and outside the <mark> tags, and in attributes)
 * is HTML-escaped.
 *
 * @param {string} text - Plain text to scan.
 * @param {Array<object>} datasets - Mentions; `dataset_name.text` is the string
 *   to find and `dataset_tag` selects the colour (defaults to 'non-dataset').
 * @returns {string} `text` itself (unchanged, same value) when nothing is
 *   highlighted, so callers can detect "no change" with `!==`; otherwise an
 *   HTML string with each occurrence wrapped in a <mark> element.
 */
function highlightDatasets(text, datasets) {
  if (!datasets || datasets.length === 0 || !text) return text;

  // Matching is case-insensitive, so de-duplicate on the lower-cased name.
  // The first mention of a name decides its tag. A Map (not a plain object)
  // keeps text such as "constructor" from resolving to inherited properties.
  const tagByName = new Map();
  const names = [];
  for (const ds of datasets) {
    const name = ds?.dataset_name?.text;
    if (!name) continue;
    const key = name.toLowerCase();
    if (tagByName.has(key)) continue;
    tagByName.set(key, ds.dataset_tag || 'non-dataset');
    names.push(name);
  }

  if (names.length === 0) return text;

  // Longest first so the alternation prefers the longer of two overlapping names.
  names.sort((a, b) => b.length - a.length);

  // Build regex that matches any of the dataset names
  const escaped = names.map((name) => name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
  const pattern = new RegExp(`(${escaped.join('|')})`, 'gi');

  // Splitting on a single capturing group yields
  // [plain, match, plain, match, ...]: odd indexes are always the matches.
  const parts = text.split(pattern);
  if (parts.length === 1) return text;

  return parts.map((part, i) => {
    if (i % 2 === 0) return escapeHtml(part);

    // The regex already matched, so fall back to the neutral tag if the
    // lower-cased lookup misses (possible with unusual Unicode case mappings).
    const tag = tagByName.get(part.toLowerCase()) ?? 'non-dataset';
    const colors = TAG_COLORS[tag] || TAG_COLORS['non-dataset'];
    const safePart = escapeHtml(part);
    return `<mark data-tag="${escapeHtml(tag)}" style="background-color:${colors.bg};border-bottom:2px solid ${colors.border};color:${colors.text};padding:1px 3px;border-radius:3px;cursor:pointer;" title="[${colors.label}] ${safePart}">${safePart}</mark>`;
  }).join('');
}
61
+
62
+ export default function MarkdownAnnotator({ selectedDocIndex, selectedPage, currentPageData, loadingPage, onAnnotate, onTogglePanel, annotationCount }) {
63
+ const handleAnnotateClick = () => {
64
+ const selection = window.getSelection();
65
+ if (selection && selection.toString().trim() !== "" && selection.rangeCount > 0) {
66
+ const text = selection.toString().trim();
67
+
68
+ // Compute the character offset of the selection start within the
69
+ // .markdown-preview container. This lets us disambiguate when the
70
+ // same text appears multiple times on the page.
71
+ let selectionOffset = 0;
72
+ const container = document.querySelector('.markdown-preview');
73
+ if (container) {
74
+ try {
75
+ const range = selection.getRangeAt(0);
76
+ const preCaretRange = document.createRange();
77
+ preCaretRange.setStart(container, 0);
78
+ preCaretRange.setEnd(range.startContainer, range.startOffset);
79
+ selectionOffset = preCaretRange.toString().length;
80
+ } catch (e) {
81
+ // Fallback: offset 0 (will just use first occurrence)
82
+ selectionOffset = 0;
83
+ }
84
+ }
85
+
86
+ onAnnotate(text, selectionOffset);
87
+ } else {
88
+ const btn = document.getElementById('annotate-btn');
89
+ if (btn) {
90
+ btn.classList.add('shake');
91
+ setTimeout(() => btn.classList.remove('shake'), 500);
92
+ }
93
+ }
94
+ };
95
+
96
+ // Pre-process the markdown text to inject highlight markers
97
+ const datasets = currentPageData?.datasets || [];
98
+ const rawText = currentPageData?.input_text || "";
99
+ const highlightedText = highlightDatasets(rawText, datasets);
100
+
101
+ // Recursive helper: processes children at any depth so text inside
102
+ // <strong>, <em>, <a>, etc. also gets highlighted.
103
+ const processChildren = (children) =>
104
+ React.Children.map(children, child => {
105
+ if (typeof child === 'string') {
106
+ const highlighted = highlightDatasets(child, datasets);
107
+ if (highlighted !== child) {
108
+ return <span dangerouslySetInnerHTML={{ __html: highlighted }} />;
109
+ }
110
+ return child;
111
+ }
112
+ // If it's a React element with children, recurse into it
113
+ if (React.isValidElement(child) && child.props?.children) {
114
+ return React.cloneElement(child, {}, processChildren(child.props.children));
115
+ }
116
+ return child;
117
+ });
118
+
119
+ // Build component overrides for all block-level and inline elements
120
+ const highlightWrapper = (Tag) => ({ children, ...props }) => (
121
+ <Tag {...props}>{processChildren(children)}</Tag>
122
+ );
123
+
124
+ const highlightComponents = {
125
+ p: highlightWrapper('p'),
126
+ li: highlightWrapper('li'),
127
+ td: highlightWrapper('td'),
128
+ th: highlightWrapper('th'),
129
+ };
130
+
131
+ return (
132
+ <div className="annotator-container">
133
+ <div className="annotator-header">
134
+ <h2>Markdown Annotation</h2>
135
+ <button
136
+ id="annotate-btn"
137
+ onClick={handleAnnotateClick}
138
+ className="btn btn-primary"
139
+ title="Select text below, then click to annotate"
140
+ >
141
+ ✍️ Annotate Selection
142
+ </button>
143
+ </div>
144
+
145
+ {/* Dataset legend */}
146
+ {datasets.length > 0 && (
147
+ <div className="dataset-legend">
148
+ {Object.entries(TAG_COLORS).map(([tag, colors]) => {
149
+ const count = datasets.filter(ds => ds.dataset_tag === tag).length;
150
+ if (count === 0) return null;
151
+ return (
152
+ <span key={tag} className="legend-item" style={{ borderColor: colors.border }}>
153
+ <span className="legend-dot" style={{ backgroundColor: colors.border }} />
154
+ {colors.label} ({count})
155
+ </span>
156
+ );
157
+ })}
158
+ </div>
159
+ )}
160
+
161
+ <div className="markdown-content">
162
+ <div className="markdown-content-header">
163
+ <h3>Doc {selectedDocIndex}, Page {selectedPage}</h3>
164
+ {datasets.length > 0 && (
165
+ <span className="dataset-count">
166
+ {datasets.length} data mention{datasets.length !== 1 ? 's' : ''} detected
167
+ </span>
168
+ )}
169
+ </div>
170
+
171
+ {loadingPage ? (
172
+ <div className="loading-spinner-container">
173
+ <div className="loading-spinner" />
174
+ <p>Loading page data...</p>
175
+ </div>
176
+ ) : currentPageData ? (
177
+ <div className="markdown-preview">
178
+ <ReactMarkdown
179
+ remarkPlugins={[remarkGfm]}
180
+ components={highlightComponents}
181
+ >
182
+ {rawText || "No text available."}
183
+ </ReactMarkdown>
184
+ </div>
185
+ ) : (
186
+ <p className="text-muted">Select a document and page to view extracted text.</p>
187
+ )}
188
+ </div>
189
+ </div>
190
+ );
191
+ }
app/components/PageNavigator.js ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client";
2
+
3
+ export default function PageNavigator({
4
+ currentIndex,
5
+ totalPages,
6
+ currentPageNumber,
7
+ onPrevious,
8
+ onNext,
9
+ }) {
10
+ return (
11
+ <div className="page-navigator">
12
+ <button
13
+ className="btn btn-nav"
14
+ onClick={onPrevious}
15
+ disabled={currentIndex <= 0}
16
+ aria-label="Previous page"
17
+ >
18
+ ← Prev
19
+ </button>
20
+
21
+ <span className="page-indicator">
22
+ Page <strong>{currentPageNumber}</strong>
23
+ <span className="page-count">{currentIndex + 1} / {totalPages}</span>
24
+ </span>
25
+
26
+ <button
27
+ className="btn btn-nav"
28
+ onClick={onNext}
29
+ disabled={currentIndex >= totalPages - 1}
30
+ aria-label="Next page"
31
+ >
32
+ Next →
33
+ </button>
34
+ </div>
35
+ );
36
+ }
app/components/PdfViewer.js ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client";
2
+
3
+ export default function PdfViewer({ pdfUrl, pageNumber }) {
4
+ if (!pdfUrl) {
5
+ return (
6
+ <div className="pdf-placeholder">
7
+ <p>No PDF available for this document.</p>
8
+ </div>
9
+ );
10
+ }
11
+
12
+ // PDF pages in our data are 0-indexed; PDF.js viewer expects 1-indexed pages
13
+ const viewerPage = (pageNumber ?? 0) + 1;
14
+
15
+ // Use Mozilla's hosted PDF.js viewer — supports #page=N for direct page navigation.
16
+ // This avoids X-Frame-Options restrictions from the source server.
17
+ const pdfJsViewerUrl = `https://mozilla.github.io/pdf.js/web/viewer.html?file=${encodeURIComponent(pdfUrl)}#page=${viewerPage}`;
18
+
19
+ return (
20
+ <iframe
21
+ key={`pdf-${pdfUrl}-page-${viewerPage}`}
22
+ src={pdfJsViewerUrl}
23
+ className="pdf-frame"
24
+ title={`PDF Page ${viewerPage}`}
25
+ allow="fullscreen"
26
+ />
27
+ );
28
+ }
app/globals.css CHANGED
@@ -5,12 +5,20 @@
5
  --accent: #3b82f6;
6
  --accent-hover: #2563eb;
7
  --border-color: #334155;
 
 
 
 
 
 
 
 
8
  }
9
 
10
  body {
11
  margin: 0;
12
  padding: 0;
13
- font-family: ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
14
  background-color: var(--bg-color);
15
  color: var(--text-color);
16
  display: flex;
@@ -25,17 +33,23 @@ h4 {
25
  color: #f8fafc;
26
  }
27
 
 
 
28
  .container {
29
  display: flex;
30
  width: 100%;
31
  height: 100%;
 
32
  }
33
 
34
  .pane {
35
  flex: 1;
36
- padding: 24px;
37
  display: flex;
38
  flex-direction: column;
 
 
 
39
  }
40
 
41
  .left-pane {
@@ -43,32 +57,250 @@ h4 {
43
  background-color: #020617;
44
  }
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  .pdf-frame {
47
- flex-grow: 1;
48
  width: 100%;
 
49
  border: none;
50
  background: white;
51
  border-radius: 12px;
52
  box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
53
  }
54
 
55
- .right-pane {
56
- background-color: var(--pane-bg);
57
- overflow-y: auto;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  }
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  .markdown-content {
61
  background-color: #2dd4bf10;
62
- padding: 24px;
63
  border-radius: 12px;
64
- margin-bottom: 24px;
65
  line-height: 1.6;
66
  border: 1px solid #2dd4bf30;
67
- font-size: 1.05rem;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  }
69
 
70
  .markdown-preview {
71
- margin-top: 16px;
72
  color: #f1f5f9;
73
  }
74
 
@@ -77,22 +309,22 @@ h4 {
77
  .markdown-preview h3,
78
  .markdown-preview h4 {
79
  color: #38bdf8;
80
- margin-top: 1.5em;
81
- margin-bottom: 0.5em;
82
  }
83
 
84
  .markdown-preview p {
85
- margin-bottom: 1em;
86
  }
87
 
88
  .markdown-preview ul,
89
  .markdown-preview ol {
90
- margin-bottom: 1em;
91
  padding-left: 20px;
92
  }
93
 
94
  .markdown-preview li {
95
- margin-bottom: 0.25em;
96
  }
97
 
98
  .markdown-preview strong,
@@ -138,53 +370,701 @@ h4 {
138
  border: 0;
139
  height: 1px;
140
  background: var(--border-color);
141
- margin: 20px 0;
 
 
 
 
142
  }
143
 
 
 
144
  .btn {
145
- background-color: var(--accent);
146
- color: #ffffff;
147
  border: none;
148
- padding: 12px 20px;
149
- font-size: 16px;
150
  border-radius: 8px;
151
  cursor: pointer;
152
  font-weight: 600;
153
  transition: all 0.2s ease;
 
 
 
 
 
154
  box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1);
155
  }
156
 
157
- .btn:hover {
158
  background-color: var(--accent-hover);
159
  transform: translateY(-1px);
160
  }
161
 
162
- .btn:active {
163
  transform: translateY(0);
164
  }
165
 
166
- .annotations-list {
167
- margin-top: 32px;
 
168
  }
169
 
170
- .annotations-list ul {
171
- list-style: none;
172
- padding: 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  margin: 0;
 
174
  }
175
 
176
- .annotations-list li {
177
- background-color: #33415550;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  padding: 16px;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  margin-bottom: 12px;
180
- border-radius: 8px;
181
- border-left: 4px solid #2dd4bf;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  backdrop-filter: blur(4px);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  }
184
 
185
- .annotations-list small {
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  color: #94a3b8;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  display: block;
188
- margin-top: 8px;
189
  font-size: 0.8rem;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  }
 
5
  --accent: #3b82f6;
6
  --accent-hover: #2563eb;
7
  --border-color: #334155;
8
+ --success: #10b981;
9
+ --error: #ef4444;
10
+ --surface: #334155;
11
+ --nav-height: 56px;
12
+ }
13
+
14
+ * {
15
+ box-sizing: border-box;
16
  }
17
 
18
  body {
19
  margin: 0;
20
  padding: 0;
21
+ font-family: 'Inter', ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji";
22
  background-color: var(--bg-color);
23
  color: var(--text-color);
24
  display: flex;
 
33
  color: #f8fafc;
34
  }
35
 
36
+ /* ── Layout ─────────────────────────────────────── */
37
+
38
  .container {
39
  display: flex;
40
  width: 100%;
41
  height: 100%;
42
+ flex-wrap: wrap;
43
  }
44
 
45
  .pane {
46
  flex: 1;
47
+ padding: 20px 24px;
48
  display: flex;
49
  flex-direction: column;
50
+ /* Both panes are non-scrollable — content must fit */
51
+ height: calc(100vh - var(--nav-height));
52
+ overflow: hidden;
53
  }
54
 
55
  .left-pane {
 
57
  background-color: #020617;
58
  }
59
 
60
+ .right-pane {
61
+ background-color: var(--pane-bg);
62
+ overflow-y: auto;
63
+ }
64
+
65
+ .pane-header {
66
+ flex-shrink: 0;
67
+ margin-bottom: 12px;
68
+ }
69
+
70
+ .pane-header h2 {
71
+ margin-bottom: 8px;
72
+ }
73
+
74
+ /* ── Document Selector ──────────────────────────── */
75
+
76
+ .navigation-controls {
77
+ display: flex;
78
+ flex-direction: column;
79
+ gap: 8px;
80
+ }
81
+
82
+ .select-group {
83
+ display: flex;
84
+ flex-direction: column;
85
+ gap: 4px;
86
+ }
87
+
88
+ .select-group label {
89
+ font-size: 0.75rem;
90
+ font-weight: 600;
91
+ color: #94a3b8;
92
+ text-transform: uppercase;
93
+ letter-spacing: 0.05em;
94
+ }
95
+
96
+ .select-group select {
97
+ background-color: var(--surface);
98
+ color: var(--text-color);
99
+ border: 1px solid var(--border-color);
100
+ border-radius: 8px;
101
+ padding: 8px 12px;
102
+ font-size: 0.9rem;
103
+ cursor: pointer;
104
+ transition: border-color 0.2s;
105
+ }
106
+
107
+ .select-group select:hover,
108
+ .select-group select:focus {
109
+ border-color: var(--accent);
110
+ outline: none;
111
+ }
112
+
113
+ /* ── PDF Viewer ─────────────────────────────────── */
114
+
115
  .pdf-frame {
116
+ flex: 1;
117
  width: 100%;
118
+ min-height: 0;
119
  border: none;
120
  background: white;
121
  border-radius: 12px;
122
  box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
123
  }
124
 
125
+ .pdf-placeholder {
126
+ flex: 1;
127
+ display: flex;
128
+ align-items: center;
129
+ justify-content: center;
130
+ background-color: var(--surface);
131
+ border-radius: 12px;
132
+ color: #94a3b8;
133
+ }
134
+
135
+ /* ── Bottom Navigation ──────────────────────────── */
136
+
137
+ .bottom-nav {
138
+ position: fixed;
139
+ bottom: 0;
140
+ left: 0;
141
+ right: 0;
142
+ height: var(--nav-height);
143
+ background-color: #020617;
144
+ border-top: 1px solid var(--border-color);
145
+ display: flex;
146
+ align-items: center;
147
+ justify-content: center;
148
+ z-index: 100;
149
  }
150
 
151
+ .page-navigator {
152
+ display: flex;
153
+ align-items: center;
154
+ gap: 24px;
155
+ }
156
+
157
+ .page-indicator {
158
+ display: flex;
159
+ flex-direction: column;
160
+ align-items: center;
161
+ gap: 2px;
162
+ min-width: 100px;
163
+ text-align: center;
164
+ }
165
+
166
+ .page-indicator strong {
167
+ font-size: 1.1rem;
168
+ color: #f8fafc;
169
+ }
170
+
171
+ .page-count {
172
+ font-size: 0.75rem;
173
+ color: #64748b;
174
+ }
175
+
176
+ .btn-nav {
177
+ background-color: var(--surface);
178
+ color: var(--text-color);
179
+ padding: 10px 20px;
180
+ font-weight: 600;
181
+ font-size: 0.9rem;
182
+ border-radius: 8px;
183
+ border: 1px solid var(--border-color);
184
+ transition: all 0.2s;
185
+ }
186
+
187
+ .btn-nav:hover:not(:disabled) {
188
+ background-color: var(--accent);
189
+ border-color: var(--accent);
190
+ transform: translateY(-1px);
191
+ }
192
+
193
+ .btn-nav:disabled {
194
+ opacity: 0.3;
195
+ cursor: not-allowed;
196
+ }
197
+
198
+ /* ── Annotator Header ───────────────────────────── */
199
+
200
+ .annotator-container {
201
+ display: flex;
202
+ flex-direction: column;
203
+ flex: 1;
204
+ min-height: 0;
205
+ }
206
+
207
+ .annotator-header {
208
+ display: flex;
209
+ align-items: center;
210
+ justify-content: space-between;
211
+ gap: 16px;
212
+ flex-shrink: 0;
213
+ margin-bottom: 12px;
214
+ }
215
+
216
+ .annotator-header h2 {
217
+ margin: 0;
218
+ }
219
+
220
+ .annotator-actions {
221
+ display: flex;
222
+ flex-direction: column;
223
+ gap: 6px;
224
+ }
225
+
226
+ /* ── Markdown Content ───────────────────────────── */
227
+
228
  .markdown-content {
229
  background-color: #2dd4bf10;
230
+ padding: 20px;
231
  border-radius: 12px;
 
232
  line-height: 1.6;
233
  border: 1px solid #2dd4bf30;
234
+ font-size: 0.95rem;
235
+ flex: 1;
236
+ min-height: 0;
237
+ overflow-y: auto;
238
+ }
239
+
240
+ .markdown-content-header {
241
+ display: flex;
242
+ justify-content: space-between;
243
+ align-items: center;
244
+ margin-bottom: 12px;
245
+ flex-shrink: 0;
246
+ }
247
+
248
+ .markdown-content-header h3 {
249
+ margin: 0;
250
+ font-size: 0.9rem;
251
+ }
252
+
253
+ .dataset-count {
254
+ background-color: #2dd4bf20;
255
+ color: #2dd4bf;
256
+ padding: 4px 10px;
257
+ border-radius: 20px;
258
+ font-size: 0.75rem;
259
+ font-weight: 600;
260
+ }
261
+
262
+ /* ── Dataset Legend ─────────────────────────────── */
263
+
264
+ .dataset-legend {
265
+ display: flex;
266
+ gap: 12px;
267
+ flex-wrap: wrap;
268
+ flex-shrink: 0;
269
+ margin-bottom: 8px;
270
+ }
271
+
272
+ .legend-item {
273
+ display: flex;
274
+ align-items: center;
275
+ gap: 6px;
276
+ font-size: 0.75rem;
277
+ color: #94a3b8;
278
+ padding: 4px 10px;
279
+ border-radius: 6px;
280
+ background-color: #1e293b;
281
+ border: 1px solid;
282
+ }
283
+
284
+ .legend-dot {
285
+ width: 8px;
286
+ height: 8px;
287
+ border-radius: 50%;
288
+ flex-shrink: 0;
289
+ }
290
+
291
+ /* ── Highlight marks ────────────────────────────── */
292
+
293
+ .markdown-preview mark {
294
+ transition: all 0.15s ease;
295
+ }
296
+
297
+ .markdown-preview mark:hover {
298
+ filter: brightness(1.3);
299
+ transform: scale(1.02);
300
+ display: inline-block;
301
  }
302
 
303
  .markdown-preview {
 
304
  color: #f1f5f9;
305
  }
306
 
 
309
  .markdown-preview h3,
310
  .markdown-preview h4 {
311
  color: #38bdf8;
312
+ margin-top: 1.2em;
313
+ margin-bottom: 0.4em;
314
  }
315
 
316
  .markdown-preview p {
317
+ margin-bottom: 0.8em;
318
  }
319
 
320
  .markdown-preview ul,
321
  .markdown-preview ol {
322
+ margin-bottom: 0.8em;
323
  padding-left: 20px;
324
  }
325
 
326
  .markdown-preview li {
327
+ margin-bottom: 0.2em;
328
  }
329
 
330
  .markdown-preview strong,
 
370
  border: 0;
371
  height: 1px;
372
  background: var(--border-color);
373
+ margin: 16px 0;
374
+ }
375
+
376
+ .text-muted {
377
+ color: #94a3b8;
378
  }
379
 
380
+ /* ── Buttons ────────────────────────────────────── */
381
+
382
  .btn {
 
 
383
  border: none;
384
+ padding: 10px 18px;
385
+ font-size: 14px;
386
  border-radius: 8px;
387
  cursor: pointer;
388
  font-weight: 600;
389
  transition: all 0.2s ease;
390
+ }
391
+
392
+ .btn-primary {
393
+ background-color: var(--accent);
394
+ color: #ffffff;
395
  box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1);
396
  }
397
 
398
+ .btn-primary:hover {
399
  background-color: var(--accent-hover);
400
  transform: translateY(-1px);
401
  }
402
 
403
+ .btn-primary:active {
404
  transform: translateY(0);
405
  }
406
 
407
+ .btn-secondary {
408
+ background-color: var(--surface);
409
+ color: var(--text-color);
410
  }
411
 
412
+ .btn-secondary:hover {
413
+ background-color: #475569;
414
+ }
415
+
416
+ /* ── Annotations Side Panel ─────────────────────── */
417
+
418
+ .panel-backdrop {
419
+ position: fixed;
420
+ inset: 0;
421
+ background: rgba(0, 0, 0, 0.3);
422
+ z-index: 200;
423
+ }
424
+
425
+ .annotation-panel {
426
+ position: fixed;
427
+ top: 0;
428
+ right: 0;
429
+ bottom: var(--nav-height);
430
+ width: 380px;
431
+ background-color: var(--pane-bg);
432
+ border-left: 1px solid var(--border-color);
433
+ z-index: 300;
434
+ transform: translateX(100%);
435
+ transition: transform 0.3s cubic-bezier(0.4, 0, 0.2, 1);
436
+ display: flex;
437
+ flex-direction: column;
438
+ box-shadow: -8px 0 30px rgb(0 0 0 / 0.3);
439
+ }
440
+
441
+ .annotation-panel.open {
442
+ transform: translateX(0);
443
+ }
444
+
445
+ .panel-header {
446
+ display: flex;
447
+ align-items: center;
448
+ gap: 10px;
449
+ padding: 16px 20px;
450
+ border-bottom: 1px solid var(--border-color);
451
+ flex-shrink: 0;
452
+ }
453
+
454
+ .panel-header h3 {
455
  margin: 0;
456
+ flex: 1;
457
  }
458
 
459
+ .panel-count {
460
+ background-color: var(--accent);
461
+ color: #fff;
462
+ font-size: 0.75rem;
463
+ font-weight: 700;
464
+ padding: 2px 8px;
465
+ border-radius: 10px;
466
+ min-width: 24px;
467
+ text-align: center;
468
+ }
469
+
470
+ .panel-close {
471
+ background: none;
472
+ border: none;
473
+ color: #94a3b8;
474
+ font-size: 22px;
475
+ cursor: pointer;
476
+ padding: 4px 8px;
477
+ border-radius: 6px;
478
+ transition: all 0.15s;
479
+ }
480
+
481
+ .panel-close:hover {
482
+ background-color: var(--surface);
483
+ color: var(--text-color);
484
+ }
485
+
486
+ .panel-body {
487
+ flex: 1;
488
+ overflow-y: auto;
489
  padding: 16px;
490
+ }
491
+
492
+ .panel-empty {
493
+ text-align: center;
494
+ padding: 40px 20px;
495
+ color: #64748b;
496
+ }
497
+
498
+ .panel-empty p:first-child {
499
+ font-size: 1rem;
500
+ }
501
+
502
+ /* ── Panel Cards ────────────────────────────────── */
503
+
504
+ .panel-annotation-card {
505
+ background-color: #0f172a;
506
+ border: 1px solid var(--border-color);
507
+ border-radius: 10px;
508
+ padding: 14px;
509
  margin-bottom: 12px;
510
+ transition: border-color 0.2s;
511
+ }
512
+
513
+ .panel-annotation-card:hover {
514
+ border-color: var(--accent);
515
+ }
516
+
517
+ .panel-card-top {
518
+ display: flex;
519
+ justify-content: space-between;
520
+ align-items: center;
521
+ margin-bottom: 8px;
522
+ }
523
+
524
+ .panel-card-author {
525
+ font-size: 0.75rem;
526
+ color: #64748b;
527
+ font-style: italic;
528
+ }
529
+
530
+ .panel-card-text {
531
+ margin: 0 0 6px;
532
+ font-size: 0.9rem;
533
+ color: #e2e8f0;
534
+ line-height: 1.4;
535
+ }
536
+
537
+ .panel-card-position {
538
+ display: inline-block;
539
+ font-size: 0.7rem;
540
+ color: #475569;
541
+ font-family: monospace;
542
+ margin-bottom: 8px;
543
+ }
544
+
545
+ .panel-card-actions {
546
+ display: flex;
547
+ gap: 8px;
548
+ }
549
+
550
+ .btn-panel {
551
+ background: none;
552
+ border: 1px solid var(--border-color);
553
+ color: #94a3b8;
554
+ font-size: 0.75rem;
555
+ padding: 4px 10px;
556
+ border-radius: 5px;
557
+ cursor: pointer;
558
+ transition: all 0.15s;
559
+ }
560
+
561
+ .btn-panel:hover {
562
+ border-color: #94a3b8;
563
+ color: var(--text-color);
564
+ }
565
+
566
+ .btn-panel.save {
567
+ border-color: #10b981;
568
+ color: #10b981;
569
+ }
570
+
571
+ .btn-panel.save:hover {
572
+ background-color: #10b98120;
573
+ }
574
+
575
+ .btn-panel.delete.confirming {
576
+ border-color: var(--error);
577
+ color: var(--error);
578
+ animation: pulse 0.6s ease infinite alternate;
579
+ }
580
+
581
+ @keyframes pulse {
582
+ from {
583
+ opacity: 0.7;
584
+ }
585
+
586
+ to {
587
+ opacity: 1;
588
+ }
589
+ }
590
+
591
+ .panel-card-time {
592
+ font-size: 0.7rem;
593
+ color: #475569;
594
+ margin-top: 6px;
595
+ }
596
+
597
+ .form-select-small {
598
+ background-color: var(--bg-color);
599
+ color: var(--text-color);
600
+ border: 1px solid var(--accent);
601
+ border-radius: 5px;
602
+ padding: 3px 8px;
603
+ font-size: 0.75rem;
604
+ cursor: pointer;
605
+ }
606
+
607
+ .annotation-tag-badge {
608
+ font-size: 0.7rem;
609
+ font-weight: 700;
610
+ text-transform: uppercase;
611
+ letter-spacing: 0.05em;
612
+ padding: 3px 8px;
613
+ border-radius: 4px;
614
+ }
615
+
616
+ .annotation-tag-badge.clickable {
617
+ cursor: pointer;
618
+ transition: opacity 0.15s;
619
+ }
620
+
621
+ .annotation-tag-badge.clickable:hover {
622
+ opacity: 0.8;
623
+ }
624
+
625
+ .panel-card-source {
626
+ font-size: 0.7rem;
627
+ color: #64748b;
628
+ }
629
+
630
+ .inline-edit {
631
+ display: flex;
632
+ align-items: center;
633
+ gap: 4px;
634
+ }
635
+
636
+ /* Judge info row */
637
+ .panel-card-judge {
638
+ display: flex;
639
+ align-items: center;
640
+ gap: 8px;
641
+ font-size: 0.7rem;
642
+ margin-top: 2px;
643
+ }
644
+
645
+ .judge-verdict {
646
+ font-weight: 600;
647
+ }
648
+
649
+ .judge-verdict.correct {
650
+ color: #10b981;
651
+ }
652
+
653
+ .judge-verdict.wrong {
654
+ color: #ef4444;
655
+ }
656
+
657
+ .judge-tag {
658
+ color: #64748b;
659
+ font-style: italic;
660
+ }
661
+
662
+ /* Validated card accents */
663
+ .panel-annotation-card.validated-correct {
664
+ border-left: 3px solid #10b981;
665
+ }
666
+
667
+ .panel-annotation-card.validated-wrong {
668
+ border-left: 3px solid #ef4444;
669
+ }
670
+
671
+ /* Validation status */
672
+ .validation-status {
673
+ font-size: 0.75rem;
674
+ margin-top: 6px;
675
+ padding: 6px 8px;
676
+ border-radius: 6px;
677
+ }
678
+
679
+ .validation-status.correct {
680
+ background-color: #10b98115;
681
+ color: #10b981;
682
+ }
683
+
684
+ .validation-status.wrong {
685
+ background-color: #ef444415;
686
+ color: #ef4444;
687
+ }
688
+
689
+ .validation-by {
690
+ color: #64748b;
691
+ font-style: italic;
692
+ }
693
+
694
+ .validation-notes {
695
+ margin: 4px 0 0;
696
+ color: var(--text-muted);
697
+ font-style: italic;
698
+ }
699
+
700
+ /* Validation form */
701
+ .validation-form {
702
+ margin-top: 8px;
703
+ }
704
+
705
+ .validation-notes-input {
706
+ width: 100%;
707
+ background-color: var(--bg-color);
708
+ color: var(--text-color);
709
+ border: 1px solid var(--border-color);
710
+ border-radius: 6px;
711
+ padding: 6px 8px;
712
+ font-size: 0.75rem;
713
+ resize: vertical;
714
+ margin-bottom: 6px;
715
+ font-family: inherit;
716
+ }
717
+
718
+ .validation-notes-input:focus {
719
+ outline: none;
720
+ border-color: var(--accent);
721
+ }
722
+
723
+ .validation-buttons {
724
+ display: flex;
725
+ gap: 4px;
726
+ }
727
+
728
+ .btn-panel.correct {
729
+ border-color: #10b981;
730
+ color: #10b981;
731
+ }
732
+
733
+ .btn-panel.correct:hover {
734
+ background-color: #10b98120;
735
+ }
736
+
737
+ .btn-panel.wrong {
738
+ border-color: #ef4444;
739
+ color: #ef4444;
740
+ }
741
+
742
+ .btn-panel.wrong:hover {
743
+ background-color: #ef444420;
744
+ }
745
+
746
+ .btn-panel.validate {
747
+ border-color: #6366f1;
748
+ color: #6366f1;
749
+ }
750
+
751
+ .btn-panel.validate:hover {
752
+ background-color: #6366f120;
753
+ }
754
+
755
+ /* ── Floating Panel Chevron ────────────────────── */
756
+
757
+ .panel-chevron {
758
+ position: fixed;
759
+ right: 0;
760
+ top: 50%;
761
+ transform: translateY(-50%);
762
+ z-index: 150;
763
+ background-color: var(--surface);
764
+ border: 1px solid var(--border-color);
765
+ border-right: none;
766
+ color: var(--text-color);
767
+ font-size: 1.4rem;
768
+ font-weight: 700;
769
+ padding: 12px 6px;
770
+ border-radius: 8px 0 0 8px;
771
+ cursor: pointer;
772
+ transition: all 0.2s;
773
+ line-height: 1;
774
+ }
775
+
776
+ .panel-chevron:hover {
777
+ background-color: var(--accent);
778
+ border-color: var(--accent);
779
+ }
780
+
781
+ .chevron-badge {
782
+ position: absolute;
783
+ top: -6px;
784
+ left: -6px;
785
+ background-color: var(--accent);
786
+ color: #fff;
787
+ font-size: 0.6rem;
788
+ font-weight: 700;
789
+ width: 18px;
790
+ height: 18px;
791
+ border-radius: 50%;
792
+ display: flex;
793
+ align-items: center;
794
+ justify-content: center;
795
+ }
796
+
797
+
798
+
799
+ /* ── Modal ──────────────────────────────────────── */
800
+
801
+ .modal-overlay {
802
+ position: fixed;
803
+ inset: 0;
804
+ background-color: rgba(0, 0, 0, 0.6);
805
  backdrop-filter: blur(4px);
806
+ display: flex;
807
+ align-items: center;
808
+ justify-content: center;
809
+ z-index: 1000;
810
+ animation: fadeIn 0.15s ease;
811
+ }
812
+
813
+ .modal-content {
814
+ background-color: var(--pane-bg);
815
+ border: 1px solid var(--border-color);
816
+ border-radius: 16px;
817
+ width: 90%;
818
+ max-width: 520px;
819
+ box-shadow: 0 25px 50px -12px rgb(0 0 0 / 0.5);
820
+ animation: slideUp 0.2s ease;
821
  }
822
 
823
+ .modal-header {
824
+ display: flex;
825
+ justify-content: space-between;
826
+ align-items: center;
827
+ padding: 20px 24px 0;
828
+ }
829
+
830
+ .modal-header h3 {
831
+ margin: 0;
832
+ }
833
+
834
+ .modal-close {
835
+ background: none;
836
+ border: none;
837
  color: #94a3b8;
838
+ font-size: 24px;
839
+ cursor: pointer;
840
+ padding: 4px 8px;
841
+ border-radius: 6px;
842
+ transition: all 0.15s;
843
+ }
844
+
845
+ .modal-close:hover {
846
+ background-color: var(--surface);
847
+ color: var(--text-color);
848
+ }
849
+
850
+ .modal-body {
851
+ padding: 20px 24px;
852
+ }
853
+
854
+ .modal-footer {
855
+ display: flex;
856
+ justify-content: flex-end;
857
+ gap: 12px;
858
+ padding: 0 24px 20px;
859
+ }
860
+
861
+ /* ── Form Elements ──────────────────────────────── */
862
+
863
+ .form-group {
864
+ margin-bottom: 16px;
865
+ }
866
+
867
+ .form-group label {
868
  display: block;
 
869
  font-size: 0.8rem;
870
+ font-weight: 600;
871
+ color: #94a3b8;
872
+ text-transform: uppercase;
873
+ letter-spacing: 0.05em;
874
+ margin-bottom: 6px;
875
+ }
876
+
877
+ .selected-text-preview {
878
+ background-color: #2dd4bf15;
879
+ border: 1px solid #2dd4bf30;
880
+ padding: 12px;
881
+ border-radius: 8px;
882
+ font-style: italic;
883
+ color: #2dd4bf;
884
+ max-height: 100px;
885
+ overflow-y: auto;
886
+ font-size: 0.95rem;
887
+ }
888
+
889
+ .form-select {
890
+ width: 100%;
891
+ background-color: var(--bg-color);
892
+ color: var(--text-color);
893
+ border: 1px solid var(--border-color);
894
+ border-radius: 8px;
895
+ padding: 10px 12px;
896
+ font-size: 0.95rem;
897
+ font-family: inherit;
898
+ cursor: pointer;
899
+ transition: border-color 0.2s;
900
+ }
901
+
902
+ .form-select:focus {
903
+ border-color: var(--accent);
904
+ outline: none;
905
+ }
906
+
907
+ .form-help {
908
+ margin-top: 6px;
909
+ margin-bottom: 0;
910
+ font-size: 0.8rem;
911
+ color: #64748b;
912
+ font-style: italic;
913
+ }
914
+
915
+ .form-input,
916
+ .form-textarea {
917
+ width: 100%;
918
+ background-color: var(--bg-color);
919
+ color: var(--text-color);
920
+ border: 1px solid var(--border-color);
921
+ border-radius: 8px;
922
+ padding: 10px 12px;
923
+ font-size: 0.95rem;
924
+ font-family: inherit;
925
+ transition: border-color 0.2s;
926
+ }
927
+
928
+ .form-input:focus,
929
+ .form-textarea:focus {
930
+ border-color: var(--accent);
931
+ outline: none;
932
+ }
933
+
934
+ .form-textarea {
935
+ resize: vertical;
936
+ min-height: 80px;
937
+ }
938
+
939
+ /* ── Toast Notifications ────────────────────────── */
940
+
941
+ .toast {
942
+ position: fixed;
943
+ bottom: calc(var(--nav-height) + 16px);
944
+ right: 24px;
945
+ padding: 14px 24px;
946
+ border-radius: 10px;
947
+ font-weight: 600;
948
+ font-size: 0.9rem;
949
+ z-index: 2000;
950
+ animation: slideInRight 0.3s ease, fadeOut 0.3s ease 2.7s forwards;
951
+ box-shadow: 0 10px 25px -5px rgb(0 0 0 / 0.3);
952
+ }
953
+
954
+ .toast-success {
955
+ background-color: var(--success);
956
+ color: #fff;
957
+ }
958
+
959
+ .toast-error {
960
+ background-color: var(--error);
961
+ color: #fff;
962
+ }
963
+
964
+ /* ── Loading ────────────────────────────────────── */
965
+
966
+ .loading-container {
967
+ display: flex;
968
+ flex-direction: column;
969
+ align-items: center;
970
+ justify-content: center;
971
+ height: 100vh;
972
+ width: 100%;
973
+ gap: 16px;
974
+ color: #94a3b8;
975
+ }
976
+
977
+ .loading-spinner-container {
978
+ display: flex;
979
+ flex-direction: column;
980
+ align-items: center;
981
+ justify-content: center;
982
+ padding: 40px;
983
+ gap: 12px;
984
+ color: #94a3b8;
985
+ }
986
+
987
+ .loading-spinner {
988
+ width: 32px;
989
+ height: 32px;
990
+ border: 3px solid var(--border-color);
991
+ border-top-color: var(--accent);
992
+ border-radius: 50%;
993
+ animation: spin 0.8s linear infinite;
994
+ }
995
+
996
+ /* ── Animations ─────────────────────────────────── */
997
+
998
+ @keyframes fadeIn {
999
+ from {
1000
+ opacity: 0;
1001
+ }
1002
+
1003
+ to {
1004
+ opacity: 1;
1005
+ }
1006
+ }
1007
+
1008
+ @keyframes slideUp {
1009
+ from {
1010
+ transform: translateY(20px);
1011
+ opacity: 0;
1012
+ }
1013
+
1014
+ to {
1015
+ transform: translateY(0);
1016
+ opacity: 1;
1017
+ }
1018
+ }
1019
+
1020
+ @keyframes slideInRight {
1021
+ from {
1022
+ transform: translateX(100%);
1023
+ opacity: 0;
1024
+ }
1025
+
1026
+ to {
1027
+ transform: translateX(0);
1028
+ opacity: 1;
1029
+ }
1030
+ }
1031
+
1032
+ @keyframes fadeOut {
1033
+ from {
1034
+ opacity: 1;
1035
+ }
1036
+
1037
+ to {
1038
+ opacity: 0;
1039
+ }
1040
+ }
1041
+
1042
+ @keyframes spin {
1043
+ to {
1044
+ transform: rotate(360deg);
1045
+ }
1046
+ }
1047
+
1048
+ @keyframes shake {
1049
+
1050
+ 0%,
1051
+ 100% {
1052
+ transform: translateX(0);
1053
+ }
1054
+
1055
+ 25% {
1056
+ transform: translateX(-5px);
1057
+ }
1058
+
1059
+ 50% {
1060
+ transform: translateX(5px);
1061
+ }
1062
+
1063
+ 75% {
1064
+ transform: translateX(-5px);
1065
+ }
1066
+ }
1067
+
1068
+ .shake {
1069
+ animation: shake 0.4s ease;
1070
  }
app/page.js CHANGED
@@ -1,21 +1,49 @@
1
  "use client";
2
 
3
- import { useState, useEffect } from 'react';
4
- import ReactMarkdown from 'react-markdown';
5
- import remarkGfm from 'remark-gfm';
 
 
 
 
6
 
7
  export default function Home() {
8
  const [documents, setDocuments] = useState([]);
9
  const [selectedDocIndex, setSelectedDocIndex] = useState(null);
10
- const [selectedPage, setSelectedPage] = useState(null);
11
-
12
  const [currentDoc, setCurrentDoc] = useState(null);
 
 
 
13
  const [currentPageData, setCurrentPageData] = useState(null);
14
  const [loading, setLoading] = useState(true);
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- // Array of local session annotations just for immediate UI feedback
17
- const [sessionAnnotations, setSessionAnnotations] = useState([]);
18
 
 
 
 
 
 
 
 
 
 
 
19
  useEffect(() => {
20
  fetch('/api/documents')
21
  .then(res => res.json())
@@ -23,7 +51,7 @@ export default function Home() {
23
  setDocuments(data);
24
  if (data.length > 0) {
25
  setSelectedDocIndex(data[0].index);
26
- setSelectedPage(data[0].annotatable_pages[0]);
27
  }
28
  setLoading(false);
29
  })
@@ -33,139 +61,318 @@ export default function Home() {
33
  });
34
  }, []);
35
 
 
36
  useEffect(() => {
37
- if (selectedDocIndex !== null && selectedPage !== null) {
38
  const doc = documents.find(d => d.index === selectedDocIndex);
39
  setCurrentDoc(doc);
 
 
40
 
41
- fetch(`/api/document?index=${selectedDocIndex}&page=${selectedPage}`)
 
 
 
 
42
  .then(res => res.json())
43
  .then(data => {
44
  setCurrentPageData(data);
 
45
  })
46
- .catch(err => console.error("Failed to load page data", err));
47
- }
48
- }, [selectedDocIndex, selectedPage, documents]);
49
-
50
- const handleAnnotate = async () => {
51
- const selection = window.getSelection();
52
- if (selection && selection.toString().trim() !== "") {
53
- const selectedText = selection.toString().trim();
54
- const note = window.prompt("Enter annotation note for: " + selectedText);
55
-
56
- const payload = {
57
- document_index: selectedDocIndex,
58
- page_number: selectedPage,
59
- selected_text: selectedText,
60
- annotation_note: note || "No note provided",
61
- timestamp: new Date().toISOString()
62
- };
63
-
64
- try {
65
- const res = await fetch('/api/annotate', {
66
- method: 'POST',
67
- headers: { 'Content-Type': 'application/json' },
68
- body: JSON.stringify(payload)
69
  });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- if (res.ok) {
72
- setSessionAnnotations(prev => [...prev, payload]);
73
- window.alert("Annotation saved locally to human_annotations.json!");
74
- } else {
75
- window.alert("Failed to save annotation.");
 
 
 
76
  }
77
- } catch (err) {
78
- console.error(err);
79
- window.alert("Network error saving annotation.");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  }
81
- } else {
82
- window.alert("Please select some text to annotate.");
 
83
  }
84
  };
85
 
86
- if (loading) return <div className="p-4">Loading Data...</div>;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
  return (
89
  <div className="container">
90
  <div className="pane left-pane">
91
- <h2>PDF Viewer & Navigation</h2>
92
-
93
- <div className="navigation-controls">
94
- <div>
95
- <label>Document: </label>
96
- <select
97
- value={selectedDocIndex || ''}
98
- onChange={(e) => {
99
- const docIdx = parseInt(e.target.value);
100
- setSelectedDocIndex(docIdx);
101
- const doc = documents.find(d => d.index === docIdx);
102
- if (doc && doc.annotatable_pages.length > 0) {
103
- setSelectedPage(doc.annotatable_pages[0]);
104
- }
105
- }}
106
- >
107
- {documents.map(doc => (
108
- <option key={doc.index} value={doc.index}>
109
- Document {doc.index} ({doc.annotatable_pages.length} pages available)
110
- </option>
111
- ))}
112
- </select>
113
- </div>
114
-
115
- <div style={{ marginTop: '12px' }}>
116
- <label>Annotatable Page: </label>
117
- <select
118
- value={selectedPage || ''}
119
- onChange={(e) => setSelectedPage(parseInt(e.target.value))}
120
- >
121
- {currentDoc?.annotatable_pages.map(p => (
122
- <option key={p} value={p}>Page {p}</option>
123
- ))}
124
- </select>
125
- </div>
126
  </div>
 
 
 
 
 
127
 
128
- {currentDoc && currentDoc.pdf_url && (
129
- <iframe
130
- src={`https://docs.google.com/viewer?url=${encodeURIComponent(currentDoc.pdf_url)}&embedded=true`}
131
- className="pdf-frame"
132
- title={`Document ${currentDoc.index} PDF`}
133
- style={{ marginTop: '20px' }}
134
- />
 
 
 
 
 
 
 
 
 
 
 
 
135
  )}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  </div>
137
 
138
- <div className="pane right-pane">
139
- <h2>Markdown Annotation Mvp</h2>
140
- <button onClick={handleAnnotate} className="btn" style={{ marginBottom: '20px' }}>
141
- Annotate Selection
142
- </button>
143
-
144
- <div className="markdown-content">
145
- <h3>Extracted Text (Doc {selectedDocIndex}, Page {selectedPage})</h3>
146
- {currentPageData ? (
147
- <div className="markdown-preview">
148
- <ReactMarkdown remarkPlugins={[remarkGfm]}>{currentPageData.input_text || "No text available."}</ReactMarkdown>
149
- </div>
150
- ) : (
151
- <p>Loading text...</p>
152
- )}
153
- </div>
154
 
155
- <div className="annotations-list">
156
- {sessionAnnotations.length > 0 && <h3>Current Session Annotations</h3>}
157
- <ul>
158
- {sessionAnnotations.map((ann, idx) => (
159
- <li key={idx}>
160
- <strong>Doc {ann.document_index} / Pg {ann.page_number}</strong><br />
161
- <strong>Selected:</strong> "{ann.selected_text}" <br />
162
- <strong>Note:</strong> {ann.annotation_note} <br />
163
- <small>{new Date(ann.timestamp).toLocaleString()}</small>
164
- </li>
165
- ))}
166
- </ul>
167
  </div>
168
- </div>
169
  </div>
170
  );
171
  }
 
1
  "use client";
2
 
3
+ import { useState, useEffect, useCallback } from 'react';
4
+ import DocumentSelector from './components/DocumentSelector';
5
+ import PdfViewer from './components/PdfViewer';
6
+ import MarkdownAnnotator from './components/MarkdownAnnotator';
7
+ import AnnotationPanel from './components/AnnotationPanel';
8
+ import AnnotationModal from './components/AnnotationModal';
9
+ import PageNavigator from './components/PageNavigator';
10
 
11
  export default function Home() {
12
  const [documents, setDocuments] = useState([]);
13
  const [selectedDocIndex, setSelectedDocIndex] = useState(null);
 
 
14
  const [currentDoc, setCurrentDoc] = useState(null);
15
+
16
+ // Page-by-page navigation: track the index into annotatable_pages array
17
+ const [pageIdx, setPageIdx] = useState(0);
18
  const [currentPageData, setCurrentPageData] = useState(null);
19
  const [loading, setLoading] = useState(true);
20
+ const [loadingPage, setLoadingPage] = useState(false);
21
+
22
+ // Annotations
23
+ const [annotations, setAnnotations] = useState([]);
24
+ const [annotatorName, setAnnotatorName] = useState('');
25
+
26
+ // Modal state
27
+ const [modalOpen, setModalOpen] = useState(false);
28
+ const [selectedText, setSelectedText] = useState('');
29
+ const [selectionOffset, setSelectionOffset] = useState(0);
30
+
31
+ // Side panel state
32
+ const [panelOpen, setPanelOpen] = useState(false);
33
 
34
+ // Toast state
35
+ const [toast, setToast] = useState(null);
36
 
37
+ const showToast = useCallback((message, type = 'success') => {
38
+ setToast({ message, type });
39
+ setTimeout(() => setToast(null), 3000);
40
+ }, []);
41
+
42
+ // Derived: current page number from the annotatable_pages array
43
+ const annotatablePages = currentDoc?.annotatable_pages ?? [];
44
+ const currentPageNumber = annotatablePages[pageIdx] ?? null;
45
+
46
+ // Load documents on mount
47
  useEffect(() => {
48
  fetch('/api/documents')
49
  .then(res => res.json())
 
51
  setDocuments(data);
52
  if (data.length > 0) {
53
  setSelectedDocIndex(data[0].index);
54
+ setPageIdx(0);
55
  }
56
  setLoading(false);
57
  })
 
61
  });
62
  }, []);
63
 
64
+ // Update currentDoc when selection changes
65
  useEffect(() => {
66
+ if (selectedDocIndex !== null) {
67
  const doc = documents.find(d => d.index === selectedDocIndex);
68
  setCurrentDoc(doc);
69
+ }
70
+ }, [selectedDocIndex, documents]);
71
 
72
+ // Fetch page data (reusable — called on page change and after saving)
73
+ const refreshPageData = useCallback(() => {
74
+ if (selectedDocIndex !== null && currentPageNumber !== null) {
75
+ setLoadingPage(true);
76
+ fetch(`/api/document?index=${selectedDocIndex}&page=${currentPageNumber}`)
77
  .then(res => res.json())
78
  .then(data => {
79
  setCurrentPageData(data);
80
+ setLoadingPage(false);
81
  })
82
+ .catch(err => {
83
+ console.error("Failed to load page data", err);
84
+ setLoadingPage(false);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  });
86
+ }
87
+ }, [selectedDocIndex, currentPageNumber]);
88
+
89
+ // Load page data when page changes
90
+ useEffect(() => {
91
+ refreshPageData();
92
+ }, [refreshPageData]);
93
+
94
+ // Load persisted annotations on mount
95
+ useEffect(() => {
96
+ fetch('/api/annotations')
97
+ .then(res => res.json())
98
+ .then(data => {
99
+ if (Array.isArray(data)) setAnnotations(data);
100
+ })
101
+ .catch(err => console.error("Failed to load annotations", err));
102
+ }, [])
103
 
104
+ // Auto-fetch HF username in production
105
+ useEffect(() => {
106
+ fetch('/api/whoami')
107
+ .then(res => res.ok ? res.json() : null)
108
+ .then(data => {
109
+ if (data?.username) {
110
+ setAnnotatorName(data.username);
111
+ localStorage.setItem('annotator_name', data.username);
112
  }
113
+ })
114
+ .catch(() => { }); // Silently ignore — falls back to localStorage name
115
+ }, []);
116
+
117
+ // Load annotator name from localStorage
118
+ useEffect(() => {
119
+ const saved = localStorage.getItem('annotator_name');
120
+ if (saved) setAnnotatorName(saved);
121
+ }, []);
122
+
123
+ const handleAnnotatorChange = (name) => {
124
+ setAnnotatorName(name);
125
+ localStorage.setItem('annotator_name', name);
126
+ };
127
+
128
+ const handleDocChange = (docIdx) => {
129
+ setSelectedDocIndex(docIdx);
130
+ setPageIdx(0);
131
+ };
132
+
133
+ const handlePrevPage = () => {
134
+ setPageIdx(prev => Math.max(0, prev - 1));
135
+ };
136
+
137
+ const handleNextPage = () => {
138
+ setPageIdx(prev => Math.min(annotatablePages.length - 1, prev + 1));
139
+ };
140
+
141
+ const handleAnnotate = (text, domOffset) => {
142
+ setSelectedText(text);
143
+ setSelectionOffset(domOffset || 0);
144
+ setModalOpen(true);
145
+ };
146
+
147
+ const handleAnnotationSubmit = async ({ dataset_tag }) => {
148
+ // Find ALL occurrences of the selected text in input_text
149
+ const inputText = currentPageData?.input_text || "";
150
+ const occurrences = [];
151
+ let searchFrom = 0;
152
+ while (searchFrom < inputText.length) {
153
+ const idx = inputText.indexOf(selectedText, searchFrom);
154
+ if (idx === -1) break;
155
+ occurrences.push(idx);
156
+ searchFrom = idx + 1;
157
+ }
158
+
159
+ let startIdx = null;
160
+ if (occurrences.length === 1) {
161
+ startIdx = occurrences[0];
162
+ } else if (occurrences.length > 1) {
163
+ const container = document.querySelector('.markdown-preview');
164
+ const visibleLen = container?.textContent?.length || inputText.length;
165
+ const ratio = inputText.length / visibleLen;
166
+ const estimatedSourcePos = selectionOffset * ratio;
167
+ startIdx = occurrences.reduce((best, idx) =>
168
+ Math.abs(idx - estimatedSourcePos) < Math.abs(best - estimatedSourcePos) ? idx : best
169
+ );
170
+ }
171
+
172
+ const endIdx = startIdx !== null ? startIdx + selectedText.length : null;
173
+
174
+ const payload = {
175
+ dataset_name: {
176
+ text: selectedText,
177
+ confidence: 1.0,
178
+ start: startIdx >= 0 ? startIdx : null,
179
+ end: endIdx,
180
+ },
181
+ dataset_tag: dataset_tag,
182
+ annotator: annotatorName || "user",
183
+ document_index: selectedDocIndex,
184
+ page_number: currentPageNumber,
185
+ timestamp: new Date().toISOString(),
186
+ description: null,
187
+ data_type: null,
188
+ acronym: null,
189
+ author: null,
190
+ producer: null,
191
+ geography: null,
192
+ publication_year: null,
193
+ reference_year: null,
194
+ reference_population: null,
195
+ is_used: null,
196
+ usage_context: null,
197
+ };
198
+
199
+ try {
200
+ const res = await fetch('/api/annotate', {
201
+ method: 'POST',
202
+ headers: { 'Content-Type': 'application/json' },
203
+ body: JSON.stringify(payload)
204
+ });
205
+
206
+ if (res.ok) {
207
+ setAnnotations(prev => [...prev, payload]);
208
+ setModalOpen(false);
209
+ showToast("Annotation saved!");
210
+ setPanelOpen(true);
211
+ refreshPageData(); // Refresh so new annotation appears highlighted
212
+ } else {
213
+ showToast("Failed to save annotation.", "error");
214
+ }
215
+ } catch (err) {
216
+ console.error(err);
217
+ showToast("Network error saving annotation.", "error");
218
+ }
219
+ };
220
+
221
+ // Delete dataset entry by index
222
+ const handleDeleteAnnotation = async (ds, idx) => {
223
+ try {
224
+ const res = await fetch(
225
+ `/api/validate?doc=${selectedDocIndex}&page=${currentPageNumber}&idx=${idx}`,
226
+ { method: 'DELETE' }
227
+ );
228
+ if (res.ok) {
229
+ refreshPageData();
230
+ showToast("Data mention deleted.");
231
+ } else {
232
+ showToast("Failed to delete.", "error");
233
+ }
234
+ } catch (err) {
235
+ console.error(err);
236
+ showToast("Network error deleting.", "error");
237
+ }
238
+ };
239
+
240
+ // Update annotation (e.g. change dataset_tag)
241
+ const handleUpdateAnnotation = async (annotation, idx, updates) => {
242
+ try {
243
+ const res = await fetch('/api/annotate', {
244
+ method: 'PUT',
245
+ headers: { 'Content-Type': 'application/json' },
246
+ body: JSON.stringify({
247
+ timestamp: annotation.timestamp,
248
+ document_index: annotation.document_index ?? selectedDocIndex,
249
+ page_number: annotation.page_number ?? currentPageNumber,
250
+ updates,
251
+ })
252
+ });
253
+ if (res.ok) {
254
+ setAnnotations(prev => prev.map(a =>
255
+ a.timestamp === annotation.timestamp ? { ...a, ...updates } : a
256
+ ));
257
+ showToast("Annotation updated.");
258
+ } else {
259
+ showToast("Failed to update annotation.", "error");
260
  }
261
+ } catch (err) {
262
+ console.error(err);
263
+ showToast("Network error updating annotation.", "error");
264
  }
265
  };
266
 
267
+ // All datasets on the current page (model + human)
268
+ const currentPageDatasets = currentPageData?.datasets || [];
269
+
270
+ // Validate a dataset entry (approve/reject with notes)
271
+ const handleValidateDataset = async (datasetIdx, updates) => {
272
+ try {
273
+ const res = await fetch('/api/validate', {
274
+ method: 'PUT',
275
+ headers: { 'Content-Type': 'application/json' },
276
+ body: JSON.stringify({
277
+ document_index: selectedDocIndex,
278
+ page_number: currentPageNumber,
279
+ dataset_index: datasetIdx,
280
+ updates,
281
+ })
282
+ });
283
+ if (res.ok) {
284
+ refreshPageData();
285
+ showToast("Validation saved!");
286
+ } else {
287
+ showToast("Failed to save validation.", "error");
288
+ }
289
+ } catch (err) {
290
+ console.error(err);
291
+ showToast("Network error saving validation.", "error");
292
+ }
293
+ };
294
+
295
+ if (loading) {
296
+ return (
297
+ <div className="loading-container">
298
+ <div className="loading-spinner" />
299
+ <p>Loading documents...</p>
300
+ </div>
301
+ );
302
+ }
303
 
304
  return (
305
  <div className="container">
306
  <div className="pane left-pane">
307
+ <div className="pane-header">
308
+ <h2>PDF Viewer</h2>
309
+ <DocumentSelector
310
+ documents={documents}
311
+ selectedDocIndex={selectedDocIndex}
312
+ onDocChange={handleDocChange}
313
+ />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  </div>
315
+ <PdfViewer
316
+ pdfUrl={currentDoc?.pdf_url}
317
+ pageNumber={currentPageNumber}
318
+ />
319
+ </div>
320
 
321
+ <div className="pane right-pane">
322
+ <MarkdownAnnotator
323
+ selectedDocIndex={selectedDocIndex}
324
+ selectedPage={currentPageNumber}
325
+ currentPageData={currentPageData}
326
+ loadingPage={loadingPage}
327
+ onAnnotate={handleAnnotate}
328
+ />
329
+ </div>
330
+
331
+ {/* Floating chevron to open annotations panel */}
332
+ <button
333
+ className="panel-chevron"
334
+ onClick={() => setPanelOpen(prev => !prev)}
335
+ title="Toggle annotations"
336
+ >
337
+ {panelOpen ? '›' : '‹'}
338
+ {!panelOpen && currentPageDatasets.length > 0 && (
339
+ <span className="chevron-badge">{currentPageDatasets.length}</span>
340
  )}
341
+ </button>
342
+ <AnnotationPanel
343
+ isOpen={panelOpen}
344
+ onClose={() => setPanelOpen(false)}
345
+ datasets={currentPageDatasets}
346
+ annotatorName={annotatorName}
347
+ onValidate={handleValidateDataset}
348
+ onDelete={handleDeleteAnnotation}
349
+ />
350
+
351
+ {/* Shared page navigator at the bottom */}
352
+ <div className="bottom-nav">
353
+ <PageNavigator
354
+ currentIndex={pageIdx}
355
+ totalPages={annotatablePages.length}
356
+ currentPageNumber={currentPageNumber}
357
+ onPrevious={handlePrevPage}
358
+ onNext={handleNextPage}
359
+ />
360
  </div>
361
 
362
+ <AnnotationModal
363
+ isOpen={modalOpen}
364
+ selectedText={selectedText}
365
+ annotatorName={annotatorName}
366
+ onAnnotatorChange={handleAnnotatorChange}
367
+ onSubmit={handleAnnotationSubmit}
368
+ onClose={() => setModalOpen(false)}
369
+ />
 
 
 
 
 
 
 
 
370
 
371
+ {toast && (
372
+ <div className={`toast toast-${toast.type}`}>
373
+ {toast.message}
 
 
 
 
 
 
 
 
 
374
  </div>
375
+ )}
376
  </div>
377
  );
378
  }
utils/config.js ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ // Centralized configuration for the annotation app
2
+ export const HF_DATASET_ID = process.env.HF_DATASET_REPO || 'ai4data/annotation_data';
3
+ export const HF_DATASET_BASE_URL = `https://huggingface.co/datasets/${HF_DATASET_ID}`;
4
+ export const MAX_DOCS_TO_SCAN = parseInt(process.env.MAX_DOCS_TO_SCAN || '5', 10);
utils/storage.js CHANGED
@@ -1,132 +1,258 @@
1
  import fs from 'fs';
2
  import path from 'path';
 
 
3
 
4
- // Get the root path of the project (handles Docker container `/app` or local)
5
  const getRootPath = () => process.cwd();
6
 
7
- // Determines if we are running in a Hugging Face Space
8
- // which we indicate by the presence of HF_TOKEN and not being explicitly in dev mode
9
  const isHFSpace = () => {
10
  return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
11
  };
12
 
13
  /**
14
- * Saves the annotation payload to either the local system or HF Datasets
15
- * @param {Object} annotation { document_index, page_number, selected_text, annotation_note, timestamp }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  */
17
  export async function saveAnnotation(annotation) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  if (isHFSpace()) {
19
- await saveToHuggingFace(annotation);
 
 
 
 
 
 
 
20
  } else {
21
- await saveToLocal(annotation);
 
 
 
 
 
 
 
 
22
  }
23
  }
24
 
25
  /**
26
- * Saves annotation locally by creating or appending to human_annotations.json
27
  */
28
- async function saveToLocal(annotation) {
29
- const dataDir = path.join(getRootPath(), 'annotation_data');
30
- const filePath = path.join(dataDir, 'human_annotations.json');
 
 
31
 
32
- // Ensure directory exists
33
- if (!fs.existsSync(dataDir)) {
34
- fs.mkdirSync(dataDir, { recursive: true });
35
- }
 
36
 
37
- let annotations = [];
38
- if (fs.existsSync(filePath)) {
39
- try {
40
- const raw = fs.readFileSync(filePath, 'utf-8');
41
- annotations = JSON.parse(raw);
42
- } catch (e) {
43
- console.error("Error parsing existing annotations, creating new array.", e);
44
- }
45
- }
46
 
47
- annotations.push(annotation);
48
- fs.writeFileSync(filePath, JSON.stringify(annotations, null, 2));
49
- console.log(`Saved annotation locally to ${filePath}`);
 
 
 
 
 
 
 
 
 
50
  }
51
 
52
  /**
53
- * Commits the new annotation to a JSON Lines (JSONL) dataset on Hugging Face
54
  */
55
- async function saveToHuggingFace(annotation) {
56
- const repoId = process.env.HF_DATASET_REPO || 'rafmacalaba/wbg_annotation_data';
57
- const token = process.env.HF_TOKEN;
 
 
58
 
59
- if (!token) throw new Error("Missing HF_TOKEN for dataset commit.");
60
-
61
- try {
62
- const filename = `annotations/${annotation.document_index}_p${annotation.page_number}_${Date.now()}.json`;
63
- const content = Buffer.from(JSON.stringify(annotation, null, 2)).toString('base64');
64
-
65
- const payload = {
66
- commit_message: `Add annotation for doc ${annotation.document_index} page ${annotation.page_number}`,
67
- operations: [
68
- {
69
- key: "path",
70
- value: filename
71
- },
72
- {
73
- key: "content",
74
- value: content
75
- }
76
- ]
77
  };
 
 
 
 
 
 
78
 
79
- // Use the Hugging Face REST API directly
80
- const res = await fetch(`https://huggingface.co/api/datasets/${repoId}/commit/main`, {
81
- method: 'POST',
82
- headers: {
83
- 'Authorization': `Bearer ${token}`,
84
- 'Content-Type': 'application/json'
85
- },
86
- body: JSON.stringify({
87
- summary: payload.commit_message,
88
- operations: [
89
- {
90
- keyItem: "path",
91
- keyValue: filename,
92
- keyItem2: "content",
93
- keyValue2: content
94
- }
95
- ]
96
- }) // The API structure for operations is slightly complex, lets use a simple multipart form or the proper JSON
97
- });
98
-
99
- // Actually the HF Commit API expects a specific JSON structure. Let's send the correct one:
100
- // { "operations": [{ "operation": "addOrUpdate", "path": "filename", "content": "base64encoded==" }], "commit_message": "..." }
101
-
102
- const correctPayload = {
103
- summary: payload.commit_message,
104
- operations: [
105
- {
106
- operation: "addOrUpdate",
107
- path: filename,
108
- content: content,
109
- encoding: "base64"
110
- }
111
- ]
112
  };
 
 
 
 
113
 
114
- const executeRes = await fetch(`https://huggingface.co/api/datasets/${repoId}/commit/main`, {
115
- method: 'POST',
116
- headers: {
117
- 'Authorization': `Bearer ${token}`,
118
- 'Content-Type': 'application/json'
119
- },
120
- body: JSON.stringify(correctPayload)
121
- });
122
-
123
- if (!executeRes.ok) {
124
- throw new Error(`Failed to commit to HF: ${await executeRes.text()}`);
125
- }
126
 
127
- console.log(`Successfully committed annotation ${filename} to HF Space ${repoId}`);
128
- } catch (e) {
129
- console.error("Failed to commit to Hugging Face:", e);
130
- throw e;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  }
 
 
132
  }
 
1
  import fs from 'fs';
2
  import path from 'path';
3
+ import { commit } from '@huggingface/hub';
4
+ import { HF_DATASET_ID, HF_DATASET_BASE_URL } from './config.js';
5
 
 
6
  const getRootPath = () => process.cwd();
7
 
 
 
8
  const isHFSpace = () => {
9
  return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
10
  };
11
 
12
  /**
13
+ * Returns the local file path for a document's judged extractions file (doc_<index>_direct_judged.jsonl)
14
+ */
15
+ function getDocFilePath(docIndex) {
16
+ return path.join(
17
+ getRootPath(),
18
+ 'annotation_data', 'wbg_extractions',
19
+ `doc_${docIndex}`, 'raw', `doc_${docIndex}_direct_judged.jsonl`
20
+ );
21
+ }
22
+
23
+ /**
24
+ * Returns the HF repo path for a document's judged extractions file (doc_<index>_direct_judged.jsonl)
25
+ */
26
+ function getDocRepoPath(docIndex) {
27
+ return `annotation_data/wbg_extractions/doc_${docIndex}/raw/doc_${docIndex}_direct_judged.jsonl`;
28
+ }
29
+
30
+ /**
31
+ * Reads the full document JSON (all pages) from local file
32
+ */
33
+ function readDocLocal(docIndex) {
34
+ const filePath = getDocFilePath(docIndex);
35
+ if (!fs.existsSync(filePath)) return null;
36
+ const raw = fs.readFileSync(filePath, 'utf-8');
37
+ return JSON.parse(raw);
38
+ }
39
+
40
+ /**
41
+ * Writes the full document JSON (all pages) to local file
42
+ */
43
+ function writeDocLocal(docIndex, pagesData) {
44
+ const filePath = getDocFilePath(docIndex);
45
+ fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
46
+ console.log(`Saved doc_${docIndex}_direct_judged.jsonl locally`);
47
+ }
48
+
49
+ /**
50
+ * Finds the page index in the pages array by page_number
51
+ * Uses document.pages[0] to match, consistent with the document/route.js API
52
+ */
53
+ function findPageIndex(pagesData, pageNumber) {
54
+ return pagesData.findIndex(p => p.document?.pages?.[0] === pageNumber);
55
+ }
56
+
57
+ /**
58
+ * Fetches the document JSON from HuggingFace
59
+ */
60
+ async function fetchDocFromHF(docIndex) {
61
+ const token = process.env.HF_TOKEN;
62
+ const url = `${HF_DATASET_BASE_URL}/raw/main/${getDocRepoPath(docIndex)}`;
63
+ const res = await fetch(url, {
64
+ headers: { 'Authorization': `Bearer ${token}` }
65
+ });
66
+ if (!res.ok) throw new Error(`Failed to fetch doc_${docIndex} from HF: ${res.status}`);
67
+ return res.json();
68
+ }
69
+
70
+ /**
71
+ * Commits the updated document JSON back to HuggingFace
72
+ */
73
+ async function commitDocToHF(docIndex, pagesData, commitMessage) {
74
+ const token = process.env.HF_TOKEN;
75
+ if (!token) throw new Error("Missing HF_TOKEN");
76
+
77
+ const repoPath = getDocRepoPath(docIndex);
78
+ const content = JSON.stringify(pagesData, null, 2);
79
+
80
+ await commit({
81
+ repo: { type: 'dataset', name: HF_DATASET_ID },
82
+ credentials: { accessToken: token },
83
+ title: commitMessage,
84
+ operations: [{
85
+ operation: 'addOrUpdate',
86
+ path: repoPath,
87
+ content: new Blob([content], { type: 'application/json' }),
88
+ }],
89
+ });
90
+ console.log(`Committed ${repoPath} to HF dataset ${HF_DATASET_ID}`);
91
+ }
92
+
93
+ // ─── Public API ────────────────────────────────────
94
+
95
+ /**
96
+ * Saves an annotation by appending it to the page's datasets array
97
+ * in the per-document doc_<index>_direct_judged.jsonl file.
98
+ *
99
+ * @param {Object} annotation - Must include document_index, page_number, and dataset fields
100
  */
101
  export async function saveAnnotation(annotation) {
102
+ const { document_index: docIndex, page_number: pageNumber } = annotation;
103
+
104
+ // Build the dataset entry (strip routing fields — they stay at page/doc level)
105
+ const datasetEntry = {
106
+ dataset_name: annotation.dataset_name,
107
+ dataset_tag: annotation.dataset_tag,
108
+ annotator: annotation.annotator,
109
+ timestamp: annotation.timestamp,
110
+ description: annotation.description || null,
111
+ data_type: annotation.data_type || null,
112
+ acronym: annotation.acronym || null,
113
+ author: annotation.author || null,
114
+ producer: annotation.producer || null,
115
+ geography: annotation.geography || null,
116
+ publication_year: annotation.publication_year || null,
117
+ reference_year: annotation.reference_year || null,
118
+ reference_population: annotation.reference_population || null,
119
+ is_used: annotation.is_used || null,
120
+ usage_context: annotation.usage_context || null,
121
+ };
122
+
123
  if (isHFSpace()) {
124
+ // Production: fetch from HF, modify, commit back
125
+ const pagesData = await fetchDocFromHF(docIndex);
126
+ const pageIdx = findPageIndex(pagesData, pageNumber);
127
+ if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex}`);
128
+
129
+ pagesData[pageIdx].datasets.push(datasetEntry);
130
+ await commitDocToHF(docIndex, pagesData,
131
+ `Add human annotation to doc_${docIndex} page ${pageNumber}`);
132
  } else {
133
+ // Local: read, modify, write
134
+ const pagesData = readDocLocal(docIndex);
135
+ if (!pagesData) throw new Error(`doc_${docIndex}_raw.json not found locally`);
136
+
137
+ const pageIdx = findPageIndex(pagesData, pageNumber);
138
+ if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex}`);
139
+
140
+ pagesData[pageIdx].datasets.push(datasetEntry);
141
+ writeDocLocal(docIndex, pagesData);
142
  }
143
  }
144
 
/**
 * Deletes every entry of the page's `datasets` array whose `timestamp` is
 * strictly equal to the given one, then persists the document (commit to HF
 * when isHFSpace() is true, local write otherwise).
 *
 * @param {*} timestamp - Timestamp identifying the annotation to remove.
 * @param {*} docIndex - Document index.
 * @param {*} pageNumber - Page number, matched via findPageIndex.
 * @returns {Promise<boolean>} true when at least one entry was removed and
 *   saved; false when the timestamp is missing or the document, page or
 *   entry was not found.
 */
export async function deleteAnnotation(timestamp, docIndex, pageNumber) {
  // A missing timestamp would match every entry that has no `timestamp`
  // property and delete all of them, so refuse it up front.
  if (timestamp === undefined || timestamp === null) return false;

  const onSpace = isHFSpace();
  const pagesData = onSpace ? await fetchDocFromHF(docIndex) : readDocLocal(docIndex);
  if (!pagesData) return false;

  const pageIdx = findPageIndex(pagesData, pageNumber);
  if (pageIdx === -1) return false;

  const page = pagesData[pageIdx];
  // Treat a page without a `datasets` array as having nothing to delete.
  const existing = Array.isArray(page.datasets) ? page.datasets : [];
  const remaining = existing.filter(ds => ds.timestamp !== timestamp);
  if (remaining.length === existing.length) return false;

  page.datasets = remaining;
  if (onSpace) {
    await commitDocToHF(docIndex, pagesData,
      `Delete annotation from doc_${docIndex} page ${pageNumber}`);
  } else {
    writeDocLocal(docIndex, pagesData);
  }
  return true;
}
180
 
/**
 * Updates the first entry of the page's `datasets` array whose `timestamp` is
 * strictly equal to the given one by shallow-merging `updates` over it, then
 * persists the document (commit to HF when isHFSpace() is true, local write
 * otherwise).
 *
 * @param {*} timestamp - Timestamp identifying the annotation to update.
 * @param {*} docIndex - Document index.
 * @param {*} pageNumber - Page number, matched via findPageIndex.
 * @param {Object} updates - Fields to overwrite on the stored entry.
 * @returns {Promise<Object|null>} The merged entry, or null when the timestamp
 *   is missing or the document, page or entry was not found.
 */
export async function updateAnnotation(timestamp, docIndex, pageNumber, updates) {
  // A missing timestamp would match the first entry that has no `timestamp`
  // property and overwrite the wrong entry, so refuse it up front.
  if (timestamp === undefined || timestamp === null) return null;

  const onSpace = isHFSpace();
  const pagesData = onSpace ? await fetchDocFromHF(docIndex) : readDocLocal(docIndex);
  if (!pagesData) return null;

  const pageIdx = findPageIndex(pagesData, pageNumber);
  if (pageIdx === -1) return null;

  const datasets = pagesData[pageIdx].datasets;
  // Treat a page without a `datasets` array as having nothing to update.
  if (!Array.isArray(datasets)) return null;

  const dsIdx = datasets.findIndex(ds => ds.timestamp === timestamp);
  if (dsIdx === -1) return null;

  const merged = { ...datasets[dsIdx], ...updates };
  datasets[dsIdx] = merged;

  if (onSpace) {
    await commitDocToHF(docIndex, pagesData,
      `Update annotation in doc_${docIndex} page ${pageNumber}`);
  } else {
    writeDocLocal(docIndex, pagesData);
  }
  return merged;
}
218
 
/**
 * Retrieves human annotations from the local per-document files.
 *
 * Scans every `doc_*` directory under `annotation_data/wbg_extractions`, reads
 * `<dir>/raw/<dir>_direct_judged.jsonl`, and returns each dataset entry that
 * has a truthy `annotator` field, tagged with its document index and page
 * number. Files that cannot be read or parsed are logged and skipped.
 *
 * @param {number|string|null} [docIndex=null] - When given, only that document
 *   is scanned; numeric strings are accepted.
 * @returns {Promise<Array<Object>>} Matching entries, each extended with
 *   `document_index` and `page_number`.
 */
export async function getAnnotations(docIndex = null) {
  const extractionsDir = path.join(getRootPath(), 'annotation_data', 'wbg_extractions');
  if (!fs.existsSync(extractionsDir)) return [];

  // Normalize the filter to an integer so a string such as "3" still matches
  // the integer parsed from the directory name.
  const wantedIndex = docIndex === null || docIndex === undefined
    ? null
    : Number.parseInt(String(docIndex), 10);

  const results = [];
  const dirs = fs.readdirSync(extractionsDir).filter(d => d.startsWith('doc_'));

  for (const dir of dirs) {
    const idx = Number.parseInt(dir.replace('doc_', ''), 10);
    // Skip names without a numeric suffix rather than emitting NaN indices.
    if (Number.isNaN(idx)) continue;
    if (wantedIndex !== null && idx !== wantedIndex) continue;

    const filePath = path.join(extractionsDir, dir, 'raw', `${dir}_direct_judged.jsonl`);
    if (!fs.existsSync(filePath)) continue;

    try {
      // Despite the .jsonl extension, the whole file is parsed as one JSON value.
      const pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
      for (const page of pagesData) {
        const pageNum = page.document?.pages?.[0];
        for (const ds of (page.datasets || [])) {
          // Only return human annotations (those with annotator field)
          if (ds.annotator) {
            results.push({
              ...ds,
              document_index: idx,
              page_number: pageNum,
            });
          }
        }
      }
    } catch (e) {
      console.error(`Error reading ${filePath}:`, e);
    }
  }

  return results;
}