gauthamnairy commited on
Commit
b163dc2
·
verified ·
1 Parent(s): 8eab37a

Upload 42 files

Browse files
Files changed (42) hide show
  1. .gitattributes +8 -35
  2. .gitignore +15 -0
  3. CHANGELOG.md +16 -0
  4. Dockerfile +23 -0
  5. LICENSE +21 -0
  6. README.md +261 -11
  7. app.py +67 -0
  8. cookbook/README.md +14 -0
  9. cookbook/agentic_retrieval.ipynb +899 -0
  10. cookbook/pageIndex_chat_quickstart.ipynb +274 -0
  11. cookbook/pageindex_RAG_simple.ipynb +609 -0
  12. cookbook/vision_RAG_pageindex.ipynb +667 -0
  13. llm_config.py +39 -0
  14. pageindex/__init__.py +2 -0
  15. pageindex/config.yaml +8 -0
  16. pageindex/core/tree_index.py +226 -0
  17. pageindex/page_index.py +1144 -0
  18. pageindex/page_index_md.py +339 -0
  19. pageindex/utils.py +712 -0
  20. requirements.txt +8 -0
  21. run_pageindex.py +133 -0
  22. tests/pdfs/2023-annual-report-truncated.pdf +3 -0
  23. tests/pdfs/2023-annual-report.pdf +3 -0
  24. tests/pdfs/PRML.pdf +3 -0
  25. tests/pdfs/Regulation Best Interest_Interpretive release.pdf +3 -0
  26. tests/pdfs/Regulation Best Interest_proposed rule.pdf +3 -0
  27. tests/pdfs/earthmover.pdf +3 -0
  28. tests/pdfs/four-lectures.pdf +3 -0
  29. tests/pdfs/q1-fy25-earnings.pdf +3 -0
  30. tests/results/2023-annual-report-truncated_structure.json +83 -0
  31. tests/results/2023-annual-report_structure.json +493 -0
  32. tests/results/PRML_structure.json +1847 -0
  33. tests/results/Regulation Best Interest_Interpretive release_structure.json +73 -0
  34. tests/results/Regulation Best Interest_proposed rule_structure.json +0 -0
  35. tests/results/earthmover_structure.json +137 -0
  36. tests/results/four-lectures_structure.json +333 -0
  37. tests/results/q1-fy25-earnings_structure.json +311 -0
  38. tutorials/doc-search/README.md +17 -0
  39. tutorials/doc-search/description.md +67 -0
  40. tutorials/doc-search/metadata.md +37 -0
  41. tutorials/doc-search/semantics.md +41 -0
  42. tutorials/tree-search/README.md +70 -0
.gitattributes CHANGED
@@ -1,35 +1,8 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.ipynb linguist-vendored
+ tests/pdfs/2023-annual-report-truncated.pdf filter=lfs diff=lfs merge=lfs -text
2
+ tests/pdfs/2023-annual-report.pdf filter=lfs diff=lfs merge=lfs -text
3
+ tests/pdfs/earthmover.pdf filter=lfs diff=lfs merge=lfs -text
4
+ tests/pdfs/four-lectures.pdf filter=lfs diff=lfs merge=lfs -text
5
+ tests/pdfs/PRML.pdf filter=lfs diff=lfs merge=lfs -text
6
+ tests/pdfs/q1-fy25-earnings.pdf filter=lfs diff=lfs merge=lfs -text
7
+ tests/pdfs/Regulation[[:space:]]Best[[:space:]]Interest_Interpretive[[:space:]]release.pdf filter=lfs diff=lfs merge=lfs -text
8
+ tests/pdfs/Regulation[[:space:]]Best[[:space:]]Interest_proposed[[:space:]]rule.pdf filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .ipynb_checkpoints
2
+ __pycache__
3
+ files
4
+ index
5
+ temp/*
6
+ chroma-collections.parquet
7
+ chroma-embeddings.parquet
8
+ .DS_Store
9
+ .env*
10
+ notebook
11
+ SDK/*
12
+ log/*
13
+ logs/
14
+ parts/*
15
+ json_results/*
CHANGELOG.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Change Log
2
+ All notable changes to this project will be documented in this file.
3
+
4
+ ## Beta - 2025-04-23
5
+
6
+ ### Fixed
7
+ - [x] Fixed a bug introduced on April 18 where `start_index` was incorrectly passed.
8
+
9
+ ## Beta - 2025-04-03
10
+
11
+ ### Added
12
+ - [x] Add node_id, node summary
13
+ - [x] Add document discription
14
+
15
+ ### Changed
16
+ - [x] Change "child_nodes" -> "nodes" to simplify the structure
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ FROM python:3.10-slim
3
+
4
+ WORKDIR /app
5
+
6
+ # Copy requirements first to leverage Docker cache
7
+ COPY requirements.txt .
8
+
9
+ # Install dependencies
10
+ RUN pip install --no-cache-dir -r requirements.txt
11
+
12
+ # Copy the rest of the application
13
+ COPY . .
14
+
15
+ # Expose the port used by Gradio
16
+ EXPOSE 7860
17
+
18
+ # Set environment variables if needed (though secrets should be set in HF Space settings)
19
+ # ENV NVIDIA_API_KEY=...
20
+ # ENV MISTRAL_API_KEY=...
21
+
22
+ # Run the application
23
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Vectify AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,11 +1,261 @@
1
- ---
2
- title: PageIndexAPI
3
- emoji: 📚
4
- colorFrom: red
5
- colorTo: blue
6
- sdk: docker
7
- pinned: false
8
- license: apache-2.0
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: PageIndex RAG
3
+ emoji: 📑
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ app_file: app.py
8
+ pinned: false
9
+ ---
10
+
11
+ <div align="center">
12
+
13
+ <a href="https://vectify.ai/pageindex" target="_blank">
14
+ <img src="https://github.com/user-attachments/assets/46201e72-675b-43bc-bfbd-081cc6b65a1d" alt="PageIndex Banner" />
15
+ </a>
16
+
17
+ <br/>
18
+ <br/>
19
+
20
+ <p align="center">
21
+ <a href="https://trendshift.io/repositories/14736" target="_blank"><img src="https://trendshift.io/api/badge/repositories/14736" alt="VectifyAI%2FPageIndex | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
22
+ </p>
23
+
24
+ # PageIndex: Vectorless, Reasoning-based RAG
25
+
26
+ <p align="center"><b>Reasoning-based RAG&nbsp; ◦ &nbsp;No Vector DB&nbsp; ◦ &nbsp;No Chunking&nbsp; ◦ &nbsp;Human-like Retrieval</b></p>
27
+
28
+ <h4 align="center">
29
+ <a href="https://vectify.ai">🏠 Homepage</a>&nbsp; • &nbsp;
30
+ <a href="https://chat.pageindex.ai">🖥️ Chat Platform</a>&nbsp; • &nbsp;
31
+ <a href="https://pageindex.ai/mcp">🔌 MCP</a>&nbsp; • &nbsp;
32
+ <a href="https://docs.pageindex.ai">📚 Docs</a>&nbsp; • &nbsp;
33
+ <a href="https://discord.com/invite/VuXuf29EUj">💬 Discord</a>&nbsp; • &nbsp;
34
+ <a href="https://ii2abc2jejf.typeform.com/to/tK3AXl8T">✉️ Contact</a>&nbsp;
35
+ </h4>
36
+
37
+ </div>
38
+
39
+
40
+ <details open>
41
+ <summary><h3>📢 Latest Updates</h3></summary>
42
+
43
+ **🔥 Releases:**
44
+ - [**PageIndex Chat**](https://chat.pageindex.ai): The first human-like document-analysis agent [platform](https://chat.pageindex.ai) built for professional long documents. Can also be integrated via [MCP](https://pageindex.ai/mcp) or [API](https://docs.pageindex.ai/quickstart) (beta).
45
+ <!-- - [**PageIndex Chat API**](https://docs.pageindex.ai/quickstart): An API that brings PageIndex's advanced long-document intelligence directly into your applications and workflows. -->
46
+ <!-- - [PageIndex MCP](https://pageindex.ai/mcp): Bring PageIndex into Claude, Cursor, or any MCP-enabled agent. Chat with long PDFs in a reasoning-based, human-like way. -->
47
+
48
+ **📝 Articles:**
49
+ - [**PageIndex Framework**](https://pageindex.ai/blog/pageindex-intro): Introduces the PageIndex framework — an *agentic, in-context* *tree index* that enables LLMs to perform *reasoning-based*, *human-like retrieval* over long documents, without vector DB or chunking.
50
+ <!-- - [Do We Still Need OCR?](https://pageindex.ai/blog/do-we-need-ocr): Explores how vision-based, reasoning-native RAG challenges the traditional OCR pipeline, and why the future of document AI might be *vectorless* and *vision-based*. -->
51
+
52
+ **🧪 Cookbooks:**
53
+ - [Vectorless RAG](https://docs.pageindex.ai/cookbook/vectorless-rag-pageindex): A minimal, hands-on example of reasoning-based RAG using PageIndex. No vectors, no chunking, and human-like retrieval.
54
+ - [Vision-based Vectorless RAG](https://docs.pageindex.ai/cookbook/vision-rag-pageindex): OCR-free, vision-only RAG with PageIndex's reasoning-native retrieval workflow that works directly over PDF page images.
55
+ </details>
56
+
57
+ ---
58
+
59
+ # 📑 Introduction to PageIndex
60
+
61
+ Are you frustrated with vector database retrieval accuracy for long professional documents? Traditional vector-based RAG relies on semantic *similarity* rather than true *relevance*. But **similarity ≠ relevance** — what we truly need in retrieval is **relevance**, and that requires **reasoning**. When working with professional documents that demand domain expertise and multi-step reasoning, similarity search often falls short.
62
+
63
+ Inspired by AlphaGo, we propose **[PageIndex](https://vectify.ai/pageindex)** — a **vectorless**, **reasoning-based RAG** system that builds a **hierarchical tree index** from long documents and uses LLMs to **reason** *over that index* for **agentic, context-aware retrieval**.
64
+ It simulates how *human experts* navigate and extract knowledge from complex documents through *tree search*, enabling LLMs to *think* and *reason* their way to the most relevant document sections. PageIndex performs retrieval in two steps:
65
+
66
+ 1. Generate a “Table-of-Contents” **tree structure index** of documents
67
+ 2. Perform reasoning-based retrieval through **tree search**
68
+
69
+ <div align="center">
70
+ <a href="https://pageindex.ai/blog/pageindex-intro" target="_blank" title="The PageIndex Framework">
71
+ <img src="https://docs.pageindex.ai/images/cookbook/vectorless-rag.png" width="70%">
72
+ </a>
73
+ </div>
74
+
75
+ ### 🎯 Core Features
76
+
77
+ Compared to traditional vector-based RAG, **PageIndex** features:
78
+ - **No Vector DB**: Uses document structure and LLM reasoning for retrieval, instead of vector similarity search.
79
+ - **No Chunking**: Documents are organized into natural sections, not artificial chunks.
80
+ - **Human-like Retrieval**: Simulates how human experts navigate and extract knowledge from complex documents.
81
+ - **Better Explainability and Traceability**: Retrieval is based on reasoning — traceable and interpretable, with page and section references. No more opaque, approximate vector search (“vibe retrieval”).
82
+
83
+ PageIndex powers a reasoning-based RAG system that achieved **state-of-the-art** [98.7% accuracy](https://github.com/VectifyAI/Mafin2.5-FinanceBench) on FinanceBench, demonstrating superior performance over vector-based RAG solutions in professional document analysis (see our [blog post](https://vectify.ai/blog/Mafin2.5) for details).
84
+
85
+ ### 📍 Explore PageIndex
86
+
87
+ To learn more, please see a detailed introduction of the [PageIndex framework](https://pageindex.ai/blog/pageindex-intro). Check out this GitHub repo for open-source code, and the [cookbooks](https://docs.pageindex.ai/cookbook), [tutorials](https://docs.pageindex.ai/tutorials), and [blog](https://pageindex.ai/blog) for additional usage guides and examples.
88
+
89
+ The PageIndex service is available as a ChatGPT-style [chat platform](https://chat.pageindex.ai), or can be integrated via [MCP](https://pageindex.ai/mcp) or [API](https://docs.pageindex.ai/quickstart).
90
+
91
+ ### 🛠️ Deployment Options
92
+ - Self-host — run locally with this open-source repo.
93
+ - Cloud Service — try instantly with our [Chat Platform](https://chat.pageindex.ai/), or integrate with [MCP](https://pageindex.ai/mcp) or [API](https://docs.pageindex.ai/quickstart).
94
+ - _Enterprise_ — private or on-prem deployment. [Contact us](https://ii2abc2jejf.typeform.com/to/tK3AXl8T) or [book a demo](https://calendly.com/pageindex/meet) for more details.
95
+
96
+ ### 🧪 Quick Hands-on
97
+
98
+ - Try the [**Vectorless RAG**](https://github.com/VectifyAI/PageIndex/blob/main/cookbook/pageindex_RAG_simple.ipynb) notebook — a *minimal*, hands-on example of reasoning-based RAG using PageIndex.
99
+ - Experiment with [*Vision-based Vectorless RAG*](https://github.com/VectifyAI/PageIndex/blob/main/cookbook/vision_RAG_pageindex.ipynb) — no OCR; a minimal, reasoning-native RAG pipeline that works directly over page images.
100
+
101
+ <div align="center">
102
+ <a href="https://colab.research.google.com/github/VectifyAI/PageIndex/blob/main/cookbook/pageindex_RAG_simple.ipynb" target="_blank" rel="noopener">
103
+ <img src="https://img.shields.io/badge/Open_In_Colab-Vectorless_RAG-orange?style=for-the-badge&logo=googlecolab" alt="Open in Colab: Vectorless RAG" />
104
+ </a>
105
+ &nbsp;&nbsp;
106
+ <a href="https://colab.research.google.com/github/VectifyAI/PageIndex/blob/main/cookbook/vision_RAG_pageindex.ipynb" target="_blank" rel="noopener">
107
+ <img src="https://img.shields.io/badge/Open_In_Colab-Vision_RAG-orange?style=for-the-badge&logo=googlecolab" alt="Open in Colab: Vision RAG" />
108
+ </a>
109
+ </div>
110
+
111
+ ---
112
+
113
+ # 🌲 PageIndex Tree Structure
114
+ PageIndex can transform lengthy PDF documents into a semantic **tree structure**, similar to a _"table of contents"_ but optimized for use with Large Language Models (LLMs). It's ideal for: financial reports, regulatory filings, academic textbooks, legal or technical manuals, and any document that exceeds LLM context limits.
115
+
116
+ Below is an example PageIndex tree structure. Also see more example [documents](https://github.com/VectifyAI/PageIndex/tree/main/tests/pdfs) and generated [tree structures](https://github.com/VectifyAI/PageIndex/tree/main/tests/results).
117
+
118
+ ```jsonc
119
+ ...
120
+ {
121
+ "title": "Financial Stability",
122
+ "node_id": "0006",
123
+ "start_index": 21,
124
+ "end_index": 22,
125
+ "summary": "The Federal Reserve ...",
126
+ "nodes": [
127
+ {
128
+ "title": "Monitoring Financial Vulnerabilities",
129
+ "node_id": "0007",
130
+ "start_index": 22,
131
+ "end_index": 28,
132
+ "summary": "The Federal Reserve's monitoring ..."
133
+ },
134
+ {
135
+ "title": "Domestic and International Cooperation and Coordination",
136
+ "node_id": "0008",
137
+ "start_index": 28,
138
+ "end_index": 31,
139
+ "summary": "In 2023, the Federal Reserve collaborated ..."
140
+ }
141
+ ]
142
+ }
143
+ ...
144
+ ```
145
+
146
+ You can generate the PageIndex tree structure with this open-source repo, or use our [API](https://docs.pageindex.ai/quickstart).
147
+
148
+ ---
149
+
150
+ # ⚙️ Package Usage
151
+
152
+ You can follow these steps to generate a PageIndex tree from a PDF document.
153
+
154
+ ### 1. Install dependencies
155
+
156
+ ```bash
157
+ pip3 install --upgrade -r requirements.txt
158
+ ```
159
+
160
+ ### 2. Set your OpenAI API key
161
+
162
+ Create a `.env` file in the root directory and add your API key:
163
+
164
+ ```bash
165
+ CHATGPT_API_KEY=your_openai_key_here
166
+ ```
167
+
168
+ ### 3. Run PageIndex on your PDF
169
+
170
+ ```bash
171
+ python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
172
+ ```
173
+
174
+ <details>
175
+ <summary><strong>Optional parameters</strong></summary>
176
+ <br>
177
+ You can customize the processing with additional optional arguments:
178
+
179
+ ```
180
+ --model OpenAI model to use (default: gpt-4o-2024-11-20)
181
+ --toc-check-pages Pages to check for table of contents (default: 20)
182
+ --max-pages-per-node Max pages per node (default: 10)
183
+ --max-tokens-per-node Max tokens per node (default: 20000)
184
+ --if-add-node-id Add node ID (yes/no, default: yes)
185
+ --if-add-node-summary Add node summary (yes/no, default: yes)
186
+ --if-add-doc-description Add doc description (yes/no, default: yes)
187
+ ```
188
+ </details>
189
+
190
+ <details>
191
+ <summary><strong>Markdown support</strong></summary>
192
+ <br>
193
+ We also provide markdown support for PageIndex. You can use the `--md_path` flag to generate a tree structure for a markdown file.
194
+
195
+ ```bash
196
+ python3 run_pageindex.py --md_path /path/to/your/document.md
197
+ ```
198
+
199
+ > Note: in this function, we use "#" to determine node heading and their levels. For example, "##" is level 2, "###" is level 3, etc. Make sure your markdown file is formatted correctly. If your Markdown file was converted from a PDF or HTML, we don't recommend using this function, since most existing conversion tools cannot preserve the original hierarchy. Instead, use our [PageIndex OCR](https://pageindex.ai/blog/ocr), which is designed to preserve the original hierarchy, to convert the PDF to a markdown file and then use this function.
200
+ </details>
201
+
202
+ <!--
203
+ # ☁️ Improved Tree Generation with PageIndex OCR
204
+
205
+ This repo is designed for generating PageIndex tree structure for simple PDFs, but many real-world use cases involve complex PDFs that are hard to parse by classic Python tools. However, extracting high-quality text from PDF documents remains a non-trivial challenge. Most OCR tools only extract page-level content, losing the broader document context and hierarchy.
206
+
207
+ To address this, we introduced PageIndex OCR — the first long-context OCR model designed to preserve the global structure of documents. PageIndex OCR significantly outperforms other leading OCR tools, such as those from Mistral and Contextual AI, in recognizing true hierarchy and semantic relationships across document pages.
208
+
209
+ - Experience next-level OCR quality with PageIndex OCR at our [Dashboard](https://dash.pageindex.ai/).
210
+ - Integrate PageIndex OCR seamlessly into your stack via our [API](https://docs.pageindex.ai/quickstart).
211
+
212
+ <p align="center">
213
+ <img src="https://github.com/user-attachments/assets/eb35d8ae-865c-4e60-a33b-ebbd00c41732" width="80%">
214
+ </p>
215
+ -->
216
+
217
+ ---
218
+
219
+ # 📈 Case Study: PageIndex Leads Finance QA Benchmark
220
+
221
+ [Mafin 2.5](https://vectify.ai/mafin) is a reasoning-based RAG system for financial document analysis, powered by **PageIndex**. It achieved a state-of-the-art [**98.7% accuracy**](https://vectify.ai/blog/Mafin2.5) on the [FinanceBench](https://arxiv.org/abs/2311.11944) benchmark, significantly outperforming traditional vector-based RAG systems.
222
+
223
+ PageIndex's hierarchical indexing and reasoning-driven retrieval enable precise navigation and extraction of relevant context from complex financial reports, such as SEC filings and earnings disclosures.
224
+
225
+ Explore the full [benchmark results](https://github.com/VectifyAI/Mafin2.5-FinanceBench) and our [blog post](https://vectify.ai/blog/Mafin2.5) for detailed comparisons and performance metrics.
226
+
227
+ <div align="center">
228
+ <a href="https://github.com/VectifyAI/Mafin2.5-FinanceBench">
229
+ <img src="https://github.com/user-attachments/assets/571aa074-d803-43c7-80c4-a04254b782a3" width="70%">
230
+ </a>
231
+ </div>
232
+
233
+ ---
234
+
235
+ # 🧭 Resources
236
+
237
+ * 🧪 [Cookbooks](https://docs.pageindex.ai/cookbook/vectorless-rag-pageindex): hands-on, runnable examples and advanced use cases.
238
+ * 📖 [Tutorials](https://docs.pageindex.ai/doc-search): practical guides and strategies, including *Document Search* and *Tree Search*.
239
+ * 📝 [Blog](https://pageindex.ai/blog): technical articles, research insights, and product updates.
240
+ * 🔌 [MCP setup](https://pageindex.ai/mcp#quick-setup) & [API docs](https://docs.pageindex.ai/quickstart): integration details and configuration options.
241
+
242
+ ---
243
+
244
+ # ⭐ Support Us
245
+
246
+ Leave us a star 🌟 if you like our project. Thank you!
247
+
248
+ <p>
249
+ <img src="https://github.com/user-attachments/assets/eae4ff38-48ae-4a7c-b19f-eab81201d794" width="80%">
250
+ </p>
251
+
252
+ ### Connect with Us
253
+
254
+ [![Twitter](https://img.shields.io/badge/Twitter-000000?style=for-the-badge&logo=x&logoColor=white)](https://x.com/PageIndexAI)&nbsp;
255
+ [![LinkedIn](https://img.shields.io/badge/LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/company/vectify-ai/)&nbsp;
256
+ [![Discord](https://img.shields.io/badge/Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https://discord.com/invite/VuXuf29EUj)&nbsp;
257
+ [![Contact Us](https://img.shields.io/badge/Contact_Us-3B82F6?style=for-the-badge&logo=envelope&logoColor=white)](https://ii2abc2jejf.typeform.com/to/tK3AXl8T)
258
+
259
+ ---
260
+
261
+ © 2025 [Vectify AI](https://vectify.ai)
app.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from pageindex.core.tree_index import TreeIndex
4
+ from llm_config import get_llm_client, get_model_name
5
+
6
+ # Initialize clients (checking for environment variables)
7
+ # We do this inside the function or globally, but for robustness inside function is safer if env vars change (less likely in HF Spaces)
8
+ # However, initializing once is better for connection pooling if applicable. Let's do it inside for now to handle errors gracefully.
9
+
10
def process_docling_and_chat(markdown_text, user_query):
    """Build a PageIndex tree from markdown and answer a query over it.

    Args:
        markdown_text: Document content in markdown form (e.g. Docling output).
        user_query: The question to answer from the document.

    Returns:
        The LLM's answer as a string, or a human-readable validation/error
        message (the Gradio UI displays whatever string comes back).
    """
    # Robustness fix: the original `if not markdown_text` let whitespace-only
    # input through to tree building; treat blank-after-strip as missing too.
    if not markdown_text or not markdown_text.strip():
        return "Please provide document markdown text."
    if not user_query or not user_query.strip():
        return "Please provide a query."

    try:
        # 1. Build the PageIndex tree locally in the Space.
        tree = TreeIndex()
        tree.build_from_markdown(markdown_text)

        # 2. Initialize the navigator LLM: try Nvidia first, then fall back
        #    to Mistral if the Nvidia client cannot be created.
        try:
            client = get_llm_client(provider="nvidia")
            model = get_model_name(provider="nvidia")
        except Exception as e:
            print(f"Nvidia client failed: {e}. Falling back to Mistral.")
            client = get_llm_client(provider="mistral")
            model = get_model_name(provider="mistral")

        # 3. Reasoning-based search: navigate the tree to collect context
        #    relevant to the query (no vector DB involved).
        context = tree.reasoning_search(query=user_query, llm_client=client)

        # 4. Final answer extraction, using the same client for consistency.
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant. Use the provided context to answer the user's query."},
                {"role": "user", "content": f"Context:\n{context}\n\nQuery: {user_query}"}
            ]
        )
        return response.choices[0].message.content

    except Exception as e:
        # Surface any failure (tree build, LLM call, network) to the UI as
        # text rather than crashing the Gradio handler.
        return f"An error occurred: {str(e)}"
49
+
50
# Gradio UI setup: two-column layout — markdown input on the left, query,
# trigger button, and result display on the right.
with gr.Blocks(title="Petromind AI - PageIndex RAG") as demo:
    gr.Markdown("# Oil & Gas Report - PageIndex RAG")
    gr.Markdown("Upload document content (markdown format) and ask questions to extract specific information using PageIndex reasoning.")

    with gr.Row():
        with gr.Column(scale=1):
            # Raw document text pasted by the user (Docling markdown output).
            md_input = gr.Textbox(
                label="Paste Docling Markdown Here",
                lines=15,
                placeholder="# Document Title\n\n## Section 1\nContent...",
            )
        with gr.Column(scale=1):
            user_query = gr.Textbox(
                label="What do you want to extract?",
                placeholder="e.g., What is the casing size?",
            )
            analyze_btn = gr.Button("Analyze", variant="primary")
            result_box = gr.Textbox(label="Result", lines=10, interactive=False)

    # Wire the button to the PageIndex RAG handler defined above.
    analyze_btn.click(fn=process_docling_and_chat, inputs=[md_input, user_query], outputs=result_box)

if __name__ == "__main__":
    # Enable the request queue for concurrency before launching the server.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
cookbook/README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### 🧪 Cookbooks:
2
+
3
+ * [**Vectorless RAG notebook**](https://github.com/VectifyAI/PageIndex/blob/main/cookbook/pageindex_RAG_simple.ipynb): A *minimal*, hands-on example of reasoning-based RAG using **PageIndex** — no vectors, no chunking, and human-like retrieval.
4
+ * [Vision-based Vectorless RAG notebook](https://github.com/VectifyAI/PageIndex/blob/main/cookbook/vision_RAG_pageindex.ipynb): no OCR; reasoning-native RAG pipeline that retrieves and reasons directly over page images.
5
+
6
+ <div align="center">
7
+ <a href="https://colab.research.google.com/github/VectifyAI/PageIndex/blob/main/cookbook/pageindex_RAG_simple.ipynb" target="_blank" rel="noopener">
8
+ <img src="https://img.shields.io/badge/Open_In_Colab-Vectorless_RAG-orange?style=for-the-badge&logo=googlecolab" alt="Open in Colab: Vectorless RAG" />
9
+ </a>
10
+ &nbsp;&nbsp;
11
+ <a href="https://colab.research.google.com/github/VectifyAI/PageIndex/blob/main/cookbook/vision_RAG_pageindex.ipynb" target="_blank" rel="noopener">
12
+ <img src="https://img.shields.io/badge/Open_In_Colab-Vision_RAG-orange?style=for-the-badge&logo=googlecolab" alt="Open in Colab: Vision RAG" />
13
+ </a>
14
+ </div>
cookbook/agentic_retrieval.ipynb ADDED
@@ -0,0 +1,899 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "XTboY7brzyp2"
7
+ },
8
+ "source": [
9
+ "![pageindex_banner](https://pageindex.ai/static/images/pageindex_banner.jpg)"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {
15
+ "id": "EtjMbl9Pz3S-"
16
+ },
17
+ "source": [
18
+ "<p align=\"center\">Reasoning-based RAG&nbsp; ◦ &nbsp;No Vector DB&nbsp; ◦ &nbsp;No Chunking&nbsp; ◦ &nbsp;Human-like Retrieval</p>\n",
19
+ "\n",
20
+ "<p align=\"center\">\n",
21
+ " <a href=\"https://vectify.ai\">🏠 Homepage</a>&nbsp; • &nbsp;\n",
22
+ " <a href=\"https://chat.pageindex.ai\">🖥️ Platform</a>&nbsp; • &nbsp;\n",
23
+ " <a href=\"https://docs.pageindex.ai/quickstart\">📚 API Docs</a>&nbsp; • &nbsp;\n",
24
+ " <a href=\"https://github.com/VectifyAI/PageIndex\">📦 GitHub</a>&nbsp; • &nbsp;\n",
25
+ " <a href=\"https://discord.com/invite/VuXuf29EUj\">💬 Discord</a>&nbsp; • &nbsp;\n",
26
+ " <a href=\"https://ii2abc2jejf.typeform.com/to/tK3AXl8T\">✉️ Contact</a>&nbsp;\n",
27
+ "</p>\n",
28
+ "\n",
29
+ "<div align=\"center\">\n",
30
+ "\n",
31
+ "[![Star us on GitHub](https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐️%20Star%20Us)](https://github.com/VectifyAI/PageIndex) &nbsp;&nbsp; [![Follow us on X](https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white)](https://twitter.com/VectifyAI)\n",
32
+ "\n",
33
+ "</div>\n",
34
+ "\n",
35
+ "---\n"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "markdown",
40
+ "metadata": {
41
+ "id": "bbC9uLWCz8zl"
42
+ },
43
+ "source": [
44
+ "# Agentic Retrieval with PageIndex Chat API\n",
45
+ "\n",
46
+ "Similarity-based RAG based on Vector-DB has shown big limitations in recent AI applications; reasoning-based or agentic retrieval has become important in current developments. However, unlike the classic RAG pipeline with embedding input, top-K chunk returns, and re-ranking, what should an agentic-native retrieval API look like?\n",
47
+ "\n",
48
+ "For an agentic-native retrieval system, we need the ability to prompt for retrieval just as naturally as you interact with ChatGPT. Below, we provide an example of how the PageIndex Chat API enables this style of prompt-driven retrieval.\n",
49
+ "\n",
50
+ "\n",
51
+ "## PageIndex Chat API\n",
52
+ "[PageIndex Chat](https://chat.pageindex.ai/) is an AI assistant that allows you to chat with multiple super-long documents without worrying about limited context or the context-rot problem. It is based on [PageIndex](https://pageindex.ai/blog/pageindex-intro), a vectorless, reasoning-based RAG framework that gives more transparent and reliable results, like a human expert.\n",
53
+ "<div align=\"center\">\n",
54
+ " <img src=\"https://docs.pageindex.ai/images/cookbook/vectorless-rag.png\" width=\"70%\">\n",
55
+ "</div>\n",
56
+ "\n",
57
+            "You can now access PageIndex Chat via its API or SDK.\n",
58
+ "\n",
59
+ "## 📝 Notebook Overview\n",
60
+ "\n",
61
+ "This notebook demonstrates a simple, minimal example of agentic retrieval with PageIndex. You will learn:\n",
62
+            "- [x] How to use the PageIndex Chat API.\n",
63
+            "- [x] How to prompt PageIndex Chat to turn it into a retrieval system."
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "markdown",
68
+ "metadata": {
69
+ "id": "77SQbPoe-LTN"
70
+ },
71
+ "source": [
72
+ "### Install PageIndex SDK"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": 36,
78
+ "metadata": {
79
+ "id": "6Eiv_cHf0OXz"
80
+ },
81
+ "outputs": [],
82
+ "source": [
83
+ "%pip install -q --upgrade pageindex"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "markdown",
88
+ "metadata": {
89
+ "id": "UR9-qkdD-Om7"
90
+ },
91
+ "source": [
92
+ "### Setup PageIndex"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 60,
98
+ "metadata": {
99
+ "id": "AFzsW4gq0fjh"
100
+ },
101
+ "outputs": [],
102
+ "source": [
103
+ "from pageindex import PageIndexClient\n",
104
+ "\n",
105
+ "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n",
106
+ "PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n",
107
+ "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "markdown",
112
+ "metadata": {
113
+ "id": "uvzf9oWL-Ts9"
114
+ },
115
+ "source": [
116
+ "### Upload a document"
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 39,
122
+ "metadata": {
123
+ "colab": {
124
+ "base_uri": "https://localhost:8080/"
125
+ },
126
+ "id": "qf7sNRoL0hGw",
127
+ "outputId": "529f53c1-c827-45a7-cf01-41f567d4feaa"
128
+ },
129
+ "outputs": [
130
+ {
131
+ "name": "stdout",
132
+ "output_type": "stream",
133
+ "text": [
134
+ "Downloaded https://arxiv.org/pdf/2507.13334.pdf\n",
135
+ "Document Submitted: pi-cmi34m6jy01sg0bqzofch62n8\n"
136
+ ]
137
+ }
138
+ ],
139
+ "source": [
140
+ "import os, requests\n",
141
+ "\n",
142
+ "pdf_url = \"https://arxiv.org/pdf/2507.13334.pdf\"\n",
143
+ "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n",
144
+ "os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n",
145
+ "\n",
146
+ "response = requests.get(pdf_url)\n",
147
+ "with open(pdf_path, \"wb\") as f:\n",
148
+ " f.write(response.content)\n",
149
+ "print(f\"Downloaded {pdf_url}\")\n",
150
+ "\n",
151
+ "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n",
152
+ "print('Document Submitted:', doc_id)"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "markdown",
157
+ "metadata": {
158
+ "id": "U4hpLB4T-fCt"
159
+ },
160
+ "source": [
161
+ "### Check the processing status"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": 61,
167
+ "metadata": {
168
+ "colab": {
169
+ "base_uri": "https://localhost:8080/"
170
+ },
171
+ "id": "PB1S_CWd2n87",
172
+ "outputId": "472a64ab-747d-469c-9e46-3329456df212"
173
+ },
174
+ "outputs": [
175
+ {
176
+ "name": "stdout",
177
+ "output_type": "stream",
178
+ "text": [
179
+ "{'createdAt': '2025-11-16T08:36:41.177000',\n",
180
+ " 'description': 'This survey provides a comprehensive overview and taxonomy of '\n",
181
+ " 'Context Engineering for Large Language Models, covering '\n",
182
+ " 'foundational components, system implementations, evaluation '\n",
183
+ " 'methods, and future research directions.',\n",
184
+ " 'id': 'pi-cmi1gp1hg01t20do2l3bgzwz1',\n",
185
+ " 'name': '2507.13334_19.pdf',\n",
186
+ " 'pageNum': 166,\n",
187
+ " 'status': 'completed'}\n",
188
+ "\n",
189
+ " Document ready! (166 pages)\n"
190
+ ]
191
+ }
192
+ ],
193
+ "source": [
194
+ "from pprint import pprint\n",
195
+ "\n",
196
+ "doc_info = pi_client.get_document(doc_id)\n",
197
+ "pprint(doc_info)\n",
198
+ "\n",
199
+ "if doc_info['status'] == 'completed':\n",
200
+ " print(f\"\\n Document ready! ({doc_info['pageNum']} pages)\")\n",
201
+ "elif doc_info['status'] == 'processing':\n",
202
+ " print(\"\\n Document is still processing. Please wait and check again.\")"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "markdown",
207
+ "metadata": {
208
+ "id": "z1C9FOvO-p1m"
209
+ },
210
+ "source": [
211
+ "### Ask a question about this document"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 55,
217
+ "metadata": {
218
+ "colab": {
219
+ "base_uri": "https://localhost:8080/"
220
+ },
221
+ "id": "X3RbQvy_0nt7",
222
+ "outputId": "9bfb314b-24ad-4eb2-d26c-01be5728d3cc"
223
+ },
224
+ "outputs": [
225
+ {
226
+ "name": "stdout",
227
+ "output_type": "stream",
228
+ "text": [
229
+ "I'll help you find the evaluation methods used in this paper. Let me start by examining the document structure to locate the relevant sections.{\"doc_name\": \"2507.13334_19.pdf\"}Perfect! I can see there's a dedicated section on \"Evaluation\" (node_id: 0015) that covers pages 45-50. Let me extract the content from those pages to get detailed information about the evaluation methods.{\"doc_name\": \"2507.13334_19.pdf\", \"pages\": \"45-50\"}Based on the comprehensive evaluation section of the paper, here are the **evaluation methods** used:\n",
230
+ "\n",
231
+ "## Evaluation Framework Overview\n",
232
+ "\n",
233
+ "The paper presents a comprehensive evaluation framework organized into **Component-Level Assessment** and **System-Level Integration Assessment**.\n",
234
+ "\n",
235
+ "### 1. **Component-Level Assessment (Intrinsic Evaluation)**\n",
236
+ "\n",
237
+ "#### Prompt Engineering Evaluation:\n",
238
+ "- **Semantic similarity metrics**\n",
239
+ "- **Response quality assessment**\n",
240
+ "- **Robustness testing** across diverse input variations\n",
241
+ "- **Contextual calibration** assessment\n",
242
+ "\n",
243
+ "#### Long Context Processing Evaluation:\n",
244
+ "- **\"Needle in a haystack\"** evaluation paradigm - tests models' ability to retrieve specific information embedded within long contexts\n",
245
+ "- **Multi-document reasoning tasks** - assess synthesis capabilities\n",
246
+ "- **Position interpolation techniques** evaluation\n",
247
+ "- **Information retention, positional bias, and reasoning coherence** metrics\n",
248
+ "\n",
249
+ "#### Self-Contextualization Evaluation:\n",
250
+ "- **Meta-learning assessments**\n",
251
+ "- **Adaptation speed measurements**\n",
252
+ "- **Consistency analysis** across multiple iterations\n",
253
+ "- Self-refinement frameworks: **Self-Refine, Reflexion, N-CRITICS**\n",
254
+ "- Performance improvements measured (~20% improvement with GPT-4)\n",
255
+ "\n",
256
+ "#### Structured/Relational Data Integration:\n",
257
+ "- **Knowledge graph traversal accuracy**\n",
258
+ "- **Table comprehension assessment**\n",
259
+ "- **Database query generation evaluation**\n",
260
+ "\n",
261
+ "### 2. **System-Level Integration Assessment (Extrinsic Evaluation)**\n",
262
+ "\n",
263
+ "#### Retrieval-Augmented Generation (RAG):\n",
264
+ "- **Precision, recall, relevance metrics**\n",
265
+ "- **Factual accuracy assessment**\n",
266
+ "- **Task decomposition accuracy**\n",
267
+ "- **Multi-plan selection effectiveness**\n",
268
+ "- Memory-augmented planning evaluation\n",
269
+ "\n",
270
+ "#### Memory Systems Evaluation:\n",
271
+ "- **LongMemEval benchmark** (500 curated questions covering):\n",
272
+ " - Information extraction\n",
273
+ " - Temporal reasoning\n",
274
+ " - Multi-session reasoning\n",
275
+ " - Knowledge updates\n",
276
+ "- Dedicated benchmarks: **NarrativeQA, QMSum, QuALITY, MEMENTO**\n",
277
+ "- Accuracy degradation tracking (~30% degradation in extended interactions)\n",
278
+ "\n",
279
+ "#### Tool-Integrated Reasoning:\n",
280
+ "- **MCP-RADAR framework** for standardized evaluation\n",
281
+ "- **Berkeley Function Calling Leaderboard (BFCL)** - 2,000 test cases\n",
282
+ "- **T-Eval** - 553 tool-use cases\n",
283
+ "- **API-Bank** - 73 APIs, 314 dialogues\n",
284
+ "- **ToolHop** - 995 queries, 3,912 tools\n",
285
+ "- **StableToolBench** for API instability\n",
286
+ "- **WebArena** and **Mind2Web** for web agents\n",
287
+ "- **VideoWebArena** for multimodal agents\n",
288
+ "- Metrics: tool selection accuracy, parameter extraction precision, execution success rates, error recovery\n",
289
+ "\n",
290
+ "#### Multi-Agent Systems:\n",
291
+ "- **Communication effectiveness metrics**\n",
292
+ "- **Coordination efficiency assessment**\n",
293
+ "- **Protocol adherence evaluation**\n",
294
+ "- **Task decomposition accuracy**\n",
295
+ "- **Emergent collaborative behaviors** assessment\n",
296
+ "- Context handling and transaction support evaluation\n",
297
+ "\n",
298
+ "### 3. **Emerging Evaluation Paradigms**\n",
299
+ "\n",
300
+ "#### Self-Refinement Evaluation:\n",
301
+ "- Iterative improvement assessment across multiple cycles\n",
302
+ "- Multi-dimensional feedback mechanisms\n",
303
+ "- Ensemble-based evaluation approaches\n",
304
+ "\n",
305
+ "#### Multi-Aspect Feedback:\n",
306
+ "- Correctness, relevance, clarity, and robustness dimensions\n",
307
+ "- Self-rewarding mechanisms for autonomous evolution\n",
308
+ "\n",
309
+ "#### Criticism-Guided Evaluation:\n",
310
+ "- Specialized critic models providing detailed feedback\n",
311
+ "- Fine-grained assessment of reasoning quality, factual accuracy, logical consistency\n",
312
+ "\n",
313
+ "### 4. **Safety and Robustness Assessment**\n",
314
+ "\n",
315
+ "- **Adversarial attack resistance testing**\n",
316
+ "- **Distribution shift evaluation**\n",
317
+ "- **Input perturbation testing**\n",
318
+ "- **Alignment assessment** (adherence to intended behaviors)\n",
319
+ "- **Graceful degradation strategies**\n",
320
+ "- **Error recovery protocols**\n",
321
+ "- **Long-term behavior consistency** evaluation\n",
322
+ "\n",
323
+ "### Key Benchmarks Mentioned:\n",
324
+ "- GAIA (general assistant tasks - 92% human vs 15% GPT-4 accuracy)\n",
325
+ "- GTA benchmark (GPT-4 <50% task completion vs 92% human)\n",
326
+ "- WebArena Leaderboard (with success rates ranging from 23.5% to 61.7%)\n",
327
+ "\n",
328
+ "### Challenges Identified:\n",
329
+ "- Traditional metrics (BLEU, ROUGE, perplexity) inadequate for complex systems\n",
330
+ "- Need for \"living\" benchmarks that co-evolve with AI capabilities\n",
331
+ "- Longitudinal evaluation frameworks for tracking memory fidelity over time\n",
332
+ "- Compositional generalization assessment\n",
333
+ "- Evaluation of \"unknown unknowns\" in multi-agent systems\n",
334
+ "\n",
335
+ "The paper emphasizes a **paradigm shift from static benchmarks to dynamic, holistic assessments** that evaluate not just task success but reasoning quality, robustness, and long-term autonomy."
336
+ ]
337
+ }
338
+ ],
339
+ "source": [
340
+ "query = \"What are the evaluation methods used in this paper?\"\n",
341
+ "\n",
342
+ "for chunk in pi_client.chat_completions(\n",
343
+ " messages=[{\"role\": \"user\", \"content\": query}],\n",
344
+ " doc_id=doc_id,\n",
345
+ " stream=True\n",
346
+ "):\n",
347
+ " print(chunk, end='', flush=True)"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "markdown",
352
+ "metadata": {
353
+ "id": "RXrdnxv92LY1"
354
+ },
355
+ "source": [
356
+ "## Agentic Retrieval with PageIndex Chat API\n",
357
+ "\n",
358
+ "You can now easily prompt the PageIndex Chat API to be a retrieval assistant."
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": 62,
364
+ "metadata": {
365
+ "colab": {
366
+ "base_uri": "https://localhost:8080/"
367
+ },
368
+ "id": "puyOGkAi0wrH",
369
+ "outputId": "b08c682f-abfb-4381-8bb0-726d210120ad"
370
+ },
371
+ "outputs": [
372
+ {
373
+ "name": "stdout",
374
+ "output_type": "stream",
375
+ "text": [
376
+ "I'll help you find the evaluation methods discussed in this paper. Let me start by examining the document structure to locate relevant sections.{\"doc_name\": \"2507.13334_22.pdf\"}Now I can see the document structure. Section 6 is dedicated to \"Evaluation\" and spans pages 45-50. Let me retrieve the content from those pages to get the detailed evaluation methods.{\"doc_name\": \"2507.13334_22.pdf\", \"pages\": \"45-50\"}Perfect! I have retrieved the comprehensive evaluation section. Now let me compile the evaluation methods in the requested JSON format.\n",
377
+ "\n",
378
+ "```json\n",
379
+ "[\n",
380
+ " {\n",
381
+ " \"page\": 45,\n",
382
+ " \"content\": \"## 6. Evaluation\\n\\nThe evaluation of context-engineered systems presents unprecedented challenges that transcend traditional language model assessment paradigms. These systems exhibit complex, multi-component architectures with dynamic, context-dependent behaviors requiring comprehensive evaluation frameworks that assess component-level diagnostics, task-based performance, and overall system robustness [841, 1141].\\n\\nThe heterogeneous nature of context engineering components-spanning retrieval mechanisms, memory systems, reasoning chains, and multi-agent coordination-demands evaluation methodologies that can capture both individual component effectiveness and emergent system-level behaviors [314, 939].\\n\\n### 6.1. Evaluation Frameworks and Methodologies\\n\\nThis subsection presents comprehensive approaches for evaluating both individual components and integrated systems in context engineering.\\n\\n#### 6.1.1. Component-Level Assessment\\n\\nIntrinsic evaluation focuses on the performance of individual components in isolation, providing foundational insights into system capabilities and failure modes.\\n\\nFor prompt engineering components, evaluation encompasses prompt effectiveness measurement through semantic similarity metrics, response quality assessment, and robustness testing across diverse input variations. Current approaches reveal brittleness and robustness challenges in prompt design, necessitating more sophisticated evaluation frameworks that can assess contextual calibration and adaptive prompt optimization $[1141,669]$.\"\n",
383
+ " },\n",
384
+ " {\n",
385
+ " \"page\": 46,\n",
386
+ " \"content\": \"Long context processing evaluation requires specialized metrics addressing information retention, positional bias, and reasoning coherence across extended sequences. The \\\"needle in a haystack\\\" evaluation paradigm tests models' ability to retrieve specific information embedded within long contexts, while multi-document reasoning tasks assess synthesis capabilities across multiple information sources. Position interpolation techniques and ultra-long sequence processing methods face significant computational challenges that limit practical evaluation scenarios [737, 299].\\n\\nSelf-contextualization mechanisms undergo evaluation through meta-learning assessments, adaptation speed measurements, and consistency analysis across multiple iterations. Self-refinement frameworks including Self-Refine, Reflexion, and N-CRITICS demonstrate substantial performance improvements, with GPT-4 achieving approximately 20\\\\% improvement through iterative self-refinement processes [741, 964, 795]. Multi-dimensional feedback mechanisms and ensemble-based evaluation approaches provide comprehensive assessment of autonomous evolution capabilities [583, 710].\\n\\nStructured and relational data integration evaluation examines accuracy in knowledge graph traversal, table comprehension, and database query generation. However, current evaluation frameworks face significant limitations in assessing structural reasoning capabilities, with high-quality structured training data development presenting ongoing challenges. LSTM-based models demonstrate increased errors when sequential and structural information conflict, highlighting the need for more sophisticated benchmarks testing structural understanding $[769,674,167]$.\\n\\n#### 6.1.2. 
System-Level Integration Assessment\\n\\nExtrinsic evaluation measures end-to-end performance on downstream tasks, providing holistic assessments of system utility through comprehensive benchmarks spanning question answering, reasoning, and real-world applications.\\n\\nSystem-level evaluation must capture emergent behaviors arising from component interactions, including synergistic effects where combined components exceed individual performance and potential interference patterns where component integration degrades overall effectiveness [841, 1141].\\n\\nRetrieval-Augmented Generation evaluation encompasses both retrieval quality and generation effectiveness through comprehensive metrics addressing precision, recall, relevance, and factual accuracy. Agentic RAG systems introduce additional complexity requiring evaluation of task decomposition accuracy, multi-plan selection effectiveness, and memory-augmented planning capabilities. Self-reflection mechanisms demonstrate iterative improvement through feedback loops, with MemoryBank implementations incorporating Ebbinghaus Forgetting Curve principles for enhanced memory evaluation [444, 166, 1372, 1192, 41].\\n\\nMemory systems evaluation encounters substantial difficulties stemming from the absence of standardized assessment frameworks and the inherently stateless characteristics of contemporary LLMs. LongMemEval offers 500 carefully curated questions that evaluate fundamental capabilities encompassing information extraction, temporal reasoning, multi-session reasoning, and knowledge updates. Commercial AI assistants exhibit $30 \\\\%$ accuracy degradation throughout extended interactions, underscoring significant deficiencies in memory persistence and retrieval effectiveness [1340, 1180, 463, 847, 390]. 
Dedicated benchmarks such as NarrativeQA, QMSum, QuALITY, and MEMENTO tackle episodic memory evaluation challenges [556, 572].\\n\\nTool-integrated reasoning systems require comprehensive evaluation covering the entire interaction trajectory, including tool selection accuracy, parameter extraction precision, execution success rates, and error recovery capabilities. The MCP-RADAR framework provides standardized evaluation employing objective metrics for software engineering and mathematical reasoning domains. Real-world evaluation reveals\"\n",
387
+ " },\n",
388
+ " {\n",
389
+ " \"page\": 47,\n",
390
+ " \"content\": \"significant performance gaps, with GPT-4 completing less than 50\\\\% of tasks in the GTA benchmark, compared to human performance of $92 \\\\%$ [314, 1098, 126, 939]. Advanced benchmarks including BFCL (2,000 testing cases), T-Eval (553 tool-use cases), API-Bank (73 APIs, 314 dialogues), and ToolHop ( 995 queries, 3,912 tools) address multi-turn interactions and nested tool calling scenarios [263, 363, 377, 1264, 160, 835].\\n\\nMulti-agent systems evaluation captures communication effectiveness, coordination efficiency, and collective outcome quality through specialized metrics addressing protocol adherence, task decomposition accuracy, and emergent collaborative behaviors. Contemporary orchestration frameworks including LangGraph, AutoGen, and CAMEL demonstrate insufficient transaction support, with validation limitations emerging as systems rely exclusively on LLM self-validation capabilities without independent validation procedures. Context handling failures compound challenges as agents struggle with long-term context maintenance encompassing both episodic and semantic information [128, 394, 901].\\n\\n### 6.2. Benchmark Datasets and Evaluation Paradigms\\n\\nThis subsection reviews specialized benchmarks and evaluation paradigms designed for assessing context engineering system performance.\\n\\n#### 6.2.1. Foundational Component Benchmarks\\n\\nLong context processing evaluation employs specialized benchmark suites designed to test information retention, reasoning, and synthesis across extended sequences. Current benchmarks face significant computational complexity challenges, with $\\\\mathrm{O}\\\\left(\\\\mathrm{n}^{2}\\\\right)$ scaling limitations in attention mechanisms creating substantial memory constraints for ultra-long sequences. 
Position interpolation and extension techniques require sophisticated evaluation frameworks that can assess both computational efficiency and reasoning quality across varying sequence lengths [737, 299, 1236].\\n\\nAdvanced architectures including LongMamba and specialized position encoding methods demonstrate promising directions for long context processing, though evaluation reveals persistent challenges in maintaining coherence across extended sequences. The development of sliding attention mechanisms and memory-efficient implementations requires comprehensive benchmarks that can assess both computational tractability and task performance [1267, 351].\\n\\nStructured and relational data integration benchmarks encompass diverse knowledge representation formats and reasoning patterns. However, current evaluation frameworks face limitations in assessing structural reasoning capabilities, with the development of high-quality structured training data presenting ongoing challenges. Evaluation must address the fundamental tension between sequential and structural information processing, particularly in scenarios where these information types conflict [769, 674, 167].\\n\\n#### 6.2.2. System Implementation Benchmarks\\n\\nRetrieval-Augmented Generation evaluation leverages comprehensive benchmark suites addressing diverse retrieval and generation challenges. Modular RAG architectures demonstrate enhanced flexibility through specialized modules for retrieval, augmentation, and generation, enabling fine-grained evaluation of individual components and their interactions. Graph-enhanced RAG systems incorporating GraphRAG and LightRAG demonstrate improved performance in complex reasoning scenarios, though evaluation frameworks must address the additional complexity of graph traversal and multi-hop reasoning assessment [316, 973, 364].\\n\\nAgentic RAG systems introduce sophisticated planning and reflection mechanisms requiring evaluation\"\n",
391
+ " },\n",
392
+ " {\n",
393
+ " \"page\": 48,\n",
394
+ " \"content\": \"of task decomposition accuracy, multi-plan selection effectiveness, and iterative refinement capabilities. Real-time and streaming RAG applications present unique evaluation challenges in assessing both latency and accuracy under dynamic information conditions [444, 166, 1192].\\n\\nTool-integrated reasoning system evaluation employs comprehensive benchmarks spanning diverse tool usage scenarios and complexity levels. The Berkeley Function Calling Leaderboard (BFCL) provides 2,000 testing cases with step-by-step and end-to-end assessments measuring call accuracy, pass rates, and win rates across increasingly complex scenarios. T-Eval contributes 553 tool-use cases testing multi-turn interactions and nested tool calling capabilities [263, 1390, 835]. Advanced benchmarks including StableToolBench address API instability challenges, while NesTools evaluates nested tool scenarios and ToolHop assesses multi-hop tool usage across 995 queries and 3,912 tools [363, 377, 1264].\\n\\nWeb agent evaluation frameworks including WebArena and Mind2Web provide comprehensive assessment across thousands of tasks spanning 137 websites, revealing significant performance gaps in current LLM capabilities for complex web interactions. VideoWebArena extends evaluation to multimodal agents, while Deep Research Bench and DeepShop address specialized evaluation for research and shopping agents respectively $[1378,206,87,482]$.\\n\\nMulti-agent system evaluation employs specialized frameworks addressing coordination, communication, and collective intelligence. However, current frameworks face significant challenges in transactional integrity across complex workflows, with many systems lacking adequate compensation mechanisms for partial failures. 
Orchestration evaluation must address context management, coordination strategy effectiveness, and the ability to maintain system coherence under varying operational conditions [128, 901].\\n\\n| Release Date | Open Source | Method / Model | Success Rate (\\\\%) | Source |\\n| :-- | :--: | :-- | :--: | :-- |\\n| $2025-02$ | $\\\\times$ | IBM CUGA | 61.7 | $[753]$ |\\n| $2025-01$ | $\\\\times$ | OpenAI Operator | 58.1 | $[813]$ |\\n| $2024-08$ | $\\\\times$ | Jace.AI | 57.1 | $[476]$ |\\n| $2024-12$ | $\\\\times$ | ScribeAgent + GPT-4o | 53.0 | $[950]$ |\\n| $2025-01$ | $\\\\checkmark$ | AgentSymbiotic | 52.1 | $[1323]$ |\\n| $2025-01$ | $\\\\checkmark$ | Learn-by-Interact | 48.0 | $[998]$ |\\n| $2024-10$ | $\\\\checkmark$ | AgentOccam-Judge | 45.7 | $[1231]$ |\\n| $2024-08$ | $\\\\times$ | WebPilot | 37.2 | $[1331]$ |\\n| $2024-10$ | $\\\\checkmark$ | GUI-API Hybrid Agent | 35.8 | $[988]$ |\\n| $2024-09$ | $\\\\checkmark$ | Agent Workflow Memory | 35.5 | $[1144]$ |\\n| $2024-04$ | $\\\\checkmark$ | SteP | 33.5 | $[979]$ |\\n| $2025-06$ | $\\\\checkmark$ | TTI | 26.1 | $[951]$ |\\n| $2024-04$ | $\\\\checkmark$ | BrowserGym + GPT-4 | 23.5 | $[238]$ |\\n\\nTable 8: WebArena [1378] Leaderboard: Top performing models with their success rates and availability status.\\n\\n### 6.3. Evaluation Challenges and Emerging Paradigms\\n\\nThis subsection identifies current limitations in evaluation methodologies and explores emerging approaches for more effective assessment.\"\n",
395
+ " },\n",
396
+ " {\n",
397
+ " \"page\": 49,\n",
398
+ " \"content\": \"#### 6.3.1. Methodological Limitations and Biases\\n\\nTraditional evaluation metrics prove fundamentally inadequate for capturing the nuanced, dynamic behaviors exhibited by context-engineered systems. Static metrics like BLEU, ROUGE, and perplexity, originally designed for simpler text generation tasks, fail to assess complex reasoning chains, multi-step interactions, and emergent system behaviors. The inherent complexity and interdependencies of multi-component systems create attribution challenges where isolating failures and identifying root causes becomes computationally and methodologically intractable. Future metrics must evolve to capture not just task success, but the quality and robustness of the underlying reasoning process, especially in scenarios requiring compositional generalization and creative problem-solving [841, 1141].\\n\\nMemory system evaluation faces particular challenges due to the lack of standardized benchmarks and the stateless nature of current LLMs. Automated memory testing frameworks must address the isolation problem where different memory testing stages cannot be effectively separated, leading to unreliable assessment results. Commercial AI assistants demonstrate significant performance degradation during sustained interactions, with accuracy drops of up to $30 \\\\%$ highlighting critical gaps in current evaluation methodologies and pointing to the need for longitudinal evaluation frameworks that track memory fidelity over time $[1340,1180,463]$.\\n\\nTool-integrated reasoning system evaluation reveals substantial performance gaps between current systems and human-level capabilities. The GAIA benchmark demonstrates that while humans achieve $92 \\\\%$ accuracy on general assistant tasks, advanced models like GPT-4 achieve only $15 \\\\%$ accuracy, indicating fundamental limitations in current evaluation frameworks and system capabilities [778, 1098, 126]. 
Evaluation frameworks must address the complexity of multi-tool coordination, error recovery, and adaptive tool selection across diverse operational contexts [314, 939].\\n\\n#### 6.3.2. Emerging Evaluation Paradigms\\n\\nSelf-refinement evaluation paradigms leverage iterative improvement mechanisms to assess system capabilities across multiple refinement cycles. Frameworks including Self-Refine, Reflexion, and N-CRITICS demonstrate substantial performance improvements through multi-dimensional feedback and ensemblebased evaluation approaches. GPT-4 achieves approximately 20\\\\% improvement through self-refinement processes, highlighting the importance of evaluating systems across multiple iteration cycles rather than single-shot assessments. However, a key future challenge lies in evaluating the meta-learning capability itself—not just whether the system improves, but how efficiently and robustly it learns to refine its strategies over time $[741,964,795,583]$.\\n\\nMulti-aspect feedback evaluation incorporates diverse feedback dimensions including correctness, relevance, clarity, and robustness, providing comprehensive assessment of system outputs. Self-rewarding mechanisms enable autonomous evolution and meta-learning assessment, allowing systems to develop increasingly sophisticated evaluation criteria through iterative refinement [710].\\n\\nCriticism-guided evaluation employs specialized critic models to provide detailed feedback on system outputs, enabling fine-grained assessment of reasoning quality, factual accuracy, and logical consistency. 
These approaches address the limitations of traditional metrics by providing contextual, content-aware evaluation that can adapt to diverse task requirements and output formats [795, 583].\\n\\nOrchestration evaluation frameworks address the unique challenges of multi-agent coordination by incorporating transactional integrity assessment, context management evaluation, and coordination strategy effectiveness measurement. Advanced frameworks including SagaLLM provide transaction support and\"\n",
399
+ " },\n",
400
+ " {\n",
401
+ " \"page\": 50,\n",
402
+ " \"content\": \"independent validation procedures to address the limitations of systems that rely exclusively on LLM selfvalidation capabilities $[128,394]$.\\n\\n#### 6.3.3. Safety and Robustness Assessment\\n\\nSafety-oriented evaluation incorporates comprehensive robustness testing, adversarial attack resistance, and alignment assessment to ensure responsible development of context-engineered systems. Particular attention must be paid to the evaluation of agentic systems that can operate autonomously across extended periods, as these systems present unique safety challenges that traditional evaluation frameworks cannot adequately address $[973,364]$.\\n\\nRobustness evaluation must assess system performance under distribution shifts, input perturbations, and adversarial conditions through comprehensive stress testing protocols. Multi-agent systems face additional challenges in coordination failure scenarios, where partial system failures can cascade through the entire agent network. Evaluation frameworks must address graceful degradation strategies, error recovery protocols, and the ability to maintain system functionality under adverse conditions. Beyond predefined failure modes, future evaluation must grapple with assessing resilience to \\\"unknown unknowns\\\"-emergent and unpredictable failure cascades in highly complex, autonomous multi-agent systems [128, 394].\\n\\nAlignment evaluation measures system adherence to intended behaviors, value consistency, and beneficial outcome optimization through specialized assessment frameworks. Context engineering systems present unique alignment challenges due to their dynamic adaptation capabilities and complex interaction patterns across multiple components. 
Long-term evaluation must assess whether systems maintain beneficial behaviors as they adapt and evolve through extended operational periods [901].\\n\\nLooking ahead, the evaluation of context-engineered systems requires a paradigm shift from static benchmarks to dynamic, holistic assessments. Future frameworks must move beyond measuring task success to evaluating compositional generalization for novel problems and tracking long-term autonomy in interactive environments. The development of 'living' benchmarks that co-evolve with AI capabilities, alongside the integration of socio-technical and economic metrics, will be critical for ensuring these advanced systems are not only powerful but also reliable, efficient, and aligned with human values in real-world applications $[314,1378,1340]$.\\n\\nThe evaluation landscape for context-engineered systems continues evolving rapidly as new architectures, capabilities, and applications emerge. Future evaluation paradigms must address increasing system complexity while providing reliable, comprehensive, and actionable insights for system improvement and deployment decisions. The integration of multiple evaluation approaches-from component-level assessment to systemwide robustness testing-represents a critical research priority for ensuring the reliable deployment of context-engineered systems in real-world applications [841, 1141].\"\n",
403
+ " }\n",
404
+ "]\n",
405
+ "```"
406
+ ]
407
+ }
408
+ ],
409
+ "source": [
410
+ "retrieval_prompt = f\"\"\"\n",
411
+ "Your job is to retrieve the raw relevant content from the document based on the user's query.\n",
412
+ "\n",
413
+ "Query: {query}\n",
414
+ "\n",
415
+ "Return in JSON format:\n",
416
+ "```json\n",
417
+ "[\n",
418
+ " {{\n",
419
+ " \"page\": <number>,\n",
420
+ " \"content\": \"<raw text>\"\n",
421
+ " }},\n",
422
+ " ...\n",
423
+ "]\n",
424
+ "```\n",
425
+ "\"\"\"\n",
426
+ "\n",
427
+ "full_response = \"\"\n",
428
+ "\n",
429
+ "for chunk in pi_client.chat_completions(\n",
430
+ " messages=[{\"role\": \"user\", \"content\": retrieval_prompt}],\n",
431
+ " doc_id=doc_id,\n",
432
+ " stream=True\n",
433
+ "):\n",
434
+ " print(chunk, end='', flush=True)\n",
435
+ " full_response += chunk"
436
+ ]
437
+ },
438
+ {
439
+ "cell_type": "markdown",
440
+ "metadata": {
441
+ "id": "d-Y9towQ_CiF"
442
+ },
443
+ "source": [
444
+ "### Exctarct the JSON retreived results"
445
+ ]
446
+ },
447
+ {
448
+ "cell_type": "code",
449
+ "execution_count": 59,
450
+ "metadata": {
451
+ "colab": {
452
+ "base_uri": "https://localhost:8080/"
453
+ },
454
+ "id": "rwjC65oB05Tt",
455
+ "outputId": "64504ad5-1778-463f-989b-46e18aba2ea6"
456
+ },
457
+ "outputs": [
458
+ {
459
+ "name": "stdout",
460
+ "output_type": "stream",
461
+ "text": [
462
+ "Note: you may need to restart the kernel to use updated packages.\n",
463
+ "[{'content': '## 6. Evaluation\\n'\n",
464
+ " '\\n'\n",
465
+ " 'The evaluation of context-engineered systems presents '\n",
466
+ " 'unprecedented challenges that transcend traditional language '\n",
467
+ " 'model assessment paradigms. These systems exhibit complex, '\n",
468
+ " 'multi-component architectures with dynamic, context-dependent '\n",
469
+ " 'behaviors requiring comprehensive evaluation frameworks that '\n",
470
+ " 'assess component-level diagnostics, task-based performance, and '\n",
471
+ " 'overall system robustness [841, 1141].\\n'\n",
472
+ " '\\n'\n",
473
+ " 'The heterogeneous nature of context engineering '\n",
474
+ " 'components-spanning retrieval mechanisms, memory systems, '\n",
475
+ " 'reasoning chains, and multi-agent coordination-demands '\n",
476
+ " 'evaluation methodologies that can capture both individual '\n",
477
+ " 'component effectiveness and emergent system-level behaviors '\n",
478
+ " '[314, 939].\\n'\n",
479
+ " '\\n'\n",
480
+ " '### 6.1. Evaluation Frameworks and Methodologies\\n'\n",
481
+ " '\\n'\n",
482
+ " 'This subsection presents comprehensive approaches for evaluating '\n",
483
+ " 'both individual components and integrated systems in context '\n",
484
+ " 'engineering.\\n'\n",
485
+ " '\\n'\n",
486
+ " '#### 6.1.1. Component-Level Assessment\\n'\n",
487
+ " '\\n'\n",
488
+ " 'Intrinsic evaluation focuses on the performance of individual '\n",
489
+ " 'components in isolation, providing foundational insights into '\n",
490
+ " 'system capabilities and failure modes.\\n'\n",
491
+ " '\\n'\n",
492
+ " 'For prompt engineering components, evaluation encompasses prompt '\n",
493
+ " 'effectiveness measurement through semantic similarity metrics, '\n",
494
+ " 'response quality assessment, and robustness testing across '\n",
495
+ " 'diverse input variations. Current approaches reveal brittleness '\n",
496
+ " 'and robustness challenges in prompt design, necessitating more '\n",
497
+ " 'sophisticated evaluation frameworks that can assess contextual '\n",
498
+ " 'calibration and adaptive prompt optimization $[1141,669]$.',\n",
499
+ " 'page': 45},\n",
500
+ " {'content': 'Long context processing evaluation requires specialized metrics '\n",
501
+ " 'addressing information retention, positional bias, and reasoning '\n",
502
+ " 'coherence across extended sequences. The \"needle in a haystack\" '\n",
503
+ " \"evaluation paradigm tests models' ability to retrieve specific \"\n",
504
+ " 'information embedded within long contexts, while multi-document '\n",
505
+ " 'reasoning tasks assess synthesis capabilities across multiple '\n",
506
+ " 'information sources. Position interpolation techniques and '\n",
507
+ " 'ultra-long sequence processing methods face significant '\n",
508
+ " 'computational challenges that limit practical evaluation '\n",
509
+ " 'scenarios [737, 299].\\n'\n",
510
+ " '\\n'\n",
511
+ " 'Self-contextualization mechanisms undergo evaluation through '\n",
512
+ " 'meta-learning assessments, adaptation speed measurements, and '\n",
513
+ " 'consistency analysis across multiple iterations. Self-refinement '\n",
514
+ " 'frameworks including Self-Refine, Reflexion, and N-CRITICS '\n",
515
+ " 'demonstrate substantial performance improvements, with GPT-4 '\n",
516
+ " 'achieving approximately 20\\\\% improvement through iterative '\n",
517
+ " 'self-refinement processes [741, 964, 795]. Multi-dimensional '\n",
518
+ " 'feedback mechanisms and ensemble-based evaluation approaches '\n",
519
+ " 'provide comprehensive assessment of autonomous evolution '\n",
520
+ " 'capabilities [583, 710].\\n'\n",
521
+ " '\\n'\n",
522
+ " 'Structured and relational data integration evaluation examines '\n",
523
+ " 'accuracy in knowledge graph traversal, table comprehension, and '\n",
524
+ " 'database query generation. However, current evaluation '\n",
525
+ " 'frameworks face significant limitations in assessing structural '\n",
526
+ " 'reasoning capabilities, with high-quality structured training '\n",
527
+ " 'data development presenting ongoing challenges. LSTM-based '\n",
528
+ " 'models demonstrate increased errors when sequential and '\n",
529
+ " 'structural information conflict, highlighting the need for more '\n",
530
+ " 'sophisticated benchmarks testing structural understanding '\n",
531
+ " '$[769,674,167]$.\\n'\n",
532
+ " '\\n'\n",
533
+ " '#### 6.1.2. System-Level Integration Assessment\\n'\n",
534
+ " '\\n'\n",
535
+ " 'Extrinsic evaluation measures end-to-end performance on '\n",
536
+ " 'downstream tasks, providing holistic assessments of system '\n",
537
+ " 'utility through comprehensive benchmarks spanning question '\n",
538
+ " 'answering, reasoning, and real-world applications.\\n'\n",
539
+ " '\\n'\n",
540
+ " 'System-level evaluation must capture emergent behaviors arising '\n",
541
+ " 'from component interactions, including synergistic effects where '\n",
542
+ " 'combined components exceed individual performance and potential '\n",
543
+ " 'interference patterns where component integration degrades '\n",
544
+ " 'overall effectiveness [841, 1141].\\n'\n",
545
+ " '\\n'\n",
546
+ " 'Retrieval-Augmented Generation evaluation encompasses both '\n",
547
+ " 'retrieval quality and generation effectiveness through '\n",
548
+ " 'comprehensive metrics addressing precision, recall, relevance, '\n",
549
+ " 'and factual accuracy. Agentic RAG systems introduce additional '\n",
550
+ " 'complexity requiring evaluation of task decomposition accuracy, '\n",
551
+ " 'multi-plan selection effectiveness, and memory-augmented '\n",
552
+ " 'planning capabilities. Self-reflection mechanisms demonstrate '\n",
553
+ " 'iterative improvement through feedback loops, with MemoryBank '\n",
554
+ " 'implementations incorporating Ebbinghaus Forgetting Curve '\n",
555
+ " 'principles for enhanced memory evaluation [444, 166, 1372, 1192, '\n",
556
+ " '41].\\n'\n",
557
+ " '\\n'\n",
558
+ " 'Memory systems evaluation encounters substantial difficulties '\n",
559
+ " 'stemming from the absence of standardized assessment frameworks '\n",
560
+ " 'and the inherently stateless characteristics of contemporary '\n",
561
+ " 'LLMs. LongMemEval offers 500 carefully curated questions that '\n",
562
+ " 'evaluate fundamental capabilities encompassing information '\n",
563
+ " 'extraction, temporal reasoning, multi-session reasoning, and '\n",
564
+ " 'knowledge updates. Commercial AI assistants exhibit $30 \\\\%$ '\n",
565
+ " 'accuracy degradation throughout extended interactions, '\n",
566
+ " 'underscoring significant deficiencies in memory persistence and '\n",
567
+ " 'retrieval effectiveness [1340, 1180, 463, 847, 390]. Dedicated '\n",
568
+ " 'benchmarks such as NarrativeQA, QMSum, QuALITY, and MEMENTO '\n",
569
+ " 'tackle episodic memory evaluation challenges [556, 572].\\n'\n",
570
+ " '\\n'\n",
571
+ " 'Tool-integrated reasoning systems require comprehensive '\n",
572
+ " 'evaluation covering the entire interaction trajectory, including '\n",
573
+ " 'tool selection accuracy, parameter extraction precision, '\n",
574
+ " 'execution success rates, and error recovery capabilities. The '\n",
575
+ " 'MCP-RADAR framework provides standardized evaluation employing '\n",
576
+ " 'objective metrics for software engineering and mathematical '\n",
577
+ " 'reasoning domains. Real-world evaluation reveals',\n",
578
+ " 'page': 46},\n",
579
+ " {'content': 'significant performance gaps, with GPT-4 completing less than '\n",
580
+ " '50\\\\% of tasks in the GTA benchmark, compared to human '\n",
581
+ " 'performance of $92 \\\\%$ [314, 1098, 126, 939]. Advanced '\n",
582
+ " 'benchmarks including BFCL (2,000 testing cases), T-Eval (553 '\n",
583
+ " 'tool-use cases), API-Bank (73 APIs, 314 dialogues), and ToolHop '\n",
584
+ " '( 995 queries, 3,912 tools) address multi-turn interactions and '\n",
585
+ " 'nested tool calling scenarios [263, 363, 377, 1264, 160, 835].\\n'\n",
586
+ " '\\n'\n",
587
+ " 'Multi-agent systems evaluation captures communication '\n",
588
+ " 'effectiveness, coordination efficiency, and collective outcome '\n",
589
+ " 'quality through specialized metrics addressing protocol '\n",
590
+ " 'adherence, task decomposition accuracy, and emergent '\n",
591
+ " 'collaborative behaviors. Contemporary orchestration frameworks '\n",
592
+ " 'including LangGraph, AutoGen, and CAMEL demonstrate insufficient '\n",
593
+ " 'transaction support, with validation limitations emerging as '\n",
594
+ " 'systems rely exclusively on LLM self-validation capabilities '\n",
595
+ " 'without independent validation procedures. Context handling '\n",
596
+ " 'failures compound challenges as agents struggle with long-term '\n",
597
+ " 'context maintenance encompassing both episodic and semantic '\n",
598
+ " 'information [128, 394, 901].\\n'\n",
599
+ " '\\n'\n",
600
+ " '### 6.2. Benchmark Datasets and Evaluation Paradigms\\n'\n",
601
+ " '\\n'\n",
602
+ " 'This subsection reviews specialized benchmarks and evaluation '\n",
603
+ " 'paradigms designed for assessing context engineering system '\n",
604
+ " 'performance.\\n'\n",
605
+ " '\\n'\n",
606
+ " '#### 6.2.1. Foundational Component Benchmarks\\n'\n",
607
+ " '\\n'\n",
608
+ " 'Long context processing evaluation employs specialized benchmark '\n",
609
+ " 'suites designed to test information retention, reasoning, and '\n",
610
+ " 'synthesis across extended sequences. Current benchmarks face '\n",
611
+ " 'significant computational complexity challenges, with '\n",
612
+ " '$\\\\mathrm{O}\\\\left(\\\\mathrm{n}^{2}\\\\right)$ scaling limitations '\n",
613
+ " 'in attention mechanisms creating substantial memory constraints '\n",
614
+ " 'for ultra-long sequences. Position interpolation and extension '\n",
615
+ " 'techniques require sophisticated evaluation frameworks that can '\n",
616
+ " 'assess both computational efficiency and reasoning quality '\n",
617
+ " 'across varying sequence lengths [737, 299, 1236].\\n'\n",
618
+ " '\\n'\n",
619
+ " 'Advanced architectures including LongMamba and specialized '\n",
620
+ " 'position encoding methods demonstrate promising directions for '\n",
621
+ " 'long context processing, though evaluation reveals persistent '\n",
622
+ " 'challenges in maintaining coherence across extended sequences. '\n",
623
+ " 'The development of sliding attention mechanisms and '\n",
624
+ " 'memory-efficient implementations requires comprehensive '\n",
625
+ " 'benchmarks that can assess both computational tractability and '\n",
626
+ " 'task performance [1267, 351].\\n'\n",
627
+ " '\\n'\n",
628
+ " 'Structured and relational data integration benchmarks encompass '\n",
629
+ " 'diverse knowledge representation formats and reasoning patterns. '\n",
630
+ " 'However, current evaluation frameworks face limitations in '\n",
631
+ " 'assessing structural reasoning capabilities, with the '\n",
632
+ " 'development of high-quality structured training data presenting '\n",
633
+ " 'ongoing challenges. Evaluation must address the fundamental '\n",
634
+ " 'tension between sequential and structural information '\n",
635
+ " 'processing, particularly in scenarios where these information '\n",
636
+ " 'types conflict [769, 674, 167].\\n'\n",
637
+ " '\\n'\n",
638
+ " '#### 6.2.2. System Implementation Benchmarks\\n'\n",
639
+ " '\\n'\n",
640
+ " 'Retrieval-Augmented Generation evaluation leverages '\n",
641
+ " 'comprehensive benchmark suites addressing diverse retrieval and '\n",
642
+ " 'generation challenges. Modular RAG architectures demonstrate '\n",
643
+ " 'enhanced flexibility through specialized modules for retrieval, '\n",
644
+ " 'augmentation, and generation, enabling fine-grained evaluation '\n",
645
+ " 'of individual components and their interactions. Graph-enhanced '\n",
646
+ " 'RAG systems incorporating GraphRAG and LightRAG demonstrate '\n",
647
+ " 'improved performance in complex reasoning scenarios, though '\n",
648
+ " 'evaluation frameworks must address the additional complexity of '\n",
649
+ " 'graph traversal and multi-hop reasoning assessment [316, 973, '\n",
650
+ " '364].\\n'\n",
651
+ " '\\n'\n",
652
+ " 'Agentic RAG systems introduce sophisticated planning and '\n",
653
+ " 'reflection mechanisms requiring evaluation',\n",
654
+ " 'page': 47},\n",
655
+ " {'content': 'of task decomposition accuracy, multi-plan selection '\n",
656
+ " 'effectiveness, and iterative refinement capabilities. Real-time '\n",
657
+ " 'and streaming RAG applications present unique evaluation '\n",
658
+ " 'challenges in assessing both latency and accuracy under dynamic '\n",
659
+ " 'information conditions [444, 166, 1192].\\n'\n",
660
+ " '\\n'\n",
661
+ " 'Tool-integrated reasoning system evaluation employs '\n",
662
+ " 'comprehensive benchmarks spanning diverse tool usage scenarios '\n",
663
+ " 'and complexity levels. The Berkeley Function Calling Leaderboard '\n",
664
+ " '(BFCL) provides 2,000 testing cases with step-by-step and '\n",
665
+ " 'end-to-end assessments measuring call accuracy, pass rates, and '\n",
666
+ " 'win rates across increasingly complex scenarios. T-Eval '\n",
667
+ " 'contributes 553 tool-use cases testing multi-turn interactions '\n",
668
+ " 'and nested tool calling capabilities [263, 1390, 835]. Advanced '\n",
669
+ " 'benchmarks including StableToolBench address API instability '\n",
670
+ " 'challenges, while NesTools evaluates nested tool scenarios and '\n",
671
+ " 'ToolHop assesses multi-hop tool usage across 995 queries and '\n",
672
+ " '3,912 tools [363, 377, 1264].\\n'\n",
673
+ " '\\n'\n",
674
+ " 'Web agent evaluation frameworks including WebArena and Mind2Web '\n",
675
+ " 'provide comprehensive assessment across thousands of tasks '\n",
676
+ " 'spanning 137 websites, revealing significant performance gaps in '\n",
677
+ " 'current LLM capabilities for complex web interactions. '\n",
678
+ " 'VideoWebArena extends evaluation to multimodal agents, while '\n",
679
+ " 'Deep Research Bench and DeepShop address specialized evaluation '\n",
680
+ " 'for research and shopping agents respectively '\n",
681
+ " '$[1378,206,87,482]$.\\n'\n",
682
+ " '\\n'\n",
683
+ " 'Multi-agent system evaluation employs specialized frameworks '\n",
684
+ " 'addressing coordination, communication, and collective '\n",
685
+ " 'intelligence. However, current frameworks face significant '\n",
686
+ " 'challenges in transactional integrity across complex workflows, '\n",
687
+ " 'with many systems lacking adequate compensation mechanisms for '\n",
688
+ " 'partial failures. Orchestration evaluation must address context '\n",
689
+ " 'management, coordination strategy effectiveness, and the ability '\n",
690
+ " 'to maintain system coherence under varying operational '\n",
691
+ " 'conditions [128, 901].\\n'\n",
692
+ " '\\n'\n",
693
+ " '| Release Date | Open Source | Method / Model | Success Rate '\n",
694
+ " '(\\\\%) | Source |\\n'\n",
695
+ " '| :-- | :--: | :-- | :--: | :-- |\\n'\n",
696
+ " '| $2025-02$ | $\\\\times$ | IBM CUGA | 61.7 | $[753]$ |\\n'\n",
697
+ " '| $2025-01$ | $\\\\times$ | OpenAI Operator | 58.1 | $[813]$ |\\n'\n",
698
+ " '| $2024-08$ | $\\\\times$ | Jace.AI | 57.1 | $[476]$ |\\n'\n",
699
+ " '| $2024-12$ | $\\\\times$ | ScribeAgent + GPT-4o | 53.0 | $[950]$ '\n",
700
+ " '|\\n'\n",
701
+ " '| $2025-01$ | $\\\\checkmark$ | AgentSymbiotic | 52.1 | $[1323]$ '\n",
702
+ " '|\\n'\n",
703
+ " '| $2025-01$ | $\\\\checkmark$ | Learn-by-Interact | 48.0 | $[998]$ '\n",
704
+ " '|\\n'\n",
705
+ " '| $2024-10$ | $\\\\checkmark$ | AgentOccam-Judge | 45.7 | $[1231]$ '\n",
706
+ " '|\\n'\n",
707
+ " '| $2024-08$ | $\\\\times$ | WebPilot | 37.2 | $[1331]$ |\\n'\n",
708
+ " '| $2024-10$ | $\\\\checkmark$ | GUI-API Hybrid Agent | 35.8 | '\n",
709
+ " '$[988]$ |\\n'\n",
710
+ " '| $2024-09$ | $\\\\checkmark$ | Agent Workflow Memory | 35.5 | '\n",
711
+ " '$[1144]$ |\\n'\n",
712
+ " '| $2024-04$ | $\\\\checkmark$ | SteP | 33.5 | $[979]$ |\\n'\n",
713
+ " '| $2025-06$ | $\\\\checkmark$ | TTI | 26.1 | $[951]$ |\\n'\n",
714
+ " '| $2024-04$ | $\\\\checkmark$ | BrowserGym + GPT-4 | 23.5 | '\n",
715
+ " '$[238]$ |\\n'\n",
716
+ " '\\n'\n",
717
+ " 'Table 8: WebArena [1378] Leaderboard: Top performing models with '\n",
718
+ " 'their success rates and availability status.\\n'\n",
719
+ " '\\n'\n",
720
+ " '### 6.3. Evaluation Challenges and Emerging Paradigms\\n'\n",
721
+ " '\\n'\n",
722
+ " 'This subsection identifies current limitations in evaluation '\n",
723
+ " 'methodologies and explores emerging approaches for more '\n",
724
+ " 'effective assessment.',\n",
725
+ " 'page': 48},\n",
726
+ " {'content': '#### 6.3.1. Methodological Limitations and Biases\\n'\n",
727
+ " '\\n'\n",
728
+ " 'Traditional evaluation metrics prove fundamentally inadequate '\n",
729
+ " 'for capturing the nuanced, dynamic behaviors exhibited by '\n",
730
+ " 'context-engineered systems. Static metrics like BLEU, ROUGE, and '\n",
731
+ " 'perplexity, originally designed for simpler text generation '\n",
732
+ " 'tasks, fail to assess complex reasoning chains, multi-step '\n",
733
+ " 'interactions, and emergent system behaviors. The inherent '\n",
734
+ " 'complexity and interdependencies of multi-component systems '\n",
735
+ " 'create attribution challenges where isolating failures and '\n",
736
+ " 'identifying root causes becomes computationally and '\n",
737
+ " 'methodologically intractable. Future metrics must evolve to '\n",
738
+ " 'capture not just task success, but the quality and robustness of '\n",
739
+ " 'the underlying reasoning process, especially in scenarios '\n",
740
+ " 'requiring compositional generalization and creative '\n",
741
+ " 'problem-solving [841, 1141].\\n'\n",
742
+ " '\\n'\n",
743
+ " 'Memory system evaluation faces particular challenges due to the '\n",
744
+ " 'lack of standardized benchmarks and the stateless nature of '\n",
745
+ " 'current LLMs. Automated memory testing frameworks must address '\n",
746
+ " 'the isolation problem where different memory testing stages '\n",
747
+ " 'cannot be effectively separated, leading to unreliable '\n",
748
+ " 'assessment results. Commercial AI assistants demonstrate '\n",
749
+ " 'significant performance degradation during sustained '\n",
750
+ " 'interactions, with accuracy drops of up to $30 \\\\%$ highlighting '\n",
751
+ " 'critical gaps in current evaluation methodologies and pointing '\n",
752
+ " 'to the need for longitudinal evaluation frameworks that track '\n",
753
+ " 'memory fidelity over time $[1340,1180,463]$.\\n'\n",
754
+ " '\\n'\n",
755
+ " 'Tool-integrated reasoning system evaluation reveals substantial '\n",
756
+ " 'performance gaps between current systems and human-level '\n",
757
+ " 'capabilities. The GAIA benchmark demonstrates that while humans '\n",
758
+ " 'achieve $92 \\\\%$ accuracy on general assistant tasks, advanced '\n",
759
+ " 'models like GPT-4 achieve only $15 \\\\%$ accuracy, indicating '\n",
760
+ " 'fundamental limitations in current evaluation frameworks and '\n",
761
+ " 'system capabilities [778, 1098, 126]. Evaluation frameworks must '\n",
762
+ " 'address the complexity of multi-tool coordination, error '\n",
763
+ " 'recovery, and adaptive tool selection across diverse operational '\n",
764
+ " 'contexts [314, 939].\\n'\n",
765
+ " '\\n'\n",
766
+ " '#### 6.3.2. Emerging Evaluation Paradigms\\n'\n",
767
+ " '\\n'\n",
768
+ " 'Self-refinement evaluation paradigms leverage iterative '\n",
769
+ " 'improvement mechanisms to assess system capabilities across '\n",
770
+ " 'multiple refinement cycles. Frameworks including Self-Refine, '\n",
771
+ " 'Reflexion, and N-CRITICS demonstrate substantial performance '\n",
772
+ " 'improvements through multi-dimensional feedback and '\n",
773
+ " 'ensemblebased evaluation approaches. GPT-4 achieves '\n",
774
+ " 'approximately 20\\\\% improvement through self-refinement '\n",
775
+ " 'processes, highlighting the importance of evaluating systems '\n",
776
+ " 'across multiple iteration cycles rather than single-shot '\n",
777
+ " 'assessments. However, a key future challenge lies in evaluating '\n",
778
+ " 'the meta-learning capability itself—not just whether the system '\n",
779
+ " 'improves, but how efficiently and robustly it learns to refine '\n",
780
+ " 'its strategies over time $[741,964,795,583]$.\\n'\n",
781
+ " '\\n'\n",
782
+ " 'Multi-aspect feedback evaluation incorporates diverse feedback '\n",
783
+ " 'dimensions including correctness, relevance, clarity, and '\n",
784
+ " 'robustness, providing comprehensive assessment of system '\n",
785
+ " 'outputs. Self-rewarding mechanisms enable autonomous evolution '\n",
786
+ " 'and meta-learning assessment, allowing systems to develop '\n",
787
+ " 'increasingly sophisticated evaluation criteria through iterative '\n",
788
+ " 'refinement [710].\\n'\n",
789
+ " '\\n'\n",
790
+ " 'Criticism-guided evaluation employs specialized critic models to '\n",
791
+ " 'provide detailed feedback on system outputs, enabling '\n",
792
+ " 'fine-grained assessment of reasoning quality, factual accuracy, '\n",
793
+ " 'and logical consistency. These approaches address the '\n",
794
+ " 'limitations of traditional metrics by providing contextual, '\n",
795
+ " 'content-aware evaluation that can adapt to diverse task '\n",
796
+ " 'requirements and output formats [795, 583].\\n'\n",
797
+ " '\\n'\n",
798
+ " 'Orchestration evaluation frameworks address the unique '\n",
799
+ " 'challenges of multi-agent coordination by incorporating '\n",
800
+ " 'transactional integrity assessment, context management '\n",
801
+ " 'evaluation, and coordination strategy effectiveness measurement. '\n",
802
+ " 'Advanced frameworks including SagaLLM provide transaction '\n",
803
+ " 'support and',\n",
804
+ " 'page': 49},\n",
805
+ " {'content': 'independent validation procedures to address the limitations of '\n",
806
+ " 'systems that rely exclusively on LLM selfvalidation capabilities '\n",
807
+ " '$[128,394]$.\\n'\n",
808
+ " '\\n'\n",
809
+ " '#### 6.3.3. Safety and Robustness Assessment\\n'\n",
810
+ " '\\n'\n",
811
+ " 'Safety-oriented evaluation incorporates comprehensive robustness '\n",
812
+ " 'testing, adversarial attack resistance, and alignment assessment '\n",
813
+ " 'to ensure responsible development of context-engineered systems. '\n",
814
+ " 'Particular attention must be paid to the evaluation of agentic '\n",
815
+ " 'systems that can operate autonomously across extended periods, '\n",
816
+ " 'as these systems present unique safety challenges that '\n",
817
+ " 'traditional evaluation frameworks cannot adequately address '\n",
818
+ " '$[973,364]$.\\n'\n",
819
+ " '\\n'\n",
820
+ " 'Robustness evaluation must assess system performance under '\n",
821
+ " 'distribution shifts, input perturbations, and adversarial '\n",
822
+ " 'conditions through comprehensive stress testing protocols. '\n",
823
+ " 'Multi-agent systems face additional challenges in coordination '\n",
824
+ " 'failure scenarios, where partial system failures can cascade '\n",
825
+ " 'through the entire agent network. Evaluation frameworks must '\n",
826
+ " 'address graceful degradation strategies, error recovery '\n",
827
+ " 'protocols, and the ability to maintain system functionality '\n",
828
+ " 'under adverse conditions. Beyond predefined failure modes, '\n",
829
+ " 'future evaluation must grapple with assessing resilience to '\n",
830
+ " '\"unknown unknowns\"-emergent and unpredictable failure cascades '\n",
831
+ " 'in highly complex, autonomous multi-agent systems [128, 394].\\n'\n",
832
+ " '\\n'\n",
833
+ " 'Alignment evaluation measures system adherence to intended '\n",
834
+ " 'behaviors, value consistency, and beneficial outcome '\n",
835
+ " 'optimization through specialized assessment frameworks. Context '\n",
836
+ " 'engineering systems present unique alignment challenges due to '\n",
837
+ " 'their dynamic adaptation capabilities and complex interaction '\n",
838
+ " 'patterns across multiple components. Long-term evaluation must '\n",
839
+ " 'assess whether systems maintain beneficial behaviors as they '\n",
840
+ " 'adapt and evolve through extended operational periods [901].\\n'\n",
841
+ " '\\n'\n",
842
+ " 'Looking ahead, the evaluation of context-engineered systems '\n",
843
+ " 'requires a paradigm shift from static benchmarks to dynamic, '\n",
844
+ " 'holistic assessments. Future frameworks must move beyond '\n",
845
+ " 'measuring task success to evaluating compositional '\n",
846
+ " 'generalization for novel problems and tracking long-term '\n",
847
+ " 'autonomy in interactive environments. The development of '\n",
848
+ " \"'living' benchmarks that co-evolve with AI capabilities, \"\n",
849
+ " 'alongside the integration of socio-technical and economic '\n",
850
+ " 'metrics, will be critical for ensuring these advanced systems '\n",
851
+ " 'are not only powerful but also reliable, efficient, and aligned '\n",
852
+ " 'with human values in real-world applications $[314,1378,1340]$.\\n'\n",
853
+ " '\\n'\n",
854
+ " 'The evaluation landscape for context-engineered systems '\n",
855
+ " 'continues evolving rapidly as new architectures, capabilities, '\n",
856
+ " 'and applications emerge. Future evaluation paradigms must '\n",
857
+ " 'address increasing system complexity while providing reliable, '\n",
858
+ " 'comprehensive, and actionable insights for system improvement '\n",
859
+ " 'and deployment decisions. The integration of multiple evaluation '\n",
860
+ " 'approaches-from component-level assessment to systemwide '\n",
861
+ " 'robustness testing-represents a critical research priority for '\n",
862
+ " 'ensuring the reliable deployment of context-engineered systems '\n",
863
+ " 'in real-world applications [841, 1141].',\n",
864
+ " 'page': 50}]\n"
865
+ ]
866
+ }
867
+ ],
868
+ "source": [
869
+ "%pip install -q jsonextractor\n",
870
+ "\n",
871
+ "def extract_json(content):\n",
872
+ " from json_extractor import JsonExtractor\n",
873
+ " start_idx = content.find(\"```json\")\n",
874
+ " if start_idx != -1:\n",
875
+ " start_idx += 7 # Adjust index to start after the delimiter\n",
876
+ " end_idx = content.rfind(\"```\")\n",
877
+ " json_content = content[start_idx:end_idx].strip()\n",
878
+ " return JsonExtractor.extract_valid_json(json_content)\n",
879
+ "\n",
880
+ "from pprint import pprint\n",
881
+ "pprint(extract_json(full_response))"
882
+ ]
883
+ }
884
+ ],
885
+ "metadata": {
886
+ "colab": {
887
+ "provenance": []
888
+ },
889
+ "kernelspec": {
890
+ "display_name": "Python 3",
891
+ "name": "python3"
892
+ },
893
+ "language_info": {
894
+ "name": "python"
895
+ }
896
+ },
897
+ "nbformat": 4,
898
+ "nbformat_minor": 0
899
+ }
cookbook/pageIndex_chat_quickstart.ipynb ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "XTboY7brzyp2"
7
+ },
8
+ "source": [
9
+ "![pageindex_banner](https://pageindex.ai/static/images/pageindex_banner.jpg)"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {
15
+ "id": "EtjMbl9Pz3S-"
16
+ },
17
+ "source": [
18
+ "<p align=\"center\">Reasoning-based RAG&nbsp; ◦ &nbsp;No Vector DB&nbsp; ◦ &nbsp;No Chunking&nbsp; ◦ &nbsp;Human-like Retrieval</p>\n",
19
+ "\n",
20
+ "<p align=\"center\">\n",
21
+ " <a href=\"https://vectify.ai\">🏠 Homepage</a>&nbsp; • &nbsp;\n",
22
+ " <a href=\"https://chat.pageindex.ai\">🖥️ Platform</a>&nbsp; • &nbsp;\n",
23
+ " <a href=\"https://docs.pageindex.ai/quickstart\">📚 API Docs</a>&nbsp; • &nbsp;\n",
24
+ " <a href=\"https://github.com/VectifyAI/PageIndex\">📦 GitHub</a>&nbsp; • &nbsp;\n",
25
+ " <a href=\"https://discord.com/invite/VuXuf29EUj\">💬 Discord</a>&nbsp; • &nbsp;\n",
26
+ " <a href=\"https://ii2abc2jejf.typeform.com/to/tK3AXl8T\">✉️ Contact</a>&nbsp;\n",
27
+ "</p>\n",
28
+ "\n",
29
+ "<div align=\"center\">\n",
30
+ "\n",
31
+ "[![Star us on GitHub](https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐️%20Star%20Us)](https://github.com/VectifyAI/PageIndex) &nbsp;&nbsp; [![Follow us on X](https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white)](https://twitter.com/VectifyAI)\n",
32
+ "\n",
33
+ "</div>\n",
34
+ "\n",
35
+ "---\n"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "markdown",
40
+ "metadata": {
41
+ "id": "bbC9uLWCz8zl"
42
+ },
43
+ "source": [
44
+ "# Document QA with PageIndex Chat API\n",
45
+ "\n",
46
+ "Similarity-based RAG based on Vector-DB has shown big limitations in recent AI applications, reasoning-based or agentic retrieval has become important in current developments.\n",
47
+ "\n",
48
+ "[PageIndex Chat](https://chat.pageindex.ai/) is a AI assistant that allow you chat with multiple super-long documents without worrying about limited context or context rot problem. It is based on [PageIndex](https://pageindex.ai/blog/pageindex-intro), a vectorless reasoning-based RAG framework which gives more transparent and reliable results like a human expert.\n",
49
+ "<div align=\"center\">\n",
50
+ " <img src=\"https://docs.pageindex.ai/images/cookbook/vectorless-rag.png\" width=\"70%\">\n",
51
+ "</div>\n",
52
+ "\n",
53
+ "You can now access PageIndex Chat with API or SDK.\n",
54
+ "\n",
55
+ "## 📝 Notebook Overview\n",
56
+ "\n",
57
+ "This notebook demonstrates a simple, minimal example of doing document analysis with PageIndex Chat API on the recently released [NVIDA 10Q report](https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/13e6981b-95ed-4aac-a602-ebc5865d0590.pdf)."
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "markdown",
62
+ "metadata": {
63
+ "id": "77SQbPoe-LTN"
64
+ },
65
+ "source": [
66
+ "### Install PageIndex SDK"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": 2,
72
+ "metadata": {
73
+ "id": "6Eiv_cHf0OXz"
74
+ },
75
+ "outputs": [],
76
+ "source": [
77
+ "%pip install -q --upgrade pageindex"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "markdown",
82
+ "metadata": {
83
+ "id": "UR9-qkdD-Om7"
84
+ },
85
+ "source": [
86
+ "### Setup PageIndex"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 25,
92
+ "metadata": {
93
+ "id": "AFzsW4gq0fjh"
94
+ },
95
+ "outputs": [],
96
+ "source": [
97
+ "from pageindex import PageIndexClient\n",
98
+ "\n",
99
+ "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n",
100
+ "PAGEINDEX_API_KEY = \"Your API KEY\"\n",
101
+ "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "markdown",
106
+ "metadata": {
107
+ "id": "uvzf9oWL-Ts9"
108
+ },
109
+ "source": [
110
+ "### Upload a document"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 4,
116
+ "metadata": {
117
+ "colab": {
118
+ "base_uri": "https://localhost:8080/"
119
+ },
120
+ "id": "qf7sNRoL0hGw",
121
+ "outputId": "e8c2f3c1-1d1e-4932-f8e9-3272daae6781"
122
+ },
123
+ "outputs": [
124
+ {
125
+ "name": "stdout",
126
+ "output_type": "stream",
127
+ "text": [
128
+ "Downloaded https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/13e6981b-95ed-4aac-a602-ebc5865d0590.pdf\n",
129
+ "Document Submitted: pi-cmi73f7r7022y09nwn40paaom\n"
130
+ ]
131
+ }
132
+ ],
133
+ "source": [
134
+ "import os, requests\n",
135
+ "\n",
136
+ "pdf_url = \"https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/13e6981b-95ed-4aac-a602-ebc5865d0590.pdf\"\n",
137
+ "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n",
138
+ "os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n",
139
+ "\n",
140
+ "response = requests.get(pdf_url)\n",
141
+ "with open(pdf_path, \"wb\") as f:\n",
142
+ " f.write(response.content)\n",
143
+ "print(f\"Downloaded {pdf_url}\")\n",
144
+ "\n",
145
+ "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n",
146
+ "print('Document Submitted:', doc_id)"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "markdown",
151
+ "metadata": {
152
+ "id": "U4hpLB4T-fCt"
153
+ },
154
+ "source": [
155
+ "### Check the processing status"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": 22,
161
+ "metadata": {
162
+ "colab": {
163
+ "base_uri": "https://localhost:8080/"
164
+ },
165
+ "id": "PB1S_CWd2n87",
166
+ "outputId": "c1416161-a1d6-4f9e-873c-7f6e26c8fa5f"
167
+ },
168
+ "outputs": [
169
+ {
170
+ "name": "stdout",
171
+ "output_type": "stream",
172
+ "text": [
173
+ "{'createdAt': '2025-11-20T07:11:44.669000',\n",
174
+ " 'description': \"This document is NVIDIA Corporation's Form 10-Q Quarterly \"\n",
175
+ " 'Report for the period ending October 26, 2025, detailing its '\n",
176
+ " 'financial performance, operational results, market risks, and '\n",
177
+ " 'legal proceedings.',\n",
178
+ " 'id': 'pi-cmi73f7r7022y09nwn40paaom',\n",
179
+ " 'name': '13e6981b-95ed-4aac-a602-ebc5865d0590.pdf',\n",
180
+ " 'pageNum': 48,\n",
181
+ " 'status': 'completed'}\n",
182
+ "\n",
183
+ " Document ready! (48 pages)\n"
184
+ ]
185
+ }
186
+ ],
187
+ "source": [
188
+ "from pprint import pprint\n",
189
+ "\n",
190
+ "doc_info = pi_client.get_document(doc_id)\n",
191
+ "pprint(doc_info)\n",
192
+ "\n",
193
+ "if doc_info['status'] == 'completed':\n",
194
+ " print(f\"\\n Document ready! ({doc_info['pageNum']} pages)\")\n",
195
+ "elif doc_info['status'] == 'processing':\n",
196
+ " print(\"\\n Document is still processing. Please wait and check again.\")"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "markdown",
201
+ "metadata": {
202
+ "id": "z1C9FOvO-p1m"
203
+ },
204
+ "source": [
205
+ "### Ask a question about this document"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": 24,
211
+ "metadata": {
212
+ "colab": {
213
+ "base_uri": "https://localhost:8080/"
214
+ },
215
+ "id": "X3RbQvy_0nt7",
216
+ "outputId": "a8bb5468-3f5f-4225-a5fd-60004041a644"
217
+ },
218
+ "outputs": [
219
+ {
220
+ "name": "stdout",
221
+ "output_type": "stream",
222
+ "text": [
223
+ "I'll help you find the revenue information from NVIDIA's 10-Q report. Let me first check the document structure to locate the financial information.{\"doc_name\": \"13e6981b-95ed-4aac-a602-ebc5865d0590.pdf\"}Now let me get the content from pages that contain the revenue information. Based on the structure, revenue information should be in the financial statements section, which appears early in the document. Let me check pages 3-8 which should contain the Condensed Consolidated Statements of Income.{\"doc_name\": \"13e6981b-95ed-4aac-a602-ebc5865d0590.pdf\", \"pages\": \"3-5\"}Perfect! I found the revenue information. Let me provide you with the details:\n",
224
+ "\n",
225
+ "## Revenue Information from NVIDIA's 10-Q Report\n",
226
+ "\n",
227
+ "**You can find the revenue on Page 3** of the document in the **Condensed Consolidated Statements of Income**.\n",
228
+ "\n",
229
+ "### Revenue Details:\n",
230
+ "\n",
231
+ "**For the Three Months Ended October 26, 2025 (Q3 FY2026):**\n",
232
+ "- **Revenue: $57,006 million** ($57.0 billion)\n",
233
+ "- Compared to Q3 FY2025 (October 27, 2024): $35,082 million\n",
234
+ "\n",
235
+ "**For the Nine Months Ended October 26, 2025 (First 9 months of FY2026):**\n",
236
+ "- **Revenue: $147,811 million** ($147.8 billion)\n",
237
+ "- Compared to same period FY2025: $91,166 million\n",
238
+ "\n",
239
+ "### Key Highlights:\n",
240
+ "- Q3 revenue increased by **62.5%** year-over-year ($21.9 billion increase)\n",
241
+ "- Nine-month revenue increased by **62.1%** year-over-year ($56.6 billion increase)\n",
242
+ "- This represents strong growth driven primarily by Data Center compute and networking platforms for AI and accelerated computing, with Blackwell architectures being a major contributor\n",
243
+ "\n",
244
+ "The revenue figures are clearly displayed at the top of the Condensed Consolidated Statements of Income on **Page 3** of the 10-Q report."
245
+ ]
246
+ }
247
+ ],
248
+ "source": [
249
+ "query = \"what is the revenue? Also show me which page I can find it.\"\n",
250
+ "\n",
251
+ "for chunk in pi_client.chat_completions(\n",
252
+ " messages=[{\"role\": \"user\", \"content\": query}],\n",
253
+ " doc_id=doc_id,\n",
254
+ " stream=True\n",
255
+ "):\n",
256
+ " print(chunk, end='', flush=True)"
257
+ ]
258
+ }
259
+ ],
260
+ "metadata": {
261
+ "colab": {
262
+ "provenance": []
263
+ },
264
+ "kernelspec": {
265
+ "display_name": "Python 3",
266
+ "name": "python3"
267
+ },
268
+ "language_info": {
269
+ "name": "python"
270
+ }
271
+ },
272
+ "nbformat": 4,
273
+ "nbformat_minor": 0
274
+ }
cookbook/pageindex_RAG_simple.ipynb ADDED
@@ -0,0 +1,609 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "TCh9BTedHJK1"
7
+ },
8
+ "source": [
9
+ "![pageindex_banner](https://pageindex.ai/static/images/pageindex_banner.jpg)"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {
15
+ "id": "nD0hb4TFHWTt"
16
+ },
17
+ "source": [
18
+ "<p align=\"center\"><i>Reasoning-based RAG&nbsp; ✧ &nbsp;No Vector DB&nbsp; ✧ &nbsp;No Chunking&nbsp; ✧ &nbsp;Human-like Retrieval</i></p>\n",
19
+ "\n",
20
+ "<p align=\"center\">\n",
21
+ " <a href=\"https://vectify.ai\">🏠 Homepage</a>&nbsp; • &nbsp;\n",
22
+ " <a href=\"https://dash.pageindex.ai\">🖥️ Dashboard</a>&nbsp; • &nbsp;\n",
23
+ " <a href=\"https://docs.pageindex.ai/quickstart\">📚 API Docs</a>&nbsp; • &nbsp;\n",
24
+ " <a href=\"https://github.com/VectifyAI/PageIndex\">📦 GitHub</a>&nbsp; • &nbsp;\n",
25
+ " <a href=\"https://discord.com/invite/VuXuf29EUj\">💬 Discord</a>&nbsp; • &nbsp;\n",
26
+ " <a href=\"https://ii2abc2jejf.typeform.com/to/tK3AXl8T\">✉️ Contact</a>&nbsp;\n",
27
+ "</p>\n",
28
+ "\n",
29
+ "<div align=\"center\">\n",
30
+ "\n",
31
+ "[![Star us on GitHub](https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐️%20Star%20Us)](https://github.com/VectifyAI/PageIndex) &nbsp;&nbsp; [![Follow us on X](https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white)](https://twitter.com/VectifyAI)\n",
32
+ "\n",
33
+ "</div>\n",
34
+ "\n",
35
+ "---"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "markdown",
40
+ "metadata": {
41
+ "id": "Ebvn5qfpcG1K"
42
+ },
43
+ "source": [
44
+ "# Simple Vectorless RAG with PageIndex"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "markdown",
49
+ "metadata": {},
50
+ "source": [
51
+ "## PageIndex Introduction\n",
52
+ "PageIndex is a new **reasoning-based**, **vectorless RAG** framework that performs retrieval in two steps: \n",
53
+ "1. Generate a tree structure index of documents \n",
54
+ "2. Perform reasoning-based retrieval through tree search \n",
55
+ "\n",
56
+ "<div align=\"center\">\n",
57
+ " <img src=\"https://docs.pageindex.ai/images/cookbook/vectorless-rag.png\" width=\"70%\">\n",
58
+ "</div>\n",
59
+ "\n",
60
+ "Compared to traditional vector-based RAG, PageIndex features:\n",
61
+ "- **No Vectors Needed**: Uses document structure and LLM reasoning for retrieval.\n",
62
+ "- **No Chunking Needed**: Documents are organized into natural sections rather than artificial chunks.\n",
63
+ "- **Human-like Retrieval**: Simulates how human experts navigate and extract knowledge from complex documents. \n",
64
+ "- **Transparent Retrieval Process**: Retrieval based on reasoning — say goodbye to approximate semantic search (\"vibe retrieval\")."
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "markdown",
69
+ "metadata": {},
70
+ "source": [
71
+ "## 📝 Notebook Overview\n",
72
+ "\n",
73
+ "This notebook demonstrates a simple, minimal example of **vectorless RAG** with PageIndex. You will learn how to:\n",
74
+ "- [x] Build a PageIndex tree structure of a document\n",
75
+ "- [x] Perform reasoning-based retrieval with tree search\n",
76
+ "- [x] Generate answers based on the retrieved context\n",
77
+ "\n",
78
+ "> ⚡ Note: This is a **minimal example** to illustrate PageIndex's core philosophy and idea, not its full capabilities. More advanced examples are coming soon.\n",
79
+ "\n",
80
+ "---"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "markdown",
85
+ "metadata": {
86
+ "id": "7ziuTbbWcG1L"
87
+ },
88
+ "source": [
89
+ "## Step 0: Preparation\n",
90
+ "\n"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "markdown",
95
+ "metadata": {
96
+ "id": "edTfrizMFK4c"
97
+ },
98
+ "source": [
99
+ "#### 0.1 Install PageIndex"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": null,
105
+ "metadata": {
106
+ "collapsed": true,
107
+ "id": "LaoB58wQFNDh"
108
+ },
109
+ "outputs": [],
110
+ "source": [
111
+ "%pip install -q --upgrade pageindex"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "markdown",
116
+ "metadata": {
117
+ "id": "WVEWzPKGcG1M"
118
+ },
119
+ "source": [
120
+ "#### 0.2 Setup PageIndex"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "metadata": {
127
+ "id": "StvqfcK4cG1M"
128
+ },
129
+ "outputs": [],
130
+ "source": [
131
+ "from pageindex import PageIndexClient\n",
132
+ "import pageindex.utils as utils\n",
133
+ "\n",
134
+ "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n",
135
+ "PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n",
136
+ "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "markdown",
141
+ "metadata": {},
142
+ "source": [
143
+ "#### 0.3 Setup LLM\n",
144
+ "\n",
145
+ "Choose your preferred LLM for reasoning-based retrieval. In this example, we use OpenAI’s GPT-4.1."
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": null,
151
+ "metadata": {},
152
+ "outputs": [],
153
+ "source": [
154
+ "import openai\n",
155
+ "OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n",
156
+ "\n",
157
+ "async def call_llm(prompt, model=\"gpt-4.1\", temperature=0):\n",
158
+ " client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)\n",
159
+ " response = await client.chat.completions.create(\n",
160
+ " model=model,\n",
161
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
162
+ " temperature=temperature\n",
163
+ " )\n",
164
+ " return response.choices[0].message.content.strip()"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "markdown",
169
+ "metadata": {
170
+ "id": "heGtIMOVcG1N"
171
+ },
172
+ "source": [
173
+ "## Step 1: PageIndex Tree Generation"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "markdown",
178
+ "metadata": {
179
+ "id": "Mzd1VWjwMUJL"
180
+ },
181
+ "source": [
182
+ "#### 1.1 Submit a document for generating PageIndex tree"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": null,
188
+ "metadata": {
189
+ "colab": {
190
+ "base_uri": "https://localhost:8080/"
191
+ },
192
+ "id": "f6--eZPLcG1N",
193
+ "outputId": "ca688cfd-6c4b-4a57-dac2-f3c2604c4112"
194
+ },
195
+ "outputs": [
196
+ {
197
+ "name": "stdout",
198
+ "output_type": "stream",
199
+ "text": [
200
+ "Downloaded https://arxiv.org/pdf/2501.12948.pdf\n",
201
+ "Document Submitted: pi-cmeseq08w00vt0bo3u6tr244g\n"
202
+ ]
203
+ }
204
+ ],
205
+ "source": [
206
+ "import os, requests\n",
207
+ "\n",
208
+ "# You can also use our GitHub repo to generate PageIndex tree\n",
209
+ "# https://github.com/VectifyAI/PageIndex\n",
210
+ "\n",
211
+ "pdf_url = \"https://arxiv.org/pdf/2501.12948.pdf\"\n",
212
+ "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n",
213
+ "os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n",
214
+ "\n",
215
+ "response = requests.get(pdf_url)\n",
216
+ "with open(pdf_path, \"wb\") as f:\n",
217
+ " f.write(response.content)\n",
218
+ "print(f\"Downloaded {pdf_url}\")\n",
219
+ "\n",
220
+ "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n",
221
+ "print('Document Submitted:', doc_id)"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "markdown",
226
+ "metadata": {
227
+ "id": "4-Hrh0azcG1N"
228
+ },
229
+ "source": [
230
+ "#### 1.2 Get the generated PageIndex tree structure"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": null,
236
+ "metadata": {
237
+ "colab": {
238
+ "base_uri": "https://localhost:8080/",
239
+ "height": 1000
240
+ },
241
+ "id": "b1Q1g6vrcG1O",
242
+ "outputId": "dc944660-38ad-47ea-d358-be422edbae53"
243
+ },
244
+ "outputs": [
245
+ {
246
+ "name": "stdout",
247
+ "output_type": "stream",
248
+ "text": [
249
+ "Simplified Tree Structure of the Document:\n",
250
+ "[{'title': 'DeepSeek-R1: Incentivizing Reasoning Cap...',\n",
251
+ " 'node_id': '0000',\n",
252
+ " 'prefix_summary': '# DeepSeek-R1: Incentivizing Reasoning C...',\n",
253
+ " 'nodes': [{'title': 'Abstract',\n",
254
+ " 'node_id': '0001',\n",
255
+ " 'summary': 'The partial document introduces two reas...'},\n",
256
+ " {'title': 'Contents',\n",
257
+ " 'node_id': '0002',\n",
258
+ " 'summary': 'This partial document provides a detaile...'},\n",
259
+ " {'title': '1. Introduction',\n",
260
+ " 'node_id': '0003',\n",
261
+ " 'prefix_summary': 'The partial document introduces recent a...',\n",
262
+ " 'nodes': [{'title': '1.1. Contributions',\n",
263
+ " 'node_id': '0004',\n",
264
+ " 'summary': 'This partial document outlines the main ...'},\n",
265
+ " {'title': '1.2. Summary of Evaluation Results',\n",
266
+ " 'node_id': '0005',\n",
267
+ " 'summary': 'The partial document provides a summary ...'}]},\n",
268
+ " {'title': '2. Approach',\n",
269
+ " 'node_id': '0006',\n",
270
+ " 'prefix_summary': '## 2. Approach\\n',\n",
271
+ " 'nodes': [{'title': '2.1. Overview',\n",
272
+ " 'node_id': '0007',\n",
273
+ " 'summary': '### 2.1. Overview\\n\\nPrevious work has hea...'},\n",
274
+ " {'title': '2.2. DeepSeek-R1-Zero: Reinforcement Lea...',\n",
275
+ " 'node_id': '0008',\n",
276
+ " 'prefix_summary': '### 2.2. DeepSeek-R1-Zero: Reinforcement...',\n",
277
+ " 'nodes': [{'title': '2.2.1. Reinforcement Learning Algorithm',\n",
278
+ " 'node_id': '0009',\n",
279
+ " 'summary': 'The partial document describes the Group...'},\n",
280
+ " {'title': '2.2.2. Reward Modeling',\n",
281
+ " 'node_id': '0010',\n",
282
+ " 'summary': 'This partial document discusses the rewa...'},\n",
283
+ " {'title': '2.2.3. Training Template',\n",
284
+ " 'node_id': '0011',\n",
285
+ " 'summary': '#### 2.2.3. Training Template\\n\\nTo train ...'},\n",
286
+ " {'title': '2.2.4. Performance, Self-evolution Proce...',\n",
287
+ " 'node_id': '0012',\n",
288
+ " 'summary': 'This partial document discusses the perf...'}]},\n",
289
+ " {'title': '2.3. DeepSeek-R1: Reinforcement Learning...',\n",
290
+ " 'node_id': '0013',\n",
291
+ " 'summary': 'This partial document describes the trai...'},\n",
292
+ " {'title': '2.4. Distillation: Empower Small Models ...',\n",
293
+ " 'node_id': '0014',\n",
294
+ " 'summary': 'This partial document discusses the proc...'}]},\n",
295
+ " {'title': '3. Experiment',\n",
296
+ " 'node_id': '0015',\n",
297
+ " 'prefix_summary': 'The partial document describes the exper...',\n",
298
+ " 'nodes': [{'title': '3.1. DeepSeek-R1 Evaluation',\n",
299
+ " 'node_id': '0016',\n",
300
+ " 'summary': 'This partial document presents a compreh...'},\n",
301
+ " {'title': '3.2. Distilled Model Evaluation',\n",
302
+ " 'node_id': '0017',\n",
303
+ " 'summary': 'This partial document presents an evalua...'}]},\n",
304
+ " {'title': '4. Discussion',\n",
305
+ " 'node_id': '0018',\n",
306
+ " 'summary': 'This partial document discusses the comp...'},\n",
307
+ " {'title': '5. Conclusion, Limitations, and Future W...',\n",
308
+ " 'node_id': '0019',\n",
309
+ " 'summary': 'This partial document presents the concl...'},\n",
310
+ " {'title': 'References',\n",
311
+ " 'node_id': '0020',\n",
312
+ " 'summary': 'This partial document consists of the re...'},\n",
313
+ " {'title': 'Appendix', 'node_id': '0021', 'summary': '## Appendix\\n'},\n",
314
+ " {'title': 'A. Contributions and Acknowledgments',\n",
315
+ " 'node_id': '0022',\n",
316
+ " 'summary': 'This partial document section details th...'}]}]\n"
317
+ ]
318
+ }
319
+ ],
320
+ "source": [
321
+ "if pi_client.is_retrieval_ready(doc_id):\n",
322
+ " tree = pi_client.get_tree(doc_id, node_summary=True)['result']\n",
323
+ " print('Simplified Tree Structure of the Document:')\n",
324
+ " utils.print_tree(tree)\n",
325
+ "else:\n",
326
+ " print(\"Processing document, please try again later...\")"
327
+ ]
328
+ },
329
+ {
330
+ "cell_type": "markdown",
331
+ "metadata": {
332
+ "id": "USoCLOiQcG1O"
333
+ },
334
+ "source": [
335
+ "## Step 2: Reasoning-Based Retrieval with Tree Search"
336
+ ]
337
+ },
338
+ {
339
+ "cell_type": "markdown",
340
+ "metadata": {},
341
+ "source": [
342
+ "#### 2.1 Use LLM for tree search and identify nodes that might contain relevant context"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": 21,
348
+ "metadata": {
349
+ "id": "LLHNJAtTcG1O"
350
+ },
351
+ "outputs": [],
352
+ "source": [
353
+ "import json\n",
354
+ "\n",
355
+ "query = \"What are the conclusions in this document?\"\n",
356
+ "\n",
357
+ "tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])\n",
358
+ "\n",
359
+ "search_prompt = f\"\"\"\n",
360
+ "You are given a question and a tree structure of a document.\n",
361
+ "Each node contains a node id, node title, and a corresponding summary.\n",
362
+ "Your task is to find all nodes that are likely to contain the answer to the question.\n",
363
+ "\n",
364
+ "Question: {query}\n",
365
+ "\n",
366
+ "Document tree structure:\n",
367
+ "{json.dumps(tree_without_text, indent=2)}\n",
368
+ "\n",
369
+ "Please reply in the following JSON format:\n",
370
+ "{{\n",
371
+ " \"thinking\": \"<Your thinking process on which nodes are relevant to the question>\",\n",
372
+ " \"node_list\": [\"node_id_1\", \"node_id_2\", ..., \"node_id_n\"]\n",
373
+ "}}\n",
374
+ "Directly return the final JSON structure. Do not output anything else.\n",
375
+ "\"\"\"\n",
376
+ "\n",
377
+ "tree_search_result = await call_llm(search_prompt)"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "markdown",
382
+ "metadata": {},
383
+ "source": [
384
+ "#### 2.2 Print retrieved nodes and reasoning process"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": 57,
390
+ "metadata": {
391
+ "colab": {
392
+ "base_uri": "https://localhost:8080/",
393
+ "height": 206
394
+ },
395
+ "id": "P8DVUOuAen5u",
396
+ "outputId": "6bb6d052-ef30-4716-f88e-be98bcb7ebdb"
397
+ },
398
+ "outputs": [
399
+ {
400
+ "name": "stdout",
401
+ "output_type": "stream",
402
+ "text": [
403
+ "Reasoning Process:\n",
404
+ "The question asks for the conclusions in the document. Typically, conclusions are found in sections\n",
405
+ "explicitly titled 'Conclusion' or in sections summarizing the findings and implications of the work.\n",
406
+ "In this document tree, node 0019 ('5. Conclusion, Limitations, and Future Work') is the most\n",
407
+ "directly relevant, as it is dedicated to the conclusion and related topics. Additionally, the\n",
408
+ "'Abstract' (node 0001) may contain a high-level summary that sometimes includes concluding remarks,\n",
409
+ "but it is less likely to contain the full conclusions. Other sections like 'Discussion' (node 0018)\n",
410
+ "may discuss implications but are not explicitly conclusions. Therefore, the primary node is 0019.\n",
411
+ "\n",
412
+ "Retrieved Nodes:\n",
413
+ "Node ID: 0019\t Page: 16\t Title: 5. Conclusion, Limitations, and Future Work\n"
414
+ ]
415
+ }
416
+ ],
417
+ "source": [
418
+ "node_map = utils.create_node_mapping(tree)\n",
419
+ "tree_search_result_json = json.loads(tree_search_result)\n",
420
+ "\n",
421
+ "print('Reasoning Process:')\n",
422
+ "utils.print_wrapped(tree_search_result_json['thinking'])\n",
423
+ "\n",
424
+ "print('\\nRetrieved Nodes:')\n",
425
+ "for node_id in tree_search_result_json[\"node_list\"]:\n",
426
+ " node = node_map[node_id]\n",
427
+ " print(f\"Node ID: {node['node_id']}\\t Page: {node['page_index']}\\t Title: {node['title']}\")"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "markdown",
432
+ "metadata": {
433
+ "id": "10wOZDG_cG1O"
434
+ },
435
+ "source": [
436
+ "## Step 3: Answer Generation"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "markdown",
441
+ "metadata": {},
442
+ "source": [
443
+ "#### 3.1 Extract relevant context from retrieved nodes"
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": 58,
449
+ "metadata": {
450
+ "colab": {
451
+ "base_uri": "https://localhost:8080/",
452
+ "height": 279
453
+ },
454
+ "id": "a7UCBnXlcG1O",
455
+ "outputId": "8a026ea3-4ef3-473a-a57b-b4565409749e"
456
+ },
457
+ "outputs": [
458
+ {
459
+ "name": "stdout",
460
+ "output_type": "stream",
461
+ "text": [
462
+ "Retrieved Context:\n",
463
+ "\n",
464
+ "## 5. Conclusion, Limitations, and Future Work\n",
465
+ "\n",
466
+ "In this work, we share our journey in enhancing model reasoning abilities through reinforcement\n",
467
+ "learning. DeepSeek-R1-Zero represents a pure RL approach without relying on cold-start data,\n",
468
+ "achieving strong performance across various tasks. DeepSeek-R1 is more powerful, leveraging cold-\n",
469
+ "start data alongside iterative RL fine-tuning. Ultimately, DeepSeek-R1 achieves performance\n",
470
+ "comparable to OpenAI-o1-1217 on a range of tasks.\n",
471
+ "\n",
472
+ "We further explore distillation the reasoning capability to small dense models. We use DeepSeek-R1\n",
473
+ "as the teacher model to generate 800K training samples, and fine-tune several small dense models.\n",
474
+ "The results are promising: DeepSeek-R1-Distill-Qwen-1.5B outperforms GPT-4o and Claude-3.5-Sonnet on\n",
475
+ "math benchmarks with $28.9 \\%$ on AIME and $83.9 \\%$ on MATH. Other dense models also achieve\n",
476
+ "impressive results, significantly outperforming other instructiontuned models based on the same\n",
477
+ "underlying checkpoints.\n",
478
+ "\n",
479
+ "In the fut...\n"
480
+ ]
481
+ }
482
+ ],
483
+ "source": [
484
+ "node_list = json.loads(tree_search_result)[\"node_list\"]\n",
485
+ "relevant_content = \"\\n\\n\".join(node_map[node_id][\"text\"] for node_id in node_list)\n",
486
+ "\n",
487
+ "print('Retrieved Context:\\n')\n",
488
+ "utils.print_wrapped(relevant_content[:1000] + '...')"
489
+ ]
490
+ },
491
+ {
492
+ "cell_type": "markdown",
493
+ "metadata": {},
494
+ "source": [
495
+ "#### 3.2 Generate answer based on retrieved context"
496
+ ]
497
+ },
498
+ {
499
+ "cell_type": "code",
500
+ "execution_count": 59,
501
+ "metadata": {
502
+ "colab": {
503
+ "base_uri": "https://localhost:8080/",
504
+ "height": 210
505
+ },
506
+ "id": "tcp_PhHzcG1O",
507
+ "outputId": "187ff116-9bb0-4ab4-bacb-13944460b5ff"
508
+ },
509
+ "outputs": [
510
+ {
511
+ "name": "stdout",
512
+ "output_type": "stream",
513
+ "text": [
514
+ "Generated Answer:\n",
515
+ "\n",
516
+ "The conclusions in this document are:\n",
517
+ "\n",
518
+ "- DeepSeek-R1-Zero, a pure reinforcement learning (RL) approach without cold-start data, achieves\n",
519
+ "strong performance across various tasks.\n",
520
+ "- DeepSeek-R1, which combines cold-start data with iterative RL fine-tuning, is more powerful and\n",
521
+ "achieves performance comparable to OpenAI-o1-1217 on a range of tasks.\n",
522
+ "- Distilling DeepSeek-R1’s reasoning capabilities into smaller dense models is promising; for\n",
523
+ "example, DeepSeek-R1-Distill-Qwen-1.5B outperforms GPT-4o and Claude-3.5-Sonnet on math benchmarks,\n",
524
+ "and other dense models also show significant improvements over similar instruction-tuned models.\n",
525
+ "\n",
526
+ "These results demonstrate the effectiveness of the RL-based approach and the potential for\n",
527
+ "distilling reasoning abilities into smaller models.\n"
528
+ ]
529
+ }
530
+ ],
531
+ "source": [
532
+ "answer_prompt = f\"\"\"\n",
533
+ "Answer the question based on the context:\n",
534
+ "\n",
535
+ "Question: {query}\n",
536
+ "Context: {relevant_content}\n",
537
+ "\n",
538
+ "Provide a clear, concise answer based only on the context provided.\n",
539
+ "\"\"\"\n",
540
+ "\n",
541
+ "print('Generated Answer:\\n')\n",
542
+ "answer = await call_llm(answer_prompt)\n",
543
+ "utils.print_wrapped(answer)"
544
+ ]
545
+ },
546
+ {
547
+ "cell_type": "markdown",
548
+ "metadata": {
549
+ "id": "_1kaGD3GcG1O"
550
+ },
551
+ "source": [
552
+ "---\n",
553
+ "\n",
554
+ "## 🎯 What's Next\n",
555
+ "\n",
556
+ "This notebook has demonstrated a **basic**, **minimal** example of **reasoning-based**, **vectorless** RAG with PageIndex. The workflow illustrates the core idea:\n",
557
+ "> *Generating a hierarchical tree structure from a document, reasoning over that tree structure, and extracting relevant context, without relying on a vector database or top-k similarity search*.\n",
558
+ "\n",
559
+ "While this notebook highlights a minimal workflow, the PageIndex framework is built to support **far more advanced** use cases. In upcoming tutorials, we will introduce:\n",
560
+ "* **Multi-Node Reasoning with Content Extraction** — Scale tree search to extract and select relevant content from multiple nodes.\n",
561
+ "* **Multi-Document Search** — Enable reasoning-based navigation across large document collections, extending beyond a single file.\n",
562
+ "* **Efficient Tree Search** — Improve tree search efficiency for long documents with a large number of nodes.\n",
563
+ "* **Expert Knowledge Integration and Preference Alignment** — Incorporate user preferences or expert insights by adding knowledge directly into the LLM tree search, without the need for fine-tuning.\n",
564
+ "\n"
565
+ ]
566
+ },
567
+ {
568
+ "cell_type": "markdown",
569
+ "metadata": {},
570
+ "source": [
571
+ "## 🔎 Learn More About PageIndex\n",
572
+ " <a href=\"https://vectify.ai\">🏠 Homepage</a>&nbsp; • &nbsp;\n",
573
+ " <a href=\"https://dash.pageindex.ai\">🖥️ Dashboard</a>&nbsp; • &nbsp;\n",
574
+ " <a href=\"https://docs.pageindex.ai/quickstart\">📚 API Docs</a>&nbsp; • &nbsp;\n",
575
+ " <a href=\"https://github.com/VectifyAI/PageIndex\">📦 GitHub</a>&nbsp; • &nbsp;\n",
576
+ " <a href=\"https://discord.com/invite/VuXuf29EUj\">💬 Discord</a>&nbsp; • &nbsp;\n",
577
+ " <a href=\"https://ii2abc2jejf.typeform.com/to/tK3AXl8T\">✉️ Contact</a>\n",
578
+ "\n",
579
+ "<br>\n",
580
+ "\n",
581
+ "© 2025 [Vectify AI](https://vectify.ai)"
582
+ ]
583
+ }
584
+ ],
585
+ "metadata": {
586
+ "colab": {
587
+ "provenance": []
588
+ },
589
+ "kernelspec": {
590
+ "display_name": "Python 3",
591
+ "language": "python",
592
+ "name": "python3"
593
+ },
594
+ "language_info": {
595
+ "codemirror_mode": {
596
+ "name": "ipython",
597
+ "version": 3
598
+ },
599
+ "file_extension": ".py",
600
+ "mimetype": "text/x-python",
601
+ "name": "python",
602
+ "nbconvert_exporter": "python",
603
+ "pygments_lexer": "ipython3",
604
+ "version": "3.11.9"
605
+ }
606
+ },
607
+ "nbformat": 4,
608
+ "nbformat_minor": 0
609
+ }
cookbook/vision_RAG_pageindex.ipynb ADDED
@@ -0,0 +1,667 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "TCh9BTedHJK1"
7
+ },
8
+ "source": [
9
+ "![pageindex_banner](https://pageindex.ai/static/images/pageindex_banner.jpg)\n"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "markdown",
14
+ "metadata": {
15
+ "id": "nD0hb4TFHWTt"
16
+ },
17
+ "source": [
18
+ "<div align=\"center\">\n",
19
+ "<p><i>Reasoning-based RAG&nbsp; ◦ &nbsp;No Vector DB&nbsp; ◦ &nbsp;No Chunking&nbsp; ◦ &nbsp;Human-like Retrieval</i></p>\n",
20
+ "</div>\n",
21
+ "\n",
22
+ "<div align=\"center\">\n",
23
+ "<p>\n",
24
+ " <a href=\"https://vectify.ai\">🏠 Homepage</a>&nbsp; • &nbsp;\n",
25
+ " <a href=\"https://chat.pageindex.ai\">💻 Chat</a>&nbsp; • &nbsp;\n",
26
+ " <a href=\"https://pageindex.ai/mcp\">🔌 MCP</a>&nbsp; • &nbsp;\n",
27
+ " <a href=\"https://docs.pageindex.ai/quickstart\">📚 API</a>&nbsp; • &nbsp;\n",
28
+ " <a href=\"https://github.com/VectifyAI/PageIndex\">📦 GitHub</a>&nbsp; • &nbsp;\n",
29
+ " <a href=\"https://discord.com/invite/VuXuf29EUj\">💬 Discord</a>&nbsp; • &nbsp;\n",
30
+ " <a href=\"https://ii2abc2jejf.typeform.com/to/tK3AXl8T\">✉️ Contact</a>&nbsp;\n",
31
+ "</p>\n",
32
+ "</div>\n",
33
+ "\n",
34
+ "<div align=\"center\">\n",
35
+ "\n",
36
+ "[![Star us on GitHub](https://img.shields.io/github/stars/VectifyAI/PageIndex?style=for-the-badge&logo=github&label=⭐️%20Star%20Us)](https://github.com/VectifyAI/PageIndex) &nbsp;&nbsp; [![Follow us on X](https://img.shields.io/badge/Follow%20Us-000000?style=for-the-badge&logo=x&logoColor=white)](https://twitter.com/VectifyAI)\n",
37
+ "\n",
38
+ "</div>\n",
39
+ "\n",
40
+ "---"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "markdown",
45
+ "metadata": {},
46
+ "source": [
47
+ "> Check out our blog post, \"[Do We Still Need OCR?](https://pageindex.ai/blog/do-we-need-ocr)\", for a more detailed discussion."
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "metadata": {
53
+ "id": "Ebvn5qfpcG1K"
54
+ },
55
+ "source": [
56
+ "# A Vision-based, Vectorless RAG System for Long Documents\n"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "markdown",
61
+ "metadata": {},
62
+ "source": [
63
+ "In modern document question answering (QA) systems, Optical Character Recognition (OCR) serves an important role by converting PDF pages into text that can be processed by Large Language Models (LLMs). The resulting text can provide contextual input that enables LLMs to perform question answering over document content.\n",
64
+ "\n",
65
+ "Traditional OCR systems typically use a two-stage process that first detects the layout of a PDF — dividing it into text, tables, and images — and then recognizes and converts these elements into plain text. With the rise of vision-language models (VLMs) (such as [Qwen-VL](https://github.com/QwenLM/Qwen3-VL) and [GPT-4.1](https://openai.com/index/gpt-4-1/)), new end-to-end OCR models like [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR) have emerged. These models jointly understand visual and textual information, enabling direct interpretation of PDFs without an explicit layout detection step.\n",
66
+ "\n",
67
+ "However, this paradigm shift raises an important question: \n",
68
+ "\n",
69
+ "\n",
70
+ "> **If a VLM can already process both the document images and the query to produce an answer directly, do we still need the intermediate OCR step?**\n",
71
+ "\n",
72
+ "In this notebook, we give a practical implementation of a vision-based question-answering system for long documents, without relying on OCR. Specifically, we use PageIndex as a reasoning-based retrieval layer and OpenAI's multimodal GPT-4.1 as the VLM for visual reasoning and answer generation.\n",
73
+ "\n",
74
+ "See the original [blog post](https://pageindex.ai/blog/do-we-need-ocr) for a more detailed discussion on how VLMs can replace traditional OCR pipelines in document question-answering."
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "markdown",
79
+ "metadata": {},
80
+ "source": [
81
+ "## 📝 Notebook Overview\n",
82
+ "\n",
83
+ "This notebook demonstrates a *minimal*, **vision-based vectorless RAG** pipeline for long documents with PageIndex, using only visual context from PDF pages. You will learn how to:\n",
84
+ "- [x] Build a PageIndex tree structure of a document\n",
85
+ "- [x] Perform reasoning-based retrieval with tree search\n",
86
+ "- [x] Extract PDF page images of retrieved tree nodes for visual context\n",
87
+ "- [x] Generate answers using VLM with PDF image inputs only (no OCR required)\n",
88
+ "\n",
89
+ "> ⚡ Note: This example uses PageIndex's reasoning-based retrieval with OpenAI's multimodal GPT-4.1 model for both tree search and visual context reasoning.\n",
90
+ "\n",
91
+ "---"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "markdown",
96
+ "metadata": {
97
+ "id": "7ziuTbbWcG1L"
98
+ },
99
+ "source": [
100
+ "## Step 0: Preparation\n",
101
+ "\n",
102
+ "This notebook demonstrates **Vision-based RAG** with PageIndex, using PDF page images as visual context for retrieval and answer generation.\n",
103
+ "\n"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "markdown",
108
+ "metadata": {
109
+ "id": "edTfrizMFK4c"
110
+ },
111
+ "source": [
112
+ "#### 0.1 Install PageIndex"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": null,
118
+ "metadata": {
119
+ "collapsed": true,
120
+ "id": "LaoB58wQFNDh"
121
+ },
122
+ "outputs": [],
123
+ "source": [
124
+ "%pip install -q --upgrade pageindex requests openai PyMuPDF"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "markdown",
129
+ "metadata": {
130
+ "id": "WVEWzPKGcG1M"
131
+ },
132
+ "source": [
133
+ "#### 0.2 Setup PageIndex"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": null,
139
+ "metadata": {
140
+ "id": "StvqfcK4cG1M"
141
+ },
142
+ "outputs": [],
143
+ "source": [
144
+ "from pageindex import PageIndexClient\n",
145
+ "import pageindex.utils as utils\n",
146
+ "\n",
147
+ "# Get your PageIndex API key from https://dash.pageindex.ai/api-keys\n",
148
+ "PAGEINDEX_API_KEY = \"YOUR_PAGEINDEX_API_KEY\"\n",
149
+ "pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "markdown",
154
+ "metadata": {},
155
+ "source": [
156
+ "#### 0.3 Setup VLM\n",
157
+ "\n",
158
+ "Choose your preferred VLM — in this notebook, we use OpenAI's multimodal GPT-4.1 as the VLM."
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": null,
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "import openai, fitz, base64, os\n",
168
+ "\n",
169
+ "# Setup OpenAI client\n",
170
+ "OPENAI_API_KEY = \"YOUR_OPENAI_API_KEY\"\n",
171
+ "\n",
172
+ "async def call_vlm(prompt, image_paths=None, model=\"gpt-4.1\"):\n",
173
+ " client = openai.AsyncOpenAI(api_key=OPENAI_API_KEY)\n",
174
+ " messages = [{\"role\": \"user\", \"content\": prompt}]\n",
175
+ " if image_paths:\n",
176
+ " content = [{\"type\": \"text\", \"text\": prompt}]\n",
177
+ " for image in image_paths:\n",
178
+ " if os.path.exists(image):\n",
179
+ " with open(image, \"rb\") as image_file:\n",
180
+ " image_data = base64.b64encode(image_file.read()).decode('utf-8')\n",
181
+ " content.append({\n",
182
+ " \"type\": \"image_url\",\n",
183
+ " \"image_url\": {\n",
184
+ " \"url\": f\"data:image/jpeg;base64,{image_data}\"\n",
185
+ " }\n",
186
+ " })\n",
187
+ " messages[0][\"content\"] = content\n",
188
+ " response = await client.chat.completions.create(model=model, messages=messages, temperature=0)\n",
189
+ " return response.choices[0].message.content.strip()"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "markdown",
194
+ "metadata": {},
195
+ "source": [
196
+ "#### 0.4 PDF Image Extraction Helper Functions\n"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": null,
202
+ "metadata": {},
203
+ "outputs": [],
204
+ "source": [
205
+ "def extract_pdf_page_images(pdf_path, output_dir=\"pdf_images\"):\n",
206
+ " os.makedirs(output_dir, exist_ok=True)\n",
207
+ " pdf_document = fitz.open(pdf_path)\n",
208
+ " page_images = {}\n",
209
+ " total_pages = len(pdf_document)\n",
210
+ " for page_number in range(len(pdf_document)):\n",
211
+ " page = pdf_document.load_page(page_number)\n",
212
+ " # Convert page to image\n",
213
+ " mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better quality\n",
214
+ " pix = page.get_pixmap(matrix=mat)\n",
215
+ " img_data = pix.tobytes(\"jpeg\")\n",
216
+ " image_path = os.path.join(output_dir, f\"page_{page_number + 1}.jpg\")\n",
217
+ " with open(image_path, \"wb\") as image_file:\n",
218
+ " image_file.write(img_data)\n",
219
+ " page_images[page_number + 1] = image_path\n",
220
+ " print(f\"Saved page {page_number + 1} image: {image_path}\")\n",
221
+ " pdf_document.close()\n",
222
+ " return page_images, total_pages\n",
223
+ "\n",
224
+ "def get_page_images_for_nodes(node_list, node_map, page_images):\n",
225
+ " # Get PDF page images for retrieved nodes\n",
226
+ " image_paths = []\n",
227
+ " seen_pages = set()\n",
228
+ " for node_id in node_list:\n",
229
+ " node_info = node_map[node_id]\n",
230
+ " for page_num in range(node_info['start_index'], node_info['end_index'] + 1):\n",
231
+ " if page_num not in seen_pages:\n",
232
+ " image_paths.append(page_images[page_num])\n",
233
+ " seen_pages.add(page_num)\n",
234
+ " return image_paths\n"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "markdown",
239
+ "metadata": {
240
+ "id": "heGtIMOVcG1N"
241
+ },
242
+ "source": [
243
+ "## Step 1: PageIndex Tree Generation"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "markdown",
248
+ "metadata": {
249
+ "id": "Mzd1VWjwMUJL"
250
+ },
251
+ "source": [
252
+ "#### 1.1 Submit a document for generating PageIndex tree"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": null,
258
+ "metadata": {
259
+ "colab": {
260
+ "base_uri": "https://localhost:8080/"
261
+ },
262
+ "id": "f6--eZPLcG1N",
263
+ "outputId": "ca688cfd-6c4b-4a57-dac2-f3c2604c4112"
264
+ },
265
+ "outputs": [],
266
+ "source": [
267
+ "import os, requests\n",
268
+ "\n",
269
+ "# You can also use our GitHub repo to generate PageIndex tree\n",
270
+ "# https://github.com/VectifyAI/PageIndex\n",
271
+ "\n",
272
+ "pdf_url = \"https://arxiv.org/pdf/1706.03762.pdf\" # the \"Attention Is All You Need\" paper\n",
273
+ "pdf_path = os.path.join(\"../data\", pdf_url.split('/')[-1])\n",
274
+ "os.makedirs(os.path.dirname(pdf_path), exist_ok=True)\n",
275
+ "\n",
276
+ "response = requests.get(pdf_url)\n",
277
+ "with open(pdf_path, \"wb\") as f:\n",
278
+ " f.write(response.content)\n",
279
+ "print(f\"Downloaded {pdf_url}\\n\")\n",
280
+ "\n",
281
+ "# Extract page images from PDF\n",
282
+ "print(\"Extracting page images...\")\n",
283
+ "page_images, total_pages = extract_pdf_page_images(pdf_path)\n",
284
+ "print(f\"Extracted {len(page_images)} page images from {total_pages} total pages.\\n\")\n",
285
+ "\n",
286
+ "doc_id = pi_client.submit_document(pdf_path)[\"doc_id\"]\n",
287
+ "print('Document Submitted:', doc_id)"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "markdown",
292
+ "metadata": {
293
+ "id": "4-Hrh0azcG1N"
294
+ },
295
+ "source": [
296
+ "#### 1.2 Get the generated PageIndex tree structure"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "execution_count": 65,
302
+ "metadata": {
303
+ "colab": {
304
+ "base_uri": "https://localhost:8080/",
305
+ "height": 1000
306
+ },
307
+ "id": "b1Q1g6vrcG1O",
308
+ "outputId": "dc944660-38ad-47ea-d358-be422edbae53"
309
+ },
310
+ "outputs": [
311
+ {
312
+ "name": "stdout",
313
+ "output_type": "stream",
314
+ "text": [
315
+ "Simplified Tree Structure of the Document:\n",
316
+ "[{'title': 'Attention Is All You Need',\n",
317
+ " 'node_id': '0000',\n",
318
+ " 'page_index': 1,\n",
319
+ " 'prefix_summary': '# Attention Is All You Need\\n\\nAshish Vasw...',\n",
320
+ " 'nodes': [{'title': 'Abstract',\n",
321
+ " 'node_id': '0001',\n",
322
+ " 'page_index': 1,\n",
323
+ " 'summary': 'The text introduces the Transformer, a n...'},\n",
324
+ " {'title': '1 Introduction',\n",
325
+ " 'node_id': '0002',\n",
326
+ " 'page_index': 2,\n",
327
+ " 'summary': 'The text introduces the Transformer, a n...'},\n",
328
+ " {'title': '2 Background',\n",
329
+ " 'node_id': '0003',\n",
330
+ " 'page_index': 2,\n",
331
+ " 'summary': 'This section discusses the Transformer m...'},\n",
332
+ " {'title': '3 Model Architecture',\n",
333
+ " 'node_id': '0004',\n",
334
+ " 'page_index': 2,\n",
335
+ " 'prefix_summary': 'The text describes the encoder-decoder a...',\n",
336
+ " 'nodes': [{'title': '3.1 Encoder and Decoder Stacks',\n",
337
+ " 'node_id': '0005',\n",
338
+ " 'page_index': 3,\n",
339
+ " 'summary': 'The text describes the encoder and decod...'},\n",
340
+ " {'title': '3.2 Attention',\n",
341
+ " 'node_id': '0006',\n",
342
+ " 'page_index': 3,\n",
343
+ " 'prefix_summary': '### 3.2 Attention\\n\\nAn attention function...',\n",
344
+ " 'nodes': [{'title': '3.2.1 Scaled Dot-Product Attention',\n",
345
+ " 'node_id': '0007',\n",
346
+ " 'page_index': 4,\n",
347
+ " 'summary': 'The text describes Scaled Dot-Product At...'},\n",
348
+ " {'title': '3.2.2 Multi-Head Attention',\n",
349
+ " 'node_id': '0008',\n",
350
+ " 'page_index': 4,\n",
351
+ " 'summary': 'The text describes Multi-Head Attention,...'},\n",
352
+ " {'title': '3.2.3 Applications of Attention in our M...',\n",
353
+ " 'node_id': '0009',\n",
354
+ " 'page_index': 5,\n",
355
+ " 'summary': 'The text describes the three application...'}]},\n",
356
+ " {'title': '3.3 Position-wise Feed-Forward Networks',\n",
357
+ " 'node_id': '0010',\n",
358
+ " 'page_index': 5,\n",
359
+ " 'summary': '### 3.3 Position-wise Feed-Forward Netwo...'},\n",
360
+ " {'title': '3.4 Embeddings and Softmax',\n",
361
+ " 'node_id': '0011',\n",
362
+ " 'page_index': 5,\n",
363
+ " 'summary': 'The text describes the use of learned em...'},\n",
364
+ " {'title': '3.5 Positional Encoding',\n",
365
+ " 'node_id': '0012',\n",
366
+ " 'page_index': 6,\n",
367
+ " 'summary': 'This section explains the necessity of p...'}]},\n",
368
+ " {'title': '4 Why Self-Attention',\n",
369
+ " 'node_id': '0013',\n",
370
+ " 'page_index': 6,\n",
371
+ " 'summary': 'This text compares self-attention layers...'},\n",
372
+ " {'title': '5 Training',\n",
373
+ " 'node_id': '0014',\n",
374
+ " 'page_index': 7,\n",
375
+ " 'prefix_summary': '## 5 Training\\n\\nThis section describes th...',\n",
376
+ " 'nodes': [{'title': '5.1 Training Data and Batching',\n",
377
+ " 'node_id': '0015',\n",
378
+ " 'page_index': 7,\n",
379
+ " 'summary': '### 5.1 Training Data and Batching\\n\\nWe t...'},\n",
380
+ " {'title': '5.2 Hardware and Schedule',\n",
381
+ " 'node_id': '0016',\n",
382
+ " 'page_index': 7,\n",
383
+ " 'summary': '### 5.2 Hardware and Schedule\\n\\nWe traine...'},\n",
384
+ " {'title': '5.3 Optimizer',\n",
385
+ " 'node_id': '0017',\n",
386
+ " 'page_index': 7,\n",
387
+ " 'summary': '### 5.3 Optimizer\\n\\nWe used the Adam opti...'},\n",
388
+ " {'title': '5.4 Regularization',\n",
389
+ " 'node_id': '0018',\n",
390
+ " 'page_index': 7,\n",
391
+ " 'summary': 'The text details three regularization te...'}]},\n",
392
+ " {'title': '6 Results',\n",
393
+ " 'node_id': '0019',\n",
394
+ " 'page_index': 8,\n",
395
+ " 'prefix_summary': '## 6 Results\\n',\n",
396
+ " 'nodes': [{'title': '6.1 Machine Translation',\n",
397
+ " 'node_id': '0020',\n",
398
+ " 'page_index': 8,\n",
399
+ " 'summary': 'The text details the performance of a Tr...'},\n",
400
+ " {'title': '6.2 Model Variations',\n",
401
+ " 'node_id': '0021',\n",
402
+ " 'page_index': 8,\n",
403
+ " 'summary': 'This text details experiments varying co...'},\n",
404
+ " {'title': '6.3 English Constituency Parsing',\n",
405
+ " 'node_id': '0022',\n",
406
+ " 'page_index': 9,\n",
407
+ " 'summary': 'The text describes experiments evaluatin...'}]},\n",
408
+ " {'title': '7 Conclusion',\n",
409
+ " 'node_id': '0023',\n",
410
+ " 'page_index': 10,\n",
411
+ " 'summary': 'This text concludes by presenting the Tr...'},\n",
412
+ " {'title': 'References',\n",
413
+ " 'node_id': '0024',\n",
414
+ " 'page_index': 10,\n",
415
+ " 'summary': 'The provided text is a collection of ref...'},\n",
416
+ " {'title': 'Attention Visualizations',\n",
417
+ " 'node_id': '0025',\n",
418
+ " 'page_index': 13,\n",
419
+ " 'summary': 'The text provides examples of attention ...'}]}]\n"
420
+ ]
421
+ }
422
+ ],
423
+ "source": [
424
+ "if pi_client.is_retrieval_ready(doc_id):\n",
425
+ " tree = pi_client.get_tree(doc_id, node_summary=True)['result']\n",
426
+ " print('Simplified Tree Structure of the Document:')\n",
427
+ " utils.print_tree(tree, exclude_fields=['text'])\n",
428
+ "else:\n",
429
+ " print(\"Processing document, please try again later...\")"
430
+ ]
431
+ },
432
+ {
433
+ "cell_type": "markdown",
434
+ "metadata": {
435
+ "id": "USoCLOiQcG1O"
436
+ },
437
+ "source": [
438
+ "## Step 2: Reasoning-Based Retrieval with Tree Search"
439
+ ]
440
+ },
441
+ {
442
+ "cell_type": "markdown",
443
+ "metadata": {},
444
+ "source": [
445
+ "#### 2.1 Reasoning-based retrieval with PageIndex to identify nodes that might contain relevant context"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "execution_count": null,
451
+ "metadata": {
452
+ "id": "LLHNJAtTcG1O"
453
+ },
454
+ "outputs": [],
455
+ "source": [
456
+ "import json\n",
457
+ "\n",
458
+ "query = \"What is the last operation in the Scaled Dot-Product Attention figure?\"\n",
459
+ "\n",
460
+ "tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])\n",
461
+ "\n",
462
+ "search_prompt = f\"\"\"\n",
463
+ "You are given a question and a tree structure of a document.\n",
464
+ "Each node contains a node id, node title, and a corresponding summary.\n",
465
+ "Your task is to find all tree nodes that are likely to contain the answer to the question.\n",
466
+ "\n",
467
+ "Question: {query}\n",
468
+ "\n",
469
+ "Document tree structure:\n",
470
+ "{json.dumps(tree_without_text, indent=2)}\n",
471
+ "\n",
472
+ "Please reply in the following JSON format:\n",
473
+ "{{\n",
474
+ " \"thinking\": \"<Your thinking process on which nodes are relevant to the question>\",\n",
475
+ " \"node_list\": [\"node_id_1\", \"node_id_2\", ..., \"node_id_n\"]\n",
476
+ "}}\n",
477
+ "Directly return the final JSON structure. Do not output anything else.\n",
478
+ "\"\"\"\n",
479
+ "\n",
480
+ "tree_search_result = await call_vlm(search_prompt)"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "markdown",
485
+ "metadata": {},
486
+ "source": [
487
+ "#### 2.2 Print retrieved nodes and reasoning process"
488
+ ]
489
+ },
490
+ {
491
+ "cell_type": "code",
492
+ "execution_count": 87,
493
+ "metadata": {
494
+ "colab": {
495
+ "base_uri": "https://localhost:8080/",
496
+ "height": 206
497
+ },
498
+ "id": "P8DVUOuAen5u",
499
+ "outputId": "6bb6d052-ef30-4716-f88e-be98bcb7ebdb"
500
+ },
501
+ "outputs": [
502
+ {
503
+ "name": "stdout",
504
+ "output_type": "stream",
505
+ "text": [
506
+ "Reasoning Process:\n",
507
+ "\n",
508
+ "The question asks about the last operation in the Scaled Dot-Product Attention figure. The most\n",
509
+ "relevant section is the one that describes Scaled Dot-Product Attention in detail, including its\n",
510
+ "computation and the figure itself. This is likely found in section 3.2.1 'Scaled Dot-Product\n",
511
+ "Attention' (node_id: 0007), which is a subsection of 3.2 'Attention' (node_id: 0006). The parent\n",
512
+ "section 3.2 may also contain the figure and its caption, as the summary mentions Figure 2 (which is\n",
513
+ "the Scaled Dot-Product Attention figure). Therefore, both node 0006 and node 0007 are likely to\n",
514
+ "contain the answer.\n",
515
+ "\n",
516
+ "Retrieved Nodes:\n",
517
+ "\n",
518
+ "Node ID: 0006\t Pages: 3-4\t Title: 3.2 Attention\n",
519
+ "Node ID: 0007\t Pages: 4\t Title: 3.2.1 Scaled Dot-Product Attention\n"
520
+ ]
521
+ }
522
+ ],
523
+ "source": [
524
+ "node_map = utils.create_node_mapping(tree, include_page_ranges=True, max_page=total_pages)\n",
525
+ "tree_search_result_json = json.loads(tree_search_result)\n",
526
+ "\n",
527
+ "print('Reasoning Process:\\n')\n",
528
+ "utils.print_wrapped(tree_search_result_json['thinking'])\n",
529
+ "\n",
530
+ "print('\\nRetrieved Nodes:\\n')\n",
531
+ "for node_id in tree_search_result_json[\"node_list\"]:\n",
532
+ " node_info = node_map[node_id]\n",
533
+ " node = node_info['node']\n",
534
+ " start_page = node_info['start_index']\n",
535
+ " end_page = node_info['end_index']\n",
536
+ " page_range = start_page if start_page == end_page else f\"{start_page}-{end_page}\"\n",
537
+ " print(f\"Node ID: {node['node_id']}\\t Pages: {page_range}\\t Title: {node['title']}\")"
538
+ ]
539
+ },
540
+ {
541
+ "cell_type": "markdown",
542
+ "metadata": {},
543
+ "source": [
544
+ "#### 2.3 Get corresponding PDF page images of retrieved nodes"
545
+ ]
546
+ },
547
+ {
548
+ "cell_type": "code",
549
+ "execution_count": 81,
550
+ "metadata": {},
551
+ "outputs": [
552
+ {
553
+ "name": "stdout",
554
+ "output_type": "stream",
555
+ "text": [
556
+ "\n",
557
+ "Retrieved 2 PDF page image(s) for visual context.\n"
558
+ ]
559
+ }
560
+ ],
561
+ "source": [
562
+ "retrieved_nodes = tree_search_result_json[\"node_list\"]\n",
563
+ "retrieved_page_images = get_page_images_for_nodes(retrieved_nodes, node_map, page_images)\n",
564
+ "print(f'\\nRetrieved {len(retrieved_page_images)} PDF page image(s) for visual context.')"
565
+ ]
566
+ },
567
+ {
568
+ "cell_type": "markdown",
569
+ "metadata": {
570
+ "id": "10wOZDG_cG1O"
571
+ },
572
+ "source": [
573
+ "## Step 3: Answer Generation"
574
+ ]
575
+ },
576
+ {
577
+ "cell_type": "markdown",
578
+ "metadata": {},
579
+ "source": [
580
+ "#### 3.1 Generate answer using VLM with visual context"
581
+ ]
582
+ },
583
+ {
584
+ "cell_type": "code",
585
+ "execution_count": null,
586
+ "metadata": {
587
+ "colab": {
588
+ "base_uri": "https://localhost:8080/",
589
+ "height": 210
590
+ },
591
+ "id": "tcp_PhHzcG1O",
592
+ "outputId": "187ff116-9bb0-4ab4-bacb-13944460b5ff"
593
+ },
594
+ "outputs": [
595
+ {
596
+ "name": "stdout",
597
+ "output_type": "stream",
598
+ "text": [
599
+ "Generated answer using VLM with retrieved PDF page images as visual context:\n",
600
+ "\n",
601
+ "The last operation in the **Scaled Dot-Product Attention** figure is a **MatMul** (matrix\n",
602
+ "multiplication). This operation multiplies the attention weights (after softmax) by the value matrix\n",
603
+ "\\( V \\).\n"
604
+ ]
605
+ }
606
+ ],
607
+ "source": [
608
+ "# Generate answer using VLM with only PDF page images as visual context\n",
609
+ "answer_prompt = f\"\"\"\n",
610
+ "Answer the question based on the images of the document pages as context.\n",
611
+ "\n",
612
+ "Question: {query}\n",
613
+ "\n",
614
+ "Provide a clear, concise answer based only on the context provided.\n",
615
+ "\"\"\"\n",
616
+ "\n",
617
+ "print('Generated answer using VLM with retrieved PDF page images as visual context:\\n')\n",
618
+ "answer = await call_vlm(answer_prompt, retrieved_page_images)\n",
619
+ "utils.print_wrapped(answer)"
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "markdown",
624
+ "metadata": {},
625
+ "source": [
626
+ "## Conclusion\n",
627
+ "\n",
628
+ "In this notebook, we demonstrated a *minimal* **vision-based, vectorless RAG pipeline** using PageIndex and a VLM. The system retrieves relevant pages by reasoning over the document’s hierarchical tree index and answers questions directly from PDF images — no OCR required.\n",
629
+ "\n",
630
+ "If you’re interested in building your own **reasoning-based document QA system**, try [PageIndex Chat](https://chat.pageindex.ai), or integrate via [PageIndex MCP](https://pageindex.ai/mcp) and the [API](https://docs.pageindex.ai/quickstart). You can also explore the [GitHub repo](https://github.com/VectifyAI/PageIndex) for open-source implementations and additional examples."
631
+ ]
632
+ },
633
+ {
634
+ "cell_type": "markdown",
635
+ "metadata": {},
636
+ "source": [
637
+ "\n",
638
+ "\n",
639
+ "© 2025 [Vectify AI](https://vectify.ai)"
640
+ ]
641
+ }
642
+ ],
643
+ "metadata": {
644
+ "colab": {
645
+ "provenance": []
646
+ },
647
+ "kernelspec": {
648
+ "display_name": "Python 3",
649
+ "language": "python",
650
+ "name": "python3"
651
+ },
652
+ "language_info": {
653
+ "codemirror_mode": {
654
+ "name": "ipython",
655
+ "version": 3
656
+ },
657
+ "file_extension": ".py",
658
+ "mimetype": "text/x-python",
659
+ "name": "python",
660
+ "nbconvert_exporter": "python",
661
+ "pygments_lexer": "ipython3",
662
+ "version": "3.11.9"
663
+ }
664
+ },
665
+ "nbformat": 4,
666
+ "nbformat_minor": 0
667
+ }
llm_config.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from openai import OpenAI
3
+
4
+ def get_llm_client(provider="nvidia"):
5
+ """
6
+ Returns an OpenAI client configured for the specified provider.
7
+
8
+ Args:
9
+ provider (str): "nvidia" or "mistral"
10
+
11
+ Returns:
12
+ OpenAI: The configured client
13
+ """
14
+ if provider == "nvidia":
15
+ # Llama 4 Maverick via NVIDIA NIM
16
+ api_key = os.getenv("NVIDIA_API_KEY")
17
+ if not api_key:
18
+ print("Warning: NVIDIA_API_KEY not found in environment variables.")
19
+
20
+ return OpenAI(
21
+ base_url="https://integrate.api.nvidia.com/v1",
22
+ api_key=api_key
23
+ )
24
+ else:
25
+ # Mistral Large 3 via Mistral API
26
+ api_key = os.getenv("MISTRAL_API_KEY")
27
+ if not api_key:
28
+ print("Warning: MISTRAL_API_KEY not found in environment variables.")
29
+
30
+ return OpenAI(
31
+ base_url="https://api.mistral.ai/v1",
32
+ api_key=api_key
33
+ )
34
+
35
+ def get_model_name(provider="nvidia"):
36
+ if provider == "nvidia":
37
+ return "meta/llama-4-maverick"
38
+ else:
39
+ return "mistral-large-latest"
pageindex/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .page_index import *
2
+ from .page_index_md import md_to_tree
pageindex/config.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ model: "gpt-4o-2024-11-20"
2
+ toc_check_page_num: 20
3
+ max_page_num_each_node: 10
4
+ max_token_num_each_node: 20000
5
+ if_add_node_id: "yes"
6
+ if_add_node_summary: "yes"
7
+ if_add_doc_description: "no"
8
+ if_add_node_text: "no"
pageindex/core/tree_index.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ import asyncio
4
+ from typing import List, Dict, Any, Optional
5
+
6
+ # Import potentially available util functions or re-implement if necessary
7
+ # Assuming we are in pageindex/core/tree_index.py, and page_index_md is in pageindex/page_index_md.py
8
+ try:
9
+ from ...pageindex.page_index_md import (
10
+ extract_nodes_from_markdown,
11
+ extract_node_text_content,
12
+ build_tree_from_nodes,
13
+ count_tokens # Assuming this exists in utils or page_index_md imports
14
+ )
15
+ except ImportError:
16
+ # If imports fail due to path issues, we might need to adjust python path or copy logic.
17
+ # For now, let's assume we can import from the sibling directory if the package structure is correct.
18
+ # Alternatively, we can re-implement the core logic here to be self-contained.
19
+ pass
20
+
21
+ class TreeIndex:
22
+ def __init__(self, model: str = "gpt-4-turbo"):
23
+ self.model = model
24
+ self.tree = None
25
+
26
+ def build_from_markdown(self, markdown_text: str) -> str:
27
+ """
28
+ Builds the tree index from markdown text.
29
+
30
+ Args:
31
+ markdown_text (str): The markdown content.
32
+
33
+ Returns:
34
+ str: A document ID or confirmation (for this implementation, it returns 'doc_id').
35
+ """
36
+ print(f"Extracting nodes from markdown content...")
37
+ # Re-implementing core logic here to ensure stability and avoid complex relative imports for now
38
+ # This matches page_index_md.py logic
39
+
40
+ # 1. Extract nodes
41
+ node_list, lines = self._extract_nodes(markdown_text)
42
+
43
+ # 2. Extract text content
44
+ nodes_with_content = self._extract_node_text(node_list, lines)
45
+
46
+ # 3. Build tree
47
+ self.tree = self._build_tree(nodes_with_content)
48
+
49
+ # 4. Add Node IDs (simple counter)
50
+ self._add_node_ids(self.tree)
51
+
52
+ print(f"Tree built successfully.")
53
+ return "doc_1" # In a real app this would be a UUID
54
+
55
+ def _extract_nodes(self, markdown_content):
56
+ header_pattern = r'^(#{1,6})\s+(.+)$'
57
+ code_block_pattern = r'^```'
58
+ node_list = []
59
+
60
+ lines = markdown_content.split('\n')
61
+ in_code_block = False
62
+
63
+ for line_num, line in enumerate(lines, 1):
64
+ stripped_line = line.strip()
65
+
66
+ if re.match(code_block_pattern, stripped_line):
67
+ in_code_block = not in_code_block
68
+ continue
69
+
70
+ if not stripped_line:
71
+ continue
72
+
73
+ if not in_code_block:
74
+ match = re.match(header_pattern, stripped_line)
75
+ if match:
76
+ title = match.group(2).strip()
77
+ node_list.append({'node_title': title, 'line_num': line_num})
78
+
79
+ return node_list, lines
80
+
81
+ def _extract_node_text(self, node_list, lines):
82
+ all_nodes = []
83
+ for node in node_list:
84
+ line_content = lines[node['line_num'] - 1]
85
+ header_match = re.match(r'^(#{1,6})', line_content)
86
+
87
+ if header_match:
88
+ all_nodes.append({
89
+ 'title': node['node_title'],
90
+ 'line_num': node['line_num'],
91
+ 'level': len(header_match.group(1))
92
+ })
93
+
94
+ for i, node in enumerate(all_nodes):
95
+ start_line = node['line_num'] - 1
96
+ if i + 1 < len(all_nodes):
97
+ end_line = all_nodes[i + 1]['line_num'] - 1
98
+ else:
99
+ end_line = len(lines)
100
+
101
+ node['text'] = '\n'.join(lines[start_line:end_line]).strip()
102
+ return all_nodes
103
+
104
+ def _build_tree(self, node_list):
105
+ if not node_list:
106
+ return []
107
+
108
+ stack = []
109
+ root_nodes = []
110
+
111
+ for node in node_list:
112
+ current_level = node['level']
113
+
114
+ tree_node = {
115
+ 'title': node['title'],
116
+ 'text': node['text'],
117
+ 'line_num': node['line_num'],
118
+ 'nodes': []
119
+ }
120
+
121
+ while stack and stack[-1][1] >= current_level:
122
+ stack.pop()
123
+
124
+ if not stack:
125
+ root_nodes.append(tree_node)
126
+ else:
127
+ parent_node, parent_level = stack[-1]
128
+ parent_node['nodes'].append(tree_node)
129
+
130
+ stack.append((tree_node, current_level))
131
+
132
+ return root_nodes
133
+
134
+ def _add_node_ids(self, nodes, prefix=""):
135
+ for i, node in enumerate(nodes):
136
+ node_id = f"{prefix}{i+1}"
137
+ node['node_id'] = node_id
138
+ if node.get('nodes'):
139
+ self._add_node_ids(node['nodes'], prefix=f"{node_id}.")
140
+
141
+ def reasoning_search(self, query: str, llm_client: Any) -> str:
142
+ """
143
+ Performs a tree search to find relevant nodes for the query.
144
+
145
+ Args:
146
+ query (str): The user query.
147
+ llm_client: The initialized OpenAI client.
148
+
149
+ Returns:
150
+ str: Context concatenated from relevant nodes.
151
+ """
152
+ if not self.tree:
153
+ return "Tree not index built. Please upload a document first."
154
+
155
+ # Simplify tree for prompt (remove raw text to save tokens, keep titles and IDs)
156
+ tree_summary = self._get_tree_structure_summary(self.tree)
157
+
158
+ prompt = f"""
159
+ You are given a query and the tree structure of a document.
160
+ You need to find all nodes that are likely to contain the answer.
161
+
162
+ Query: {query}
163
+
164
+ Document tree structure:
165
+ {json.dumps(tree_summary, indent=2)}
166
+
167
+ Reply in the following JSON format:
168
+ {{
169
+ "thinking": <your reasoning about which nodes are relevant>,
170
+ "node_list": ["node_id1", "node_id2", ...]
171
+ }}
172
+ """
173
+
174
+ try:
175
+ response = llm_client.chat.completions.create(
176
+ model="gpt-4-turbo", # Provide a default, though the client might override or ignored it for NIM
177
+ messages=[{"role": "user", "content": prompt}],
178
+ temperature=0.1
179
+ )
180
+
181
+ content = response.choices[0].message.content
182
+ # Basic JSON extraction if LLM adds markdown
183
+ if "```json" in content:
184
+ content = content.split("```json")[1].split("```")[0].strip()
185
+ elif "```" in content:
186
+ content = content.split("```")[1].split("```")[0].strip()
187
+
188
+ result = json.loads(content)
189
+ relevant_node_ids = result.get("node_list", [])
190
+
191
+ # Retrieve text for these nodes
192
+ context_parts = []
193
+ for node_id in relevant_node_ids:
194
+ node = self._find_node_by_id(self.tree, node_id)
195
+ if node:
196
+ context_parts.append(f"--- Section: {node['title']} (ID: {node_id}) ---\n{node['text']}\n")
197
+
198
+ full_context = "\n".join(context_parts)
199
+ if not full_context:
200
+ return "No relevant context found in the document tree."
201
+ return full_context
202
+
203
+ except Exception as e:
204
+ return f"Error during reasoning search: {str(e)}"
205
+
206
+ def _get_tree_structure_summary(self, nodes):
207
+ summary = []
208
+ for node in nodes:
209
+ node_summary = {
210
+ 'title': node['title'],
211
+ 'node_id': node['node_id']
212
+ }
213
+ if node.get('nodes'):
214
+ node_summary['nodes'] = self._get_tree_structure_summary(node['nodes'])
215
+ summary.append(node_summary)
216
+ return summary
217
+
218
+ def _find_node_by_id(self, nodes, target_id):
219
+ for node in nodes:
220
+ if node.get('node_id') == target_id:
221
+ return node
222
+ if node.get('nodes'):
223
+ found = self._find_node_by_id(node['nodes'], target_id)
224
+ if found:
225
+ return found
226
+ return None
pageindex/page_index.py ADDED
@@ -0,0 +1,1144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import copy
4
+ import math
5
+ import random
6
+ import re
7
+ from .utils import *
8
+ import os
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+
11
+
12
+ ################### check title in page #########################################################
13
async def check_title_appearance(item, page_list, start_index=1, model=None):
    """Verify with the LLM that a TOC item's title appears on its assigned page.

    Args:
        item: TOC entry dict; reads 'title' and, when present, 'physical_index'
            and 'list_index'.
        page_list: list of (page_text, ...) tuples for the whole document.
        start_index: physical index of page_list[0] (default 1).
        model: model name forwarded to ChatGPT_API_async.

    Returns:
        dict with keys list_index, answer ('yes'/'no'), title, page_number.

    Fix: the original used item['list_index'] on the success path but
    item.get('list_index') on the early-exit path, so a missing key raised
    KeyError only sometimes; it also assumed the LLM reply always carried an
    'answer' field. Both lookups are now defensive and consistent.
    """
    title = item['title']
    if 'physical_index' not in item or item['physical_index'] is None:
        return {'list_index': item.get('list_index'), 'answer': 'no', 'title': title, 'page_number': None}

    page_number = item['physical_index']
    page_text = page_list[page_number - start_index][0]

    prompt = f"""
    Your job is to check if the given section appears or starts in the given page_text.

    Note: do fuzzy matching, ignore any space inconsistency in the page_text.

    The given section title is {title}.
    The given page_text is {page_text}.

    Reply format:
    {{

    "thinking": <why do you think the section appears or starts in the page_text>
    "answer": "yes or no" (yes if the section appears or starts in the page_text, no otherwise)
    }}
    Directly return the final JSON structure. Do not output anything else."""

    response = await ChatGPT_API_async(model=model, prompt=prompt)
    response = extract_json(response)
    # Defensive: extract_json may return a non-dict or a dict without 'answer'.
    answer = response.get('answer', 'no') if isinstance(response, dict) else 'no'
    return {'list_index': item.get('list_index'), 'answer': answer, 'title': title, 'page_number': page_number}
46
+
47
+
48
async def check_title_appearance_in_start(title, page_text, model=None, logger=None):
    """Ask the LLM whether `title` is the very first content of `page_text`.

    Returns the string "yes" or "no"; defaults to "no" when the LLM reply
    lacks a "start_begin" field.
    """
    prompt = f"""
    You will be given the current section title and the current page_text.
    Your job is to check if the current section starts in the beginning of the given page_text.
    If there are other contents before the current section title, then the current section does not start in the beginning of the given page_text.
    If the current section title is the first content in the given page_text, then the current section starts in the beginning of the given page_text.

    Note: do fuzzy matching, ignore any space inconsistency in the page_text.

    The given section title is {title}.
    The given page_text is {page_text}.

    reply format:
    {{
    "thinking": <why do you think the section appears or starts in the page_text>
    "start_begin": "yes or no" (yes if the section starts in the beginning of the page_text, no otherwise)
    }}
    Directly return the final JSON structure. Do not output anything else."""

    response = await ChatGPT_API_async(model=model, prompt=prompt)
    response = extract_json(response)
    if logger:
        logger.info(f"Response: {response}")
    # Missing field is treated as "does not start at the top of the page".
    return response.get("start_begin", "no")
72
+
73
+
74
async def check_title_appearance_in_start_concurrent(structure, page_list, model=None, logger=None):
    """Run check_title_appearance_in_start for every TOC item concurrently.

    Items without a physical_index are marked 'no' up front; the rest are
    checked in parallel via asyncio.gather. A failed check is logged (when a
    logger is provided) and downgraded to 'no' rather than propagated.
    Mutates each item in `structure` by setting item['appear_start'] and
    returns the same list.
    """
    if logger:
        logger.info("Checking title appearance in start concurrently")

    # skip items without physical_index
    for item in structure:
        if item.get('physical_index') is None:
            item['appear_start'] = 'no'

    # only for items with valid physical_index
    tasks = []
    valid_items = []
    for item in structure:
        if item.get('physical_index') is not None:
            # NOTE(review): indexes page_list with physical_index - 1, i.e.
            # assumes start_index == 1 — confirm against callers.
            page_text = page_list[item['physical_index'] - 1][0]
            tasks.append(check_title_appearance_in_start(item['title'], page_text, model=model, logger=logger))
            valid_items.append(item)

    # return_exceptions=True keeps one failed page check from cancelling the rest.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for item, result in zip(valid_items, results):
        if isinstance(result, Exception):
            if logger:
                logger.error(f"Error checking start for {item['title']}: {result}")
            item['appear_start'] = 'no'
        else:
            item['appear_start'] = result

    return structure
102
+
103
+
104
def toc_detector_single_page(content, model=None):
    """Ask the LLM whether a single page's text contains a table of contents.

    Returns the model's 'toc_detected' value ("yes" or "no"). Raises KeyError
    if the reply JSON lacks that field.
    """
    prompt = f"""
    Your job is to detect if there is a table of content provided in the given text.

    Given text: {content}

    return the following JSON format:
    {{
    "thinking": <why do you think there is a table of content in the given text>
    "toc_detected": "<yes or no>",
    }}

    Directly return the final JSON structure. Do not output anything else.
    Please note: abstract,summary, notation list, figure list, table list, etc. are not table of contents."""

    response = ChatGPT_API(model=model, prompt=prompt)
    # print('response', response)
    json_content = extract_json(response)
    return json_content['toc_detected']
123
+
124
+
125
def check_if_toc_extraction_is_complete(content, toc, model=None):
    """Ask the LLM whether `toc` covers all main sections of `content`.

    Returns the model's 'completed' value ("yes" or "no").
    """
    prompt = f"""
    You are given a partial document and a table of contents.
    Your job is to check if the table of contents is complete, which it contains all the main sections in the partial document.

    Reply format:
    {{
    "thinking": <why do you think the table of contents is complete or not>
    "completed": "yes" or "no"
    }}
    Directly return the final JSON structure. Do not output anything else."""

    # Document and TOC are appended after the instructions to keep the
    # reply-format block at the top of the prompt.
    prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
    response = ChatGPT_API(model=model, prompt=prompt)
    json_content = extract_json(response)
    return json_content['completed']
141
+
142
+
143
def check_if_toc_transformation_is_complete(content, toc, model=None):
    """Ask the LLM whether a transformed/cleaned TOC covers the raw TOC.

    Used to detect truncated LLM output during TOC extraction/transformation.
    Returns the model's 'completed' value ("yes" or "no").
    """
    prompt = f"""
    You are given a raw table of contents and a table of contents.
    Your job is to check if the table of contents is complete.

    Reply format:
    {{
    "thinking": <why do you think the cleaned table of contents is complete or not>
    "completed": "yes" or "no"
    }}
    Directly return the final JSON structure. Do not output anything else."""

    prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
    response = ChatGPT_API(model=model, prompt=prompt)
    json_content = extract_json(response)
    return json_content['completed']
159
+
160
def extract_toc_content(content, model=None):
    """Extract the full table-of-contents text from raw pages via the LLM.

    The model sometimes truncates long outputs, so after the initial request
    we repeatedly ask it to continue and append each chunk until the
    completeness check passes.

    Fix: the original's safety check `len(chat_history) > 5` could never
    trigger — chat_history was rebuilt with exactly two entries on every
    pass — so an uncooperative document looped forever. A real attempt
    counter now bounds the retries.

    Raises:
        Exception: if the TOC is still incomplete after the retry budget.
    """
    prompt = f"""
    Your job is to extract the full table of contents from the given text, replace ... with :

    Given text: {content}

    Directly return the full table of contents content. Do not output anything else."""

    response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
    if_complete = check_if_toc_transformation_is_complete(content, response, model)

    max_attempts = 5
    attempts = 0
    while not (if_complete == "yes" and finish_reason == "finished"):
        if attempts >= max_attempts:
            raise Exception('Failed to complete table of contents after maximum retries')
        chat_history = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": response},
        ]
        prompt = f"""please continue the generation of table of contents , directly output the remaining part of the structure"""
        new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, chat_history=chat_history)
        response = response + new_response
        if_complete = check_if_toc_transformation_is_complete(content, response, model)
        attempts += 1

    return response
198
+
199
def detect_page_index(toc_content, model=None):
    """Ask the LLM whether the extracted TOC text contains page numbers.

    Returns the model's 'page_index_given_in_toc' value ("yes" or "no").
    """
    print('start detect_page_index')
    prompt = f"""
    You will be given a table of contents.

    Your job is to detect if there are page numbers/indices given within the table of contents.

    Given text: {toc_content}

    Reply format:
    {{
    "thinking": <why do you think there are page numbers/indices given within the table of contents>
    "page_index_given_in_toc": "<yes or no>"
    }}
    Directly return the final JSON structure. Do not output anything else."""

    response = ChatGPT_API(model=model, prompt=prompt)
    json_content = extract_json(response)
    return json_content['page_index_given_in_toc']
218
+
219
def toc_extractor(page_list, toc_page_list, model):
    """Collect the text of the detected TOC pages and probe it for page numbers.

    Returns a dict with the concatenated, dot-leader-normalized TOC text and
    a "yes"/"no" flag saying whether page numbers appear in it.
    """
    def normalize_dot_leaders(text):
        # Long runs of dots ("......") or spaced dots (". . . . .") are TOC
        # leader lines; collapse them to ": " so title and page number join up.
        text = re.sub(r'\.{5,}', ': ', text)
        # Handle dots separated by spaces
        text = re.sub(r'(?:\. ){5,}\.?', ': ', text)
        return text

    raw_text = "".join(page_list[idx][0] for idx in toc_page_list)
    toc_content = normalize_dot_leaders(raw_text)
    has_page_index = detect_page_index(toc_content, model=model)

    return {
        "toc_content": toc_content,
        "page_index_given_in_toc": has_page_index
    }
236
+
237
+
238
+
239
+
240
def toc_index_extractor(toc, content, model=None):
    """Ask the LLM to attach physical page indices to TOC entries.

    `content` is page text carrying <physical_index_X> markers; only the
    sections actually present in `content` receive a physical_index.
    Returns the parsed JSON list from the model.
    """
    print('start toc_index_extractor')
    tob_extractor_prompt = """
    You are given a table of contents in a json format and several pages of a document, your job is to add the physical_index to the table of contents in the json format.

    The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.

    The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.

    The response should be in the following JSON format:
    [
        {
            "structure": <structure index, "x.x.x" or None> (string),
            "title": <title of the section>,
            "physical_index": "<physical_index_X>" (keep the format)
        },
        ...
    ]

    Only add the physical_index to the sections that are in the provided pages.
    If the section is not in the provided pages, do not add the physical_index to it.
    Directly return the final JSON structure. Do not output anything else."""

    prompt = tob_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content
    response = ChatGPT_API(model=model, prompt=prompt)
    json_content = extract_json(response)
    return json_content
267
+
268
+
269
+
270
def toc_transformer(toc_content, model=None):
    """Transform raw TOC text into a structured JSON list via the LLM.

    Asks the model for the whole structure in one shot; if the output is
    truncated or judged incomplete, repeatedly asks it to continue from the
    last complete JSON object and concatenates the pieces.
    Returns the 'table_of_contents' list with page values coerced to int
    (via convert_page_to_int).
    """
    print('start toc_transformer')
    init_prompt = """
    You are given a table of contents, You job is to transform the whole table of content into a JSON format included table_of_contents.

    structure is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.

    The response should be in the following JSON format:
    {
        table_of_contents: [
            {
                "structure": <structure index, "x.x.x" or None> (string),
                "title": <title of the section>,
                "page": <page number or None>,
            },
            ...
        ],
    }
    You should transform the full table of contents in one go.
    Directly return the final JSON structure, do not output anything else. """

    prompt = init_prompt + '\n Given table of contents\n:' + toc_content
    last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
    if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
    if if_complete == "yes" and finish_reason == "finished":
        # Happy path: one-shot generation was complete and untruncated.
        last_complete = extract_json(last_complete)
        cleaned_response=convert_page_to_int(last_complete['table_of_contents'])
        return cleaned_response

    last_complete = get_json_content(last_complete)
    while not (if_complete == "yes" and finish_reason == "finished"):
        # Trim to just past the last complete JSON object so the model can
        # continue from a well-formed prefix.
        position = last_complete.rfind('}')
        if position != -1:
            last_complete = last_complete[:position+2]
        prompt = f"""
        Your task is to continue the table of contents json structure, directly output the remaining part of the json structure.
        The response should be in the following JSON format:

        The raw table of contents json structure is:
        {toc_content}

        The incomplete transformed table of contents json structure is:
        {last_complete}

        Please continue the json structure, directly output the remaining part of the json structure."""

        new_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)

        # Strip a markdown fence if the model wrapped its continuation.
        if new_complete.startswith('```json'):
            new_complete = get_json_content(new_complete)
        last_complete = last_complete+new_complete

        if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)

    # NOTE(review): unlike extract_toc_content there is no retry cap here, so a
    # persistently incomplete transformation loops indefinitely — confirm.
    last_complete = json.loads(last_complete)

    cleaned_response=convert_page_to_int(last_complete['table_of_contents'])
    return cleaned_response
329
+
330
+
331
+
332
+
333
def find_toc_pages(start_page_index, page_list, opt, logger=None):
    """Scan forward from start_page_index for a contiguous run of TOC pages.

    Each page is classified by the LLM detector. Scanning stops at
    opt.toc_check_page_num unless a TOC run is already in progress, and ends
    as soon as the run is broken by a non-TOC page.
    Returns the (possibly empty) list of page indices holding the TOC.
    """
    print('start find_toc_pages')
    toc_pages = []
    in_toc_run = False
    page_idx = start_page_index

    while page_idx < len(page_list):
        # Past the scan budget we only keep going while inside a TOC run.
        if page_idx >= opt.toc_check_page_num and not in_toc_run:
            break
        verdict = toc_detector_single_page(page_list[page_idx][0], model=opt.model)
        if verdict == 'yes':
            if logger:
                logger.info(f'Page {page_idx} has toc')
            toc_pages.append(page_idx)
            in_toc_run = True
        elif verdict == 'no' and in_toc_run:
            if logger:
                logger.info(f'Found the last page with toc: {page_idx - 1}')
            break
        page_idx += 1

    if not toc_pages and logger:
        logger.info('No toc found')

    return toc_pages
359
+
360
def remove_page_number(data):
    """Recursively strip 'page_number' keys from a TOC tree, in place.

    Recurses through list elements and through any dict value whose key name
    contains 'nodes'. Returns the same object for call chaining.
    """
    if isinstance(data, list):
        for element in data:
            remove_page_number(element)
    elif isinstance(data, dict):
        data.pop('page_number', None)
        for key in list(data.keys()):
            if 'nodes' in key:
                remove_page_number(data[key])
    return data
370
+
371
def extract_matching_page_pairs(toc_page, toc_physical_index, start_page_index):
    """Pair TOC entries that share a title across the two extractions.

    For every title present in both lists, collect (title, printed page,
    physical index) triples, keeping only physical indices at or after
    start_page_index (earlier hits would point into the TOC pages themselves).
    """
    pairs = []
    for entry_phys in toc_physical_index:
        for entry_page in toc_page:
            if entry_phys.get('title') != entry_page.get('title'):
                continue
            phys = entry_phys.get('physical_index')
            if phys is not None and int(phys) >= start_page_index:
                pairs.append({
                    'title': entry_phys.get('title'),
                    'page': entry_page.get('page'),
                    'physical_index': phys
                })
    return pairs
384
+
385
+
386
def calculate_page_offset(pairs):
    """Return the most common (physical_index - printed page) difference.

    Malformed pairs (missing keys or non-numeric values) are skipped.
    Returns None when no usable pair exists; ties are resolved in favour of
    the difference seen first.
    """
    counts = {}
    for pair in pairs:
        try:
            delta = pair['physical_index'] - pair['page']
        except (KeyError, TypeError):
            continue
        counts[delta] = counts.get(delta, 0) + 1

    if not counts:
        return None

    return max(counts.items(), key=lambda kv: kv[1])[0]
407
+
408
def add_page_offset_to_toc_json(data, offset):
    """Convert printed page numbers to physical indices by adding `offset`.

    Each entry with an int 'page' gains 'physical_index' and loses 'page';
    entries whose 'page' is missing, None, or non-int are left untouched so
    process_none_page_numbers can resolve them later.

    Fix: `offset` is produced by calculate_page_offset, which returns None
    when no title pair matched; the original then raised TypeError on the
    first addition. The data is now returned unchanged in that case.
    """
    if offset is None:
        return data
    for entry in data:
        page = entry.get('page')
        # None is not an int, so this single check covers the original's
        # "is not None and isinstance(..., int)" pair.
        if isinstance(page, int):
            entry['physical_index'] = page + offset
            del entry['page']

    return data
415
+
416
+
417
+
418
def page_list_to_group_text(page_contents, token_lengths, max_tokens=20000, overlap_page=1):
    """Split tagged page texts into chunks that fit an LLM context window.

    When everything fits within max_tokens a single merged chunk is returned.
    Otherwise pages are packed greedily toward a balanced per-chunk budget,
    and each new chunk re-includes the last `overlap_page` page(s) of the
    previous one so a section spanning a boundary is not lost.
    """
    total_tokens = sum(token_lengths)

    if total_tokens <= max_tokens:
        # Everything fits: merge all pages into one text.
        return ["".join(page_contents)]

    # Aim between an even split and the hard cap so chunks come out balanced.
    part_count = math.ceil(total_tokens / max_tokens)
    budget = math.ceil(((total_tokens / part_count) + max_tokens) / 2)

    groups = []
    pending = []
    pending_tokens = 0

    for idx, (text, tokens) in enumerate(zip(page_contents, token_lengths)):
        if pending_tokens + tokens > budget:
            groups.append(''.join(pending))
            # Seed the next chunk with the trailing overlap pages.
            carry_from = max(idx - overlap_page, 0)
            pending = page_contents[carry_from:idx]
            pending_tokens = sum(token_lengths[carry_from:idx])

        pending.append(text)
        pending_tokens += tokens

    if pending:
        groups.append(''.join(pending))

    print('divide page_list to groups', len(groups))
    return groups
452
+
453
def add_page_number_to_toc(part, structure, model=None):
    """Ask the LLM which TOC sections start within a chunk of tagged pages.

    `part` is page text carrying <physical_index_X> markers; `structure`
    accumulates results across chunks (earlier fills must be preserved by
    the model). The transient "start" field the model returns is stripped
    before the list is handed back.
    """
    fill_prompt_seq = """
    You are given an JSON structure of a document and a partial part of the document. Your task is to check if the title that is described in the structure is started in the partial given document.

    The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.

    If the full target section starts in the partial given document, insert the given JSON structure with the "start": "yes", and "start_index": "<physical_index_X>".

    If the full target section does not start in the partial given document, insert "start": "no", "start_index": None.

    The response should be in the following format.
    [
        {
            "structure": <structure index, "x.x.x" or None> (string),
            "title": <title of the section>,
            "start": "<yes or no>",
            "physical_index": "<physical_index_X> (keep the format)" or None
        },
        ...
    ]
    The given structure contains the result of the previous part, you need to fill the result of the current part, do not change the previous result.
    Directly return the final JSON structure. Do not output anything else."""

    prompt = fill_prompt_seq + f"\n\nCurrent Partial Document:\n{part}\n\nGiven Structure\n{json.dumps(structure, indent=2)}\n"
    current_json_raw = ChatGPT_API(model=model, prompt=prompt)
    json_result = extract_json(current_json_raw)

    # Drop the helper "start" flag; callers only need physical_index.
    for item in json_result:
        if 'start' in item:
            del item['start']
    return json_result
484
+
485
+
486
def remove_first_physical_index_section(text):
    """Drop the first page span delimited by a pair of <physical_index_X> tags.

    Returns the remaining text; the input is returned unchanged when no
    complete tag pair is found.
    """
    first_span = re.search(r'<physical_index_\d+>.*?<physical_index_\d+>', text, re.DOTALL)
    if first_span is None:
        return text
    return text.replace(first_span.group(0), '', 1)
497
+
498
+ ### add verify completeness
499
def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
    """Extend a previously generated TOC tree with sections from the next chunk.

    Returns ONLY the additional entries found in `part` (callers merge them).

    Raises:
        Exception: when the model response is truncated (finish reason is
            not 'finished').
    """
    print('start generate_toc_continue')
    prompt = """
    You are an expert in extracting hierarchical tree structure.
    You are given a tree structure of the previous part and the text of the current part.
    Your task is to continue the tree structure from the previous part to include the current part.

    The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.

    For the title, you need to extract the original title from the text, only fix the space inconsistency.

    The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X. \

    For the physical_index, you need to extract the physical index of the start of the section from the text. Keep the <physical_index_X> format.

    The response should be in the following format.
    [
        {
            "structure": <structure index, "x.x.x"> (string),
            "title": <title of the section, keep the original title>,
            "physical_index": "<physical_index_X> (keep the format)"
        },
        ...
    ]

    Directly return the additional part of the final JSON structure. Do not output anything else."""

    prompt = prompt + '\nGiven text\n:' + part + '\nPrevious tree structure\n:' + json.dumps(toc_content, indent=2)
    response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)
    if finish_reason == 'finished':
        return extract_json(response)
    else:
        raise Exception(f'finish reason: {finish_reason}')
532
+
533
+ ### add verify completeness
534
def generate_toc_init(part, model=None):
    """Generate the initial TOC tree from the first chunk of tagged page text.

    Returns the parsed JSON list of {structure, title, physical_index} dicts.

    Raises:
        Exception: when the model response is truncated (finish reason is
            not 'finished').
    """
    print('start generate_toc_init')
    prompt = """
    You are an expert in extracting hierarchical tree structure, your task is to generate the tree structure of the document.

    The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.

    For the title, you need to extract the original title from the text, only fix the space inconsistency.

    The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X.

    For the physical_index, you need to extract the physical index of the start of the section from the text. Keep the <physical_index_X> format.

    The response should be in the following format.
    [
        {{
            "structure": <structure index, "x.x.x"> (string),
            "title": <title of the section, keep the original title>,
            "physical_index": "<physical_index_X> (keep the format)"
        }},

    ],


    Directly return the final JSON structure. Do not output anything else."""

    prompt = prompt + '\nGiven text\n:' + part
    response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt)

    if finish_reason == 'finished':
        return extract_json(response)
    else:
        raise Exception(f'finish reason: {finish_reason}')
567
+
568
def process_no_toc(page_list, start_index=1, model=None, logger=None):
    """Build a TOC from scratch when the document has no printed TOC.

    Tags every page with <physical_index_X> markers, groups pages into
    context-window-sized chunks, generates an initial tree from the first
    chunk and extends it chunk by chunk.

    Fix: `logger` defaults to None but was dereferenced unconditionally,
    raising AttributeError whenever no logger was supplied; logging is now
    guarded, matching the rest of the module.
    """
    page_contents = []
    token_lengths = []
    for page_index in range(start_index, start_index + len(page_list)):
        page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
        page_contents.append(page_text)
        token_lengths.append(count_tokens(page_text, model))
    group_texts = page_list_to_group_text(page_contents, token_lengths)
    if logger:
        logger.info(f'len(group_texts): {len(group_texts)}')

    toc_with_page_number = generate_toc_init(group_texts[0], model)
    for group_text in group_texts[1:]:
        toc_with_page_number_additional = generate_toc_continue(toc_with_page_number, group_text, model)
        toc_with_page_number.extend(toc_with_page_number_additional)
    if logger:
        logger.info(f'generate_toc: {toc_with_page_number}')

    toc_with_page_number = convert_physical_index_to_int(toc_with_page_number)
    if logger:
        logger.info(f'convert_physical_index_to_int: {toc_with_page_number}')

    return toc_with_page_number
588
+
589
def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None):
    """Resolve physical page indices for a printed TOC that lacks page numbers.

    Transforms the raw TOC text into structured entries, tags every page with
    <physical_index_X> markers, then sweeps the chunked document asking the
    LLM where each section starts.

    Fix: `logger` defaults to None but was dereferenced unconditionally,
    raising AttributeError whenever no logger was supplied; logging is now
    guarded, matching the rest of the module.
    """
    page_contents = []
    token_lengths = []
    toc_content = toc_transformer(toc_content, model)
    if logger:
        logger.info(f'toc_transformer: {toc_content}')
    for page_index in range(start_index, start_index + len(page_list)):
        page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
        page_contents.append(page_text)
        token_lengths.append(count_tokens(page_text, model))

    group_texts = page_list_to_group_text(page_contents, token_lengths)
    if logger:
        logger.info(f'len(group_texts): {len(group_texts)}')

    # Deep copy so repeated LLM passes never mutate the transformed TOC input.
    toc_with_page_number = copy.deepcopy(toc_content)
    for group_text in group_texts:
        toc_with_page_number = add_page_number_to_toc(group_text, toc_with_page_number, model)
    if logger:
        logger.info(f'add_page_number_to_toc: {toc_with_page_number}')

    toc_with_page_number = convert_physical_index_to_int(toc_with_page_number)
    if logger:
        logger.info(f'convert_physical_index_to_int: {toc_with_page_number}')

    return toc_with_page_number
611
+
612
+
613
+
614
def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=None, model=None, logger=None):
    """Resolve physical indices for a printed TOC that includes page numbers.

    Strategy: extract physical indices for a sample of pages after the TOC,
    match them to printed page numbers by title, derive the dominant
    printed-to-physical offset, apply it to every entry, then patch any
    still-unresolved entries individually.

    Fix: `logger` defaults to None but was dereferenced unconditionally,
    raising AttributeError whenever no logger was supplied; logging is now
    guarded, matching the rest of the module.
    """
    toc_with_page_number = toc_transformer(toc_content, model)
    if logger:
        logger.info(f'toc_with_page_number: {toc_with_page_number}')

    toc_no_page_number = remove_page_number(copy.deepcopy(toc_with_page_number))

    start_page_index = toc_page_list[-1] + 1
    main_content = ""
    for page_index in range(start_page_index, min(start_page_index + toc_check_page_num, len(page_list))):
        main_content += f"<physical_index_{page_index+1}>\n{page_list[page_index][0]}\n<physical_index_{page_index+1}>\n\n"

    toc_with_physical_index = toc_index_extractor(toc_no_page_number, main_content, model)
    if logger:
        logger.info(f'toc_with_physical_index: {toc_with_physical_index}')

    toc_with_physical_index = convert_physical_index_to_int(toc_with_physical_index)
    if logger:
        logger.info(f'toc_with_physical_index: {toc_with_physical_index}')

    matching_pairs = extract_matching_page_pairs(toc_with_page_number, toc_with_physical_index, start_page_index)
    if logger:
        logger.info(f'matching_pairs: {matching_pairs}')

    # NOTE(review): offset can be None when no titles matched — confirm that
    # add_page_offset_to_toc_json tolerates it.
    offset = calculate_page_offset(matching_pairs)
    if logger:
        logger.info(f'offset: {offset}')

    toc_with_page_number = add_page_offset_to_toc_json(toc_with_page_number, offset)
    if logger:
        logger.info(f'toc_with_page_number: {toc_with_page_number}')

    toc_with_page_number = process_none_page_numbers(toc_with_page_number, page_list, model=model)
    if logger:
        logger.info(f'toc_with_page_number: {toc_with_page_number}')

    return toc_with_page_number
644
+
645
+
646
+
647
+ ##check if needed to process none page numbers
648
def process_none_page_numbers(toc_items, page_list, start_index=1, model=None):
    """Resolve 'physical_index' for TOC entries the offset pass left unset.

    For each entry missing 'physical_index', gathers the page span between
    its nearest resolved neighbours and asks the LLM where the section
    starts within that span. Mutates and returns toc_items.
    """
    for i, item in enumerate(toc_items):
        if "physical_index" not in item:
            # logger.info(f"fix item: {item}")
            # Find previous physical_index
            prev_physical_index = 0  # Default if no previous item exists
            for j in range(i - 1, -1, -1):
                if toc_items[j].get('physical_index') is not None:
                    prev_physical_index = toc_items[j]['physical_index']
                    break

            # Find next physical_index
            next_physical_index = -1  # Default if no next item exists
            for j in range(i + 1, len(toc_items)):
                if toc_items[j].get('physical_index') is not None:
                    next_physical_index = toc_items[j]['physical_index']
                    break

            page_contents = []
            for page_index in range(prev_physical_index, next_physical_index+1):
                # Add bounds checking to prevent IndexError
                list_index = page_index - start_index
                if list_index >= 0 and list_index < len(page_list):
                    page_text = f"<physical_index_{page_index}>\n{page_list[list_index][0]}\n<physical_index_{page_index}>\n\n"
                    page_contents.append(page_text)
                else:
                    continue

            item_copy = copy.deepcopy(item)
            del item_copy['page']
            # NOTE(review): page_contents is a LIST, but add_page_number_to_toc
            # interpolates its `part` argument into an f-string prompt, so the
            # model sees a Python list repr — presumably ''.join(page_contents)
            # was intended; confirm before changing.
            result = add_page_number_to_toc(page_contents, item_copy, model)
            if isinstance(result[0]['physical_index'], str) and result[0]['physical_index'].startswith('<physical_index'):
                item['physical_index'] = int(result[0]['physical_index'].split('_')[-1].rstrip('>').strip())
                del item['page']

    return toc_items
684
+
685
+
686
+
687
+
688
def check_toc(page_list, opt=None):
    """Locate a printed TOC in the document and classify it.

    Returns a dict with:
      - toc_content: concatenated TOC text, or None when no TOC exists
      - toc_page_list: indices of the pages holding the TOC
      - page_index_given_in_toc: 'yes' when the TOC lists page numbers

    When the first detected TOC block has no page numbers, keeps scanning
    (within opt.toc_check_page_num pages) for a later TOC block that does.
    """
    toc_page_list = find_toc_pages(start_page_index=0, page_list=page_list, opt=opt)
    if len(toc_page_list) == 0:
        print('no toc found')
        return {'toc_content': None, 'toc_page_list': [], 'page_index_given_in_toc': 'no'}
    else:
        print('toc found')
        toc_json = toc_extractor(page_list, toc_page_list, opt.model)

        if toc_json['page_index_given_in_toc'] == 'yes':
            print('index found')
            return {'toc_content': toc_json['toc_content'], 'toc_page_list': toc_page_list, 'page_index_given_in_toc': 'yes'}
        else:
            # No page numbers in the first TOC block: look for a later block
            # (e.g. a detailed TOC after a brief one) that has them.
            current_start_index = toc_page_list[-1] + 1

            while (toc_json['page_index_given_in_toc'] == 'no' and
                   current_start_index < len(page_list) and
                   current_start_index < opt.toc_check_page_num):

                additional_toc_pages = find_toc_pages(
                    start_page_index=current_start_index,
                    page_list=page_list,
                    opt=opt
                )

                if len(additional_toc_pages) == 0:
                    break

                additional_toc_json = toc_extractor(page_list, additional_toc_pages, opt.model)
                if additional_toc_json['page_index_given_in_toc'] == 'yes':
                    print('index found')
                    return {'toc_content': additional_toc_json['toc_content'], 'toc_page_list': additional_toc_pages, 'page_index_given_in_toc': 'yes'}

                else:
                    current_start_index = additional_toc_pages[-1] + 1
            # Fall back to the first TOC block, flagged as lacking page numbers.
            print('index not found')
            return {'toc_content': toc_json['toc_content'], 'toc_page_list': toc_page_list, 'page_index_given_in_toc': 'no'}
725
+
726
+
727
+
728
+
729
+
730
+
731
+ ################### fix incorrect toc #########################################################
732
def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"):
    """Ask the LLM on which tagged physical page `section_title` begins.

    `content` is a concatenation of pages wrapped in <physical_index_X> tags;
    the answer is converted to a plain int physical index.
    """
    locate_section_prompt = """
    You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.

    The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.

    Reply in a JSON format:
    {
        "thinking": <explain which page, started and closed by <physical_index_X>, contains the start of this section>,
        "physical_index": "<physical_index_X>" (keep the format)
    }
    Directly return the final JSON structure. Do not output anything else."""

    full_prompt = (
        locate_section_prompt
        + '\nSection Title:\n' + str(section_title)
        + '\nDocument pages:\n' + content
    )
    raw_answer = ChatGPT_API(model=model, prompt=full_prompt)
    parsed = extract_json(raw_answer)
    return convert_physical_index_to_int(parsed['physical_index'])
749
+
750
+
751
+
752
async def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_index=1, model=None, logger=None):
    """Re-resolve the physical_index of TOC entries that failed verification.

    For each failing entry, the search is narrowed to the page range between
    the nearest verified neighbours, an LLM re-locates the section start
    within that range, and the new answer is verified again before being
    written back.

    Args:
        toc_with_page_number: full TOC list; valid fixes are applied in place.
        page_list: list of (page_text, token_count) tuples.
        incorrect_results: dicts with 'list_index', 'title', 'physical_index'.
        start_index: physical index corresponding to page_list[0].
        model: LLM model name passed to the fixer/checker helpers.
        logger: logger used for diagnostics (must not be None).

    Returns:
        (toc_with_page_number, invalid_results) where invalid_results lists
        entries that still failed after this pass.
    """
    print(f'start fix_incorrect_toc with {len(incorrect_results)} incorrect results')
    incorrect_indices = {result['list_index'] for result in incorrect_results}

    end_index = len(page_list) + start_index - 1

    incorrect_results_and_range_logs = []

    async def process_and_check_item(incorrect_item):
        """Fix a single entry and re-verify it; returns a dict with 'is_valid'."""
        list_index = incorrect_item['list_index']

        # Out-of-bounds indices cannot be fixed; report them as invalid.
        if list_index < 0 or list_index >= len(toc_with_page_number):
            return {
                'list_index': list_index,
                'title': incorrect_item['title'],
                'physical_index': incorrect_item.get('physical_index'),
                'is_valid': False
            }

        # Lower bound of the search range: nearest earlier verified entry.
        prev_correct = None
        for i in range(list_index - 1, -1, -1):
            if i not in incorrect_indices and 0 <= i < len(toc_with_page_number):
                physical_index = toc_with_page_number[i].get('physical_index')
                if physical_index is not None:
                    prev_correct = physical_index
                    break
        if prev_correct is None:
            prev_correct = start_index - 1

        # Upper bound of the search range: nearest later verified entry.
        next_correct = None
        for i in range(list_index + 1, len(toc_with_page_number)):
            if i not in incorrect_indices and 0 <= i < len(toc_with_page_number):
                physical_index = toc_with_page_number[i].get('physical_index')
                if physical_index is not None:
                    next_correct = physical_index
                    break
        if next_correct is None:
            next_correct = end_index

        incorrect_results_and_range_logs.append({
            'list_index': list_index,
            'title': incorrect_item['title'],
            'prev_correct': prev_correct,
            'next_correct': next_correct
        })

        # Collect the candidate pages, tagged with their physical indices.
        page_contents = []
        for page_index in range(prev_correct, next_correct + 1):
            # BUGFIX: the original reassigned `list_index` here, so the
            # 'list_index' returned below pointed at the last page offset
            # instead of the TOC entry, corrupting the write-back step.
            page_offset = page_index - start_index
            if 0 <= page_offset < len(page_list):  # bounds check to prevent IndexError
                page_contents.append(
                    f"<physical_index_{page_index}>\n{page_list[page_offset][0]}\n<physical_index_{page_index}>\n\n"
                )
        content_range = ''.join(page_contents)

        physical_index_int = single_toc_item_index_fixer(incorrect_item['title'], content_range, model)

        # Re-verify the fixed index before accepting it.
        check_item = incorrect_item.copy()
        check_item['physical_index'] = physical_index_int
        check_result = await check_title_appearance(check_item, page_list, start_index, model)

        return {
            'list_index': list_index,
            'title': incorrect_item['title'],
            'physical_index': physical_index_int,
            'is_valid': check_result['answer'] == 'yes'
        }

    # Process all incorrect items concurrently; an exception in one item must
    # not abort the others.
    tasks = [
        process_and_check_item(item)
        for item in incorrect_results
    ]
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for item, result in zip(incorrect_results, results):
        if isinstance(result, Exception):
            print(f"Processing item {item} generated an exception: {result}")
            continue
    results = [result for result in results if not isinstance(result, Exception)]

    # Apply valid fixes in place; everything else stays in invalid_results.
    invalid_results = []
    for result in results:
        list_idx = result['list_index']
        if result['is_valid'] and 0 <= list_idx < len(toc_with_page_number):
            toc_with_page_number[list_idx]['physical_index'] = result['physical_index']
        else:
            invalid_results.append({
                'list_index': result['list_index'],
                'title': result['title'],
                'physical_index': result['physical_index'],
            })

    logger.info(f'incorrect_results_and_range_logs: {incorrect_results_and_range_logs}')
    logger.info(f'invalid_results: {invalid_results}')

    return toc_with_page_number, invalid_results
867
+
868
+
869
+
870
async def fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_results, start_index=1, max_attempts=3, model=None, logger=None):
    """Run fix_incorrect_toc repeatedly until no entries remain incorrect or
    max_attempts passes have been made.

    Returns the (possibly updated) TOC list and any entries still incorrect.
    """
    print('start fix_incorrect_toc')
    attempts_done = 0
    toc_state = toc_with_page_number
    pending = incorrect_results

    while pending:
        print(f"Fixing {len(pending)} incorrect results")

        toc_state, pending = await fix_incorrect_toc(toc_state, page_list, pending, start_index, model, logger)

        attempts_done += 1
        if attempts_done >= max_attempts:
            logger.info("Maximum fix attempts reached")
            break

    return toc_state, pending
887
+
888
+
889
+
890
+
891
+ ################### verify toc #########################################################
892
async def verify_toc(page_list, list_result, start_index=1, N=None, model=None):
    """Check that TOC titles actually appear on the pages their
    physical_index points to.

    When N is None every entry is checked; otherwise N random entries are
    sampled. Entries whose physical_index is None are skipped.

    Returns (accuracy, incorrect_results).
    """
    print('start verify_toc')

    # Sanity check: if the last resolved index only reaches the first half of
    # the document, the TOC is clearly unusable — skip verification entirely.
    last_physical_index = next(
        (entry['physical_index'] for entry in reversed(list_result)
         if entry.get('physical_index') is not None),
        None,
    )
    if last_physical_index is None or last_physical_index < len(page_list) / 2:
        return 0, []

    if N is None:
        print('check all items')
        chosen_indices = range(0, len(list_result))
    else:
        N = min(N, len(list_result))
        print(f'check {N} items')
        chosen_indices = random.sample(range(0, len(list_result)), N)

    # Tag each sampled entry with its position in list_result so failures can
    # be traced back for fixing.
    indexed_sample_list = []
    for idx in chosen_indices:
        entry = list_result[idx]
        # Entries invalidated earlier (physical_index None) are not checkable.
        if entry.get('physical_index') is not None:
            tagged = entry.copy()
            tagged['list_index'] = idx
            indexed_sample_list.append(tagged)

    outcomes = await asyncio.gather(*(
        check_title_appearance(entry, page_list, start_index, model)
        for entry in indexed_sample_list
    ))

    incorrect_results = [outcome for outcome in outcomes if outcome['answer'] != 'yes']
    checked_count = len(outcomes)
    correct_count = checked_count - len(incorrect_results)
    accuracy = correct_count / checked_count if checked_count > 0 else 0
    print(f"accuracy: {accuracy*100:.2f}%")
    return accuracy, incorrect_results
945
+
946
+
947
+
948
+
949
+
950
+ ################### main process #########################################################
951
async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=None, start_index=1, opt=None, logger=None):
    """Produce a verified TOC for `page_list`, falling back through
    progressively weaker strategies.

    Fallback chain: 'process_toc_with_page_numbers' ->
    'process_toc_no_page_numbers' -> 'process_no_toc'. Raises when even the
    weakest strategy cannot produce an acceptable result.

    Returns the TOC list with resolved 'physical_index' values.
    """
    print(mode)
    print(f'start_index: {start_index}')

    if mode == 'process_toc_with_page_numbers':
        toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_check_page_num=opt.toc_check_page_num, model=opt.model, logger=logger)
    elif mode == 'process_toc_no_page_numbers':
        toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger)
    else:
        toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger)

    toc_with_page_number = [item for item in toc_with_page_number if item.get('physical_index') is not None]

    # Drop indices that point past the end of a (possibly truncated) document.
    toc_with_page_number = validate_and_truncate_physical_indices(
        toc_with_page_number,
        len(page_list),
        start_index=start_index,
        logger=logger
    )

    accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model)

    # BUGFIX: log the actual mode — the original always logged the
    # hard-coded string 'process_toc_with_page_numbers'.
    logger.info({
        'mode': mode,
        'accuracy': accuracy,
        'incorrect_results': incorrect_results
    })
    if accuracy == 1.0 and len(incorrect_results) == 0:
        return toc_with_page_number
    if accuracy > 0.6 and len(incorrect_results) > 0:
        # Mostly correct: repair only the failing entries.
        toc_with_page_number, incorrect_results = await fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_results, start_index=start_index, max_attempts=3, model=opt.model, logger=logger)
        return toc_with_page_number
    else:
        # Too inaccurate: fall back to the next weaker strategy.
        if mode == 'process_toc_with_page_numbers':
            return await meta_processor(page_list, mode='process_toc_no_page_numbers', toc_content=toc_content, toc_page_list=toc_page_list, start_index=start_index, opt=opt, logger=logger)
        elif mode == 'process_toc_no_page_numbers':
            return await meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger)
        else:
            raise Exception('Processing failed')
990
+
991
+
992
async def process_large_node_recursively(node, page_list, opt=None, logger=None):
    """Split an oversized node by re-running structure detection on just its
    page range, then recurse into the resulting children.

    A node is "large" only when it exceeds BOTH opt.max_page_num_each_node
    (pages) and opt.max_token_num_each_node (tokens). Mutates `node` in place
    (may add 'nodes' and shrink 'end_index'); returns the same node.
    """
    node_page_list = page_list[node['start_index']-1:node['end_index']]
    # page_list items are (text, token_count) pairs — see page_index_main.
    token_num = sum([page[1] for page in node_page_list])

    if node['end_index'] - node['start_index'] > opt.max_page_num_each_node and token_num >= opt.max_token_num_each_node:
        print('large node:', node['title'], 'start_index:', node['start_index'], 'end_index:', node['end_index'], 'token_num:', token_num)

        # Detect sub-structure inside this node only (no TOC available here).
        node_toc_tree = await meta_processor(node_page_list, mode='process_no_toc', start_index=node['start_index'], opt=opt, logger=logger)
        node_toc_tree = await check_title_appearance_in_start_concurrent(node_toc_tree, page_list, model=opt.model, logger=logger)

        # Filter out items with None physical_index before post_processing
        valid_node_toc_items = [item for item in node_toc_tree if item.get('physical_index') is not None]

        # If the first detected child merely repeats this node's own title,
        # drop it and let the node's own text end where the next child begins.
        # NOTE: post_processing is called before end_index is reassigned, so
        # children are bounded by the ORIGINAL end_index — order matters here.
        if valid_node_toc_items and node['title'].strip() == valid_node_toc_items[0]['title'].strip():
            node['nodes'] = post_processing(valid_node_toc_items[1:], node['end_index'])
            node['end_index'] = valid_node_toc_items[1]['start_index'] if len(valid_node_toc_items) > 1 else node['end_index']
        else:
            node['nodes'] = post_processing(valid_node_toc_items, node['end_index'])
            node['end_index'] = valid_node_toc_items[0]['start_index'] if valid_node_toc_items else node['end_index']

    # Recurse into children (pre-existing or just created) concurrently.
    if 'nodes' in node and node['nodes']:
        tasks = [
            process_large_node_recursively(child_node, page_list, opt, logger=logger)
            for child_node in node['nodes']
        ]
        await asyncio.gather(*tasks)

    return node
1020
+
1021
async def tree_parser(page_list, opt, doc=None, logger=None):
    """Build the full TOC tree for a document.

    Detects a printed table of contents, resolves every entry to a physical
    page via meta_processor, post-processes the flat list into a tree, and
    recursively expands nodes that are still too large.
    """
    check_toc_result = check_toc(page_list, opt)
    logger.info(check_toc_result)

    # Use the numbered-TOC strategy only when a non-empty TOC with page
    # numbers was actually found; otherwise generate structure from scratch.
    if check_toc_result.get("toc_content") and check_toc_result["toc_content"].strip() and check_toc_result["page_index_given_in_toc"] == "yes":
        toc_with_page_number = await meta_processor(
            page_list,
            mode='process_toc_with_page_numbers',
            start_index=1,
            toc_content=check_toc_result['toc_content'],
            toc_page_list=check_toc_result['toc_page_list'],
            opt=opt,
            logger=logger)
    else:
        toc_with_page_number = await meta_processor(
            page_list,
            mode='process_no_toc',
            start_index=1,
            opt=opt,
            logger=logger)

    toc_with_page_number = add_preface_if_needed(toc_with_page_number)
    toc_with_page_number = await check_title_appearance_in_start_concurrent(toc_with_page_number, page_list, model=opt.model, logger=logger)

    # Filter out items with None physical_index before post_processing
    valid_toc_items = [item for item in toc_with_page_number if item.get('physical_index') is not None]

    toc_tree = post_processing(valid_toc_items, len(page_list))
    # Expand any oversized top-level nodes concurrently (mutates in place).
    tasks = [
        process_large_node_recursively(node, page_list, opt, logger=logger)
        for node in toc_tree
    ]
    await asyncio.gather(*tasks)

    return toc_tree
1056
+
1057
+
1058
def page_index_main(doc, opt=None):
    """Parse a PDF and build its PageIndex tree according to `opt` flags.

    Args:
        doc: path to a .pdf file or a BytesIO holding PDF bytes.
        opt: config namespace (model, toc_check_page_num, if_add_* flags...).

    Returns:
        dict with 'doc_name', 'structure' and, when enabled,
        'doc_description'.

    Raises:
        ValueError: if `doc` is neither a .pdf path nor a BytesIO.
    """
    logger = JsonLogger(doc)

    is_valid_pdf = (
        (isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or
        isinstance(doc, BytesIO)
    )
    if not is_valid_pdf:
        raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")

    print('Parsing PDF...')
    # page_list items are (page_text, token_count) pairs.
    page_list = get_page_tokens(doc)

    logger.info({'total_page_number': len(page_list)})
    logger.info({'total_token': sum([page[1] for page in page_list])})

    async def page_index_builder():
        # Assemble the tree, then decorate it according to the opt flags.
        structure = await tree_parser(page_list, opt, doc=doc, logger=logger)
        if opt.if_add_node_id == 'yes':
            write_node_id(structure)
        if opt.if_add_node_text == 'yes':
            add_node_text(structure, page_list)
        if opt.if_add_node_summary == 'yes':
            # Summaries need node text: attach it temporarily when the caller
            # did not ask for text, and strip it again afterwards.
            if opt.if_add_node_text == 'no':
                add_node_text(structure, page_list)
            await generate_summaries_for_structure(structure, model=opt.model)
            if opt.if_add_node_text == 'no':
                remove_structure_text(structure)
        if opt.if_add_doc_description == 'yes':
            # Create a clean structure without unnecessary fields for description generation
            clean_structure = create_clean_structure_for_description(structure)
            doc_description = generate_doc_description(clean_structure, model=opt.model)
            return {
                'doc_name': get_pdf_name(doc),
                'doc_description': doc_description,
                'structure': structure,
            }
        return {
            'doc_name': get_pdf_name(doc),
            'structure': structure,
        }

    return asyncio.run(page_index_builder())
1101
+
1102
+
1103
def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
               if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None):
    """Public entry point: build a PageIndex tree for `doc`.

    Every keyword left as None falls back to the value from the config file;
    only explicitly supplied values override it.
    """
    overrides = {
        "model": model,
        "toc_check_page_num": toc_check_page_num,
        "max_page_num_each_node": max_page_num_each_node,
        "max_token_num_each_node": max_token_num_each_node,
        "if_add_node_id": if_add_node_id,
        "if_add_node_summary": if_add_node_summary,
        "if_add_doc_description": if_add_doc_description,
        "if_add_node_text": if_add_node_text,
    }
    user_opt = {key: value for key, value in overrides.items() if value is not None}
    opt = ConfigLoader().load(user_opt)
    return page_index_main(doc, opt)
1112
+
1113
+
1114
def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length, start_index=1, logger=None):
    """
    Validate physical indices against the actual document length.

    Any TOC entry whose physical_index points past the last real page (e.g.
    because the PDF is broken or incomplete) has its physical_index reset to
    None so downstream steps ignore it.

    Args:
        toc_with_page_number: list of TOC dicts, each optionally carrying
            'physical_index'. Mutated in place.
        page_list_length: number of pages actually present in the document.
        start_index: physical index of the first page (default 1).
        logger: optional logger for per-item diagnostics.

    Returns:
        The same list (mutated in place).
    """
    if not toc_with_page_number:
        return toc_with_page_number

    max_allowed_page = page_list_length + start_index - 1
    truncated_items = []

    # (original used enumerate() but never read the index — dropped)
    for item in toc_with_page_number:
        physical_index = item.get('physical_index')
        if physical_index is not None and physical_index > max_allowed_page:
            item['physical_index'] = None
            truncated_items.append({
                'title': item.get('title', 'Unknown'),
                'original_index': physical_index
            })
            if logger:
                logger.info(f"Removed physical_index for '{item.get('title', 'Unknown')}' (was {physical_index}, too far beyond document)")

    if truncated_items and logger:
        logger.info(f"Total removed items: {len(truncated_items)}")

    print(f"Document validation: {page_list_length} pages, max allowed index: {max_allowed_page}")
    if truncated_items:
        print(f"Truncated {len(truncated_items)} TOC items that exceeded document length")

    return toc_with_page_number
pageindex/page_index_md.py ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import re
4
+ import os
5
+ try:
6
+ from .utils import *
7
+ except:
8
+ from utils import *
9
+
10
async def get_node_summary(node, summary_token_threshold=200, model=None):
    """Return the node's own text when it is short enough, otherwise an
    LLM-generated summary of it."""
    text = node.get('text')
    if count_tokens(text, model=model) < summary_token_threshold:
        return text
    return await generate_node_summary(node, model=model)
17
+
18
+
19
async def generate_summaries_for_structure_md(structure, summary_token_threshold, model=None):
    """Attach a summary to every node of the tree, concurrently.

    Leaf nodes receive 'summary'; nodes with children receive
    'prefix_summary'. Returns the same (mutated) structure.
    """
    flat_nodes = structure_to_list(structure)
    summaries = await asyncio.gather(*(
        get_node_summary(item, summary_token_threshold=summary_token_threshold, model=model)
        for item in flat_nodes
    ))

    for item, summary_text in zip(flat_nodes, summaries):
        key = 'prefix_summary' if item.get('nodes') else 'summary'
        item[key] = summary_text
    return structure
30
+
31
+
32
def extract_nodes_from_markdown(markdown_content):
    """Scan markdown for ATX headers (# through ######), ignoring anything
    inside fenced (```) code blocks.

    Returns (node_list, lines): node_list holds {'node_title', 'line_num'}
    dicts with 1-based line numbers; lines is the raw split of the content.
    """
    header_re = re.compile(r'^(#{1,6})\s+(.+)$')
    fence_re = re.compile(r'^```')

    lines = markdown_content.split('\n')
    node_list = []
    inside_fence = False

    for line_num, raw_line in enumerate(lines, 1):
        text = raw_line.strip()

        # A fence line toggles code-block state and is never a header itself.
        if fence_re.match(text):
            inside_fence = not inside_fence
            continue

        if not text or inside_fence:
            continue

        header = header_re.match(text)
        if header:
            node_list.append({'node_title': header.group(2).strip(), 'line_num': line_num})

    return node_list, lines
60
+
61
+
62
def extract_node_text_content(node_list, markdown_lines):
    """Attach header level and text content to each extracted node.

    A node's text runs from its header line up to (not including) the next
    node's header line. Nodes whose recorded line is not actually a header
    are dropped with a warning.
    """
    processed = []
    for entry in node_list:
        raw_line = markdown_lines[entry['line_num'] - 1]
        hashes = re.match(r'^(#{1,6})', raw_line)

        if hashes is None:
            print(f"Warning: Line {entry['line_num']} does not contain a valid header: '{raw_line}'")
            continue

        processed.append({
            'title': entry['node_title'],
            'line_num': entry['line_num'],
            'level': len(hashes.group(1)),
        })

    total_lines = len(markdown_lines)
    for pos, node in enumerate(processed):
        begin = node['line_num'] - 1
        if pos + 1 < len(processed):
            end = processed[pos + 1]['line_num'] - 1
        else:
            end = total_lines
        node['text'] = '\n'.join(markdown_lines[begin:end]).strip()
    return processed
88
+
89
def update_node_list_with_text_token_count(node_list, model=None):
    """Annotate each node with 'text_token_count': the token count of its own
    text plus the text of all of its descendants (determined by header level).
    """

    def find_all_children(parent_index, parent_level, node_list):
        """Find all direct and indirect children of a parent node"""
        children_indices = []

        # Look for children after the parent
        for i in range(parent_index + 1, len(node_list)):
            current_level = node_list[i]['level']

            # If we hit a node at same or higher level than parent, stop
            if current_level <= parent_level:
                break

            # This is a descendant
            children_indices.append(i)

        return children_indices

    # NOTE: this is a SHALLOW copy — the node dicts themselves are shared
    # with the input, so 'text_token_count' is also written onto the
    # caller's dicts. Only the list object is new.
    result_list = node_list.copy()

    # Process nodes from end to beginning to ensure children are processed before parents
    for i in range(len(result_list) - 1, -1, -1):
        current_node = result_list[i]
        current_level = current_node['level']

        # Get all children of this node
        children_indices = find_all_children(i, current_level, result_list)

        # Start with the node's own text
        node_text = current_node.get('text', '')
        total_text = node_text

        # Add all children's text
        for child_index in children_indices:
            child_text = result_list[child_index].get('text', '')
            if child_text:
                total_text += '\n' + child_text

        # Calculate token count for combined text
        result_list[i]['text_token_count'] = count_tokens(total_text, model=model)

    return result_list
133
+
134
+
135
def tree_thinning_for_index(node_list, min_node_token=None, model=None):
    """Collapse small sections: any node whose combined token count is below
    min_node_token absorbs the text of all its descendants, and those
    descendants are removed from the list.

    Expects nodes to already carry 'text_token_count' (see
    update_node_list_with_text_token_count). Returns a new list; the node
    dicts are shared with the input and mutated in place.
    """
    def find_all_children(parent_index, parent_level, node_list):
        # All direct and indirect descendants: contiguous following nodes
        # with a strictly deeper header level.
        children_indices = []

        for i in range(parent_index + 1, len(node_list)):
            current_level = node_list[i]['level']

            if current_level <= parent_level:
                break

            children_indices.append(i)

        return children_indices

    # Shallow copy: the list is new but node dicts are shared with the input.
    result_list = node_list.copy()
    nodes_to_remove = set()

    # Walk from the end so deeper nodes are merged before their ancestors.
    for i in range(len(result_list) - 1, -1, -1):
        if i in nodes_to_remove:
            continue

        current_node = result_list[i]
        current_level = current_node['level']

        total_tokens = current_node.get('text_token_count', 0)

        if total_tokens < min_node_token:
            children_indices = find_all_children(i, current_level, result_list)

            # Absorb every not-yet-removed descendant's text into this node.
            # NOTE: the child is marked for removal even when its text is
            # blank — removal is by position, not by content.
            children_texts = []
            for child_index in sorted(children_indices):
                if child_index not in nodes_to_remove:
                    child_text = result_list[child_index].get('text', '')
                    if child_text.strip():
                        children_texts.append(child_text)
                    nodes_to_remove.add(child_index)

            if children_texts:
                parent_text = current_node.get('text', '')
                merged_text = parent_text
                for child_text in children_texts:
                    # Separate merged sections with a blank line.
                    if merged_text and not merged_text.endswith('\n'):
                        merged_text += '\n\n'
                    merged_text += child_text

                result_list[i]['text'] = merged_text

                result_list[i]['text_token_count'] = count_tokens(merged_text, model=model)

    # Pop from the end so earlier indices stay valid while removing.
    for index in sorted(nodes_to_remove, reverse=True):
        result_list.pop(index)

    return result_list
188
+
189
+
190
def build_tree_from_nodes(node_list):
    """Nest a flat, ordered list of header nodes into a tree by header level,
    assigning zero-padded sequential node ids in document order."""
    if not node_list:
        return []

    roots = []
    ancestry = []  # stack of (tree_node, level) from root to current branch tip

    for seq, flat in enumerate(node_list, start=1):
        level = flat['level']
        branch = {
            'title': flat['title'],
            'node_id': f"{seq:04d}",
            'text': flat['text'],
            'line_num': flat['line_num'],
            'nodes': [],
        }

        # Unwind to the nearest ancestor with a strictly smaller level.
        while ancestry and ancestry[-1][1] >= level:
            ancestry.pop()

        if ancestry:
            ancestry[-1][0]['nodes'].append(branch)
        else:
            roots.append(branch)

        ancestry.append((branch, level))

    return roots
222
+
223
+
224
def clean_tree_for_output(tree_nodes):
    """Reduce each tree node to the output schema (title, node_id, text,
    line_num); a 'nodes' key is emitted only for nodes that actually have
    children."""
    output = []

    for node in tree_nodes:
        slim = {
            'title': node['title'],
            'node_id': node['node_id'],
            'text': node['text'],
            'line_num': node['line_num'],
        }

        children = node['nodes']
        if children:
            slim['nodes'] = clean_tree_for_output(children)

        output.append(slim)

    return output
241
+
242
+
243
async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
    """Build a PageIndex-style tree from a markdown file.

    Args:
        md_path: path to the .md file.
        if_thinning: when True, merge sections smaller than
            min_token_threshold tokens into their parents.
        min_token_threshold: thinning threshold in tokens.
        if_add_node_summary: 'yes'/'no' — generate per-node summaries.
        summary_token_threshold: below this size a node's raw text is used
            as its summary instead of calling the LLM.
        model: LLM model name used for token counting and summaries.
        if_add_doc_description: 'yes'/'no' — only honoured when summaries
            are enabled (the 'yes' branch lives inside the summary branch).
        if_add_node_text: 'yes'/'no' — keep each node's raw text in output.
        if_add_node_id: 'yes'/'no' — write sequential node ids.

    Returns:
        dict with 'doc_name', 'structure' and optionally 'doc_description'.
    """
    with open(md_path, 'r', encoding='utf-8') as f:
        markdown_content = f.read()

    print(f"Extracting nodes from markdown...")
    node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)

    print(f"Extracting text content from nodes...")
    nodes_with_content = extract_node_text_content(node_list, markdown_lines)

    if if_thinning:
        # Thinning needs per-node token counts first.
        nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model)
        print(f"Thinning nodes...")
        nodes_with_content = tree_thinning_for_index(nodes_with_content, min_token_threshold, model=model)

    print(f"Building tree from nodes...")
    tree_structure = build_tree_from_nodes(nodes_with_content)

    if if_add_node_id == 'yes':
        write_node_id(tree_structure)

    print(f"Formatting tree structure...")

    if if_add_node_summary == 'yes':
        # Always include text for summary generation
        tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])

        print(f"Generating summaries for each node...")
        tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)

        if if_add_node_text == 'no':
            # Remove text after summary generation if not requested
            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])

        if if_add_doc_description == 'yes':
            print(f"Generating document description...")
            # Create a clean structure without unnecessary fields for description generation
            clean_structure = create_clean_structure_for_description(tree_structure)
            doc_description = generate_doc_description(clean_structure, model=model)
            return {
                'doc_name': os.path.splitext(os.path.basename(md_path))[0],
                'doc_description': doc_description,
                'structure': tree_structure,
            }
    else:
        # No summaries needed, format based on text preference
        if if_add_node_text == 'yes':
            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
        else:
            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])

    return {
        'doc_name': os.path.splitext(os.path.basename(md_path))[0],
        'structure': tree_structure,
    }
298
+
299
+
300
if __name__ == "__main__":
    # Smoke-test driver: build a tree for one markdown file and dump it to
    # results/<name>_structure.json.
    import os
    import json

    # MD_NAME = 'Detect-Order-Construct'
    MD_NAME = 'cognitive-load'
    MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'tests/markdowns/', f'{MD_NAME}.md')


    # Run parameters for the smoke test.
    MODEL="gpt-4.1"
    IF_THINNING=False
    THINNING_THRESHOLD=5000
    SUMMARY_TOKEN_THRESHOLD=200
    IF_SUMMARY=True

    tree_structure = asyncio.run(md_to_tree(
        md_path=MD_PATH,
        if_thinning=IF_THINNING,
        min_token_threshold=THINNING_THRESHOLD,
        if_add_node_summary='yes' if IF_SUMMARY else 'no',
        summary_token_threshold=SUMMARY_TOKEN_THRESHOLD,
        model=MODEL))

    print('\n' + '='*60)
    print('TREE STRUCTURE')
    print('='*60)
    print_json(tree_structure)

    print('\n' + '='*60)
    print('TABLE OF CONTENTS')
    print('='*60)
    print_toc(tree_structure['structure'])

    output_path = os.path.join(os.path.dirname(__file__), '..', 'results', f'{MD_NAME}_structure.json')
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(tree_structure, f, indent=2, ensure_ascii=False)

    print(f"\nTree structure saved to: {output_path}")
pageindex/utils.py ADDED
@@ -0,0 +1,712 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard library
import asyncio
import copy
import json
import logging
import os
import re  # used by get_first_start_page_from_text / get_last_start_page_from_text
import time
from datetime import datetime
from io import BytesIO
from pathlib import Path
from types import SimpleNamespace as config

# Third-party
import openai
import PyPDF2
import pymupdf
import tiktoken
import yaml
from dotenv import load_dotenv

# Load environment variables (e.g. CHATGPT_API_KEY) from a local .env file
# before any module-level constants read them.
load_dotenv()
19
+
20
+ CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
21
+
22
def count_tokens(text, model=None):
    """Return the number of tokens in *text* for the given model.

    Falls back to the ``cl100k_base`` encoding when *model* is None or is
    unknown to tiktoken; previously calling with the default ``model=None``
    raised inside ``tiktoken.encoding_for_model``.
    """
    if not text:
        return 0
    try:
        enc = tiktoken.encoding_for_model(model) if model else tiktoken.get_encoding("cl100k_base")
    except KeyError:
        # Unknown model name: use the common GPT-4-family encoding.
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))
28
+
29
def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
    """Call the OpenAI chat API and also report why generation stopped.

    Args:
        model: Chat model name.
        prompt: User prompt appended after any ``chat_history``.
        api_key: OpenAI API key (defaults to the CHATGPT_API_KEY env var).
        chat_history: Optional list of prior messages; it is NOT mutated.

    Returns:
        ``(content, status)`` where status is ``"finished"`` or
        ``"max_output_reached"`` on success, or ``("Error", "error")`` after
        exhausting all retries. (Previously the failure path returned a bare
        ``"Error"`` string, which broke callers that unpack two values, and
        the prompt was re-appended to the caller's history on every retry.)
    """
    max_retries = 10
    client = openai.OpenAI(api_key=api_key)
    # Build the message list once: copying avoids mutating the caller's list
    # and avoids duplicating the user message on retries.
    if chat_history:
        messages = list(chat_history) + [{"role": "user", "content": prompt}]
    else:
        messages = [{"role": "user", "content": prompt}]

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0,
            )
            choice = response.choices[0]
            if choice.finish_reason == "length":
                return choice.message.content, "max_output_reached"
            return choice.message.content, "finished"
        except Exception as e:
            print('************* Retrying *************')
            logging.error(f"Error: {e}")
            if attempt < max_retries - 1:
                time.sleep(1)  # wait one second before retrying
            else:
                logging.error('Max retries reached for prompt: ' + prompt)
    return "Error", "error"
58
+
59
+
60
+
61
def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
    """Call the OpenAI chat API and return the response text, or "Error".

    Retries up to 10 times on any exception, sleeping 1 second between
    attempts. ``chat_history`` is copied rather than mutated: the original
    appended the prompt to the caller's list on every retry, both mutating
    shared state and sending duplicate user messages.
    """
    max_retries = 10
    client = openai.OpenAI(api_key=api_key)
    # Build the message list once, outside the retry loop.
    if chat_history:
        messages = list(chat_history) + [{"role": "user", "content": prompt}]
    else:
        messages = [{"role": "user", "content": prompt}]

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0,
            )
            return response.choices[0].message.content
        except Exception as e:
            print('************* Retrying *************')
            logging.error(f"Error: {e}")
            if attempt < max_retries - 1:
                time.sleep(1)  # wait one second before retrying
            else:
                logging.error('Max retries reached for prompt: ' + prompt)
    return "Error"
87
+
88
+
89
async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
    """Async variant of ChatGPT_API: send one user prompt, return the reply text.

    Retries up to 10 times on any exception with a 1-second async sleep
    between attempts; returns the string "Error" when all retries fail.
    A fresh AsyncOpenAI client is opened (and closed) per attempt.
    """
    max_retries = 10
    messages = [{"role": "user", "content": prompt}]
    for i in range(max_retries):
        try:
            # Context manager ensures the client's HTTP resources are released.
            async with openai.AsyncOpenAI(api_key=api_key) as client:
                response = await client.chat.completions.create(
                    model=model,
                    messages=messages,
                    temperature=0,
                )
                return response.choices[0].message.content
        except Exception as e:
            print('************* Retrying *************')
            logging.error(f"Error: {e}")
            if i < max_retries - 1:
                await asyncio.sleep(1)  # Wait for 1s before retrying
            else:
                logging.error('Max retries reached for prompt: ' + prompt)
                return "Error"
109
+
110
+
111
def get_json_content(response):
    """Strip a Markdown ```json fence from *response* and return the payload.

    Text before the opening fence and after the last closing fence is
    discarded; when no fences are present the whole string is just stripped.
    """
    fence_start = response.find("```json")
    if fence_start != -1:
        response = response[fence_start + 7:]  # skip past "```json"
    fence_end = response.rfind("```")
    if fence_end != -1:
        response = response[:fence_end]
    return response.strip()
123
+
124
+
125
def extract_json(content):
    """Best-effort extraction of a JSON value from an LLM response string.

    Handles optional ```json fences, converts bare Python ``None`` literals
    to JSON ``null``, collapses whitespace, and retries once after removing
    trailing commas. Returns ``{}`` when nothing parseable remains.
    """
    try:
        # First, try to extract JSON enclosed within ```json and ```.
        start_idx = content.find("```json")
        if start_idx != -1:
            start_idx += 7  # adjust index to start after the delimiter
            end_idx = content.rfind("```")
            json_content = content[start_idx:end_idx].strip()
        else:
            # No delimiters: assume the entire content could be JSON.
            json_content = content.strip()

        # Replace bare Python None literals with JSON null. A word boundary
        # is required so substrings such as "Nonetheless" are not corrupted
        # (the previous plain str.replace rewrote any occurrence of "None").
        json_content = re.sub(r'\bNone\b', 'null', json_content)
        # NOTE(review): collapsing newlines also affects newlines inside JSON
        # string values; preserved from the original behavior.
        json_content = json_content.replace('\n', ' ').replace('\r', ' ')
        json_content = ' '.join(json_content.split())  # normalize whitespace

        return json.loads(json_content)
    except json.JSONDecodeError as e:
        logging.error(f"Failed to extract JSON: {e}")
        try:
            # Remove trailing commas before closing brackets/braces and retry.
            json_content = json_content.replace(',]', ']').replace(',}', '}')
            return json.loads(json_content)
        except json.JSONDecodeError:
            logging.error("Failed to parse JSON even after cleanup")
            return {}
    except Exception as e:
        logging.error(f"Unexpected error while extracting JSON: {e}")
        return {}
157
+
158
def write_node_id(data, node_id=0):
    """Assign zero-padded sequential 'node_id' strings in depth-first order.

    Mutates the tree in place and returns the next unused counter value.
    Any dict key containing the substring 'nodes' is treated as a child list.
    """
    if isinstance(data, dict):
        data['node_id'] = str(node_id).zfill(4)
        node_id += 1
        for key in list(data.keys()):
            if 'nodes' in key:
                node_id = write_node_id(data[key], node_id)
    elif isinstance(data, list):
        for child in data:
            node_id = write_node_id(child, node_id)
    return node_id
169
+
170
def get_nodes(structure):
    """Flatten a tree into a list of deep-copied nodes with children stripped.

    Every returned dict is a copy of a tree node minus its 'nodes' key, in
    depth-first order. Dict keys containing 'nodes' are treated as children.
    """
    if isinstance(structure, dict):
        snapshot = copy.deepcopy(structure)
        snapshot.pop('nodes', None)
        collected = [snapshot]
        for key in list(structure.keys()):
            if 'nodes' in key:
                collected.extend(get_nodes(structure[key]))
        return collected
    elif isinstance(structure, list):
        collected = []
        for entry in structure:
            collected.extend(get_nodes(entry))
        return collected
184
+
185
def structure_to_list(structure):
    """Flatten a tree into a depth-first list of references to its node dicts.

    Unlike get_nodes(), the returned dicts are the ORIGINAL objects (not
    copies) and keep their 'nodes' key, so callers can mutate the tree.
    """
    if isinstance(structure, dict):
        flattened = [structure]
        if 'nodes' in structure:
            flattened.extend(structure_to_list(structure['nodes']))
        return flattened
    elif isinstance(structure, list):
        flattened = []
        for element in structure:
            flattened.extend(structure_to_list(element))
        return flattened
197
+
198
+
199
def get_leaf_nodes(structure):
    """Collect all leaf nodes (nodes with no children), 'nodes' key removed.

    Uses ``structure.get('nodes')`` so a dict without a 'nodes' key counts
    as a leaf (previously this raised KeyError). Returns deep copies so the
    original tree is untouched; non-dict/list input yields an empty list.
    """
    if isinstance(structure, dict):
        if not structure.get('nodes'):
            # Missing or empty child list: this node is a leaf.
            leaf = copy.deepcopy(structure)
            leaf.pop('nodes', None)
            return [leaf]
        leaves = []
        for key in list(structure.keys()):
            if 'nodes' in key:
                leaves.extend(get_leaf_nodes(structure[key]))
        return leaves
    elif isinstance(structure, list):
        leaves = []
        for item in structure:
            leaves.extend(get_leaf_nodes(item))
        return leaves
    return []
216
+
217
def is_leaf_node(data, node_id):
    """Return True iff the node with *node_id* exists and has no children."""

    def _locate(subtree, target):
        # Depth-first search across dicts/lists for a matching 'node_id'.
        if isinstance(subtree, dict):
            if subtree.get('node_id') == target:
                return subtree
            for key in subtree.keys():
                if 'nodes' in key:
                    found = _locate(subtree[key], target)
                    if found:
                        return found
        elif isinstance(subtree, list):
            for element in subtree:
                found = _locate(element, target)
                if found:
                    return found
        return None

    node = _locate(data, node_id)
    if node and not node.get('nodes'):
        return True
    return False
242
+
243
def get_last_node(structure):
    """Return the final node of a flat structure list (IndexError if empty)."""
    return structure[-1]
245
+
246
+
247
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page and return one concatenated string
    (not a per-page list)."""
    reader = PyPDF2.PdfReader(pdf_path)
    return "".join(page.extract_text() for page in reader.pages)
255
+
256
def get_pdf_title(pdf_path):
    """Return the PDF's Title metadata field, or 'Untitled' when absent."""
    meta = PyPDF2.PdfReader(pdf_path).metadata
    return meta.title if meta and meta.title else 'Untitled'
261
+
262
def get_text_of_pages(pdf_path, start_page, end_page, tag=True):
    """Extract text for pages start_page..end_page (1-based, inclusive).

    When *tag* is true each page is wrapped in <start_index_N>/<end_index_N>
    markers so downstream code can recover page boundaries.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    chunks = []
    for page_num in range(start_page - 1, end_page):
        page_text = reader.pages[page_num].extract_text()
        if tag:
            chunks.append(f"<start_index_{page_num+1}>\n{page_text}\n<end_index_{page_num+1}>\n")
        else:
            chunks.append(page_text)
    return "".join(chunks)
273
+
274
def get_first_start_page_from_text(text):
    """Return the first page number tagged as <start_index_N> in *text*, or -1.

    Fix: the module previously used ``re`` without importing it, so any call
    raised NameError; ``re`` is now imported at module level.
    """
    match = re.search(r'<start_index_(\d+)>', text)
    return int(match.group(1)) if match else -1


def get_last_start_page_from_text(text):
    """Return the last page number tagged as <start_index_N> in *text*, or -1."""
    matches = re.findall(r'<start_index_(\d+)>', text)
    return int(matches[-1]) if matches else -1
290
+
291
+
292
def sanitize_filename(filename, replacement='-'):
    """Make *filename* safe for Linux filesystems.

    On Linux only '/' and the null byte are invalid in filenames; null cannot
    occur in a Python str, so only '/' needs replacing.
    """
    return filename.replace('/', replacement)


def get_pdf_name(pdf_path):
    """Derive a sanitized display name for a PDF.

    Accepts a filesystem path (uses the basename) or a BytesIO stream (uses
    the PDF Title metadata, falling back to 'Untitled'). Raises TypeError for
    any other input type — previously that path crashed with a confusing
    NameError on an unbound local.
    """
    if isinstance(pdf_path, str):
        pdf_name = os.path.basename(pdf_path)
    elif isinstance(pdf_path, BytesIO):
        pdf_reader = PyPDF2.PdfReader(pdf_path)
        meta = pdf_reader.metadata
        pdf_name = meta.title if meta and meta.title else 'Untitled'
    else:
        raise TypeError(f"Unsupported pdf_path type: {type(pdf_path).__name__}")
    return sanitize_filename(pdf_name)
307
+
308
+
309
class JsonLogger:
    """Collects log records for one document run and persists them as JSON.

    Every call to log() rewrites the whole file under ./logs, so the file on
    disk always reflects the complete history for this run.
    """

    def __init__(self, file_path):
        # Extract PDF name for logger name
        pdf_name = get_pdf_name(file_path)

        # Timestamped filename keeps runs of the same document distinct.
        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.filename = f"{pdf_name}_{current_time}.json"
        os.makedirs("./logs", exist_ok=True)
        # Initialize empty list to store all messages
        self.log_data = []

    def log(self, level, message, **kwargs):
        # NOTE(review): `level` and `kwargs` are currently ignored — records
        # are stored without severity. Confirm whether that is intentional.
        if isinstance(message, dict):
            self.log_data.append(message)
        else:
            self.log_data.append({'message': message})
        # Add new message to the log data

        # Write entire log data to file
        with open(self._filepath(), "w") as f:
            json.dump(self.log_data, f, indent=2)

    def info(self, message, **kwargs):
        self.log("INFO", message, **kwargs)

    def error(self, message, **kwargs):
        self.log("ERROR", message, **kwargs)

    def debug(self, message, **kwargs):
        self.log("DEBUG", message, **kwargs)

    def exception(self, message, **kwargs):
        # Tag the record as an exception (note: dropped by log(), see above).
        kwargs["exception"] = True
        self.log("ERROR", message, **kwargs)

    def _filepath(self):
        # Logs always live under ./logs relative to the working directory.
        return os.path.join("logs", self.filename)
346
+
347
+
348
+
349
+
350
def list_to_tree(data):
    """Build a nested tree from a flat list of sections with dotted codes.

    Each item's 'structure' code (e.g. "2.1.3") determines its parent
    ("2.1"). Items whose parent code has not been seen become roots. Leaves
    have their empty 'nodes' list removed entirely.
    """

    def parent_code(code):
        # "2.1.3" -> "2.1"; single-segment codes have no parent.
        if not code:
            return None
        parts = str(code).split('.')
        return '.'.join(parts[:-1]) if len(parts) > 1 else None

    nodes_by_code = {}
    roots = []
    for item in data:
        code = item.get('structure')
        node = {
            'title': item.get('title'),
            'start_index': item.get('start_index'),
            'end_index': item.get('end_index'),
            'nodes': [],
        }
        nodes_by_code[code] = node

        parent = parent_code(code)
        if parent and parent in nodes_by_code:
            # Attach to the previously-seen parent.
            nodes_by_code[parent]['nodes'].append(node)
        else:
            # No parent code, or parent not seen: treat as a root.
            roots.append(node)

    def prune(node):
        # Drop empty child lists so leaves carry no 'nodes' key at all.
        if node['nodes']:
            for child in node['nodes']:
                prune(child)
        else:
            del node['nodes']
        return node

    return [prune(root) for root in roots]
397
+
398
def add_preface_if_needed(data):
    """Prepend a synthetic 'Preface' section when the document's first
    section starts after physical page 1.

    Returns *data* unchanged when it is not a non-empty list, when the first
    item has no usable 'physical_index' (previously this raised KeyError on
    a missing key), or when the document already starts on page 1.
    """
    if not isinstance(data, list) or not data:
        return data

    first_index = data[0].get('physical_index')
    if first_index is not None and first_index > 1:
        data.insert(0, {
            "structure": "0",
            "title": "Preface",
            "physical_index": 1,
        })
    return data
410
+
411
+
412
+
413
def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
    """Return a list of (page_text, token_count) tuples, one per page.

    Args:
        pdf_path: Filesystem path, or (PyMuPDF only) a BytesIO stream.
        model: Model name used to pick the tiktoken encoding.
        pdf_parser: "PyPDF2" or "PyMuPDF".

    Raises:
        ValueError: for an unsupported parser name, or (PyMuPDF) an input
            that is neither a BytesIO nor a path to a .pdf file — previously
            that case crashed with NameError on the unbound ``doc`` local.
    """
    enc = tiktoken.encoding_for_model(model)
    if pdf_parser == "PyPDF2":
        pdf_reader = PyPDF2.PdfReader(pdf_path)
        page_list = []
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            page_list.append((page_text, len(enc.encode(page_text))))
        return page_list
    elif pdf_parser == "PyMuPDF":
        if isinstance(pdf_path, BytesIO):
            doc = pymupdf.open(stream=pdf_path, filetype="pdf")
        elif isinstance(pdf_path, str) and os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"):
            doc = pymupdf.open(pdf_path)
        else:
            raise ValueError(f"PyMuPDF requires a BytesIO stream or a path to a .pdf file, got: {pdf_path!r}")
        try:
            page_list = []
            for page in doc:
                page_text = page.get_text()
                page_list.append((page_text, len(enc.encode(page_text))))
            return page_list
        finally:
            doc.close()  # release the file handle even if encoding fails
    else:
        raise ValueError(f"Unsupported PDF parser: {pdf_parser}")
438
+
439
+
440
+
441
def get_text_of_pdf_pages(pdf_pages, start_page, end_page):
    """Concatenate the text of pages start_page..end_page (1-based, inclusive).

    *pdf_pages* is the (text, token_count) list produced by get_page_tokens().
    """
    return ''.join(pdf_pages[i][0] for i in range(start_page - 1, end_page))


def get_text_of_pdf_pages_with_labels(pdf_pages, start_page, end_page):
    """Like get_text_of_pdf_pages(), but wrap each page in
    <physical_index_N> markers so page boundaries stay recoverable."""
    parts = []
    for i in range(start_page - 1, end_page):
        parts.append(f"<physical_index_{i + 1}>\n{pdf_pages[i][0]}\n<physical_index_{i + 1}>\n")
    return ''.join(parts)
452
+
453
def get_number_of_pages(pdf_path):
    """Return the total page count of the PDF."""
    return len(PyPDF2.PdfReader(pdf_path).pages)
457
+
458
+
459
+
460
def post_processing(structure, end_physical_index):
    """Convert a flat TOC (items carrying 'physical_index') into page ranges
    and, when possible, a nested tree.

    Each item's start_index is its own physical_index; its end_index is
    derived from the NEXT item's physical_index ('appear_start' == 'yes'
    means the next section begins on its start page, so this one ends one
    page earlier; otherwise the two sections share that page). The last item
    ends at *end_physical_index*. Falls back to the cleaned flat list when
    list_to_tree() produces nothing.
    """
    # First convert page_number to start_index in flat list
    for i, item in enumerate(structure):
        item['start_index'] = item.get('physical_index')
        if i < len(structure) - 1:
            if structure[i + 1].get('appear_start') == 'yes':
                item['end_index'] = structure[i + 1]['physical_index']-1
            else:
                item['end_index'] = structure[i + 1]['physical_index']
        else:
            item['end_index'] = end_physical_index
    tree = list_to_tree(structure)
    if len(tree)!=0:
        return tree
    else:
        ### remove appear_start
        # Tree construction failed (e.g. no 'structure' codes): return the
        # flat list with the intermediate bookkeeping keys stripped.
        for node in structure:
            node.pop('appear_start', None)
            node.pop('physical_index', None)
        return structure
480
+
481
def clean_structure_post(data):
    """Recursively strip bookkeeping keys from every node, in place.

    Removes 'page_number', 'start_index' and 'end_index'; returns *data*.
    """
    if isinstance(data, dict):
        for key in ('page_number', 'start_index', 'end_index'):
            data.pop(key, None)
        if 'nodes' in data:
            clean_structure_post(data['nodes'])
    elif isinstance(data, list):
        for entry in data:
            clean_structure_post(entry)
    return data
492
+
493
def remove_fields(data, fields=('text',)):
    """Return a deep copy of *data* with the given keys removed everywhere.

    The default is now an immutable tuple instead of a mutable list literal
    (mutable default arguments are shared across calls); membership tests
    behave identically, and callers may still pass any container.
    """
    if isinstance(data, dict):
        return {k: remove_fields(v, fields)
                for k, v in data.items() if k not in fields}
    elif isinstance(data, list):
        return [remove_fields(item, fields) for item in data]
    return data
500
+
501
def print_toc(tree, indent=0):
    """Pretty-print the tree as an indented table of contents
    (one leading space per nesting level)."""
    for entry in tree:
        print(f"{' ' * indent}{entry['title']}")
        children = entry.get('nodes')
        if children:
            print_toc(children, indent + 1)
506
+
507
def print_json(data, max_len=40, indent=2):
    """Print *data* as indented JSON, truncating long strings with '...'
    so deeply nested structures stay readable."""

    def shorten(value):
        if isinstance(value, dict):
            return {key: shorten(item) for key, item in value.items()}
        if isinstance(value, list):
            return [shorten(item) for item in value]
        if isinstance(value, str) and len(value) > max_len:
            return value[:max_len] + '...'
        return value

    print(json.dumps(shorten(data), indent=indent, ensure_ascii=False))
520
+
521
+
522
def remove_structure_text(data):
    """Delete the 'text' field from every node of the tree, in place.

    Iterative (stack-based) traversal; returns the same *data* object.
    """
    pending = [data]
    while pending:
        current = pending.pop()
        if isinstance(current, dict):
            current.pop('text', None)
            if 'nodes' in current:
                pending.append(current['nodes'])
        elif isinstance(current, list):
            pending.extend(current)
    return data
531
+
532
+
533
def check_token_limit(structure, limit=110000):
    """Print diagnostics for every node whose 'text' exceeds *limit* tokens
    (counted with the gpt-4o encoding). Purely informational; returns None.

    Fix: the local holding the flattened nodes no longer shadows the builtin
    ``list``.
    """
    nodes = structure_to_list(structure)
    for node in nodes:
        num_tokens = count_tokens(node['text'], model='gpt-4o')
        if num_tokens > limit:
            print(f"Node ID: {node['node_id']} has {num_tokens} tokens")
            print("Start Index:", node['start_index'])
            print("End Index:", node['end_index'])
            print("Title:", node['title'])
            print("\n")
543
+
544
+
545
def convert_physical_index_to_int(data):
    """Normalize 'physical_index' values of the form '<physical_index_N>' or
    'physical_index_N' to plain ints.

    For a list input, items' 'physical_index' entries are converted in place
    and the (mutated) list is returned — previously the list branch fell
    through to the scalar type-check and returned None, leaving the trailing
    ``return data`` unreachable. For a scalar input, returns the parsed int,
    or None when the value is not (convertible to) an int.
    """
    def _parse(value):
        # Strings without the expected prefixes are returned unchanged.
        if isinstance(value, str):
            if value.startswith('<physical_index_'):
                return int(value.split('_')[-1].rstrip('>').strip())
            if value.startswith('physical_index_'):
                return int(value.split('_')[-1].strip())
        return value

    if isinstance(data, list):
        for item in data:
            if isinstance(item, dict) and 'physical_index' in item:
                item['physical_index'] = _parse(item['physical_index'])
        return data
    data = _parse(data)
    return data if isinstance(data, int) else None
566
+
567
+
568
def convert_page_to_int(data):
    """Coerce string 'page' values to int in place where possible.

    Non-numeric strings and non-string values are left untouched; returns
    the same list.
    """
    for entry in data:
        page = entry.get('page')
        if isinstance(page, str):
            try:
                entry['page'] = int(page)
            except ValueError:
                pass  # non-numeric string: keep the original value
    return data
577
+
578
+
579
def add_node_text(node, pdf_pages):
    """Attach each node's raw page text (pages start_index..end_index) as
    node['text'], in place, recursing through the whole tree.

    NOTE(review): assumes every node dict carries valid integer
    start_index/end_index — get_text_of_pdf_pages would fail on None.
    """
    if isinstance(node, dict):
        start_page = node.get('start_index')
        end_page = node.get('end_index')
        node['text'] = get_text_of_pdf_pages(pdf_pages, start_page, end_page)
        if 'nodes' in node:
            add_node_text(node['nodes'], pdf_pages)
    elif isinstance(node, list):
        for index in range(len(node)):
            add_node_text(node[index], pdf_pages)
    return


def add_node_text_with_labels(node, pdf_pages):
    """Same as add_node_text(), but each page is wrapped in
    <physical_index_N> markers so page boundaries remain recoverable."""
    if isinstance(node, dict):
        start_page = node.get('start_index')
        end_page = node.get('end_index')
        node['text'] = get_text_of_pdf_pages_with_labels(pdf_pages, start_page, end_page)
        if 'nodes' in node:
            add_node_text_with_labels(node['nodes'], pdf_pages)
    elif isinstance(node, list):
        for index in range(len(node)):
            add_node_text_with_labels(node[index], pdf_pages)
    return
603
+
604
+
605
async def generate_node_summary(node, model=None):
    """Ask the LLM for a short description of this node's text.

    The prompt below is runtime text sent verbatim to the model and must not
    be edited casually. Returns the model's reply (or "Error" on failure,
    per ChatGPT_API_async).
    """
    prompt = f"""You are given a part of a document, your task is to generate a description of the partial document about what are main points covered in the partial document.

Partial Document Text: {node['text']}

Directly return the description, do not include any other text.
"""
    response = await ChatGPT_API_async(model, prompt)
    return response


async def generate_summaries_for_structure(structure, model=None):
    """Generate summaries for every node of the tree concurrently.

    Flattens the tree, fires one LLM call per node via asyncio.gather, then
    writes each reply back as node['summary']. Returns the mutated structure.
    """
    nodes = structure_to_list(structure)
    tasks = [generate_node_summary(node, model=model) for node in nodes]
    summaries = await asyncio.gather(*tasks)

    for node, summary in zip(nodes, summaries):
        node['summary'] = summary
    return structure
624
+
625
+
626
def create_clean_structure_for_description(structure):
    """Project the tree down to the fields used by description prompts.

    Keeps only 'title', 'node_id', 'summary' and 'prefix_summary' (plus
    non-empty child lists); bulky fields such as 'text' are dropped.
    Non-dict/list values pass through unchanged.
    """
    if isinstance(structure, list):
        return [create_clean_structure_for_description(item) for item in structure]
    if not isinstance(structure, dict):
        return structure

    kept = {field: structure[field]
            for field in ('title', 'node_id', 'summary', 'prefix_summary')
            if field in structure}
    children = structure.get('nodes')
    if children:
        kept['nodes'] = create_clean_structure_for_description(children)
    return kept
647
+
648
+
649
def generate_doc_description(structure, model=None):
    """Ask the LLM for a one-sentence, distinguishing description of the
    document given its (cleaned) structure.

    NOTE(review): the prompt text below is runtime data sent verbatim to the
    model — including the "Your are" typo — and is intentionally untouched
    here to preserve behavior. Returns the model reply (or "Error" on
    failure, per ChatGPT_API).
    """
    prompt = f"""Your are an expert in generating descriptions for a document.
You are given a structure of a document. Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents.

Document Structure: {structure}

Directly return the description, do not include any other text.
"""
    response = ChatGPT_API(model, prompt)
    return response
659
+
660
+
661
def reorder_dict(data, key_order):
    """Return a dict whose keys follow *key_order*.

    Keys absent from *key_order* are DROPPED; keys listed but missing from
    *data* are skipped. An empty/None *key_order* returns *data* unchanged.
    """
    if not key_order:
        return data
    reordered = {}
    for key in key_order:
        if key in data:
            reordered[key] = data[key]
    return reordered


def format_structure(structure, order=None):
    """Recursively reorder each node's keys per *order*, dropping child
    lists that end up empty. A falsy *order* is a no-op."""
    if not order:
        return structure
    if isinstance(structure, list):
        return [format_structure(item, order) for item in structure]
    if isinstance(structure, dict):
        if 'nodes' in structure:
            structure['nodes'] = format_structure(structure['nodes'], order)
        if not structure.get('nodes'):
            structure.pop('nodes', None)
        structure = reorder_dict(structure, order)
    return structure
679
+
680
+
681
class ConfigLoader:
    """Loads pageindex configuration, merging user overrides into the
    defaults shipped in config.yaml."""

    def __init__(self, default_path: str = None):
        # Default to the config.yaml that lives next to this module.
        if default_path is None:
            default_path = Path(__file__).parent / "config.yaml"
        self._default_dict = self._load_yaml(default_path)

    @staticmethod
    def _load_yaml(path):
        # An empty YAML file parses to None; normalize that to {}.
        with open(path, "r", encoding="utf-8") as f:
            return yaml.safe_load(f) or {}

    def _validate_keys(self, user_dict):
        # Reject option names that have no default — catches typos early.
        unknown_keys = set(user_dict) - set(self._default_dict)
        if unknown_keys:
            raise ValueError(f"Unknown config keys: {unknown_keys}")

    def load(self, user_opt=None) -> config:
        """
        Load the configuration, merging user options with default values.

        *user_opt* may be None, a dict, or a config (SimpleNamespace); user
        values override defaults. Raises ValueError for unknown keys and
        TypeError for unsupported input types.
        """
        if user_opt is None:
            user_dict = {}
        elif isinstance(user_opt, config):
            user_dict = vars(user_opt)
        elif isinstance(user_opt, dict):
            user_dict = user_opt
        else:
            raise TypeError("user_opt must be dict, config(SimpleNamespace) or None")

        self._validate_keys(user_dict)
        # User values win over defaults.
        merged = {**self._default_dict, **user_dict}
        return config(**merged)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ openai
3
+ pydantic
4
+ python-dotenv
5
+ pymupdf==1.26.4
6
+ PyPDF2==3.0.1
7
+ tiktoken==0.11.0
8
+ pyyaml==6.0.2
run_pageindex.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import json
4
+ from pageindex import *
5
+ from pageindex.page_index_md import md_to_tree
6
+
7
+ if __name__ == "__main__":
8
+ # Set up argument parser
9
+ parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
10
+ parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
11
+ parser.add_argument('--md_path', type=str, help='Path to the Markdown file')
12
+
13
+ parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
14
+
15
+ parser.add_argument('--toc-check-pages', type=int, default=20,
16
+ help='Number of pages to check for table of contents (PDF only)')
17
+ parser.add_argument('--max-pages-per-node', type=int, default=10,
18
+ help='Maximum number of pages per node (PDF only)')
19
+ parser.add_argument('--max-tokens-per-node', type=int, default=20000,
20
+ help='Maximum number of tokens per node (PDF only)')
21
+
22
+ parser.add_argument('--if-add-node-id', type=str, default='yes',
23
+ help='Whether to add node id to the node')
24
+ parser.add_argument('--if-add-node-summary', type=str, default='yes',
25
+ help='Whether to add summary to the node')
26
+ parser.add_argument('--if-add-doc-description', type=str, default='no',
27
+ help='Whether to add doc description to the doc')
28
+ parser.add_argument('--if-add-node-text', type=str, default='no',
29
+ help='Whether to add text to the node')
30
+
31
+ # Markdown specific arguments
32
+ parser.add_argument('--if-thinning', type=str, default='no',
33
+ help='Whether to apply tree thinning for markdown (markdown only)')
34
+ parser.add_argument('--thinning-threshold', type=int, default=5000,
35
+ help='Minimum token threshold for thinning (markdown only)')
36
+ parser.add_argument('--summary-token-threshold', type=int, default=200,
37
+ help='Token threshold for generating summaries (markdown only)')
38
+ args = parser.parse_args()
39
+
40
+ # Validate that exactly one file type is specified
41
+ if not args.pdf_path and not args.md_path:
42
+ raise ValueError("Either --pdf_path or --md_path must be specified")
43
+ if args.pdf_path and args.md_path:
44
+ raise ValueError("Only one of --pdf_path or --md_path can be specified")
45
+
46
+ if args.pdf_path:
47
+ # Validate PDF file
48
+ if not args.pdf_path.lower().endswith('.pdf'):
49
+ raise ValueError("PDF file must have .pdf extension")
50
+ if not os.path.isfile(args.pdf_path):
51
+ raise ValueError(f"PDF file not found: {args.pdf_path}")
52
+
53
+ # Process PDF file
54
+ # Configure options
55
+ opt = config(
56
+ model=args.model,
57
+ toc_check_page_num=args.toc_check_pages,
58
+ max_page_num_each_node=args.max_pages_per_node,
59
+ max_token_num_each_node=args.max_tokens_per_node,
60
+ if_add_node_id=args.if_add_node_id,
61
+ if_add_node_summary=args.if_add_node_summary,
62
+ if_add_doc_description=args.if_add_doc_description,
63
+ if_add_node_text=args.if_add_node_text
64
+ )
65
+
66
+ # Process the PDF
67
+ toc_with_page_number = page_index_main(args.pdf_path, opt)
68
+ print('Parsing done, saving to file...')
69
+
70
+ # Save results
71
+ pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
72
+ output_dir = './results'
73
+ output_file = f'{output_dir}/{pdf_name}_structure.json'
74
+ os.makedirs(output_dir, exist_ok=True)
75
+
76
+ with open(output_file, 'w', encoding='utf-8') as f:
77
+ json.dump(toc_with_page_number, f, indent=2)
78
+
79
+ print(f'Tree structure saved to: {output_file}')
80
+
81
+ elif args.md_path:
82
+ # Validate Markdown file
83
+ if not args.md_path.lower().endswith(('.md', '.markdown')):
84
+ raise ValueError("Markdown file must have .md or .markdown extension")
85
+ if not os.path.isfile(args.md_path):
86
+ raise ValueError(f"Markdown file not found: {args.md_path}")
87
+
88
+ # Process markdown file
89
+ print('Processing markdown file...')
90
+
91
+ # Process the markdown
92
+ import asyncio
93
+
94
+ # Use ConfigLoader to get consistent defaults (matching PDF behavior)
95
+ from pageindex.utils import ConfigLoader
96
+ config_loader = ConfigLoader()
97
+
98
+ # Create options dict with user args
99
+ user_opt = {
100
+ 'model': args.model,
101
+ 'if_add_node_summary': args.if_add_node_summary,
102
+ 'if_add_doc_description': args.if_add_doc_description,
103
+ 'if_add_node_text': args.if_add_node_text,
104
+ 'if_add_node_id': args.if_add_node_id
105
+ }
106
+
107
+ # Load config with defaults from config.yaml
108
+ opt = config_loader.load(user_opt)
109
+
110
+ toc_with_page_number = asyncio.run(md_to_tree(
111
+ md_path=args.md_path,
112
+ if_thinning=args.if_thinning.lower() == 'yes',
113
+ min_token_threshold=args.thinning_threshold,
114
+ if_add_node_summary=opt.if_add_node_summary,
115
+ summary_token_threshold=args.summary_token_threshold,
116
+ model=opt.model,
117
+ if_add_doc_description=opt.if_add_doc_description,
118
+ if_add_node_text=opt.if_add_node_text,
119
+ if_add_node_id=opt.if_add_node_id
120
+ ))
121
+
122
+ print('Parsing done, saving to file...')
123
+
124
+ # Save results
125
+ md_name = os.path.splitext(os.path.basename(args.md_path))[0]
126
+ output_dir = './results'
127
+ output_file = f'{output_dir}/{md_name}_structure.json'
128
+ os.makedirs(output_dir, exist_ok=True)
129
+
130
+ with open(output_file, 'w', encoding='utf-8') as f:
131
+ json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
132
+
133
+ print(f'Tree structure saved to: {output_file}')
tests/pdfs/2023-annual-report-truncated.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e1702e593dcc2b55f1f121422b6787fcde37c387cc6228ba783fe79825d0214
3
+ size 1463411
tests/pdfs/2023-annual-report.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aacf44d05ebbc21f5e78d029864dc69871f88364b61a2ae405ad4ca43143f666
3
+ size 2255266
tests/pdfs/PRML.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ee767e0a6b04fa05ba7e599e9dbb4637a94a4407ccedf0b4d316b1fd7c8ec64
3
+ size 18090775
tests/pdfs/Regulation Best Interest_Interpretive release.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1f896c166dace5e1314af16b5c4709a16a2aa6dccd6955d2779ceba11113d15
3
+ size 357253
tests/pdfs/Regulation Best Interest_proposed rule.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8611d968d8ba9542ec775f43b80953b365758599701fd9c438383a6b82cd58f
3
+ size 1803860
tests/pdfs/earthmover.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b28ea18276ce6ebb0ba1298813ce96fa812bb70ec8851933f07d18424813d74d
3
+ size 1036803
tests/pdfs/four-lectures.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37bf130021e972b4203038e81d88298f13dca7f6fae95a1f3dfdec58935c4e11
3
+ size 310124
tests/pdfs/q1-fy25-earnings.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edb5d4c78cc2c403f2387a7b8aa7134c1a96018acb489722d13aa6a968ae93bb
3
+ size 101417
tests/results/2023-annual-report-truncated_structure.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "doc_name": "2023-annual-report-truncated.pdf",
3
+ "structure": [
4
+ {
5
+ "title": "Preface",
6
+ "start_index": 1,
7
+ "end_index": 4,
8
+ "node_id": "0000"
9
+ },
10
+ {
11
+ "title": "About the Federal Reserve",
12
+ "start_index": 5,
13
+ "end_index": 7,
14
+ "node_id": "0001"
15
+ },
16
+ {
17
+ "title": "Overview",
18
+ "start_index": 7,
19
+ "end_index": 8,
20
+ "node_id": "0002"
21
+ },
22
+ {
23
+ "title": "Monetary Policy and Economic Developments",
24
+ "start_index": 9,
25
+ "end_index": 9,
26
+ "nodes": [
27
+ {
28
+ "title": "March 2024 Summary",
29
+ "start_index": 9,
30
+ "end_index": 14,
31
+ "node_id": "0004"
32
+ },
33
+ {
34
+ "title": "June 2023 Summary",
35
+ "start_index": 15,
36
+ "end_index": 20,
37
+ "node_id": "0005"
38
+ }
39
+ ],
40
+ "node_id": "0003"
41
+ },
42
+ {
43
+ "title": "Financial Stability",
44
+ "start_index": 21,
45
+ "end_index": 21,
46
+ "nodes": [
47
+ {
48
+ "title": "Monitoring Financial Vulnerabilities",
49
+ "start_index": 22,
50
+ "end_index": 28,
51
+ "node_id": "0007"
52
+ },
53
+ {
54
+ "title": "Domestic and International Cooperation and Coordination",
55
+ "start_index": 28,
56
+ "end_index": 30,
57
+ "node_id": "0008"
58
+ }
59
+ ],
60
+ "node_id": "0006"
61
+ },
62
+ {
63
+ "title": "Supervision and Regulation",
64
+ "start_index": 31,
65
+ "end_index": 32,
66
+ "nodes": [
67
+ {
68
+ "title": "Supervised and Regulated Institutions",
69
+ "start_index": 32,
70
+ "end_index": 35,
71
+ "node_id": "0010"
72
+ },
73
+ {
74
+ "title": "Supervisory Developments",
75
+ "start_index": 35,
76
+ "end_index": 50,
77
+ "node_id": "0011"
78
+ }
79
+ ],
80
+ "node_id": "0009"
81
+ }
82
+ ]
83
+ }
tests/results/2023-annual-report_structure.json ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "doc_name": "2023-annual-report.pdf",
3
+ "structure": [
4
+ {
5
+ "title": "Preface",
6
+ "start_index": 1,
7
+ "end_index": 4,
8
+ "node_id": "0000"
9
+ },
10
+ {
11
+ "title": "About the Federal Reserve",
12
+ "start_index": 5,
13
+ "end_index": 6,
14
+ "node_id": "0001"
15
+ },
16
+ {
17
+ "title": "Overview",
18
+ "start_index": 7,
19
+ "end_index": 8,
20
+ "node_id": "0002"
21
+ },
22
+ {
23
+ "title": "Monetary Policy and Economic Developments",
24
+ "start_index": 9,
25
+ "end_index": 9,
26
+ "nodes": [
27
+ {
28
+ "title": "March 2024 Summary",
29
+ "start_index": 9,
30
+ "end_index": 14,
31
+ "node_id": "0004"
32
+ },
33
+ {
34
+ "title": "June 2023 Summary",
35
+ "start_index": 15,
36
+ "end_index": 20,
37
+ "node_id": "0005"
38
+ }
39
+ ],
40
+ "node_id": "0003"
41
+ },
42
+ {
43
+ "title": "Financial Stability",
44
+ "start_index": 21,
45
+ "end_index": 21,
46
+ "nodes": [
47
+ {
48
+ "title": "Monitoring Financial Vulnerabilities",
49
+ "start_index": 22,
50
+ "end_index": 28,
51
+ "node_id": "0007"
52
+ },
53
+ {
54
+ "title": "Domestic and International Cooperation and Coordination",
55
+ "start_index": 28,
56
+ "end_index": 31,
57
+ "node_id": "0008"
58
+ }
59
+ ],
60
+ "node_id": "0006"
61
+ },
62
+ {
63
+ "title": "Supervision and Regulation",
64
+ "start_index": 31,
65
+ "end_index": 31,
66
+ "nodes": [
67
+ {
68
+ "title": "Supervised and Regulated Institutions",
69
+ "start_index": 32,
70
+ "end_index": 35,
71
+ "node_id": "0010"
72
+ },
73
+ {
74
+ "title": "Supervisory Developments",
75
+ "start_index": 35,
76
+ "end_index": 54,
77
+ "node_id": "0011"
78
+ },
79
+ {
80
+ "title": "Regulatory Developments",
81
+ "start_index": 55,
82
+ "end_index": 59,
83
+ "node_id": "0012"
84
+ }
85
+ ],
86
+ "node_id": "0009"
87
+ },
88
+ {
89
+ "title": "Payment System and Reserve Bank Oversight",
90
+ "start_index": 59,
91
+ "end_index": 59,
92
+ "nodes": [
93
+ {
94
+ "title": "Payment Services to Depository and Other Institutions",
95
+ "start_index": 60,
96
+ "end_index": 65,
97
+ "node_id": "0014"
98
+ },
99
+ {
100
+ "title": "Currency and Coin",
101
+ "start_index": 66,
102
+ "end_index": 68,
103
+ "node_id": "0015"
104
+ },
105
+ {
106
+ "title": "Fiscal Agency and Government Depository Services",
107
+ "start_index": 69,
108
+ "end_index": 72,
109
+ "node_id": "0016"
110
+ },
111
+ {
112
+ "title": "Evolutions and Improvements to the System",
113
+ "start_index": 72,
114
+ "end_index": 75,
115
+ "node_id": "0017"
116
+ },
117
+ {
118
+ "title": "Oversight of Federal Reserve Banks",
119
+ "start_index": 75,
120
+ "end_index": 81,
121
+ "node_id": "0018"
122
+ },
123
+ {
124
+ "title": "Pro Forma Financial Statements for Federal Reserve Priced Services",
125
+ "start_index": 82,
126
+ "end_index": 88,
127
+ "node_id": "0019"
128
+ }
129
+ ],
130
+ "node_id": "0013"
131
+ },
132
+ {
133
+ "title": "Consumer and Community Affairs",
134
+ "start_index": 89,
135
+ "end_index": 89,
136
+ "nodes": [
137
+ {
138
+ "title": "Consumer Compliance Supervision",
139
+ "start_index": 89,
140
+ "end_index": 101,
141
+ "node_id": "0021"
142
+ },
143
+ {
144
+ "title": "Consumer Laws and Regulations",
145
+ "start_index": 101,
146
+ "end_index": 102,
147
+ "node_id": "0022"
148
+ },
149
+ {
150
+ "title": "Consumer Research and Analysis of Emerging Issues and Policy",
151
+ "start_index": 102,
152
+ "end_index": 105,
153
+ "node_id": "0023"
154
+ },
155
+ {
156
+ "title": "Community Development",
157
+ "start_index": 105,
158
+ "end_index": 106,
159
+ "node_id": "0024"
160
+ }
161
+ ],
162
+ "node_id": "0020"
163
+ },
164
+ {
165
+ "title": "Appendixes",
166
+ "start_index": 107,
167
+ "end_index": 109,
168
+ "node_id": "0025"
169
+ },
170
+ {
171
+ "title": "Federal Reserve System Organization",
172
+ "start_index": 109,
173
+ "end_index": 109,
174
+ "nodes": [
175
+ {
176
+ "title": "Board of Governors",
177
+ "start_index": 109,
178
+ "end_index": 116,
179
+ "node_id": "0027"
180
+ },
181
+ {
182
+ "title": "Federal Open Market Committee",
183
+ "start_index": 117,
184
+ "end_index": 118,
185
+ "node_id": "0028"
186
+ },
187
+ {
188
+ "title": "Board of Governors Advisory Councils",
189
+ "start_index": 119,
190
+ "end_index": 122,
191
+ "node_id": "0029"
192
+ },
193
+ {
194
+ "title": "Federal Reserve Banks and Branches",
195
+ "start_index": 123,
196
+ "end_index": 146,
197
+ "node_id": "0030"
198
+ }
199
+ ],
200
+ "node_id": "0026"
201
+ },
202
+ {
203
+ "title": "Minutes of Federal Open Market Committee Meetings",
204
+ "start_index": 147,
205
+ "end_index": 147,
206
+ "nodes": [
207
+ {
208
+ "title": "Meeting Minutes",
209
+ "start_index": 147,
210
+ "end_index": 149,
211
+ "node_id": "0032"
212
+ }
213
+ ],
214
+ "node_id": "0031"
215
+ },
216
+ {
217
+ "title": "Federal Reserve System Audits",
218
+ "start_index": 149,
219
+ "end_index": 149,
220
+ "nodes": [
221
+ {
222
+ "title": "Office of Inspector General Activities",
223
+ "start_index": 149,
224
+ "end_index": 151,
225
+ "node_id": "0034"
226
+ },
227
+ {
228
+ "title": "Government Accountability Office Reviews",
229
+ "start_index": 151,
230
+ "end_index": 152,
231
+ "node_id": "0035"
232
+ }
233
+ ],
234
+ "node_id": "0033"
235
+ },
236
+ {
237
+ "title": "Federal Reserve System Budgets",
238
+ "start_index": 153,
239
+ "end_index": 153,
240
+ "nodes": [
241
+ {
242
+ "title": "System Budgets Overview",
243
+ "start_index": 153,
244
+ "end_index": 157,
245
+ "node_id": "0037"
246
+ },
247
+ {
248
+ "title": "Board of Governors Budgets",
249
+ "start_index": 157,
250
+ "end_index": 163,
251
+ "node_id": "0038"
252
+ },
253
+ {
254
+ "title": "Federal Reserve Banks Budgets",
255
+ "start_index": 163,
256
+ "end_index": 169,
257
+ "node_id": "0039"
258
+ },
259
+ {
260
+ "title": "Currency Budget",
261
+ "start_index": 169,
262
+ "end_index": 174,
263
+ "node_id": "0040"
264
+ }
265
+ ],
266
+ "node_id": "0036"
267
+ },
268
+ {
269
+ "title": "Record of Policy Actions of the Board of Governors",
270
+ "start_index": 175,
271
+ "end_index": 175,
272
+ "nodes": [
273
+ {
274
+ "title": "Rules and Regulations",
275
+ "start_index": 175,
276
+ "end_index": 176,
277
+ "node_id": "0042"
278
+ },
279
+ {
280
+ "title": "Policy Statements and Other Actions",
281
+ "start_index": 177,
282
+ "end_index": 181,
283
+ "node_id": "0043"
284
+ },
285
+ {
286
+ "title": "Discount Rates for Depository Institutions in 2023",
287
+ "start_index": 181,
288
+ "end_index": 183,
289
+ "node_id": "0044"
290
+ },
291
+ {
292
+ "title": "The Board of Governors and the Government Performance and Results Act",
293
+ "start_index": 184,
294
+ "end_index": 184,
295
+ "node_id": "0045"
296
+ }
297
+ ],
298
+ "node_id": "0041"
299
+ },
300
+ {
301
+ "title": "Litigation",
302
+ "start_index": 185,
303
+ "end_index": 185,
304
+ "nodes": [
305
+ {
306
+ "title": "Pending",
307
+ "start_index": 185,
308
+ "end_index": 186,
309
+ "node_id": "0047"
310
+ },
311
+ {
312
+ "title": "Resolved",
313
+ "start_index": 186,
314
+ "end_index": 187,
315
+ "node_id": "0048"
316
+ }
317
+ ],
318
+ "node_id": "0046"
319
+ },
320
+ {
321
+ "title": "Statistical Tables",
322
+ "start_index": 187,
323
+ "end_index": 187,
324
+ "nodes": [
325
+ {
326
+ "title": "Federal Reserve open market transactions, 2023",
327
+ "start_index": 187,
328
+ "end_index": 187,
329
+ "nodes": [
330
+ {
331
+ "title": "Type of security and transaction",
332
+ "start_index": 187,
333
+ "end_index": 188,
334
+ "node_id": "0051"
335
+ },
336
+ {
337
+ "title": "Federal agency obligations",
338
+ "start_index": 188,
339
+ "end_index": 188,
340
+ "node_id": "0052"
341
+ },
342
+ {
343
+ "title": "Mortgage-backed securities",
344
+ "start_index": 188,
345
+ "end_index": 188,
346
+ "node_id": "0053"
347
+ },
348
+ {
349
+ "title": "Temporary transactions",
350
+ "start_index": 188,
351
+ "end_index": 188,
352
+ "node_id": "0054"
353
+ }
354
+ ],
355
+ "node_id": "0050"
356
+ },
357
+ {
358
+ "title": "Federal Reserve Bank holdings of U.S. Treasury and federal agency securities, December 31, 2021\u201323",
359
+ "start_index": 189,
360
+ "end_index": 189,
361
+ "nodes": [
362
+ {
363
+ "title": "By remaining maturity",
364
+ "start_index": 189,
365
+ "end_index": 189,
366
+ "node_id": "0056"
367
+ },
368
+ {
369
+ "title": "By type",
370
+ "start_index": 189,
371
+ "end_index": 190,
372
+ "node_id": "0057"
373
+ },
374
+ {
375
+ "title": "By issuer",
376
+ "start_index": 190,
377
+ "end_index": 190,
378
+ "node_id": "0058"
379
+ }
380
+ ],
381
+ "node_id": "0055"
382
+ },
383
+ {
384
+ "title": "Reserve requirements of depository institutions, December 31, 2023",
385
+ "start_index": 191,
386
+ "end_index": 191,
387
+ "node_id": "0059"
388
+ },
389
+ {
390
+ "title": "Banking offices and banks affiliated with bank holding companies in the United States, December 31, 2022 and 2023",
391
+ "start_index": 192,
392
+ "end_index": 192,
393
+ "node_id": "0060"
394
+ },
395
+ {
396
+ "title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1984\u20132023 and month-end 2023",
397
+ "start_index": 193,
398
+ "end_index": 196,
399
+ "node_id": "0061"
400
+ },
401
+ {
402
+ "title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1918\u20131983",
403
+ "start_index": 197,
404
+ "end_index": 200,
405
+ "node_id": "0062"
406
+ },
407
+ {
408
+ "title": "Principal assets and liabilities of insured commercial banks, by class of bank, June 30, 2023 and 2022",
409
+ "start_index": 201,
410
+ "end_index": 201,
411
+ "node_id": "0063"
412
+ },
413
+ {
414
+ "title": "Initial margin requirements under Regulations T, U, and X",
415
+ "start_index": 202,
416
+ "end_index": 203,
417
+ "node_id": "0064"
418
+ },
419
+ {
420
+ "title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022",
421
+ "start_index": 203,
422
+ "end_index": 209,
423
+ "node_id": "0065"
424
+ },
425
+ {
426
+ "title": "Statement of condition of the Federal Reserve Banks, December 31, 2023 and 2022",
427
+ "start_index": 209,
428
+ "end_index": 210,
429
+ "node_id": "0066"
430
+ },
431
+ {
432
+ "title": "Income and expenses of the Federal Reserve Banks, by Bank, 2023",
433
+ "start_index": 210,
434
+ "end_index": 212,
435
+ "nodes": [
436
+ {
437
+ "title": "Income and expenses of the Federal Reserve Banks, by Bank, 2023\u2014continued",
438
+ "start_index": 212,
439
+ "end_index": 214,
440
+ "node_id": "0068"
441
+ }
442
+ ],
443
+ "node_id": "0067"
444
+ },
445
+ {
446
+ "title": "Income and expenses of the Federal Reserve Banks, 1914\u20132023",
447
+ "start_index": 214,
448
+ "end_index": 215,
449
+ "nodes": [
450
+ {
451
+ "title": "Income and expenses of the Federal Reserve Banks, 1914\u20132023\u2014continued",
452
+ "start_index": 215,
453
+ "end_index": 216,
454
+ "node_id": "0070"
455
+ },
456
+ {
457
+ "title": "Income and expenses of the Federal Reserve Banks, 1914\u20132023\u2014continued",
458
+ "start_index": 216,
459
+ "end_index": 217,
460
+ "node_id": "0071"
461
+ },
462
+ {
463
+ "title": "Income and expenses of the Federal Reserve Banks, 1914\u20132023\u2014continued",
464
+ "start_index": 217,
465
+ "end_index": 217,
466
+ "node_id": "0072"
467
+ }
468
+ ],
469
+ "node_id": "0069"
470
+ },
471
+ {
472
+ "title": "Operations in principal departments of the Federal Reserve Banks, 2020\u201323",
473
+ "start_index": 218,
474
+ "end_index": 218,
475
+ "node_id": "0073"
476
+ },
477
+ {
478
+ "title": "Number and annual salaries of officers and employees of the Federal Reserve Banks, December 31, 2023",
479
+ "start_index": 219,
480
+ "end_index": 220,
481
+ "node_id": "0074"
482
+ },
483
+ {
484
+ "title": "Acquisition costs and net book value of the premises of the Federal Reserve Banks and Branches, December 31, 2023",
485
+ "start_index": 220,
486
+ "end_index": 222,
487
+ "node_id": "0075"
488
+ }
489
+ ],
490
+ "node_id": "0049"
491
+ }
492
+ ]
493
+ }
tests/results/PRML_structure.json ADDED
@@ -0,0 +1,1847 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "doc_name": "PRML.pdf",
3
+ "structure": [
4
+ {
5
+ "title": "Preface",
6
+ "start_index": 1,
7
+ "end_index": 6,
8
+ "node_id": "0000"
9
+ },
10
+ {
11
+ "title": "Preface",
12
+ "start_index": 7,
13
+ "end_index": 10,
14
+ "node_id": "0001"
15
+ },
16
+ {
17
+ "title": "Mathematical notation",
18
+ "start_index": 11,
19
+ "end_index": 13,
20
+ "node_id": "0002"
21
+ },
22
+ {
23
+ "title": "Contents",
24
+ "start_index": 13,
25
+ "end_index": 20,
26
+ "node_id": "0003"
27
+ },
28
+ {
29
+ "title": "Introduction",
30
+ "start_index": 21,
31
+ "end_index": 24,
32
+ "nodes": [
33
+ {
34
+ "title": "Example: Polynomial Curve Fitting",
35
+ "start_index": 24,
36
+ "end_index": 32,
37
+ "node_id": "0005"
38
+ },
39
+ {
40
+ "title": "Probability Theory",
41
+ "start_index": 32,
42
+ "end_index": 37,
43
+ "nodes": [
44
+ {
45
+ "title": "Probability densities",
46
+ "start_index": 37,
47
+ "end_index": 39,
48
+ "node_id": "0007"
49
+ },
50
+ {
51
+ "title": "Expectations and covariances",
52
+ "start_index": 39,
53
+ "end_index": 41,
54
+ "node_id": "0008"
55
+ },
56
+ {
57
+ "title": "Bayesian probabilities",
58
+ "start_index": 41,
59
+ "end_index": 44,
60
+ "node_id": "0009"
61
+ },
62
+ {
63
+ "title": "The Gaussian distribution",
64
+ "start_index": 44,
65
+ "end_index": 48,
66
+ "node_id": "0010"
67
+ },
68
+ {
69
+ "title": "Curve fitting re-visited",
70
+ "start_index": 48,
71
+ "end_index": 50,
72
+ "node_id": "0011"
73
+ },
74
+ {
75
+ "title": "Bayesian curve fitting",
76
+ "start_index": 50,
77
+ "end_index": 52,
78
+ "node_id": "0012"
79
+ }
80
+ ],
81
+ "node_id": "0006"
82
+ },
83
+ {
84
+ "title": "Model Selection",
85
+ "start_index": 52,
86
+ "end_index": 53,
87
+ "node_id": "0013"
88
+ },
89
+ {
90
+ "title": "The Curse of Dimensionality",
91
+ "start_index": 53,
92
+ "end_index": 58,
93
+ "node_id": "0014"
94
+ },
95
+ {
96
+ "title": "Decision Theory",
97
+ "start_index": 58,
98
+ "end_index": 59,
99
+ "nodes": [
100
+ {
101
+ "title": "Minimizing the misclassification rate",
102
+ "start_index": 59,
103
+ "end_index": 61,
104
+ "node_id": "0016"
105
+ },
106
+ {
107
+ "title": "Minimizing the expected loss",
108
+ "start_index": 61,
109
+ "end_index": 62,
110
+ "node_id": "0017"
111
+ },
112
+ {
113
+ "title": "The reject option",
114
+ "start_index": 62,
115
+ "end_index": 62,
116
+ "node_id": "0018"
117
+ },
118
+ {
119
+ "title": "Inference and decision",
120
+ "start_index": 62,
121
+ "end_index": 66,
122
+ "node_id": "0019"
123
+ },
124
+ {
125
+ "title": "Loss functions for regression",
126
+ "start_index": 66,
127
+ "end_index": 68,
128
+ "node_id": "0020"
129
+ }
130
+ ],
131
+ "node_id": "0015"
132
+ },
133
+ {
134
+ "title": "Information Theory",
135
+ "start_index": 68,
136
+ "end_index": 75,
137
+ "nodes": [
138
+ {
139
+ "title": "Relative entropy and mutual information",
140
+ "start_index": 75,
141
+ "end_index": 78,
142
+ "node_id": "0022"
143
+ }
144
+ ],
145
+ "node_id": "0021"
146
+ }
147
+ ],
148
+ "node_id": "0004"
149
+ },
150
+ {
151
+ "title": "Exercises",
152
+ "start_index": 78,
153
+ "end_index": 87,
154
+ "node_id": "0023"
155
+ },
156
+ {
157
+ "title": "Probability Distributions",
158
+ "start_index": 87,
159
+ "end_index": 88,
160
+ "nodes": [
161
+ {
162
+ "title": "Binary Variables",
163
+ "start_index": 88,
164
+ "end_index": 91,
165
+ "nodes": [
166
+ {
167
+ "title": "The beta distribution",
168
+ "start_index": 91,
169
+ "end_index": 94,
170
+ "node_id": "0026"
171
+ }
172
+ ],
173
+ "node_id": "0025"
174
+ },
175
+ {
176
+ "title": "Multinomial Variables",
177
+ "start_index": 94,
178
+ "end_index": 96,
179
+ "nodes": [
180
+ {
181
+ "title": "The Dirichlet distribution",
182
+ "start_index": 96,
183
+ "end_index": 98,
184
+ "node_id": "0028"
185
+ }
186
+ ],
187
+ "node_id": "0027"
188
+ },
189
+ {
190
+ "title": "The Gaussian Distribution",
191
+ "start_index": 98,
192
+ "end_index": 105,
193
+ "nodes": [
194
+ {
195
+ "title": "Conditional Gaussian distributions",
196
+ "start_index": 105,
197
+ "end_index": 108,
198
+ "node_id": "0030"
199
+ },
200
+ {
201
+ "title": "Marginal Gaussian distributions",
202
+ "start_index": 108,
203
+ "end_index": 110,
204
+ "node_id": "0031"
205
+ },
206
+ {
207
+ "title": "Bayes\u2019 theorem for Gaussian variables",
208
+ "start_index": 110,
209
+ "end_index": 113,
210
+ "node_id": "0032"
211
+ },
212
+ {
213
+ "title": "Maximum likelihood for the Gaussian",
214
+ "start_index": 113,
215
+ "end_index": 114,
216
+ "node_id": "0033"
217
+ },
218
+ {
219
+ "title": "Sequential estimation",
220
+ "start_index": 114,
221
+ "end_index": 117,
222
+ "node_id": "0034"
223
+ },
224
+ {
225
+ "title": "Bayesian inference for the Gaussian",
226
+ "start_index": 117,
227
+ "end_index": 122,
228
+ "node_id": "0035"
229
+ },
230
+ {
231
+ "title": "Student\u2019s t-distribution",
232
+ "start_index": 122,
233
+ "end_index": 125,
234
+ "node_id": "0036"
235
+ },
236
+ {
237
+ "title": "Periodic variables",
238
+ "start_index": 125,
239
+ "end_index": 130,
240
+ "node_id": "0037"
241
+ },
242
+ {
243
+ "title": "Mixtures of Gaussians",
244
+ "start_index": 130,
245
+ "end_index": 133,
246
+ "node_id": "0038"
247
+ }
248
+ ],
249
+ "node_id": "0029"
250
+ },
251
+ {
252
+ "title": "The Exponential Family",
253
+ "start_index": 133,
254
+ "end_index": 136,
255
+ "nodes": [
256
+ {
257
+ "title": "Maximum likelihood and sufficient statistics",
258
+ "start_index": 136,
259
+ "end_index": 137,
260
+ "node_id": "0040"
261
+ },
262
+ {
263
+ "title": "Conjugate priors",
264
+ "start_index": 137,
265
+ "end_index": 137,
266
+ "node_id": "0041"
267
+ },
268
+ {
269
+ "title": "Noninformative priors",
270
+ "start_index": 137,
271
+ "end_index": 140,
272
+ "node_id": "0042"
273
+ }
274
+ ],
275
+ "node_id": "0039"
276
+ },
277
+ {
278
+ "title": "Nonparametric Methods",
279
+ "start_index": 140,
280
+ "end_index": 142,
281
+ "nodes": [
282
+ {
283
+ "title": "Kernel density estimators",
284
+ "start_index": 142,
285
+ "end_index": 144,
286
+ "node_id": "0044"
287
+ },
288
+ {
289
+ "title": "Nearest-neighbour methods",
290
+ "start_index": 144,
291
+ "end_index": 147,
292
+ "node_id": "0045"
293
+ }
294
+ ],
295
+ "node_id": "0043"
296
+ }
297
+ ],
298
+ "node_id": "0024"
299
+ },
300
+ {
301
+ "title": "Exercises",
302
+ "start_index": 147,
303
+ "end_index": 156,
304
+ "node_id": "0046"
305
+ },
306
+ {
307
+ "title": "Linear Models for Regression",
308
+ "start_index": 157,
309
+ "end_index": 158,
310
+ "nodes": [
311
+ {
312
+ "title": "Linear Basis Function Models",
313
+ "start_index": 158,
314
+ "end_index": 160,
315
+ "nodes": [
316
+ {
317
+ "title": "Maximum likelihood and least squares",
318
+ "start_index": 160,
319
+ "end_index": 163,
320
+ "node_id": "0049"
321
+ },
322
+ {
323
+ "title": "Geometry of least squares",
324
+ "start_index": 163,
325
+ "end_index": 163,
326
+ "node_id": "0050"
327
+ },
328
+ {
329
+ "title": "Sequential learning",
330
+ "start_index": 163,
331
+ "end_index": 164,
332
+ "node_id": "0051"
333
+ },
334
+ {
335
+ "title": "Regularized least squares",
336
+ "start_index": 164,
337
+ "end_index": 166,
338
+ "node_id": "0052"
339
+ },
340
+ {
341
+ "title": "Multiple outputs",
342
+ "start_index": 166,
343
+ "end_index": 167,
344
+ "node_id": "0053"
345
+ }
346
+ ],
347
+ "node_id": "0048"
348
+ },
349
+ {
350
+ "title": "The Bias-Variance Decomposition",
351
+ "start_index": 167,
352
+ "end_index": 172,
353
+ "node_id": "0054"
354
+ },
355
+ {
356
+ "title": "Bayesian Linear Regression",
357
+ "start_index": 172,
358
+ "end_index": 172,
359
+ "nodes": [
360
+ {
361
+ "title": "Parameter distribution",
362
+ "start_index": 172,
363
+ "end_index": 176,
364
+ "node_id": "0056"
365
+ },
366
+ {
367
+ "title": "Predictive distribution",
368
+ "start_index": 176,
369
+ "end_index": 179,
370
+ "node_id": "0057"
371
+ },
372
+ {
373
+ "title": "Equivalent kernel",
374
+ "start_index": 179,
375
+ "end_index": 181,
376
+ "node_id": "0058"
377
+ }
378
+ ],
379
+ "node_id": "0055"
380
+ },
381
+ {
382
+ "title": "Bayesian Model Comparison",
383
+ "start_index": 181,
384
+ "end_index": 185,
385
+ "node_id": "0059"
386
+ },
387
+ {
388
+ "title": "The Evidence Approximation",
389
+ "start_index": 185,
390
+ "end_index": 186,
391
+ "nodes": [
392
+ {
393
+ "title": "Evaluation of the evidence function",
394
+ "start_index": 186,
395
+ "end_index": 188,
396
+ "node_id": "0061"
397
+ },
398
+ {
399
+ "title": "Maximizing the evidence function",
400
+ "start_index": 188,
401
+ "end_index": 190,
402
+ "node_id": "0062"
403
+ },
404
+ {
405
+ "title": "Effective number of parameters",
406
+ "start_index": 190,
407
+ "end_index": 192,
408
+ "node_id": "0063"
409
+ }
410
+ ],
411
+ "node_id": "0060"
412
+ },
413
+ {
414
+ "title": "Limitations of Fixed Basis Functions",
415
+ "start_index": 192,
416
+ "end_index": 193,
417
+ "node_id": "0064"
418
+ }
419
+ ],
420
+ "node_id": "0047"
421
+ },
422
+ {
423
+ "title": "Exercises",
424
+ "start_index": 193,
425
+ "end_index": 199,
426
+ "node_id": "0065"
427
+ },
428
+ {
429
+ "title": "Linear Models for Classification",
430
+ "start_index": 199,
431
+ "end_index": 201,
432
+ "nodes": [
433
+ {
434
+ "title": "Discriminant Functions",
435
+ "start_index": 201,
436
+ "end_index": 201,
437
+ "nodes": [
438
+ {
439
+ "title": "Two classes",
440
+ "start_index": 201,
441
+ "end_index": 202,
442
+ "node_id": "0068"
443
+ },
444
+ {
445
+ "title": "Multiple classes",
446
+ "start_index": 202,
447
+ "end_index": 204,
448
+ "node_id": "0069"
449
+ },
450
+ {
451
+ "title": "Least squares for classification",
452
+ "start_index": 204,
453
+ "end_index": 206,
454
+ "node_id": "0070"
455
+ },
456
+ {
457
+ "title": "Fisher\u2019s linear discriminant",
458
+ "start_index": 206,
459
+ "end_index": 209,
460
+ "node_id": "0071"
461
+ },
462
+ {
463
+ "title": "Relation to least squares",
464
+ "start_index": 209,
465
+ "end_index": 211,
466
+ "node_id": "0072"
467
+ },
468
+ {
469
+ "title": "Fisher\u2019s discriminant for multiple classes",
470
+ "start_index": 211,
471
+ "end_index": 212,
472
+ "node_id": "0073"
473
+ },
474
+ {
475
+ "title": "The perceptron algorithm",
476
+ "start_index": 212,
477
+ "end_index": 216,
478
+ "node_id": "0074"
479
+ }
480
+ ],
481
+ "node_id": "0067"
482
+ },
483
+ {
484
+ "title": "Probabilistic Generative Models",
485
+ "start_index": 216,
486
+ "end_index": 218,
487
+ "nodes": [
488
+ {
489
+ "title": "Continuous inputs",
490
+ "start_index": 218,
491
+ "end_index": 220,
492
+ "node_id": "0076"
493
+ },
494
+ {
495
+ "title": "Maximum likelihood solution",
496
+ "start_index": 220,
497
+ "end_index": 222,
498
+ "node_id": "0077"
499
+ },
500
+ {
501
+ "title": "Discrete features",
502
+ "start_index": 222,
503
+ "end_index": 222,
504
+ "node_id": "0078"
505
+ },
506
+ {
507
+ "title": "Exponential family",
508
+ "start_index": 222,
509
+ "end_index": 223,
510
+ "node_id": "0079"
511
+ }
512
+ ],
513
+ "node_id": "0075"
514
+ },
515
+ {
516
+ "title": "Probabilistic Discriminative Models",
517
+ "start_index": 223,
518
+ "end_index": 224,
519
+ "nodes": [
520
+ {
521
+ "title": "Fixed basis functions",
522
+ "start_index": 224,
523
+ "end_index": 225,
524
+ "node_id": "0081"
525
+ },
526
+ {
527
+ "title": "Logistic regression",
528
+ "start_index": 225,
529
+ "end_index": 227,
530
+ "node_id": "0082"
531
+ },
532
+ {
533
+ "title": "Iterative reweighted least squares",
534
+ "start_index": 227,
535
+ "end_index": 229,
536
+ "node_id": "0083"
537
+ },
538
+ {
539
+ "title": "Multiclass logistic regression",
540
+ "start_index": 229,
541
+ "end_index": 230,
542
+ "node_id": "0084"
543
+ },
544
+ {
545
+ "title": "Probit regression",
546
+ "start_index": 230,
547
+ "end_index": 232,
548
+ "node_id": "0085"
549
+ },
550
+ {
551
+ "title": "Canonical link functions",
552
+ "start_index": 232,
553
+ "end_index": 232,
554
+ "node_id": "0086"
555
+ }
556
+ ],
557
+ "node_id": "0080"
558
+ },
559
+ {
560
+ "title": "The Laplace Approximation",
561
+ "start_index": 233,
562
+ "end_index": 236,
563
+ "nodes": [
564
+ {
565
+ "title": "Model comparison and BIC",
566
+ "start_index": 236,
567
+ "end_index": 237,
568
+ "node_id": "0088"
569
+ }
570
+ ],
571
+ "node_id": "0087"
572
+ },
573
+ {
574
+ "title": "Bayesian Logistic Regression",
575
+ "start_index": 237,
576
+ "end_index": 237,
577
+ "nodes": [
578
+ {
579
+ "title": "Laplace approximation",
580
+ "start_index": 237,
581
+ "end_index": 238,
582
+ "node_id": "0090"
583
+ },
584
+ {
585
+ "title": "Predictive distribution",
586
+ "start_index": 238,
587
+ "end_index": 240,
588
+ "node_id": "0091"
589
+ }
590
+ ],
591
+ "node_id": "0089"
592
+ }
593
+ ],
594
+ "node_id": "0066"
595
+ },
596
+ {
597
+ "title": "Exercises",
598
+ "start_index": 240,
599
+ "end_index": 245,
600
+ "node_id": "0092"
601
+ },
602
+ {
603
+ "title": "Neural Networks",
604
+ "start_index": 245,
605
+ "end_index": 247,
606
+ "nodes": [
607
+ {
608
+ "title": "Feed-forward Network Functions",
609
+ "start_index": 247,
610
+ "end_index": 251,
611
+ "nodes": [
612
+ {
613
+ "title": "Weight-space symmetries",
614
+ "start_index": 251,
615
+ "end_index": 252,
616
+ "node_id": "0095"
617
+ }
618
+ ],
619
+ "node_id": "0094"
620
+ },
621
+ {
622
+ "title": "Network Training",
623
+ "start_index": 252,
624
+ "end_index": 256,
625
+ "nodes": [
626
+ {
627
+ "title": "Parameter optimization",
628
+ "start_index": 256,
629
+ "end_index": 257,
630
+ "node_id": "0097"
631
+ },
632
+ {
633
+ "title": "Local quadratic approximation",
634
+ "start_index": 257,
635
+ "end_index": 259,
636
+ "node_id": "0098"
637
+ },
638
+ {
639
+ "title": "Use of gradient information",
640
+ "start_index": 259,
641
+ "end_index": 260,
642
+ "node_id": "0099"
643
+ },
644
+ {
645
+ "title": "Gradient descent optimization",
646
+ "start_index": 260,
647
+ "end_index": 261,
648
+ "node_id": "0100"
649
+ }
650
+ ],
651
+ "node_id": "0096"
652
+ },
653
+ {
654
+ "title": "Error Backpropagation",
655
+ "start_index": 261,
656
+ "end_index": 262,
657
+ "nodes": [
658
+ {
659
+ "title": "Evaluation of error-function derivatives",
660
+ "start_index": 262,
661
+ "end_index": 265,
662
+ "node_id": "0102"
663
+ },
664
+ {
665
+ "title": "A simple example",
666
+ "start_index": 265,
667
+ "end_index": 266,
668
+ "node_id": "0103"
669
+ },
670
+ {
671
+ "title": "Efficiency of backpropagation",
672
+ "start_index": 266,
673
+ "end_index": 267,
674
+ "node_id": "0104"
675
+ },
676
+ {
677
+ "title": "The Jacobian matrix",
678
+ "start_index": 267,
679
+ "end_index": 269,
680
+ "node_id": "0105"
681
+ }
682
+ ],
683
+ "node_id": "0101"
684
+ },
685
+ {
686
+ "title": "The Hessian Matrix",
687
+ "start_index": 269,
688
+ "end_index": 270,
689
+ "nodes": [
690
+ {
691
+ "title": "Diagonal approximation",
692
+ "start_index": 270,
693
+ "end_index": 271,
694
+ "node_id": "0107"
695
+ },
696
+ {
697
+ "title": "Outer product approximation",
698
+ "start_index": 271,
699
+ "end_index": 272,
700
+ "node_id": "0108"
701
+ },
702
+ {
703
+ "title": "Inverse Hessian",
704
+ "start_index": 272,
705
+ "end_index": 272,
706
+ "node_id": "0109"
707
+ },
708
+ {
709
+ "title": "Finite differences",
710
+ "start_index": 272,
711
+ "end_index": 273,
712
+ "node_id": "0110"
713
+ },
714
+ {
715
+ "title": "Exact evaluation of the Hessian",
716
+ "start_index": 273,
717
+ "end_index": 274,
718
+ "node_id": "0111"
719
+ },
720
+ {
721
+ "title": "Fast multiplication by the Hessian",
722
+ "start_index": 274,
723
+ "end_index": 276,
724
+ "node_id": "0112"
725
+ }
726
+ ],
727
+ "node_id": "0106"
728
+ },
729
+ {
730
+ "title": "Regularization in Neural Networks",
731
+ "start_index": 276,
732
+ "end_index": 277,
733
+ "nodes": [
734
+ {
735
+ "title": "Consistent Gaussian priors",
736
+ "start_index": 277,
737
+ "end_index": 279,
738
+ "node_id": "0114"
739
+ },
740
+ {
741
+ "title": "Early stopping",
742
+ "start_index": 279,
743
+ "end_index": 281,
744
+ "node_id": "0115"
745
+ },
746
+ {
747
+ "title": "Invariances",
748
+ "start_index": 281,
749
+ "end_index": 283,
750
+ "node_id": "0116"
751
+ },
752
+ {
753
+ "title": "Tangent propagation",
754
+ "start_index": 283,
755
+ "end_index": 285,
756
+ "node_id": "0117"
757
+ },
758
+ {
759
+ "title": "Training with transformed data",
760
+ "start_index": 285,
761
+ "end_index": 287,
762
+ "node_id": "0118"
763
+ },
764
+ {
765
+ "title": "Convolutional networks",
766
+ "start_index": 287,
767
+ "end_index": 289,
768
+ "node_id": "0119"
769
+ },
770
+ {
771
+ "title": "Soft weight sharing",
772
+ "start_index": 289,
773
+ "end_index": 292,
774
+ "node_id": "0120"
775
+ }
776
+ ],
777
+ "node_id": "0113"
778
+ },
779
+ {
780
+ "title": "Mixture Density Networks",
781
+ "start_index": 292,
782
+ "end_index": 297,
783
+ "node_id": "0121"
784
+ },
785
+ {
786
+ "title": "Bayesian Neural Networks",
787
+ "start_index": 297,
788
+ "end_index": 298,
789
+ "nodes": [
790
+ {
791
+ "title": "Posterior parameter distribution",
792
+ "start_index": 298,
793
+ "end_index": 300,
794
+ "node_id": "0123"
795
+ },
796
+ {
797
+ "title": "Hyperparameter optimization",
798
+ "start_index": 300,
799
+ "end_index": 301,
800
+ "node_id": "0124"
801
+ },
802
+ {
803
+ "title": "Bayesian neural networks for classification",
804
+ "start_index": 301,
805
+ "end_index": 304,
806
+ "node_id": "0125"
807
+ }
808
+ ],
809
+ "node_id": "0122"
810
+ }
811
+ ],
812
+ "node_id": "0093"
813
+ },
814
+ {
815
+ "title": "Exercises",
816
+ "start_index": 304,
817
+ "end_index": 311,
818
+ "node_id": "0126"
819
+ },
820
+ {
821
+ "title": "Kernel Methods",
822
+ "start_index": 311,
823
+ "end_index": 313,
824
+ "nodes": [
825
+ {
826
+ "title": "Dual Representations",
827
+ "start_index": 313,
828
+ "end_index": 314,
829
+ "node_id": "0128"
830
+ },
831
+ {
832
+ "title": "Constructing Kernels",
833
+ "start_index": 314,
834
+ "end_index": 319,
835
+ "node_id": "0129"
836
+ },
837
+ {
838
+ "title": "Radial Basis Function Networks",
839
+ "start_index": 319,
840
+ "end_index": 321,
841
+ "nodes": [
842
+ {
843
+ "title": "Nadaraya-Watson model",
844
+ "start_index": 321,
845
+ "end_index": 323,
846
+ "node_id": "0131"
847
+ }
848
+ ],
849
+ "node_id": "0130"
850
+ },
851
+ {
852
+ "title": "Gaussian Processes",
853
+ "start_index": 323,
854
+ "end_index": 324,
855
+ "nodes": [
856
+ {
857
+ "title": "Linear regression revisited",
858
+ "start_index": 324,
859
+ "end_index": 326,
860
+ "node_id": "0133"
861
+ },
862
+ {
863
+ "title": "Gaussian processes for regression",
864
+ "start_index": 326,
865
+ "end_index": 331,
866
+ "node_id": "0134"
867
+ },
868
+ {
869
+ "title": "Learning the hyperparameters",
870
+ "start_index": 331,
871
+ "end_index": 332,
872
+ "node_id": "0135"
873
+ },
874
+ {
875
+ "title": "Automatic relevance determination",
876
+ "start_index": 332,
877
+ "end_index": 333,
878
+ "node_id": "0136"
879
+ },
880
+ {
881
+ "title": "Gaussian processes for classification",
882
+ "start_index": 333,
883
+ "end_index": 335,
884
+ "node_id": "0137"
885
+ },
886
+ {
887
+ "title": "Laplace approximation",
888
+ "start_index": 335,
889
+ "end_index": 339,
890
+ "node_id": "0138"
891
+ },
892
+ {
893
+ "title": "Connection to neural networks",
894
+ "start_index": 339,
895
+ "end_index": 340,
896
+ "node_id": "0139"
897
+ }
898
+ ],
899
+ "node_id": "0132"
900
+ }
901
+ ],
902
+ "node_id": "0127"
903
+ },
904
+ {
905
+ "title": "Exercises",
906
+ "start_index": 340,
907
+ "end_index": 344,
908
+ "node_id": "0140"
909
+ },
910
+ {
911
+ "title": "Sparse Kernel Machines",
912
+ "start_index": 345,
913
+ "end_index": 346,
914
+ "nodes": [
915
+ {
916
+ "title": "Maximum Margin Classifiers",
917
+ "start_index": 346,
918
+ "end_index": 351,
919
+ "nodes": [
920
+ {
921
+ "title": "Overlapping class distributions",
922
+ "start_index": 351,
923
+ "end_index": 356,
924
+ "node_id": "0143"
925
+ },
926
+ {
927
+ "title": "Relation to logistic regression",
928
+ "start_index": 356,
929
+ "end_index": 358,
930
+ "node_id": "0144"
931
+ },
932
+ {
933
+ "title": "Multiclass SVMs",
934
+ "start_index": 358,
935
+ "end_index": 359,
936
+ "node_id": "0145"
937
+ },
938
+ {
939
+ "title": "SVMs for regression",
940
+ "start_index": 359,
941
+ "end_index": 364,
942
+ "node_id": "0146"
943
+ },
944
+ {
945
+ "title": "Computational learning theory",
946
+ "start_index": 364,
947
+ "end_index": 365,
948
+ "node_id": "0147"
949
+ }
950
+ ],
951
+ "node_id": "0142"
952
+ },
953
+ {
954
+ "title": "Relevance Vector Machines",
955
+ "start_index": 365,
956
+ "end_index": 365,
957
+ "nodes": [
958
+ {
959
+ "title": "RVM for regression",
960
+ "start_index": 365,
961
+ "end_index": 369,
962
+ "node_id": "0149"
963
+ },
964
+ {
965
+ "title": "Analysis of sparsity",
966
+ "start_index": 369,
967
+ "end_index": 373,
968
+ "node_id": "0150"
969
+ },
970
+ {
971
+ "title": "RVM for classification",
972
+ "start_index": 373,
973
+ "end_index": 377,
974
+ "node_id": "0151"
975
+ }
976
+ ],
977
+ "node_id": "0148"
978
+ }
979
+ ],
980
+ "node_id": "0141"
981
+ },
982
+ {
983
+ "title": "Exercises",
984
+ "start_index": 377,
985
+ "end_index": 379,
986
+ "node_id": "0152"
987
+ },
988
+ {
989
+ "title": "Graphical Models",
990
+ "start_index": 379,
991
+ "end_index": 380,
992
+ "nodes": [
993
+ {
994
+ "title": "Bayesian Networks",
995
+ "start_index": 380,
996
+ "end_index": 382,
997
+ "nodes": [
998
+ {
999
+ "title": "Example: Polynomial regression",
1000
+ "start_index": 382,
1001
+ "end_index": 385,
1002
+ "node_id": "0155"
1003
+ },
1004
+ {
1005
+ "title": "Generative models",
1006
+ "start_index": 385,
1007
+ "end_index": 386,
1008
+ "node_id": "0156"
1009
+ },
1010
+ {
1011
+ "title": "Discrete variables",
1012
+ "start_index": 386,
1013
+ "end_index": 390,
1014
+ "node_id": "0157"
1015
+ },
1016
+ {
1017
+ "title": "Linear-Gaussian models",
1018
+ "start_index": 390,
1019
+ "end_index": 392,
1020
+ "node_id": "0158"
1021
+ }
1022
+ ],
1023
+ "node_id": "0154"
1024
+ },
1025
+ {
1026
+ "title": "Conditional Independence",
1027
+ "start_index": 392,
1028
+ "end_index": 393,
1029
+ "nodes": [
1030
+ {
1031
+ "title": "Three example graphs",
1032
+ "start_index": 393,
1033
+ "end_index": 398,
1034
+ "node_id": "0160"
1035
+ },
1036
+ {
1037
+ "title": "D-separation",
1038
+ "start_index": 398,
1039
+ "end_index": 403,
1040
+ "node_id": "0161"
1041
+ }
1042
+ ],
1043
+ "node_id": "0159"
1044
+ },
1045
+ {
1046
+ "title": "Markov Random Fields",
1047
+ "start_index": 403,
1048
+ "end_index": 403,
1049
+ "nodes": [
1050
+ {
1051
+ "title": "Conditional independence properties",
1052
+ "start_index": 403,
1053
+ "end_index": 404,
1054
+ "node_id": "0163"
1055
+ },
1056
+ {
1057
+ "title": "Factorization properties",
1058
+ "start_index": 404,
1059
+ "end_index": 407,
1060
+ "node_id": "0164"
1061
+ },
1062
+ {
1063
+ "title": "Illustration: Image de-noising",
1064
+ "start_index": 407,
1065
+ "end_index": 410,
1066
+ "node_id": "0165"
1067
+ },
1068
+ {
1069
+ "title": "Relation to directed graphs",
1070
+ "start_index": 410,
1071
+ "end_index": 413,
1072
+ "node_id": "0166"
1073
+ }
1074
+ ],
1075
+ "node_id": "0162"
1076
+ },
1077
+ {
1078
+ "title": "Inference in Graphical Models",
1079
+ "start_index": 413,
1080
+ "end_index": 414,
1081
+ "nodes": [
1082
+ {
1083
+ "title": "Inference on a chain",
1084
+ "start_index": 414,
1085
+ "end_index": 418,
1086
+ "node_id": "0168"
1087
+ },
1088
+ {
1089
+ "title": "Trees",
1090
+ "start_index": 418,
1091
+ "end_index": 419,
1092
+ "node_id": "0169"
1093
+ },
1094
+ {
1095
+ "title": "Factor graphs",
1096
+ "start_index": 419,
1097
+ "end_index": 422,
1098
+ "node_id": "0170"
1099
+ },
1100
+ {
1101
+ "title": "The sum-product algorithm",
1102
+ "start_index": 422,
1103
+ "end_index": 431,
1104
+ "node_id": "0171"
1105
+ },
1106
+ {
1107
+ "title": "The max-sum algorithm",
1108
+ "start_index": 431,
1109
+ "end_index": 436,
1110
+ "node_id": "0172"
1111
+ },
1112
+ {
1113
+ "title": "Exact inference in general graphs",
1114
+ "start_index": 436,
1115
+ "end_index": 437,
1116
+ "node_id": "0173"
1117
+ },
1118
+ {
1119
+ "title": "Loopy belief propagation",
1120
+ "start_index": 437,
1121
+ "end_index": 438,
1122
+ "node_id": "0174"
1123
+ },
1124
+ {
1125
+ "title": "Learning the graph structure",
1126
+ "start_index": 438,
1127
+ "end_index": 438,
1128
+ "node_id": "0175"
1129
+ }
1130
+ ],
1131
+ "node_id": "0167"
1132
+ }
1133
+ ],
1134
+ "node_id": "0153"
1135
+ },
1136
+ {
1137
+ "title": "Exercises",
1138
+ "start_index": 438,
1139
+ "end_index": 443,
1140
+ "node_id": "0176"
1141
+ },
1142
+ {
1143
+ "title": "Mixture Models and EM",
1144
+ "start_index": 443,
1145
+ "end_index": 444,
1146
+ "nodes": [
1147
+ {
1148
+ "title": "K-means Clustering",
1149
+ "start_index": 444,
1150
+ "end_index": 448,
1151
+ "nodes": [
1152
+ {
1153
+ "title": "Image segmentation and compression",
1154
+ "start_index": 448,
1155
+ "end_index": 450,
1156
+ "node_id": "0179"
1157
+ }
1158
+ ],
1159
+ "node_id": "0178"
1160
+ },
1161
+ {
1162
+ "title": "Mixtures of Gaussians",
1163
+ "start_index": 450,
1164
+ "end_index": 452,
1165
+ "nodes": [
1166
+ {
1167
+ "title": "Maximum likelihood",
1168
+ "start_index": 452,
1169
+ "end_index": 455,
1170
+ "node_id": "0181"
1171
+ },
1172
+ {
1173
+ "title": "EM for Gaussian mixtures",
1174
+ "start_index": 455,
1175
+ "end_index": 459,
1176
+ "node_id": "0182"
1177
+ }
1178
+ ],
1179
+ "node_id": "0180"
1180
+ },
1181
+ {
1182
+ "title": "An Alternative View of EM",
1183
+ "start_index": 459,
1184
+ "end_index": 461,
1185
+ "nodes": [
1186
+ {
1187
+ "title": "Gaussian mixtures revisited",
1188
+ "start_index": 461,
1189
+ "end_index": 463,
1190
+ "node_id": "0184"
1191
+ },
1192
+ {
1193
+ "title": "Relation to K-means",
1194
+ "start_index": 463,
1195
+ "end_index": 464,
1196
+ "node_id": "0185"
1197
+ },
1198
+ {
1199
+ "title": "Mixtures of Bernoulli distributions",
1200
+ "start_index": 464,
1201
+ "end_index": 468,
1202
+ "node_id": "0186"
1203
+ },
1204
+ {
1205
+ "title": "EM for Bayesian linear regression",
1206
+ "start_index": 468,
1207
+ "end_index": 470,
1208
+ "node_id": "0187"
1209
+ }
1210
+ ],
1211
+ "node_id": "0183"
1212
+ },
1213
+ {
1214
+ "title": "The EM Algorithm in General",
1215
+ "start_index": 470,
1216
+ "end_index": 475,
1217
+ "node_id": "0188"
1218
+ }
1219
+ ],
1220
+ "node_id": "0177"
1221
+ },
1222
+ {
1223
+ "title": "Exercises",
1224
+ "start_index": 475,
1225
+ "end_index": 480,
1226
+ "node_id": "0189"
1227
+ },
1228
+ {
1229
+ "title": "Approximate Inference",
1230
+ "start_index": 481,
1231
+ "end_index": 482,
1232
+ "nodes": [
1233
+ {
1234
+ "title": "Variational Inference",
1235
+ "start_index": 482,
1236
+ "end_index": 484,
1237
+ "nodes": [
1238
+ {
1239
+ "title": "Factorized distributions",
1240
+ "start_index": 484,
1241
+ "end_index": 486,
1242
+ "node_id": "0192"
1243
+ },
1244
+ {
1245
+ "title": "Properties of factorized approximations",
1246
+ "start_index": 486,
1247
+ "end_index": 490,
1248
+ "node_id": "0193"
1249
+ },
1250
+ {
1251
+ "title": "Example: The univariate Gaussian",
1252
+ "start_index": 490,
1253
+ "end_index": 493,
1254
+ "node_id": "0194"
1255
+ },
1256
+ {
1257
+ "title": "Model comparison",
1258
+ "start_index": 493,
1259
+ "end_index": 494,
1260
+ "node_id": "0195"
1261
+ }
1262
+ ],
1263
+ "node_id": "0191"
1264
+ },
1265
+ {
1266
+ "title": "Illustration: Variational Mixture of Gaussians",
1267
+ "start_index": 494,
1268
+ "end_index": 495,
1269
+ "nodes": [
1270
+ {
1271
+ "title": "Variational distribution",
1272
+ "start_index": 495,
1273
+ "end_index": 501,
1274
+ "node_id": "0197"
1275
+ },
1276
+ {
1277
+ "title": "Variational lower bound",
1278
+ "start_index": 501,
1279
+ "end_index": 502,
1280
+ "node_id": "0198"
1281
+ },
1282
+ {
1283
+ "title": "Predictive density",
1284
+ "start_index": 502,
1285
+ "end_index": 503,
1286
+ "node_id": "0199"
1287
+ },
1288
+ {
1289
+ "title": "Determining the number of components",
1290
+ "start_index": 503,
1291
+ "end_index": 505,
1292
+ "node_id": "0200"
1293
+ },
1294
+ {
1295
+ "title": "Induced factorizations",
1296
+ "start_index": 505,
1297
+ "end_index": 506,
1298
+ "node_id": "0201"
1299
+ }
1300
+ ],
1301
+ "node_id": "0196"
1302
+ },
1303
+ {
1304
+ "title": "Variational Linear Regression",
1305
+ "start_index": 506,
1306
+ "end_index": 506,
1307
+ "nodes": [
1308
+ {
1309
+ "title": "Variational distribution",
1310
+ "start_index": 506,
1311
+ "end_index": 508,
1312
+ "node_id": "0203"
1313
+ },
1314
+ {
1315
+ "title": "Predictive distribution",
1316
+ "start_index": 508,
1317
+ "end_index": 509,
1318
+ "node_id": "0204"
1319
+ },
1320
+ {
1321
+ "title": "Lower bound",
1322
+ "start_index": 509,
1323
+ "end_index": 510,
1324
+ "node_id": "0205"
1325
+ }
1326
+ ],
1327
+ "node_id": "0202"
1328
+ },
1329
+ {
1330
+ "title": "Exponential Family Distributions",
1331
+ "start_index": 510,
1332
+ "end_index": 511,
1333
+ "nodes": [
1334
+ {
1335
+ "title": "Variational message passing",
1336
+ "start_index": 511,
1337
+ "end_index": 512,
1338
+ "node_id": "0207"
1339
+ }
1340
+ ],
1341
+ "node_id": "0206"
1342
+ },
1343
+ {
1344
+ "title": "Local Variational Methods",
1345
+ "start_index": 513,
1346
+ "end_index": 518,
1347
+ "node_id": "0208"
1348
+ },
1349
+ {
1350
+ "title": "Variational Logistic Regression",
1351
+ "start_index": 518,
1352
+ "end_index": 518,
1353
+ "nodes": [
1354
+ {
1355
+ "title": "Variational posterior distribution",
1356
+ "start_index": 518,
1357
+ "end_index": 520,
1358
+ "node_id": "0210"
1359
+ },
1360
+ {
1361
+ "title": "Optimizing the variational parameters",
1362
+ "start_index": 520,
1363
+ "end_index": 522,
1364
+ "node_id": "0211"
1365
+ },
1366
+ {
1367
+ "title": "Inference of hyperparameters",
1368
+ "start_index": 522,
1369
+ "end_index": 525,
1370
+ "node_id": "0212"
1371
+ }
1372
+ ],
1373
+ "node_id": "0209"
1374
+ },
1375
+ {
1376
+ "title": "Expectation Propagation",
1377
+ "start_index": 525,
1378
+ "end_index": 531,
1379
+ "nodes": [
1380
+ {
1381
+ "title": "Example: The clutter problem",
1382
+ "start_index": 531,
1383
+ "end_index": 533,
1384
+ "node_id": "0214"
1385
+ },
1386
+ {
1387
+ "title": "Expectation propagation on graphs",
1388
+ "start_index": 533,
1389
+ "end_index": 537,
1390
+ "node_id": "0215"
1391
+ }
1392
+ ],
1393
+ "node_id": "0213"
1394
+ }
1395
+ ],
1396
+ "node_id": "0190"
1397
+ },
1398
+ {
1399
+ "title": "Exercises",
1400
+ "start_index": 537,
1401
+ "end_index": 542,
1402
+ "node_id": "0216"
1403
+ },
1404
+ {
1405
+ "title": "Sampling Methods",
1406
+ "start_index": 543,
1407
+ "end_index": 546,
1408
+ "nodes": [
1409
+ {
1410
+ "title": "Basic Sampling Algorithms",
1411
+ "start_index": 546,
1412
+ "end_index": 546,
1413
+ "nodes": [
1414
+ {
1415
+ "title": "Standard distributions",
1416
+ "start_index": 546,
1417
+ "end_index": 548,
1418
+ "node_id": "0219"
1419
+ },
1420
+ {
1421
+ "title": "Rejection sampling",
1422
+ "start_index": 548,
1423
+ "end_index": 550,
1424
+ "node_id": "0220"
1425
+ },
1426
+ {
1427
+ "title": "Adaptive rejection sampling",
1428
+ "start_index": 550,
1429
+ "end_index": 552,
1430
+ "node_id": "0221"
1431
+ },
1432
+ {
1433
+ "title": "Importance sampling",
1434
+ "start_index": 552,
1435
+ "end_index": 554,
1436
+ "node_id": "0222"
1437
+ },
1438
+ {
1439
+ "title": "Sampling-importance-resampling",
1440
+ "start_index": 554,
1441
+ "end_index": 556,
1442
+ "node_id": "0223"
1443
+ },
1444
+ {
1445
+ "title": "Sampling and the EM algorithm",
1446
+ "start_index": 556,
1447
+ "end_index": 556,
1448
+ "node_id": "0224"
1449
+ }
1450
+ ],
1451
+ "node_id": "0218"
1452
+ },
1453
+ {
1454
+ "title": "Markov Chain Monte Carlo",
1455
+ "start_index": 557,
1456
+ "end_index": 559,
1457
+ "nodes": [
1458
+ {
1459
+ "title": "Markov chains",
1460
+ "start_index": 559,
1461
+ "end_index": 561,
1462
+ "node_id": "0226"
1463
+ },
1464
+ {
1465
+ "title": "The Metropolis-Hastings algorithm",
1466
+ "start_index": 561,
1467
+ "end_index": 562,
1468
+ "node_id": "0227"
1469
+ }
1470
+ ],
1471
+ "node_id": "0225"
1472
+ },
1473
+ {
1474
+ "title": "Gibbs Sampling",
1475
+ "start_index": 562,
1476
+ "end_index": 566,
1477
+ "node_id": "0228"
1478
+ },
1479
+ {
1480
+ "title": "Slice Sampling",
1481
+ "start_index": 566,
1482
+ "end_index": 568,
1483
+ "node_id": "0229"
1484
+ },
1485
+ {
1486
+ "title": "The Hybrid Monte Carlo Algorithm",
1487
+ "start_index": 568,
1488
+ "end_index": 568,
1489
+ "nodes": [
1490
+ {
1491
+ "title": "Dynamical systems",
1492
+ "start_index": 568,
1493
+ "end_index": 572,
1494
+ "node_id": "0231"
1495
+ },
1496
+ {
1497
+ "title": "Hybrid Monte Carlo",
1498
+ "start_index": 572,
1499
+ "end_index": 574,
1500
+ "node_id": "0232"
1501
+ }
1502
+ ],
1503
+ "node_id": "0230"
1504
+ },
1505
+ {
1506
+ "title": "Estimating the Partition Function",
1507
+ "start_index": 574,
1508
+ "end_index": 576,
1509
+ "node_id": "0233"
1510
+ }
1511
+ ],
1512
+ "node_id": "0217"
1513
+ },
1514
+ {
1515
+ "title": "Exercises",
1516
+ "start_index": 576,
1517
+ "end_index": 579,
1518
+ "node_id": "0234"
1519
+ },
1520
+ {
1521
+ "title": "Continuous Latent Variables",
1522
+ "start_index": 579,
1523
+ "end_index": 581,
1524
+ "nodes": [
1525
+ {
1526
+ "title": "Principal Component Analysis",
1527
+ "start_index": 581,
1528
+ "end_index": 581,
1529
+ "nodes": [
1530
+ {
1531
+ "title": "Maximum variance formulation",
1532
+ "start_index": 581,
1533
+ "end_index": 583,
1534
+ "node_id": "0237"
1535
+ },
1536
+ {
1537
+ "title": "Minimum-error formulation",
1538
+ "start_index": 583,
1539
+ "end_index": 585,
1540
+ "node_id": "0238"
1541
+ },
1542
+ {
1543
+ "title": "Applications of PCA",
1544
+ "start_index": 585,
1545
+ "end_index": 589,
1546
+ "node_id": "0239"
1547
+ },
1548
+ {
1549
+ "title": "PCA for high-dimensional data",
1550
+ "start_index": 589,
1551
+ "end_index": 590,
1552
+ "node_id": "0240"
1553
+ }
1554
+ ],
1555
+ "node_id": "0236"
1556
+ },
1557
+ {
1558
+ "title": "Probabilistic PCA",
1559
+ "start_index": 590,
1560
+ "end_index": 594,
1561
+ "nodes": [
1562
+ {
1563
+ "title": "Maximum likelihood PCA",
1564
+ "start_index": 594,
1565
+ "end_index": 597,
1566
+ "node_id": "0242"
1567
+ },
1568
+ {
1569
+ "title": "EM algorithm for PCA",
1570
+ "start_index": 597,
1571
+ "end_index": 600,
1572
+ "node_id": "0243"
1573
+ },
1574
+ {
1575
+ "title": "Bayesian PCA",
1576
+ "start_index": 600,
1577
+ "end_index": 603,
1578
+ "node_id": "0244"
1579
+ },
1580
+ {
1581
+ "title": "Factor analysis",
1582
+ "start_index": 603,
1583
+ "end_index": 606,
1584
+ "node_id": "0245"
1585
+ }
1586
+ ],
1587
+ "node_id": "0241"
1588
+ },
1589
+ {
1590
+ "title": "Kernel PCA",
1591
+ "start_index": 606,
1592
+ "end_index": 610,
1593
+ "node_id": "0246"
1594
+ },
1595
+ {
1596
+ "title": "Nonlinear Latent Variable Models",
1597
+ "start_index": 611,
1598
+ "end_index": 611,
1599
+ "nodes": [
1600
+ {
1601
+ "title": "Independent component analysis",
1602
+ "start_index": 611,
1603
+ "end_index": 612,
1604
+ "node_id": "0248"
1605
+ },
1606
+ {
1607
+ "title": "Autoassociative neural networks",
1608
+ "start_index": 612,
1609
+ "end_index": 615,
1610
+ "node_id": "0249"
1611
+ },
1612
+ {
1613
+ "title": "Modelling nonlinear manifolds",
1614
+ "start_index": 615,
1615
+ "end_index": 619,
1616
+ "node_id": "0250"
1617
+ }
1618
+ ],
1619
+ "node_id": "0247"
1620
+ }
1621
+ ],
1622
+ "node_id": "0235"
1623
+ },
1624
+ {
1625
+ "title": "Exercises",
1626
+ "start_index": 619,
1627
+ "end_index": 624,
1628
+ "node_id": "0251"
1629
+ },
1630
+ {
1631
+ "title": "Sequential Data",
1632
+ "start_index": 625,
1633
+ "end_index": 627,
1634
+ "nodes": [
1635
+ {
1636
+ "title": "Markov Models",
1637
+ "start_index": 627,
1638
+ "end_index": 630,
1639
+ "node_id": "0253"
1640
+ },
1641
+ {
1642
+ "title": "Hidden Markov Models",
1643
+ "start_index": 630,
1644
+ "end_index": 635,
1645
+ "nodes": [
1646
+ {
1647
+ "title": "Maximum likelihood for the HMM",
1648
+ "start_index": 635,
1649
+ "end_index": 638,
1650
+ "node_id": "0255"
1651
+ },
1652
+ {
1653
+ "title": "The forward-backward algorithm",
1654
+ "start_index": 638,
1655
+ "end_index": 645,
1656
+ "node_id": "0256"
1657
+ },
1658
+ {
1659
+ "title": "The sum-product algorithm for the HMM",
1660
+ "start_index": 645,
1661
+ "end_index": 647,
1662
+ "node_id": "0257"
1663
+ },
1664
+ {
1665
+ "title": "Scaling factors",
1666
+ "start_index": 647,
1667
+ "end_index": 649,
1668
+ "node_id": "0258"
1669
+ },
1670
+ {
1671
+ "title": "The Viterbi algorithm",
1672
+ "start_index": 649,
1673
+ "end_index": 651,
1674
+ "node_id": "0259"
1675
+ },
1676
+ {
1677
+ "title": "Extensions of the hidden Markov model",
1678
+ "start_index": 651,
1679
+ "end_index": 655,
1680
+ "node_id": "0260"
1681
+ }
1682
+ ],
1683
+ "node_id": "0254"
1684
+ },
1685
+ {
1686
+ "title": "Linear Dynamical Systems",
1687
+ "start_index": 655,
1688
+ "end_index": 658,
1689
+ "nodes": [
1690
+ {
1691
+ "title": "Inference in LDS",
1692
+ "start_index": 658,
1693
+ "end_index": 662,
1694
+ "node_id": "0262"
1695
+ },
1696
+ {
1697
+ "title": "Learning in LDS",
1698
+ "start_index": 662,
1699
+ "end_index": 664,
1700
+ "node_id": "0263"
1701
+ },
1702
+ {
1703
+ "title": "Extensions of LDS",
1704
+ "start_index": 664,
1705
+ "end_index": 665,
1706
+ "node_id": "0264"
1707
+ },
1708
+ {
1709
+ "title": "Particle filters",
1710
+ "start_index": 665,
1711
+ "end_index": 666,
1712
+ "node_id": "0265"
1713
+ }
1714
+ ],
1715
+ "node_id": "0261"
1716
+ }
1717
+ ],
1718
+ "node_id": "0252"
1719
+ },
1720
+ {
1721
+ "title": "Exercises",
1722
+ "start_index": 666,
1723
+ "end_index": 672,
1724
+ "node_id": "0266"
1725
+ },
1726
+ {
1727
+ "title": "Combining Models",
1728
+ "start_index": 673,
1729
+ "end_index": 674,
1730
+ "nodes": [
1731
+ {
1732
+ "title": "Bayesian Model Averaging",
1733
+ "start_index": 674,
1734
+ "end_index": 675,
1735
+ "node_id": "0268"
1736
+ },
1737
+ {
1738
+ "title": "Committees",
1739
+ "start_index": 675,
1740
+ "end_index": 677,
1741
+ "node_id": "0269"
1742
+ },
1743
+ {
1744
+ "title": "Boosting",
1745
+ "start_index": 677,
1746
+ "end_index": 679,
1747
+ "nodes": [
1748
+ {
1749
+ "title": "Minimizing exponential error",
1750
+ "start_index": 679,
1751
+ "end_index": 681,
1752
+ "node_id": "0271"
1753
+ },
1754
+ {
1755
+ "title": "Error functions for boosting",
1756
+ "start_index": 681,
1757
+ "end_index": 683,
1758
+ "node_id": "0272"
1759
+ }
1760
+ ],
1761
+ "node_id": "0270"
1762
+ },
1763
+ {
1764
+ "title": "Tree-based Models",
1765
+ "start_index": 683,
1766
+ "end_index": 686,
1767
+ "node_id": "0273"
1768
+ },
1769
+ {
1770
+ "title": "Conditional Mixture Models",
1771
+ "start_index": 686,
1772
+ "end_index": 687,
1773
+ "nodes": [
1774
+ {
1775
+ "title": "Mixtures of linear regression models",
1776
+ "start_index": 687,
1777
+ "end_index": 690,
1778
+ "node_id": "0275"
1779
+ },
1780
+ {
1781
+ "title": "Mixtures of logistic models",
1782
+ "start_index": 690,
1783
+ "end_index": 692,
1784
+ "node_id": "0276"
1785
+ },
1786
+ {
1787
+ "title": "Mixtures of experts",
1788
+ "start_index": 692,
1789
+ "end_index": 694,
1790
+ "node_id": "0277"
1791
+ }
1792
+ ],
1793
+ "node_id": "0274"
1794
+ }
1795
+ ],
1796
+ "node_id": "0267"
1797
+ },
1798
+ {
1799
+ "title": "Exercises",
1800
+ "start_index": 694,
1801
+ "end_index": 696,
1802
+ "node_id": "0278"
1803
+ },
1804
+ {
1805
+ "title": "Appendix A Data Sets",
1806
+ "start_index": 697,
1807
+ "end_index": 704,
1808
+ "node_id": "0279"
1809
+ },
1810
+ {
1811
+ "title": "Appendix B Probability Distributions",
1812
+ "start_index": 705,
1813
+ "end_index": 714,
1814
+ "node_id": "0280"
1815
+ },
1816
+ {
1817
+ "title": "Appendix C Properties of Matrices",
1818
+ "start_index": 715,
1819
+ "end_index": 722,
1820
+ "node_id": "0281"
1821
+ },
1822
+ {
1823
+ "title": "Appendix D Calculus of Variations",
1824
+ "start_index": 723,
1825
+ "end_index": 726,
1826
+ "node_id": "0282"
1827
+ },
1828
+ {
1829
+ "title": "Appendix E Lagrange Multipliers",
1830
+ "start_index": 727,
1831
+ "end_index": 730,
1832
+ "node_id": "0283"
1833
+ },
1834
+ {
1835
+ "title": "References",
1836
+ "start_index": 731,
1837
+ "end_index": 749,
1838
+ "node_id": "0284"
1839
+ },
1840
+ {
1841
+ "title": "Index",
1842
+ "start_index": 749,
1843
+ "end_index": 758,
1844
+ "node_id": "0285"
1845
+ }
1846
+ ]
1847
+ }
tests/results/Regulation Best Interest_Interpretive release_structure.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "doc_name": "Regulation Best Interest_Interpretive release.pdf",
3
+ "doc_description": "A detailed analysis of the SEC's interpretation of the \"solely incidental\" prong of the broker-dealer exclusion under the Investment Advisers Act of 1940, including its historical context, application guidance, economic implications, and regulatory considerations.",
4
+ "structure": [
5
+ {
6
+ "title": "Preface",
7
+ "start_index": 1,
8
+ "end_index": 2,
9
+ "node_id": "0000",
10
+ "summary": "The partial document outlines an interpretation by the Securities and Exchange Commission (SEC) regarding the \"solely incidental\" prong of the broker-dealer exclusion under the Investment Advisers Act of 1940. It clarifies that brokers or dealers providing advisory services that are incidental to their primary business and for which they receive no special compensation are excluded from the definition of \"investment adviser\" under the Act. The document includes a historical and legislative context, the scope of the \"solely incidental\" prong, guidance on its application, and economic considerations related to the interpretation. It also provides contact information for further inquiries and specifies the effective date of the interpretation as July 12, 2019."
11
+ },
12
+ {
13
+ "title": "Introduction",
14
+ "start_index": 2,
15
+ "end_index": 6,
16
+ "node_id": "0001",
17
+ "summary": "The partial document discusses the regulation of investment advisers under the Advisers Act, specifically focusing on the \"broker-dealer exclusion,\" which exempts brokers and dealers from being classified as investment advisers under certain conditions. Key points include:\n\n1. **Introduction to the Advisers Act**: Overview of the regulation of investment advisers and the broker-dealer exclusion, which applies when advisory services are \"solely incidental\" to brokerage business and no special compensation is received.\n\n2. **Historical Context and Legislative History**: Examination of the historical practices of broker-dealers providing investment advice, distinguishing between auxiliary advice as part of brokerage services and separate advisory services.\n\n3. **Interpretation of the Solely Incidental Prong**: Clarification of the \"solely incidental\" condition of the broker-dealer exclusion, including its application to activities like investment discretion and account monitoring.\n\n4. **Economic Considerations**: Discussion of the potential economic effects of the interpretation and application of the broker-dealer exclusion.\n\n5. **Regulatory Developments**: Reference to the Commission's 2018 proposals, including Regulation Best Interest (Reg. BI), the Proposed Fiduciary Interpretation, and the Relationship Summary Proposal, aimed at enhancing standards of conduct and investor understanding.\n\n6. **Public Comments and Feedback**: Summary of public comments on the scope and interpretation of the broker-dealer exclusion, highlighting disagreements and requests for clarification on the \"solely incidental\" prong.\n\n7. **Adoption of Interpretation**: The Commission's adoption of an interpretation to confirm and clarify its position on the \"solely incidental\" prong, complementing related rules and forms to improve investor understanding of broker-dealer and adviser relationships."
18
+ },
19
+ {
20
+ "title": "Interpretation and Application",
21
+ "start_index": 6,
22
+ "end_index": 8,
23
+ "nodes": [
24
+ {
25
+ "title": "Historical Context and Legislative History",
26
+ "start_index": 8,
27
+ "end_index": 10,
28
+ "node_id": "0003",
29
+ "summary": "The partial document discusses the historical context and legislative development of the Investment Advisers Act of 1940. It highlights the findings of a congressional study conducted by the SEC between 1935 and 1939, which identified issues with distinguishing legitimate investment counselors from unregulated \"tipster\" organizations and problems in the organization and operation of investment counsel institutions. The document explains how these findings led to the passage of the Advisers Act, which broadly defined \"investment adviser\" and established regulatory oversight for those providing investment advice for compensation. It also addresses the exclusion of certain professionals, such as broker-dealers, from the definition of \"investment adviser\" if their advice is incidental to their primary business and not specially compensated. Additionally, the document explores the scope of the \"solely incidental\" prong of the broker-dealer exclusion, referencing interpretations and rules by the SEC, including a 2005 rule regarding fee-based brokerage accounts."
30
+ },
31
+ {
32
+ "title": "Scope of the Solely Incidental Prong of the Broker-Dealer Exclusion",
33
+ "start_index": 10,
34
+ "end_index": 14,
35
+ "node_id": "0004",
36
+ "summary": "The partial document discusses the \"broker-dealer exclusion\" under the Investment Advisers Act, specifically focusing on the \"solely incidental\" prong. It examines the scope of this exclusion, emphasizing that investment advice provided by broker-dealers is considered \"solely incidental\" if it is connected to and reasonably related to their primary business of effecting securities transactions. The document references historical interpretations, court rulings (e.g., Financial Planning Association v. SEC and Thomas v. Metropolitan Life Insurance Company), and legislative history to clarify this standard. It highlights that the frequency or importance of advice does not determine whether it meets the \"solely incidental\" standard, but rather its relationship to the broker-dealer's primary business. The document also provides guidance on applying this interpretation to specific practices, such as exercising investment discretion and account monitoring, noting that certain discretionary activities may fall outside the scope of the exclusion."
37
+ },
38
+ {
39
+ "title": "Guidance on Applying the Interpretation of the Solely Incidental Prong",
40
+ "start_index": 14,
41
+ "end_index": 22,
42
+ "node_id": "0005",
43
+ "summary": "The partial document provides guidance on the application of the \"solely incidental\" prong of the broker-dealer exclusion under the Advisers Act. It focuses on two key areas: (1) the exercise of investment discretion by broker-dealers over customer accounts and (2) account monitoring. The document discusses the Commission's interpretation that unlimited investment discretion is not \"solely incidental\" to a broker-dealer's business, as it indicates a primarily advisory relationship. However, temporary or limited discretion in specific scenarios (e.g., cash management, tax-loss sales, or margin requirements) may be consistent with the \"solely incidental\" prong. It also addresses account monitoring, stating that agreed-upon periodic monitoring for buy, sell, or hold recommendations may align with the broker-dealer exclusion, while continuous monitoring or advisory-like services would not. The document includes examples, refinements to prior interpretations, and considerations for broker-dealers to adopt policies ensuring compliance. It concludes with economic considerations, highlighting the potential impact on broker-dealers, customers, and the financial advice market."
44
+ }
45
+ ],
46
+ "node_id": "0002",
47
+ "summary": "The partial document discusses the historical context and legislative history of the Advisers Act of 1940, focusing on the roles of broker-dealers in providing investment advice. It highlights two distinct ways broker-dealers offered advice: as part of traditional brokerage services with fixed commissions and as separate advisory services for a fee. The document examines the concept of \"brokerage house advice,\" detailing the types of information and services provided, such as market analyses, tax information, and investment recommendations. It also references a congressional study conducted between 1935 and 1939, which identified issues with distinguishing legitimate investment counselors from \"tipster\" organizations and problems in the organization and operation of investment counsel institutions. These findings led to the enactment of the Advisers Act, which broadly defined \"investment adviser\" to regulate those providing investment advice for compensation. The document also references various reports, hearings, and literature that informed the development of the Act."
48
+ },
49
+ {
50
+ "title": "Economic Considerations",
51
+ "start_index": 22,
52
+ "end_index": 22,
53
+ "nodes": [
54
+ {
55
+ "title": "Background",
56
+ "start_index": 22,
57
+ "end_index": 23,
58
+ "node_id": "0007",
59
+ "summary": "The partial document discusses the U.S. Securities and Exchange Commission's (SEC) interpretation of the \"solely incidental\" prong of the broker-dealer exclusion, clarifying its understanding without creating new legal obligations. It examines the potential economic effects of this interpretation on broker-dealers, their associated persons, customers, and the broader financial advice market. The document provides background data on broker-dealers, including their assets, customer accounts, and dual registration as investment advisers. It highlights compliance costs for broker-dealers to align with the interpretation and notes the limited circumstances under which broker-dealers exercise temporary or limited investment discretion. The document also references the lack of data received during the Reg. BI Proposal to analyze the economic impact further."
60
+ },
61
+ {
62
+ "title": "Potential Economic Effects",
63
+ "start_index": 23,
64
+ "end_index": 28,
65
+ "node_id": "0008",
66
+ "summary": "The partial document discusses the economic effects and regulatory implications of the SEC's interpretation of the \"solely incidental\" prong of the broker-dealer exclusion from the definition of an investment adviser. Key points include:\n\n1. **Compliance Costs**: Broker-dealers currently incur costs to align their practices with the \"solely incidental\" prong, and the interpretation may lead to additional costs for evaluating and adjusting practices.\n\n2. **Impact on Broker-Dealer Practices**: Broker-dealers providing advisory services beyond the scope of the interpretation may need to adjust their practices, potentially resulting in reduced services, loss of customers, or a shift to advisory accounts.\n\n3. **Market Effects**: The interpretation could lead to decreased competition, increased fees, and a diminished number of broker-dealers offering commission-based services. It may also shift demand from broker-dealers to investment advisers.\n\n4. **Regulatory Adjustments**: Broker-dealers may choose to register as investment advisers, incurring new compliance costs, or migrate customers to advisory accounts of affiliates.\n\n5. **Potential Benefits**: Some broker-dealers may expand limited discretionary services or monitoring activities, benefiting investors with more efficient access to these services.\n\n6. **Regulatory Arbitrage Risks**: The interpretation raises concerns about regulatory arbitrage, though these risks may be mitigated by enhanced standards of conduct for broker-dealers.\n\n7. **Amendments to Regulations**: The document includes amendments to the Code of Federal Regulations, adding an interpretive release regarding the \"solely incidental\" prong, dated June 5, 2019."
67
+ }
68
+ ],
69
+ "node_id": "0006",
70
+ "summary": "The partial document discusses the SEC's interpretation of the \"solely incidental\" prong of the broker-dealer exclusion, clarifying that it does not impose new legal obligations but may have economic implications if broker-dealer practices deviate from this interpretation. It provides background on the potential effects on broker-dealers, their associated persons, customers, and the broader financial advice market. The document includes data on the number of registered broker-dealers, their customer accounts, total assets, and the prevalence of dual registrants (firms registered as both broker-dealers and investment advisers) as of December 2018."
71
+ }
72
+ ]
73
+ }
tests/results/Regulation Best Interest_proposed rule_structure.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/results/earthmover_structure.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "doc_name": "earthmover.pdf",
3
+ "structure": [
4
+ {
5
+ "title": "Earth Mover\u2019s Distance based Similarity Search at Scale",
6
+ "start_index": 1,
7
+ "end_index": 1,
8
+ "node_id": "0000"
9
+ },
10
+ {
11
+ "title": "ABSTRACT",
12
+ "start_index": 1,
13
+ "end_index": 1,
14
+ "node_id": "0001"
15
+ },
16
+ {
17
+ "title": "INTRODUCTION",
18
+ "start_index": 1,
19
+ "end_index": 2,
20
+ "node_id": "0002"
21
+ },
22
+ {
23
+ "title": "PRELIMINARIES",
24
+ "start_index": 2,
25
+ "end_index": 2,
26
+ "nodes": [
27
+ {
28
+ "title": "Computing the EMD",
29
+ "start_index": 3,
30
+ "end_index": 3,
31
+ "node_id": "0004"
32
+ },
33
+ {
34
+ "title": "Filter-and-Refinement Framework",
35
+ "start_index": 3,
36
+ "end_index": 4,
37
+ "node_id": "0005"
38
+ }
39
+ ],
40
+ "node_id": "0003"
41
+ },
42
+ {
43
+ "title": "SCALING UP SSP",
44
+ "start_index": 4,
45
+ "end_index": 5,
46
+ "node_id": "0006"
47
+ },
48
+ {
49
+ "title": "BOOSTING THE REFINEMENT PHASE",
50
+ "start_index": 5,
51
+ "end_index": 5,
52
+ "nodes": [
53
+ {
54
+ "title": "Analysis of EMD Calculation",
55
+ "start_index": 5,
56
+ "end_index": 6,
57
+ "node_id": "0008"
58
+ },
59
+ {
60
+ "title": "Progressive Bounding",
61
+ "start_index": 6,
62
+ "end_index": 6,
63
+ "node_id": "0009"
64
+ },
65
+ {
66
+ "title": "Sensitivity to Refinement Order",
67
+ "start_index": 6,
68
+ "end_index": 7,
69
+ "node_id": "0010"
70
+ },
71
+ {
72
+ "title": "Dynamic Refinement Ordering",
73
+ "start_index": 7,
74
+ "end_index": 8,
75
+ "node_id": "0011"
76
+ },
77
+ {
78
+ "title": "Running Upper Bound",
79
+ "start_index": 8,
80
+ "end_index": 8,
81
+ "node_id": "0012"
82
+ }
83
+ ],
84
+ "node_id": "0007"
85
+ },
86
+ {
87
+ "title": "EXPERIMENTAL EVALUATION",
88
+ "start_index": 8,
89
+ "end_index": 9,
90
+ "nodes": [
91
+ {
92
+ "title": "Performance Improvement",
93
+ "start_index": 9,
94
+ "end_index": 10,
95
+ "node_id": "0014"
96
+ },
97
+ {
98
+ "title": "Scalability Experiments",
99
+ "start_index": 10,
100
+ "end_index": 11,
101
+ "node_id": "0015"
102
+ },
103
+ {
104
+ "title": "Parameter Tuning in DRO",
105
+ "start_index": 11,
106
+ "end_index": 12,
107
+ "node_id": "0016"
108
+ }
109
+ ],
110
+ "node_id": "0013"
111
+ },
112
+ {
113
+ "title": "RELATED WORK",
114
+ "start_index": 12,
115
+ "end_index": 12,
116
+ "node_id": "0017"
117
+ },
118
+ {
119
+ "title": "CONCLUSION",
120
+ "start_index": 12,
121
+ "end_index": 12,
122
+ "node_id": "0018"
123
+ },
124
+ {
125
+ "title": "ACKNOWLEDGMENT",
126
+ "start_index": 12,
127
+ "end_index": 12,
128
+ "node_id": "0019"
129
+ },
130
+ {
131
+ "title": "REFERENCES",
132
+ "start_index": 12,
133
+ "end_index": 12,
134
+ "node_id": "0020"
135
+ }
136
+ ]
137
+ }
tests/results/four-lectures_structure.json ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "doc_name": "four-lectures.pdf",
3
+ "structure": [
4
+ {
5
+ "title": "Preface",
6
+ "start_index": 1,
7
+ "end_index": 1,
8
+ "node_id": "0000"
9
+ },
10
+ {
11
+ "title": "ML at a Glance",
12
+ "start_index": 2,
13
+ "end_index": 2,
14
+ "nodes": [
15
+ {
16
+ "title": "An ML session",
17
+ "start_index": 2,
18
+ "end_index": 3,
19
+ "node_id": "0002"
20
+ },
21
+ {
22
+ "title": "Types and Values",
23
+ "start_index": 3,
24
+ "end_index": 4,
25
+ "node_id": "0003"
26
+ },
27
+ {
28
+ "title": "Recursive Functions",
29
+ "start_index": 4,
30
+ "end_index": 4,
31
+ "node_id": "0004"
32
+ },
33
+ {
34
+ "title": "Raising Exceptions",
35
+ "start_index": 4,
36
+ "end_index": 5,
37
+ "node_id": "0005"
38
+ },
39
+ {
40
+ "title": "Structures",
41
+ "start_index": 5,
42
+ "end_index": 6,
43
+ "node_id": "0006"
44
+ },
45
+ {
46
+ "title": "Signatures",
47
+ "start_index": 6,
48
+ "end_index": 7,
49
+ "node_id": "0007"
50
+ },
51
+ {
52
+ "title": "Coercive Signature Matching",
53
+ "start_index": 7,
54
+ "end_index": 8,
55
+ "node_id": "0008"
56
+ },
57
+ {
58
+ "title": "Functor Declaration",
59
+ "start_index": 8,
60
+ "end_index": 9,
61
+ "node_id": "0009"
62
+ },
63
+ {
64
+ "title": "Functor Application",
65
+ "start_index": 9,
66
+ "end_index": 9,
67
+ "node_id": "0010"
68
+ },
69
+ {
70
+ "title": "Summary",
71
+ "start_index": 9,
72
+ "end_index": 9,
73
+ "node_id": "0011"
74
+ }
75
+ ],
76
+ "node_id": "0001"
77
+ },
78
+ {
79
+ "title": "Programming with ML Modules",
80
+ "start_index": 10,
81
+ "end_index": 10,
82
+ "nodes": [
83
+ {
84
+ "title": "Introduction",
85
+ "start_index": 10,
86
+ "end_index": 11,
87
+ "node_id": "0013"
88
+ },
89
+ {
90
+ "title": "Signatures",
91
+ "start_index": 11,
92
+ "end_index": 12,
93
+ "node_id": "0014"
94
+ },
95
+ {
96
+ "title": "Structures",
97
+ "start_index": 12,
98
+ "end_index": 13,
99
+ "node_id": "0015"
100
+ },
101
+ {
102
+ "title": "Functors",
103
+ "start_index": 13,
104
+ "end_index": 14,
105
+ "node_id": "0016"
106
+ },
107
+ {
108
+ "title": "Substructures",
109
+ "start_index": 14,
110
+ "end_index": 15,
111
+ "node_id": "0017"
112
+ },
113
+ {
114
+ "title": "Sharing",
115
+ "start_index": 15,
116
+ "end_index": 16,
117
+ "node_id": "0018"
118
+ },
119
+ {
120
+ "title": "Building the System",
121
+ "start_index": 16,
122
+ "end_index": 17,
123
+ "node_id": "0019"
124
+ },
125
+ {
126
+ "title": "Separate Compilation",
127
+ "start_index": 17,
128
+ "end_index": 18,
129
+ "node_id": "0020"
130
+ },
131
+ {
132
+ "title": "Good Style",
133
+ "start_index": 18,
134
+ "end_index": 18,
135
+ "node_id": "0021"
136
+ },
137
+ {
138
+ "title": "Bad Style",
139
+ "start_index": 18,
140
+ "end_index": 19,
141
+ "node_id": "0022"
142
+ }
143
+ ],
144
+ "node_id": "0012"
145
+ },
146
+ {
147
+ "title": "The Static Semantics of Modules",
148
+ "start_index": 20,
149
+ "end_index": 20,
150
+ "nodes": [
151
+ {
152
+ "title": "Elaboration",
153
+ "start_index": 20,
154
+ "end_index": 21,
155
+ "node_id": "0024"
156
+ },
157
+ {
158
+ "title": "Names",
159
+ "start_index": 21,
160
+ "end_index": 21,
161
+ "node_id": "0025"
162
+ },
163
+ {
164
+ "title": "Decorating Structures",
165
+ "start_index": 21,
166
+ "end_index": 21,
167
+ "node_id": "0026"
168
+ },
169
+ {
170
+ "title": "Decorating Signatures",
171
+ "start_index": 22,
172
+ "end_index": 23,
173
+ "node_id": "0027"
174
+ },
175
+ {
176
+ "title": "Signature Instantiation",
177
+ "start_index": 23,
178
+ "end_index": 24,
179
+ "node_id": "0028"
180
+ },
181
+ {
182
+ "title": "Signature Matching",
183
+ "start_index": 24,
184
+ "end_index": 25,
185
+ "node_id": "0029"
186
+ },
187
+ {
188
+ "title": "Signature Constraints",
189
+ "start_index": 25,
190
+ "end_index": 25,
191
+ "node_id": "0030"
192
+ },
193
+ {
194
+ "title": "Decorating Functors",
195
+ "start_index": 26,
196
+ "end_index": 26,
197
+ "node_id": "0031"
198
+ },
199
+ {
200
+ "title": "External Sharing",
201
+ "start_index": 26,
202
+ "end_index": 27,
203
+ "node_id": "0032"
204
+ },
205
+ {
206
+ "title": "Functors with Arguments",
207
+ "start_index": 27,
208
+ "end_index": 28,
209
+ "node_id": "0033"
210
+ },
211
+ {
212
+ "title": "Sharing Between Argument and Result",
213
+ "start_index": 28,
214
+ "end_index": 28,
215
+ "node_id": "0034"
216
+ },
217
+ {
218
+ "title": "Explicit Result Signatures",
219
+ "start_index": 28,
220
+ "end_index": 29,
221
+ "node_id": "0035"
222
+ }
223
+ ],
224
+ "node_id": "0023"
225
+ },
226
+ {
227
+ "title": "Implementing an Interpreter in ML",
228
+ "start_index": 30,
229
+ "end_index": 32,
230
+ "nodes": [
231
+ {
232
+ "title": "Version 1: The Bare Typechecker",
233
+ "start_index": 32,
234
+ "end_index": 33,
235
+ "node_id": "0037"
236
+ },
237
+ {
238
+ "title": "Version 2: Adding Lists and Polymorphism",
239
+ "start_index": 33,
240
+ "end_index": 37,
241
+ "node_id": "0038"
242
+ },
243
+ {
244
+ "title": "Version 3: A Different Implementation of Types",
245
+ "start_index": 37,
246
+ "end_index": 39,
247
+ "node_id": "0039"
248
+ },
249
+ {
250
+ "title": "Version 4: Introducing Variables and Let",
251
+ "start_index": 39,
252
+ "end_index": 43,
253
+ "node_id": "0040"
254
+ },
255
+ {
256
+ "title": "Acknowledgement",
257
+ "start_index": 43,
258
+ "end_index": 43,
259
+ "node_id": "0041"
260
+ }
261
+ ],
262
+ "node_id": "0036"
263
+ },
264
+ {
265
+ "title": "Appendix A: The Bare Interpreter",
266
+ "start_index": 44,
267
+ "end_index": 44,
268
+ "nodes": [
269
+ {
270
+ "title": "Syntax",
271
+ "start_index": 44,
272
+ "end_index": 44,
273
+ "node_id": "0043"
274
+ },
275
+ {
276
+ "title": "Parsing",
277
+ "start_index": 44,
278
+ "end_index": 45,
279
+ "node_id": "0044"
280
+ },
281
+ {
282
+ "title": "Environments",
283
+ "start_index": 45,
284
+ "end_index": 45,
285
+ "node_id": "0045"
286
+ },
287
+ {
288
+ "title": "Evaluation",
289
+ "start_index": 45,
290
+ "end_index": 46,
291
+ "node_id": "0046"
292
+ },
293
+ {
294
+ "title": "Type Checking",
295
+ "start_index": 46,
296
+ "end_index": 46,
297
+ "node_id": "0047"
298
+ },
299
+ {
300
+ "title": "The Interpreter",
301
+ "start_index": 46,
302
+ "end_index": 47,
303
+ "node_id": "0048"
304
+ },
305
+ {
306
+ "title": "The Evaluator",
307
+ "start_index": 47,
308
+ "end_index": 48,
309
+ "node_id": "0049"
310
+ },
311
+ {
312
+ "title": "The Typechecker",
313
+ "start_index": 48,
314
+ "end_index": 49,
315
+ "node_id": "0050"
316
+ },
317
+ {
318
+ "title": "The Basics",
319
+ "start_index": 50,
320
+ "end_index": 52,
321
+ "node_id": "0051"
322
+ }
323
+ ],
324
+ "node_id": "0042"
325
+ },
326
+ {
327
+ "title": "Appendix B: Files",
328
+ "start_index": 53,
329
+ "end_index": 53,
330
+ "node_id": "0052"
331
+ }
332
+ ]
333
+ }
tests/results/q1-fy25-earnings_structure.json ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "doc_name": "q1-fy25-earnings.pdf",
3
+ "doc_description": "A comprehensive financial report detailing The Walt Disney Company's first-quarter fiscal 2025 performance, including revenue growth, segment highlights, guidance for fiscal 2025, and key financial metrics such as adjusted EPS, operating income, and cash flow.",
4
+ "structure": [
5
+ {
6
+ "title": "THE WALT DISNEY COMPANY REPORTS FIRST QUARTER EARNINGS FOR FISCAL 2025",
7
+ "start_index": 1,
8
+ "end_index": 1,
9
+ "nodes": [
10
+ {
11
+ "title": "Financial Results for the Quarter",
12
+ "start_index": 1,
13
+ "end_index": 1,
14
+ "nodes": [
15
+ {
16
+ "title": "Key Points",
17
+ "start_index": 1,
18
+ "end_index": 1,
19
+ "node_id": "0002",
20
+ "summary": "The partial document outlines The Walt Disney Company's financial performance for the first fiscal quarter of 2025, ending December 28, 2024. Key points include:\n\n1. **Financial Results**: \n - Revenue increased by 5% to $24.7 billion.\n - Income before taxes rose by 27% to $3.7 billion.\n - Diluted EPS grew by 35% to $1.40.\n - Total segment operating income increased by 31% to $5.1 billion, with adjusted EPS up 44% to $1.76.\n\n2. **Entertainment Segment**:\n - Operating income increased by $0.8 billion to $1.7 billion.\n - Direct-to-Consumer operating income rose by $431 million to $293 million, with advertising revenue (excluding Disney+ Hotstar in India) up 16%.\n - Disney+ and Hulu subscriptions increased by 0.9 million, while Disney+ subscribers decreased by 0.7 million.\n - Content sales/licensing income grew by $536 million, driven by the success of *Moana 2*.\n\n3. **Sports Segment**:\n - Operating income increased by $350 million to $247 million.\n - Domestic ESPN advertising revenue grew by 15%.\n\n4. **Experiences Segment**:\n - Operating income remained at $3.1 billion, with a 6 percentage-point adverse impact due to Hurricanes Milton and Helene and pre-opening expenses for the Disney Treasure.\n - Domestic Parks & Experiences income declined by 5%, while International Parks & Experiences income increased by 28%."
21
+ }
22
+ ],
23
+ "node_id": "0001",
24
+ "summary": "The partial document is a report from The Walt Disney Company detailing its financial performance for the first fiscal quarter of 2025, ending December 28, 2024. Key points include:\n\n1. **Financial Performance**:\n - Revenue increased by 5% to $24.7 billion.\n - Income before taxes rose by 27% to $3.7 billion.\n - Diluted EPS grew by 35% to $1.40.\n - Total segment operating income increased by 31% to $5.1 billion, with adjusted EPS up 44% to $1.76.\n\n2. **Segment Highlights**:\n - **Entertainment**: Operating income increased by $0.8 billion to $1.7 billion. Direct-to-Consumer income rose by $431 million, though advertising revenue declined 2% (up 16% excluding Disney+ Hotstar in India). Disney+ and Hulu subscriptions increased slightly, while Disney+ subscribers decreased by 0.7 million. Content sales/licensing income grew, driven by the success of *Moana 2*.\n - **Sports**: Operating income increased by $350 million to $247 million, with ESPN domestic advertising revenue up 15%.\n - **Experiences**: Operating income remained at $3.1 billion, with adverse impacts from hurricanes and pre-opening expenses for the Disney Treasure. Domestic Parks & Experiences income declined by 5%, while International Parks & Experiences income rose by 28%.\n\n3. **Additional Notes**:\n - Non-GAAP financial measures are used for certain metrics.\n - Disney+ Hotstar in India saw a significant decline in advertising revenue compared to the previous year."
25
+ },
26
+ {
27
+ "title": "Guidance and Outlook",
28
+ "start_index": 2,
29
+ "end_index": 2,
30
+ "nodes": [
31
+ {
32
+ "title": "Star India deconsolidated in Q1",
33
+ "start_index": 2,
34
+ "end_index": 2,
35
+ "node_id": "0004",
36
+ "summary": "The partial document outlines Disney's financial guidance and outlook for fiscal 2025, including the deconsolidation of Star India and its impact on operating income for the Entertainment and Sports segments. It highlights expectations for Q2 fiscal 2025, such as a modest decline in Disney+ subscribers, adverse impacts on Sports segment income, and pre-opening expenses for Disney Cruise Line. For fiscal 2025, the company projects high-single-digit adjusted EPS growth, $15 billion in cash from operations, and segment operating income growth across Entertainment, Sports, and Experiences. The CEO emphasizes strong Q1 results, including box office success, improved profitability in streaming, advancements in ESPN\u2019s digital strategy, and continued investments in the Experiences segment, expressing confidence in Disney's growth strategy."
37
+ },
38
+ {
39
+ "title": "Q2 Fiscal 2025",
40
+ "start_index": 2,
41
+ "end_index": 2,
42
+ "node_id": "0005",
43
+ "summary": "The partial document outlines Disney's financial guidance and outlook for fiscal 2025, including the deconsolidation of Star India and its impact on operating income for the Entertainment and Sports segments. It highlights expectations for Q2 fiscal 2025, such as a modest decline in Disney+ subscribers, adverse impacts on Sports segment income, and pre-opening expenses for Disney Cruise Line. For the full fiscal year 2025, it projects high-single-digit adjusted EPS growth, $15 billion in cash from operations, and segment operating income growth across Entertainment, Sports, and Experiences. The CEO emphasizes Disney's strong start to the fiscal year, citing achievements in box office performance, improved streaming profitability, ESPN's digital strategy, and the enduring appeal of the Experiences segment."
44
+ },
45
+ {
46
+ "title": "Fiscal Year 2025",
47
+ "start_index": 2,
48
+ "end_index": 2,
49
+ "node_id": "0006",
50
+ "summary": "The partial document outlines Disney's financial guidance and outlook for fiscal 2025, including the deconsolidation of Star India and its impact on operating income for the Entertainment and Sports segments. It highlights expectations for Q2 fiscal 2025, such as a modest decline in Disney+ subscribers, adverse impacts on Sports segment income, and pre-opening expenses for Disney Cruise Line. For the full fiscal year 2025, it projects high-single-digit adjusted EPS growth, $15 billion in cash from operations, and segment operating income growth across Entertainment, Sports, and Experiences. The CEO emphasizes Disney's creative and financial strength, strong box office performance, improved streaming profitability, advancements in ESPN's digital strategy, and continued global investments in the Experiences segment."
51
+ }
52
+ ],
53
+ "node_id": "0003",
54
+ "summary": "The partial document outlines Disney's financial guidance and outlook for fiscal 2025, including the deconsolidation of Star India and its impact on operating income for the Entertainment and Sports segments. It highlights expectations for Q2 fiscal 2025, such as a modest decline in Disney+ subscribers, adverse impacts on Sports segment income, and pre-opening expenses for Disney Cruise Line. For the full fiscal year 2025, it projects high-single-digit adjusted EPS growth, $15 billion in cash from operations, and segment operating income growth across Entertainment, Sports, and Experiences. The CEO emphasizes strong Q1 results, including box office success, improved profitability in streaming, advancements in ESPN\u2019s digital strategy, and continued investment in global experiences."
55
+ },
56
+ {
57
+ "title": "Message From Our CEO",
58
+ "start_index": 2,
59
+ "end_index": 2,
60
+ "node_id": "0007",
61
+ "summary": "The partial document outlines Disney's financial guidance and outlook for fiscal 2025, including the deconsolidation of Star India and its impact on operating income for the Entertainment and Sports segments. It highlights expectations for Q2 fiscal 2025, such as a modest decline in Disney+ subscribers, adverse impacts on Sports segment income, and pre-opening expenses for Disney Cruise Line. For the full fiscal year 2025, it projects high-single-digit adjusted EPS growth, $15 billion in cash from operations, and segment operating income growth across Entertainment, Sports, and Experiences. The CEO emphasizes strong Q1 results, including box office success, improved profitability in streaming, advancements in ESPN\u2019s digital strategy, and continued investment in global experiences."
62
+ }
63
+ ],
64
+ "node_id": "0000",
65
+ "summary": "The partial document is a report from The Walt Disney Company detailing its financial performance for the first fiscal quarter of 2025, ending December 28, 2024. Key points include:\n\n1. **Financial Results**: \n - Revenue increased by 5% to $24.7 billion. \n - Income before taxes rose by 27% to $3.7 billion. \n - Diluted EPS grew by 35% to $1.40. \n - Total segment operating income increased by 31% to $5.1 billion, and adjusted EPS rose by 44% to $1.76. \n\n2. **Entertainment Segment**: \n - Operating income increased by $0.8 billion to $1.7 billion. \n - Direct-to-Consumer operating income rose by $431 million to $293 million, with advertising revenue up 16% (excluding Disney+ Hotstar in India). \n - Disney+ and Hulu subscriptions increased by 0.9 million, while Disney+ subscribers decreased by 0.7 million. \n - Content sales/licensing income grew by $536 million, driven by the success of *Moana 2*. \n\n3. **Sports Segment**: \n - Operating income increased by $350 million to $247 million. \n - Domestic ESPN advertising revenue grew by 15%. \n\n4. **Experiences Segment**: \n - Operating income remained at $3.1 billion, with a 6 percentage-point adverse impact due to Hurricanes Milton and Helene and pre-opening expenses for the Disney Treasure. \n - Domestic Parks & Experiences income declined by 5%, while International Parks & Experiences income increased by 28%. \n\nThe report also includes non-GAAP financial measures and notes the impact of Disney+ Hotstar's advertising revenue in India."
66
+ },
67
+ {
68
+ "title": "SUMMARIZED FINANCIAL RESULTS",
69
+ "start_index": 3,
70
+ "end_index": 3,
71
+ "nodes": [
72
+ {
73
+ "title": "SUMMARIZED SEGMENT FINANCIAL RESULTS",
74
+ "start_index": 3,
75
+ "end_index": 3,
76
+ "node_id": "0009",
77
+ "summary": "The partial document provides a summarized overview of financial results for the first quarter of fiscal years 2025 and 2024. Key points include:\n\n1. **Overall Financial Performance**:\n - Revenues increased by 5% from $23,549 million in 2024 to $24,690 million in 2025.\n - Income before income taxes rose by 27%.\n - Total segment operating income grew by 31%.\n - Diluted EPS increased by 35%, and diluted EPS excluding certain items rose by 44%.\n - Cash provided by operations increased by 47%, while free cash flow decreased by 17%.\n\n2. **Segment Financial Results**:\n - Revenue growth was observed in the Entertainment segment (9%) and Experiences segment (3%), while Sports revenue remained flat.\n - Segment operating income for Entertainment increased significantly by 95%, while Sports shifted from a loss to a positive income. Experiences segment operating income remained stable.\n\n3. **Non-GAAP Measures**:\n - The document highlights the use of non-GAAP financial measures such as total segment operating income, diluted EPS excluding certain items, and free cash flow, with references to further details and reconciliations provided elsewhere in the report."
78
+ }
79
+ ],
80
+ "node_id": "0008",
81
+ "summary": "The partial document provides a summarized overview of financial results for the first quarter of fiscal years 2025 and 2024. Key points include:\n\n1. **Overall Financial Performance**:\n - Revenues increased by 5% from $23,549 million in 2024 to $24,690 million in 2025.\n - Income before income taxes rose by 27%.\n - Total segment operating income grew by 31%.\n - Diluted EPS increased by 35%, and diluted EPS excluding certain items rose by 44%.\n - Cash provided by operations increased by 47%, while free cash flow decreased by 17%.\n\n2. **Segment Financial Results**:\n - Revenue growth was observed in the Entertainment segment (9%) and Experiences segment (3%), while Sports revenue remained flat.\n - Segment operating income for Entertainment increased significantly by 95%, while Sports shifted from a loss to a positive income. Experiences segment operating income remained stable.\n\n3. **Non-GAAP Measures**:\n - The document highlights the use of non-GAAP financial measures such as total segment operating income, diluted EPS excluding certain items, and free cash flow, with references to further details and reconciliations provided in later sections."
82
+ },
83
+ {
84
+ "title": "DISCUSSION OF FIRST QUARTER SEGMENT RESULTS",
85
+ "start_index": 4,
86
+ "end_index": 4,
87
+ "nodes": [
88
+ {
89
+ "title": "Star India",
90
+ "start_index": 4,
91
+ "end_index": 4,
92
+ "node_id": "0011",
93
+ "summary": "The partial document discusses the first-quarter segment results, focusing on the Star India joint venture formed between the Company and Reliance Industries Limited (RIL) on November 14, 2024. The joint venture combines Star-branded entertainment and sports television channels, Disney+ Hotstar, and certain RIL-controlled media businesses, with RIL holding a 56% controlling interest, the Company holding 37%, and a third-party investment company holding 7%. The Company now recognizes its 37% share of the joint venture\u2019s results under \"Equity in the income of investees.\" Additionally, the document provides financial results for the Entertainment segment, showing a 9% increase in total revenues and a 95% increase in operating income compared to the prior-year quarter. The growth in operating income is attributed to improved results in Content Sales/Licensing and Direct-to-Consumer, partially offset by a decline in Linear Networks."
94
+ },
95
+ {
96
+ "title": "Entertainment",
97
+ "start_index": 4,
98
+ "end_index": 4,
99
+ "nodes": [
100
+ {
101
+ "title": "Linear Networks",
102
+ "start_index": 5,
103
+ "end_index": 5,
104
+ "node_id": "0013",
105
+ "summary": "The partial document provides financial performance details for Linear Networks and Direct-to-Consumer segments for the quarters ending December 28, 2024, and December 30, 2023. Key points include:\n\n1. **Linear Networks**:\n - Revenue decreased by 7%, with domestic revenue remaining flat and international revenue declining by 31%.\n - Operating income decreased by 11%, with domestic income stable and international income dropping by 39%.\n - Domestic operating income was impacted by higher programming costs (due to the 2023 guild strikes), lower affiliate revenue (fewer subscribers), lower technology costs, and higher advertising revenue (driven by political advertising but offset by lower viewership).\n - International operating income decline was attributed to the Star India Transaction.\n - Equity income from investees decreased due to lower income from A+E Television Networks, reduced advertising and affiliate revenue, and the absence of a prior-year gain from an investment sale.\n\n2. **Direct-to-Consumer**:\n - Revenue increased by 9%, driven by higher subscription revenue due to increased pricing and more subscribers, partially offset by unfavorable foreign exchange impacts.\n - Operating income improved significantly, moving from a loss in the prior year to a profit, reflecting subscription revenue growth."
106
+ },
107
+ {
108
+ "title": "Direct-to-Consumer",
109
+ "start_index": 5,
110
+ "end_index": 7,
111
+ "node_id": "0014",
112
+ "summary": "The partial document provides a financial performance overview of various segments for the quarter ended December 28, 2024, compared to the prior-year quarter. Key points include:\n\n1. **Linear Networks**:\n - Revenue decreased by 7%, with domestic revenue flat and international revenue down 31%.\n - Operating income decreased by 11%, with domestic income flat and international income down 39%, primarily due to the Star India transaction.\n - Equity income from investees declined by 29%, driven by lower income from A+E Television Networks and the absence of a prior-year gain on an investment sale.\n\n2. **Direct-to-Consumer (DTC)**:\n - Revenue increased by 9%, and operating income improved significantly from a loss of $138 million to a profit of $293 million.\n - Growth was driven by higher subscription revenue due to pricing increases and more subscribers, partially offset by higher costs and lower advertising revenue.\n - Key metrics showed slight changes in Disney+ and Hulu subscriber numbers, with increases in average monthly revenue per paid subscriber due to pricing adjustments.\n\n3. **Content Sales/Licensing and Other**:\n - Revenue increased by 34%, and operating income improved significantly, driven by strong theatrical performance, particularly from \"Moana 2,\" and contributions from \"Mufasa: The Lion King.\"\n\n4. **Sports**:\n - ESPN revenue grew by 8%, with domestic and international segments showing increases, while Star India revenue dropped by 90%.\n - Operating income for ESPN improved by 15%, while Star India shifted from a loss to a small profit.\n\nThe document highlights revenue trends, operating income changes, and key drivers for each segment, including programming costs, subscriber growth, pricing adjustments, and content performance."
113
+ },
114
+ {
115
+ "title": "Content Sales/Licensing and Other",
116
+ "start_index": 7,
117
+ "end_index": 7,
118
+ "node_id": "0015",
119
+ "summary": "The partial document discusses the financial performance of Disney's streaming services, content sales, and sports segment. Key points include:\n\n1. **Disney+ Revenue**: Domestic and international Disney+ average monthly revenue per paid subscriber increased due to pricing hikes, partially offset by promotional offerings. International revenue also benefited from higher advertising revenue.\n\n2. **Hulu Revenue**: Hulu SVOD Only revenue remained stable, with pricing increases offsetting lower advertising revenue. Hulu Live TV + SVOD revenue increased due to pricing hikes.\n\n3. **Content Sales/Licensing**: Revenue and operating income improved significantly, driven by strong theatrical distribution results, particularly from \"Moana 2,\" and contributions from \"Mufasa: The Lion King.\"\n\n4. **Sports Revenue**: ESPN domestic and international revenues grew, while Star India revenue declined sharply. Operating income for ESPN improved, with domestic income slightly down and international losses reduced. Star India showed a notable recovery in operating income."
120
+ }
121
+ ],
122
+ "node_id": "0012",
123
+ "summary": "The partial document discusses the first-quarter segment results, focusing on the Star India joint venture formed between the Company and Reliance Industries Limited (RIL) on November 14, 2024. The joint venture combines Star-branded entertainment and sports television channels and the Disney+ Hotstar service in India, with RIL holding a 56% controlling interest, the Company holding 37%, and a third-party investment company holding 7%. The Company now recognizes its 37% share of the joint venture\u2019s results under \u201cEquity in the income of investees.\u201d Additionally, the document provides financial results for the Entertainment segment, showing a 9% increase in total revenues compared to the prior year, driven by growth in Direct-to-Consumer and Content Sales/Licensing and Other, despite a decline in Linear Networks. Operating income increased by 95%, primarily due to improved results in Content Sales/Licensing and Other and Direct-to-Consumer, partially offset by a decrease in Linear Networks."
124
+ },
125
+ {
126
+ "title": "Sports",
127
+ "start_index": 7,
128
+ "end_index": 7,
129
+ "nodes": [
130
+ {
131
+ "title": "Domestic ESPN",
132
+ "start_index": 8,
133
+ "end_index": 8,
134
+ "node_id": "0017",
135
+ "summary": "The partial document discusses the financial performance of ESPN, including domestic and international operations, as well as Star India, for the current quarter compared to the prior-year quarter. Key points include:\n\n1. **Domestic ESPN**: \n - Decrease in operating results due to higher programming and production costs, primarily from expanded college football programming rights and changes in the College Football Playoff (CFP) format.\n - Increase in advertising revenue due to higher rates.\n - Revenue from sub-licensing CFP programming rights.\n - Affiliate revenue remained comparable, with rate increases offset by fewer subscribers.\n\n2. **International ESPN**: \n - Decrease in operating loss driven by higher fees from the Entertainment segment for Disney+ sports content.\n - Increased programming and production costs due to higher soccer rights costs.\n - Lower affiliate revenue due to fewer subscribers.\n\n3. **Star India**: \n - Improved operating results due to the absence of significant cricket events in the current quarter compared to the prior-year quarter, which included the ICC Cricket World Cup.\n\n4. **Key Metrics for ESPN+**:\n - Paid subscribers decreased from 25.6 million to 24.9 million.\n - Average monthly revenue per paid subscriber increased from $5.94 to $6.36, driven by pricing increases and higher advertising revenue."
136
+ },
137
+ {
138
+ "title": "International ESPN",
139
+ "start_index": 8,
140
+ "end_index": 8,
141
+ "node_id": "0018",
142
+ "summary": "The partial document discusses the financial performance of ESPN, including domestic and international operations, as well as Star India, for the current quarter compared to the prior-year quarter. Key points include:\n\n1. **Domestic ESPN**: \n - Decrease in operating results due to higher programming and production costs, primarily from expanded college football programming rights and changes in the College Football Playoff (CFP) format.\n - Increase in advertising revenue due to higher rates.\n - Revenue from sub-licensing CFP programming rights.\n - Affiliate revenue remained comparable, with rate increases offset by fewer subscribers.\n\n2. **International ESPN**: \n - Decrease in operating loss driven by higher fees from the Entertainment segment for Disney+ sports content.\n - Increased programming and production costs due to higher soccer rights costs.\n - Lower affiliate revenue due to fewer subscribers.\n\n3. **Star India**: \n - Improved operating results due to the absence of significant cricket events in the current quarter compared to the ICC Cricket World Cup in the prior-year quarter.\n\n4. **Key Metrics for ESPN+**:\n - Paid subscribers decreased from 25.6 million to 24.9 million.\n - Average monthly revenue per paid subscriber increased from $5.94 to $6.36, driven by pricing increases and higher advertising revenue."
143
+ },
144
+ {
145
+ "title": "Star India",
146
+ "start_index": 8,
147
+ "end_index": 8,
148
+ "node_id": "0019",
149
+ "summary": "The partial document discusses the financial performance of ESPN, including domestic and international operations, as well as Star India, for a specific quarter. Key points include:\n\n1. **Domestic ESPN**: \n - Decrease in operating results due to higher programming and production costs, primarily from expanded college football programming rights, including additional College Football Playoff (CFP) games under a revised format.\n - Increase in advertising revenue due to higher rates.\n - Revenue from sub-licensing CFP programming rights.\n - Affiliate revenue remained comparable to the prior year due to effective rate increases offset by fewer subscribers.\n\n2. **International ESPN**: \n - Decrease in operating loss driven by higher fees from the Entertainment segment for sports content on Disney+.\n - Increased programming and production costs due to higher soccer rights costs.\n - Lower affiliate revenue due to fewer subscribers.\n\n3. **Star India**: \n - Improvement in operating results due to the absence of significant cricket events in the current quarter compared to the prior year, which included the ICC Cricket World Cup.\n\n4. **Key Metrics for ESPN+**:\n - Paid subscribers decreased from 25.6 million to 24.9 million.\n - Average monthly revenue per paid subscriber increased from $5.94 to $6.36, driven by pricing increases and higher advertising revenue."
150
+ }
151
+ ],
152
+ "node_id": "0016",
153
+ "summary": "The partial document discusses the financial performance of Disney's streaming services, content sales, and sports segment. Key points include:\n\n1. **Disney+ Revenue**: Domestic and international Disney+ average monthly revenue per paid subscriber increased due to pricing hikes, partially offset by promotional offerings. International revenue also benefited from higher advertising revenue.\n\n2. **Hulu Revenue**: Hulu SVOD Only revenue remained stable, with pricing increases offsetting lower advertising revenue. Hulu Live TV + SVOD revenue increased due to pricing hikes.\n\n3. **Content Sales/Licensing**: Revenue and operating income improved significantly, driven by strong theatrical performance, particularly from \"Moana 2,\" and contributions from \"Mufasa: The Lion King.\"\n\n4. **Sports Revenue**: ESPN domestic and international revenues grew, while Star India revenue declined sharply. Operating income for ESPN improved, with domestic income slightly down and international income showing significant recovery. Star India showed a notable turnaround in operating income."
154
+ },
155
+ {
156
+ "title": "Experiences",
157
+ "start_index": 9,
158
+ "end_index": 9,
159
+ "node_id": "0020",
160
+ "summary": "The partial document provides financial performance details for the Parks & Experiences segment, including revenues and operating income for domestic and international operations, as well as consumer products. It highlights a 3% increase in total revenue and stable operating income compared to the prior year. Domestic parks and experiences were negatively impacted by hurricanes, leading to lower volumes and higher costs, despite increased guest spending. International parks and experiences saw growth in operating income due to higher guest spending, increased attendance, and new offerings. The document also notes increased corporate expenses due to a legal settlement and a $143 million loss related to the Star India Transaction."
161
+ }
162
+ ],
163
+ "node_id": "0010",
164
+ "summary": "The partial document discusses the first-quarter segment results, focusing on the Star India joint venture formed between the Company and Reliance Industries Limited (RIL) on November 14, 2024. The joint venture combines Star-branded entertainment and sports television channels, Disney+ Hotstar, and certain RIL-controlled media businesses, with RIL holding a 56% controlling interest, the Company holding 37%, and a third-party investment company holding 7%. The Company now recognizes its 37% share of the joint venture\u2019s results under \"Equity in the income of investees.\" Additionally, the document provides financial results for the Entertainment segment, showing a 9% increase in total revenues and a 95% increase in operating income compared to the prior-year quarter. The growth in operating income is attributed to improved results in Content Sales/Licensing and Direct-to-Consumer, partially offset by a decline in Linear Networks."
165
+ },
166
+ {
167
+ "title": "OTHER FINANCIAL INFORMATION",
168
+ "start_index": 9,
169
+ "end_index": 9,
170
+ "nodes": [
171
+ {
172
+ "title": "Corporate and Unallocated Shared Expenses",
173
+ "start_index": 9,
174
+ "end_index": 9,
175
+ "node_id": "0022",
176
+ "summary": "The partial document provides a financial overview of revenues and operating income for Parks & Experiences, including Domestic, International, and Consumer Products segments, comparing the quarters ending December 28, 2024, and December 30, 2023. It highlights a 3% increase in overall revenue and stable operating income. Domestic Parks and Experiences were negatively impacted by Hurricanes Milton and Helene, leading to closures, cancellations, higher costs, and lower attendance, despite increased guest spending. International Parks and Experiences saw growth in operating income due to higher guest spending, increased attendance, and new offerings, offset by higher costs. The document also notes a $152 million increase in corporate and unallocated shared expenses due to a legal settlement and a $143 million loss related to the Star India Transaction."
177
+ },
178
+ {
179
+ "title": "Restructuring and Impairment Charges",
180
+ "start_index": 9,
181
+ "end_index": 9,
182
+ "node_id": "0023",
183
+ "summary": "The partial document provides financial performance details for the Parks & Experiences segment, including revenues and operating income for domestic and international operations, as well as consumer products. It highlights a 3% increase in overall revenue and stable operating income compared to the prior year. Domestic parks and experiences were negatively impacted by hurricanes, leading to lower volumes and higher costs, despite increased guest spending. International parks and experiences saw growth in operating income due to higher guest spending, increased attendance, and new offerings, though costs also rose. Additionally, corporate and unallocated shared expenses increased due to a legal settlement, and a $143 million loss was recorded related to the Star India Transaction."
184
+ },
185
+ {
186
+ "title": "Interest Expense, net",
187
+ "start_index": 10,
188
+ "end_index": 10,
189
+ "node_id": "0024",
190
+ "summary": "The partial document provides a financial analysis of interest expense, net, equity in the income of investees, and income taxes for the quarters ending December 28, 2024, and December 30, 2023. Key points include:\n\n1. **Interest Expense, Net**: A decrease in interest expense due to lower average rates and debt balances, partially offset by reduced capitalized interest. Interest income and investment income declined due to lower cash balances, pension-related costs, and investment losses compared to prior-year gains.\n\n2. **Equity in the Income of Investees**: A $89 million decrease in income from investees, primarily due to lower income from A+E and losses from the India joint venture.\n\n3. **Income Taxes**: An increase in the effective income tax rate from 25.1% to 27.8%, driven by a non-cash tax charge related to the Star India Transaction, partially offset by favorable adjustments related to prior years, lower foreign tax rates, and a comparison to unfavorable prior-year effects of employee share-based awards."
191
+ },
192
+ {
193
+ "title": "Equity in the Income of Investees",
194
+ "start_index": 10,
195
+ "end_index": 10,
196
+ "node_id": "0025",
197
+ "summary": "The partial document provides a financial analysis of interest expense, net, equity in the income of investees, and income taxes for the quarters ended December 28, 2024, and December 30, 2023. It highlights a decrease in net interest expense due to lower average rates and debt balances, offset by reduced capitalized interest. Interest income and investment income declined due to lower cash balances, pension-related costs, and investment losses. Equity income from investees decreased significantly, driven by lower income from A+E and losses from the India joint venture. The effective income tax rate increased due to a non-cash tax charge related to the Star India Transaction, partially offset by favorable adjustments related to prior years, lower foreign tax rates, and a comparison to unfavorable prior-year effects."
198
+ },
199
+ {
200
+ "title": "Income Taxes",
201
+ "start_index": 10,
202
+ "end_index": 10,
203
+ "node_id": "0026",
204
+ "summary": "The partial document provides a financial analysis of interest expense, net, equity in the income of investees, and income taxes for the quarters ended December 28, 2024, and December 30, 2023. It highlights a decrease in net interest expense due to lower average rates and debt balances, offset by reduced capitalized interest. Interest income and investment income declined due to lower cash balances, pension-related costs, and investment losses. Equity income from investees dropped significantly, driven by lower income from A+E and losses from the India joint venture. The effective income tax rate increased due to a non-cash tax charge related to the Star India Transaction, partially offset by favorable adjustments related to prior years, lower foreign tax rates, and a comparison to unfavorable prior-year effects."
205
+ },
206
+ {
207
+ "title": "Noncontrolling Interests",
208
+ "start_index": 11,
209
+ "end_index": 11,
210
+ "node_id": "0027",
211
+ "summary": "The partial document covers two main points:\n\n1. **Noncontrolling Interests**: It discusses the net income attributable to noncontrolling interests, which decreased by 63% compared to the prior-year quarter. The decrease is attributed to the prior-year accretion of NBC Universal\u2019s interest in Hulu. The calculation of net income attributable to noncontrolling interests is based on income after royalties, management fees, financing costs, and income taxes.\n\n2. **Cash from Operations**: It details cash provided by operations and free cash flow, showing an increase in cash provided by operations by $1.0 billion to $3.2 billion in the current quarter. The increase is driven by lower tax payments, higher operating income at Entertainment, and higher film and television production spending, along with the timing of payments for sports rights. Free cash flow decreased by $147 million compared to the prior-year quarter."
212
+ },
213
+ {
214
+ "title": "Cash from Operations",
215
+ "start_index": 11,
216
+ "end_index": 11,
217
+ "node_id": "0028",
218
+ "summary": "The partial document covers two main points:\n\n1. **Noncontrolling Interests**: It discusses the net income attributable to noncontrolling interests, which decreased by 63% in the quarter ended December 28, 2024, compared to the prior-year quarter. The decrease is attributed to the prior-year accretion of NBC Universal\u2019s interest in Hulu. The calculation of net income attributable to noncontrolling interests includes royalties, management fees, financing costs, and income taxes.\n\n2. **Cash from Operations**: It details cash provided by operations and free cash flow for the quarter ended December 28, 2024, compared to the prior-year quarter. Cash provided by operations increased by $1.0 billion, driven by lower tax payments, higher operating income at Entertainment, and higher film and television production spending, along with the timing of payments for sports rights. Free cash flow decreased by $147 million due to increased investments in parks, resorts, and other property."
219
+ },
220
+ {
221
+ "title": "Capital Expenditures",
222
+ "start_index": 12,
223
+ "end_index": 12,
224
+ "node_id": "0029",
225
+ "summary": "The partial document provides details on capital expenditures and depreciation expenses for parks, resorts, and other properties. It highlights an increase in capital expenditures from $1.3 billion to $2.5 billion, primarily due to higher spending on cruise ship fleet expansion in the Experiences segment. The document also breaks down investments and depreciation expenses by category (Entertainment, Sports, Domestic and International Experiences, and Corporate) for the quarters ending December 28, 2024, and December 30, 2023. Depreciation expenses increased from $823 million to $909 million, with detailed figures provided for each segment."
226
+ },
227
+ {
228
+ "title": "Depreciation Expense",
229
+ "start_index": 12,
230
+ "end_index": 12,
231
+ "node_id": "0030",
232
+ "summary": "The partial document provides details on capital expenditures and depreciation expenses for parks, resorts, and other properties. It highlights an increase in capital expenditures from $1.3 billion to $2.5 billion, primarily due to higher spending on cruise ship fleet expansion in the Experiences segment. The breakdown of investments and depreciation expenses is provided for Entertainment, Sports, Domestic and International Experiences, and Corporate segments for the quarters ending December 28, 2024, and December 30, 2023. Depreciation expenses also increased from $823 million to $909 million, with detailed segment-wise allocations."
233
+ }
234
+ ],
235
+ "node_id": "0021",
236
+ "summary": "The partial document provides a financial overview of revenues and operating income for Parks & Experiences, including Domestic, International, and Consumer Products segments, comparing the quarters ending December 28, 2024, and December 30, 2023. It highlights a 3% increase in total revenue and stable operating income. Domestic Parks and Experiences were negatively impacted by Hurricanes Milton and Helene, leading to closures, cancellations, higher costs, and lower attendance, despite increased guest spending. International Parks and Experiences saw growth in operating income due to higher guest spending, increased attendance, and new offerings, offset by increased costs. The document also notes a rise in corporate and unallocated shared expenses due to a legal settlement and a $143 million loss related to the Star India Transaction."
237
+ },
238
+ {
239
+ "title": "THE WALT DISNEY COMPANY CONDENSED CONSOLIDATED STATEMENTS OF INCOME",
240
+ "start_index": 13,
241
+ "end_index": 13,
242
+ "node_id": "0031",
243
+ "summary": "The partial document provides a condensed consolidated statement of income for The Walt Disney Company for the quarters ended December 28, 2024, and December 30, 2023. It includes details on revenues, costs and expenses, restructuring and impairment charges, net interest expense, equity in the income of investees, income before income taxes, income taxes, and net income. It also breaks down net income attributable to noncontrolling interests and The Walt Disney Company. Additionally, it provides earnings per share (diluted and basic) and the weighted average number of shares outstanding (diluted and basic) for both periods."
244
+ },
245
+ {
246
+ "title": "THE WALT DISNEY COMPANY CONDENSED CONSOLIDATED BALANCE SHEETS",
247
+ "start_index": 14,
248
+ "end_index": 14,
249
+ "node_id": "0032",
250
+ "summary": "The partial document is a condensed consolidated balance sheet for The Walt Disney Company, comparing financial data as of December 28, 2024, and September 28, 2024. It details the company's assets, liabilities, and equity. Key points include:\n\n1. **Assets**: Breakdown of current assets (cash, receivables, inventories, content advances, and other assets), produced and licensed content costs, investments, property (attractions, buildings, equipment, projects in progress, and land), intangible assets, goodwill, and other assets. Total assets increased slightly from $196.2 billion to $197 billion.\n\n2. **Liabilities**: Includes current liabilities (accounts payable, borrowings, deferred revenue), long-term borrowings, deferred income taxes, and other long-term liabilities. Total liabilities remained relatively stable.\n\n3. **Equity**: Details Disney shareholders' equity, including common stock, retained earnings, accumulated other comprehensive loss, and treasury stock. Noncontrolling interests are also included. Total equity increased from $105.5 billion to $106.7 billion.\n\n4. **Overall Financial Position**: The balance sheet reflects a stable financial position with slight changes in assets, liabilities, and equity over the period."
251
+ },
252
+ {
253
+ "title": "THE WALT DISNEY COMPANY CONDENSED CONSOLIDATED STATEMENTS OF CASH FLOWS",
254
+ "start_index": 15,
255
+ "end_index": 15,
256
+ "node_id": "0033",
257
+ "summary": "The partial document provides a condensed consolidated statement of cash flows for The Walt Disney Company for the quarters ended December 28, 2024, and December 30, 2023. It details cash flow activities categorized into operating, investing, and financing activities. Key points include:\n\n1. **Operating Activities**: Net income increased from $2,151 million in 2023 to $2,644 million in 2024. Other significant changes include variations in depreciation, deferred taxes, equity income, content costs, and changes in operating assets and liabilities, resulting in cash provided by operations of $3,205 million in 2024 compared to $2,185 million in 2023.\n\n2. **Investing Activities**: Investments in parks, resorts, and other properties increased significantly in 2024 ($2,466 million) compared to 2023 ($1,299 million), leading to higher cash used in investing activities.\n\n3. **Financing Activities**: The company saw a net cash outflow in financing activities, including commercial paper borrowings, stock repurchases, and debt reduction. In 2024, cash used in financing activities was $997 million, a significant improvement from $8,006 million in 2023.\n\n4. **Exchange Rate Impact**: Exchange rates negatively impacted cash in 2024 by $153 million, compared to a positive impact of $79 million in 2023.\n\n5. **Overall Cash Position**: The company\u2019s cash, cash equivalents, and restricted cash decreased from $14,235 million at the beginning of the 2023 period to $5,582 million at the end of the 2024 period."
258
+ },
259
+ {
260
+ "title": "DTC PRODUCT DESCRIPTIONS AND KEY DEFINITIONS",
261
+ "start_index": 16,
262
+ "end_index": 16,
263
+ "node_id": "0034",
264
+ "summary": "The partial document provides an overview of Disney's Direct-to-Consumer (DTC) product offerings, key definitions, and metrics. It details the availability of Disney+, ESPN+, and Hulu as standalone services or bundled offerings in the U.S., including Hulu Live TV + SVOD, which incorporates Disney+ and ESPN+. It explains the global reach of Disney+ in over 150 countries and the various purchase channels, including websites, third-party platforms, and wholesale arrangements. The document defines \"paid subscribers\" as those generating subscription revenue, excluding extra member add-ons, and outlines how subscribers are counted for multi-product offerings. It also describes the calculation of average monthly revenue per paid subscriber for Hulu, ESPN+, and Disney+, including revenue components like subscription fees, advertising, and add-ons, while noting differences in revenue allocation and the impact of wholesale arrangements on average revenue."
265
+ },
266
+ {
267
+ "title": "NON-GAAP FINANCIAL MEASURES",
268
+ "start_index": 17,
269
+ "end_index": 17,
270
+ "nodes": [
271
+ {
272
+ "title": "Diluted EPS excluding certain items",
273
+ "start_index": 17,
274
+ "end_index": 18,
275
+ "node_id": "0036",
276
+ "summary": "The partial document discusses the use of non-GAAP financial measures, specifically diluted EPS excluding certain items (adjusted EPS), total segment operating income, and free cash flow. It explains that these measures are not defined by GAAP but are important for evaluating the company's performance. The document highlights that these measures should be reviewed alongside comparable GAAP measures and may not be directly comparable to similar measures from other companies. It provides details on the adjustments made to diluted EPS, including the exclusion of certain items affecting comparability and amortization of TFCF and Hulu intangible assets, to better reflect operational performance. The document also includes a reconciliation table comparing reported diluted EPS to adjusted EPS for specific quarters, showing the impact of excluded items such as restructuring charges and intangible asset amortization. Additionally, it notes the challenges in providing forward-looking GAAP measures due to unpredictable factors."
277
+ },
278
+ {
279
+ "title": "Total segment operating income",
280
+ "start_index": 19,
281
+ "end_index": 20,
282
+ "node_id": "0037",
283
+ "summary": "The partial document focuses on the evaluation of the company's performance through two key financial metrics: total segment operating income and free cash flow. It explains that total segment operating income is used to assess the performance of operating segments separately from non-operational factors, providing insights into operational results. A reconciliation table is provided, showing the calculation of total segment operating income for two quarters, highlighting changes in various components such as corporate expenses, restructuring charges, and interest expenses. Additionally, the document discusses free cash flow as a measure of cash available for purposes beyond capital expenditures, such as debt servicing, acquisitions, and shareholder returns. A summary of consolidated cash flows and a reconciliation of cash provided by operations to free cash flow are presented, comparing figures for two quarters and highlighting changes in cash flow components."
284
+ },
285
+ {
286
+ "title": "Free cash flow",
287
+ "start_index": 20,
288
+ "end_index": 20,
289
+ "node_id": "0038",
290
+ "summary": "The partial document provides a reconciliation of the company's consolidated cash provided by operations to free cash flow for the quarters ended December 28, 2024, and December 30, 2023. It highlights a $1,020 million increase in cash provided by operations, a $1,167 million increase in investments in parks, resorts, and other property, and a $147 million decrease in free cash flow."
291
+ }
292
+ ],
293
+ "node_id": "0035",
294
+ "summary": "The partial document discusses the use of non-GAAP financial measures by the company, including diluted EPS excluding certain items (adjusted EPS), total segment operating income, and free cash flow. It explains that these measures are not defined by GAAP but are important for evaluating the company's performance. The document emphasizes that these measures should be reviewed alongside comparable GAAP measures and may not be directly comparable to similar measures from other companies. It highlights the company's inability to provide forward-looking GAAP measures or reconciliations due to uncertainties in predicting significant items. Additionally, the document details the rationale for excluding certain items and amortization of TFCF and Hulu intangible assets from diluted EPS to enhance comparability and provide a clearer evaluation of operational performance, particularly given the significant impact of the 2019 TFCF and Hulu acquisition."
295
+ },
296
+ {
297
+ "title": "FORWARD-LOOKING STATEMENTS",
298
+ "start_index": 21,
299
+ "end_index": 21,
300
+ "node_id": "0039",
301
+ "summary": "The partial document outlines the inclusion of forward-looking statements in an earnings release, emphasizing that these statements are based on management's views and assumptions about future events and business performance. It highlights that actual results may differ materially due to various factors, including company actions (e.g., restructuring, strategic initiatives, cost rationalization), external developments (e.g., economic conditions, competition, consumer behavior, regulatory changes, technological advancements, labor market activities, and natural disasters), and their potential impacts on operations, profitability, content performance, advertising markets, and taxation. The document also references additional risk factors and analyses detailed in the company's filings with the SEC, such as annual and quarterly reports."
302
+ },
303
+ {
304
+ "title": "PREPARED EARNINGS REMARKS AND CONFERENCE CALL INFORMATION",
305
+ "start_index": 22,
306
+ "end_index": 22,
307
+ "node_id": "0040",
308
+ "summary": "The partial document provides information about The Walt Disney Company's prepared management remarks and a conference call scheduled for February 5, 2025, at 8:30 AM EST/5:30 AM PST, accessible via a live webcast on their investor website. It also mentions that a replay of the webcast will be available on the site. Additionally, contact details for Corporate Communications (David Jefferson) and Investor Relations (Carlos Gomez) are provided."
309
+ }
310
+ ]
311
+ }
tutorials/doc-search/README.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ ## Document Search Examples
4
+
5
+
6
+ PageIndex currently enables reasoning-based RAG within a single document by default.
7
+ For users who need to search across multiple documents, we provide three best-practice workflows for different scenarios below.
8
+
9
+ * [**Search by Metadata**](metadata.md): for documents that can be distinguished by metadata.
10
+ * [**Search by Semantics**](semantics.md): for documents with different semantic content or that cover diverse topics.
11
+ * [**Search by Description**](description.md): a lightweight strategy for a small number of documents.
12
+
13
+
14
+ ## 💬 Support
15
+
16
+ * 🤝 [Join our Discord](https://discord.gg/VuXuf29EUj)
17
+ * 📨 [Contact Us](https://ii2abc2jejf.typeform.com/to/meB40zV0)
tutorials/doc-search/description.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ## Document Search by Description
3
+
4
+ For documents that don't have metadata, you can use LLM-generated descriptions to help with document selection. This is a lightweight approach that works best with a small number of documents.
5
+
6
+
7
+ ### Example Pipeline
8
+
9
+
10
+ #### PageIndex Tree Generation
11
+ Upload all documents into PageIndex to get their `doc_id` and tree structure.
12
+
13
+ #### Description Generation
14
+
15
+ Generate a description for each document based on its PageIndex tree structure and node summaries.
16
+ ```python
17
+ prompt = f"""
18
+ You are given a table of contents structure of a document.
19
+ Your task is to generate a one-sentence description for the document that makes it easy to distinguish from other documents.
20
+
21
+ Document tree structure: {PageIndex_Tree}
22
+
23
+ Directly return the description, do not include any other text.
24
+ """
25
+ ```
26
+
27
+ #### Search with LLM
28
+
29
+ Use an LLM to select relevant documents by comparing the user query against the generated descriptions.
30
+
31
+ Below is a sample prompt for document selection based on their descriptions:
32
+
33
+ ```python
34
+ prompt = f"""
35
+ You are given a list of documents with their IDs, file names, and descriptions. Your task is to select documents that may contain information relevant to answering the user query.
36
+
37
+ Query: {query}
38
+
39
+ Documents: [
40
+ {
41
+ "doc_id": "xxx",
42
+ "doc_name": "xxx",
43
+ "doc_description": "xxx"
44
+ }
45
+ ]
46
+
47
+ Response Format:
48
+ {{
49
+ "thinking": "<Your reasoning for document selection>",
50
+ "answer": <Python list of relevant doc_ids>, e.g. ['doc_id1', 'doc_id2']. Return [] if no documents are relevant.
51
+ }}
52
+
53
+ Return only the JSON structure, with no additional output.
54
+ """
55
+ ```
56
+
57
+ #### Retrieve with PageIndex
58
+
59
+ Use the PageIndex `doc_id` of the retrieved documents to perform further retrieval via the PageIndex retrieval API.
60
+
61
+
62
+
63
+ ## 💬 Help & Community
64
+ Contact us if you need any advice on conducting document searches for your use case.
65
+
66
+ - 🤝 [Join our Discord](https://discord.gg/VuXuf29EUj)
67
+ - 📨 [Leave us a message](https://ii2abc2jejf.typeform.com/to/meB40zV0)
tutorials/doc-search/metadata.md ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ ## Document Search by Metadata
4
+ <callout>PageIndex with metadata support is in closed beta. Fill out [this form](https://ii2abc2jejf.typeform.com/to/meB40zV0) to request early access to this feature.</callout>
5
+
6
+ For documents that can be easily distinguished by metadata, we recommend using metadata to search the documents.
7
+ This method is ideal for the following document types:
8
+ - Financial reports categorized by company and time period
9
+ - Legal documents categorized by case type
10
+ - Medical records categorized by patient or condition
11
+ - And many others
12
+
13
+ In such cases, you can search documents by leveraging their metadata. A popular method is to use "Query to SQL" for document retrieval.
14
+
15
+
16
+ ### Example Pipeline
17
+
18
+ #### PageIndex Tree Generation
19
+ Upload all documents into PageIndex to get their `doc_id`.
20
+
21
+ #### Set up SQL tables
22
+
23
+ Store documents along with their metadata and the PageIndex `doc_id` in a database table.
24
+
25
+ #### Query to SQL
26
+
27
+ Use an LLM to transform a user’s retrieval request into a SQL query to fetch relevant documents.
28
+
29
+ #### Retrieve with PageIndex
30
+
31
+ Use the PageIndex `doc_id` of the retrieved documents to perform further retrieval via the PageIndex retrieval API.
32
+
33
+ ## 💬 Help & Community
34
+ Contact us if you need any advice on conducting document searches for your use case.
35
+
36
+ - 🤝 [Join our Discord](https://discord.gg/VuXuf29EUj)
37
+ - 📨 [Leave us a message](https://ii2abc2jejf.typeform.com/to/meB40zV0)
tutorials/doc-search/semantics.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Document Search by Semantics
2
+
3
+ For documents that cover diverse topics, one can also use vector-based semantic search to select relevant documents. The procedure is slightly different from the classic vector-search-based method.
4
+
5
+ ### Example Pipeline
6
+
7
+
8
+ #### Chunking and Embedding
9
+ Divide the documents into chunks, choose an embedding model to convert the chunks into vectors and store each vector with its corresponding `doc_id` in a vector database.
10
+
11
+
12
+ #### Vector Search
13
+
14
+ For each query, conduct a vector-based search to get the top-K chunks with their corresponding documents.
15
+
16
+ #### Compute Document Score
17
+
18
+ For each document, calculate a relevance score. Let N be the number of content chunks associated with each document, and let **ChunkScore**(n) be the relevance score of chunk n. The document score is computed as:
19
+
20
+
21
+ $$
22
+ \text{DocScore}=\frac{1}{\sqrt{N+1}}\sum_{n=1}^N \text{ChunkScore}(n)
23
+ $$
24
+
25
+ - The sum aggregates relevance from all related chunks.
26
+ - The +1 inside the square root ensures the formula handles documents with zero chunks.
27
+ - Using the square root in the denominator allows the score to increase with the number of relevant chunks, but with diminishing returns. This rewards documents with more relevant chunks, while preventing large documents from dominating due to quantity alone.
28
+ - This scoring favors documents with fewer, highly relevant chunks over those with many weakly relevant ones.
29
+
30
+
31
+ #### Retrieve with PageIndex
32
+
33
+ Select the documents with the highest DocScore, then use their `doc_id` to perform further retrieval via the PageIndex retrieval API.
34
+
35
+
36
+
37
+ ## 💬 Help & Community
38
+ Contact us if you need any advice on conducting document searches for your use case.
39
+
40
+ - 🤝 [Join our Discord](https://discord.gg/VuXuf29EUj)
41
+ - 📨 [Leave us a message](https://ii2abc2jejf.typeform.com/to/meB40zV0)
tutorials/tree-search/README.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Tree Search Examples
2
+ This tutorial provides a basic example of how to perform retrieval using the PageIndex tree.
3
+
4
+ ### Basic LLM Tree Search Example
5
+ A simple strategy is to use an LLM agent to conduct tree search. Here is a basic tree search prompt.
6
+
7
+ ```python
8
+ prompt = f"""
9
+ You are given a query and the tree structure of a document.
10
+ You need to find all nodes that are likely to contain the answer.
11
+
12
+ Query: {query}
13
+
14
+ Document tree structure: {PageIndex_Tree}
15
+
16
+ Reply in the following JSON format:
17
+ {{
18
+ "thinking": <your reasoning about which nodes are relevant>,
19
+ "node_list": [node_id1, node_id2, ...]
20
+ }}
21
+ """
22
+ ```
23
+ <callout>
24
+ In our dashboard and retrieval API, we use a combination of LLM tree search and value function-based Monte Carlo Tree Search ([MCTS](https://en.wikipedia.org/wiki/Monte_Carlo_tree_search)). More details will be released soon.
25
+ </callout>
26
+
27
+ ### Integrating User Preference or Expert Knowledge
28
+ Unlike vector-based RAG where integrating expert knowledge or user preference requires fine-tuning the embedding model, in PageIndex, you can incorporate user preferences or expert knowledge by simply adding knowledge to the LLM tree search prompt. Here is an example pipeline.
29
+
30
+
31
+ #### 1. Preference Retrieval
32
+
33
+ When a query is received, the system selects the most relevant user preference or expert knowledge snippets from a database or a set of domain-specific rules. This can be done using keyword matching, semantic similarity, or LLM-based relevance search.
34
+
35
+ #### 2. Tree Search with Preference
36
+ Integrating preference into the tree search prompt.
37
+
38
+ **Enhanced Tree Search with Expert Preference Example**
39
+
40
+ ```python
41
+ prompt = f"""
42
+ You are given a question and a tree structure of a document.
43
+ You need to find all nodes that are likely to contain the answer.
44
+
45
+ Query: {query}
46
+
47
+ Document tree structure: {PageIndex_Tree}
48
+
49
+ Expert Knowledge of relevant sections: {Preference}
50
+
51
+ Reply in the following JSON format:
52
+ {{
53
+ "thinking": <reasoning about which nodes are relevant>,
54
+ "node_list": [node_id1, node_id2, ...]
55
+ }}
56
+ """
57
+ ```
58
+
59
+ **Example Expert Preference**
60
+ > If the query mentions EBITDA adjustments, prioritize Item 7 (MD&A) and footnotes in Item 8 (Financial Statements) in 10-K reports.
61
+
62
+
63
+
64
+ By integrating user or expert preferences, node search becomes more targeted and effective, leveraging both the document structure and domain-specific insights.
65
+
66
+ ## 💬 Help & Community
67
+ Contact us if you need any advice on conducting document searches for your use case.
68
+
69
+ - 🤝 [Join our Discord](https://discord.gg/VuXuf29EUj)
70
+ - 📨 [Leave us a message](https://ii2abc2jejf.typeform.com/to/tK3AXl8T)