Spaces:
Running
Running
Update pageindex/core/tree_index.py
Browse files- pageindex/core/tree_index.py +68 -0
pageindex/core/tree_index.py
CHANGED
|
@@ -148,6 +148,74 @@ class TreeIndex:
|
|
| 148 |
if node.get('nodes'):
|
| 149 |
self._add_node_ids(node['nodes'], prefix=f"{node_id}.")
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
def reasoning_search(self, query: str, llm_client: Any, model: str = "gpt-4-turbo") -> str:
|
| 152 |
"""
|
| 153 |
Performs a tree search to find relevant nodes for the query.
|
|
|
|
| 148 |
if node.get('nodes'):
|
| 149 |
self._add_node_ids(node['nodes'], prefix=f"{node_id}.")
|
| 150 |
|
| 151 |
+
def reasoning_search_stream(self, query: str, llm_client: Any, model: str = "gpt-4-turbo"):
    """Streamed version of ``reasoning_search``.

    Yields ``<<<STATUS: ...>>>`` progress markers while searching, and
    finally yields the payload itself: the assembled context string, or a
    human-readable message when nothing was found / an error occurred.
    The payload is the only yielded item NOT wrapped in a STATUS marker,
    so consumers can distinguish progress from the result.

    NOTE(review): the original implementation ``return``-ed the context
    from inside the generator, so iterating callers never received it
    (a generator's return value is only reachable via
    ``StopIteration.value``). It is now yielded, matching the docstring.

    Args:
        query: Natural-language question to locate in the document tree.
        llm_client: OpenAI-compatible client exposing
            ``chat.completions.create``.
        model: Chat model used for the tree-search reasoning step.

    Yields:
        str: STATUS markers, then the final context / message.
    """
    if not self.tree:
        yield "<<<STATUS: Tree not built. Upload document first.>>>"
        return

    yield "<<<STATUS: Analyzing document structure...>>>"
    tree_summary = self._get_tree_structure_summary(self.tree)

    prompt = f"""
You are given a query and the tree structure of a document.
You need to find all nodes that are likely to contain the answer.

Query: {query}

Document tree structure:
{json.dumps(tree_summary, indent=2)}

Reply in the following JSON format:
{{
    "thinking": <your reasoning about which nodes are relevant>,
    "node_list": ["node_id1", "node_id2", ...]
}}
"""

    try:
        yield "<<<STATUS: Querying LLM for relevant sections...>>>"
        response = llm_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1
        )

        content = response.choices[0].message.content
        # Basic JSON extraction: strip an optional markdown code fence.
        if "```json" in content:
            content = content.split("```json")[1].split("```")[0].strip()
        elif "```" in content:
            content = content.split("```")[1].split("```")[0].strip()

        result = json.loads(content)
        thinking = result.get("thinking", "No reasoning provided.")
        yield f"<<<STATUS: LLM Reasoning: {thinking}>>>"

        relevant_node_ids = result.get("node_list", [])
        yield f"<<<STATUS: Identifying {len(relevant_node_ids)} relevant sections...>>>"

        # Retrieve text for each node the LLM selected; unknown ids are
        # silently skipped (``_find_node_by_id`` returning a falsy value).
        context_parts = []
        for node_id in relevant_node_ids:
            node = self._find_node_by_id(self.tree, node_id)
            if node:
                yield f"<<<STATUS: Reading section: {node['title']}...>>>"
                context_parts.append(f"--- Section: {node['title']} (ID: {node_id}) ---\n{node['text']}\n")

        full_context = "\n".join(context_parts)
        if not full_context:
            yield "<<<STATUS: No relevant content found.>>>"
            yield "No relevant context found."
            return

        # Final payload — the only non-STATUS item on the happy path.
        yield full_context

    except Exception as e:
        # Broad catch is deliberate at this streaming boundary: the
        # consumer gets the failure as a STATUS marker plus a payload
        # instead of a raised exception mid-stream.
        yield f"<<<STATUS: Error: {str(e)}>>>"
        yield f"Error: {str(e)}"
| 219 |
def reasoning_search(self, query: str, llm_client: Any, model: str = "gpt-4-turbo") -> str:
|
| 220 |
"""
|
| 221 |
Performs a tree search to find relevant nodes for the query.
|