"""Petromind AI - PageIndex RAG Gradio app.

Builds a PageIndex tree over Docling-produced markdown and exposes two
Gradio endpoints:

* ``process_docling_and_chat`` - streaming chat/query over the document.
* ``extract_tables`` - bulk extraction of all tables as JSON.

NOTE(review): the original ``<<...>>`` status-marker strings were lost
(stripped to empty ``<<>>`` placeholders) in the source this file was
recovered from.  The marker texts below are reconstructed; confirm them
against the consuming frontend, which appears to key on the ``<<``/``>>``
delimiters.
"""

import json
import os
import re
import time

import gradio as gr

from pageindex.core.tree_index import TreeIndex
from llm_config import get_llm_client, get_model_name

# Security: the shared-secret token is read from the APP_TOKEN env var.
# NOTE(review): the hard-coded fallback below (also used as the default
# value of the UI token fields) is a credential committed to source
# control.  It is kept only for backward compatibility with existing
# deployments; APP_TOKEN should always be set and this default rotated.
REQUIRED_TOKEN = os.getenv("APP_TOKEN", "849ejdkf2Audjo2Jf3jdoirfjh")

# Characters of raw markdown used when tree search cannot supply context.
_FALLBACK_CONTEXT_CHARS = 15000

# Query used to steer the tree search toward table-rich sections.
_TABLE_QUERY = """
Find all tables in the document including: Well Headers, Formation Tops,
Casing Details, Drilling Data, Directional Surveys, Core Analysis,
Cementing Records, BHA records, Cuttings Descriptions, and any other
tabular data. Extract ALL rows and columns from each table found.
"""

# System prompt for the dedicated table-extraction endpoint.
_EXTRACTION_PROMPT = """You are a Petroleum Data Extraction Expert. Your task is to extract ALL tables AND convert structured paragraph data into tables from the provided document context.

CRITICAL INSTRUCTIONS - READ CAREFULLY:
1. **EXTRACT ALL ROWS**: You MUST extract EVERY SINGLE ROW from each table. Do NOT skip rows, do NOT truncate, do NOT summarize.
2. **NO PARTIAL DATA**: If a table has 10 rows, you must return all 10 rows. If it has 100 rows, return all 100 rows.
3. **CONVERT PARAGRAPHS TO TABLES**: If you find formation tops, lithology data, or any structured data in text paragraphs (e.g., "Formation X encountered at 1000m depth"), CONVERT it into a proper table with columns and rows.
4. **COMPLETE EXTRACTION**: Count the rows in the source table and verify you extracted the same number.
5. **DO NOT SUMMARIZE**: Never say "etc" or "..." or truncate with "...". Every row must be fully extracted.
6. **SCRAPE PARAGRAPHS**: Look for:
   - Formation tops mentioned in text (e.g., "Eleana Formation at 2594 feet")
   - Lithology descriptions with depths
   - Drilling events with dates/depths
   - Equipment lists in bullet points
   - Any sequential data that can be tabulated

**O&G TABLE CATEGORIES TO EXTRACT (including from paragraphs):**
- Well Headers / Well Identification / Site Data
- Formation Tops / Lithology / Stratigraphy (LOOK IN TEXT PARAGRAPHS TOO!)
- Directional Survey / Well Path / Azimuth/Inclination data
- Casing Records / Casing Data / Tubing specifications
- Cementing Data / Cement Composition / Bond logs
- Drilling Fluids / Mud Properties / Fluid Management
- Core Analysis / Core Data / Petrophysics
- Sidewall Samples / SWC data
- Production Tests / DST / Pressure tests / Flow rates
- Perforation Data / Completion details
- Geophysical Logs / Wireline logs / Logging runs
- Equipment Lists / BHA / Drill string components
- Personnel / Company representatives / Supervisors
- Timelines / Drilling events / Days depths
- Cost data / AFE estimates

**PARAGRAPH-TO-TABLE CONVERSION EXAMPLES:**
If text says: "The Eleana Dolomite was encountered at 2,594 ft MD (2,594 ft TVD)..."
CREATE: {"title": "Formation Tops", "headers": ["Formation", "Depth_ft", "Depth_m"], "rows": [...]}

EXTRACTION REQUIREMENTS:
- Find ALL tables in the document
- CONVERT paragraph data describing formations, depths, lithology INTO tables
- For each table, extract:
  - "title": A descriptive title for the table
  - "headers": Array of column names
  - "rows": Array of row objects - MUST INCLUDE ALL ROWS
  - "page_number": The page number where this table appears
- **BE THOROUGH**: A typical completion report has 15-25+ separate tables. If you only found 3-5, you missed some. Scan paragraphs too!

Return VALID JSON ONLY in this exact format:
{
  "tables": [
    {
      "title": "Well Header Information",
      "headers": ["Well Name", "API Number", "Operator", "Location"],
      "rows": [
        {"Well Name": "OzAlpha-1", "API Number": "42-001", "Operator": "PetroCorp", "Location": "Texas"}
      ],
      "page_number": 1
    }
  ]
}

VERIFICATION STEP:
1. Count tables found in explicit table format
2. Count data found in paragraphs that could be tables
3. Total should be 15-25+ for a completion report
4. Before returning, verify you converted paragraph data to tables

Return ONLY the JSON, no markdown, no explanations, no code blocks."""

# System prompt for the chat/query endpoint.
_CHAT_SYSTEM_PROMPT = """You are a Senior Petroleum Engineer assistant. Your goal is to extract precise technical data from the provided document context.

**Guidelines:**
1. **Tables**: If the user asks for data that can be tabulated (e.g., formation tops, casing, surveys, fluid props), **ALWAYS** format the output as a Markdown table.
2. **Completeness**: Extract ALL relevant data. Do NOT summarize or omit rows.
3. **Inference**: If data is text-based (e.g., "X formation at 1000m"), structure it into a table.
4. **No "Not Found"**: If you found related data, present that as the answer.
5. **Tone**: Technical, precise, no fluff.
6. **Charts**: If requested, visualize data using this JSON format:
```json:chart
{
  "type": "line" | "bar" | "area" | "scatter",
  "title": "Title",
  "xAxis": "x_label",
  "yAxis": "y_label",
  "data": [{"x_label": 0, "y_label": 10}, ...]
}
```
"""


def _build_tree(markdown_text):
    """Build a PageIndex ``TreeIndex`` from markdown.

    Never raises: a build failure is logged and the (possibly empty)
    tree is returned so callers can fall back to raw-text context.
    """
    tree = TreeIndex()
    try:
        tree.build_from_markdown(markdown_text)
        print("[PageIndex] Tree index built successfully.")
    except Exception as e:
        print(f"[PageIndex] Tree build error: {e}, using fallback.")
    return tree


def _get_client():
    """Return ``(client, model)``, preferring Nvidia with Mistral fallback.

    Raises the Mistral initialisation error if both providers fail;
    callers translate that into their endpoint-specific error shape.
    """
    try:
        return (get_llm_client(provider="nvidia"),
                get_model_name(provider="nvidia"))
    except Exception as e:
        print(f"[PageIndex] Nvidia client failed: {e}. Falling back to Mistral.")
        return (get_llm_client(provider="mistral"),
                get_model_name(provider="mistral"))


def _parse_tables_response(response_text):
    """Best-effort extraction of a JSON object from an LLM response.

    Tries, in order: direct parse, a fenced ```json code block, an
    object containing a "tables" key, and finally any ``{...}`` span.
    Returns the parsed object, or ``None`` if nothing parses.
    """
    response_text = response_text.strip()
    candidates = [response_text]
    # Fenced markdown code block (```json ... ```).
    m = re.search(r'```(?:json)?\s*(\{.*\})\s*```', response_text, re.DOTALL)
    if m:
        candidates.append(m.group(1))
    # An object that mentions the "tables" key somewhere.
    m = re.search(r'\{[\s\S]*"tables"[\s\S]*\}', response_text)
    if m:
        candidates.append(m.group(0))
    # Any JSON-like span at all.
    m = re.search(r'\{.*\}', response_text, re.DOTALL)
    if m:
        candidates.append(m.group(0))
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return None


def extract_tables_from_markdown(markdown_text, token):
    """Extract all tables from the markdown document.

    Returns a JSON string of the form ``{"tables": [...]}``; on failure
    an ``"error"`` key is included and ``"tables"`` is empty.
    """
    if token != REQUIRED_TOKEN:
        return json.dumps({"error": "Invalid Authentication Token", "tables": []})
    if not markdown_text:
        return json.dumps({"error": "No markdown content provided", "tables": []})

    try:
        print(f"[PageIndex] Starting table extraction from {len(markdown_text)} chars...")

        # 1. Build the PageIndex tree (best-effort).
        tree = _build_tree(markdown_text)

        # 2. Initialise the LLM client.
        try:
            client, model = _get_client()
        except Exception as e:
            return json.dumps({"error": f"LLM client error: {str(e)}", "tables": []})

        # 3. Gather a table-rich context via tree search, falling back
        #    to the head of the raw document.
        context = ""
        try:
            if hasattr(tree, 'reasoning_search'):
                context = tree.reasoning_search(
                    query=_TABLE_QUERY, llm_client=client, model=model)
            else:
                context = markdown_text[:_FALLBACK_CONTEXT_CHARS]
        except Exception as e:
            print(f"[PageIndex] Tree search error: {e}, using fallback.")
            context = markdown_text[:_FALLBACK_CONTEXT_CHARS]
        if not context or len(context) < 100:
            # Too little context to be useful - use the document head.
            context = markdown_text[:_FALLBACK_CONTEXT_CHARS]

        # 4. Ask the LLM for structured JSON tables.
        messages = [
            {"role": "system", "content": _EXTRACTION_PROMPT},
            {"role": "user",
             "content": f"Document Context:\n{context}\n\nExtract all tables as JSON."},
        ]
        print("[PageIndex] Sending table extraction request to LLM...")
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            stream=False,
            max_tokens=16384,
            temperature=0,
        )
        response_text = response.choices[0].message.content
        print(f"[PageIndex] LLM response received: {len(response_text)} chars")

        data = _parse_tables_response(response_text)
        if data and "tables" in data:
            tables = data["tables"]
            # Normalise: every table carries a page number and source tag.
            for table in tables:
                table.setdefault("page_number", 1)
                table.setdefault("source", "PageIndex")
            print(f"[PageIndex] Successfully extracted {len(tables)} tables.")
            return json.dumps({"tables": tables})

        # No valid JSON found anywhere in the response.
        print(f"[PageIndex] No valid JSON found in response. "
              f"Raw preview: {response_text[:500]}")
        return json.dumps({"tables": []})

    except Exception as e:
        print(f"[PageIndex] Table extraction error: {e}")
        return json.dumps({"error": str(e), "tables": []})


def process_docling_and_chat(markdown_text, user_query, token, chat_history_json=None):
    """Answer ``user_query`` over ``markdown_text`` using PageIndex RAG.

    Generator: yields cumulative status/answer text for streaming UI
    updates.  ``chat_history_json`` is an optional JSON-encoded list of
    ``{"role": ..., "content": ...}`` messages.
    """
    start_time = time.time()

    # --- Input validation -------------------------------------------------
    if token != REQUIRED_TOKEN:
        yield "<<ERROR: Invalid Authentication Token>>"
        return
    if not markdown_text:
        yield "<<ERROR: No markdown content provided>>"
        return
    if not user_query:
        yield "<<ERROR: No query provided>>"
        return

    try:
        # --- Chat-history parsing (best-effort) ---------------------------
        chat_history = []
        if chat_history_json:
            try:
                chat_history = json.loads(chat_history_json)
            except Exception as e:
                print(f"[PageIndex] Warning: Could not parse chat history: {e}")

        reasoning_log = ""
        yield "<<STATUS: Starting PageIndex analysis...>>"

        # 1. Build the PageIndex tree locally in the Space.
        reasoning_log += "<<STATUS: Building document tree index...>>\n"
        yield reasoning_log
        tree = TreeIndex()
        try:
            tree.build_from_markdown(markdown_text)
            reasoning_log += "<<STATUS: Tree index built successfully.>>\n"
            yield reasoning_log
        except Exception as e:
            print(f"[PageIndex] Tree build error: {e}")
            reasoning_log += f"<<WARNING: Tree build failed ({e}); continuing with fallback.>>\n"
            yield reasoning_log

        # 2. Initialise the LLM client.
        reasoning_log += "<<STATUS: Initializing LLM client...>>\n"
        yield reasoning_log
        try:
            client, model = _get_client()
            reasoning_log += f"<<STATUS: LLM client ready (model: {model}).>>\n"
        except Exception as e2:
            yield f"<<ERROR: LLM client error: {str(e2)}>>"
            return
        yield reasoning_log

        # 3. Reasoning search (streamed when supported).
        reasoning_log += "<<STATUS: Searching document tree...>>\n"
        yield reasoning_log
        context = ""
        search_success = False
        if hasattr(tree, 'reasoning_search_stream'):
            try:
                for update in tree.reasoning_search_stream(
                        user_query=user_query, llm_client=client, model=model):
                    # Status updates are wrapped in <<...>> markers;
                    # anything else is the final retrieved context.
                    if update.startswith("<<"):
                        reasoning_log += update + "\n"
                        yield reasoning_log
                    else:
                        context = update
                        search_success = True
            except Exception as e:
                print(f"[PageIndex] Streaming search error: {e}")
                reasoning_log += f"<<WARNING: Streaming search failed ({e}).>>\n"
                yield reasoning_log

        # Fall back to the non-streaming search if needed.
        if not search_success:
            try:
                reasoning_log += "<<STATUS: Falling back to standard search...>>\n"
                yield reasoning_log
                context = tree.reasoning_search(
                    query=user_query, llm_client=client, model=model)
                search_success = True
            except Exception as e:
                print(f"[PageIndex] Standard search error: {e}")
                # Last resort: head of the raw document.
                context = markdown_text[:8000]
                reasoning_log += f"<<WARNING: Search failed ({e}); using document head as context.>>\n"
                yield reasoning_log

        if not context or context.strip() == "":
            # No usable context from the tree: splice together the first
            # and last parts of the document instead.
            context = (markdown_text[:4000]
                       + "\n\n...[MIDDLE SECTIONS OMITTED]...\n\n"
                       + markdown_text[-4000:])

        # 4. Final answer generation (streamed).
        reasoning_log += "<<STATUS: Generating final answer...>>\n"
        yield reasoning_log

        messages = [{"role": "system", "content": _CHAT_SYSTEM_PROMPT}]
        # Replay prior conversation turns.
        for msg in chat_history:
            messages.append({
                "role": msg.get("role", "user"),
                "content": msg.get("content", ""),
            })
        messages.append({
            "role": "user",
            "content": (f"Context from document:\n{context}\n\n"
                        f"User Query: {user_query}\n\n"
                        "If the query requests tabular data, provide a complete "
                        "Markdown Table with all rows."),
        })

        try:
            response_stream = client.chat.completions.create(
                model=model,
                messages=messages,
                stream=True,
                max_tokens=8192,
                temperature=0,
            )
            full_response_text = ""
            banner = "\n" + "=" * 50 + "\nFINAL ANSWER:\n" + "=" * 50 + "\n"
            for chunk in response_stream:
                if chunk.choices[0].delta.content:
                    full_response_text += chunk.choices[0].delta.content
                    # Yield the cumulative log + answer so the UI streams.
                    yield reasoning_log + banner + full_response_text

            elapsed = time.time() - start_time
            print(f"[PageIndex] Request completed in {elapsed:.2f}s")
        except Exception as e:
            print(f"[PageIndex] LLM generation error: {e}")
            yield reasoning_log + f"\n\nError generating response: {str(e)}"

    except Exception as e:
        error_msg = f"An error occurred: {str(e)}"
        print(f"[PageIndex] {error_msg}")
        yield f"<<ERROR: {error_msg}>>"


# --- Gradio UI ------------------------------------------------------------
with gr.Blocks(title="Petromind AI - PageIndex RAG") as demo:
    gr.Markdown("# Oil & Gas Report - PageIndex RAG")
    gr.Markdown("Upload document content (markdown format) and ask questions "
                "to extract specific information using PageIndex reasoning.")

    with gr.Tab("Chat / Query"):
        with gr.Row():
            with gr.Column(scale=1):
                input_md = gr.Textbox(
                    label="Paste Docling Markdown Here",
                    lines=15,
                    placeholder="# Document Title\n\n## Section 1\nContent...",
                )
            with gr.Column(scale=1):
                query = gr.Textbox(
                    label="What do you want to extract?",
                    placeholder="e.g., Extract all formation tops tables with depths",
                )
                # NOTE(review): pre-filling the secret token into the UI
                # defeats the token check for anyone who can load the page.
                token_input = gr.Textbox(
                    label="API Token",
                    placeholder="Enter access token",
                    type="password",
                    value="849ejdkf2Audjo2Jf3jdoirfjh",
                )
                history_json = gr.Textbox(visible=False, label="History JSON")
                btn = gr.Button("Analyze", variant="primary")
        output = gr.Textbox(label="Result", lines=15, interactive=False)
        btn.click(
            fn=process_docling_and_chat,
            inputs=[input_md, query, token_input, history_json],
            outputs=output,
            api_name="process_docling_and_chat",
        )

    with gr.Tab("Table Extraction"):
        with gr.Row():
            with gr.Column(scale=1):
                table_input_md = gr.Textbox(
                    label="Paste Docling Markdown Here",
                    lines=15,
                    placeholder="# Document Title\n\n## Section 1\nContent...",
                )
            with gr.Column(scale=1):
                # NOTE(review): same pre-filled secret as above.
                table_token_input = gr.Textbox(
                    label="API Token",
                    placeholder="Enter access token",
                    type="password",
                    value="849ejdkf2Audjo2Jf3jdoirfjh",
                )
                table_btn = gr.Button("Extract All Tables", variant="primary")
        table_output = gr.Textbox(label="Extracted Tables (JSON)",
                                  lines=15, interactive=False)
        table_btn.click(
            fn=extract_tables_from_markdown,
            inputs=[table_input_md, table_token_input],
            outputs=table_output,
            api_name="extract_tables",
        )


if __name__ == "__main__":
    # Enable the request queue so concurrent users are handled.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)