Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -67,18 +67,24 @@ def extract_tables_from_markdown(markdown_text, token):
|
|
| 67 |
context = markdown_text[:15000]
|
| 68 |
|
| 69 |
# 4. Generate structured JSON tables
|
| 70 |
-
extraction_prompt = """You are a Petroleum Data Extraction Expert. Your task is to extract ALL tables
|
| 71 |
|
| 72 |
CRITICAL INSTRUCTIONS - READ CAREFULLY:
|
| 73 |
1. **EXTRACT ALL ROWS**: You MUST extract EVERY SINGLE ROW from each table. Do NOT skip rows, do NOT truncate, do NOT summarize.
|
| 74 |
2. **NO PARTIAL DATA**: If a table has 10 rows, you must return all 10 rows. If it has 100 rows, return all 100 rows.
|
| 75 |
-
3. **
|
| 76 |
-
4. **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
**
|
| 79 |
-
When scanning the document, look for these O&G table categories (extract ALL that you find):
|
| 80 |
- Well Headers / Well Identification / Site Data
|
| 81 |
-
- Formation Tops / Lithology / Stratigraphy
|
| 82 |
- Directional Survey / Well Path / Azimuth/Inclination data
|
| 83 |
- Casing Records / Casing Data / Tubing specifications
|
| 84 |
- Cementing Data / Cement Composition / Bond logs
|
|
@@ -91,17 +97,21 @@ When scanning the document, look for these O&G table categories (extract ALL tha
|
|
| 91 |
- Equipment Lists / BHA / Drill string components
|
| 92 |
- Personnel / Company representatives / Supervisors
|
| 93 |
- Timelines / Drilling events / Days depths
|
| 94 |
-
- Cost data / AFE estimates
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
EXTRACTION REQUIREMENTS:
|
| 98 |
-
- Find ALL tables in the document
|
|
|
|
| 99 |
- For each table, extract:
|
| 100 |
- "title": A descriptive title for the table
|
| 101 |
-
- "headers": Array of column names
|
| 102 |
-
- "rows": Array of row objects
|
| 103 |
- "page_number": The page number where this table appears
|
| 104 |
-
- **BE THOROUGH**: A typical completion report has
|
| 105 |
|
| 106 |
Return VALID JSON ONLY in this exact format:
|
| 107 |
|
|
@@ -119,13 +129,10 @@ Return VALID JSON ONLY in this exact format:
|
|
| 119 |
}
|
| 120 |
|
| 121 |
VERIFICATION STEP:
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
- Smaller tables embedded in text sections
|
| 127 |
-
- Equipment lists, BHA details, logging summaries
|
| 128 |
-
- Data tables you may have skipped as "minor"
|
| 129 |
|
| 130 |
Return ONLY the JSON, no markdown, no explanations, no code blocks."""
|
| 131 |
|
|
@@ -140,34 +147,65 @@ Return ONLY the JSON, no markdown, no explanations, no code blocks."""
|
|
| 140 |
model=model,
|
| 141 |
messages=messages,
|
| 142 |
stream=False,
|
| 143 |
-
max_tokens=
|
| 144 |
-
temperature=0
|
| 145 |
)
|
| 146 |
|
| 147 |
response_text = response.choices[0].message.content
|
| 148 |
print(f"[PageIndex] LLM response received: {len(response_text)} chars")
|
| 149 |
|
| 150 |
-
# Parse JSON from response
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
return json.dumps({"tables": []})
|
| 172 |
|
| 173 |
except Exception as e:
|
|
@@ -330,7 +368,7 @@ Your goal is to extract precise technical data from the provided document contex
|
|
| 330 |
messages=messages,
|
| 331 |
stream=True,
|
| 332 |
max_tokens=8192,
|
| 333 |
-
temperature=0
|
| 334 |
)
|
| 335 |
|
| 336 |
full_response_text = ""
|
|
|
|
| 67 |
context = markdown_text[:15000]
|
| 68 |
|
| 69 |
# 4. Generate structured JSON tables
|
| 70 |
+
extraction_prompt = """You are a Petroleum Data Extraction Expert. Your task is to extract ALL tables AND convert structured paragraph data into tables from the provided document context.
|
| 71 |
|
| 72 |
CRITICAL INSTRUCTIONS - READ CAREFULLY:
|
| 73 |
1. **EXTRACT ALL ROWS**: You MUST extract EVERY SINGLE ROW from each table. Do NOT skip rows, do NOT truncate, do NOT summarize.
|
| 74 |
2. **NO PARTIAL DATA**: If a table has 10 rows, you must return all 10 rows. If it has 100 rows, return all 100 rows.
|
| 75 |
+
3. **CONVERT PARAGRAPHS TO TABLES**: If you find formation tops, lithology data, or any structured data in text paragraphs (e.g., "Formation X encountered at 1000m depth"), CONVERT it into a proper table with columns and rows.
|
| 76 |
+
4. **COMPLETE EXTRACTION**: Count the rows in the source table and verify you extracted the same number.
|
| 77 |
+
5. **DO NOT SUMMARIZE**: Never say "etc" or "..." or truncate with "...". Every row must be fully extracted.
|
| 78 |
+
6. **SCRAPE PARAGRAPHS**: Look for:
|
| 79 |
+
- Formation tops mentioned in text (e.g., "Eleana Formation at 2594 feet")
|
| 80 |
+
- Lithology descriptions with depths
|
| 81 |
+
- Drilling events with dates/depths
|
| 82 |
+
- Equipment lists in bullet points
|
| 83 |
+
- Any sequential data that can be tabulated
|
| 84 |
|
| 85 |
+
**O&G TABLE CATEGORIES TO EXTRACT (including from paragraphs):**
|
|
|
|
| 86 |
- Well Headers / Well Identification / Site Data
|
| 87 |
+
- Formation Tops / Lithology / Stratigraphy (LOOK IN TEXT PARAGRAPHS TOO!)
|
| 88 |
- Directional Survey / Well Path / Azimuth/Inclination data
|
| 89 |
- Casing Records / Casing Data / Tubing specifications
|
| 90 |
- Cementing Data / Cement Composition / Bond logs
|
|
|
|
| 97 |
- Equipment Lists / BHA / Drill string components
|
| 98 |
- Personnel / Company representatives / Supervisors
|
| 99 |
- Timelines / Drilling events / Days depths
|
| 100 |
+
- Cost data / AFE estimates
|
| 101 |
+
|
| 102 |
+
**PARAGRAPH-TO-TABLE CONVERSION EXAMPLES:**
|
| 103 |
+
If text says: "The Eleana Dolomite was encountered at 2,594 ft MD (2,594 ft TVD)..."
|
| 104 |
+
CREATE: {"title": "Formation Tops", "headers": ["Formation", "Depth_ft", "Depth_m"], "rows": [...]}
|
| 105 |
|
| 106 |
EXTRACTION REQUIREMENTS:
|
| 107 |
+
- Find ALL tables in the document
|
| 108 |
+
- CONVERT paragraph data describing formations, depths, lithology INTO tables
|
| 109 |
- For each table, extract:
|
| 110 |
- "title": A descriptive title for the table
|
| 111 |
+
- "headers": Array of column names
|
| 112 |
+
- "rows": Array of row objects - MUST INCLUDE ALL ROWS
|
| 113 |
- "page_number": The page number where this table appears
|
| 114 |
+
- **BE THOROUGH**: A typical completion report has 15-25+ separate tables. If you only found 3-5, you missed some. Scan paragraphs too!
|
| 115 |
|
| 116 |
Return VALID JSON ONLY in this exact format:
|
| 117 |
|
|
|
|
| 129 |
}
|
| 130 |
|
| 131 |
VERIFICATION STEP:
|
| 132 |
+
1. Count tables found in explicit table format
|
| 133 |
+
2. Count data found in paragraphs that could be tables
|
| 134 |
+
3. Total should be 15-25+ for a completion report
|
| 135 |
+
4. Before returning, verify you converted paragraph data to tables
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
Return ONLY the JSON, no markdown, no explanations, no code blocks."""
|
| 138 |
|
|
|
|
| 147 |
model=model,
|
| 148 |
messages=messages,
|
| 149 |
stream=False,
|
| 150 |
+
max_tokens=16384,
|
| 151 |
+
temperature=0
|
| 152 |
)
|
| 153 |
|
| 154 |
response_text = response.choices[0].message.content
|
| 155 |
print(f"[PageIndex] LLM response received: {len(response_text)} chars")
|
| 156 |
|
| 157 |
+
# Parse JSON from response - handle markdown code blocks
|
| 158 |
+
response_text = response_text.strip()
|
| 159 |
+
|
| 160 |
+
# Try multiple extraction strategies
|
| 161 |
+
data = None
|
| 162 |
+
|
| 163 |
+
# Strategy 1: Try direct JSON parse
|
| 164 |
+
try:
|
| 165 |
+
data = json.loads(response_text)
|
| 166 |
+
except json.JSONDecodeError:
|
| 167 |
+
pass
|
| 168 |
+
|
| 169 |
+
# Strategy 2: Extract JSON from markdown code block
|
| 170 |
+
if data is None:
|
| 171 |
+
code_block_match = re.search(r'```(?:json)?\s*(\{.*\})\s*```', response_text, re.DOTALL)
|
| 172 |
+
if code_block_match:
|
| 173 |
+
try:
|
| 174 |
+
data = json.loads(code_block_match.group(1))
|
| 175 |
+
except json.JSONDecodeError:
|
| 176 |
+
pass
|
| 177 |
+
|
| 178 |
+
# Strategy 3: Extract JSON object directly
|
| 179 |
+
if data is None:
|
| 180 |
+
json_match = re.search(r'\{[\s\S]*"tables"[\s\S]*\}', response_text)
|
| 181 |
+
if json_match:
|
| 182 |
+
try:
|
| 183 |
+
data = json.loads(json_match.group(0))
|
| 184 |
+
except json.JSONDecodeError:
|
| 185 |
+
pass
|
| 186 |
+
|
| 187 |
+
# Strategy 4: Look for any JSON-like structure
|
| 188 |
+
if data is None:
|
| 189 |
+
json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
|
| 190 |
+
if json_match:
|
| 191 |
+
try:
|
| 192 |
+
data = json.loads(json_match.group(0))
|
| 193 |
+
except json.JSONDecodeError:
|
| 194 |
+
pass
|
| 195 |
+
|
| 196 |
+
if data and "tables" in data:
|
| 197 |
+
tables = data["tables"]
|
| 198 |
+
# Ensure each table has required fields
|
| 199 |
+
for table in tables:
|
| 200 |
+
if "page_number" not in table:
|
| 201 |
+
table["page_number"] = 1
|
| 202 |
+
if "source" not in table:
|
| 203 |
+
table["source"] = "PageIndex"
|
| 204 |
+
print(f"[PageIndex] Successfully extracted {len(tables)} tables.")
|
| 205 |
+
return json.dumps({"tables": tables})
|
| 206 |
+
|
| 207 |
+
# If no valid JSON found, return empty
|
| 208 |
+
print(f"[PageIndex] No valid JSON found in response. Raw preview: {response_text[:500]}")
|
| 209 |
return json.dumps({"tables": []})
|
| 210 |
|
| 211 |
except Exception as e:
|
|
|
|
| 368 |
messages=messages,
|
| 369 |
stream=True,
|
| 370 |
max_tokens=8192,
|
| 371 |
+
temperature=0,
|
| 372 |
)
|
| 373 |
|
| 374 |
full_response_text = ""
|