gauthamnairy committed on
Commit
59c1497
·
verified ·
1 Parent(s): ad30edb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -43
app.py CHANGED
@@ -67,18 +67,24 @@ def extract_tables_from_markdown(markdown_text, token):
67
  context = markdown_text[:15000]
68
 
69
  # 4. Generate structured JSON tables
70
- extraction_prompt = """You are a Petroleum Data Extraction Expert. Your task is to extract ALL tables from the provided document context and return them as a valid JSON object.
71
 
72
  CRITICAL INSTRUCTIONS - READ CAREFULLY:
73
  1. **EXTRACT ALL ROWS**: You MUST extract EVERY SINGLE ROW from each table. Do NOT skip rows, do NOT truncate, do NOT summarize.
74
  2. **NO PARTIAL DATA**: If a table has 10 rows, you must return all 10 rows. If it has 100 rows, return all 100 rows.
75
- 3. **COMPLETE EXTRACTION**: Count the rows in the source table and verify you extracted the same number.
76
- 4. **DO NOT SUMMARIZE**: Never say "etc" or "..." or truncate with "...". Every row must be fully extracted.
 
 
 
 
 
 
 
77
 
78
- **SUGGESTION FOR COMPREHENSIVE EXTRACTION**:
79
- When scanning the document, look for these O&G table categories (extract ALL that you find):
80
  - Well Headers / Well Identification / Site Data
81
- - Formation Tops / Lithology / Stratigraphy
82
  - Directional Survey / Well Path / Azimuth/Inclination data
83
  - Casing Records / Casing Data / Tubing specifications
84
  - Cementing Data / Cement Composition / Bond logs
@@ -91,17 +97,21 @@ When scanning the document, look for these O&G table categories (extract ALL tha
91
  - Equipment Lists / BHA / Drill string components
92
  - Personnel / Company representatives / Supervisors
93
  - Timelines / Drilling events / Days depths
94
- - Cost data / AFE estimates (if present)
95
- - Distribution lists are usually NOT useful - skip these.
 
 
 
96
 
97
  EXTRACTION REQUIREMENTS:
98
- - Find ALL tables in the document - Well Headers, Formation Tops, Casing, Surveys, Drilling Data, Core Analysis, Sidewall Samples, Production Tests, etc.
 
99
  - For each table, extract:
100
  - "title": A descriptive title for the table
101
- - "headers": Array of column names exactly as they appear
102
- - "rows": Array of row objects with column names as keys - MUST INCLUDE ALL ROWS
103
  - "page_number": The page number where this table appears
104
- - **BE THOROUGH**: A typical completion report has 10-20+ separate tables. If you only found 3-5, you missed some. Scan again.
105
 
106
  Return VALID JSON ONLY in this exact format:
107
 
@@ -119,13 +129,10 @@ Return VALID JSON ONLY in this exact format:
119
  }
120
 
121
  VERIFICATION STEP:
122
- Before returning, count the rows in the source table and verify your extracted rows match exactly.
123
- If the source shows 6 rows, your output must have 6 rows in the "rows" array.
124
-
125
- SUGGESTION: If you found fewer than 8-10 tables in a completion report, re-scan the document for:
126
- - Smaller tables embedded in text sections
127
- - Equipment lists, BHA details, logging summaries
128
- - Data tables you may have skipped as "minor"
129
 
130
  Return ONLY the JSON, no markdown, no explanations, no code blocks."""
131
 
@@ -140,34 +147,65 @@ Return ONLY the JSON, no markdown, no explanations, no code blocks."""
140
  model=model,
141
  messages=messages,
142
  stream=False,
143
- max_tokens=8192,
144
- temperature=0.1
145
  )
146
 
147
  response_text = response.choices[0].message.content
148
  print(f"[PageIndex] LLM response received: {len(response_text)} chars")
149
 
150
- # Parse JSON from response
151
- # Try to extract JSON block
152
- json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
153
- if json_match:
154
- try:
155
- data = json.loads(json_match.group(0))
156
- if "tables" in data:
157
- tables = data["tables"]
158
- # Ensure each table has required fields
159
- for table in tables:
160
- if "page_number" not in table:
161
- table["page_number"] = 1
162
- if "source" not in table:
163
- table["source"] = "PageIndex"
164
- print(f"[PageIndex] Successfully extracted {len(tables)} tables.")
165
- return json.dumps({"tables": tables})
166
- except json.JSONDecodeError as e:
167
- print(f"[PageIndex] JSON parse error: {e}")
168
-
169
- # If no JSON found, return empty
170
- print("[PageIndex] No valid JSON found in response, returning empty tables.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  return json.dumps({"tables": []})
172
 
173
  except Exception as e:
@@ -330,7 +368,7 @@ Your goal is to extract precise technical data from the provided document contex
330
  messages=messages,
331
  stream=True,
332
  max_tokens=8192,
333
- temperature=0.3
334
  )
335
 
336
  full_response_text = ""
 
67
  context = markdown_text[:15000]
68
 
69
  # 4. Generate structured JSON tables
70
+ extraction_prompt = """You are a Petroleum Data Extraction Expert. Your task is to extract ALL tables AND convert structured paragraph data into tables from the provided document context.
71
 
72
  CRITICAL INSTRUCTIONS - READ CAREFULLY:
73
  1. **EXTRACT ALL ROWS**: You MUST extract EVERY SINGLE ROW from each table. Do NOT skip rows, do NOT truncate, do NOT summarize.
74
  2. **NO PARTIAL DATA**: If a table has 10 rows, you must return all 10 rows. If it has 100 rows, return all 100 rows.
75
+ 3. **CONVERT PARAGRAPHS TO TABLES**: If you find formation tops, lithology data, or any structured data in text paragraphs (e.g., "Formation X encountered at 1000m depth"), CONVERT it into a proper table with columns and rows.
76
+ 4. **COMPLETE EXTRACTION**: Count the rows in the source table and verify you extracted the same number.
77
+ 5. **DO NOT SUMMARIZE**: Never say "etc" or "..." or truncate with "...". Every row must be fully extracted.
78
+ 6. **SCRAPE PARAGRAPHS**: Look for:
79
+ - Formation tops mentioned in text (e.g., "Eleana Formation at 2594 feet")
80
+ - Lithology descriptions with depths
81
+ - Drilling events with dates/depths
82
+ - Equipment lists in bullet points
83
+ - Any sequential data that can be tabulated
84
 
85
+ **O&G TABLE CATEGORIES TO EXTRACT (including from paragraphs):**
 
86
  - Well Headers / Well Identification / Site Data
87
+ - Formation Tops / Lithology / Stratigraphy (LOOK IN TEXT PARAGRAPHS TOO!)
88
  - Directional Survey / Well Path / Azimuth/Inclination data
89
  - Casing Records / Casing Data / Tubing specifications
90
  - Cementing Data / Cement Composition / Bond logs
 
97
  - Equipment Lists / BHA / Drill string components
98
  - Personnel / Company representatives / Supervisors
99
  - Timelines / Drilling events / Days depths
100
+ - Cost data / AFE estimates
101
+
102
+ **PARAGRAPH-TO-TABLE CONVERSION EXAMPLES:**
103
+ If text says: "The Eleana Dolomite was encountered at 2,594 ft MD (2,594 ft TVD)..."
104
+ CREATE: {"title": "Formation Tops", "headers": ["Formation", "Depth_ft", "Depth_m"], "rows": [...]}
105
 
106
  EXTRACTION REQUIREMENTS:
107
+ - Find ALL tables in the document
108
+ - CONVERT paragraph data describing formations, depths, lithology INTO tables
109
  - For each table, extract:
110
  - "title": A descriptive title for the table
111
+ - "headers": Array of column names
112
+ - "rows": Array of row objects - MUST INCLUDE ALL ROWS
113
  - "page_number": The page number where this table appears
114
+ - **BE THOROUGH**: A typical completion report has 15-25+ separate tables. If you only found 3-5, you missed some. Scan paragraphs too!
115
 
116
  Return VALID JSON ONLY in this exact format:
117
 
 
129
  }
130
 
131
  VERIFICATION STEP:
132
+ 1. Count tables found in explicit table format
133
+ 2. Count data found in paragraphs that could be tables
134
+ 3. Total should be 15-25+ for a completion report
135
+ 4. Before returning, verify you converted paragraph data to tables
 
 
 
136
 
137
  Return ONLY the JSON, no markdown, no explanations, no code blocks."""
138
 
 
147
  model=model,
148
  messages=messages,
149
  stream=False,
150
+ max_tokens=16384,
151
+ temperature=0
152
  )
153
 
154
  response_text = response.choices[0].message.content
155
  print(f"[PageIndex] LLM response received: {len(response_text)} chars")
156
 
157
+ # Parse JSON from response - handle markdown code blocks
158
+ response_text = response_text.strip()
159
+
160
+ # Try multiple extraction strategies
161
+ data = None
162
+
163
+ # Strategy 1: Try direct JSON parse
164
+ try:
165
+ data = json.loads(response_text)
166
+ except json.JSONDecodeError:
167
+ pass
168
+
169
+ # Strategy 2: Extract JSON from markdown code block
170
+ if data is None:
171
+ code_block_match = re.search(r'```(?:json)?\s*(\{.*\})\s*```', response_text, re.DOTALL)
172
+ if code_block_match:
173
+ try:
174
+ data = json.loads(code_block_match.group(1))
175
+ except json.JSONDecodeError:
176
+ pass
177
+
178
+ # Strategy 3: Extract JSON object directly
179
+ if data is None:
180
+ json_match = re.search(r'\{[\s\S]*"tables"[\s\S]*\}', response_text)
181
+ if json_match:
182
+ try:
183
+ data = json.loads(json_match.group(0))
184
+ except json.JSONDecodeError:
185
+ pass
186
+
187
+ # Strategy 4: Look for any JSON-like structure
188
+ if data is None:
189
+ json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
190
+ if json_match:
191
+ try:
192
+ data = json.loads(json_match.group(0))
193
+ except json.JSONDecodeError:
194
+ pass
195
+
196
+ if data and "tables" in data:
197
+ tables = data["tables"]
198
+ # Ensure each table has required fields
199
+ for table in tables:
200
+ if "page_number" not in table:
201
+ table["page_number"] = 1
202
+ if "source" not in table:
203
+ table["source"] = "PageIndex"
204
+ print(f"[PageIndex] Successfully extracted {len(tables)} tables.")
205
+ return json.dumps({"tables": tables})
206
+
207
+ # If no valid JSON found, return empty
208
+ print(f"[PageIndex] No valid JSON found in response. Raw preview: {response_text[:500]}")
209
  return json.dumps({"tables": []})
210
 
211
  except Exception as e:
 
368
  messages=messages,
369
  stream=True,
370
  max_tokens=8192,
371
+ temperature=0,
372
  )
373
 
374
  full_response_text = ""